1dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
2
3dnl  Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C		norm	unorm	frac
24C K8		13	13	12
25C P4		44.2	44.2	42.3
26C P6 core2	25	24.5	19.3
27C P6 corei7	21.5	20.7	18
28C P6 atom	42	52	37
29
30C TODO
31C  * Compute the inverse without relying on the div instruction.
32C    Newton's method and mulq, or perhaps the faster fdiv.
33C  * Tune prologue.
34C  * Optimize for Core 2.
35
36C The code for unnormalized divisors works also for normalized divisors, but
37C for some reason it runs really slowly (on K8) for that case.  Use special
38C code until we can address this.  The Intel Atom is also affected, but
39C understandably (shld slowness).
40define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
41
42C mp_limb_t
43C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
44C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
45
46C mp_limb_t
47C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
48C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
49C                      mp_limb_t dinv, int cnt)
50
51C INPUT PARAMETERS
C The first six arguments arrive in rdi, rsi, rdx, rcx, r8, r9, in that order.
C The *_param names are the incoming registers; two of them (rsi, rcx) are
C reused under other names further down, so both entry points copy fn_param
C and un_param into callee-saved registers before overwriting them.
52define(`qp',		`%rdi')		C quotient limbs are stored through this pointer
53define(`fn_param',	`%rsi')		C incoming fn, copied to fn
54define(`up_param',	`%rdx')		C incoming np, copied to up
55define(`un_param',	`%rcx')		C incoming nn, copied to un
56define(`d',		`%r8')		C divisor limb
57define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
58C       shift passed on stack		C only for mpn_preinv_divrem_1
59
C Working registers used by the loops.
60define(`cnt',		`%rcx')		C shift count; same register as un_param
61define(`up',		`%rsi')		C dividend pointer; same register as fn_param
62define(`fn',		`%r12')		C counter for the loop at L(floop)
63define(`un',		`%rbx')		C index and counter for the dividend limbs
64
65
66C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
67C     un  cnt     up  qp      d   dinv        fn
68
69ASM_START()
70	TEXT
71	ALIGN(16)
72PROLOGUE(mpn_preinv_divrem_1)
C Entry point for callers that already supply the inverse in dinv and the
C shift count as a stack argument.  Only the register setup is done here;
C control then jumps into the loops inside mpn_divrem_1, whose L(ret)
C epilogue pops the four registers pushed below.
C NOTE(review): unlike mpn_divrem_1 there is no nn + fn == 0 early exit, and
C L(uent) reads the limb at index un - 1 unconditionally; presumably callers
C guarantee nn >= 1 -- confirm.
73	xor	%eax, %eax		C rax = 0: the partial remainder starts at zero
74	push	%r13
75	push	%r12
76	push	%rbp
77	push	%rbx
78
79	mov	fn_param, fn		C save fn before rsi is reused as up
80	mov	un_param, un		C save nn before rcx is reused as cnt
81	add	fn_param, un_param	C rcx = nn + fn, the number of quotient limbs
82	mov	up_param, up
83
84	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
85
86	test	d, d
C Top bit of d set: enter the loop for normalized divisors.  L(nent) is only
C defined when SPECIAL_CODE_FOR_NORMALIZED_DIVISOR is defined.
87	js	L(nent)
C Stack argument: four pushes (32 bytes) plus the return address (8 bytes)
C sit between rsp and it.  R8() presumably selects the 8-bit subregister.
88	mov	40(%rsp), R8(cnt)
89	shl	R8(cnt), d		C shift d left by the supplied count
90	jmp	L(uent)
91EPILOGUE()
92
93	ALIGN(16)
94PROLOGUE(mpn_divrem_1)
C Divide the nn-limb number at np by the single limb d, writing nn + fn
C quotient limbs at qp from the most significant limb downwards.  The last fn
C quotient limbs are produced by L(floop), which continues the division with
C zero dividend limbs.  The final remainder is left in rax.
C NOTE(review): d == 0 is not checked; bsr and div below would misbehave.
95	xor	%eax, %eax		C rax = 0, the value left in rax if nn + fn == 0
96	push	%r13
97	push	%r12
98	push	%rbp
99	push	%rbx
100
101	mov	fn_param, fn		C save fn before rsi is reused as up
102	mov	un_param, un		C save nn before rcx is reused as cnt
103	add	fn_param, un_param	C rcx = nn + fn; ZF is consumed by the je below
104	mov	up_param, up		C mov leaves the flags of the add intact
105	je	L(ret)			C no quotient limbs at all
106
107	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
108	xor	R32(%rbp), R32(%rbp)	C clear rbp: zero quotient limb and zero remainder
109
110
111ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
112	test	d, d
113	jns	L(unnormalized)		C top bit of d clear: take the general path
114
115L(normalized):
C Top bit of d is set.  The most significant quotient limb is 0 or 1 and is
C found by comparing the top dividend limb with d.
116	test	un, un
117	je	L(8)			C un == 0
118	mov	-8(up,un,8), %rbp	C rbp = most significant dividend limb
119	dec	un
120	mov	%rbp, %rax
121	sub	d, %rbp
122	cmovb	%rax, %rbp		C borrow: limb < d, keep the unreduced limb
123	sbb	%eax, %eax		C eax = -CF; cmov left CF of the sub intact
124	inc	%eax			C rax = 1 if limb >= d, else 0
125	mov	%rax, (qp)
126	lea	-8(qp), qp
127L(8):
C Compute the inverse: divide the 128-bit value with high word ~d and low word
C all ones by d.  The quotient equals floor((2^128 - 1) / d) - 2^64.
128	mov	d, %rdx
129	mov	$-1, %rax
130	not	%rdx
131	div	d			C FREE rax rdx rcx r9 r10 r11
132	mov	%rax, dinv
133	mov	%rbp, %rax		C rax = partial remainder
134	jmp	L(nent)
135
C One quotient limb per iteration.  On entry rax holds the partial remainder
C and un indexes the next dividend limb.
C   r10     = next dividend limb
C   rdx:rax = rax * dinv + ((rax + 1):r10)
C   r13     = candidate quotient limb, rbp = low word of the sum
C   r10     = r10 - r13 * d, the candidate remainder
C   if r10 >= rbp (unsigned) the quotient is decremented and d is added back
C   L(nfx) corrects once more when the remainder is still >= d
136	ALIGN(16)
137L(nloop):				C		    cycK8  cycP6  cycP4
138	mov	(up,un,8), %r10		C
139	lea	1(%rax), %rbp		C
140	mul	dinv			C		     0,13   0,19  0,45
141	add	%r10, %rax		C		     4      8     12
142	adc	%rbp, %rdx		C		     5      9     13
143	mov	%rax, %rbp		C		     5      9     13
144	mov	%rdx, %r13		C		     6      11    23
145	imul	d, %rdx			C		     6      11    23
146	sub	%rdx, %r10		C		     10     16    33
147	mov	d, %rax			C
148	add	%r10, %rax		C		     11     17    34
149	cmp	%rbp, %r10		C		     11     17    34
150	cmovb	%r10, %rax		C		     12     18    35
151	adc	$-1, %r13		C
152	cmp	d, %rax			C
153	jae	L(nfx)			C
154L(nok):	mov	%r13, (qp)		C
155	sub	$8, qp			C
156L(nent):dec	un			C
157	jns	L(nloop)		C
158
159	xor	%ecx, %ecx		C cnt = 0, so the final shr of rax changes nothing
160	jmp	L(87)
161
C Out of line correction: remainder was still >= d.
162L(nfx):	sub	d, %rax
163	inc	%r13
164	jmp	L(nok)
165')
166
167L(unnormalized):
C General path.  If the top dividend limb is below d, the top quotient limb is
C zero and that dividend limb becomes the initial remainder.
168	test	un, un
169	je	L(44)
170	mov	-8(up,un,8), %rax
171	cmp	d, %rax
172	jae	L(44)			C top limb >= d: nothing can be skipped
173	mov	%rbp, (qp)		C rbp is still zero here: store a zero quotient limb
174	mov	%rax, %rbp		C rbp = initial remainder
175	lea	-8(qp), qp
C NOTE(review): mov and lea leave the flags of the cmp intact, and the jae
C above fell through, so ZF looks like it can never be set here -- this
C branch appears unreachable; confirm before removing.
176	je	L(ret)
177	dec	un
178L(44):
179	bsr	d, %rcx			C rcx = index of the highest set bit of d
180	not	%ecx			C low six bits of cl = 63 - index
181	sal	%cl, d			C shift d until its top bit is set
182	sal	%cl, %rbp		C shift the initial remainder by the same count
C Compute the inverse of the shifted d: divide the 128-bit value with high
C word ~d and low word all ones by d.
183	mov	d, %rdx
184	mov	$-1, %rax
185	not	%rdx
186	div	d			C FREE rax rdx r9 r10 r11
187	test	un, un			C flags for the je below; the movs keep them
188	mov	%rax, dinv
189	mov	%rbp, %rax		C rax = shifted initial remainder
190	je	L(87)			C no dividend limbs left: only L(floop) work remains
191L(uent):
C Also entered from mpn_preinv_divrem_1, with rax = 0 and cnt taken from the
C stack argument.
192	mov	-8(up,un,8), %rbp	C rbp = most significant remaining dividend limb
193	shr	%cl, %rax		C undo the shift applied to the remainder above
194	shld	%cl, %rbp, %rax		C rax = rax << cnt, filled with the top bits of rbp
195	sub	$2, un
196	js	L(ulast)		C fewer than two limbs remained: go to the last step
197
C One quotient limb per iteration, shifting the dividend left by cnt on the fly.
C   r10     = next lower dividend limb (unshifted)
C   rbp     = previous limb << cnt, filled from the top bits of r10
C   rdx:rax = rax * dinv + ((rax + 1):rbp)
C   r13     = candidate quotient limb, r11 = low word of the sum
C   rbp     = rbp - r13 * d, the candidate remainder
C   if rbp >= r11 (unsigned) the quotient is decremented and d is added back
C   L(ufx) corrects once more when the remainder is still >= d
198	ALIGN(16)
199L(uloop):
200	nop
201	mov	(up,un,8), %r10
202	lea	1(%rax), %r11
203	shld	%cl, %r10, %rbp
204	mul	dinv
205	add	%rbp, %rax
206	adc	%r11, %rdx
207	mov	%rax, %r11
208	mov	%rdx, %r13
209	imul	d, %rdx
210	sub	%rdx, %rbp
211	mov	d, %rax
212	add	%rbp, %rax
213	cmp	%r11, %rbp
214	cmovb	%rbp, %rax
215	adc	$-1, %r13
216	cmp	d, %rax
217	jae	L(ufx)
218L(uok):	mov	%r13, (qp)
219	sub	$8, qp
220	dec	un
221	mov	%r10, %rbp		C keep the raw limb for the next shld; flags of dec survive
222	jns	L(uloop)
223L(ulast):
C Last dividend limb: same step as the loop, but zeros are shifted in from
C below since no lower limb exists.
224	lea	1(%rax), %r11
225	sal	%cl, %rbp
226	mul	dinv
227	add	%rbp, %rax
228	adc	%r11, %rdx
229	mov	%rax, %r11
230	mov	%rdx, %r13
231	imul	d, %rdx
232	sub	%rdx, %rbp
233	mov	d, %rax
234	add	%rbp, %rax
235	cmp	%r11, %rbp
236	cmovb	%rbp, %rax
237	adc	$-1, %r13
238	cmp	d, %rax
239	jae	L(93)
240L(69):	mov	%r13, (qp)
241	sub	$8, qp
242	jmp	L(87)
243
C Out of line correction for L(uloop): remainder was still >= d.
244L(ufx):	sub	d, %rax
245	inc	%r13
246	jmp	L(uok)
247
C Out of line correction for L(ulast): remainder was still >= d.
248L(93):	sub	d, %rax
249	inc	%r13
250	jmp	L(69)
251
C Remaining fn quotient limbs.  rbp = -d so that the imul in the loop yields
C minus quotient times d directly.
252L(87):	mov	d, %rbp
253	neg	%rbp
254	jmp	L(87b)
255
C The dividend limb is zero in every iteration, so nothing is loaded from up.
C   rdx:rax = rax * dinv + ((rax + 1) << 64)
C   r13     = candidate quotient limb, r11 = low word of the product
C   rdx     = r13 * (-d), the candidate remainder
C   if rdx >= r11 (unsigned) the quotient is decremented and d is added back
C There is no second comparison against d in this loop.
256	ALIGN(16)
257L(floop):				C		    cycK8  cycP6  cycP4
258	lea	1(%rax), %r11		C
259	mul	dinv			C		     0,12
260	add	%r11, %rdx		C		     5
261	mov	%rax, %r11		C		     4
262	mov	%rdx, %r13		C		     6
263	imul	%rbp, %rdx		C		     6
264	mov	d, %rax			C
265	add	%rdx, %rax		C		     10
266	cmp	%r11, %rdx		C		     10
267	cmovb	%rdx, %rax		C		     11
268	adc	$-1, %r13		C
269	mov	%r13, (qp)		C
270	sub	$8, qp			C
271L(87b):	dec	fn			C
272	jns	L(floop)		C
273
274	shr	%cl, %rax		C undo the shift by cnt; rax = remainder
C Restore the registers in the reverse of the push order.  This epilogue also
C serves mpn_preinv_divrem_1, which pushes the same four registers.
275L(ret):	pop	%rbx
276	pop	%rbp
277	pop	%r12
278	pop	%r13
279	ret
280EPILOGUE()
281