1dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
2
3dnl  Copyright 2004, 2005, 2007-2010, 2012, 2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C		norm	unorm	frac
35C AMD K8,K9	15	15	12
36C AMD K10	15	15	12
37C Intel P4	44	44	43
38C Intel core2	24	24	19.5
39C Intel corei	19	19	18
40C Intel atom	51	51	36
41C VIA nano	46	44	22.5
42
43C mp_limb_t
44C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
45C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
46
47C mp_limb_t
48C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
49C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
50C                      mp_limb_t dinv, int cnt)
51
C INPUT PARAMETERS
C Registers in which the arguments arrive.  Two of them are reused under
C different working names further down, so the order of the register moves
C in the two prologues matters.
define(`qp',		`%rdi')
define(`fn_param',	`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`d',		`%r8')
define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
C       shift passed on stack		C only for mpn_preinv_divrem_1

C WORKING REGISTERS
C cnt shares %rcx with un_param, and up shares %rsi with fn_param.  Both
C parameters are therefore copied out (to un and fn) before the shared
C register is overwritten.
define(`cnt',		`%rcx')
define(`up',		`%rsi')
define(`fn',		`%r12')
define(`un',		`%rbx')


C Working names per register (r10, r11, r13, rax, rbp, rdx are unnamed
C scratch registers in the code below):
C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
C     un  cnt     up  qp      d   dinv        fn

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C CNTOFF gives the displacement, from the base register passed as its
C argument, of the stack-passed shift count of mpn_preinv_divrem_1 at the
C point where that routine reads it, i.e. after its four register pushes.
C For STD64 this is 4*8 bytes of pushes plus the 8-byte return address.
C NOTE(review): the DOS64 value presumably also covers the Win64 shadow
C space and whatever FUNC_ENTRY saves -- confirm against the macro.
IFSTD(`define(`CNTOFF',		`40($1)')')
IFDOS(`define(`CNTOFF',		`104($1)')')
75
ASM_START()
	TEXT
	ALIGN(16)
C mpn_preinv_divrem_1 -- entry point for callers that supply dinv and the
C shift count themselves.
C
C This routine only sets up registers; it then jumps to L(ent) inside
C mpn_divrem_1 and shares that routine's loops and epilogue.  The four
C registers pushed here are the ones popped at L(ret), in reverse order.
C
C NOTE(review): d is shifted left by cnt here, so dinv is presumably
C expected to be the inverse of the already shifted divisor -- confirm
C against the callers.
C NOTE(review): L(ent) loads the top source limb unconditionally, so this
C entry appears to assume nn >= 1 -- confirm.
PROLOGUE(mpn_preinv_divrem_1)
	FUNC_ENTRY(4)
C DOS64 only: fetch the fifth and sixth arguments (d, dinv) from the stack.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	xor	R32(%rax), R32(%rax)	C rax = 0, the starting remainder at L(ent)
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn		C save fn before rsi is reused as up
	mov	un_param, un		C save nn before rcx is reused
	add	fn_param, un_param	C rcx = nn + fn, total quotient limbs
	mov	up_param, up

	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb

	mov	CNTOFF(%rsp), R8(cnt)	C low byte of the stack-passed shift count
	shl	R8(cnt), d		C d <<= cnt
	jmp	L(ent)			C continue in mpn_divrem_1
EPILOGUE()
100
101	ALIGN(16)
102PROLOGUE(mpn_divrem_1)
103	FUNC_ENTRY(4)
104IFDOS(`	mov	56(%rsp), %r8	')
105	xor	R32(%rax), R32(%rax)
106	push	%r13
107	push	%r12
108	push	%rbp
109	push	%rbx
110
111	mov	fn_param, fn
112	mov	un_param, un
113	add	fn_param, un_param
114	mov	up_param, up
115	je	L(ret)
116
117	lea	-8(qp,un_param,8), qp
118	xor	R32(%rbp), R32(%rbp)
119
120L(unnormalized):
121	test	un, un
122	je	L(44)
123	mov	-8(up,un,8), %rax
124	cmp	d, %rax
125	jae	L(44)
126	mov	%rbp, (qp)
127	mov	%rax, %rbp
128	lea	-8(qp), qp
129	je	L(ret)
130	dec	un
131L(44):
132	bsr	d, %rcx
133	not	R32(%rcx)
134	sal	R8(%rcx), d
135	sal	R8(%rcx), %rbp
136
137	push	%rcx
138IFSTD(`	push	%rdi		')
139IFSTD(`	push	%rsi		')
140	push	%r8
141IFSTD(`	sub	$8, %rsp	')
142IFSTD(`	mov	d, %rdi		')
143IFDOS(`	sub	$40, %rsp	')
144IFDOS(`	mov	d, %rcx		')
145	ASSERT(nz, `test $15, %rsp')
146	CALL(	mpn_invert_limb)
147IFSTD(`	add	$8, %rsp	')
148IFDOS(`	add	$40, %rsp	')
149	pop	%r8
150IFSTD(`	pop	%rsi		')
151IFSTD(`	pop	%rdi		')
152	pop	%rcx
153
154	mov	%rax, dinv
155	mov	%rbp, %rax
156	test	un, un
157	je	L(frac)
158
159L(ent):	mov	-8(up,un,8), %rbp
160	shr	R8(%rcx), %rax
161	shld	R8(%rcx), %rbp, %rax
162	sub	$2, un
163	js	L(end)
164
165	ALIGN(16)
166L(top):	lea	1(%rax), %r11
167	mul	dinv
168	mov	(up,un,8), %r10
169	shld	R8(%rcx), %r10, %rbp
170	mov	%rbp, %r13
171	add	%rax, %r13
172	adc	%r11, %rdx
173	mov	%rdx, %r11
174	imul	d, %rdx
175	sub	%rdx, %rbp
176	lea	(d,%rbp), %rax
177	sub	$8, qp
178	cmp	%r13, %rbp
179	cmovc	%rbp, %rax
180	adc	$-1, %r11
181	cmp	d, %rax
182	jae	L(ufx)
183L(uok):	dec	un
184	mov	%r11, 8(qp)
185	mov	%r10, %rbp
186	jns	L(top)
187
188L(end):	lea	1(%rax), %r11
189	sal	R8(%rcx), %rbp
190	mul	dinv
191	add	%rbp, %rax
192	adc	%r11, %rdx
193	mov	%rax, %r11
194	mov	%rdx, %r13
195	imul	d, %rdx
196	sub	%rdx, %rbp
197	mov	d, %rax
198	add	%rbp, %rax
199	cmp	%r11, %rbp
200	cmovc	%rbp, %rax
201	adc	$-1, %r13
202	cmp	d, %rax
203	jae	L(efx)
204L(eok):	mov	%r13, (qp)
205	sub	$8, qp
206	jmp	L(frac)
207
208L(ufx):	sub	d, %rax
209	inc	%r11
210	jmp	L(uok)
211L(efx):	sub	d, %rax
212	inc	%r13
213	jmp	L(eok)
214
215L(frac):mov	d, %rbp
216	neg	%rbp
217	jmp	L(fent)
218
219	ALIGN(16)			C	    K8-K10  P6-CNR P6-NHM  P4
220L(ftop):mul	dinv			C	      0,12   0,17   0,17
221	add	%r11, %rdx		C	      5      8     10
222	mov	%rax, %r11		C	      4      8      3
223	mov	%rdx, %r13		C	      6      9     11
224	imul	%rbp, %rdx		C	      6      9     11
225	mov	d, %rax			C
226	add	%rdx, %rax		C	     10     14     14
227	cmp	%r11, %rdx		C	     10     14     14
228	cmovc	%rdx, %rax		C	     11     15     15
229	adc	$-1, %r13		C
230	mov	%r13, (qp)		C
231	sub	$8, qp			C
232L(fent):lea	1(%rax), %r11		C
233	dec	fn			C
234	jns	L(ftop)			C
235
236	shr	R8(%rcx), %rax
237L(ret):	pop	%rbx
238	pop	%rbp
239	pop	%r12
240	pop	%r13
241	FUNC_EXIT()
242	ret
243EPILOGUE()
244