1dnl  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
2
3dnl  Copyright 2007, 2008, 2010 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C		norm	frac
24C K8		20	20
25C P4		73	73
26C P6 core2	37	37
27C P6 corei7	33	33
28
29C TODO
30C  * Perhaps compute the inverse without relying on divq?  Could either use
31C    Newton's method and mulq, or perhaps the faster fdiv.
32C  * The loop has not been carefully tuned, nor analysed for critical path
33C    length.  It seems that 20 c/l is a bit long, compared to the 13 c/l for
34C    mpn_divrem_1.
35C  * Clean up.  This code is really crude.
36
37
38C INPUT PARAMETERS
C Symbolic names for the registers in which the arguments arrive.  No
C instruction in this file uses these names; the code refers to the raw
C registers directly, so the list serves as documentation of the interface.
39define(`qp',		`%rdi')	C quotient limbs are stored through this pointer
40define(`fn',		`%rsi')	C count of extra low quotient limbs (zero limbs fed in)
41define(`up_param',	`%rdx')	C numerator pointer
42define(`un_param',	`%rcx')	C numerator size in limbs
43define(`dp',		`%r8')	C pointer to the two divisor limbs, low limb first
44
C The NEW loop variant keeps the divisor inverse in r9.  The default variant
C keeps it in rdi instead and uses r9 for other values.
45define(`dinv',		`%r9')
46
47
48C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
49C         cnt         qp      d  dinv
50
51ASM_START()
52	TEXT
53	ALIGN(16)
C ---------------------------------------------------------------------------
C mpn_divrem_2 -- divide the numerator {up, un}, extended at the low end by
C fn zero limbs, by the two-limb divisor {dp, 2}.
C
C Arguments as they arrive in registers:
C   rdi  qp   quotient area, written from qp[un+fn-3] down to qp[0]
C   rsi  fn   number of extra low quotient limbs to develop
C   rdx  up   numerator; 8(r12) and 16(r12) receive the remainder at exit
C   rcx  un   numerator size in limbs
C   r8   dp   divisor, d0 = dp[0] (low limb), d1 = dp[1] (high limb)
C
C Returns in rax the most significant quotient limb, which is 0 or 1.
C
C The file header calls the divisor normalized; nothing here checks that.
C up[un-1] and up[un-2] are read unconditionally, so un >= 2 is assumed.
C
C rbx, rbp and r12-r15 are pushed on entry and popped before returning.
C ---------------------------------------------------------------------------
54PROLOGUE(mpn_divrem_2)
55
56	push	%r15
57	lea	(%rdx,%rcx,8), %rax	C rax = up + 8*un, one past the top limb
58	push	%r14
59	push	%r13
60	mov	%rsi, %r13		C r13 = fn
61	push	%r12
62	lea	-24(%rax), %r12		C r12 = &up[un-3], next limb to fetch
63	push	%rbp
64	mov	%rdi, %rbp		C rbp = qp
65	push	%rbx
66	mov	8(%r8), %r11		C r11 = d1
67	mov	-8(%rax), %r9		C r9  = up[un-1]
68	mov	(%r8), %r8		C r8  = d0, the dp pointer is dropped here
69	mov	-16(%rax), %r10		C r10 = up[un-2]
70	xor	R32(%r15), R32(%r15)	C r15 = 0, becomes the return value
C If the top two numerator limbs r9:r10 are >= d1:d0, branch to L(23), which
C subtracts the divisor once and sets the top quotient limb to 1.
71	cmp	%r9, %r11
72	ja	L(2)			C d1 > r9, nothing to subtract
73	setb	%dl			C dl = (d1 < r9)
74	cmp	%r10, %r8
75	setbe	%al			C al = (d0 <= r10)
76	orb	%al, %dl
77	jne	L(23)
78L(2):
79	lea	-3(%rcx,%r13), %rbx	C un + fn - 3
80	test	%rbx, %rbx
81	js	L(6)			C negative: no quotient limbs to develop
C Compute the divisor inverse into rdi.  The starting value is the quotient
C of the two-limb number (~d1):(all ones) by d1; it is then stepped down
C while folding in d0.
82	mov	%r11, %rdx
83	mov	$-1, %rax
84	not	%rdx			C rdx:rax = ~d1 : all ones
85	div	%r11
86	mov	%r11, %rdx
87	mov	%rax, %rdi		C rdi = candidate inverse
88	imul	%rax, %rdx		C low limb of d1 * candidate
89	mov	%rdx, %r14
90	mul	%r8			C rdx:rax = candidate * d0
91	mov	%rdx, %rcx		C only the high limb is used
92	mov	$-1, %rdx		C rdx starts at -1 and collects the carries
93	add	%r8, %r14
94	adc	$0, %rdx
95	add	%rcx, %r14
96	adc	$0, %rdx
97	js	L(8)			C rdx still negative: no carry, keep candidate
98L(18):
99	dec	%rdi			C lower the candidate by one ...
100	sub	%r11, %r14		C ... and take d1 out of rdx:r14
101	sbb	$0, %rdx
102	jns	L(18)			C repeat until rdx:r14 goes negative
103L(8):
104
105C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
106C n2      un      n1 dinv qp  d0        d1  up  fn      msl
107C     n2  un     -d1      n1    dinv XX              XX
108
C Two variants of the main loop follow.  The first is used only when the m4
C symbol NEW is defined, otherwise the second is used.  Each has a matching
C L(fix) block after the function exit code.
109ifdef(`NEW',`
C NEW variant.  Inside the loop:
C   rbx:r14  running two-limb remainder, high:low
C   r9       divisor inverse          rsi  -d1
C   r8, r11  d0, d1                   r13  fn
C   rcx      loop counter, runs from un + fn - 3 down to 0
C   r12      next numerator limb to fetch, moving downwards
C   rbp      next quotient slot to store, moving downwards
110	lea	(%rbp,%rbx,8), %rbp	C rbp = &qp[un+fn-3]
111	mov	%rbx, %rcx		C loop counter = un + fn - 3
112	mov	%r9, %rbx		C high remainder limb
113	mov	%rdi, %r9		C di
114	mov	%r10, %r14		C low remainder limb
115	mov	%r11, %rsi
116	neg	%rsi			C -d1
117	ALIGN(16)
118L(loop):
119	mov	%r9, %rax		C di		ncp
120	mul	%rbx			C		0, 18
121	add	%r14, %rax		C		4
122	mov	%rax, %r10		C q0		5
123	adc	%rbx, %rdx		C		5
124	mov	%rdx, %rdi		C q		6
125	imul	%rsi, %rdx		C -q * d1	6
126	mov	%r8, %rax		C		ncp
127	lea	(%rdx, %r14), %rbx	C n1 -= ...	7
128	mul	%rdi			C q * d0	7
C Fetch the next numerator limb into r14, or leave r14 zero when the
C counter has dropped below fn.
129	xor	R32(%r14), R32(%r14)	C
130	cmp	%rcx, %r13		C
131	jg	L(19)			C fn > counter: use a zero limb
132	mov	(%r12), %r14		C
133	sub	$8, %r12		C
C Subtract d1:d0 and q*d0 from the remainder, which corresponds to trying
C quotient q + 1.
134L(19):	sub	%r8, %r14		C		ncp
135	sbb	%r11, %rbx		C		9
136	sub	%rax, %r14		C		11
137	sbb	%rdx, %rbx		C		12
138	inc	%rdi			C q + 1		7
C Build a mask in rdx: 0 when rbx < q0, all ones otherwise.  With the mask
C set, q + 1 is reduced to q and d1:d0 is added back to the remainder.
139	xor	R32(%rdx), R32(%rdx)	C
140	cmp	%r10, %rbx		C		13
141	mov	%r8, %rax		C d0		ncp
142	adc	$-1, %rdx		C mask		14
143	add	%rdx, %rdi		C q--		15
144	and	%rdx, %rax		C d0 or 0	15
145	and	%r11, %rdx		C d1 or 0	15
146	add	%rax, %r14		C		16
147	adc	%rdx, %rbx		C		16
148	cmp	%r11, %rbx		C		17
149	jae	L(fix)			C high remainder limb >= d1
150L(bck):	mov	%rdi, (%rbp)		C store the quotient limb
151	sub	$8, %rbp		C
152	dec	%rcx
153	jns	L(loop)			C until the counter goes below 0
154
C Move the final remainder to r9:r10, where the exit code expects it.
155	mov	%r14, %r10
156	mov	%rbx, %r9
157',`
C Default variant.  Inside the loop:
C   rax:rsi  running two-limb remainder at loop entry, high:low
C   rdi      divisor inverse
C   r8, r11  d0, d1                   r13  fn
C   r10      quotient limb being formed
C   r9       scratch, then the fetched limb and new low remainder limb
C   rcx      loop counter, runs from un + fn - 3 down to 0
C   r12      next numerator limb to fetch, moving downwards
C   rbp      next quotient slot to store, moving downwards
158	lea	(%rbp,%rbx,8), %rbp	C rbp = &qp[un+fn-3]
159	mov	%rbx, %rcx		C loop counter = un + fn - 3
160	mov	%r9, %rax		C high remainder limb
161	mov	%r10, %rsi		C low remainder limb
162	ALIGN(16)
163L(loop):
164	mov	%rax, %r14		C		0, 19
165	mul	%rdi			C		0
166	mov	%r11, %r9		C		1
167	add	%rsi, %rax		C		4
168	mov	%rax, %rbx		C q0		5
169	adc	%r14, %rdx		C q		5
170	lea	1(%rdx), %r10		C q + 1		6
171	mov	%rdx, %rax		C		6
172	imul	%rdx, %r9		C q * d1	6
173	sub	%r9, %rsi		C		10
C Fetch the next numerator limb into r9, or leave r9 zero when the counter
C has dropped below fn.
174	xor	R32(%r9), R32(%r9)	C
175	mul	%r8			C q * d0	7
176	cmp	%rcx, %r13		C
177	jg	L(13)			C fn > counter: use a zero limb
178	mov	(%r12), %r9		C
179	sub	$8, %r12		C
C Subtract d1:d0 and q*d0 from the remainder, which corresponds to trying
C quotient q + 1.
180L(13):	sub	%r8, %r9		C		ncp
181	sbb	%r11, %rsi		C		11
182	sub	%rax, %r9		C		11
183	sbb	%rdx, %rsi		C		12
C Build a mask in rax: 0 when rsi < q0, all ones otherwise.  With the mask
C set, q + 1 is reduced to q and d1:d0 is added back to the remainder.
184	cmp	%rbx, %rsi		C		13
185	sbb	%rax, %rax		C		14
186	not	%rax			C mask		15
187	add	%rax, %r10		C		16
188	mov	%r8, %rbx		C		ncp
189	and	%rax, %rbx		C d0 or 0	16
190	and	%r11, %rax		C d1 or 0	16
191	add	%rbx, %r9		C		17
192	adc	%rsi, %rax		C		18
193	cmp	%rax, %r11		C		19
194	jbe	L(fix)			C high remainder limb >= d1
195L(bck):	mov	%r10, (%rbp)		C store the quotient limb
196	sub	$8, %rbp		C
197	mov	%r9, %rsi		C		18
198	dec	%rcx
199	jns	L(loop)			C until the counter goes below 0
200
C Move the final remainder to r9:r10, where the exit code expects it.
201	mov	%rsi, %r10
202	mov	%rax, %r9
203')
C Store the remainder and return.  r12 started at &up[un-3] and was stepped
C down by one limb for every numerator limb fetched in the loop.
204L(6):
205	mov	%r10, 8(%r12)		C low remainder limb
206	mov	%r9, 16(%r12)		C high remainder limb
207	pop	%rbx
208	pop	%rbp
209	pop	%r12
210	pop	%r13
211	pop	%r14
212	mov	%r15, %rax		C return the most significant quotient limb
213	pop	%r15
214	ret
215
C Reached from the entry code when the top two numerator limbs are >= d1:d0.
C Sets the top quotient limb to 1 and subtracts the divisor from r9:r10.
216L(23):	inc	R32(%r15)
217	sub	%r8, %r10
218	sbb	%r11, %r9
219	jmp	L(2)
220
C Out of line fix-up for the loop, one version per loop variant.  Entered
C when the high remainder limb is >= d1.  If the two-limb remainder is
C >= d1:d0, the quotient limb is raised by one and the divisor subtracted.
221ifdef(`NEW',`
C Flags on entry come from comparing rbx with d1.
222L(fix):	seta	%dl			C dl = (rbx > d1)
223	cmp	%r8, %r14
224	setae	%al			C al = (r14 >= d0)
225	orb	%dl, %al
226	je	L(bck)			C rbx == d1 and r14 < d0: no fix-up
227	inc	%rdi
228	sub	%r8, %r14
229	sbb	%r11, %rbx
230	jmp	L(bck)
231',`
C Flags on entry come from comparing d1 with rax.
232L(fix):	jb	L(88)			C d1 < rax: fix-up needed
233	cmp	%r8, %r9
234	jb	L(bck)			C rax == d1 and r9 < d0: no fix-up
235L(88):	inc	%r10
236	sub	%r8, %r9
237	sbb	%r11, %rax
238	jmp	L(bck)
239')
240EPILOGUE()
241