1dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C         divisor
24C       odd   even
25C K6:   10.0  12.0  cycles/limb
26C K6-2: 10.0  11.5
27
28
29C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
30C                      mp_limb_t divisor);
31C
32C A simple divl is used for size==1.  This is about 10 cycles faster for an
33C odd divisor or 20 cycles for an even divisor.
34C
35C The loops are quite sensitive to code alignment, speeds should be
36C rechecked (odd and even divisor, pic and non-pic) if contemplating
37C changing anything.
38
dnl  Parameters are addressed %esp-relative through defframe; the
dnl  FRAME_pushl() annotations in the code below keep these offsets valid
dnl  as callee-saved registers are pushed.
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space: PARAM_DST is copied into %edi before the main
dnl  loops, so its stack slot is free to hold the 32-bit divisor inverse.
define(VAR_INVERSE,`PARAM_DST')
46
	TEXT

	ALIGN(32)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

C  Strategy: for an odd divisor d, each quotient limb is q = s*inverse
C  mod 2^32 where inverse = d^-1 mod 2^32, and the borrow into the next
C  limb is the high half of q*d.  An even divisor first has its trailing
C  zero bits stripped; the source limbs are then shifted right by that
C  count (mmx psrlq across limb pairs) before the same multiply is used.

	movl	PARAM_SIZE, %ecx

	movl	PARAM_SRC, %eax
	xorl	%edx, %edx

	cmpl	$1, %ecx
	jnz	L(two_or_more)

	C size==1: a plain divl beats the inverse setup (see header comment)
	movl	(%eax), %eax

	divl	PARAM_DIVISOR

	movl	PARAM_DST, %ecx
	movl	%eax, (%ecx)

	ret


L(two_or_more):
	movl	PARAM_DIVISOR, %eax
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC, %ebx
	pushl	%ebp		FRAME_pushl()

C  Shift the divisor right until a 1 bit comes off (carry set); %edx
C  counts the iterations, leaving shift+1 (the extra 1 is removed below).
L(strip_twos):
	shrl	%eax
	incl	%edx			C will get shift+1

	jnc	L(strip_twos)
	pushl	%esi		FRAME_pushl()

	leal	1(%eax,%eax), %esi	C d without twos
	andl	$127, %eax		C d/2, 7 bits

C  The table lookup gives an inverse good to 8 bits; each Newton step
C  inv = 2*inv - inv*inv*d below doubles the precision, 8 -> 16 -> 32.
ifdef(`PIC',`
	LEA(	binvert_limb_table, %ebp)
Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')
	pushl	%edi		FRAME_pushl()

	leal	(%eax,%eax), %ebp	C 2*inv

	imull	%eax, %eax		C inv*inv

	movl	PARAM_DST, %edi

	imull	%esi, %eax		C inv*inv*d

	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d, 16 bits
	leal	(%ebp,%ebp), %eax	C 2*inv

	imull	%ebp, %ebp		C inv*inv

	movl	%esi, PARAM_DIVISOR	C d without twos
	leal	(%ebx,%ecx,4), %ebx	C src end

	imull	%esi, %ebp		C inv*inv*d

	leal	(%edi,%ecx,4), %edi	C dst end
	negl	%ecx			C -size

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d, 32 bits
	subl	$1, %edx		C shift amount, and clear carry

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	jnz	L(even)			C shift != 0, ie. divisor was even

	movl	(%ebx,%ecx,4), %esi	C src low limb
	jmp	L(odd_entry)


C  Odd divisor loop: the dependent chain is imull (inverse multiply) then
C  mull (to produce the borrow); the store and next-limb load overlap it.

	ALIGN(16)
	nop	C code alignment
L(odd_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1

	imull	%edx, %esi		C q = s * inverse mod 2^32

	movl	PARAM_DIVISOR, %eax
	movl	%esi, -4(%edi,%ecx,4)	C store quotient limb

	mull	%esi			C carry limb in edx

	subl	%ebp, %edx		C apply carry bit
	movl	(%ebx,%ecx,4), %esi	C next src limb

L(odd_entry):
	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(odd_top)


	C final limb: only the inverse multiply, no borrow to propagate
	imull	%edx, %esi

	movl	%esi, -4(%edi,%ecx,4)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret


L(even):
	C eax
	C ebx	src end
	C ecx	-size
	C edx	twos
	C esi
	C edi	dst end
	C ebp

	C NOTE: Zdisp forces an explicit zero displacement byte on the movq,
	C presumably for code size/alignment -- see the x86 support macros.
	xorl	%ebp, %ebp		C initial carry bit 0
Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]

	movd	%edx, %mm7		C shift count for psrlq
	movl	VAR_INVERSE, %edx

	addl	$2, %ecx		C ecx = -(size-2), sets ZF if size==2
	psrlq	%mm7, %mm0		C shift stripped twos out of src[0]

	movd	%mm0, %esi
	jz	L(even_two)		C if only two limbs


C Out-of-order execution is good enough to hide the load/rshift/movd
C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
C been found.  Maybe the fact every second movq is unaligned costs the extra
C 0.5.

L(even_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1
	C
	C mm0	scratch, source limbs
	C mm7	twos

	imull	%edx, %esi		C q = s * inverse mod 2^32

	movl	%esi, -8(%edi,%ecx,4)	C store quotient limb
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movq	-4(%ebx,%ecx,4), %mm0	C next src limb pair (unaligned)
	psrlq	%mm7, %mm0		C shift out the stripped twos

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(even_top)


L(even_two):
	C last two limbs: the high limb is loaded alone (movd) so the
	C shifted pair doesn't read past the end of src
	movd	-4(%ebx), %mm0		C src high limb
	psrlq	%mm7, %mm0

	imull	%edx, %esi		C second last quotient limb

	movl	%esi, -8(%edi)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	movl	VAR_INVERSE, %eax
	subl	%edx, %esi		C apply carry limb

	imull	%eax, %esi		C high quotient limb

	movl	%esi, -4(%edi)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	emms_or_femms			C clear mmx state (femms where available)

	ret

EPILOGUE()
271