1dnl  AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
2
3dnl  Copyright 2000, 2001, 2002, 2009 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K7: 6.75 cycles/bit (approx)  1x1 gcd
24C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
25
26
27dnl  Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
28dnl  where x is the larger of the two.  See tune/README for more.
29dnl
30dnl  divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
31dnl  suggests 40/7*2=11.4 but 7 seems to be about right.
32
33deflit(DIV_THRESHOLD, 7)
34
35
36C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
37C
38C This is mixed in with the code, but as per the k7 optimization manual it's
39C a full cache line and suitably aligned so it won't get swapped between
40C code and data.  Having it in TEXT rather than RODATA saves needing a GOT
41C entry when PIC.
42C
43C Actually, there doesn't seem to be a measurable difference between this in
44C it's own cache line or plonked in the middle of the code.  Presumably
45C since TEXT is read-only there's no worries about coherency.
46
47deflit(MAXSHIFT, 6)
48deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
49
50	TEXT
51	ALIGN(64)
52L(table):
53	.byte	MAXSHIFT
54forloop(i,1,MASK,
55`	.byte	m4_count_trailing_zeros(i)
56')
57
58
59C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
60C
61
62defframe(PARAM_LIMB,   12)
63defframe(PARAM_SIZE,    8)
64defframe(PARAM_SRC,     4)
65
66defframe(SAVE_EBX,     -4)
67defframe(SAVE_ESI,     -8)
68defframe(SAVE_EDI,    -12)
69defframe(SAVE_EBP,    -16)
70defframe(CALL_DIVISOR,-20)
71defframe(CALL_SIZE,   -24)
72defframe(CALL_SRC,    -28)
73
74deflit(STACK_SPACE, 28)
75
76	TEXT
77	ALIGN(16)
78
79PROLOGUE(mpn_gcd_1)
80deflit(`FRAME',0)
81
82	ASSERT(ne, `cmpl $0, PARAM_LIMB')	C y!=0
83	ASSERT(ae, `cmpl $1, PARAM_SIZE')	C size>=1
84
85	mov	PARAM_SRC, %eax
86	mov	PARAM_LIMB, %edx
87	sub	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
88
89	mov	%esi, SAVE_ESI
90	mov	%ebx, SAVE_EBX
91
92	mov	(%eax), %esi		C src low limb
93
94ifdef(`PIC',`
95	mov	%edi, SAVE_EDI
96	call	L(movl_eip_to_edi)
97L(here):
98	add	$L(table)-L(here), %edi
99')
100
101	mov	%esi, %ebx
102	or	%edx, %esi	C x|y
103	mov	$-1, %ecx
104
105L(twos):
106	inc	%ecx
107	shr	%esi
108	jnc	L(twos)		C 3/4 chance of x or y odd already
109
110	shr	%cl, %ebx
111	shr	%cl, %edx
112	mov	%ecx, %esi	C common twos
113
114	mov	PARAM_SIZE, %ecx
115	cmp	$1, %ecx
116	ja	L(divide)
117
118
119	C eax
120	C ebx	x
121	C ecx
122	C edx	y
123	C esi	common twos
124	C edi	[PIC] L(table)
125	C ebp
126
127	mov	%edx, %eax
128	cmp	%ebx, %edx
129
130	cmovb(	%ebx, %eax)	C swap to make x bigger than y
131	cmovb(	%edx, %ebx)
132
133
134L(strip_y):
135	C eax	x
136	C ebx	y
137	C ecx
138	C edx
139	C esi	common twos
140	C edi	[PIC] L(table)
141	C ebp
142
143	ASSERT(nz,`orl %ebx,%ebx')
144	shr	%ebx
145	jnc	L(strip_y)
146	rcl	%ebx
147
148
149	C eax	x
150	C ebx	y (odd)
151	C ecx
152	C edx
153	C esi	common twos
154	C edi	[PIC] L(table)
155	C ebp
156
157	mov	%eax, %ecx
158	mov	%ebx, %edx
159	shr	$DIV_THRESHOLD, %eax
160
161	cmp	%eax, %ebx
162	mov	%ecx, %eax
163	ja	L(strip_x_entry)	C do x%y if x much bigger than y
164
165
166	xor	%edx, %edx
167
168	div	%ebx
169
170	or	%edx, %edx
171	mov	%edx, %ecx		C remainder -> x
172	mov	%ebx, %edx		C y
173
174	jz	L(done_ebx)
175	jmp	L(strip_x)
176
177
178	C Offset 0x9D here for non-PIC.  About 0.4 cycles/bit is saved by
179	C ensuring the end of the jnz at the end of this loop doesn't cross
180	C into the next cache line at 0xC0.
181	C
182	C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
183	C it crosses but doesn't suffer any measurable slowdown.
184
185L(top):
186	C eax	x
187	C ebx	y-x
188	C ecx	x-y
189	C edx	y
190	C esi	twos, for use at end
191	C edi	[PIC] L(table)
192
193	cmovc(	%ebx, %ecx)		C if x-y gave carry, use x and y-x
194	cmovc(	%eax, %edx)
195
196L(strip_x):
197	mov	%ecx, %eax
198L(strip_x_entry):
199	and	$MASK, %ecx
200
201	ASSERT(nz, `orl %eax, %eax')
202
203ifdef(`PIC',`
204	mov	(%ecx,%edi), %cl
205',`
206	mov	L(table) (%ecx), %cl
207')
208
209	shr	%cl, %eax
210	cmp	$MAXSHIFT, %cl
211
212	mov	%eax, %ecx
213	mov	%edx, %ebx
214	je	L(strip_x)
215
216	ASSERT(nz, `test $1, %eax')	C both odd
217	ASSERT(nz, `test $1, %edx')
218
219	sub	%eax, %ebx
220	sub	%edx, %ecx
221	jnz	L(top)
222
223
224L(done):
225	mov	%esi, %ecx
226	mov	SAVE_ESI, %esi
227ifdef(`PIC',`
228	mov	SAVE_EDI, %edi
229')
230
231	shl	%cl, %eax
232	mov	SAVE_EBX, %ebx
233	add	$FRAME, %esp
234
235	ret
236
237
238
239C -----------------------------------------------------------------------------
240C two or more limbs
241
242dnl  MODEXACT_THRESHOLD is the size at which it's better to call
243dnl  mpn_modexact_1_odd than do an inline loop.
244
245deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
246
247L(divide):
248	C eax	src
249	C ebx
250	C ecx	size
251	C edx	y
252	C esi	common twos
253	C edi	[PIC] L(table)
254	C ebp
255
256L(divide_strip_y):
257	ASSERT(nz,`or %edx,%edx')
258	shr	%edx
259	jnc	L(divide_strip_y)
260	lea	1(%edx,%edx), %ebx		C y now odd
261
262	mov	%ebp, SAVE_EBP
263	mov	%eax, %ebp
264	mov	-4(%eax,%ecx,4), %eax		C src high limb
265
266	cmp	$MODEXACT_THRESHOLD, %ecx
267	jae	L(modexact)
268
269	cmp	%ebx, %eax			C high cmp divisor
270	mov	$0, %edx
271
272	cmovc(	%eax, %edx)			C skip a div if high<divisor
273	sbb	$0, %ecx
274
275
276L(divide_top):
277	C eax	scratch (quotient)
278	C ebx	y
279	C ecx	counter (size to 1, inclusive)
280	C edx	carry (remainder)
281	C esi	common twos
282	C edi	[PIC] L(table)
283	C ebp	src
284
285	mov	-4(%ebp,%ecx,4), %eax
286
287	div	%ebx
288
289	dec	%ecx
290	jnz	L(divide_top)
291
292
293	C eax
294	C ebx	y (odd)
295	C ecx
296	C edx	x
297	C esi	common twos
298	C edi	[PIC] L(table)
299	C ebp
300
301	or	%edx, %edx
302	mov	SAVE_EBP, %ebp
303	mov	%edx, %eax
304
305	mov	%edx, %ecx
306	mov	%ebx, %edx
307	jnz	L(strip_x_entry)
308
309
310L(done_ebx):
311	mov	%ebx, %eax
312	jmp	L(done)
313
314
315
316L(modexact):
317	C eax
318	C ebx	y
319	C ecx	size
320	C edx
321	C esi	common twos
322	C edi	[PIC] L(table)
323	C ebp	src
324
325ifdef(`PIC',`
326	mov	%ebp, CALL_SRC
327	mov	%ebx, %ebp		C y
328	mov	%edi, %ebx		C L(table)
329
330	add	$_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
331	mov	%ebp, CALL_DIVISOR
332	mov	%ecx, CALL_SIZE
333
334	call	GSYM_PREFIX`'mpn_modexact_1_odd@PLT
335',`
336dnl non-PIC
337	mov	%ebx, CALL_DIVISOR
338	mov	%ebp, CALL_SRC
339	mov	%ecx, CALL_SIZE
340
341	call	GSYM_PREFIX`'mpn_modexact_1_odd
342')
343
344	C eax	x
345	C ebx	[non-PIC] y
346	C ecx
347	C edx
348	C esi	common twos
349	C edi	[PIC] L(table)
350	C ebp	[PIC] y
351
352	or	%eax, %eax
353	mov	ifdef(`PIC',`%ebp',`%ebx'), %edx
354	mov	SAVE_EBP, %ebp
355
356	mov	%eax, %ecx
357	jnz	L(strip_x_entry)
358
359	mov	%edx, %eax
360	jmp	L(done)
361
362
363ifdef(`PIC', `
364L(movl_eip_to_edi):
365	mov	(%esp), %edi
366	ret_internal
367')
368
369EPILOGUE()
370