dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
C     (measured on the speed difference between 25 and 50 limbs, which is
C     roughly the Karatsuba recursing range).


dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
dnl  some comments.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.
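C
C As a rough C model of the whole method (an illustration assuming
C 32-bit limbs and the <stdint.h> fixed-width types; sqr_model is a
C hypothetical name, not part of the build): form the cross products
C src[i]*src[j] for i<j, double them, then add the squares src[i]^2 on
C the diagonal.
C
C	void sqr_model (uint32_t *dst, const uint32_t *src, int n)
C	{
C	  int i, j;
C	  uint64_t t, sq;
C	  uint32_t cy, hi;
C
C	  for (i = 0; i < 2*n; i++)
C	    dst[i] = 0;
C	  for (i = 0; i < n-1; i++)	/* cross products at dst[i+j] */
C	    {
C	      cy = 0;
C	      for (j = i+1; j < n; j++)
C	        {
C	          t = (uint64_t) src[i] * src[j] + dst[i+j] + cy;
C	          dst[i+j] = (uint32_t) t;
C	          cy = (uint32_t) (t >> 32);
C	        }
C	      dst[i+n] = cy;
C	    }
C
C	  cy = 0;			/* double dst[1..2*n-2] */
C	  for (i = 1; i <= 2*n-2; i++)
C	    {
C	      hi = dst[i] >> 31;
C	      dst[i] = (dst[i] << 1) | cy;
C	      cy = hi;
C	    }
C	  dst[2*n-1] = cy;
C
C	  cy = 0;			/* add squares at dst[2*i] */
C	  for (i = 0; i < n; i++)
C	    {
C	      sq = (uint64_t) src[i] * src[i];
C	      t = (uint64_t) dst[2*i] + (uint32_t) sq + cy;
C	      dst[2*i] = (uint32_t) t;
C	      t = (uint64_t) dst[2*i+1] + (uint32_t) (sq >> 32) + (t >> 32);
C	      dst[2*i+1] = (uint32_t) t;
C	      cy = (uint32_t) (t >> 32);
C	    }
C	}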


defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax
	cmpl	$2, %ecx

	movl	PARAM_DST, %edx
	je	L(two_limbs)
	ja	L(three_or_more)


C------------------------------------------------------------------------------
C one limb only
	C eax	src
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%edx, 4(%ecx)
	movl	%eax, (%ecx)
	ret


C------------------------------------------------------------------------------
C
C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second set gets store-to-load forwarding.

	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst
deflit(`FRAME',0)

	pushl	%ebx		FRAME_pushl()
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)	C dst[1]

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)	C dst[2]
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)	C dst[3]

	mull	4(%ebx)		C src[0]*src[1]

	popl	%ebx

	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	ret


C------------------------------------------------------------------------------
defframe(SAVE_EBX,  -4)
defframe(SAVE_ESI,  -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(STACK_SPACE, 16)

L(three_or_more):
	subl	$STACK_SPACE, %esp
	cmpl	$4, %ecx
	jae	L(four_or_more)
deflit(`FRAME',STACK_SPACE)


C------------------------------------------------------------------------------
C Three limbs
C
C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcls to memory.

	C eax	src
	C ecx	size
	C edx	dst

	movl	%ebx, SAVE_EBX
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst
	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax
	movl	%edx, 4(%ecx)

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax
	movl	%edx, 12(%ecx)

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax
	movl	%edx, 20(%ecx)

	mull	4(%ebx)		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax
	movl	%edx, %edi

	mull	8(%ebx)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%ebp, SAVE_EBP
	movl	$0, %ebp

	movl	4(%ebx), %eax
	adcl	%edx, %ebp

	mull	8(%ebx)		C src[1] * src[2]

	xorl	%ebx, %ebx
	addl	%eax, %ebp

	C eax
	C ebx	zero, will be dst[5]
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	adcl	$0, %edx
	addl	%esi, %esi

	adcl	%edi, %edi
	movl	4(%ecx), %eax

	adcl	%ebp, %ebp

	adcl	%edx, %edx

	adcl	$0, %ebx
	addl	%eax, %esi
	movl	8(%ecx), %eax

	adcl	%eax, %edi
	movl	12(%ecx), %eax
	movl	%esi, 4(%ecx)

	adcl	%eax, %ebp
	movl	16(%ecx), %eax
	movl	%edi, 8(%ecx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	adcl	%eax, %edx
	movl	20(%ecx), %eax
	movl	%ebp, 12(%ecx)

	adcl	%ebx, %eax
	ASSERT(nc)
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	movl	%edx, 16(%ecx)
	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret


C------------------------------------------------------------------------------
L(four_or_more):

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.
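C
C As a rough C equivalent of this first pass (an illustration assuming
C 32-bit limbs and uint64_t intermediates, not part of the build):
C
C	cy = 0;
C	for (i = 1; i < size; i++)
C	  {
C	    t = (uint64_t) src[0] * src[i] + cy;
C	    dst[i] = (uint32_t) t;
C	    cy = (uint32_t) (t >> 32);
C	  }
C	dst[size] = cy;		/* stored just after the loop, below */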

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

defframe(`VAR_COUNTER',-20)
defframe(`VAR_JMP',    -24)
deflit(EXTRA_STACK_SPACE, 8)

	movl	%ebx, SAVE_EBX
	movl	%edi, SAVE_EDI
	leal	(%edx,%ecx,4), %edi	C &dst[size]

	movl	%esi, SAVE_ESI
	movl	%ebp, SAVE_EBP
	leal	(%eax,%ecx,4), %esi	C &src[size]

	movl	(%eax), %ebp		C multiplier
	movl	$0, %ebx
	decl	%ecx

	negl	%ecx
	subl	$EXTRA_STACK_SPACE, %esp
FRAME_subl_esp(EXTRA_STACK_SPACE)

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul_1)


C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
C
C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as in mpn_addmul_1, see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems no way to avoid this.
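C
C As a worked example of the jump arithmetic: the pass with multiplier
C src[n] (n = 1 .. size-4) works out to enter the unrolled code
C size-2-n chunks before UNROLL_INNER_END.  The first product
C src[n]*src[n+1] is computed at the outer loop top, each chunk then
C computes one further product, and the final low and high limbs are
C handled after the unrolled code.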

dnl  This value is also hard coded in some shifts and adds
deflit(CODE_BYTES_PER_LIMB, 17)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31, but above that an offset must be added to them.

deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))
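
dnl  For instance at the SQR_TOOM2_THRESHOLD_MAX of 66, UNROLL_COUNT is
dnl  63 and OFFSET is (63-31)*4 = 128, which brings the source
dnl  displacements -63*4+128 = -124 through -1*4+128 = 124 back within
dnl  the signed byte range -128..127 (the dst displacements are 4 less).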

dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.

deflit(UNROLL_INNER_END,
`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%ebx, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	negl	%ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

	movl	%ecx, %edx
	shll	$4, %ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')


	C The calculated jump mustn't come out to before the start of the
	C code available.  This is the limit UNROLL_COUNT puts on the src
	C operand size, but checked here directly using the jump address.
	ASSERT(ae,
	`movl_text_address(L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C------------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand

	movl	%edx, VAR_COUNTER

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl
	movl	%edx, %ebx	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse

	leal	CODE_BYTES_PER_LIMB(%edx), %eax
	xorl	%edx, %edx
	leal	4(%edi), %edi

	movl	%eax, VAR_JMP

	jmp	*%eax


ifdef(`PIC',`
L(pic_calc):
	addl	(%esp), %ecx
	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
	addl	%edx, %ecx
	ret_internal
')


	C Must be an even address to preserve the significance of the low
	C bit of the jump address indicating which way around ecx/ebx should
	C start.
	ALIGN(2)

L(unroll_inner_start):
	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(  addl,	%ecx, disp_dst,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx

',`
	dnl  this bit comes out last
Zdisp(  movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ecx

	mull	%ebp

Zdisp(	addl,	%ebx, disp_dst,(%edi))

ifelse(forloop_last,0,
`	movl	$0, %ebx')

	adcl	%eax, %ecx
')
')
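
C Each chunk generated above should assemble to exactly
C CODE_BYTES_PER_LIMB (17) bytes; Zdisp forces an explicit zero
C displacement byte when disp_src or disp_dst is 0, keeping every chunk
C the same size so the computed jump always lands on a chunk boundary.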

	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	adcl	$0, %edx
	addl	%ecx, -4+OFFSET(%edi)
	movl	VAR_JMP, %ecx

	adcl	$0, %edx

	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
	movl	VAR_COUNTER, %edx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C------------------------------------------------------------------------------
L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-4]

	movl	-12(%esi), %ebp
	movl	-8(%esi), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	movl	-4(%esi), %eax

	adcl	$0, %edx
	movl	%edx, %ebx
	movl	%eax, %esi

	mull	%ebp

	addl	%ebx, %eax

	adcl	$0, %edx
	addl	%eax, (%edi)
	movl	%esi, %eax

	adcl	$0, %edx
	movl	%edx, %ebx

	mull	%ecx

	addl	%ebx, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx
	movl	%edx, 8(%edi)



C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
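C
C A rough C equivalent (an illustration assuming 32-bit limbs; the code
C below does two limbs per iteration using rcl):
C
C	cy = 0;
C	for (i = 1; i <= 2*size-2; i++)
C	  {
C	    hi = dst[i] >> 31;
C	    dst[i] = (dst[i] << 1) | cy;
C	    cy = hi;
C	  }
C	dst[2*size-1] = cy;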

L(lshift_start):
	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edi
	xorl	%ecx, %ecx		C clear carry

	leal	(%edi,%eax,8), %edi
	notl	%eax			C -size-1, preserve carry

	leal	2(%eax), %eax		C -(size-1)

L(lshift):
	C eax	counter, negative
	C ebx
	C ecx
	C edx
	C esi
	C edi	dst, pointing just after last limb
	C ebp

	rcll	-4(%edi,%eax,8)
	rcll	(%edi,%eax,8)
	incl	%eax
	jnz	L(lshift)

	setc	%al

	movl	PARAM_SRC, %esi
	movl	%eax, -4(%edi)		C dst most significant limb

	movl	PARAM_SIZE, %ecx


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set yet, and just gets the low
C limb of src[0]^2.
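C
C A rough C equivalent (an illustration assuming 32-bit limbs and
C uint64_t intermediates sq and t, not part of the build):
C
C	sq = (uint64_t) src[0] * src[0];
C	dst[0] = (uint32_t) sq;
C	cy = (uint32_t) (sq >> 32);
C	for (i = 1; i < size; i++)
C	  {
C	    sq = (uint64_t) src[i] * src[i];
C	    t = (uint64_t) dst[2*i-1] + cy;
C	    dst[2*i-1] = (uint32_t) t;
C	    t = (uint64_t) dst[2*i] + (uint32_t) sq + (t >> 32);
C	    dst[2*i] = (uint32_t) t;
C	    cy = (uint32_t) (sq >> 32) + (uint32_t) (t >> 32);
C	  }
C	dst[2*size-1] += cy;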

	movl	(%esi), %eax		C src[0]

	mull	%eax

	leal	(%esi,%ecx,4), %esi	C src points just after last limb
	negl	%ecx

	movl	%eax, (%edi,%ecx,8)	C dst[0]
	incl	%ecx

L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	src just after last limb
	C edi	dst just after last limb
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, -4(%edi,%ecx,8)
	adcl	%eax, (%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, -4(%edi)		C dst most significant limb
	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret

EPILOGUE()