dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C     product (measured on the speed difference between 17 and 33 limbs,
C     which is roughly the Karatsuba recursing range).


dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl  code supports.  This value is used only by the tune program to know
dnl  what it can go up to.  (An attempt to compile with a bigger value will
dnl  trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl  The value is determined by requiring the displacements in the unrolled
dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)


dnl  Allow a value from the tune program to override config.m4.

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')


dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl  The first addmul is the biggest, and this takes the second least
dnl  significant limb and multiplies it by the third least significant and
dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
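dnl
dnl  For example, at the maximum SQR_TOOM2_THRESHOLD of 66 the biggest
dnl  operand is 65 limbs, and the first addmul multiplies src[1] by the
dnl  63 limbs src[2..64], so UNROLL_COUNT must reach 66-3 = 63.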

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which, if understood, could give a simple speedup.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax

	cmpl	$2, %ecx
	je	L(two_limbs)

	movl	PARAM_DST, %edx
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src
deflit(`FRAME',4)

	movl	(%ebx), %eax
	movl	PARAM_DST, %ecx

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0]*src[1]

	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	popl	%ebx
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
deflit(`FRAME',0)
	cmpl	$4, %ecx
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs
	C eax	src
	C ecx	size
	C edx	dst

	pushl	%ebx
	movl	%eax, %ebx	C src

	movl	(%ebx), %eax
	movl	%edx, %ecx	C dst

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)
	pushl	%esi

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax

	movl	%edx, 12(%ecx)
	pushl	%edi

	mull	%eax		C src[2]^2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax

	movl	%edx, 20(%ecx)
	movl	4(%ebx), %edx

	mull	%edx		C src[0]*src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax

	movl	%edx, %edi
	movl	8(%ebx), %edx

	pushl	%ebp
	xorl	%ebp, %ebp

	mull	%edx		C src[0]*src[2]

	addl	%eax, %edi
	movl	4(%ebx), %eax

	adcl	%edx, %ebp

	movl	8(%ebx), %edx

	mull	%edx		C src[1]*src[2]

	addl	%eax, %ebp

	adcl	$0, %edx


	C eax	will be dst[5]
	C ebx
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	xorl	%eax, %eax
	addl	%esi, %esi
	adcl	%edi, %edi
	adcl	%ebp, %ebp
	adcl	%edx, %edx
	adcl	$0, %eax

	addl	%esi, 4(%ecx)
	adcl	%edi, 8(%ecx)
	adcl	%ebp, 12(%ecx)

	popl	%ebp
	popl	%edi

	adcl	%edx, 16(%ecx)

	popl	%esi
	popl	%ebx

	adcl	%eax, 20(%ecx)
	ASSERT(nc)

	ret


C -----------------------------------------------------------------------------

defframe(SAVE_EBX,   -4)
defframe(SAVE_ESI,   -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(STACK_SPACE, 24)

	ALIGN(16)
L(four_or_more):

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.
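C
C Each pass of the loop below forms src[i]*src[0] + carry, stores the low
C limb at dst[i] and keeps the high limb in esi as the next carry; the
C final carry is stored at dst[size] after the loop.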

	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	leal	4(%edx), %edi

	movl	%ebx, SAVE_EBX
	leal	4(%eax), %ebx

	movl	%esi, SAVE_ESI
	xorl	%esi, %esi

	movl	%ebp, SAVE_EBP

	C eax
	C ebx	src+4
	C ecx	size
	C edx
	C esi
	C edi	dst+4
	C ebp

	movl	(%eax), %ebp	C multiplier
	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary


	ALIGN(16)
L(mul_1):
	C eax	scratch
	C ebx	src ptr
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst ptr
	C ebp	multiplier

	movl	(%ebx), %eax
	addl	$4, %ebx

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi)
	addl	$4, %edi

	loop	L(mul_1)


C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is actually
C good here because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.
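C
C As a worked example of the address calculation: the leal below forms
C
C	L(unroll_inner_end) - 2*CODE_BYTES_PER_LIMB - 15*(size-4)
C
C and the extra CODE_BYTES_PER_LIMB added at L(unroll_outer_top) makes the
C first target L(unroll_inner_end) - (size-3)*CODE_BYTES_PER_LIMB, ie.
C size-3 chunks executed, the first multiplicand limb of each addmul being
C handled by the plain mull ahead of the indirect jump.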


dnl  This value is also implicitly encoded in a shift and add.
dnl
deflit(CODE_BYTES_PER_LIMB, 15)
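dnl
dnl  (The shift and add: size-4 is shifted left by 4 and negated in ecx,
dnl  then size-4 in edx is added back by the leal below, for a net
dnl  -16n+n = -15n bytes.)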

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31.  Above that an offset must be added to them.
dnl
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))
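dnl
dnl  For instance at the maximum UNROLL_COUNT of 63, OFFSET is
dnl  (63-31)*4 = 128, moving disp_src into -124..124 and disp_dst into
dnl  -128..120; without it disp_dst would reach -4*63-4 = -256, outside
dnl  a signed byte.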

	C eax
	C ebx	&src[size]
	C ecx
	C edx
	C esi	carry
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%esi, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %ebx')

	shll	$4, %ecx
ifelse(OFFSET,0,,
`	subl	$OFFSET, %edi')

	negl	%ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx


	C The calculated jump mustn't be before the start of the available
	C code.  This is the limitation UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.
	C
	ASSERT(ae,`
	movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx
	')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	&src[size], constant
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	high limb to store
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

	testb	$1, %cl

	movl	%edx, %esi	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	movl	%edx, VAR_JMP
	leal	4(%edi), %edi

	C A branch-free version of this using some xors was found to be a
	C touch slower than just a conditional jump, despite the jump
	C switching between taken and not taken on every loop.

ifelse(eval(UNROLL_COUNT%2),0,
	jz,jnz)	L(unroll_noswap)
	movl	%esi, %eax	C high,low carry other way around

	movl	%ecx, %esi
	movl	%eax, %ecx
L(unroll_noswap):

	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/esi should start.
	C
	C An attempt was made at padding here to get the end of the unrolled
	C code to come out on a good alignment, to save padding before
	C L(corner).  This worked, but turned out to run slower than just an
	C ALIGN(2).  The reason for this is not clear; it might be related
	C to the different speeds on different UNROLL_COUNTs noted above.

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	src
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	dst
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/esi swapped on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%esi, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0( %esi)
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%ebx), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0( %ecx)
')
')
L(unroll_inner_end):

	addl	%esi, -4+OFFSET(%edi)

	movl	VAR_COUNTER, %edx
	jadcl0(	%ecx)

	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %ebx
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C ebx	&src[size]
	C edi	&dst[2*size-4]

	movl	-12(%ebx), %ebp

	movl	-8(%ebx), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	adcl	$0, %edx

	movl	-4(%ebx), %eax
	movl	%edx, %esi
	movl	%eax, %ebx

	mull	%ebp

	addl	%esi, %eax
	adcl	$0, %edx

	addl	%eax, (%edi)
	adcl	$0, %edx

	movl	%edx, %esi
	movl	%ebx, %eax

	mull	%ecx

	addl	%esi, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx

	movl	%edx, 8(%edi)


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C The loop measures about 6 cycles/iteration, though it looks like it should
C decode in 5.
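C
C In terms of the generic code this is roughly
C dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1), done two limbs
C per iteration with rcl so the carry flag carries the bit between limbs.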

L(lshift_start):
	movl	PARAM_SIZE, %ecx

	movl	PARAM_DST, %edi
	subl	$1, %ecx		C size-1 and clear carry

	movl	PARAM_SRC, %ebx
	movl	%ecx, %edx

	xorl	%eax, %eax		C ready for adcl


	ALIGN(16)
L(lshift):
	C eax
	C ebx	src (for later use)
	C ecx	counter, decrementing
	C edx	size-1 (for later use)
	C esi
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)
	leal	8(%edi), %edi
	loop	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%ebx), %eax		C src[0]

	leal	4(%ebx,%edx,4), %ebx	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.
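C
C Each pass of the loop adds the high limb of the previous square into
C dst[2*i-1], the low limb of src[i]^2 into dst[2*i], and leaves the new
C high limb plus any carry in edx for the next pass.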


	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


	ALIGN(16)
L(diag):
	C eax	scratch
	C ebx	&src[size]
	C ecx	counter, negative
	C edx	carry
	C esi	scratch
	C edi	dst[2*size-2]
	C ebp

	movl	(%ebx,%ecx,4), %eax
	movl	%edx, %esi

	mull	%eax

	addl	%esi, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret



C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
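	C
	C This computes the same address as the non-PIC leal above: ecx
	C (still -16*(size-4)) plus the return address L(here), plus the
	C constant distance from L(here) to the entry points, plus edx
	C (still size-4).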
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()