dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.

dnl  Copyright 1999-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
C     unrolling).



dnl  K6: UNROLL_COUNT cycles/product (approx)
dnl           8           9.75
dnl          16           9.3
dnl          32           9.3
dnl  Maximum possible with the current code is 32.
dnl
dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
dnl  might explain its good performance.

deflit(UNROLL_COUNT, 16)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() entry code only
C once.  The saving is about 10-20% on typical sizes coming from the
C Karatsuba multiply code.
C
C Enhancements:
C
C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
C
C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
C would be 256 bytes.
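C
C As a rough C sketch, the overall structure is (an illustration only, in
C terms of the real mpn_mul_1 and mpn_addmul_1 entrypoints; the code below
C inlines both loops rather than calling them):
C
C	wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
C	for (i = 1; i < ysize; i++)
C	  wp[xsize+i] = mpn_addmul_1 (wp+i, xp, xsize, yp[i]);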

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	cmpl	$2, %ecx
	ja	L(xsize_more_than_two_limbs)
	je	L(two_by_something)


	C one limb by one limb

	movl	(%edx), %edx	C xp low limb
	movl	PARAM_WP, %ecx

	mull	%edx

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
	decl	PARAM_YSIZE
	pushl	%ebx
deflit(`FRAME',4)

	movl	PARAM_WP, %ebx
	pushl	%esi
deflit(`FRAME',8)

	movl	%eax, %ecx	C yp low limb
	movl	(%edx), %eax	C xp low limb

	movl	%edx, %esi	C xp
	jnz	L(two_by_two)


	C two limbs by one limb

	mull	%ecx

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	movl	%edx, %esi	C carry

	mull	%ecx

	addl	%eax, %esi
	movl	%esi, 4(%ebx)

	adcl	$0, %edx

	movl	%edx, 8(%ebx)
	popl	%esi

	popl	%ebx
	ret



C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp
deflit(`FRAME',8)

	mull	%ecx		C xp[0] * yp[0]

	push	%edi
deflit(`FRAME',12)
	movl	%eax, (%ebx)

	movl	4(%esi), %eax
	movl	%edx, %edi	C carry, for wp[1]

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx

	movl	%edi, 4(%ebx)
	movl	4(%ecx), %ecx	C yp[1]

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx

	movl	(%esi), %eax	C xp[0]
	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	addl	%eax, 4(%ebx)
	adcl	%edx, %edi
	adcl	$0, %esi

	movl	%edi, 8(%ebx)
	popl	%edi

	movl	%esi, 12(%ebx)
	popl	%esi

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two_limbs):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 times).  A call to the actual
C mpn_mul_1 would be slowed down by the call overhead and the parameter
C pushing and popping, and doesn't seem likely to be worthwhile for the
C typical 10-20 limb operations the Karatsuba code calls here with.
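C
C In C terms the inline loop amounts to the following (a sketch only;
C umul_ppmm is the usual high:low = a*b double-limb multiply, as in
C longlong.h):
C
C	carry = 0;
C	for (i = 0; i < xsize; i++)
C	  {
C	    umul_ppmm (high, low, xp[i], yp[0]);
C	    low += carry;
C	    carry = high + (low < carry);
C	    wp[i] = low;
C	  }
C	wp[xsize] = carry;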

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp
deflit(`FRAME',0)

	pushl	%edi		defframe_pushl(SAVE_EDI)
	pushl	%ebp		defframe_pushl(SAVE_EBP)

	movl	PARAM_WP, %edi
	pushl	%esi		defframe_pushl(SAVE_ESI)

	movl	%eax, %ebp
	pushl	%ebx		defframe_pushl(SAVE_EBX)

	leal	(%edx,%ecx,4), %ebx	C xp end
	xorl	%esi, %esi

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx


L(mul1):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	incl	%ecx

	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)		C final carry

	movl	PARAM_XSIZE, %ecx
	decl	%edx

	jnz	L(ysize_more_than_one_limb)

	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


L(ysize_more_than_one_limb):
	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
C Simple addmul loop.
C
C Using ebx and edi pointing at the ends of their respective areas saves a
C couple of instructions in the outer loop.  The inner loop is still 11
C cycles, the same as the simple loop in aorsmul_1.asm.
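C
C Each outer pass is the usual addmul, in C terms (a sketch; umul_ppmm as
C in longlong.h):
C
C	carry = 0;
C	for (i = 0; i < xsize; i++)
C	  {
C	    umul_ppmm (high, low, xp[i], multiplier);
C	    low += carry;
C	    high += (low < carry);
C	    wp[i] += low;
C	    carry = high + (wp[i] < low);
C	  }
C	wp[xsize] = carry;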

	C eax	yp
	C ebx	xp end
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1
	C ebp

	movl	4(%eax), %ebp		C multiplier
	negl	%ecx

	movl	%ecx, PARAM_XSIZE	C -xsize
	xorl	%esi, %esi		C initial carry

	leal	4(%eax,%edx,4), %eax	C yp end
	negl	%edx

	movl	%eax, PARAM_YP
	movl	%edx, PARAM_YSIZE

	jmp	L(simple_outer_entry)


	C aligning here saves a couple of cycles
	ALIGN(16)
L(simple_outer_top):
	C edx	ysize counter, negative

	movl	PARAM_YP, %eax		C yp end
	xorl	%esi, %esi		C carry

	movl	PARAM_XSIZE, %ecx	C -xsize
	movl	%edx, PARAM_YSIZE

	movl	(%eax,%edx,4), %ebp	C yp limb multiplier
L(simple_outer_entry):
	addl	$4, %edi


L(simple_inner):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of this addmul
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	$0, %edx
	addl	%eax, (%edi,%ecx,4)
	adcl	%edx, %esi

	incl	%ecx
	jnz	L(simple_inner)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)

	incl	%edx
	jnz	L(simple_outer_top)


	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


C -----------------------------------------------------------------------------
C Unrolled loop.
C
C The unrolled inner loop is the same as in aorsmul_1.asm; see that code
C for some comments.
C
C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
C 0, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled
C loop is entered.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.  This can't just be fetched through the xp
C pointer because of the offset applied to it.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
C to give the starting point in the destination for each unrolled loop (this
C point is one limb upwards for each limb of yp processed).
C
C Having PARAM_YSIZE count from negative up to zero means it's not necessary
C to store new values of PARAM_YP and PARAM_WP on each loop.  Those values
C on the stack remain constant, and on each loop a leal adjusts them with
C the PARAM_YSIZE counter value.
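C
C The computed jump enters the 16-limb unrolled block partway through, in
C the style of Duff's device: one limb is multiplied before the jump, and
C the entry point skips 15 code bytes for each limb not wanted on the
C first pass.  Schematically, in C (a sketch only, using gcc computed-goto
C syntax for the indirect jump):
C
C	skip = (-(xsize-1)) & UNROLL_MASK;   C limbs to skip
C	jmp  = &&unroll_entry + 15*skip;     C 15 code bytes per limb
C	goto *jmp;                           C the "jmp *%eax" below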


defframe(VAR_COUNTER,      -20)
defframe(VAR_COUNTER_INIT, -24)
defframe(VAR_JMP,          -28)
defframe(VAR_XP_LOW,       -32)
deflit(VAR_STACK_SPACE, 16)

dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
dnl  slower in this code, hence the defframe empty-if-zero feature is
dnl  disabled.
dnl
dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
dnl  anything isn't clear.
dnl
define(`defframe_empty_if_zero_disabled',1)

L(unroll):
	C eax	yp (not used)
	C ebx	xp end (not used)
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1 (not used)
	C ebp
deflit(`FRAME', 16)

	leal	-2(%ecx), %ebp	C one limb processed at start,
	decl	%ecx		C and ebp is one less

	shrl	$UNROLL_LOG2, %ebp
	negl	%ecx

	subl	$VAR_STACK_SPACE, %esp
deflit(`FRAME', 16+VAR_STACK_SPACE)
	andl	$UNROLL_MASK, %ecx

	movl	%ecx, %esi
	shll	$4, %ecx

	movl	%ebp, VAR_COUNTER_INIT
	negl	%esi

	C 15 code bytes per limb
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
')

	movl	PARAM_XP, %ebx
	movl	%ebp, VAR_COUNTER

	movl	PARAM_WP, %edi
	movl	%ecx, VAR_JMP

	movl	(%ebx), %eax
	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1

	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling

	movl	%eax, VAR_XP_LOW

	movl	%ebx, PARAM_XP
	movl	PARAM_YP, %ebx

	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
	movl	4(%ebx), %ebp		C multiplier (yp second limb)

	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing

	movl	%ecx, PARAM_WP

	leal	1(%esi), %ecx	C adjust parity for decl %ecx above

	movl	%ebx, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%esi,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx
	ret_internal
')


C -----------------------------------------------------------------------------
	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
	C cost any extra space, since the inner unrolled loop below is
	C aligned to 32.
	ALIGN(32)
L(unroll_outer_top):
	C edx	ysize

	movl	PARAM_YP, %eax
	movl	%edx, PARAM_YSIZE	C incremented ysize counter

	movl	PARAM_WP, %edi

	movl	VAR_COUNTER_INIT, %ebx
	movl	(%eax,%edx,4), %ebp	C next multiplier

	movl	PARAM_XSIZE, %ecx
	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp

	movl	VAR_XP_LOW, %eax
	movl	%ebx, VAR_COUNTER

L(unroll_outer_entry):
	mull	%ebp

	C using testb is a tiny bit faster than testl
	testb	$1, %cl

	movl	%eax, %ecx	C low carry
	movl	VAR_JMP, %eax

	movl	%edx, %esi	C high carry
	movl	PARAM_XP, %ebx

	jnz	L(unroll_noswap)
	movl	%ecx, %esi	C high,low carry other way around

	movl	%edx, %ecx
L(unroll_noswap):

	jmp	*%eax



C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	scratch
	C ebx	xp
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	wp
	C ebp	multiplier
	C VAR_COUNTER  loop counter
	C
	C 15 code bytes each limb

	leal	UNROLL_BYTES(%edi), %edi

L(unroll_entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4))
	deflit(`disp1', eval(disp0 + 4))
	deflit(`disp2', eval(disp1 + 4))

	movl	disp1(%ebx), %eax
	mull	%ebp
Zdisp(	addl,	%ecx, disp0,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0( %ecx)

	movl	disp2(%ebx), %eax
	mull	%ebp
	addl	%esi, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0( %esi)
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%ebx), %ebx

	jns	L(unroll_top)


	movl	PARAM_YSIZE, %edx
	addl	%ecx, UNROLL_BYTES(%edi)

	adcl	$0, %esi

	incl	%edx
	movl	%esi, UNROLL_BYTES+4(%edi)

	jnz	L(unroll_outer_top)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	addl	$FRAME, %esp
	ret

EPILOGUE()