1dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
2
3dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
24C     limbs/loop unrolling).
25
26
27
28dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
29dnl           8           4.67
30dnl          16           4.59
31dnl          32           4.42
32dnl  Maximum possible with the current code is 32.
33dnl
34dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
35dnl  done with a straight run through a block of code, no inner loop.  Using
36dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
37dnl  UNROLL_COUNT is the limb count of one full pass of the unrolled inner loop (UNROLL_COUNT/CHUNK_COUNT chunks).
38deflit(UNROLL_COUNT, 32)
39
40
41C void mpn_mul_basecase (mp_ptr wp,
42C                        mp_srcptr xp, mp_size_t xsize,
43C                        mp_srcptr yp, mp_size_t ysize);
44C
45C Calculate xp,xsize multiplied by yp,ysize, storing the result in
46C wp,xsize+ysize.
47C
48C This routine is essentially the same as mpn/generic/mul_basecase.c, but
49C it's faster because it does most of the mpn_addmul_1() startup
50C calculations only once.  The saving is 15-25% on typical sizes coming from
51C the Karatsuba multiply code.
52
53ifdef(`PIC',`
54deflit(UNROLL_THRESHOLD, 5)
55',`
56deflit(UNROLL_THRESHOLD, 5)
57')	C xsize >= UNROLL_THRESHOLD selects the unrolled addmul loop; PIC and non-PIC values are currently equal
58C Argument slots in prototype order (wp, xp, xsize, yp, ysize); offsets presumably relative to %esp at entry - confirm against defframe
59defframe(PARAM_YSIZE,20)
60defframe(PARAM_YP,   16)
61defframe(PARAM_XSIZE,12)
62defframe(PARAM_XP,   8)
63defframe(PARAM_WP,   4)
64
65	TEXT
66	ALIGN(32)
67PROLOGUE(mpn_mul_basecase)
68deflit(`FRAME',0)
69	C Entry: load xsize, xp and yp[0], then dispatch on xsize (above 2, equal to 2, or below 2).
70	movl	PARAM_XSIZE, %ecx	C ecx = xsize
71	movl	PARAM_YP, %eax
72
73	movl	PARAM_XP, %edx	C edx = xp
74	movl	(%eax), %eax	C yp low limb
75
76	cmpl	$2, %ecx	C one compare feeds both branches below
77	ja	L(xsize_more_than_two)
78	je	L(two_by_something)
79
80	C xsize below 2 falls through; ysize is not read here, presumably callers ensure xsize >= ysize >= 1 - confirm
81	C one limb by one limb
82
83	mull	(%edx)	C edx:eax = yp[0] * xp[0]
84
85	movl	PARAM_WP, %ecx
86	movl	%eax, (%ecx)	C wp[0] = low limb
87	movl	%edx, 4(%ecx)	C wp[1] = high limb
88	ret
89
90
91C -----------------------------------------------------------------------------
92L(two_by_something):	C xsize == 2 path; on entry eax = yp[0], edx = xp
93deflit(`FRAME',0)
94	decl	PARAM_YSIZE	C ysize-1 written back to the argument slot; ZF is consumed by the jnz below
95	pushl	%ebx		defframe_pushl(`SAVE_EBX')
96	movl	%eax, %ecx	C yp low limb
97
98	movl	PARAM_WP, %ebx
99	pushl	%esi		defframe_pushl(`SAVE_ESI')
100	movl	%edx, %esi	C xp
101
102	movl	(%edx), %eax	C xp low limb
103	jnz	L(two_by_two)	C ysize != 1 (pushl and movl leave the flags from decl alone)
104
105
106	C two limbs by one limb
107
108	mull	%ecx	C xp[0] * yp[0]
109
110	movl	%eax, (%ebx)	C wp[0]
111	movl	4(%esi), %eax	C xp[1]
112	movl	%edx, %esi	C carry
113
114	mull	%ecx	C xp[1] * yp[0]
115
116	addl	%eax, %esi	C low limb plus carry, for wp[1]
117
118	movl	%esi, 4(%ebx)
119	movl	SAVE_ESI, %esi
120
121	adcl	$0, %edx	C carry from the addl above; the movl in between do not change flags
122
123	movl	%edx, 8(%ebx)	C wp[2]
124	movl	SAVE_EBX, %ebx
125	addl	$FRAME, %esp	C release the pushed slots (FRAME presumably tracked by defframe_pushl)
126
127	ret
128
129
130
131C -----------------------------------------------------------------------------
132C Could load yp earlier into another register.
133C Two limbs by two limbs: wp[0..3] = xp[0..1] * yp[0..1], built from four 32x32 products.
134	ALIGN(16)
135L(two_by_two):
136	C eax	xp low limb
137	C ebx	wp
138	C ecx	yp low limb
139	C edx
140	C esi	xp
141	C edi
142	C ebp
143
144dnl  FRAME carries on from previous
145
146	mull	%ecx		C xp[0] * yp[0]
147
148	push	%edi		defframe_pushl(`SAVE_EDI')
149	movl	%edx, %edi	C carry, for wp[1]
150
151	movl	%eax, (%ebx)	C wp[0]
152	movl	4(%esi), %eax	C xp[1]
153
154	mull	%ecx		C xp[1] * yp[0]
155
156	addl	%eax, %edi	C partial wp[1]
157	movl	PARAM_YP, %ecx
158
159	adcl	$0, %edx	C carry from the addl; the movl in between leaves flags alone
160	movl	4(%ecx), %ecx	C yp[1]
161	movl	%edi, 4(%ebx)	C wp[1] so far, xp[0]*yp[1] is added at the end
162
163	movl	4(%esi), %eax	C xp[1]
164	movl	%edx, %edi	C carry, for wp[2]
165
166	mull	%ecx		C xp[1] * yp[1]
167
168	addl	%eax, %edi	C partial wp[2]
169
170	adcl	$0, %edx
171	movl	(%esi), %eax	C xp[0]
172
173	movl	%edx, %esi	C carry, for wp[3]
174
175	mull	%ecx		C xp[0] * yp[1]
176
177	addl	%eax, 4(%ebx)	C wp[1] complete
178	adcl	%edx, %edi	C high limb plus carry into wp[2]
179	movl	%edi, 8(%ebx)	C wp[2]
180
181	adcl	$0, %esi	C carry out of the adcl above (movl keeps flags)
182	movl	SAVE_EDI, %edi
183	movl	%esi, 12(%ebx)	C wp[3]
184
185	movl	SAVE_ESI, %esi
186	movl	SAVE_EBX, %ebx
187	addl	$FRAME, %esp	C release the three pushed slots (FRAME presumably tracked by defframe_pushl)
188
189	ret
190
191
192C -----------------------------------------------------------------------------
193	ALIGN(16)
194L(xsize_more_than_two):
195C General path for xsize above 2: wp[0..xsize] = xp * yp[0], then addmul passes for any further yp limbs.
196C The first limb of yp is processed with a simple mpn_mul_1 style loop
197C inline.  Unrolling this doesn't seem worthwhile since it's only run once
198C (whereas the addmul below is run ysize-1 many times).  A call to the
199C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
200C popping, and doesn't seem likely to be worthwhile on the typical 13-26
201C limb operations the Karatsuba code calls here with.
202
203	C eax	yp[0]
204	C ebx
205	C ecx	xsize
206	C edx	xp
207	C esi
208	C edi
209	C ebp
210
211dnl  FRAME doesn't carry on from previous, no pushes yet here
212defframe(`SAVE_EBX',-4)
213defframe(`SAVE_ESI',-8)
214defframe(`SAVE_EDI',-12)
215defframe(`SAVE_EBP',-16)
216deflit(`FRAME',0)
217
218	subl	$16, %esp	C room for the four saved registers
219deflit(`FRAME',16)
220
221	movl	%edi, SAVE_EDI
222	movl	PARAM_WP, %edi	C edi = wp
223
224	movl	%ebx, SAVE_EBX
225	movl	%ebp, SAVE_EBP
226	movl	%eax, %ebp	C ebp = yp[0], the multiplier
227
228	movl	%esi, SAVE_ESI
229	xorl	%ebx, %ebx	C carry in = 0
230	leal	(%edx,%ecx,4), %esi	C xp end
231
232	leal	(%edi,%ecx,4), %edi	C wp end of mul1
233	negl	%ecx	C ecx = -xsize, counts up to zero
234
235
236L(mul1):
237	C eax	scratch
238	C ebx	carry
239	C ecx	counter, negative
240	C edx	scratch
241	C esi	xp end
242	C edi	wp end of mul1
243	C ebp	multiplier
244
245	movl	(%esi,%ecx,4), %eax	C next xp limb
246
247	mull	%ebp
248
249	addl	%ebx, %eax	C low limb plus carry in
250	movl	%eax, (%edi,%ecx,4)	C store wp limb
251	movl	$0, %ebx	C movl rather than xorl so the carry flag survives for the adcl
252
253	adcl	%edx, %ebx	C carry out = high limb plus carry flag
254	incl	%ecx
255	jnz	L(mul1)
256
257
258	movl	PARAM_YSIZE, %edx
259	movl	PARAM_XSIZE, %ecx
260
261	movl	%ebx, (%edi)		C final carry
262	decl	%edx	C edx = ysize-1
263
264	jnz	L(ysize_more_than_one)
265
266	C ysize == 1: the mul1 pass is the whole product, restore registers and return
267	movl	SAVE_EDI, %edi
268	movl	SAVE_EBX, %ebx
269
270	movl	SAVE_EBP, %ebp
271	movl	SAVE_ESI, %esi
272	addl	$FRAME, %esp
273
274	ret
275
276
277L(ysize_more_than_one):	C ecx = xsize, edx = ysize-1 (nonzero): choose the simple or the unrolled addmul loops
278	cmpl	$UNROLL_THRESHOLD, %ecx
279	movl	PARAM_YP, %eax	C eax = yp; movl leaves the compare flags for the jae
280
281	jae	L(unroll)	C xsize >= UNROLL_THRESHOLD
282
283
284C -----------------------------------------------------------------------------
285	C simple addmul looping
286	C
287	C eax	yp
288	C ebx
289	C ecx	xsize
290	C edx	ysize-1
291	C esi	xp end
292	C edi	wp end of mul1
293	C ebp
294
295	leal	4(%eax,%edx,4), %ebp	C yp end
296	negl	%ecx	C -xsize
297	negl	%edx
298
299	movl	(%esi,%ecx,4), %eax	C xp low limb
300	movl	%edx, PARAM_YSIZE	C -(ysize-1)
301	incl	%ecx
302
303	xorl	%ebx, %ebx		C initial carry
304	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
305	movl	%ebp, PARAM_YP	C slot now holds yp end, indexed with the negative ysize counter
306
307	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
308	jmp	L(simple_outer_entry)
309
310
311	C this is offset 0x121 so close enough to aligned
312L(simple_outer_top):
313	C ebp	ysize counter, negative
314
315	movl	PARAM_YP, %edx
316	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
317	xorl	%ebx, %ebx		C carry
318
319	movl	%ebp, PARAM_YSIZE
320	addl	$4, %edi		C next position in wp
321
322	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
323	movl	-4(%esi,%ecx,4), %eax	C xp low limb
324
325
326L(simple_outer_entry):
327
328L(simple_inner):
329	C eax	xp limb
330	C ebx	carry limb
331	C ecx	loop counter (negative)
332	C edx	scratch
333	C esi	xp end
334	C edi	wp end
335	C ebp	multiplier
336
337	mull	%ebp	C edx:eax = xp limb * multiplier
338
339	addl	%eax, %ebx	C low limb plus carry limb
340	adcl	$0, %edx
341
342	addl	%ebx, (%edi,%ecx,4)	C accumulate into wp
343	movl	(%esi,%ecx,4), %eax	C next xp limb; movl keeps the carry for the adcl
344	adcl	$0, %edx
345
346	incl	%ecx
347	movl	%edx, %ebx	C high limb becomes the next carry limb; ZF from incl is kept for the jnz
348	jnz	L(simple_inner)
349
350	C last xp limb was loaded by the final inner pass, finish it here
351	mull	%ebp
352
353	movl	PARAM_YSIZE, %ebp	C outer counter, negative
354	addl	%eax, %ebx
355
356	adcl	$0, %edx
357	addl	%ebx, (%edi)
358
359	adcl	$0, %edx
360	incl	%ebp
361
362	movl	%edx, 4(%edi)	C new top limb is stored, not accumulated; movl keeps ZF from incl
363	jnz	L(simple_outer_top)
364
365
366	movl	SAVE_EBX, %ebx
367	movl	SAVE_ESI, %esi
368
369	movl	SAVE_EDI, %edi
370	movl	SAVE_EBP, %ebp
371	addl	$FRAME, %esp
372
373	ret
374
375
376
377C -----------------------------------------------------------------------------
378C
379C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
380C comments.
381C
382C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
383C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
384C to give an initial VAR_COUNTER at the top of the outer loop.
385C
386C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
387C up to -1, inclusive.
388C
389C VAR_JMP is the computed jump into the unrolled loop.
390C
391C VAR_XP_LOW is the least significant limb of xp, which is needed at the
392C start of the unrolled loop.
393C
394C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
395C inclusive.
396C
397C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
398C added to give the location of the next limb of yp, which is the multiplier
399C in the unrolled loop.
400C
401C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
402C outer loop to take care of xp, wp and the inner loop counter.
403
404defframe(VAR_COUNTER,  -20)
405defframe(VAR_ADJUST,   -24)
406defframe(VAR_JMP,      -28)
407defframe(VAR_XP_LOW,   -32)
408deflit(VAR_EXTRA_SPACE, 16)
409
410	C One-time setup for the unrolled addmul passes; UNROLL_MASK, UNROLL_LOG2, UNROLL_BYTES presumably derive from UNROLL_COUNT elsewhere
411L(unroll):
412	C eax	yp
413	C ebx
414	C ecx	xsize
415	C edx	ysize-1
416	C esi	xp end
417	C edi	wp end of mul1
418	C ebp
419
420	movl	PARAM_XP, %esi	C esi = xp again (was xp end)
421	movl	4(%eax), %ebp		C multiplier (yp second limb)
422	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
423
424	movl	PARAM_WP, %edi	C edi = wp again (was wp end of mul1)
425	movl	%eax, PARAM_YP	C yp + 4*ysize
426	negl	%edx
427
428	movl	%edx, PARAM_YSIZE	C outer counter, -(ysize-1)
429	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
430	decl	%ecx				C xsize-1
431
432	movl	(%esi), %eax		C xp low limb
433	andl	$-UNROLL_MASK-1, %ebx	C xsize-1 rounded up to a multiple of UNROLL_COUNT, assuming UNROLL_MASK is UNROLL_COUNT-1
434	negl	%ecx	C -(xsize-1)
435
436	subl	$VAR_EXTRA_SPACE, %esp	C room for the four VAR_ slots
437deflit(`FRAME',16+VAR_EXTRA_SPACE)
438	negl	%ebx	C negative rounded limb count
439	andl	$UNROLL_MASK, %ecx	C limbs of the unrolled code to skip on entry
440
441	movl	%ebx, VAR_ADJUST
442	movl	%ecx, %edx	C edx = limbs skipped
443	shll	$4, %ecx	C ecx = 16 * limbs skipped
444
445	sarl	$UNROLL_LOG2, %ebx	C negative count of inner loop passes, the initial VAR_COUNTER
446
447	C 17 code bytes per limb
448ifdef(`PIC',`
449	call	L(pic_calc)	C ecx = jump address, built from the return address
450L(unroll_here):
451',`
452	leal	L(unroll_entry) (%ecx,%edx,1), %ecx	C unroll_entry + 17 * limbs skipped
453')
454	negl	%edx	C minus limbs skipped
455
456	movl	%eax, VAR_XP_LOW
457	movl	%ecx, VAR_JMP
458	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
459	leal	4(%esi,%edx,4), %esi	C  and start at second limb
460	jmp	L(unroll_outer_entry)
461
462
463ifdef(`PIC',`
464L(pic_calc):
465	C See mpn/x86/README about old gas bugs
466	leal	(%ecx,%edx,1), %ecx	C 17 * limbs skipped
467	addl	$L(unroll_entry)-L(unroll_here), %ecx	C plus distance from unroll_here to unroll_entry
468	addl	(%esp), %ecx	C plus the return address, which is unroll_here
469	ret_internal
470')
471
472
473C --------------------------------------------------------------------------
474	ALIGN(32)
475L(unroll_outer_top):
476	C ebp	ysize counter, negative
477	C Start of every outer pass after the first: rewind xp and wp, reload loop state, fetch the next yp limb.
478	movl	VAR_ADJUST, %ebx	C negative rounded limb count
479	movl	PARAM_YP, %edx
480
481	movl	VAR_XP_LOW, %eax
482	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
483
484	leal	4(%edi,%ebx,4), %edi	C wp back by the inner loop advance, then on one limb
485	leal	(%esi,%ebx,4), %esi	C xp back by the inner loop advance
486	sarl	$UNROLL_LOG2, %ebx	C negative count of inner loop passes
487
488	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
489	movl	VAR_JMP, %ecx
490
491L(unroll_outer_entry):
492	mull	%ebp	C xp low limb * multiplier; ecx holds VAR_JMP on both ways in
493
494	testb	$1, %cl		C and clear carry bit
495	movl	%ebx, VAR_COUNTER
496	movl	$0, %ebx	C movl rather than xorl, ZF from the testb is needed by the cmovs
497
498	movl	$0, %ecx
499	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
500	cmovnz(	%eax, %ebx)	C odd entry address: the two carry registers swap roles
501
502	C Extra fetch of VAR_JMP is bad, but registers are tight
503	jmp	*VAR_JMP
504
505
506C -----------------------------------------------------------------------------
507	ALIGN(32)
508L(unroll_top):
509	C eax	xp limb
510	C ebx	carry high
511	C ecx	carry low
512	C edx	scratch
513	C esi	xp+8
514	C edi	wp
515	C ebp	yp multiplier limb
516	C
517	C VAR_COUNTER  loop counter, negative
518	C
519	C 17 bytes each limb
520	C A full pass adds UNROLL_COUNT limbs of xp times ebp into wp; the first pass may be entered part way in via VAR_JMP.
521L(unroll_entry):
522
523deflit(CHUNK_COUNT,2)
524forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
525	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
526	deflit(`disp1', eval(disp0 + 4))
527
528Zdisp(	movl,	disp0,(%esi), %eax)
529	adcl	%edx, %ebx	C previous high limb plus carry flag into ebx
530
531	mull	%ebp
532
533Zdisp(	addl,	%ecx, disp0,(%edi))
534	movl	$0, %ecx	C movl keeps the carry flag from the addl
535
536	adcl	%eax, %ebx	C new low limb plus carry flag, added to wp by the next limb
537
538	C second limb of the chunk: the same steps with ebx and ecx exchanged
539	movl	disp1(%esi), %eax
540	adcl	%edx, %ecx
541
542	mull	%ebp
543
544	addl	%ebx, disp1(%edi)
545	movl	$0, %ebx
546
547	adcl	%eax, %ecx
548')
549
550
551	incl	VAR_COUNTER	C sets ZF for the jnz and leaves the carry flag alone
552	leal	UNROLL_BYTES(%esi), %esi	C leal does not touch flags
553	leal	UNROLL_BYTES(%edi), %edi
554
555	jnz	L(unroll_top)
556
557
558	C eax
559	C ebx	zero
560	C ecx	low
561	C edx	high
562	C esi
563	C edi	wp, pointing at second last limb
564	C ebp
565	C
566	C carry flag to be added to high
567	C disp0 is -128 only if UNROLL_BYTES is 256, otherwise empty; the -0 presumably keeps the eval valid in the empty case
568deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
569deflit(`disp1', eval(disp0-0 + 4))
570
571	movl	PARAM_YSIZE, %ebp	C outer counter, negative
572	adcl	$0, %edx	C carry flag left by the last adcl of the loop
573	addl	%ecx, disp0(%edi)	C low carry accumulated into wp
574
575	adcl	$0, %edx
576	incl	%ebp
577
578	movl	%edx, disp1(%edi)	C new top limb is stored, not accumulated; movl keeps ZF from incl
579	jnz	L(unroll_outer_top)
580
581
582	movl	SAVE_ESI, %esi
583	movl	SAVE_EBP, %ebp
584
585	movl	SAVE_EDI, %edi
586	movl	SAVE_EBX, %ebx
587	addl	$FRAME, %esp	C FRAME is 16+VAR_EXTRA_SPACE on this path
588
589	ret
590
591EPILOGUE()
592