1dnl  PowerPC-64 mpn_mul_basecase.
2
3dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C		cycles/limb
24C POWER3/PPC630:    6-18
25C POWER4/PPC970:     8
26C POWER5:            8
27
28
29C INPUT PARAMETERS
30define(`rp', `r3')
31define(`up', `r4')
32define(`un', `r5')
33define(`vp', `r6')
34define(`vn', `r7')
35
36define(`v0',	   `r25')
37define(`outer_rp', `r22')
38define(`outer_up', `r23')
39
40ASM_START()
41PROLOGUE(mpn_mul_basecase)
42
43C Special code for un <= 2, for efficiency of these important cases,
44C and since it simplifies the default code.
45	cmpdi	cr0, un, 2
46	bgt	cr0, L(un_gt2)
47	cmpdi	cr6, vn, 1
48	ld	r7, 0(vp)
49	ld	r5, 0(up)
50	mulld	r8, r5, r7	C weight 0
51	mulhdu	r9, r5, r7	C weight 1
52	std	r8, 0(rp)
53	beq	cr0, L(2x)
54	std	r9, 8(rp)
55	blr
56	ALIGN(16)
57L(2x):	ld	r0, 8(up)
58	mulld	r8, r0, r7	C weight 1
59	mulhdu	r10, r0, r7	C weight 2
60	addc	r9, r9, r8
61	addze	r10, r10
62	bne	cr6, L(2x2)
63	std	r9, 8(rp)
64	std	r10, 16(rp)
65	blr
66	ALIGN(16)
67L(2x2):	ld	r6, 8(vp)
68	nop
69	mulld	r8, r5, r6	C weight 1
70	mulhdu	r11, r5, r6	C weight 2
71	addc	r9, r9, r8
72	std	r9, 8(rp)
73	adde	r11, r11, r10
74	mulld	r12, r0, r6	C weight 2
75	mulhdu	r0, r0, r6	C weight 3
76	addze	r0, r0
77	addc	r11, r11, r12
78	addze	r0, r0
79	std	r11, 16(rp)
80	std	r0, 24(rp)
81	blr
82
83L(un_gt2):
84	std	r31, -8(r1)
85	std	r30, -16(r1)
86	std	r29, -24(r1)
87	std	r28, -32(r1)
88	std	r27, -40(r1)
89	std	r26, -48(r1)
90	std	r25, -56(r1)
91	std	r24, -64(r1)
92	std	r23, -72(r1)
93	std	r22, -80(r1)
94
95	mr	outer_rp, rp
96	mr	outer_up, up
97
98	ld	v0, 0(vp)	C new v limb
99	addi	vp, vp, 8
100	ld	r26, 0(up)
101
102	rldicl.	r0, un, 0,62	C r0 = n & 3, set cr0
103	cmpdi	cr6, r0, 2
104	addi	un, un, 1	C compute count...
105	srdi	un, un, 2	C ...for ctr
106	mtctr	un		C copy inner loop count into ctr
107	beq	cr0, L(b0)
108	blt	cr6, L(b1)
109	beq	cr6, L(b2)
110
111
112	ALIGN(16)
113L(b3):	mulld	r0, r26, v0
114	mulhdu	r12, r26, v0
115	addic	r0, r0, 0
116	std	r0, 0(rp)
117	ld	r26, 8(up)
118	ld	r27, 16(up)
119	bdz	L(end_m_3)
120
121	ALIGN(16)
122L(lo_m_3):
123	mulld	r0, r26, v0
124	mulhdu	r31, r26, v0
125	ld	r26, 24(up)
126	nop
127	mulld	r24, r27, v0
128	mulhdu	r8, r27, v0
129	ld	r27, 32(up)
130	nop
131	adde	r0, r0, r12
132	adde	r24, r24, r31
133	mulld	r9, r26, v0
134	mulhdu	r10, r26, v0
135	ld	r26, 40(up)
136	nop
137	mulld	r11, r27, v0
138	mulhdu	r12, r27, v0
139	ld	r27, 48(up)
140	std	r0, 8(rp)
141	adde	r9, r9, r8
142	std	r24, 16(rp)
143	adde	r11, r11, r10
144	std	r9, 24(rp)
145	addi	up, up, 32
146	std	r11, 32(rp)
147	addi	rp, rp, 32
148	bdnz	L(lo_m_3)
149
150	ALIGN(16)
151L(end_m_3):
152	mulld	r0, r26, v0
153	mulhdu	r31, r26, v0
154
155	mulld	r24, r27, v0
156	mulhdu	r8, r27, v0
157
158	adde	r0, r0, r12
159	adde	r24, r24, r31
160
161	std	r0, 8(rp)
162	std	r24, 16(rp)
163	addze	r8, r8
164	std	r8, 24(rp)
165	addic.	vn, vn, -1
166	beq	L(ret)
167
168	ALIGN(16)
169L(outer_lo_3):
170	mtctr	un		C copy inner loop count into ctr
171	addi	rp, outer_rp, 8
172	mr	up, outer_up
173	addi	outer_rp, outer_rp, 8
174	ld	v0, 0(vp)	C new v limb
175	addi	vp, vp, 8
176	ld	r26, 0(up)
177	ld	r28, 0(rp)
178	mulld	r0, r26, v0
179	mulhdu	r12, r26, v0
180	addc	r0, r0, r28
181	std	r0, 0(rp)
182	ld	r26, 8(up)
183	ld	r27, 16(up)
184	bdz	L(end_3)
185
186	ALIGN(16)		C registers dying
187L(lo_3):
188	mulld	r0, r26, v0	C
189	mulhdu	r10, r26, v0	C 26
190	ld	r26, 24(up)	C
191	ld	r28, 8(rp)	C
192	mulld	r24, r27, v0	C
193	mulhdu	r8, r27, v0	C 27
194	ld	r27, 32(up)	C
195	ld	r29, 16(rp)	C
196	adde	r0, r0, r12	C 0 12
197	adde	r24, r24, r10	C 24 10
198	mulld	r9, r26, v0	C
199	mulhdu	r10, r26, v0	C 26
200	ld	r26, 40(up)	C
201	ld	r30, 24(rp)	C
202	mulld	r11, r27, v0	C
203	mulhdu	r12, r27, v0	C 27
204	ld	r27, 48(up)	C
205	ld	r31, 32(rp)	C
206	adde	r9, r9, r8	C 8 9
207	adde	r11, r11, r10	C 10 11
208	addze	r12, r12	C 12
209	addc	r0, r0, r28	C 0 28
210	std	r0, 8(rp)	C 0
211	adde	r24, r24, r29	C 7 29
212	std	r24, 16(rp)	C 7
213	adde	r9, r9, r30	C 9 30
214	std	r9, 24(rp)	C 9
215	adde	r11, r11, r31	C 11 31
216	std	r11, 32(rp)	C 11
217	addi	up, up, 32	C
218	addi	rp, rp, 32	C
219	bdnz	L(lo_3)	C
220
221	ALIGN(16)
222L(end_3):
223	mulld	r0, r26, v0
224	mulhdu	r10, r26, v0
225	ld	r28, 8(rp)
226	nop
227	mulld	r24, r27, v0
228	mulhdu	r8, r27, v0
229	ld	r29, 16(rp)
230	nop
231	adde	r0, r0, r12
232	adde	r24, r24, r10
233	addze	r8, r8
234	addc	r0, r0, r28
235	std	r0, 8(rp)
236	adde	r24, r24, r29
237	std	r24, 16(rp)
238	addze	r8, r8
239	std	r8, 24(rp)
240
241	addic.	vn, vn, -1
242	bne	L(outer_lo_3)
243	b	L(ret)
244
245
246	ALIGN(16)
247L(b0):	ld	r27, 8(up)
248	addi	up, up, 8
249	mulld	r0, r26, v0
250	mulhdu	r10, r26, v0
251	mulld	r24, r27, v0
252	mulhdu	r8, r27, v0
253	addc	r24, r24, r10
254	addze	r12, r8
255	std	r0, 0(rp)
256	std	r24, 8(rp)
257	addi	rp, rp, 8
258	ld	r26, 8(up)
259	ld	r27, 16(up)
260	bdz	L(end_m_0)
261
262	ALIGN(16)
263L(lo_m_0):
264	mulld	r0, r26, v0
265	mulhdu	r31, r26, v0
266	ld	r26, 24(up)
267	nop
268	mulld	r24, r27, v0
269	mulhdu	r8, r27, v0
270	ld	r27, 32(up)
271	nop
272	adde	r0, r0, r12
273	adde	r24, r24, r31
274	mulld	r9, r26, v0
275	mulhdu	r10, r26, v0
276	ld	r26, 40(up)
277	nop
278	mulld	r11, r27, v0
279	mulhdu	r12, r27, v0
280	ld	r27, 48(up)
281	std	r0, 8(rp)
282	adde	r9, r9, r8
283	std	r24, 16(rp)
284	adde	r11, r11, r10
285	std	r9, 24(rp)
286	addi	up, up, 32
287	std	r11, 32(rp)
288	addi	rp, rp, 32
289	bdnz	L(lo_m_0)
290
291	ALIGN(16)
292L(end_m_0):
293	mulld	r0, r26, v0
294	mulhdu	r31, r26, v0
295
296	mulld	r24, r27, v0
297	mulhdu	r8, r27, v0
298
299	adde	r0, r0, r12
300	adde	r24, r24, r31
301
302	std	r0, 8(rp)
303	addze	r8, r8
304	std	r24, 16(rp)
305	addic.	vn, vn, -1
306	std	r8, 24(rp)
307	nop
308	beq	L(ret)
309
310	ALIGN(16)
311L(outer_lo_0):
312	mtctr	un		C copy inner loop count into ctr
313	addi	rp, outer_rp, 16
314	addi	up, outer_up, 8
315	addi	outer_rp, outer_rp, 8
316	ld	v0, 0(vp)	C new v limb
317	addi	vp, vp, 8
318	ld	r26, -8(up)
319	ld	r27, 0(up)
320	ld	r28, -8(rp)
321	ld	r29, 0(rp)
322	nop
323	nop
324	mulld	r0, r26, v0
325	mulhdu	r10, r26, v0
326	mulld	r24, r27, v0
327	mulhdu	r8, r27, v0
328	addc	r24, r24, r10
329	addze	r12, r8
330	addc	r0, r0, r28
331	std	r0, -8(rp)
332	adde	r24, r24, r29
333	std	r24, 0(rp)
334	ld	r26, 8(up)
335	ld	r27, 16(up)
336	bdz	L(end_0)
337
338	ALIGN(16)		C registers dying
339L(lo_0):
340	mulld	r0, r26, v0	C
341	mulhdu	r10, r26, v0	C 26
342	ld	r26, 24(up)	C
343	ld	r28, 8(rp)	C
344	mulld	r24, r27, v0	C
345	mulhdu	r8, r27, v0	C 27
346	ld	r27, 32(up)	C
347	ld	r29, 16(rp)	C
348	adde	r0, r0, r12	C 0 12
349	adde	r24, r24, r10	C 24 10
350	mulld	r9, r26, v0	C
351	mulhdu	r10, r26, v0	C 26
352	ld	r26, 40(up)	C
353	ld	r30, 24(rp)	C
354	mulld	r11, r27, v0	C
355	mulhdu	r12, r27, v0	C 27
356	ld	r27, 48(up)	C
357	ld	r31, 32(rp)	C
358	adde	r9, r9, r8	C 8 9
359	adde	r11, r11, r10	C 10 11
360	addze	r12, r12	C 12
361	addc	r0, r0, r28	C 0 28
362	std	r0, 8(rp)	C 0
363	adde	r24, r24, r29	C 7 29
364	std	r24, 16(rp)	C 7
365	adde	r9, r9, r30	C 9 30
366	std	r9, 24(rp)	C 9
367	adde	r11, r11, r31	C 11 31
368	std	r11, 32(rp)	C 11
369	addi	up, up, 32	C
370	addi	rp, rp, 32	C
371	bdnz	L(lo_0)	C
372
373	ALIGN(16)
374L(end_0):
375	mulld	r0, r26, v0
376	mulhdu	r10, r26, v0
377	ld	r28, 8(rp)
378	nop
379	mulld	r24, r27, v0
380	mulhdu	r8, r27, v0
381	ld	r29, 16(rp)
382	nop
383	adde	r0, r0, r12
384	adde	r24, r24, r10
385	addze	r8, r8
386	addic.	vn, vn, -1
387	addc	r0, r0, r28
388	std	r0, 8(rp)
389	adde	r24, r24, r29
390	std	r24, 16(rp)
391	addze	r8, r8
392	std	r8, 24(rp)
393	bne	L(outer_lo_0)
394	b	L(ret)
395
396
397	ALIGN(16)
398L(b1):	ld	r27, 8(up)
399	nop
400	mulld	r0, r26, v0
401	mulhdu	r31, r26, v0
402	ld	r26, 16(up)
403	mulld	r24, r27, v0
404	mulhdu	r8, r27, v0
405	mulld	r9, r26, v0
406	mulhdu	r10, r26, v0
407	addc	r24, r24, r31
408	adde	r9, r9, r8
409	addze	r12, r10
410	std	r0, 0(rp)
411	std	r24, 8(rp)
412	std	r9, 16(rp)
413	addi	up, up, 16
414	addi	rp, rp, 16
415	ld	r26, 8(up)
416	ld	r27, 16(up)
417	bdz	L(end_m_1)
418
419	ALIGN(16)
420L(lo_m_1):
421	mulld	r0, r26, v0
422	mulhdu	r31, r26, v0
423	ld	r26, 24(up)
424	nop
425	mulld	r24, r27, v0
426	mulhdu	r8, r27, v0
427	ld	r27, 32(up)
428	nop
429	adde	r0, r0, r12
430	adde	r24, r24, r31
431	mulld	r9, r26, v0
432	mulhdu	r10, r26, v0
433	ld	r26, 40(up)
434	nop
435	mulld	r11, r27, v0
436	mulhdu	r12, r27, v0
437	ld	r27, 48(up)
438	std	r0, 8(rp)
439	adde	r9, r9, r8
440	std	r24, 16(rp)
441	adde	r11, r11, r10
442	std	r9, 24(rp)
443	addi	up, up, 32
444	std	r11, 32(rp)
445	addi	rp, rp, 32
446	bdnz	L(lo_m_1)
447
448	ALIGN(16)
449L(end_m_1):
450	mulld	r0, r26, v0
451	mulhdu	r31, r26, v0
452
453	mulld	r24, r27, v0
454	mulhdu	r8, r27, v0
455
456	adde	r0, r0, r12
457	adde	r24, r24, r31
458
459	std	r0, 8(rp)
460	addze	r8, r8
461	std	r24, 16(rp)
462	addic.	vn, vn, -1
463	std	r8, 24(rp)
464	nop
465	beq	L(ret)
466
467	ALIGN(16)
468L(outer_lo_1):
469	mtctr	un		C copy inner loop count into ctr
470	addi	rp, outer_rp, 24
471	addi	up, outer_up, 16
472	addi	outer_rp, outer_rp, 8
473	ld	v0, 0(vp)	C new v limb
474	addi	vp, vp, 8
475	ld	r26, -16(up)
476	ld	r27, -8(up)
477	mulld	r0, r26, v0
478	mulhdu	r31, r26, v0
479	ld	r26, 0(up)
480	ld	r28, -16(rp)
481	mulld	r24, r27, v0
482	mulhdu	r8, r27, v0
483	ld	r29, -8(rp)
484	ld	r30, 0(rp)
485	mulld	r9, r26, v0
486	mulhdu	r10, r26, v0
487	addc	r24, r24, r31
488	adde	r9, r9, r8
489	addze	r12, r10
490	addc	r0, r0, r28
491	std	r0, -16(rp)
492	adde	r24, r24, r29
493	std	r24, -8(rp)
494	adde	r9, r9, r30
495	std	r9, 0(rp)
496	ld	r26, 8(up)
497	ld	r27, 16(up)
498	bdz	L(end_1)
499
500	ALIGN(16)		C registers dying
501L(lo_1):
502	mulld	r0, r26, v0	C
503	mulhdu	r10, r26, v0	C 26
504	ld	r26, 24(up)	C
505	ld	r28, 8(rp)	C
506	mulld	r24, r27, v0	C
507	mulhdu	r8, r27, v0	C 27
508	ld	r27, 32(up)	C
509	ld	r29, 16(rp)	C
510	adde	r0, r0, r12	C 0 12
511	adde	r24, r24, r10	C 24 10
512	mulld	r9, r26, v0	C
513	mulhdu	r10, r26, v0	C 26
514	ld	r26, 40(up)	C
515	ld	r30, 24(rp)	C
516	mulld	r11, r27, v0	C
517	mulhdu	r12, r27, v0	C 27
518	ld	r27, 48(up)	C
519	ld	r31, 32(rp)	C
520	adde	r9, r9, r8	C 8 9
521	adde	r11, r11, r10	C 10 11
522	addze	r12, r12	C 12
523	addc	r0, r0, r28	C 0 28
524	std	r0, 8(rp)	C 0
525	adde	r24, r24, r29	C 7 29
526	std	r24, 16(rp)	C 7
527	adde	r9, r9, r30	C 9 30
528	std	r9, 24(rp)	C 9
529	adde	r11, r11, r31	C 11 31
530	std	r11, 32(rp)	C 11
531	addi	up, up, 32	C
532	addi	rp, rp, 32	C
533	bdnz	L(lo_1)	C
534
535	ALIGN(16)
536L(end_1):
537	mulld	r0, r26, v0
538	mulhdu	r10, r26, v0
539	ld	r28, 8(rp)
540	nop
541	mulld	r24, r27, v0
542	mulhdu	r8, r27, v0
543	ld	r29, 16(rp)
544	nop
545	adde	r0, r0, r12
546	adde	r24, r24, r10
547	addze	r8, r8
548	addic.	vn, vn, -1
549	addc	r0, r0, r28
550	std	r0, 8(rp)
551	adde	r24, r24, r29
552	std	r24, 16(rp)
553	addze	r8, r8
554	std	r8, 24(rp)
555	bne	L(outer_lo_1)
556	b	L(ret)
557
558
559	ALIGN(16)
560L(b2):	ld	r27, 8(up)
561	addi	up, up, -8
562	addi	rp, rp, -8
563	li	r12, 0
564	addic	r12, r12, 0
565
566	ALIGN(16)
567L(lo_m_2):
568	mulld	r0, r26, v0
569	mulhdu	r31, r26, v0
570	ld	r26, 24(up)
571	nop
572	mulld	r24, r27, v0
573	mulhdu	r8, r27, v0
574	ld	r27, 32(up)
575	nop
576	adde	r0, r0, r12
577	adde	r24, r24, r31
578	mulld	r9, r26, v0
579	mulhdu	r10, r26, v0
580	ld	r26, 40(up)
581	nop
582	mulld	r11, r27, v0
583	mulhdu	r12, r27, v0
584	ld	r27, 48(up)
585	std	r0, 8(rp)
586	adde	r9, r9, r8
587	std	r24, 16(rp)
588	adde	r11, r11, r10
589	std	r9, 24(rp)
590	addi	up, up, 32
591	std	r11, 32(rp)
592
593	addi	rp, rp, 32
594	bdnz	L(lo_m_2)
595
596	ALIGN(16)
597L(end_m_2):
598	mulld	r0, r26, v0
599	mulhdu	r31, r26, v0
600
601	mulld	r24, r27, v0
602	mulhdu	r8, r27, v0
603
604	adde	r0, r0, r12
605	adde	r24, r24, r31
606
607	std	r0, 8(rp)
608	addze	r8, r8
609	std	r24, 16(rp)
610	addic.	vn, vn, -1
611	std	r8, 24(rp)
612	nop
613	beq	L(ret)
614
615	ALIGN(16)
616L(outer_lo_2):
617	mtctr	un		C copy inner loop count into ctr
618	addi	rp, outer_rp, 0
619	addi	up, outer_up, -8
620	addi	outer_rp, outer_rp, 8
621	ld	v0, 0(vp)	C new v limb
622	addi	vp, vp, 8
623	ld	r26, 8(up)
624	ld	r27, 16(up)
625	li	r12, 0
626	addic	r12, r12, 0
627
628	ALIGN(16)		C registers dying
629L(lo_2):
630	mulld	r0, r26, v0	C
631	mulhdu	r10, r26, v0	C 26
632	ld	r26, 24(up)	C
633	ld	r28, 8(rp)	C
634	mulld	r24, r27, v0	C
635	mulhdu	r8, r27, v0	C 27
636	ld	r27, 32(up)	C
637	ld	r29, 16(rp)	C
638	adde	r0, r0, r12	C 0 12
639	adde	r24, r24, r10	C 24 10
640	mulld	r9, r26, v0	C
641	mulhdu	r10, r26, v0	C 26
642	ld	r26, 40(up)	C
643	ld	r30, 24(rp)	C
644	mulld	r11, r27, v0	C
645	mulhdu	r12, r27, v0	C 27
646	ld	r27, 48(up)	C
647	ld	r31, 32(rp)	C
648	adde	r9, r9, r8	C 8 9
649	adde	r11, r11, r10	C 10 11
650	addze	r12, r12	C 12
651	addc	r0, r0, r28	C 0 28
652	std	r0, 8(rp)	C 0
653	adde	r24, r24, r29	C 7 29
654	std	r24, 16(rp)	C 7
655	adde	r9, r9, r30	C 9 30
656	std	r9, 24(rp)	C 9
657	adde	r11, r11, r31	C 11 31
658	std	r11, 32(rp)	C 11
659	addi	up, up, 32	C
660	addi	rp, rp, 32	C
661	bdnz	L(lo_2)	C
662
663	ALIGN(16)
664L(end_2):
665	mulld	r0, r26, v0
666	mulhdu	r10, r26, v0
667	ld	r28, 8(rp)
668	nop
669	mulld	r24, r27, v0
670	mulhdu	r8, r27, v0
671	ld	r29, 16(rp)
672	nop
673	adde	r0, r0, r12
674	adde	r24, r24, r10
675	addze	r8, r8
676	addic.	vn, vn, -1
677	addc	r0, r0, r28
678	std	r0, 8(rp)
679	adde	r24, r24, r29
680	std	r24, 16(rp)
681	addze	r8, r8
682	std	r8, 24(rp)
683	bne	L(outer_lo_2)
684	b	L(ret)
685
686
687L(ret):	ld	r31, -8(r1)
688	ld	r30, -16(r1)
689	ld	r29, -24(r1)
690	ld	r28, -32(r1)
691	ld	r27, -40(r1)
692	ld	r26, -48(r1)
693	ld	r25, -56(r1)
694	ld	r24, -64(r1)
695	ld	r23, -72(r1)
696	ld	r22, -80(r1)
697	blr
698EPILOGUE()
699