dnl  PowerPC-64 mpn_mul_basecase.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C POWER3/PPC630		 ?
C POWER4/PPC970		 ?
C POWER5		 ?
C POWER6		12.25

C TODO
C  * Reduce register usage.  At least 4 fewer registers could be used.
C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
C    would bring us to 9 c/l.
C  * The bdz insns for b1 and b2 will never branch, since those feed-ins
C    are only reached with ctr >= 2.
C  * Align things better, perhaps by moving things like pointer updates from
C    before to after loops.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`un', `r5')
define(`vp', `r6')
define(`vn', `r7')
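
C mpn_mul_basecase(rp, up, un, vp, vn) computes {rp,un+vn} = {up,un}*{vp,vn};
C it requires un >= vn >= 1.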

define(`v0',	   `r25')
define(`outer_rp', `r22')
define(`outer_up', `r23')
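
C v0 holds the v limb used by the current pass; outer_rp and outer_up hold
C the base rp and up values from which each per-v-limb pass restarts.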

ASM_START()
PROLOGUE(mpn_mul_basecase)

C Special code for un <= 2, for efficiency of these important cases,
C and since it simplifies the default code.
	cmpdi	cr0, un, 2
	bgt	cr0, L(un_gt2)
	cmpdi	cr6, vn, 1
	ld	r7, 0(vp)
	ld	r5, 0(up)
	mulld	r8, r5, r7	C weight 0
	mulhdu	r9, r5, r7	C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)
	std	r9, 8(rp)
	blr
	ALIGN(16)
L(2x):	ld	r0, 8(up)
	mulld	r8, r0, r7	C weight 1
	mulhdu	r10, r0, r7	C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)
	std	r9, 8(rp)
	std	r10, 16(rp)
	blr
	ALIGN(16)
L(2x2):	ld	r6, 8(vp)
	nop
	mulld	r8, r5, r6	C weight 1
	mulhdu	r11, r5, r6	C weight 2
	mulld	r12, r0, r6	C weight 2
	mulhdu	r0, r0, r6	C weight 3
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr

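C Default case: callee-saved registers are saved below the stack pointer,
C in the ABI red zone.  The first pass computes {rp,un} = {up,un} * vp[0]
C mul_1-style; each later pass adds {up,un} * v0 into rp addmul_1-style.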
L(un_gt2):
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)
	std	r21, -88(r1)
	std	r20, -96(r1)

	mr	outer_rp, rp
	mr	outer_up, up

	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)

	rldicl.	r0, un, 0,62	C r0 = un & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	un, un, 4	C compute count...
	srdi	un, un, 2	C ...for ctr
	mtctr	un		C copy inner loop count into ctr
	beq	cr0, L(b0)
	blt	cr6, L(b1)
	beq	cr6, L(b2)
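
C Feed-in blocks b0..b3 (b3 is the fall-through) handle the first un mod 4
C limbs of each pass; the 4-way unrolled loops then do 4 limbs/iteration.
C ctr = un/4 + 1 counts the feed-in block plus the loop iterations.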


	ALIGN(16)
L(b3):
	ld	r27, 8(up)
	ld	r20, 16(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r10, r20, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	std	r0, 0(rp)
	std	r24, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 16
	addi	rp, rp, 16
	bdz	L(end_m_3)

	ALIGN(32)
L(lo_m_3):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26
	bdnz	L(lo_m_3)

	ALIGN(16)
L(end_m_3):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	beq	L(ret)

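C Outer loop for un mod 4 = 3: for each new v limb, restart rp and up from
C outer_rp/outer_up and add up[] * v0 into the partial product at rp.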
	ALIGN(16)
L(outer_lo_3):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 24
	addi	up, outer_up, 16
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -16(up)
	ld	r27, -8(up)
	ld	r20, 0(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r10, r20, v0
	ld	r28, -16(rp)
	ld	r29, -8(rp)
	ld	r30, 0(rp)
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	addc	r0, r0, r28
	std	r0, -16(rp)
	adde	r24, r24, r29
	std	r24, -8(rp)
	adde	r9, r9, r30
	std	r9, 0(rp)
	bdz	L(end_3)

	ALIGN(32)		C registers dying
L(lo_3):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_3)		C

	ALIGN(16)
L(end_3):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	bne	L(outer_lo_3)
	b	L(ret)


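C Feed-in and loops for un mod 4 = 1, mirroring the structure of the
C un mod 4 = 3 code above.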
	ALIGN(16)
L(b1):
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addic	r0, r0, 0	C clear CA (r0 unchanged)
	std	r0, 0(rp)
	bdz	L(end_m_1)

	ALIGN(16)
L(lo_m_1):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26
	bdnz	L(lo_m_1)

	ALIGN(16)
L(end_m_1):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	beq	L(ret)

	ALIGN(16)
L(outer_lo_1):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 8
	mr	up, outer_up
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)
	ld	r28, 0(rp)
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addc	r0, r0, r28
	std	r0, 0(rp)
	bdz	L(end_1)

	ALIGN(32)		C registers dying
L(lo_1):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_1)		C

	ALIGN(16)
L(end_1):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	bne	L(outer_lo_1)
	b	L(ret)


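C Feed-in and loops for un mod 4 = 0, mirroring the un mod 4 = 3 code
C above; the feed-in is empty, so just clear the carry limb and CA.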
	ALIGN(16)
L(b0):
	addi	up, up, -8
	addi	rp, rp, -8
	li	r12, 0		C zero the carry limb
	addic	r12, r12, 0	C clear CA for the adde chain
	bdz	L(end_m_0)

	ALIGN(16)
L(lo_m_0):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26
	bdnz	L(lo_m_0)

	ALIGN(16)
L(end_m_0):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	beq	L(ret)

	ALIGN(16)
L(outer_lo_0):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 0
	addi	up, outer_up, -8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	li	r12, 0		C zero the carry limb
	addic	r12, r12, 0	C clear CA for the adde chain
	bdz	L(end_0)

	ALIGN(32)		C registers dying
L(lo_0):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_0)		C

	ALIGN(16)
L(end_0):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	bne	L(outer_lo_0)
	b	L(ret)


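C Feed-in and loops for un mod 4 = 2, mirroring the un mod 4 = 3 code above.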
	ALIGN(16)
L(b2):	ld	r27, 8(up)
	addi	up, up, 8
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	std	r0, 0(rp)
	std	r24, 8(rp)
	addi	rp, rp, 8
	bdz	L(end_m_2)

	ALIGN(16)
L(lo_m_2):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26
	bdnz	L(lo_m_2)

	ALIGN(16)
L(end_m_2):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	beq	L(ret)

	ALIGN(16)
L(outer_lo_2):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 16
	addi	up, outer_up, 8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -8(up)
	ld	r27, 0(up)
	ld	r28, -8(rp)
	ld	r29, 0(rp)
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	addc	r0, r0, r28
	std	r0, -8(rp)
	adde	r24, r24, r29
	std	r24, 0(rp)
	bdz	L(end_2)

	ALIGN(16)		C registers dying
L(lo_2):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_2)		C

	ALIGN(16)
L(end_2):
	addze	r12, r12
	addic.	vn, vn, -1
	std	r12, 8(rp)
	bne	L(outer_lo_2)
C	b	L(ret)

L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	ld	r21, -88(r1)
	ld	r20, -96(r1)
	blr
EPILOGUE()