1dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2dnl  result in a second limb vector.
3
4dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C INPUT PARAMETERS
35C res_ptr	r16
36C s1_ptr	r17
37C size		r18
38C s2_limb	r19
39
40C This code runs at 2.25 cycles/limb on EV6.
41
42C This code was written in close cooperation with ev6 pipeline expert
43C Steve Root.  Any errors are tege's fault, though.
44
45C Code structure:
46
47C  code for n < 8
48C  code for n >= 8	code for (n mod 8)
49C			code for (n div 8)	feed-in code
50C						8-way unrolled loop
51C						wind-down code
52
53C Some notes about unrolled loop:
54C
55C   r1-r8     multiplies and workup
56C   r21-r28   multiplies and workup
57C   r9-r12    loads
58C   r0       -1
59C   r20,r29,r13-r15  scramble
60C
61C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
62C   put-the-carry-into-hi.  The idea is that these branches are very rarely
63C   taken, and since a non-taken branch consumes no resources, that is better
64C   than an addq.
65C
66C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
67C   add NEXT cycle #09 which feeds a store in NEXT cycle #02
68
69C The code could use some further work:
70C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
71C      faster than this for size < 3.
72C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
73C      that is too costly.
74C   3. Consider using 4-way unrolling, even if that runs slower.
75C   4. Reduce register usage.  In particular, try to avoid using r29.
76
77ASM_START()
C mpn_mul_1 -- multiply the limbs read from s1_ptr by s2_limb, propagate the
C carries, and store the resulting low limbs at res_ptr.
C
C In:	r16 = res_ptr, r17 = s1_ptr, r18 = size (in limbs), r19 = s2_limb
C Out:	r0  = most significant limb of the product (final carry limb)
C
C Sizes below 8 use a simple one-limb-per-iteration loop with no stack frame.
C Larger sizes first handle (size mod 8) limbs with the same kind of simple
C loop and then run the software pipelined code on groups of 8 limbs.  That
C path saves r26, r9-r15 and r29 on the stack because it uses them as scratch.
C
C NOTE(review): size == 0 is not handled; the small path decrements size
C before its first test against zero.  Presumably callers guarantee size >= 1.
78PROLOGUE(mpn_mul_1)
C r1 = (size < 8); sizes of 8 and up branch to $Large.
79	cmpult	r18,	8,	r1
80	beq	r1,	$Large
C Small path.  The running carry is split between r4 and r0 and the two
C parts are added together at the top of each step.
81$Lsmall:
82	ldq	r2,0(r17)	C r2 = s1_limb
83	lda	r18,-1(r18)	C size--
84	mulq	r2,r19,r3	C r3 = prod_low
85	bic	r31,r31,r4	C clear cy_limb
86	umulh	r2,r19,r0	C r0 = prod_high
87	beq	r18,$Le1a	C jump if size was == 1
88	ldq	r2,8(r17)	C r2 = s1_limb
89	lda	r18,-1(r18)	C size--
90	stq	r3,0(r16)	C store low limb of the first product
91	beq	r18,$Le2a	C jump if size was == 2
92	ALIGN(8)
C Loop invariant: r2 = limb to multiply now, r18 = limbs left after it.
C The ldq below fetches the limb for the following step.
93$Lopa:	mulq	r2,r19,r3	C r3 = prod_low
94	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
95	lda	r18,-1(r18)	C size--
96	umulh	r2,r19,r4	C r4 = cy_limb
97	ldq	r2,16(r17)	C r2 = s1_limb
98	lda	r17,8(r17)	C s1_ptr++
99	addq	r3,r0,r3	C r3 = cy_limb + prod_low
100	stq	r3,8(r16)
101	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
102	lda	r16,8(r16)	C res_ptr++
103	bne	r18,$Lopa
104
C Last limb of the small path: no further load, r0 becomes the return value.
105$Le2a:	mulq	r2,r19,r3	C r3 = prod_low
106	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
107	umulh	r2,r19,r4	C r4 = cy_limb
108	addq	r3,r0,r3	C r3 = cy_limb + prod_low
109	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
110	stq	r3,8(r16)
111	addq	r4,r0,r0	C cy_limb = prod_high + cy
112	ret	r31,(r26),1
C size == 1: store the single low limb; r0 already holds prod_high.
113$Le1a:	stq	r3,0(r16)
114	ret	r31,(r26),1
115
C Large path (size >= 8).  A 224-byte frame is allocated; the stores below
C write offsets 0 through 64 of it.
116$Large:
117	lda	r30,	-224(r30)
118	stq	r26,	0(r30)
119	stq	r9,	8(r30)
120	stq	r10,	16(r30)
121	stq	r11,	24(r30)
122	stq	r12,	32(r30)
123	stq	r13,	40(r30)
124	stq	r14,	48(r30)
125	stq	r15,	56(r30)
126	stq	r29,	64(r30)
127
128	and	r18,	7,	r20	C count for the first loop, 0-7
129	srl	r18,	3,	r18	C count for unrolled loop
130	bis	r31,	r31,	r21	C clear carry limb
131	beq	r20,	$L_8_or_more	C skip first loop
132
C Leading (size mod 8) limbs, one per iteration, with the carry limb kept in
C r21.  Despite the label name this is entered whenever size mod 8 != 0.
133$L_9_or_more:
134	ldq	r2,0(r17)	C r2 = s1_limb
135	lda	r17,8(r17)	C s1_ptr++
136	lda	r20,-1(r20)	C size--
137	mulq	r2,r19,r3	C r3 = prod_low
138	umulh	r2,r19,r21	C r21 = prod_high
139	beq	r20,$Le1b	C jump if size was == 1
140	bis	r31, r31, r0	C FIXME: shouldn't need this
141	ldq	r2,0(r17)	C r2 = s1_limb
142	lda	r17,8(r17)	C s1_ptr++
143	lda	r20,-1(r20)	C size--
144	stq	r3,0(r16)
145	lda	r16,8(r16)	C res_ptr++
146	beq	r20,$Le2b	C jump if size was == 2
147	ALIGN(8)
148$Lopb:	mulq	r2,r19,r3	C r3 = prod_low
149	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
150	lda	r20,-1(r20)	C size--
151	umulh	r2,r19,r21	C r21 = prod_high
152	ldq	r2,0(r17)	C r2 = s1_limb
153	lda	r17,8(r17)	C s1_ptr++
154	addq	r3,r0,r3	C r3 = cy_limb + prod_low
155	stq	r3,0(r16)
156	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
157	lda	r16,8(r16)	C res_ptr++
158	bne	r20,$Lopb
159
C Last of the leading limbs; leaves the carry limb for the unrolled code
C in r21.
160$Le2b:	mulq	r2,r19,r3	C r3 = prod_low
161	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
162	umulh	r2,r19,r21	C r21 = prod_high
163	addq	r3,r0,r3	C r3 = cy_limb + prod_low
164	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
165	stq	r3,0(r16)
166	lda	r16,8(r16)	C res_ptr++
167	addq	r21,r0,r21	C cy_limb = prod_high + cy
168	br	r31,	$L_8_or_more
C Exactly one leading limb: store its low limb, r21 already = prod_high.
169$Le1b:	stq	r3,0(r16)
170	lda	r16,8(r16)	C res_ptr++
171
C Feed-in for the unrolled code.  On entry r21 = carry limb from the leading
C limbs (0 if there were none) and r18 = size div 8.  The first 8 limbs are
C loaded and their multiplies started before any result is stored.
172$L_8_or_more:
173	lda	r0,	-1(r31)		C put -1 in r0, for tricky loop control
174	lda	r17,	-32(r17)	C L1 bookkeeping
175	lda	r18,	-1(r18)		C decrement count
176
177	ldq	r9,	32(r17)		C L1
178	ldq	r10,	40(r17)		C L1
179	mulq	r9,	r19,	r22	C U1 #07
180	ldq	r11,	48(r17)		C L1
181	umulh	r9,	r19,	r23	C U1 #08
182	ldq	r12,	56(r17)		C L1
183	mulq	r10,	r19,	r24	C U1 #09
184	ldq	r9,	64(r17)		C L1
185
186	lda	r17,	64(r17)		C L1 bookkeeping
187
188	umulh	r10,	r19,	r25	C U1 #11
189	mulq	r11,	r19,	r26	C U1 #12
190	umulh	r11,	r19,	r27	C U1 #13
191	mulq	r12,	r19,	r28	C U1 #14
192	ldq	r10,	8(r17)		C L1
193	umulh	r12,	r19,	r1	C U1 #15
194	ldq	r11,	16(r17)		C L1
195	mulq	r9,	r19,	r2	C U1 #16
196	ldq	r12,	24(r17)		C L1
197	umulh	r9,	r19,	r3	C U1 #17
198	addq	r21,	r22,	r13	C L1 mov
199	mulq	r10,	r19,	r4	C U1 #18
200	addq	r23,	r24,	r22	C L0 sum 2 mul's
201	cmpult	r13,	r21,	r14	C L1 carry from sum
202	bgt	r18,	$L_16_or_more
203
C Only one group of 8 limbs: finish its multiplies with no further loads and
C join the wind-down code at $ret0c.
204	cmpult	r22,	r24,	r24	C U0 carry from sum
205	umulh	r10,	r19,	r5	C U1 #02
206	addq	r25,	r26,	r23	C U0 sum 2 mul's
207	mulq	r11,	r19,	r6	C U1 #03
208	cmpult	r23,	r26,	r25	C U0 carry from sum
209	umulh	r11,	r19,	r7	C U1 #04
210	addq	r27,	r28,	r28	C U0 sum 2 mul's
211	mulq	r12,	r19,	r8	C U1 #05
212	cmpult	r28,	r27,	r15	C L0 carry from sum
213	lda	r16,	32(r16)		C L1 bookkeeping
214	addq	r13,	r31,	r13	C U0 start carry cascade
215	umulh	r12,	r19,	r21	C U1 #06
216	br	r31,	$ret0c
217
C Two or more groups of 8 limbs.  This wind-up code mirrors one iteration of
C $Loop, except that it has no stores at 0..24(r16), advances r16 by 32
C instead of 64, and adds no carry from a previous group into r13.
218$L_16_or_more:
219C ---------------------------------------------------------------
220	subq	r18,1,r18
221	cmpult	r22,	r24,	r24	C U0 carry from sum
222	ldq	r9,	32(r17)		C L1
223
224	umulh	r10,	r19,	r5	C U1 #02
225	addq	r25,	r26,	r23	C U0 sum 2 mul's
226	mulq	r11,	r19,	r6	C U1 #03
227	cmpult	r23,	r26,	r25	C U0 carry from sum
228	umulh	r11,	r19,	r7	C U1 #04
229	addq	r27,	r28,	r28	C U0 sum 2 mul's
230	mulq	r12,	r19,	r8	C U1 #05
231	cmpult	r28,	r27,	r15	C L0 carry from sum
232	lda	r16,	32(r16)		C L1 bookkeeping
233	addq	r13,	r31,	r13	C U0 start carry cascade
234
235	umulh	r12,	r19,	r21	C U1 #06
236C	beq	r13,	$fix0w		C U0
237$ret0w:	addq	r22,	r14,	r26	C L0
238	ldq	r10,	40(r17)		C L1
239
240	mulq	r9,	r19,	r22	C U1 #07
241	beq	r26,	$fix1w		C U0
242$ret1w:	addq	r23,	r24,	r27	C L0
243	ldq	r11,	48(r17)		C L1
244
245	umulh	r9,	r19,	r23	C U1 #08
246	beq	r27,	$fix2w		C U0
247$ret2w:	addq	r28,	r25,	r28	C L0
248	ldq	r12,	56(r17)		C L1
249
250	mulq	r10,	r19,	r24	C U1 #09
251	beq	r28,	$fix3w		C U0
252$ret3w:	addq	r1,	r2,	r20	C L0 sum 2 mul's
253	ldq	r9,	64(r17)		C L1
254
255	addq	r3,	r4,	r2	C L0 #10 2 mul's
256	lda	r17,	64(r17)		C L1 bookkeeping
257	cmpult	r20,	r1,	r29	C U0 carry from sum
258
259	umulh	r10,	r19,	r25	C U1 #11
260	cmpult	r2,	r4,	r4	C U0 carry from sum
261	stq	r13,	-32(r16)	C L0
262	stq	r26,	-24(r16)	C L1
263
264	mulq	r11,	r19,	r26	C U1 #12
265	addq	r5,	r6,	r14	C U0 sum 2 mul's
266	stq	r27,	-16(r16)	C L0
267	stq	r28,	-8(r16)		C L1
268
269	umulh	r11,	r19,	r27	C U1 #13
270	cmpult	r14,	r6,	r3	C U0 carry from sum
271C could do cross-jumping here:
272C	bra	$L_middle_of_unrolled_loop
273	mulq	r12,	r19,	r28	C U1 #14
274	addq	r7,	r3,	r5	C L0 eat carry
275	addq	r20,	r15,	r20	C U0 carry cascade
276	ldq	r10,	8(r17)		C L1
277
C The three fix-up branches below target $fix4-$fix6, which return to
C $ret4-$ret6 inside $Loop rather than to $ret4w-$ret6w.  The instructions
C from $ret4 onward match those from $ret4w onward, and both sequences end by
C going to $Loop when r18 > 0 and to $Lend otherwise.
278	umulh	r12,	r19,	r1	C U1 #15
279	beq	r20,	$fix4		C U0
280$ret4w:	addq	r2,	r29,	r6	C L0
281	ldq	r11,	16(r17)		C L1
282
283	mulq	r9,	r19,	r2	C U1 #16
284	beq	r6,	$fix5		C U0
285$ret5w:	addq	r14,	r4,	r7	C L0
286	ldq	r12,	24(r17)		C L1
287
288	umulh	r9,	r19,	r3	C U1 #17
289	beq	r7,	$fix6		C U0
290$ret6w:	addq	r5,	r8,	r8	C L0 sum 2
291	addq	r21,	r22,	r13	C L1 sum 2 mul's
292
293	mulq	r10,	r19,	r4	C U1 #18
294	addq	r23,	r24,	r22	C L0 sum 2 mul's
295	cmpult	r13,	r21,	r14	C L1 carry from sum
296	ble	r18,	$Lend		C U0
297C ---------------------------------------------------------------
298	ALIGN(16)
C Main loop.  Each iteration loads 8 source limbs and stores 8 result limbs:
C four at 0..24(r16) before r16 is advanced by 64, four at -32..-8(r16) after.
C r0 = -1, so the umulh at the top yields the high half of (2^64-1)*r18,
C which is r18-1 for r18 >= 1: the loop counter is decremented by a multiply.
299$Loop:
300	umulh	r0,	r18,	r18	C U1 #01 decrement r18!
301	cmpult	r8,	r5,	r29	C L0 carry from last bunch
302	cmpult	r22,	r24,	r24	C U0 carry from sum
303	ldq	r9,	32(r17)		C L1
304
305	umulh	r10,	r19,	r5	C U1 #02
306	addq	r25,	r26,	r23	C U0 sum 2 mul's
307	stq	r20,	0(r16)		C L0
308	stq	r6,	8(r16)		C L1
309
310	mulq	r11,	r19,	r6	C U1 #03
311	cmpult	r23,	r26,	r25	C U0 carry from sum
312	stq	r7,	16(r16)		C L0
313	stq	r8,	24(r16)		C L1
314
315	umulh	r11,	r19,	r7	C U1 #04
316	bis	r31,	r31,	r31	C L0 st slosh
317	bis	r31,	r31,	r31	C L1 st slosh
318	addq	r27,	r28,	r28	C U0 sum 2 mul's
319
320	mulq	r12,	r19,	r8	C U1 #05
321	cmpult	r28,	r27,	r15	C L0 carry from sum
322	lda	r16,	64(r16)		C L1 bookkeeping
323	addq	r13,	r29,	r13	C U0 start carry cascade
324
C Carry cascade: each sum gets the previous carry bit added; if the result is
C zero the out-of-line $fixN code merges that carry bit into the next one.
325	umulh	r12,	r19,	r21	C U1 #06
326	beq	r13,	$fix0		C U0
327$ret0:	addq	r22,	r14,	r26	C L0
328	ldq	r10,	40(r17)		C L1
329
330	mulq	r9,	r19,	r22	C U1 #07
331	beq	r26,	$fix1		C U0
332$ret1:	addq	r23,	r24,	r27	C L0
333	ldq	r11,	48(r17)		C L1
334
335	umulh	r9,	r19,	r23	C U1 #08
336	beq	r27,	$fix2		C U0
337$ret2:	addq	r28,	r25,	r28	C L0
338	ldq	r12,	56(r17)		C L1
339
340	mulq	r10,	r19,	r24	C U1 #09
341	beq	r28,	$fix3		C U0
342$ret3:	addq	r1,	r2,	r20	C L0 sum 2 mul's
343	ldq	r9,	64(r17)		C L1
344
345	addq	r3,	r4,	r2	C L0 #10 2 mul's
346	bis	r31,	r31,	r31	C U1 mul hole
347	lda	r17,	64(r17)		C L1 bookkeeping
348	cmpult	r20,	r1,	r29	C U0 carry from sum
349
350	umulh	r10,	r19,	r25	C U1 #11
351	cmpult	r2,	r4,	r4	C U0 carry from sum
352	stq	r13,	-32(r16)	C L0
353	stq	r26,	-24(r16)	C L1
354
355	mulq	r11,	r19,	r26	C U1 #12
356	addq	r5,	r6,	r14	C U0 sum 2 mul's
357	stq	r27,	-16(r16)	C L0
358	stq	r28,	-8(r16)		C L1
359
360	umulh	r11,	r19,	r27	C U1 #13
361	bis	r31,	r31,	r31	C L0 st slosh
362	bis	r31,	r31,	r31	C L1 st slosh
363	cmpult	r14,	r6,	r3	C U0 carry from sum
364$L_middle_of_unrolled_loop:
365	mulq	r12,	r19,	r28	C U1 #14
366	addq	r7,	r3,	r5	C L0 eat carry
367	addq	r20,	r15,	r20	C U0 carry cascade
368	ldq	r10,	8(r17)		C L1
369
370	umulh	r12,	r19,	r1	C U1 #15
371	beq	r20,	$fix4		C U0
372$ret4:	addq	r2,	r29,	r6	C L0
373	ldq	r11,	16(r17)		C L1
374
375	mulq	r9,	r19,	r2	C U1 #16
376	beq	r6,	$fix5		C U0
377$ret5:	addq	r14,	r4,	r7	C L0
378	ldq	r12,	24(r17)		C L1
379
380	umulh	r9,	r19,	r3	C U1 #17
381	beq	r7,	$fix6		C U0
382$ret6:	addq	r5,	r8,	r8	C L0 sum 2
383	addq	r21,	r22,	r13	C L1 sum 2 mul's
384
385	mulq	r10,	r19,	r4	C U1 #18
386	addq	r23,	r24,	r22	C L0 sum 2 mul's
387	cmpult	r13,	r21,	r14	C L1 carry from sum
388	bgt	r18,	$Loop		C U0
389C ---------------------------------------------------------------
C Wind-down: same arithmetic and stores as a loop iteration, but with no
C further loads from s1_ptr.
390$Lend:
391	cmpult	r8,	r5,	r29	C L0 carry from last bunch
392	cmpult	r22,	r24,	r24	C U0 carry from sum
393
394	umulh	r10,	r19,	r5	C U1 #02
395	addq	r25,	r26,	r23	C U0 sum 2 mul's
396	stq	r20,	0(r16)		C L0
397	stq	r6,	8(r16)		C L1
398
399	mulq	r11,	r19,	r6	C U1 #03
400	cmpult	r23,	r26,	r25	C U0 carry from sum
401	stq	r7,	16(r16)		C L0
402	stq	r8,	24(r16)		C L1
403
404	umulh	r11,	r19,	r7	C U1 #04
405	addq	r27,	r28,	r28	C U0 sum 2 mul's
406
407	mulq	r12,	r19,	r8	C U1 #05
408	cmpult	r28,	r27,	r15	C L0 carry from sum
409	lda	r16,	64(r16)		C L1 bookkeeping
410	addq	r13,	r29,	r13	C U0 start carry cascade
411
412	umulh	r12,	r19,	r21	C U1 #06
413	beq	r13,	$fix0c		C U0
C $ret0c is also the entry point for the single-group case branched to from
C the feed-in code.
414$ret0c:	addq	r22,	r14,	r26	C L0
415	beq	r26,	$fix1c		C U0
416$ret1c:	addq	r23,	r24,	r27	C L0
417	beq	r27,	$fix2c		C U0
418$ret2c:	addq	r28,	r25,	r28	C L0
419	beq	r28,	$fix3c		C U0
420$ret3c:	addq	r1,	r2,	r20	C L0 sum 2 mul's
421	addq	r3,	r4,	r2	C L0 #10 2 mul's
422	lda	r17,	64(r17)		C L1 bookkeeping
423	cmpult	r20,	r1,	r29	C U0 carry from sum
424	cmpult	r2,	r4,	r4	C U0 carry from sum
425	stq	r13,	-32(r16)	C L0
426	stq	r26,	-24(r16)	C L1
427	addq	r5,	r6,	r14	C U0 sum 2 mul's
428	stq	r27,	-16(r16)	C L0
429	stq	r28,	-8(r16)		C L1
430	cmpult	r14,	r6,	r3	C U0 carry from sum
431	addq	r7,	r3,	r5	C L0 eat carry
432	addq	r20,	r15,	r20	C U0 carry cascade
433	beq	r20,	$fix4c		C U0
434$ret4c:	addq	r2,	r29,	r6	C L0
435	beq	r6,	$fix5c		C U0
436$ret5c:	addq	r14,	r4,	r7	C L0
437	beq	r7,	$fix6c		C U0
438$ret6c:	addq	r5,	r8,	r8	C L0 sum 2
439	cmpult	r8,	r5,	r29	C L0 carry from last bunch
440	stq	r20,	0(r16)		C L0
441	stq	r6,	8(r16)		C L1
442	stq	r7,	16(r16)		C L0
443	stq	r8,	24(r16)		C L1
444	addq	r29,	r21,	r0	C return value = last carry + last prod_high
445
C Restore the registers saved at $Large and release the frame.
446	ldq	r26,	0(r30)
447	ldq	r9,	8(r30)
448	ldq	r10,	16(r30)
449	ldq	r11,	24(r30)
450	ldq	r12,	32(r30)
451	ldq	r13,	40(r30)
452	ldq	r14,	48(r30)
453	ldq	r15,	56(r30)
454	ldq	r29,	64(r30)
455	lda	r30,	224(r30)
456	ret	r31,	(r26),	1
457
C Out-of-line carry fix-ups.  Each one is reached when a sum came out as zero
C after the incoming carry bit was added.  It ORs that incoming carry bit into
C the carry for the next limb (for $fix6/$fix6c it is added to r5 instead) and
C branches back to the matching $retN label.  The "w" variants serve the
C wind-up code, the "c" variants the wind-down code, and the plain ones the
C loop (plus, for $fix4-$fix6, the wind-up code).
458C $fix0w:	bis	r14,	r29,	r14	C join carries
459C	br	r31,	$ret0w
460$fix1w:	bis	r24,	r14,	r24	C join carries
461	br	r31,	$ret1w
462$fix2w:	bis	r25,	r24,	r25	C join carries
463	br	r31,	$ret2w
464$fix3w:	bis	r15,	r25,	r15	C join carries
465	br	r31,	$ret3w
466$fix0:	bis	r14,	r29,	r14	C join carries
467	br	r31,	$ret0
468$fix1:	bis	r24,	r14,	r24	C join carries
469	br	r31,	$ret1
470$fix2:	bis	r25,	r24,	r25	C join carries
471	br	r31,	$ret2
472$fix3:	bis	r15,	r25,	r15	C join carries
473	br	r31,	$ret3
474$fix4:	bis	r29,	r15,	r29	C join carries
475	br	r31,	$ret4
476$fix5:	bis	r4,	r29,	r4	C join carries
477	br	r31,	$ret5
478$fix6:	addq	r5,	r4,	r5	C can't carry twice!
479	br	r31,	$ret6
480$fix0c:	bis	r14,	r29,	r14	C join carries
481	br	r31,	$ret0c
482$fix1c:	bis	r24,	r14,	r24	C join carries
483	br	r31,	$ret1c
484$fix2c:	bis	r25,	r24,	r25	C join carries
485	br	r31,	$ret2c
486$fix3c:	bis	r15,	r25,	r15	C join carries
487	br	r31,	$ret3c
488$fix4c:	bis	r29,	r15,	r29	C join carries
489	br	r31,	$ret4c
490$fix5c:	bis	r4,	r29,	r4	C join carries
491	br	r31,	$ret5c
492$fix6c:	addq	r5,	r4,	r5	C can't carry twice!
493	br	r31,	$ret6c
494
495EPILOGUE(mpn_mul_1)
496ASM_END()
497