dnl  Alpha ev6 nails mpn_mul_1.

dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:    42
C EV5:    18
C EV6:     3.25

C TODO
C  * Reroll loop for 3.0 c/l with current 4-way unrolling.
C  * The loop is overscheduled with respect to loads and to multiplies, in
C    particular umulh.
C  * Use an FP loop count and multiple exit points; that would simplify feed-in
C    lp0 and would work since the loop structure is very regular.

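C  What the routine computes: {rp,n} = {up,n} * vl0, returning the most
C  significant limb of the product.  With nails, each limb keeps its high
C  GMP_NAIL_BITS bits zero and carries GMP_NUMB_BITS significant bits.  The
C  code pre-shifts vl0 left by NAIL_BITS so that umulh yields the product's
C  high numb part directly, while srl by NAIL_BITS recovers the low numb part
C  from the mulq result.
C
C  A rough C sketch of the same computation, for orientation only (it uses the
C  gmp-impl.h/longlong.h names GMP_NAIL_BITS, GMP_NUMB_BITS, GMP_NUMB_MASK and
C  umul_ppmm, and is not the code below):
C
C	mp_limb_t
C	mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C	{
C	  mp_limb_t shifted_vl = vl << GMP_NAIL_BITS;
C	  mp_limb_t prev_hpl = 0, cl = 0;
C	  do
C	    {
C	      mp_limb_t ul = *up++, hpl, lpl, xw;
C	      umul_ppmm (hpl, lpl, ul, shifted_vl);	/* 128-bit product */
C	      lpl >>= GMP_NAIL_BITS;			/* low numb part */
C	      xw = prev_hpl + lpl + cl;
C	      cl = xw >> GMP_NUMB_BITS;			/* carry, 0 or 1 */
C	      *rp++ = xw & GMP_NUMB_MASK;
C	      prev_hpl = hpl;				/* high numb part */
C	    }
C	  while (--n != 0);
C	  return prev_hpl + cl;
C	}
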
C  INPUT PARAMETERS
define(`rp',`r16')
define(`up',`r17')
define(`n', `r18')
define(`vl0',`r19')

define(`numb_mask',`r6')

define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r22')
define(`m3b',`r23')

define(`acc0',`r25')
define(`acc1',`r27')

define(`ul0',`r4')
define(`ul1',`r5')
define(`ul2',`r4')
define(`ul3',`r5')

define(`rl0',`r24')
define(`rl1',`r24')
define(`rl2',`r24')
define(`rl3',`r24')

define(`t0',`r7')
define(`t1',`r8')

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

dnl  This declaration is munged by configure
NAILS_SUPPORT(1-63)

ASM_START()
PROLOGUE(mpn_mul_1)
	sll	vl0, NAIL_BITS, vl0
	lda	numb_mask, -1(r31)
	srl	numb_mask, NAIL_BITS, numb_mask

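C  Dispatch on n mod 4 to one of four feed-in paths for the 4-way unrolled loop.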
	and	n,	3,	r25
	cmpeq	r25,	1,	r21
	bne	r21,	L(1m4)
	cmpeq	r25,	2,	r21
	bne	r21,	L(2m4)
	beq	r25,	L(0m4)

L(3m4):	ldq	ul3,	0(up)
	lda	n,	-4(n)
	ldq	ul0,	8(up)
	mulq	vl0,	ul3,	m3a
	umulh	vl0,	ul3,	m3b
	ldq	ul1,	16(up)
	lda	up,	24(up)
	lda	rp,	-8(rp)
	mulq	vl0,	ul0,	m0a
	umulh	vl0,	ul0,	m0b
	bge	n,	L(ge3)

	mulq	vl0,	ul1,	m1a
	umulh	vl0,	ul1,	m1b
	srl	m3a,NAIL_BITS,	t0
	addq	t0,	r31,	acc1
	srl	m0a,NAIL_BITS,	t0
	addq	t0,	m3b,	acc0
	srl	acc1,NUMB_BITS,	t1
	br	r31,	L(ta3)

L(ge3):	ldq	ul2,	0(up)
	mulq	vl0,	ul1,	m1a
	umulh	vl0,	ul1,	m1b
	srl	m3a,NAIL_BITS,	t0
	ldq	ul3,	8(up)
	lda	n,	-4(n)
	mulq	vl0,	ul2,	m2a
	addq	t0,	r31,	acc1
	umulh	vl0,	ul2,	m2b
	srl	m0a,NAIL_BITS,	t0
	ldq	ul0,	16(up)
	mulq	vl0,	ul3,	m3a
	addq	t0,	m3b,	acc0
	srl	acc1,NUMB_BITS,	t1
	br	r31,	L(el3)

L(0m4):	lda	n,	-8(n)
	ldq	ul2,	0(up)
	ldq	ul3,	8(up)
	mulq	vl0,	ul2,	m2a
	umulh	vl0,	ul2,	m2b
	ldq	ul0,	16(up)
	mulq	vl0,	ul3,	m3a
	umulh	vl0,	ul3,	m3b
	ldq	ul1,	24(up)
	lda	up,	32(up)
	mulq	vl0,	ul0,	m0a
	umulh	vl0,	ul0,	m0b
	bge	n,	L(ge4)

	srl	m2a,NAIL_BITS,	t0
	mulq	vl0,	ul1,	m1a
	addq	t0,	r31,	acc0
	umulh	vl0,	ul1,	m1b
	srl	m3a,NAIL_BITS,	t0
	addq	t0,	m2b,	acc1
	srl	acc0,NUMB_BITS,	t1
	br	r31,	L(ta4)

L(ge4):	srl	m2a,NAIL_BITS,	t0
	ldq	ul2,	0(up)
	mulq	vl0,	ul1,	m1a
	addq	t0,	r31,	acc0
	umulh	vl0,	ul1,	m1b
	srl	m3a,NAIL_BITS,	t0
	ldq	ul3,	8(up)
	lda	n,	-4(n)
	mulq	vl0,	ul2,	m2a
	addq	t0,	m2b,	acc1
	srl	acc0,NUMB_BITS,	t1
	br	r31,	L(el0)

L(2m4):	lda	n,	-4(n)
	ldq	ul0,	0(up)
	ldq	ul1,	8(up)
	lda	up,	16(up)
	lda	rp,	-16(rp)
	mulq	vl0,	ul0,	m0a
	umulh	vl0,	ul0,	m0b
	bge	n,	L(ge2)

	mulq	vl0,	ul1,	m1a
	umulh	vl0,	ul1,	m1b
	srl	m0a,NAIL_BITS,	t0
	addq	t0,	r31,	acc0
	srl	m1a,NAIL_BITS,	t0
	addq	t0,	m0b,	acc1
	srl	acc0,NUMB_BITS,	t1
	br	r31,	L(ta2)

L(ge2):	ldq	ul2,	0(up)
	mulq	vl0,	ul1,	m1a
	umulh	vl0,	ul1,	m1b
	ldq	ul3,	8(up)
	lda	n,	-4(n)
	mulq	vl0,	ul2,	m2a
	umulh	vl0,	ul2,	m2b
	srl	m0a,NAIL_BITS,	t0
	ldq	ul0,	16(up)
	mulq	vl0,	ul3,	m3a
	addq	t0,	r31,	acc0
	umulh	vl0,	ul3,	m3b
	srl	m1a,NAIL_BITS,	t0
	ldq	ul1,	24(up)
	lda	up,	32(up)
	lda	rp,	32(rp)
	mulq	vl0,	ul0,	m0a
	addq	t0,	m0b,	acc1
	srl	acc0,NUMB_BITS,	t1
	bge	n,	L(el2)

	br	r31,	L(ta6)

L(1m4):	lda	n,	-4(n)
	ldq	ul1,	0(up)
	lda	up,	8(up)
	lda	rp,	-24(rp)
	bge	n,	L(ge1)

	mulq	vl0,	ul1,	m1a
	umulh	vl0,	ul1,	m1b
	srl	m1a,NAIL_BITS,	t0
	addq	t0,	r31,	acc1
	and	acc1,numb_mask,	r28
	srl	acc1,NUMB_BITS,	t1
	stq	r28,	24(rp)
	addq	t1,	m1b,	r0
	ret	r31,	(r26),	1

L(ge1):	ldq	ul2,	0(up)
	mulq	vl0,	ul1,	m1a
	umulh	vl0,	ul1,	m1b
	ldq	ul3,	8(up)
	lda	n,	-4(n)
	mulq	vl0,	ul2,	m2a
	umulh	vl0,	ul2,	m2b
	ldq	ul0,	16(up)
	mulq	vl0,	ul3,	m3a
	umulh	vl0,	ul3,	m3b
	srl	m1a,NAIL_BITS,	t0
	ldq	ul1,	24(up)
	lda	up,	32(up)
	lda	rp,	32(rp)
	mulq	vl0,	ul0,	m0a
	addq	t0,	r31,	acc1
	umulh	vl0,	ul0,	m0b
	srl	m2a,NAIL_BITS,	t0
	mulq	vl0,	ul1,	m1a
	addq	t0,	m1b,	acc0
	srl	acc1,NUMB_BITS,	t1
	blt	n,	L(ta5)

L(ge5):	ldq	ul2,	0(up)
	br	r31,	L(el1)

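C  Software-pipelined main loop, 4 limbs per iteration.  The U1/L0/U0/L1
C  annotations mark the intended EV6 issue slot for each instruction.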
	ALIGN(16)
L(top):	mulq	vl0,	ul0,	m0a		C U1
	addq	t0,	m0b,	acc1		C L0
	srl	acc0,NUMB_BITS,	t1		C U0
	stq	r28,	-24(rp)			C L1
C
L(el2):	umulh	vl0,	ul0,	m0b		C U1
	and	acc0,numb_mask,	r28		C L0
	unop					C U0
	unop					C L1
C
	unop					C U1
	addq	t1,	acc1,	acc1		C L0
	srl	m2a,NAIL_BITS,	t0		C U0
	ldq	ul2,	0(up)			C L1
C
	mulq	vl0,	ul1,	m1a		C U1
	addq	t0,	m1b,	acc0		C L0
	srl	acc1,NUMB_BITS,	t1		C U0
	stq	r28,	-16(rp)			C L1
C
L(el1):	umulh	vl0,	ul1,	m1b		C U1
	and	acc1,numb_mask,	r28		C L0
	unop					C U0
	lda	n,	-4(n)			C L1
C
	unop					C U1
	addq	t1,	acc0,	acc0		C L0
	srl	m3a,NAIL_BITS,	t0		C U0
	ldq	ul3,	8(up)			C L1
C
	mulq	vl0,	ul2,	m2a		C U1
	addq	t0,	m2b,	acc1		C L0
	srl	acc0,NUMB_BITS,	t1		C U0
	stq	r28,	-8(rp)			C L1
C
L(el0):	umulh	vl0,	ul2,	m2b		C U1
	and	acc0,numb_mask,	r28		C L0
	unop					C U0
	unop					C L1
C
	unop					C U1
	addq	t1,	acc1,	acc1		C L0
	srl	m0a,NAIL_BITS,	t0		C U0
	ldq	ul0,	16(up)			C L1
C
	mulq	vl0,	ul3,	m3a		C U1
	addq	t0,	m3b,	acc0		C L0
	srl	acc1,NUMB_BITS,	t1		C U0
	stq	r28,	0(rp)			C L1
C
L(el3):	umulh	vl0,	ul3,	m3b		C U1
	and	acc1,numb_mask,	r28		C L0
	unop					C U0
	unop					C L1
C
	unop					C U1
	addq	t1,	acc0,	acc0		C L0
	srl	m1a,NAIL_BITS,	t0		C U0
	ldq	ul1,	24(up)			C L1
C
	lda	up,	32(up)			C L0
	unop					C U1
	lda	rp,	32(rp)			C L1
	bge	n,	L(top)			C U0

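C  Wind-down: L(end) and the L(ta*) tails flush the products still in flight,
C  store the remaining limbs, and leave the carry-out limb in r0.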
L(end):	mulq	vl0,	ul0,	m0a
	addq	t0,	m0b,	acc1
	srl	acc0,NUMB_BITS,	t1
	stq	r28,	-24(rp)
L(ta6):	umulh	vl0,	ul0,	m0b
	and	acc0,numb_mask,	r28
	addq	t1,	acc1,	acc1
	srl	m2a,NAIL_BITS,	t0
	mulq	vl0,	ul1,	m1a
	addq	t0,	m1b,	acc0
	srl	acc1,NUMB_BITS,	t1
	stq	r28,	-16(rp)
L(ta5):	umulh	vl0,	ul1,	m1b
	and	acc1,numb_mask,	r28
	addq	t1,	acc0,	acc0
	srl	m3a,NAIL_BITS,	t0
	addq	t0,	m2b,	acc1
	srl	acc0,NUMB_BITS,	t1
	stq	r28,	-8(rp)
	ALIGN(16)
L(ta4):	and	acc0,numb_mask,	r28
	addq	t1,	acc1,	acc1
	srl	m0a,NAIL_BITS,	t0
	addq	t0,	m3b,	acc0
	srl	acc1,NUMB_BITS,	t1
	stq	r28,	0(rp)
	unop
	ALIGN(16)
L(ta3):	and	acc1,numb_mask,	r28
	addq	t1,	acc0,	acc0
	srl	m1a,NAIL_BITS,	t0
	addq	t0,	m0b,	acc1
	srl	acc0,NUMB_BITS,	t1
	stq	r28,	8(rp)
	unop
	ALIGN(16)
L(ta2):	and	acc0,numb_mask,	r28
	addq	t1,	acc1,	acc1
	srl	acc1,NUMB_BITS,	t1
	stq	r28,	16(rp)
	and	acc1,numb_mask,	r28
	addq	t1,	m1b,	r0
	stq	r28,	24(rp)
	ret	r31,	(r26),	1
EPILOGUE()
ASM_END()