1dnl  Alpha ev6 mpn_addmul_1 and mpn_submul_1.
2
3dnl  Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C      cycles/limb
34C EV4:    42
35C EV5:    18
36C EV6:     3.5
37
38C  INPUT PARAMETERS
C    rp  r16  destination limb pointer; limbs are loaded, updated and stored
C    up  r17  source limb pointer; limbs are only loaded
C    n   r18  limb count (the code below refers to r18 directly)
C    v0  r19  the single limb every source limb is multiplied by
39define(`rp',	`r16')
40define(`up',	`r17')
41define(`n',	`r18')
42define(`v0',	`r19')
43
44dnl  This code was written in cooperation with ev6 pipeline expert Steve Root.
45
46dnl  The stores can issue a cycle late so we have paired no-op's to 'catch'
47dnl  them, so that further disturbance to the schedule is damped.
48
49dnl  We couldn't pair the loads, because the entangled schedule of the carries
50dnl  has to happen on one side {0} of the machine.
51
52dnl  This is a great schedule for the d_cache, a poor schedule for the b_cache.
53dnl  The lockup on U0 means that any stall can't be recovered from.  Consider a
54dnl  ldq in L1, say that load gets stalled because it collides with a fill from
55dnl  the b_cache.  On the next cycle, this load gets priority.  It first looks
56dnl  at L0, and goes there.  The instruction we intended for L0 gets to look at
57dnl  L1, which is NOT where we want it.  It either stalls 1, because it can't
58dnl  go in L0, or goes there, and causes a further instruction to stall.
59
60dnl  So for b_cache, we're likely going to want to put one or more cycles back
61dnl  into the code! And, of course, put in lds prefetch for the rp[] operand.
62dnl  At a place where we have an mt followed by a bookkeeping, put the
63dnl  bookkeeping in upper, and the prefetch into lower.
64
65dnl  Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
66dnl  like not to have an ldq or an stq precede a conditional branch in a
67dnl  quadpack.  The conditional branch moves the retire pointer one cycle
68dnl  later.
69
dnl  Operation selection.  Whichever of OPERATION_addmul_1 and
dnl  OPERATION_submul_1 is defined supplies three macros used by the code:
dnl
dnl    ADDSUB  the per-limb arithmetic instruction, addq or subq.
dnl    CMPCY   takes (a, b) and expands to the first part of a cmpult; the
dnl            code appends the destination register.  Everywhere it is used,
dnl            a is an operand of the preceding ADDSUB and b is its result.
dnl            For addmul the test is b < a, i.e. the sum wrapped (carry out).
dnl            For submul the test is a < b, i.e. the difference wrapped
dnl            (borrow out).
dnl    func    the name given to PROLOGUE.
70ifdef(`OPERATION_addmul_1',`
71    define(`ADDSUB',	`addq')
72    define(`CMPCY',	`cmpult	$2,$1')
73    define(`func',	`mpn_addmul_1')
74')
75ifdef(`OPERATION_submul_1',`
76    define(`ADDSUB',	`subq')
77    define(`CMPCY',	`cmpult	$1,$2')
78    define(`func',	`mpn_submul_1')
79')
80
81MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
82
83ASM_START()
C  func (rp, up, n, v0)
C
C  For each limb i, the low half of up[i] * v0 is combined with rp[i] using
C  ADDSUB (addq or subq), the running high limb from position i-1 is combined
C  the same way, and the result is stored back to rp[i].  The carries or
C  borrows detected by CMPCY are added into the high half of the product,
C  which then becomes the running high limb for position i+1.  The last
C  running high limb is left in r0 and the routine returns through r26.
C
C  n must be at least 1.  A zero count has n mod 8 = 0 and is dispatched to
C  $0mod8, which loads and stores eight limbs regardless.
C
C  Layout:
C    - n mod 8 selects one of eight feed-in sequences.  The odd residues
C      first handle a single limb and then share the feed-in of the next
C      lower even residue.
C    - Each even feed-in loads four source limbs, issues their multiplies
C      (the high half of the fourth is computed later), and biases up and rp
C      so that the fixed displacements at its entry point ($Loop, $ent2,
C      $ent0 or $ent6) address the right limbs.
C    - The loop body handles eight limbs per pass as four sections that each
C      store two limbs.
C    - $Lend and L(x) store the last four limbs; $n23 is the short path for
C      n = 2 and n = 3.
C
C  Registers written besides rp, up and r18: r0-r8, r20-r25 and r28.
84PROLOGUE(func)
C  r3 = up[0], r20 = n mod 8, r18 = n - 9.  From here on r18 is the loop
C  counter; it is reduced by 8 once per pass at $ent2.
85	ldq	r3,	0(up)		C
86	and	r18,	7,	r20	C
87	lda	r18,	-9(r18)		C
88	cmpeq	r20,	1,	r21	C
89	beq	r21,	$L1		C
90
C  n mod 8 = 1.  Handle limb 0 on its own and put its outgoing high limb in
C  r0.  If r18 >= 0 (n >= 9) more limbs follow, so continue at $ent1;
C  otherwise n = 1 and r0 is already the result.
91$1mod8:	ldq	r5,	0(rp)		C
92	mulq	v0,	r3,	r7	C
93	umulh	v0,	r3,	r8	C
94	ADDSUB	r5,	r7,	r23	C
95	CMPCY(	r5,	r23),	r20	C
96	addq	r8,	r20,	r0	C
97	stq	r23,	0(rp)		C
98	bge	r18,	$ent1		C
99	ret	r31,	(r26),	1	C
100
C  All other residues.  Depending on the entry point, r8 or r24 is the
C  incoming high limb for the first position, so clear both, then dispatch.
C  The last test sends anything that is not 7 to $0mod8 and lets 7 fall
C  through.
101$L1:	lda	r8,	0(r31)		C zero carry reg
102	lda	r24,	0(r31)		C zero carry reg
103	cmpeq	r20,	2,	r21	C
104	bne	r21,	$2mod8		C
105	cmpeq	r20,	3,	r21	C
106	bne	r21,	$3mod8		C
107	cmpeq	r20,	4,	r21	C
108	bne	r21,	$4mod8		C
109	cmpeq	r20,	5,	r21	C
110	bne	r21,	$5mod8		C
111	cmpeq	r20,	6,	r21	C
112	bne	r21,	$6mod8		C
113	cmpeq	r20,	7,	r21	C
114	beq	r21,	$0mod8		C
115
C  n mod 8 = 7.  Handle one limb, leave its outgoing high limb in r24, step
C  up and rp past it, reload r3 and fall into the 6 mod 8 feed-in.
116$7mod8:	ldq	r5,	0(rp)		C
117	lda	up,	8(up)		C
118	mulq	v0,	r3,	r7	C
119	umulh	v0,	r3,	r24	C
120	ADDSUB	r5,	r7,	r23	C
121	CMPCY(	r5,	r23),	r20	C
122	addq	r24,	r20,	r24	C
123	stq	r23,	0(rp)		C
124	lda	rp,	8(rp)		C
125	ldq	r3,	0(up)		C
C  n mod 8 = 6.  Feed-in for $ent6: products for the first limbs go to
C  r25/r3, r28/r8, r2/r6 and r7, the first low sum is formed in r25, and
C  up is biased by +48 and rp by -32 (the section at $ent6 adds 64 to rp).
126$6mod8:	ldq	r1,	8(up)		C
127	mulq	v0,	r3,	r25	C
128	umulh	v0,	r3,	r3	C
129	mulq	v0,	r1,	r28	C
130	ldq	r0,	16(up)		C
131	ldq	r4,	0(rp)		C
132	umulh	v0,	r1,	r8	C
133	ldq	r1,	24(up)		C
134	lda	up,	48(up)		C L1 bookkeeping
135	mulq	v0,	r0,	r2	C
136	ldq	r5,	8(rp)		C
137	lda	rp,	-32(rp)		C L1 bookkeeping
138	umulh	v0,	r0,	r6	C
139	ADDSUB	r4,	r25,	r25	C lo + acc
140	mulq	v0,	r1,	r7	C
141	br	r31,	$ent6		C
142
C  Continuation of the 1 mod 8 case when n >= 9: step up and rp past the
C  limb already stored, copy its outgoing high limb from r0 to r8, reload r3
C  and fall into the 0 mod 8 feed-in.
143$ent1:	lda	up,	8(up)		C
144	lda	rp,	8(rp)		C
145	lda	r8,	0(r0)		C
146	ldq	r3,	0(up)		C
C  n mod 8 = 0.  Feed-in for $ent0: products go to r2/r6, r7/r24, r25/r3 and
C  r28, the first low sum is formed in r2, and rp is biased by -16.  up is
C  left alone because the section at $ent0 adds 64 to it before loading.
147$0mod8:	ldq	r1,	8(up)		C
148	mulq	v0,	r3,	r2	C
149	umulh	v0,	r3,	r6	C
150	mulq	v0,	r1,	r7	C
151	ldq	r0,	16(up)		C
152	ldq	r4,	0(rp)		C
153	umulh	v0,	r1,	r24	C
154	ldq	r1,	24(up)		C
155	mulq	v0,	r0,	r25	C
156	ldq	r5,	8(rp)		C
157	umulh	v0,	r0,	r3	C
158	ADDSUB	r4,	r2,	r2	C lo + acc
159	mulq	v0,	r1,	r28	C
160	lda	rp,	-16(rp)		C
161	br	r31,	$ent0		C
162
C  n mod 8 = 3.  Handle one limb, leave its outgoing high limb in r24, step
C  up and rp past it, reload r3 and fall into the 2 mod 8 feed-in.
163$3mod8:	ldq	r5,	0(rp)		C
164	lda	up,	8(up)		C
165	mulq	v0,	r3,	r7	C
166	umulh	v0,	r3,	r8	C
167	ADDSUB	r5,	r7,	r23	C
168	CMPCY(	r5,	r23),	r20	C
169	addq	r8,	r20,	r24	C
170	stq	r23,	0(rp)		C
171	lda	rp,	8(rp)		C
172	ldq	r3,	0(up)		C
C  n mod 8 = 2.  If r18 <= 0 (n is 2 or 3) only two limbs remain and they
C  are finished at $n23.  Otherwise this is the feed-in for $ent2, with the
C  same register assignment as the 6 mod 8 feed-in; up is biased by +16 and
C  the lda on rp adds 0, leaving rp unchanged.
173$2mod8:	ldq	r1,	8(up)		C
174	mulq	v0,	r3,	r25	C
175	umulh	v0,	r3,	r3	C
176	mulq	v0,	r1,	r28	C
177	ble	r18,	$n23		C
178	ldq	r0,	16(up)		C
179	ldq	r4,	0(rp)		C
180	umulh	v0,	r1,	r8	C
181	ldq	r1,	24(up)		C
182	lda	up,	16(up)		C L1 bookkeeping
183	mulq	v0,	r0,	r2	C
184	ldq	r5,	8(rp)		C
185	lda	rp,	0(rp)		C L1 bookkeeping
186	umulh	v0,	r0,	r6	C
187	ADDSUB	r4,	r25,	r25	C lo + acc
188	mulq	v0,	r1,	r7	C
189	br	r31,	$ent2		C
190
C  n mod 8 = 5.  Handle one limb, leave its outgoing high limb in r8, step
C  up and rp past it, reload r3 and fall into the 4 mod 8 feed-in.
191$5mod8:	ldq	r5,	0(rp)		C
192	lda	up,	8(up)		C
193	mulq	v0,	r3,	r7	C
194	umulh	v0,	r3,	r24	C
195	ADDSUB	r5,	r7,	r23	C
196	CMPCY(	r5,	r23),	r20	C
197	addq	r24,	r20,	r8	C
198	stq	r23,	0(rp)		C
199	lda	rp,	8(rp)		C
200	ldq	r3,	0(up)		C
C  n mod 8 = 4.  Feed-in for the top of $Loop, with the same register
C  assignment as the 0 mod 8 feed-in; up is biased by +32 and rp by +16.
C  The CMPCY/ADDSUB pair before the branch duplicates the pair at the bottom
C  of the loop body, so the state matches a loop back-edge.  If r18 <= 0
C  (n is 4 or 5) the loop is skipped and $Lend finishes the four limbs.
201$4mod8:	ldq	r1,	8(up)		C
202	mulq	v0,	r3,	r2	C
203	umulh	v0,	r3,	r6	C
204	mulq	v0,	r1,	r7	C
205	ldq	r0,	16(up)		C
206	ldq	r4,	0(rp)		C
207	umulh	v0,	r1,	r24	C
208	ldq	r1,	24(up)		C
209	lda	up,	32(up)		C L1 bookkeeping
210	mulq	v0,	r0,	r25	C
211	ldq	r5,	8(rp)		C
212	lda	rp,	16(rp)		C L1 bookkeeping
213	umulh	v0,	r0,	r3	C
214	ADDSUB	r4,	r2,	r2	C lo + acc
215	mulq	v0,	r1,	r28	C
216	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
217	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
218	ble	r18,	$Lend		C
219	ALIGN(16)
C  Main loop, eight limbs per pass.  Each of the four sections stores two
C  result limbs (r22 and r23) and meanwhile loads further source and
C  destination limbs and issues their multiplies.  The sections alternate
C  between two register sets: r2/r6, r7/r24 at $Loop and $ent0, and
C  r25/r3, r28/r8 at $ent2 and $ent6.
C
C  Section 1: stores to -16(rp) and -8(rp).
220$Loop:
221	bis	r31,	r31,	r31	C U1 mt
222	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
223	addq	r6,	r20,	r6	C U0 hi mul + carry
224	ldq	r0,	0(up)		C
225
226	bis	r31,	r31,	r31	C U1 mt
227	ADDSUB	r5,	r7,	r7	C L0 lo + acc
228	addq	r6,	r21,	r6	C U0 hi mul + carry
229	ldq	r4,	0(rp)		C L1
230
231	umulh	v0,	r1,	r8	C U1
232	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
233	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
234	ldq	r1,	8(up)		C L1
235
236	mulq	v0,	r0,	r2	C U1
237	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
238	addq	r24,	r20,	r24	C U0 hi mul + carry
239	ldq	r5,	8(rp)		C L1
240
241	umulh	v0,	r0,	r6	C U1
242	ADDSUB	r4,	r25,	r25	C U0 lo + acc
243	stq	r22,	-16(rp)		C L0
244	stq	r23,	-8(rp)		C L1
245
246	bis	r31,	r31,	r31	C L0 st slosh
247	mulq	v0,	r1,	r7	C U1
248	bis	r31,	r31,	r31	C L1 st slosh
249	addq	r24,	r21,	r24	C U0 hi mul + carry
C  Section 2: entry for n mod 8 = 2 or 3.  Stores to 0(rp) and 8(rp).  The
C  loop counter r18 is reduced by 8 here, so every entry point at or after
C  $Loop passes through the decrement before the bgt at the bottom, while
C  entries at $ent0 and $ent6 skip it on their first partial pass.
250$ent2:
251	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
252	bis	r31,	r31,	r31	C U1 mt
253	lda	r18,	-8(r18)		C L1 bookkeeping
254	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
255
256	bis	r31,	r31,	r31	C U1 mt
257	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
258	addq	r3,	r20,	r3	C U0 hi mul + carry
259	ldq	r0,	16(up)		C L1
260
261	bis	r31,	r31,	r31	C U1 mt
262	ADDSUB	r5,	r28,	r28	C L0 lo + acc
263	addq	r3,	r21,	r3	C U0 hi mul + carry
264	ldq	r4,	16(rp)		C L1
265
266	umulh	v0,	r1,	r24	C U1
267	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
268	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
269	ldq	r1,	24(up)		C L1
270
271	mulq	v0,	r0,	r25	C U1
272	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
273	addq	r8,	r20,	r8	C U0 hi mul + carry
274	ldq	r5,	24(rp)		C L1
275
276	umulh	v0,	r0,	r3	C U1
277	ADDSUB	r4,	r2,	r2	C U0 lo + acc
278	stq	r22,	0(rp)		C L0
279	stq	r23,	8(rp)		C L1
280
281	bis	r31,	r31,	r31	C L0 st slosh
282	mulq	v0,	r1,	r28	C U1
283	bis	r31,	r31,	r31	C L1 st slosh
284	addq	r8,	r21,	r8	C U0 hi mul + carry
C  Section 3: entry for n mod 8 = 0 or 1.  up is advanced by 64 first, so the
C  source loads that follow use negative displacements.  Stores to 16(rp)
C  and 24(rp).
285$ent0:
286	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
287	bis	r31,	r31,	r31	C U1 mt
288	lda	up,	64(up)		C L1 bookkeeping
289	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
290
291	bis	r31,	r31,	r31	C U1 mt
292	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
293	addq	r6,	r20,	r6	C U0 hi mul + carry
294	ldq	r0,	-32(up)		C L1
295
296	bis	r31,	r31,	r31	C U1 mt
297	ADDSUB	r5,	r7,	r7	C L0 lo + acc
298	addq	r6,	r21,	r6	C U0 hi mul + carry
299	ldq	r4,	32(rp)		C L1
300
301	umulh	v0,	r1,	r8	C U1
302	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
303	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
304	ldq	r1,	-24(up)		C L1
305
306	mulq	v0,	r0,	r2	C U1
307	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
308	addq	r24,	r20,	r24	C U0 hi mul + carry
309	ldq	r5,	40(rp)		C L1
310
311	umulh	v0,	r0,	r6	C U1
312	ADDSUB	r4,	r25,	r25	C U0 lo + acc
313	stq	r22,	16(rp)		C L0
314	stq	r23,	24(rp)		C L1
315
316	bis	r31,	r31,	r31	C L0 st slosh
317	mulq	v0,	r1,	r7	C U1
318	bis	r31,	r31,	r31	C L1 st slosh
319	addq	r24,	r21,	r24	C U0 hi mul + carry
C  Section 4: entry for n mod 8 = 6 or 7.  rp is advanced by 64 first, so the
C  destination loads and stores that follow use negative displacements.
C  Stores to -32(rp) and -24(rp).
320$ent6:
321	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
322	bis	r31,	r31,	r31	C U1 mt
323	lda	rp,	64(rp)		C L1 bookkeeping
324	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
325
326	bis	r31,	r31,	r31	C U1 mt
327	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
328	addq	r3,	r20,	r3	C U0 hi mul + carry
329	ldq	r0,	-16(up)		C L1
330
331	bis	r31,	r31,	r31	C U1 mt
332	ADDSUB	r5,	r28,	r28	C L0 lo + acc
333	addq	r3,	r21,	r3	C U0 hi mul + carry
334	ldq	r4,	-16(rp)		C L1
335
336	umulh	v0,	r1,	r24	C U1
337	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
338	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
339	ldq	r1,	-8(up)		C L1
340
341	mulq	v0,	r0,	r25	C U1
342	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
343	addq	r8,	r20,	r8	C U0 hi mul + carry
344	ldq	r5,	-8(rp)		C L1
345
346	umulh	v0,	r0,	r3	C U1
347	ADDSUB	r4,	r2,	r2	C U0 lo + acc
348	stq	r22,	-32(rp)		C L0
349	stq	r23,	-24(rp)		C L1
350
351	bis	r31,	r31,	r31	C L0 st slosh
352	mulq	v0,	r1,	r28	C U1
353	bis	r31,	r31,	r31	C L1 st slosh
354	addq	r8,	r21,	r8	C U0 hi mul + carry
355
C  Bottom of the loop.  The first two instructions are the same pair that
C  ends the 4 mod 8 feed-in.  The ldl targets r31, so its loaded value is
C  discarded.  Another pass is made while r18 > 0.
356	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
357	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
358	ldl	r31,	256(up)		C prefetch up[]
359	bgt	r18,	$Loop		C U1 bookkeeping
360
C  Wind-down with four limbs left.  This repeats section 1 without the source
C  loads and without the multiplies for further limbs: it stores the two
C  limbs in flight to -16(rp) and -8(rp), loads the last two destination
C  limbs, computes the last high product into r8, and joins L(x).
361$Lend:	CMPCY(	r2,	r22),	r21	C
362	addq	r6,	r20,	r6	C
363	ADDSUB	r5,	r7,	r7	C
364	addq	r6,	r21,	r6	C
365	ldq	r4,	0(rp)		C
366	umulh	v0,	r1,	r8	C
367	CMPCY(	r5,	r7),	r20	C
368	ADDSUB	r7,	r6,	r23	C
369	CMPCY(r7,	r23),	r21	C
370	addq	r24,	r20,	r24	C
371	ldq	r5,	8(rp)		C
372	ADDSUB	r4,	r25,	r25	C
373	stq	r22,	-16(rp)		C
374	stq	r23,	-8(rp)		C
375	addq	r24,	r21,	r24	C
376	br	L(x)
377
378	ALIGN(16)
C  Short path from $2mod8 when only two limbs remain (n is 2 or 3).  Loads
C  both destination limbs, computes the second high product into r8, forms
C  the first low sum in r25 and falls into L(x).  r24 holds the incoming
C  high limb: zero from $L1, or the value left by $3mod8.
379$n23:	ldq	r4,	0(rp)		C
380	ldq	r5,	8(rp)		C
381	umulh	v0,	r1,	r8	C
382	ADDSUB	r4,	r25,	r25	C
C  Final two limbs, shared by $Lend and $n23.  Stores to 0(rp) and 8(rp) and
C  leaves the last high product plus its two carries in r0 before returning.
383L(x):	CMPCY(	r4,	r25),	r20	C
384	ADDSUB	r25,	r24,	r22	C
385	CMPCY(	r25,	r22),	r21	C
386	addq	r3,	r20,	r3	C
387	ADDSUB	r5,	r28,	r28	C
388	addq	r3,	r21,	r3	C
389	CMPCY(	r5,	r28),	r20	C
390	ADDSUB	r28,	r3,	r23	C
391	CMPCY(	r28,	r23),	r21	C
392	addq	r8,	r20,	r8	C
393	stq	r22,	0(rp)		C
394	stq	r23,	8(rp)		C
395	addq	r8,	r21,	r0	C
396	ret	r31,	(r26),	1	C
397EPILOGUE()
398ASM_END()
399