aors_n.asm revision 1.1.1.2
1dnl  IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2003, 2004, 2005, 2010, 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C           cycles/limb
25C Itanium:      2.67
26C Itanium 2:    1.25
27
28C TODO
29C  * Consider using special code for small n, using something like
30C    "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
31C  * The non-nc code was trimmed cycle for cycle to its current state.  It is
32C    probably hard to save more than an odd cycle there.  The nc code is much
33C    rawer (since tune/speed doesn't have any applicable direct measurements).
34C  * Without the nc entry points, this becomes around 1800 bytes of object
35C    code; the nc code adds over 1000 bytes.  We should perhaps sacrifice a
36C    few cycles for the non-nc code and let it fall into the nc code.
37
C Argument registers as named throughout this file.  cy is read only by the
C func_nc entry code; the func entry code never references it.
38C INPUT PARAMETERS
39define(`rp', `r32')
40define(`up', `r33')
41define(`vp', `r34')
42define(`n',  `r35')
43define(`cy', `r36')
44
C Operation-dependent parameters, selected by OPERATION_add_n or
C OPERATION_sub_n:
C   ADDSUB   the per-limb instruction, always used as  result = u-limb, v-limb
C   CND      unsigned compare of a result against its u-limb; true means a
C            carry out (add) or a borrow out (sub) of that limb
C   INCR     adjustment applied to a result when a carry or borrow comes in
C   LIM      result value, tested before INCR is applied, for which applying
C            INCR wraps around and therefore produces a carry or borrow out
C   LIM2     the same wrap test for a result that already had INCR applied
C   func, func_nc   symbol names of the two entry points
45ifdef(`OPERATION_add_n',`
46  define(ADDSUB,	add)
47  define(CND,		ltu)
48  define(INCR,		1)
49  define(LIM,		-1)
50  define(LIM2,		0)
51  define(func,    mpn_add_n)
52  define(func_nc, mpn_add_nc)
53')
54ifdef(`OPERATION_sub_n',`
55  define(ADDSUB,	sub)
56  define(CND,		gtu)
57  define(INCR,		-1)
58  define(LIM,		0)
59  define(LIM2,		-1)
60  define(func,    mpn_sub_n)
61  define(func_nc, mpn_sub_nc)
62')
63
C cmpeqor abbreviates the or-form equality compare used to merge a wrap
C condition into an existing carry predicate.  PFDIST is the offset added to
C up and vp to form the lfetch addresses upadv and vpadv.
64define(cmpeqor, `cmp.eq.or')
65define(PFDIST, 500)
66
C u0-u3 and v0-v3 receive source limbs, w0-w3 receive results, rpx is a second
C store pointer used next to rp, upadv and vpadv are the lfetch addresses.
C Note that u0 is r14, which the entry code also uses to hold n mod 8 for the
C dispatch before any load into u0 takes place.
67C Some useful aliases for registers we use
68define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
69define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
70define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
71define(`rpx',`r3')
72define(`upadv',`r20') define(`vpadv',`r21')
73
74MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
75
C func_nc: entry point taking a carry or borrow in.  Arguments are rp, up, vp,
C n and cy.  With HAVE_ABI_32 the three pointers are first converted with
C addp4 and n is zero-extended with zxt4.
C
C The entry code
C   - loads the first limb pair into r10 (from up) and r11 (from vp),
C   - saves ar.lc in r2, as announced by the .save directive,
C   - puts n mod 8 in r14 and sets p15 when n > 8, p14 otherwise,
C   - subtracts 6 from n; the feed-in code later shifts this right by 3 to
C     obtain the loop count for ar.lc,
C   - sets up the lfetch addresses and copies cy to r23,
C   - branches on n mod 8 to .Lc001 ... .Lc111; residue 0 falls through into
C     .Lc000.
C The second limb pair (r18, r19) and p13 = (cy != 0) are produced only in the
C last dispatch group, so only .Lc000 and .Lc111 can rely on them; the other
C feed-in sequences test r23 themselves.
76ASM_START()
77PROLOGUE(func_nc)
78	.prologue
79	.save	ar.lc, r2
80	.body
81ifdef(`HAVE_ABI_32',`
82	addp4	rp = 0, rp		C			M I
83	addp4	up = 0, up		C			M I
84	addp4	vp = 0, vp		C			M I
85	zxt4	n = n			C			I
86	;;
87')
88
89 {.mmi;	ld8	r11 = [vp], 8		C			M01
90	ld8	r10 = [up], 8		C			M01
91	mov	r2 = ar.lc		C			I0
92}{.mmi;	and	r14 = 7, n		C			M I
93	cmp.lt	p15, p14 = 8, n		C			M I
94	add	n = -6, n		C			M I
95	;;
96}
97.mmi;	add	upadv = PFDIST, up	C Merging these lines into the feed-in
98	add	vpadv = PFDIST, vp	C code could save a cycle per call at
99	mov	r23 = cy		C the expense of code size.
100	;;
C Dispatch on residues 1, 2 and 3.
101{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
102	cmp.eq	p7, p0 = 2, r14		C			M I
103	cmp.eq	p8, p0 = 3, r14		C			M I
104}{.bbb
105   (p6)	br.dptk	.Lc001			C			B
106   (p7)	br.dptk	.Lc010			C			B
107   (p8)	br.dptk	.Lc011			C			B
108	;;
109}
C Dispatch on residues 4, 5 and 6, then load the second limb pair and test
C for residue 7.
110{.mmi;	cmp.eq	p9, p0 = 4, r14		C			M I
111	cmp.eq	p10, p0 = 5, r14	C			M I
112	cmp.eq	p11, p0 = 6, r14	C			M I
113}{.bbb
114   (p9)	br.dptk	.Lc100			C			B
115  (p10)	br.dptk	.Lc101			C			B
116  (p11)	br.dptk	.Lc110			C			B
117	;;
118}{.mmi;	ld8	r19 = [vp], 8		C			M01
119	ld8	r18 = [up], 8		C			M01
120	cmp.ne	p13, p0 = 0, cy		C copy cy to p13	M I
121}{.mmb;	cmp.eq	p12, p0 = 7, r14	C			M I
122	nop	0
123  (p12)	br.dptk	.Lc111			C			B
124	;;
125}
126
C Feed-in for n mod 8 = 0, carry-in variant.  Reached by falling through the
C dispatch, so r10/r11 and r18/r19 hold the first two limb pairs and p13 holds
C the carry-in.
C   w1 = first limb result, carry-in applied under p13, carry out in p7
C   w2 = second limb result, carry from p7 applied, carry out in p8
C   w3 = third limb result, carry out so far in p9
C ar.lc is set to n >> 3 (n was reduced by 6 at entry), rpx = rp + 8, and the
C lfetch addresses are recomputed from the advanced pointers.  Joins the loop
C at L(m0), which performs the stores of w1 and w2.
127.Lc000:
128.mmi;	ld8	v3 = [vp], 8		C			M01
129	ld8	u3 = [up], 8		C			M01
130	shr.u	n = n, 3		C			I0
131	;;
132.mmi;	add	vpadv = PFDIST, vp	C			M I
133	ld8	v0 = [vp], 8		C			M01
134	mov	ar.lc = n		C			I0
135.mmi;	ld8	u0 = [up], 8		C			M01
136	ADDSUB	w1 = r10, r11		C			M I
137	nop	0
138	;;
139.mmi;	add	upadv = PFDIST, up	C			M I
140	ld8	v1 = [vp], 8		C			M01
141	cmp.CND	p7, p0 = w1, r10	C			M I
142.mmi;	ld8	u1 = [up], 8		C			M01
143	ADDSUB	w2 = r18, r19		C			M I
144	add	rpx = 8, rp		C			M I
145	;;
146.mmi;	ld8	v2 = [vp], 8		C			M01
147	cmp.CND	p8, p0 = w2, r18	C			M I
148  (p13)	cmpeqor	p7, p0 = LIM, w1	C			M I
149.mmi;	ld8	u2 = [up], 8		C			M01
150  (p13)	add	w1 = INCR, w1		C			M I
151	ADDSUB	w3 = u3, v3		C			M I
152	;;
153.mmi;	ld8	v3 = [vp], 8		C			M01
154	cmp.CND	p9, p0 = w3, u3		C			M I
155   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
156.mmb;	ld8	u3 = [up], 8		C			M01
157   (p7)	add	w2 = INCR, w2		C			M I
158	br	L(m0)
159
160
C Feed-in for n mod 8 = 1, carry-in variant.
C w0 is the result of the first limb pair (r10, r11).  The loads of the next
C pair are predicated on p15 (n > 8), so nothing past the operands is read
C when n = 1.
C Short path (p15 clear): p9 = (r23 != 0) is the carry-in, p6 the carry out of
C w0 with the carry-in folded in, r8 is cleared, and control goes to L(cj1)
C which stores w0 and returns.
C Long path (local label 1): preloads further limbs, sets ar.lc = n >> 3,
C rpx = rp + 16, leaves the carry-in pending in p9 and the raw carry of w0 in
C p6, and joins the loop at L(c1) where the pending carry is applied.
161.Lc001:
162.mmi;
163  (p15)	ld8	v1 = [vp], 8		C			M01
164  (p15)	ld8	u1 = [up], 8		C			M01
165	ADDSUB	w0 = r10, r11		C			M I
166.mmb;	nop	0
167	nop	0
168  (p15)	br	1f
169	;;
170.mmi;	cmp.ne	p9, p0 = 0, r23		C			M I
171	mov	r8 = 0
172	cmp.CND	p6, p0 = w0, r10	C			M I
173	;;
174.mmb;
175   (p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
176   (p9)	add	w0 = INCR, w0		C			M I
177	br	L(cj1)			C			B
1781:
179.mmi;	ld8	v2 = [vp], 8		C			M01
180	ld8	u2 = [up], 8		C			M01
181	shr.u	n = n, 3		C			I0
182	;;
183.mmi;	ld8	v3 = [vp], 8		C			M01
184	ld8	u3 = [up], 8		C			M01
185	mov	ar.lc = n		C			I0
186.mmi;	nop	0
187	cmp.ne	p9, p0 = 0, r23		C			M I
188	nop	0
189	;;
190.mmi;	ld8	v0 = [vp], 8		C			M01
191	cmp.CND	p6, p0 = w0, r10	C			M I
192	add	rpx = 16, rp		C			M I
193.mmb;	ld8	u0 = [up], 8		C			M01
194	ADDSUB	w1 = u1, v1		C			M I
195	br	L(c1)			C			B
196
197
C Feed-in for n mod 8 = 2, carry-in variant.
C w3 is the result of the first limb pair (r10, r11), w0 that of the second
C pair (u0, v0).  p8 = (r23 != 0) is the carry-in.  r8 is cleared here because
C the short path enters the wind-down at L(cj2), after the point where the
C wind-down itself clears r8.
C Short path (p15 clear): INCR is applied to w3 in the same group as the raw
C carry compare, so the wrap test that follows sees the adjusted value and
C therefore compares against LIM2 rather than LIM.
C Long path (local label 1): the wrap test is issued together with the INCR
C adjustment and so uses LIM.  Sets ar.lc = n >> 3 and rpx = rp + 24, then
C joins the loop at L(m23).
198.Lc010:
199.mmi;	ld8	v0 = [vp], 8		C			M01
200	ld8	u0 = [up], 8		C			M01
201	mov	r8 = 0			C			M I
202.mmb;	ADDSUB	w3 = r10, r11		C			M I
203	cmp.ne	p8, p0 = 0, r23		C			M I
204  (p15)	br	1f			C			B
205	;;
206.mmi;	cmp.CND	p9, p0 = w3, r10	C			M I
207	ADDSUB	w0 = u0, v0		C			M I
208   (p8)	add	w3 = INCR, w3		C			M I
209	;;
210.mmb;	cmp.CND	p6, p0 = w0, u0		C			M I
211   (p8)	cmpeqor	p9, p0 = LIM2, w3	C			M I
212	br	L(cj2)			C			B
2131:
214.mmi;	ld8	v1 = [vp], 8		C			M01
215	ld8	u1 = [up], 8		C			M01
216	shr.u	n = n, 3		C			I0
217	;;
218.mmi;	ld8	v2 = [vp], 8		C			M01
219	ld8	u2 = [up], 8		C			M01
220	mov	ar.lc = n		C			I0
221	;;
222.mmi;	ld8	v3 = [vp], 8		C			M01
223	ld8	u3 = [up], 8		C			M01
224	cmp.CND	p9, p0 = w3, r10	C			M I
225	;;
226.mmi;
227   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
228   (p8)	add	w3 = INCR, w3		C			M I
229	ADDSUB	w0 = u0, v0		C			M I
230.mmb;	add	rpx = 24, rp		C			M I
231	nop	0
232	br	L(m23)			C			B
233
234
C Feed-in for n mod 8 = 3, carry-in variant.
C w2 is the result of the first limb pair (r10, r11), w3 that of the second
C pair (u3, v3).  p7 = (r23 != 0) is the carry-in and is folded into w2 and
C its carry predicate p8.
C Short path (p15 clear): goes to L(cj3) without touching ar.lc.
C Long path (local label 1): sets ar.lc from n >> 3, stores w2 itself through
C rp, sets rpx = rp + 32 (computed from rp before the store advances it),
C propagates p8 into w3 and joins the loop at L(m23).
235.Lc011:
236.mmi;	ld8	v3 = [vp], 8		C			M01
237	ld8	u3 = [up], 8		C			M01
238	shr.u	n = n, 3		C			I0
239.mmi;	ADDSUB	w2 = r10, r11		C			M I
240	cmp.ne	p7, p0 = 0, r23		C			M I
241	nop	0
242	;;
243.mmb;	ld8	v0 = [vp], 8		C			M01
244	ld8	u0 = [up], 8		C			M01
245  (p15)	br	1f			C			B
246.mmi;	cmp.CND	p8, p0 = w2, r10	C			M I
247	ADDSUB	w3 = u3, v3		C			M I
248	nop	0
249	;;
250.mmb;
251   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
252   (p7)	add	w2 = INCR, w2		C			M I
253	br	L(cj3)			C			B
2541:
255.mmi;	ld8	v1 = [vp], 8		C			M01
256	ld8	u1 = [up], 8		C			M01
257	ADDSUB	w3 = u3, v3		C			M I
258	;;
259.mmi;	ld8	v2 = [vp], 8		C			M01
260	ld8	u2 = [up], 8		C			M01
261	cmp.CND	p8, p0 = w2, r10	C			M I
262	;;
263.mmi;	ld8	v3 = [vp], 8		C			M01
264	cmp.CND	p9, p0 = w3, u3		C			M I
265	mov	ar.lc = n		C			I0
266.mmi;	ld8	u3 = [up], 8		C			M01
267   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
268   (p7)	add	w2 = INCR, w2		C			M I
269	;;
270.mmi;	add	rpx = 32, rp		C			M I
271	st8	[rp] = w2, 8		C			M23
272   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
273.mmb;
274   (p8)	add	w3 = INCR, w3		C			M I
275	ADDSUB	w0 = u0, v0		C			M I
276	br	L(m23)
277
278
C Feed-in for n mod 8 = 4, carry-in variant.
C w1 is the result of the first limb pair (r10, r11), w2 that of the second
C pair (u2, v2).  p6 = (r23 != 0) is the carry-in and is folded into w1 and
C its carry predicate p7.  rpx = rp + 8.
C When p14 is set (n <= 8) control goes to L(cj4) before ar.lc is written.
C Otherwise ar.lc = n >> 3 is set, w3 is computed, the carry p7 is propagated
C into w2, and the loop is joined at L(m4), which stores w1 and w2.
279.Lc100:
280.mmi;	ld8	v2 = [vp], 8		C			M01
281	ld8	u2 = [up], 8		C			M01
282	shr.u	n = n, 3		C			I0
283.mmi;	ADDSUB	w1 = r10, r11		C			M I
284	nop	0
285	nop	0
286	;;
287.mmi;	ld8	v3 = [vp], 8		C			M01
288	ld8	u3 = [up], 8		C			M01
289	add	rpx = 8, rp		C			M I
290.mmi;	cmp.ne	p6, p0 = 0, r23		C			M I
291	cmp.CND	p7, p0 = w1, r10	C			M I
292	nop	0
293	;;
294.mmi;	ld8	v0 = [vp], 8		C			M01
295	ld8	u0 = [up], 8		C			M01
296	ADDSUB	w2 = u2, v2		C			M I
297.mmb;
298   (p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
299   (p6)	add	w1 = INCR, w1		C			M I
300  (p14)	br	L(cj4)
301	;;
302.mmi;	ld8	v1 = [vp], 8		C			M01
303	ld8	u1 = [up], 8		C			M01
304	mov	ar.lc = n		C			I0
305	;;
306.mmi;	ld8	v2 = [vp], 8		C			M01
307	cmp.CND	p8, p0 = w2, u2		C			M I
308	nop	0
309.mmi;	ld8	u2 = [up], 8		C			M01
310	nop	0
311	ADDSUB	w3 = u3, v3		C			M I
312	;;
313.mmi;	ld8	v3 = [vp], 8		C			M01
314	cmp.CND	p9, p0 = w3, u3		C			M I
315   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
316.mmb;	ld8	u3 = [up], 8		C			M01
317   (p7)	add	w2 = INCR, w2		C			M I
318	br	L(m4)
319
320
C Feed-in for n mod 8 = 5, carry-in variant.
C w0 is the result of the first limb pair (r10, r11), w1 that of the second
C pair (u1, v1).  The carry-in is left pending in p9 = (r23 != 0) and the raw
C carry of w0 in p6; both L(c5) and L(end) begin by applying p9 to w0.
C rpx = rp + 16.
C ar.lc is written unconditionally here.  With p15 set (n > 8) the loop is
C joined at L(c5); otherwise control goes to L(end), which restores ar.lc
C from r2 before winding down.
321.Lc101:
322.mmi;	ld8	v1 = [vp], 8		C			M01
323	ld8	u1 = [up], 8		C			M01
324	shr.u	n = n, 3		C			I0
325	;;
326.mmi;	ld8	v2 = [vp], 8		C			M01
327	ld8	u2 = [up], 8		C			M01
328	mov	ar.lc = n		C			I0
329	;;
330.mmi;	ld8	v3 = [vp], 8		C			M01
331	ld8	u3 = [up], 8		C			M01
332	ADDSUB	w0 = r10, r11		C			M I
333.mmi;	cmp.ne	p9, p0 = 0, r23		C			M I
334	add	rpx = 16, rp		C			M I
335	nop	0
336	;;
337.mmi;	ld8	v0 = [vp], 8		C			M01
338	cmp.CND	p6, p0 = w0, r10	C			M I
339	ld8	u0 = [up], 8		C			M01
340.mbb;	ADDSUB	w1 = u1, v1		C			M I
341  (p15)	br	L(c5)			C			B
342	br	L(end)			C			B
343
344
C Feed-in for n mod 8 = 6, carry-in variant.
C w3 is the result of the first limb pair (r10, r11), w0 that of the second
C pair (u0, v0).  p8 = (r23 != 0) is the carry-in and is folded into w3 and
C its carry predicate p9.  The lfetch addresses are recomputed from the
C advanced pointers, ar.lc = n >> 3, rpx = rp + 24.
C Joins the loop at L(m67), the last group before the loop branch, so for
C n = 6 (loop count zero) the loop branch falls through to L(end).
345.Lc110:
346.mmi;	ld8	v0 = [vp], 8		C			M01
347	ld8	u0 = [up], 8		C			M01
348	shr.u	n = n, 3		C			I0
349	;;
350.mmi;	add	upadv = PFDIST, up	C			M I
351	add	vpadv = PFDIST, vp	C			M I
352	mov	ar.lc = n		C			I0
353.mmi;	ld8	v1 = [vp], 8		C			M01
354	ld8	u1 = [up], 8		C			M01
355	ADDSUB	w3 = r10, r11		C			M I
356	;;
357.mmi;	ld8	v2 = [vp], 8		C			M01
358	ld8	u2 = [up], 8		C			M01
359	ADDSUB	w0 = u0, v0		C			M I
360.mmi;	cmp.CND	p9, p0 = w3, r10	C			M I
361	cmp.ne	p8, p0 = 0, r23		C			M I
362	add	rpx = 24, rp		C			M I
363	;;
364.mmi;	ld8	v3 = [vp], 8		C			M01
365	ld8	u3 = [up], 8		C			M01
366	nop	0
367.mmb;
368   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
369   (p8)	add	w3 = INCR, w3		C			M I
370	br	L(m67)			C			B
371
372
C Feed-in for n mod 8 = 7, carry-in variant.
C Reached from the last dispatch group, so r18/r19 hold the second limb pair
C and p13 the carry-in.
C   w2 = result of (r10, r11), carry-in applied under p13, carry out in p8
C   w3 = result of (r18, r19), carry from p8 applied, carry out in p9
C   w0 = result of (u0, v0), left for the loop to finish
C w2 is stored here through rp; rpx = rp + 32 is computed from rp before that
C store advances it.  ar.lc = n >> 3.  Joins the loop at L(m67).
C The EPILOGUE below closes func_nc; the L(...) branch targets used by all
C .Lc feed-in sequences are defined after PROLOGUE(func).
373.Lc111:
374.mmi;	ld8	v0 = [vp], 8		C			M01
375	ld8	u0 = [up], 8		C			M01
376	shr.u	n = n, 3		C			I0
377	;;
378.mmi;	add	upadv = PFDIST, up	C			M I
379	ld8	v1 = [vp], 8		C			M01
380	mov	ar.lc = n		C			I0
381.mmi;	ld8	u1 = [up], 8		C			M01
382	ADDSUB	w2 = r10, r11		C			M I
383	nop	0
384	;;
385.mmi;	add	vpadv = PFDIST, vp	C			M I
386	ld8	v2 = [vp], 8		C			M01
387	cmp.CND	p8, p0 = w2, r10	C			M I
388.mmi;	ld8	u2 = [up], 8		C			M01
389	ADDSUB	w3 = r18, r19		C			M I
390	nop	0
391	;;
392.mmi;	ld8	v3 = [vp], 8		C			M01
393	cmp.CND	p9, p0 = w3, r18	C			M I
394  (p13)	cmpeqor	p8, p0 = LIM, w2	C			M I
395.mmi;	ld8	u3 = [up], 8		C			M01
396  (p13)	add	w2 = INCR, w2		C			M I
397	nop	0
398	;;
399.mmi;	add	rpx = 32, rp		C			M I
400	st8	[rp] = w2, 8		C			M23
401   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
402.mmb;
403   (p8)	add	w3 = INCR, w3		C			M I
404	ADDSUB	w0 = u0, v0		C			M I
405	br	L(m67)
406
407EPILOGUE()
408
C func: entry point without a carry or borrow in.  Arguments are rp, up, vp
C and n.  With HAVE_ABI_32 the three pointers are first converted with addp4
C and n is zero-extended with zxt4.
C
C Same dispatch scheme as func_nc: the first limb pair goes to r10 (from up)
C and r11 (from vp), ar.lc is saved in r2, r14 = n mod 8, p15 is set when
C n > 8 and p14 otherwise, and n is reduced by 6 so that the feed-in code can
C derive the loop count as n >> 3.  Residues 1 to 7 branch to .Lb001 ...
C .Lb111; residue 0 falls through into .Lb000.
C The lfetch addresses are not set up here; each .Lb feed-in that enters the
C loop computes them.  The last dispatch group loads the second limb pair
C into r18/r19, clears p13 and zeroes r23.
409ASM_START()
410PROLOGUE(func)
411	.prologue
412	.save	ar.lc, r2
413	.body
414ifdef(`HAVE_ABI_32',`
415	addp4	rp = 0, rp		C			M I
416	addp4	up = 0, up		C			M I
417	addp4	vp = 0, vp		C			M I
418	zxt4	n = n			C			I
419	;;
420')
421
422 {.mmi;	ld8	r11 = [vp], 8		C			M01
423	ld8	r10 = [up], 8		C			M01
424	mov	r2 = ar.lc		C			I0
425}{.mmi;	and	r14 = 7, n		C			M I
426	cmp.lt	p15, p14 = 8, n		C			M I
427	add	n = -6, n		C			M I
428	;;
429}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
430	cmp.eq	p7, p0 = 2, r14		C			M I
431	cmp.eq	p8, p0 = 3, r14		C			M I
432}{.bbb
433   (p6)	br.dptk	.Lb001			C			B
434   (p7)	br.dptk	.Lb010			C			B
435   (p8)	br.dptk	.Lb011			C			B
436	;;
437}{.mmi;	cmp.eq	p9, p0 = 4, r14		C			M I
438	cmp.eq	p10, p0 = 5, r14	C			M I
439	cmp.eq	p11, p0 = 6, r14	C			M I
440}{.bbb
441   (p9)	br.dptk	.Lb100			C			B
442  (p10)	br.dptk	.Lb101			C			B
443  (p11)	br.dptk	.Lb110			C			B
444	;;
445}{.mmi;	ld8	r19 = [vp], 8		C			M01
446	ld8	r18 = [up], 8		C			M01
447	cmp.ne	p13, p0 = r0, r0	C clear "CF"		M I
448}{.mmb;	cmp.eq	p12, p0 = 7, r14	C			M I
449	mov	r23 = 0			C			M I
450  (p12)	br.dptk	.Lb111			C			B
451	;;
452}
453
C Feed-in for n mod 8 = 0, no carry-in.  Reached by falling through the
C dispatch, so r10/r11 and r18/r19 hold the first two limb pairs.
C   w1 = result of (r10, r11), carry out in p7
C   w2 = result of (r18, r19), carry from p7 applied, carry out in p8
C   w3 = result of (u3, v3), carry out so far in p9
C Sets ar.lc = n >> 3 (n was reduced by 6 at entry), rpx = rp + 8 and the
C lfetch addresses, then joins the loop at L(m0), which stores w1 and w2.
454.Lb000:
455.mmi;	ld8	v3 = [vp], 8		C			M01
456	ld8	u3 = [up], 8		C			M01
457	shr.u	n = n, 3		C			I0
458	;;
459.mmi;	ld8	v0 = [vp], 8		C			M01
460	ld8	u0 = [up], 8		C			M01
461	ADDSUB	w1 = r10, r11		C			M I
462	;;
463.mmi;	ld8	v1 = [vp], 8		C			M01
464	cmp.CND	p7, p0 = w1, r10	C			M I
465	mov	ar.lc = n		C			I0
466.mmi;	ld8	u1 = [up], 8		C			M01
467	ADDSUB	w2 = r18, r19		C			M I
468	add	rpx = 8, rp		C			M I
469	;;
470.mmi;	add	upadv = PFDIST, up
471	add	vpadv = PFDIST, vp
472	cmp.CND	p8, p0 = w2, r18	C			M I
473.mmi;	ld8	v2 = [vp], 8		C			M01
474	ld8	u2 = [up], 8		C			M01
475	ADDSUB	w3 = u3, v3		C			M I
476	;;
477.mmi;	ld8	v3 = [vp], 8		C			M01
478	cmp.CND	p9, p0 = w3, u3		C			M I
479   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
480.mmb;	ld8	u3 = [up], 8		C			M01
481   (p7)	add	w2 = INCR, w2		C			M I
482	br	L(m0)			C			B
483
484
C Feed-in for n mod 8 = 1, no carry-in.
C w0 is the result of the first limb pair (r10, r11) and p6 its carry out.
C The loads of the next pair are predicated on p15 (n > 8).  r8 is cleared
C here because the short path enters the wind-down at L(cj1), which only
C sets r8 to 1 on a final carry.
C Short path (p14 set): goes straight to L(cj1) to store w0 and return.
C Long path: sets the lfetch addresses, ar.lc = n >> 3 and rpx = rp + 16,
C computes w1 and w2 and joins the loop at L(m1).  The second compare into p6
C repeats the first one with the same operands.
485	ALIGN(32)
486.Lb001:
487.mmi;	ADDSUB	w0 = r10, r11		C			M I
488  (p15)	ld8	v1 = [vp], 8		C			M01
489	mov	r8 = 0			C			M I
490	;;
491.mmb;	cmp.CND	p6, p0 = w0, r10	C			M I
492  (p15)	ld8	u1 = [up], 8		C			M01
493  (p14)	br	L(cj1)			C			B
494	;;
495.mmi;	add	upadv = PFDIST, up
496	add	vpadv = PFDIST, vp
497	shr.u	n = n, 3		C			I0
498.mmi;	ld8	v2 = [vp], 8		C			M01
499	ld8	u2 = [up], 8		C			M01
500	cmp.CND	p6, p0 = w0, r10	C			M I
501	;;
502.mmi;	ld8	v3 = [vp], 8		C			M01
503	ld8	u3 = [up], 8		C			M01
504	mov	ar.lc = n		C			I0
505	;;
506.mmi;	ld8	v0 = [vp], 8		C			M01
507	ld8	u0 = [up], 8		C			M01
508	ADDSUB	w1 = u1, v1		C			M I
509	;;
510.mmi;	ld8	v1 = [vp], 8		C			M01
511	cmp.CND	p7, p0 = w1, u1		C			M I
512	ADDSUB	w2 = u2, v2		C			M I
513.mmb;	ld8	u1 = [up], 8		C			M01
514	add	rpx = 16, rp		C			M I
515	br	L(m1)			C			B
516
517
C Feed-in for n mod 8 = 2, no carry-in.
C w3 is the result of the first limb pair (r10, r11) with carry out in p9,
C w0 that of the second pair (u0, v0).
C Short path (p15 clear): computes the raw carry p6 of w0, clears r8 and
C enters the wind-down at L(cj2), which stores w3, applies p9 to w0 and
C stores w0.
C Long path, L(gt2): sets the lfetch addresses, ar.lc = n >> 3 and
C rpx = rp + 24, then joins the loop at L(m23).
518	ALIGN(32)
519.Lb010:
520.mmi;	ld8	v0 = [vp], 8		C			M01
521	ld8	u0 = [up], 8		C			M01
522	shr.u	n = n, 3		C			I0
523.mmb;	ADDSUB	w3 = r10, r11		C			M I
524	nop	0
525  (p15)	br	L(gt2)			C			B
526	;;
527.mmi;	cmp.CND	p9, p0 = w3, r10	C			M I
528	ADDSUB	w0 = u0, v0		C			M I
529	mov	r8 = 0			C			M I
530	;;
531.mmb;	nop	0
532	cmp.CND	p6, p0 = w0, u0		C			M I
533	br	L(cj2)			C			B
534L(gt2):
535.mmi;	ld8	v1 = [vp], 8		C			M01
536	ld8	u1 = [up], 8		C			M01
537	nop	0
538	;;
539.mmi;	add	upadv = PFDIST, up
540	add	vpadv = PFDIST, vp
541	mov	ar.lc = n		C			I0
542.mmi;	ld8	v2 = [vp], 8		C			M01
543	ld8	u2 = [up], 8		C			M01
544	nop	0
545	;;
546.mmi;	ld8	v3 = [vp], 8		C			M01
547	cmp.CND	p9, p0 = w3, r10	C			M I
548	ADDSUB	w0 = u0, v0		C			M I
549.mmb;	ld8	u3 = [up], 8		C			M01
550	add	rpx = 24, rp		C			M I
551	br	L(m23)			C			B
552
553
C Feed-in for n mod 8 = 3, no carry-in.
C w2 is the result of the first limb pair (r10, r11) with carry out in p8,
C w3 that of the second pair (u3, v3).
C Short path (p15 clear): goes to L(cj3) without touching n or ar.lc.
C Long path (local label 1): sets the lfetch addresses and ar.lc = n >> 3,
C stores w2 itself through rp, sets rpx = rp + 32 (computed from rp before
C the store advances it), propagates p8 into w3 and joins the loop at L(m23).
554	ALIGN(32)
555.Lb011:
556.mmi;	ld8	v3 = [vp], 8		C			M01
557	ld8	u3 = [up], 8		C			M01
558	ADDSUB	w2 = r10, r11		C			M I
559	;;
560.mmb;	ld8	v0 = [vp], 8		C			M01
561	ld8	u0 = [up], 8		C			M01
562  (p15)	br	1f			C			B
563.mmb;	cmp.CND	p8, p0 = w2, r10	C			M I
564	ADDSUB	w3 = u3, v3		C			M I
565	br	L(cj3)			C			B
5661:
567.mmi;	ld8	v1 = [vp], 8		C			M01
568	ld8	u1 = [up], 8		C			M01
569	shr.u	n = n, 3		C			I0
570	;;
571.mmi;	add	upadv = PFDIST, up
572	add	vpadv = PFDIST, vp
573	ADDSUB	w3 = u3, v3		C			M I
574.mmi;	ld8	v2 = [vp], 8		C			M01
575	ld8	u2 = [up], 8		C			M01
576	cmp.CND	p8, p0 = w2, r10	C			M I
577	;;
578.mmi;	ld8	v3 = [vp], 8		C			M01
579	cmp.CND	p9, p0 = w3, u3		C			M I
580	mov	ar.lc = n		C			I0
581.mmi;	ld8	u3 = [up], 8		C			M01
582	nop	0
583	nop	0
584	;;
585.mmi;	add	rpx = 32, rp		C			M I
586	st8	[rp] = w2, 8		C			M23
587   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
588.mmb;
589   (p8)	add	w3 = INCR, w3		C			M I
590	ADDSUB	w0 = u0, v0		C			M I
591	br	L(m23)			C			B
592
593
C Feed-in for n mod 8 = 4, no carry-in.
C w1 is the result of the first limb pair (r10, r11) with carry out in p7,
C w2 that of the second pair (u2, v2).
C When p14 is set (n <= 8) control goes to L(cj4) before ar.lc is written.
C Otherwise, at L(gt4), the lfetch addresses, ar.lc = n >> 3 and
C rpx = rp + 8 are set, w3 is computed, p7 is propagated into w2, and the
C loop is joined at L(m4), which stores w1 and w2.
594	ALIGN(32)
595.Lb100:
596.mmi;	ld8	v2 = [vp], 8		C			M01
597	ld8	u2 = [up], 8		C			M01
598	shr.u	n = n, 3		C			I0
599	;;
600.mmi;	ld8	v3 = [vp], 8		C			M01
601	ld8	u3 = [up], 8		C			M01
602	ADDSUB	w1 = r10, r11		C			M I
603	;;
604.mmi;	ld8	v0 = [vp], 8		C			M01
605	ld8	u0 = [up], 8		C			M01
606	cmp.CND	p7, p0 = w1, r10	C			M I
607.mmb;	nop	0
608	ADDSUB	w2 = u2, v2		C			M I
609  (p14)	br	L(cj4)			C			B
610	;;
611L(gt4):
612.mmi;	add	upadv = PFDIST, up
613	add	vpadv = PFDIST, vp
614	mov	ar.lc = n		C			I0
615	ld8	v1 = [vp], 8		C			M01
616	ld8	u1 = [up], 8		C			M01
617	nop	0
618	;;
619.mmi;	ld8	v2 = [vp], 8		C			M01
620	cmp.CND	p8, p0 = w2, u2		C			M I
621	nop	0
622.mmi;	ld8	u2 = [up], 8		C			M01
623	ADDSUB	w3 = u3, v3		C			M I
624	add	rpx = 8, rp		C			M I
625	;;
626.mmi;	ld8	v3 = [vp], 8		C			M01
627	cmp.CND	p9, p0 = w3, u3		C			M I
628   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
629.mmb;	ld8	u3 = [up], 8		C			M01
630   (p7)	add	w2 = INCR, w2		C			M I
631	br	L(m4)			C			B
632
633
C Feed-in for n mod 8 = 5, no carry-in.
C w0 is the result of the first limb pair (r10, r11) with carry out in p6,
C w1 that of the second pair (u1, v1).  Sets the lfetch addresses and
C rpx = rp + 16.
C When p14 is set (n <= 8) control goes to L(cj5) before ar.lc is written.
C Otherwise, at L(gt5), ar.lc = n >> 3 is set, the raw carry p7 of w1 and the
C sum w2 are computed, and the loop is joined at L(m5).  L(m5) lies after the
C p9-predicated carry fix-up of w0 at the loop top, so p9 is not used on this
C path.
634	ALIGN(32)
635.Lb101:
636.mmi;	ld8	v1 = [vp], 8		C			M01
637	ld8	u1 = [up], 8		C			M01
638	shr.u	n = n, 3		C			I0
639	;;
640.mmi;	ld8	v2 = [vp], 8		C			M01
641	ld8	u2 = [up], 8		C			M01
642	ADDSUB	w0 = r10, r11		C			M I
643	;;
644.mmi;	add	upadv = PFDIST, up
645	add	vpadv = PFDIST, vp
646	add	rpx = 16, rp		C			M I
647	ld8	v3 = [vp], 8		C			M01
648	ld8	u3 = [up], 8		C			M01
649	nop	0
650	;;
651.mmi;	ld8	v0 = [vp], 8		C			M01
652	cmp.CND	p6, p0 = w0, r10	C			M I
653	nop	0
654.mmb;	ld8	u0 = [up], 8		C			M01
655	ADDSUB	w1 = u1, v1		C			M I
656  (p14)	br	L(cj5)			C			B
657	;;
658L(gt5):
659.mmi;	ld8	v1 = [vp], 8		C			M01
660	cmp.CND	p7, p0 = w1, u1		C			M I
661	mov	ar.lc = n		C			I0
662.mmb;	ld8	u1 = [up], 8		C			M01
663	ADDSUB	w2 = u2, v2		C			M I
664	br	L(m5)			C			B
665
666
C Feed-in for n mod 8 = 6, no carry-in.
C w3 is the result of the first limb pair (r10, r11) with carry out in p9,
C w0 that of the second pair (u0, v0).  Sets the lfetch addresses,
C ar.lc = n >> 3 and rpx = rp + 24.
C Joins the loop at L(m67), the last group before the loop branch, so for
C n = 6 (loop count zero) the loop branch falls through to L(end).
667	ALIGN(32)
668.Lb110:
669.mmi;	ld8	v0 = [vp], 8		C			M01
670	ld8	u0 = [up], 8		C			M01
671	shr.u	n = n, 3		C			I0
672	;;
673.mmi;	ld8	v1 = [vp], 8		C			M01
674	ld8	u1 = [up], 8		C			M01
675	ADDSUB	w3 = r10, r11		C			M I
676	;;
677.mmi;	add	upadv = PFDIST, up
678	add	vpadv = PFDIST, vp
679	mov	ar.lc = n		C			I0
680.mmi;	ld8	v2 = [vp], 8		C			M01
681	ld8	u2 = [up], 8		C			M01
682	nop	0
683	;;
684.mmi;	ld8	v3 = [vp], 8		C			M01
685	cmp.CND	p9, p0 = w3, r10	C			M I
686	ADDSUB	w0 = u0, v0		C			M I
687.mmb;	ld8	u3 = [up], 8		C			M01
688	add	rpx = 24, rp		C			M I
689	br	L(m67)			C			B
690
691
C Feed-in for n mod 8 = 7, no carry-in.
C Reached from the last dispatch group, so r18/r19 hold the second limb pair.
C   w2 = result of (r10, r11), carry out in p8
C   w3 = result of (r18, r19), carry from p8 applied, carry out in p9
C   w0 = result of (u0, v0), left for the loop to finish
C w2 is stored here through rp; rpx = rp + 32 is computed from rp before that
C store advances it.  Sets the lfetch addresses and ar.lc = n >> 3, then
C joins the loop at L(m67).
692	ALIGN(32)
693.Lb111:
694.mmi;	ld8	v0 = [vp], 8		C			M01
695	ld8	u0 = [up], 8		C			M01
696	shr.u	n = n, 3		C			I0
697	;;
698.mmi;	ld8	v1 = [vp], 8		C			M01
699	ld8	u1 = [up], 8		C			M01
700	ADDSUB	w2 = r10, r11		C			M I
701	;;
702.mmi;	ld8	v2 = [vp], 8		C			M01
703	cmp.CND	p8, p0 = w2, r10	C			M I
704	mov	ar.lc = n		C			I0
705.mmi;	ld8	u2 = [up], 8		C			M01
706	ADDSUB	w3 = r18, r19		C			M I
707	nop	0
708	;;
709.mmi;	add	upadv = PFDIST, up
710	add	vpadv = PFDIST, vp
711	nop	0
712.mmi;	ld8	v3 = [vp], 8		C			M01
713	ld8	u3 = [up], 8		C			M01
714	cmp.CND	p9, p0 = w3, r18	C			M I
715	;;
716.mmi;	add	rpx = 32, rp		C			M I
717	st8	[rp] = w2, 8		C			M23
718   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
719.mmb;
720   (p8)	add	w3 = INCR, w3		C			M I
721	ADDSUB	w0 = u0, v0		C			M I
722	br	L(m67)			C			B
723
724
C Main loop, shared by func and func_nc.  One pass through the eight
C instruction groups stores eight result limbs.  Each group typically
C   - loads limbs for later groups into the u and v registers,
C   - computes one raw result with ADDSUB,
C   - derives the raw carry of the previous result with cmp.CND, and
C   - under the carry predicate of the limb before that, merges the wrap case
C     into the carry predicate with cmpeqor and applies INCR to the result.
C Results rotate through w0, w1, w2, w3 and their carry predicates through
C p6, p7, p8, p9.  Stores go through rp, except w2 which goes through rpx;
C the step sizes of 16 on the w1 store and 32 on the rpx store keep the two
C pointers interleaved.  lfetch advances upadv and vpadv by 64 once each per
C iteration.
C The labels inside the loop are the join points used by the feed-in code.
C br.cloop at the bottom repeats the loop under control of ar.lc.
725C *** MAIN LOOP START ***
726	ALIGN(32)
727L(top):
728L(c5):	ld8	v1 = [vp], 8		C			M01
729	cmp.CND	p7, p0 = w1, u1		C			M I
730   (p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
731	ld8	u1 = [up], 8		C			M01
732   (p9)	add	w0 = INCR, w0		C			M I
733	ADDSUB	w2 = u2, v2		C			M I
734	;;
735L(m5):	ld8	v2 = [vp], 8		C			M01
736	cmp.CND	p8, p0 = w2, u2		C			M I
737   (p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
738	ld8	u2 = [up], 8		C			M01
739   (p6)	add	w1 = INCR, w1		C			M I
740	ADDSUB	w3 = u3, v3		C			M I
741	;;
742	st8	[rp] = w0, 8		C			M23
743	ld8	v3 = [vp], 8		C			M01
744	cmp.CND	p9, p0 = w3, u3		C			M I
745   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
746	ld8	u3 = [up], 8		C			M01
747   (p7)	add	w2 = INCR, w2		C			M I
748	;;
749L(m4):	st8	[rp] = w1, 16		C			M23
750	st8	[rpx] = w2, 32		C			M23
751   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
752	lfetch	[upadv], 64
753   (p8)	add	w3 = INCR, w3		C			M I
754	ADDSUB	w0 = u0, v0		C			M I
755	;;
756L(m23):	st8	[rp] = w3, 8		C			M23
757	ld8	v0 = [vp], 8		C			M01
758	cmp.CND	p6, p0 = w0, u0		C			M I
759	ld8	u0 = [up], 8		C			M01
760	ADDSUB	w1 = u1, v1		C			M I
761	nop.b	0
762	;;
763L(c1):	ld8	v1 = [vp], 8		C			M01
764	cmp.CND	p7, p0 = w1, u1		C			M I
765   (p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
766	ld8	u1 = [up], 8		C			M01
767   (p9)	add	w0 = INCR, w0		C			M I
768	ADDSUB	w2 = u2, v2		C			M I
769	;;
770L(m1):	ld8	v2 = [vp], 8		C			M01
771	cmp.CND	p8, p0 = w2, u2		C			M I
772   (p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
773	ld8	u2 = [up], 8		C			M01
774   (p6)	add	w1 = INCR, w1		C			M I
775	ADDSUB	w3 = u3, v3		C			M I
776	;;
777	st8	[rp] = w0, 8		C			M23
778	ld8	v3 = [vp], 8		C			M01
779	cmp.CND	p9, p0 = w3, u3		C			M I
780   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
781	ld8	u3 = [up], 8		C			M01
782   (p7)	add	w2 = INCR, w2		C			M I
783	;;
784L(m0):	st8	[rp] = w1, 16		C			M23
785	st8	[rpx] = w2, 32		C			M23
786   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
787	lfetch	[vpadv], 64
788   (p8)	add	w3 = INCR, w3		C			M I
789	ADDSUB	w0 = u0, v0		C			M I
790	;;
791L(m67):	st8	[rp] = w3, 8		C			M23
792	ld8	v0 = [vp], 8		C			M01
793	cmp.CND	p6, p0 = w0, u0		C			M I
794	ld8	u0 = [up], 8		C			M01
795	ADDSUB	w1 = u1, v1		C			M I
796	br.cloop.dptk	L(top)		C			B
797	;;
798C *** MAIN LOOP END ***
799
C Wind-down and return, shared by func and func_nc.
C L(end) is reached when the loop branch falls through, and directly from
C .Lc101 for short operands.  It applies the pending carry p9 to w0 and
C restores ar.lc from r2.  From there the code stores the last five results
C in the order w0, w1, w2, w3, w0 through rp, propagating the carry through
C p6, p7, p8, p9 and back to p6 on the way.
C L(cj5) ... L(cj1) are entry points for feed-in code handling short operands
C with 5 ... 1 results still to be stored.  None of them restores ar.lc, so
C callers of these labels branch here without having modified it.
C r8 is cleared just before L(cj2); code entering at L(cj2) or L(cj1) must
C have cleared r8 itself.  The final carry or borrow p6 then sets r8 to 1,
C and r8 is the value left for the caller at the return through b0.
800L(end):
801.mmi;
802   (p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
803   (p9)	add	w0 = INCR, w0		C			M I
804	mov	ar.lc = r2		C			I0
805L(cj5):
806.mmi;	cmp.CND	p7, p0 = w1, u1		C			M I
807	ADDSUB	w2 = u2, v2		C			M I
808	nop	0
809	;;
810.mmi;	st8	[rp] = w0, 8		C			M23
811   (p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
812   (p6)	add	w1 = INCR, w1		C			M I
813L(cj4):
814.mmi;	cmp.CND	p8, p0 = w2, u2		C			M I
815	ADDSUB	w3 = u3, v3		C			M I
816	nop	0
817	;;
818.mmi;	st8	[rp] = w1, 8		C			M23
819   (p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
820   (p7)	add	w2 = INCR, w2		C			M I
821L(cj3):
822.mmi;	cmp.CND	p9, p0 = w3, u3		C			M I
823	ADDSUB	w0 = u0, v0		C			M I
824	nop	0
825	;;
826.mmi;	st8	[rp] = w2, 8		C			M23
827   (p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
828   (p8)	add	w3 = INCR, w3		C			M I
829.mmi;	cmp.CND	p6, p0 = w0, u0		C			M I
830	nop	0
831	mov	r8 = 0			C			M I
832	;;
833L(cj2):
834.mmi;	st8	[rp] = w3, 8		C			M23
835   (p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
836   (p9)	add	w0 = INCR, w0		C			M I
837	;;
838L(cj1):
839.mmb;	st8	[rp] = w0, 8		C			M23
840   (p6)	mov	r8 = 1			C			M I
841	br.ret.sptk.many b0		C			B
842EPILOGUE()
843ASM_END()
844