1dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
2dnl  add the result to an (n+1)-limb number.
3
4dnl  Copyright 2004, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C         cycles/limb
24C Itanium:    3.65
25C Itanium 2:  1.625
26
27C Note that this is very similar to mul_2.asm.  If you change this file,
28C please change that file too.
29
30C TODO
31C  * Clean up variable names, and try to decrease the number of distinct
32C    registers used.
33C  * Clean up feed-in code to not require zeroing several registers.
34C  * Make sure we don't depend on uninitialized predicate registers.
35C  * We currently cross-jump very aggressively, at the expense of a few cycles
36C    per operation.  Consider changing that.
37C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
38C    wind-down code.
39C  * Ultimately rewrite.  The problem with this code is that it first uses a
40C    loaded u value in one xma pair, then leaves it live over several unrelated
41C    xma pairs, before it uses it again.  It should actually be quite possible
42C    to just swap some aligned xma pairs around.  But we should then schedule
43C    u loads further from the first use.
44
45C INPUT PARAMETERS
C  rp  limb array that is read as the addend and written with the result
C  up  limb array holding the multiplicand
C  n   limb count, used to derive the loop count and the n mod 4 dispatch
C  vp  limb array holding the two multiplier limbs
46define(`rp',`r32')
47define(`up',`r33')
48define(`n',`r34')
49define(`vp',`r35')
50
C Second pointer into the rp array.  It is set to rp + 16 in the entry code
C and is used only for loads, while rp itself is used for the stores.
51define(`srp',`r3')
52
C The two multiplier limbs, loaded from vp[0] and vp[1].
53define(`v0',`f6')
54define(`v1',`f7')
55
C s0 is the finished sum limb that gets stored through rp; acc0 is the partial
C sum (pr0 + acc1 + carry) that feeds it.  The entry code also uses r14 under
C its raw name to hold n mod 4 before s0 is first written.
56define(`s0',`r14')
57define(`acc0',`r15')
58
C The _0 .. _3 suffixes below are four copies of each value, one per limb slot
C of the four-limb main loop.
C
C pr0_x: integer copy (getf.sig) of fp0b_x.
59define(`pr0_0',`r16') define(`pr0_1',`r17')
60define(`pr0_2',`r18') define(`pr0_3',`r19')
61
C pr1_x: integer copy (getf.sig) of fp1b_x.
62define(`pr1_0',`r20') define(`pr1_1',`r21')
63define(`pr1_2',`r22') define(`pr1_3',`r23')
64
C acc1_x: integer copy (getf.sig) of fp2a_x.
65define(`acc1_0',`r24') define(`acc1_1',`r25')
66define(`acc1_2',`r26') define(`acc1_3',`r27')
67
C r28, r29 and r30 have no symbolic name and are not used in this file.  r31
C has no symbolic name either but is used under its raw name by the n = 2
C path at .Lb10.
68dnl define(`',`r28')
69dnl define(`',`r29')
70dnl define(`',`r30')
71dnl define(`',`r31')
72
C fp0b_x: low half (xma.l) of u * v0 + r.
73define(`fp0b_0',`f8') define(`fp0b_1',`f9')
74define(`fp0b_2',`f10') define(`fp0b_3',`f11')
75
C fp1a_x: high half (xma.hu) of u * v0 + r.
76define(`fp1a_0',`f12') define(`fp1a_1',`f13')
77define(`fp1a_2',`f14') define(`fp1a_3',`f15')
78
C fp1b_x: low half (xma.l) of u * v1 + fp1a_x.
79define(`fp1b_0',`f32') define(`fp1b_1',`f33')
80define(`fp1b_2',`f34') define(`fp1b_3',`f35')
81
C fp2a_x: high half (xma.hu) of u * v1 + fp1a_x.
82define(`fp2a_0',`f36') define(`fp2a_1',`f37')
83define(`fp2a_2',`f38') define(`fp2a_3',`f39')
84
C r_x: addend limbs loaded through srp.
85define(`r_0',`f40') define(`r_1',`f41')
86define(`r_2',`f42') define(`r_3',`f43')
87
C u_x: multiplicand limbs loaded through up.
88define(`u_0',`f44') define(`u_1',`f45')
89define(`u_2',`f46') define(`u_3',`f47')
90
C The first two limbs of each operand, loaded by the entry code before the
C dispatch: rx, ry come from rp[0], rp[1] and ux, uy from up[0], up[1].
91define(`rx',`f48')
92define(`ux',`f49')
93define(`ry',`f50')
94define(`uy',`f51')
95
96ASM_START()
C mpn_addmul_2
C
C Computes {up,n} * {vp,2} + {rp,n}.  The low n+1 limbs of the result are
C stored at rp[0] .. rp[n] and the most significant limb is left in r8 at
C the return branch.
C
C ar.lc is saved in r2 on entry and restored before each return.  The entry
C code below loads the first two limbs of up and rp and both limbs of vp,
C sets the loop count to (n - 2) >> 2, and dispatches on n mod 4 to one of
C four feed-in sequences (.Lb00, .Lb01, .Lb10, .Lb11).
C
C NOTE(review): up[1] and rp[1] are loaded unconditionally and the loop count
C comes from an unsigned shift of n - 2, so n = 1 is not handled here;
C presumably callers guarantee n >= 2 -- confirm.
97PROLOGUE(mpn_addmul_2)
98	.prologue
C NOTE(review): .body follows .save directly, while the instruction that
C copies ar.lc into r2 sits in the first bundle after .body -- confirm the
C unwind information is as intended.
99	.save	ar.lc, r2
100	.body
101
C With the 32-bit ABI, convert the three pointer arguments with addp4 and
C zero-extend n from 32 bits.
102ifdef(`HAVE_ABI_32',
103`	addp4		rp = 0, rp		C			M I
104	addp4		up = 0, up		C			M I
105	addp4		vp = 0, vp		C			M I
106	zxt4		n = n			C			I
107	;;')
108
109{.mmi		C 00
C Cycle 00: load up[0], vp[0] and rp[0], save ar.lc in r2, put n mod 4 in r14
C and start forming the loop count with n - 2.
110	ldf8		ux = [up], 8		C			M
111	ldf8		v0 = [vp], 8		C			M
112	mov.i		r2 = ar.lc		C			I0
113}{.mmi
114	ldf8		rx = [rp], 8		C			M
115	and		r14 = 3, n		C			M I
116	add		n = -2, n		C			M I
117	;;
118}{.mmi		C 01
C Cycle 01: load up[1], vp[1] and rp[1].  The rp load post-decrements by 8, so
C rp points at rp[0] again for the stores.  n becomes (n - 2) >> 2.
119	ldf8		uy = [up], 8		C			M
120	ldf8		v1 = [vp]		C			M
121	shr.u		n = n, 2		C			I0
122}{.mmi
123	ldf8		ry = [rp], -8		C			M
124	cmp.eq		p10, p0 = 1, r14	C			M I
125	cmp.eq		p11, p0 = 2, r14	C			M I
126	;;
127}{.mmi		C 02
C Cycle 02: srp = address of rp[2] for the remaining addend loads; install the
C loop count; branch on n mod 4.  No branch is taken for n mod 4 = 0, which
C falls through into .Lb00.
128	add		srp = 16, rp		C			M I
129	cmp.eq		p12, p0 = 3, r14	C			M I
130	mov.i		ar.lc = n		C			I0
131}{.bbb
132  (p10) br.dptk		.Lb01			C			B
133  (p11) br.dptk		.Lb10			C			B
134  (p12) br.dptk		.Lb11			C			B
135	;;
136}
137
138	ALIGN(32)
C Feed-in for n mod 4 = 0.
C
C Loads the next two limbs of rp and up, zeroes acc1_2, pr1_2 and pr0_3 and
C clears the carry predicates p8 and p12, so that the first add and compare
C steps after the entry point see zero addends and no carry in.  The first
C products are started with ux, uy, rx and ry from the entry code.  Control
C then joins the main loop at .LL00 when the loop count is nonzero, or the
C wind-down code at .Lcj4 when it is zero (n = 4).
139.Lb00:	ldf8		r_1 = [srp], 8
140	ldf8		u_1 = [up], 8
141	mov		acc1_2 = 0
142	mov		pr1_2 = 0
143	mov		pr0_3 = 0
144	cmp.ne		p8, p9 = r0, r0		C p8 = 0, p9 = 1
145	;;
146	ldf8		r_2 = [srp], 8
147	xma.l		fp0b_3 = ux, v0, rx
148	cmp.ne		p12, p13 = r0, r0	C p12 = 0, p13 = 1
149	ldf8		u_2 = [up], 8
150	xma.hu		fp1a_3 = ux, v0, rx
151	br.cloop.dptk	.grt4
152
C Loop count was zero: no further loads, four limbs in total.
153	xma.l		fp0b_0 = uy, v0, ry
154	xma.hu		fp1a_0 = uy, v0, ry
155	;;
156	getf.sig	acc0 = fp0b_3
157	xma.l		fp1b_3 = ux, v1, fp1a_3
158	xma.hu		fp2a_3 = ux, v1, fp1a_3
159	;;
160	xma.l		fp0b_1 = u_1, v0, r_1
161	xma.hu		fp1a_1 = u_1, v0, r_1
162	;;
163	getf.sig	pr0_0 = fp0b_0
164	xma.l		fp1b_0 = uy, v1, fp1a_0
165	xma.hu		fp2a_0 = uy, v1, fp1a_0
166	;;
167	getf.sig	pr1_3 = fp1b_3
168	getf.sig	acc1_3 = fp2a_3
169	xma.l		fp0b_2 = u_2, v0, r_2
170	xma.hu		fp1a_2 = u_2, v0, r_2
171	br		.Lcj4
172
C Loop count was nonzero: same product sequence as above, interleaved with the
C loads of r_3, u_3, r_0 and u_0 that the main loop expects to be in flight.
173.grt4:	xma.l		fp0b_0 = uy, v0, ry
174	xma.hu		fp1a_0 = uy, v0, ry
175	;;
176	ldf8		r_3 = [srp], 8
177	getf.sig	acc0 = fp0b_3
178	xma.l		fp1b_3 = ux, v1, fp1a_3
179	ldf8		u_3 = [up], 8
180	xma.hu		fp2a_3 = ux, v1, fp1a_3
181	;;
182	xma.l		fp0b_1 = u_1, v0, r_1
183	xma.hu		fp1a_1 = u_1, v0, r_1
184	;;
185	ldf8		r_0 = [srp], 8
186	getf.sig	pr0_0 = fp0b_0
187	xma.l		fp1b_0 = uy, v1, fp1a_0
188	xma.hu		fp2a_0 = uy, v1, fp1a_0
189	;;
190	ldf8		u_0 = [up], 8
191	getf.sig	pr1_3 = fp1b_3
192	;;
193	getf.sig	acc1_3 = fp2a_3
194	xma.l		fp0b_2 = u_2, v0, r_2
195	xma.hu		fp1a_2 = u_2, v0, r_2
196	br		.LL00
197
198
199	ALIGN(32)
C Feed-in for n mod 4 = 1.
C
C Loads the next limbs of rp and up, zeroes acc1_1, pr1_1 and pr0_2 and clears
C the carry predicates p6 and p10, so that the first add and compare steps
C after the entry point see zero addends and no carry in.  Control then joins
C the main loop at .LL01 when the loop count is nonzero, or the wind-down code
C at .Lcj5 when it is zero (n = 5).
C
C The C M / C M I / C F notes on the right name the execution unit types.
200.Lb01:	ldf8		r_0 = [srp], 8		C M
201	ldf8		u_0 = [up], 8		C M
202	mov		acc1_1 = 0		C M I
203	mov		pr1_1 = 0		C M I
204	mov		pr0_2 = 0		C M I
205	cmp.ne		p6, p7 = r0, r0		C M I
206	;;
207	ldf8		r_1 = [srp], 8		C M
208	xma.l		fp0b_2 = ux, v0, rx	C F
209	cmp.ne		p10, p11 = r0, r0	C M I
210	ldf8		u_1 = [up], 8		C M
211	xma.hu		fp1a_2 = ux, v0, rx	C F
212	;;
213	xma.l		fp0b_3 = uy, v0, ry	C F
214	xma.hu		fp1a_3 = uy, v0, ry	C F
215	;;
216	getf.sig	acc0 = fp0b_2		C M
217	ldf8		r_2 = [srp], 8		C M
218	xma.l		fp1b_2 = ux, v1,fp1a_2	C F
219	xma.hu		fp2a_2 = ux, v1,fp1a_2	C F
220	ldf8		u_2 = [up], 8		C M
221	br.cloop.dptk	.grt5
222
C Loop count was zero: no further loads, five limbs in total.
223	xma.l		fp0b_0 = u_0, v0, r_0	C F
224	xma.hu		fp1a_0 = u_0, v0, r_0	C F
225	;;
226	getf.sig	pr0_3 = fp0b_3		C M
227	xma.l		fp1b_3 = uy, v1,fp1a_3	C F
228	xma.hu		fp2a_3 = uy, v1,fp1a_3	C F
229	;;
230	getf.sig	pr1_2 = fp1b_2		C M
231	getf.sig	acc1_2 = fp2a_2		C M
232	xma.l		fp0b_1 = u_1, v0, r_1	C F
233	xma.hu		fp1a_1 = u_1, v0, r_1	C F
234	br		.Lcj5
235
C Loop count was nonzero: same product sequence as above, interleaved with the
C loads of r_3 and u_3 that the main loop expects to be in flight.
236.grt5:	xma.l		fp0b_0 = u_0, v0, r_0
237	xma.hu		fp1a_0 = u_0, v0, r_0
238	;;
239	getf.sig	pr0_3 = fp0b_3
240	ldf8		r_3 = [srp], 8
241	xma.l		fp1b_3 = uy, v1, fp1a_3
242	xma.hu		fp2a_3 = uy, v1, fp1a_3
243	;;
244	ldf8		u_3 = [up], 8
245	getf.sig	pr1_2 = fp1b_2
246	;;
247	getf.sig	acc1_2 = fp2a_2
248	xma.l		fp0b_1 = u_1, v0, r_1
249	xma.hu		fp1a_1 = u_1, v0, r_1
250	br		.LL01
251
252
253	ALIGN(32)
C Feed-in for n mod 4 = 2.
C
C When the loop count is zero (n = 2) the whole operation is done right here
C in straight-line code and the function returns without touching the main
C loop or the wind-down code.  Otherwise .grt2 primes the pipeline and joins
C the main loop at .LL10.
C
C The bare C nn notes are the cycle numbers of the n = 2 path.
254.Lb10:		C 03
255	br.cloop.dptk	.grt2
256		C 04
257		C 05
258		C 06
C n = 2: form the four partial products from ux, uy, v0, v1, rx and ry.
259	xma.l		fp0b_1 = ux, v0, rx
260	xma.hu		fp1a_1 = ux, v0, rx
261	;;	C 07
262	xma.l		fp0b_2 = uy, v0, ry
263	xma.hu		fp1a_2 = uy, v0, ry
264	;;	C 08
265		C 09
266		C 10
C The lowest result limb needs no addition and is stored straight from the
C floating-point register.
267	stf8		[rp] = fp0b_1, 8
268	xma.l		fp1b_1 = ux, v1, fp1a_1
269	xma.hu		fp2a_1 = ux, v1, fp1a_1
270	;;	C 11
271	getf.sig	acc0 = fp0b_2
272	xma.l		fp1b_2 = uy, v1, fp1a_2
273	xma.hu		fp2a_2 = uy, v1, fp1a_2
274	;;	C 12
275		C 13
276		C 14
277	getf.sig	pr1_1 = fp1b_1
278		C 15
279	getf.sig	acc1_1 = fp2a_1
280		C 16
281	getf.sig	pr1_2 = fp1b_2
282		C 17
C The top product half goes directly into r8; a carry is added to it below.
283	getf.sig	r8 = fp2a_2
284	;;	C 18
285		C 19
286	add		s0 = pr1_1, acc0
287	;;	C 20
C Second result limb.  p8 = carry out of the add above (sum < addend).
C r31 = -1 - acc1_1 is the largest value that can be added to acc1_1 without
C carry; it is used to predict the carry of the next add.
288	st8		[rp] = s0, 8
289	cmp.ltu		p8, p9 = s0, pr1_1
290	sub		r31 = -1, acc1_1
291	;;	C 21
C Third result limb = pr1_2 + acc1_1 + carry.  p10 = carry out of that sum,
C found by comparing pr1_2 against r31: a carry occurs when pr1_2 > r31, or
C pr1_2 >= r31 if a carry came in.
292	.pred.rel "mutex", p8, p9
293  (p8)	add		acc0 = pr1_2, acc1_1, 1
294  (p9)	add		acc0 = pr1_2, acc1_1
295  (p8)	cmp.leu		p10, p0 = r31, pr1_2
296  (p9)	cmp.ltu		p10, p0 = r31, pr1_2
297	;;	C 22
C Store the third limb, restore ar.lc, fold the last carry into r8, return.
298	st8		[rp] = acc0, 8
299	mov.i		ar.lc = r2
300  (p10)	add		r8 = 1, r8
301	br.ret.sptk.many b0
302
303
C Loop count was nonzero.  Zeroes acc1_0, pr1_0 and pr0_1 and clears the carry
C predicates p8 and p12 so that the first add and compare steps at .Loop see
C zero addends and no carry in.  The entry point .LL10 is the loop-closing
C br.cloop itself, so the loop count is tested once more on the way in.
304.grt2:	ldf8		r_3 = [srp], 8
305	ldf8		u_3 = [up], 8
306	mov		acc1_0 = 0
307	;;
308	ldf8		r_0 = [srp], 8
309	xma.l		fp0b_1 = ux, v0, rx
310	mov		pr1_0 = 0
311	ldf8		u_0 = [up], 8
312	xma.hu		fp1a_1 = ux, v0, rx
313	mov		pr0_1 = 0
314	;;
315	xma.l		fp0b_2 = uy, v0, ry
316	xma.hu		fp1a_2 = uy, v0, ry
317	;;
318	getf.sig	acc0 = fp0b_1
319	ldf8		r_1 = [srp], 8
320	xma.l		fp1b_1 = ux, v1, fp1a_1
321	xma.hu		fp2a_1 = ux, v1, fp1a_1
322	;;
323	ldf8		u_1 = [up], 8
324	xma.l		fp0b_3 = u_3, v0, r_3
325	xma.hu		fp1a_3 = u_3, v0, r_3
326	;;
327	getf.sig	pr0_2 = fp0b_2
328	ldf8		r_2 = [srp], 8
329	xma.l		fp1b_2 = uy, v1, fp1a_2
330	xma.hu		fp2a_2 = uy, v1, fp1a_2
331	;;
332	ldf8		u_2 = [up], 8
333	getf.sig	pr1_1 = fp1b_1
334	;;
335	getf.sig	acc1_1 = fp2a_1
336	xma.l		fp0b_0 = u_0, v0, r_0
337	cmp.ne		p8, p9 = r0, r0
338	cmp.ne		p12, p13 = r0, r0
339	xma.hu		fp1a_0 = u_0, v0, r_0
340	br		.LL10
341
342
343	ALIGN(32)
C Feed-in for n mod 4 = 3.
C
C Zeroes acc1_3, pr1_3 and pr0_0 and clears the carry predicates p6 and p10,
C so that the first add and compare steps after the entry point see zero
C addends and no carry in.  Control then joins the main loop at .LL11 when
C the loop count is nonzero, or the wind-down code at .Lcj3 when it is zero
C (n = 3).
344.Lb11:	mov		acc1_3 = 0
345	mov		pr1_3 = 0
346	mov		pr0_0 = 0
347	cmp.ne		p6, p7 = r0, r0		C p6 = 0, p7 = 1
348	;;
349	ldf8		r_2 = [srp], 8
350	ldf8		u_2 = [up], 8
351	br.cloop.dptk	.grt3
352	;;
C Loop count was zero: no further loads, three limbs in total.
353	xma.l		fp0b_0 = ux, v0, rx
354	xma.hu		fp1a_0 = ux, v0, rx
355	;;
356	cmp.ne		p10, p11 = r0, r0	C p10 = 0, p11 = 1
357	xma.l		fp0b_1 = uy, v0, ry
358	xma.hu		fp1a_1 = uy, v0, ry
359	;;
360	getf.sig	acc0 = fp0b_0
361	xma.l		fp1b_0 = ux, v1, fp1a_0
362	xma.hu		fp2a_0 = ux, v1, fp1a_0
363	;;
364	xma.l		fp0b_2 = u_2, v0, r_2
365	xma.hu		fp1a_2 = u_2, v0, r_2
366	;;
367	getf.sig	pr0_1 = fp0b_1
368	xma.l		fp1b_1 = uy, v1, fp1a_1
369	xma.hu		fp2a_1 = uy, v1, fp1a_1
370	;;
371	getf.sig	pr1_0 = fp1b_0
372	getf.sig	acc1_0 = fp2a_0
373	br		.Lcj3
374
C Loop count was nonzero: same product sequence as above, interleaved with the
C loads of r_3, u_3, r_0, u_0, r_1 and u_1 that the main loop expects to be in
C flight.
375.grt3:	ldf8		r_3 = [srp], 8
376	xma.l		fp0b_0 = ux, v0, rx
377	cmp.ne		p10, p11 = r0, r0	C p10 = 0, p11 = 1
378	ldf8		u_3 = [up], 8
379	xma.hu		fp1a_0 = ux, v0, rx
380	;;
381	xma.l		fp0b_1 = uy, v0, ry
382	xma.hu		fp1a_1 = uy, v0, ry
383	;;
384	getf.sig	acc0 = fp0b_0
385	ldf8		r_0 = [srp], 8
386	xma.l		fp1b_0 = ux, v1, fp1a_0
387	ldf8		u_0 = [up], 8
388	xma.hu		fp2a_0 = ux, v1, fp1a_0
389	;;
390	xma.l		fp0b_2 = u_2, v0, r_2
391	xma.hu		fp1a_2 = u_2, v0, r_2
392	;;
393	getf.sig	pr0_1 = fp0b_1
394	ldf8		r_1 = [srp], 8
395	xma.l		fp1b_1 = uy, v1, fp1a_1
396	xma.hu		fp2a_1 = uy, v1, fp1a_1
397	;;
398	ldf8		u_1 = [up], 8
399	getf.sig	pr1_0 = fp1b_0
400	;;
401	getf.sig	acc1_0 = fp2a_0
402	xma.l		fp0b_3 = u_3, v0, r_3
403	xma.hu		fp1a_3 = u_3, v0, r_3
404	br		.LL11
405
406
407C *** MAIN LOOP START ***
C Each iteration stores four result limbs.  The _0 .. _3 register copies take
C turns, one per limb slot.  For a limb slot x the floating-point side forms
C   fp1a_x : fp0b_x = u_x * v0 + r_x       (high : low)
C   fp2a_x : fp1b_x = u_x * v1 + fp1a_x    (high : low)
C and getf.sig copies fp0b_x, fp1b_x and fp2a_x to pr0_x, pr1_x and acc1_x.
C The integer side then forms, with the slot indices staggered,
C   acc0 = pr0 + acc1 + carry   (carry alternates between p6/p7 and p8/p9)
C   s0   = pr1 + acc0 + carry   (carry alternates between p10/p11 and p12/p13)
C and stores s0 as the next result limb.
C
C Carry out of an add is found by comparing the sum with one of its addends:
C sum <= addend (cmp.leu) if a carry came in, sum < addend (cmp.ltu) if not.
C Each add and compare therefore exists in two predicated variants, selected
C by the carry predicate pair of the previous step.
C
C .LL01, .LL00, .LL11 and .LL10 are the points where the four feed-in
C sequences join the loop.  The C nn notes on the right are cycle numbers.
408	ALIGN(32)
409.Loop:						C 00
410	.pred.rel "mutex", p12, p13
411	getf.sig	pr0_3 = fp0b_3
412	ldf8		r_3 = [srp], 8
413	xma.l		fp1b_3 = u_3, v1, fp1a_3
414  (p12)	add		s0 = pr1_0, acc0, 1
415  (p13)	add		s0 = pr1_0, acc0
416	xma.hu		fp2a_3 = u_3, v1, fp1a_3
417	;;					C 01
418	.pred.rel "mutex", p8, p9
419	.pred.rel "mutex", p12, p13
420	ldf8		u_3 = [up], 8
421	getf.sig	pr1_2 = fp1b_2
422  (p8)	cmp.leu		p6, p7 = acc0, pr0_1
423  (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
424  (p12)	cmp.leu		p10, p11 = s0, pr1_0
425  (p13)	cmp.ltu		p10, p11 = s0, pr1_0
426	;;					C 02
427	.pred.rel "mutex", p6, p7
428	getf.sig	acc1_2 = fp2a_2
429	st8		[rp] = s0, 8
430	xma.l		fp0b_1 = u_1, v0, r_1
431  (p6)	add		acc0 = pr0_2, acc1_0, 1
432  (p7)	add		acc0 = pr0_2, acc1_0
433	xma.hu		fp1a_1 = u_1, v0, r_1
434	;;					C 03
C Entry point from the n mod 4 = 1 feed-in (.grt5).
435.LL01:
436	.pred.rel "mutex", p10, p11
437	getf.sig	pr0_0 = fp0b_0
438	ldf8		r_0 = [srp], 8
439	xma.l		fp1b_0 = u_0, v1, fp1a_0
440  (p10)	add		s0 = pr1_1, acc0, 1
441  (p11)	add		s0 = pr1_1, acc0
442	xma.hu		fp2a_0 = u_0, v1, fp1a_0
443	;;					C 04
444	.pred.rel "mutex", p6, p7
445	.pred.rel "mutex", p10, p11
446	ldf8		u_0 = [up], 8
447	getf.sig	pr1_3 = fp1b_3
448  (p6)	cmp.leu		p8, p9 = acc0, pr0_2
449  (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
450  (p10)	cmp.leu		p12, p13 = s0, pr1_1
451  (p11)	cmp.ltu		p12, p13 = s0, pr1_1
452	;;					C 05
453	.pred.rel "mutex", p8, p9
454	getf.sig	acc1_3 = fp2a_3
455	st8		[rp] = s0, 8
456	xma.l		fp0b_2 = u_2, v0, r_2
457  (p8)	add		acc0 = pr0_3, acc1_1, 1
458  (p9)	add		acc0 = pr0_3, acc1_1
459	xma.hu		fp1a_2 = u_2, v0, r_2
460	;;					C 06
C Entry point from the n mod 4 = 0 feed-in (.grt4).
461.LL00:
462	.pred.rel "mutex", p12, p13
463	getf.sig	pr0_1 = fp0b_1
464	ldf8		r_1 = [srp], 8
465	xma.l		fp1b_1 = u_1, v1, fp1a_1
466  (p12)	add		s0 = pr1_2, acc0, 1
467  (p13)	add		s0 = pr1_2, acc0
468	xma.hu		fp2a_1 = u_1, v1, fp1a_1
469	;;					C 07
470	.pred.rel "mutex", p8, p9
471	.pred.rel "mutex", p12, p13
472	ldf8		u_1 = [up], 8
473	getf.sig	pr1_0 = fp1b_0
474  (p8)	cmp.leu		p6, p7 = acc0, pr0_3
475  (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
476  (p12)	cmp.leu		p10, p11 = s0, pr1_2
477  (p13)	cmp.ltu		p10, p11 = s0, pr1_2
478	;;					C 08
479	.pred.rel "mutex", p6, p7
480	getf.sig	acc1_0 = fp2a_0
481	st8		[rp] = s0, 8
482	xma.l		fp0b_3 = u_3, v0, r_3
483  (p6)	add		acc0 = pr0_0, acc1_2, 1
484  (p7)	add		acc0 = pr0_0, acc1_2
485	xma.hu		fp1a_3 = u_3, v0, r_3
486	;;					C 09
C Entry point from the n mod 4 = 3 feed-in (.grt3).
487.LL11:
488	.pred.rel "mutex", p10, p11
489	getf.sig	pr0_2 = fp0b_2
490	ldf8		r_2 = [srp], 8
491	xma.l		fp1b_2 = u_2, v1, fp1a_2
492  (p10)	add		s0 = pr1_3, acc0, 1
493  (p11)	add		s0 = pr1_3, acc0
494	xma.hu		fp2a_2 = u_2, v1, fp1a_2
495	;;					C 10
496	.pred.rel "mutex", p6, p7
497	.pred.rel "mutex", p10, p11
498	ldf8		u_2 = [up], 8
499	getf.sig	pr1_1 = fp1b_1
500  (p6)	cmp.leu		p8, p9 = acc0, pr0_0
501  (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
502  (p10)	cmp.leu		p12, p13 = s0, pr1_3
503  (p11)	cmp.ltu		p12, p13 = s0, pr1_3
504	;;					C 11
505	.pred.rel "mutex", p8, p9
506	getf.sig	acc1_1 = fp2a_1
507	st8		[rp] = s0, 8
508	xma.l		fp0b_0 = u_0, v0, r_0
509  (p8)	add		acc0 = pr0_1, acc1_3, 1
510  (p9)	add		acc0 = pr0_1, acc1_3
511	xma.hu		fp1a_0 = u_0, v0, r_0
C Entry point from the n mod 4 = 2 feed-in (.grt2), and the loop-closing
C branch: repeat while ar.lc is nonzero, else fall into the wind-down code.
512.LL10:	br.cloop.dptk	.Loop			C 12
513	;;
514C *** MAIN LOOP END ***
515
C Wind-down code.  It repeats the add, compare and store steps of the main
C loop without the loads, and the multiplies drop out as the last limbs drain
C through the pipeline.  .Lcj5, .Lcj4 and .Lcj3 are also entered directly by
C the feed-in sequences when the loop count is zero.  No branch in this file
C targets .Lcj6 or .Lcj2; they are reached by fall-through only.
516.Lcj6:
517	.pred.rel "mutex", p12, p13
518	getf.sig	pr0_3 = fp0b_3
519	xma.l		fp1b_3 = u_3, v1, fp1a_3
520  (p12)	add		s0 = pr1_0, acc0, 1
521  (p13)	add		s0 = pr1_0, acc0
522	xma.hu		fp2a_3 = u_3, v1, fp1a_3
523	;;
524	.pred.rel "mutex", p8, p9
525	.pred.rel "mutex", p12, p13
526	getf.sig	pr1_2 = fp1b_2
527  (p8)	cmp.leu		p6, p7 = acc0, pr0_1
528  (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
529  (p12)	cmp.leu		p10, p11 = s0, pr1_0
530  (p13)	cmp.ltu		p10, p11 = s0, pr1_0
531	;;
532	.pred.rel "mutex", p6, p7
533	getf.sig	acc1_2 = fp2a_2
534	st8		[rp] = s0, 8
535	xma.l		fp0b_1 = u_1, v0, r_1
536  (p6)	add		acc0 = pr0_2, acc1_0, 1
537  (p7)	add		acc0 = pr0_2, acc1_0
538	xma.hu		fp1a_1 = u_1, v0, r_1
539	;;
C Entry point from .Lb01 when the loop count is zero.
540.Lcj5:
541	.pred.rel "mutex", p10, p11
542	getf.sig	pr0_0 = fp0b_0
543	xma.l		fp1b_0 = u_0, v1, fp1a_0
544  (p10)	add		s0 = pr1_1, acc0, 1
545  (p11)	add		s0 = pr1_1, acc0
546	xma.hu		fp2a_0 = u_0, v1, fp1a_0
547	;;
548	.pred.rel "mutex", p6, p7
549	.pred.rel "mutex", p10, p11
550	getf.sig	pr1_3 = fp1b_3
551  (p6)	cmp.leu		p8, p9 = acc0, pr0_2
552  (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
553  (p10)	cmp.leu		p12, p13 = s0, pr1_1
554  (p11)	cmp.ltu		p12, p13 = s0, pr1_1
555	;;
556	.pred.rel "mutex", p8, p9
557	getf.sig	acc1_3 = fp2a_3
558	st8		[rp] = s0, 8
559	xma.l		fp0b_2 = u_2, v0, r_2
560  (p8)	add		acc0 = pr0_3, acc1_1, 1
561  (p9)	add		acc0 = pr0_3, acc1_1
562	xma.hu		fp1a_2 = u_2, v0, r_2
563	;;
C Entry point from .Lb00 when the loop count is zero.
564.Lcj4:
565	.pred.rel "mutex", p12, p13
566	getf.sig	pr0_1 = fp0b_1
567	xma.l		fp1b_1 = u_1, v1, fp1a_1
568  (p12)	add		s0 = pr1_2, acc0, 1
569  (p13)	add		s0 = pr1_2, acc0
570	xma.hu		fp2a_1 = u_1, v1, fp1a_1
571	;;
572	.pred.rel "mutex", p8, p9
573	.pred.rel "mutex", p12, p13
574	getf.sig	pr1_0 = fp1b_0
575  (p8)	cmp.leu		p6, p7 = acc0, pr0_3
576  (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
577  (p12)	cmp.leu		p10, p11 = s0, pr1_2
578  (p13)	cmp.ltu		p10, p11 = s0, pr1_2
579	;;
580	.pred.rel "mutex", p6, p7
581	getf.sig	acc1_0 = fp2a_0
582	st8		[rp] = s0, 8
583  (p6)	add		acc0 = pr0_0, acc1_2, 1
584  (p7)	add		acc0 = pr0_0, acc1_2
585	;;
C Entry point from .Lb11 when the loop count is zero.  The last multiply pair
C by v1 is issued here; from now on only integer work remains.
586.Lcj3:
587	.pred.rel "mutex", p10, p11
588	getf.sig	pr0_2 = fp0b_2
589	xma.l		fp1b_2 = u_2, v1, fp1a_2
590  (p10)	add		s0 = pr1_3, acc0, 1
591  (p11)	add		s0 = pr1_3, acc0
592	xma.hu		fp2a_2 = u_2, v1, fp1a_2
593	;;
594	.pred.rel "mutex", p6, p7
595	.pred.rel "mutex", p10, p11
596	getf.sig	pr1_1 = fp1b_1
597  (p6)	cmp.leu		p8, p9 = acc0, pr0_0
598  (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
599  (p10)	cmp.leu		p12, p13 = s0, pr1_3
600  (p11)	cmp.ltu		p12, p13 = s0, pr1_3
601	;;
602	.pred.rel "mutex", p8, p9
603	getf.sig	acc1_1 = fp2a_1
604	st8		[rp] = s0, 8
605  (p8)	add		acc0 = pr0_1, acc1_3, 1
606  (p9)	add		acc0 = pr0_1, acc1_3
607	;;
608.Lcj2:
609	.pred.rel "mutex", p12, p13
610  (p12)	add		s0 = pr1_0, acc0, 1
611  (p13)	add		s0 = pr1_0, acc0
612	;;
613	.pred.rel "mutex", p8, p9
614	.pred.rel "mutex", p12, p13
615	getf.sig	pr1_2 = fp1b_2
616  (p8)	cmp.leu		p6, p7 = acc0, pr0_1
617  (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
618  (p12)	cmp.leu		p10, p11 = s0, pr1_0
619  (p13)	cmp.ltu		p10, p11 = s0, pr1_0
620	;;
621	.pred.rel "mutex", p6, p7
622	getf.sig	acc1_2 = fp2a_2
623	st8		[rp] = s0, 8
624  (p6)	add		acc0 = pr0_2, acc1_0, 1
625  (p7)	add		acc0 = pr0_2, acc1_0
626	;;
627	.pred.rel "mutex", p10, p11
628  (p10)	add		s0 = pr1_1, acc0, 1
629  (p11)	add		s0 = pr1_1, acc0
630	;;
631	.pred.rel "mutex", p6, p7
632	.pred.rel "mutex", p10, p11
633  (p6)	cmp.leu		p8, p9 = acc0, pr0_2
634  (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
635  (p10)	cmp.leu		p12, p13 = s0, pr1_1
636  (p11)	cmp.ltu		p12, p13 = s0, pr1_1
637	;;
C Last but one stored limb goes out here.  The last stored limb is
C pr1_2 + acc1_1 plus the carries of both chains (p8/p9 and p12).
638	.pred.rel "mutex", p8, p9
639	st8		[rp] = s0, 8
640  (p8)	add		acc0 = pr1_2, acc1_1, 1
641  (p9)	add		acc0 = pr1_2, acc1_1
642	;;
C p10 = carry out of the add above.  The compares read acc0 before the
C predicated increment in the same instruction group writes it; that
C increment adds the carry still pending from the s0 chain (p12).
643	.pred.rel "mutex", p8, p9
644  (p8)	cmp.leu		p10, p11 = acc0, pr1_2
645  (p9)	cmp.ltu		p10, p11 = acc0, pr1_2
646  (p12)	add		acc0 = 1, acc0
647	;;
C Store the last limb.  If the increment wrapped acc0 to zero that is a carry
C too; the or-type compare can only set p10, never clear it.  r8 starts out
C as the top product half acc1_2.
648	st8		[rp] = acc0, 8
649  (p12)	cmp.eq.or	p10, p0 = 0, acc0
650	mov		r8 = acc1_2
651	;;
C NOTE(review): after the or-type compare above, p10 and p11 can both be set,
C so the mutex relation declared here does not always hold.  Only p10 is used
C below -- confirm the directive is harmless or drop it.
652	.pred.rel "mutex", p10, p11
C Fold the final carry into r8, restore ar.lc from r2 and return.
653  (p10)	add		r8 = 1, r8
654	mov.i		ar.lc = r2
655	br.ret.sptk.many b0
656EPILOGUE()
657ASM_END()
658