1dnl  IA-64 mpn_bdiv_dbm1c.
2
3dnl  Copyright 2008, 2009 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C         cycles/limb
23C Itanium:    4
24C Itanium 2:  2
25
26C TODO
27C  * Optimize feed-in and wind-down code, both for speed and code size.
28
29C INPUT PARAMETERS
C These m4 aliases name r32-r35.  The code below mostly spells r32 and r33
C directly instead of rp and up.  r36 is read once at entry into r15 and has
C no alias (presumably the fifth argument, the incoming carry/borrow term --
C confirm against the C prototype).
30define(`rp', `r32')
31define(`up', `r33')
32define(`n', `r34')
33define(`bd', `r35')
34
C mpn_bdiv_dbm1c: entry sequence and dispatch on n mod 4.
C
C Register roles set up here and used by the rest of the routine:
C   r32 (rp)  destination pointer, advanced by the post-incrementing stores
C   r33 (up)  source pointer, advanced by 8 by every ldf8
C   r15       running value, seeded from r36; every limb step subtracts the
C             low and then the high product word from it
C   f6        the multiplier bd, moved into a floating-point significand
C   f9        first source limb
C   r2        copy of ar.lc taken at entry, restored before the return
C   ar.lc     (n - 1) >> 2, consumed by the br.cloop instructions
C   r14       n mod 4, selects one of the four feed-in sequences
C
C NOTE(review): n = 0 would make (n - 1) >> 2 huge; presumably callers
C guarantee n >= 1 -- confirm.
C NOTE(review): .save names r2 as the ar.lc save register inside .prologue,
C but the mov r2 = ar.lc that performs the save is issued after .body.
35ASM_START()
36PROLOGUE(mpn_bdiv_dbm1c)
37	.prologue
38	.save		ar.lc, r2
39	.body
40
C With HAVE_ABI_32 defined, widen both pointers with addp4 and zero-extend n.
41ifdef(`HAVE_ABI_32',
42`	addp4		rp = 0, rp		C M I
43	addp4		up = 0, up		C M I
44	zxt4		n = n			C I
45	;;
46')
C Seed the running value from r36 and fetch the first limb.
47{.mmb
48	mov		r15 = r36		C M I
49	ldf8		f9 = [up], 8		C M
50	nop.b		0			C B
51}
C .Lcommon is not referenced anywhere in the visible source.
52.Lcommon:
C r16 = n - 1, r2 = saved ar.lc, r14 = n mod 4.
53{.mii
54	adds		r16 = -1, n		C M I
55	mov		r2 = ar.lc		C I0
56	and		r14 = 3, n		C M I
57	;;
58}
C f6 = bd, r31 = (n - 1) >> 2, p10 = (n mod 4 == 0).
59{.mii
60	setf.sig	f6 = bd			C M2 M3
61	shr.u		r31 = r16, 2		C I0
62	cmp.eq		p10, p0 = 0, r14	C M I
63}
C p11 = (n mod 4 == 2), p12 = (n mod 4 == 3).
64{.mii
65	nop.m		0			C M
66	cmp.eq		p11, p0 = 2, r14	C M I
67	cmp.eq		p12, p0 = 3, r14	C M I
68	;;
69}
C Comparing r0 with itself for inequality is always false, so this clears
C p6 and p8 and sets p7 and p9.  p8 and p9 are not referenced again in the
C visible code.  ar.lc receives the loop count.
70{.mii
71	cmp.ne		p6, p7 = r0, r0		C M I
72	mov.i		ar.lc = r31		C I0
73	cmp.ne		p8, p9 = r0, r0		C M I
74}
C Three-way dispatch; when none of p10, p11, p12 is set (n mod 4 == 1)
C execution falls through to .Lb01.
75{.bbb
76  (p10)	br.dptk		.Lb00			C B
77  (p11)	br.dptk		.Lb10			C B
78  (p12)	br.dptk		.Lb11			C B
79	;;
80}
81
C Feed-in for n mod 4 == 1.  f9 already holds the first limb and f6 holds bd.
C br.cloop branches (and decrements ar.lc) while ar.lc is nonzero, so the
C fall-through below runs only when ar.lc == 0, i.e. n == 1.
82.Lb01:	br.cloop.dptk	.grt1
83	;;
C n == 1: form the single product (low word in r26, high word in r27) and
C join the last wind-down step.
84	xma.l		f38 = f9, f6, f0
85	xma.hu		f39 = f9, f6, f0
86	;;
87	getf.sig	r26 = f38
88	getf.sig	r27 = f39
89	br		.Lcj1
90
C n >= 5: load four more limbs into f10-f13 while starting the products of
C the first limb (f38/f39) and of f10 (f32/f33).
91.grt1:	ldf8		f10 = [r33], 8
92	;;
93	ldf8		f11 = [r33], 8
94	;;
95	ldf8		f12 = [r33], 8
96	;;
97	xma.l		f38 = f9, f6, f0
98	xma.hu		f39 = f9, f6, f0
99	;;
100	ldf8		f13 = [r33], 8
101	;;
102	xma.l		f32 = f10, f6, f0
103	xma.hu		f33 = f10, f6, f0
C Taken when another loop count remains (n >= 9); otherwise n == 5.
104	br.cloop.dptk	.grt5
105
C n == 5: no further loads.  Issue the remaining three products, move the
C finished words to integer registers and join the wind-down at .Lcj5.
106	;;
107	getf.sig	r26 = f38
108	xma.l		f34 = f11, f6, f0
109	xma.hu		f35 = f11, f6, f0
110	;;
111	getf.sig	r27 = f39
112	;;
113	getf.sig	r20 = f32
114	xma.l		f36 = f12, f6, f0
115	xma.hu		f37 = f12, f6, f0
116	;;
117	getf.sig	r21 = f33
118	;;
119	getf.sig	r22 = f34
120	xma.l		f38 = f13, f6, f0
121	xma.hu		f39 = f13, f6, f0
122	br		.Lcj5
123
C n >= 9: same product sequence as above, but f10, f11 and f12 are reloaded
C with the next limbs as soon as their old values have been consumed, then
C the main loop is entered at .LL01.
124.grt5:	ldf8		f10 = [r33], 8
125	;;
126	getf.sig	r26 = f38
127	xma.l		f34 = f11, f6, f0
128	xma.hu		f35 = f11, f6, f0
129	;;
130	getf.sig	r27 = f39
131	ldf8		f11 = [r33], 8
132	;;
133	getf.sig	r20 = f32
134	xma.l		f36 = f12, f6, f0
135	xma.hu		f37 = f12, f6, f0
136	;;
137	getf.sig	r21 = f33
138	ldf8		f12 = [r33], 8
139	;;
140	getf.sig	r22 = f34
141	xma.l		f38 = f13, f6, f0
142	xma.hu		f39 = f13, f6, f0
143	br		.LL01
144
C Feed-in for n mod 4 == 2.  f9 holds the first limb; the second limb is
C loaded into f13.  br.cloop falls through only when ar.lc == 0, i.e. n == 2.
145.Lb10:	ldf8		f13 = [r33], 8
146	br.cloop.dptk	.grt2
147	;;
148
C n == 2: two products, words delivered in (r24, r25) and (r26, r27), then
C join the wind-down at .Lcj2.
149	xma.l		f36 = f9, f6, f0
150	xma.hu		f37 = f9, f6, f0
151	;;
152	xma.l		f38 = f13, f6, f0
153	xma.hu		f39 = f13, f6, f0
154	;;
155	getf.sig	r24 = f36
156	;;
157	getf.sig	r25 = f37
158	;;
159	getf.sig	r26 = f38
160	;;
161	getf.sig	r27 = f39
162	br		.Lcj2
163
C n >= 6: load four more limbs (f10, f11, f12, then f13 again once its old
C value has been multiplied) while starting the first three products.
164.grt2:	ldf8		f10 = [r33], 8
165	;;
166	ldf8		f11 = [r33], 8
167	;;
168	xma.l		f36 = f9, f6, f0
169	xma.hu		f37 = f9, f6, f0
170	;;
171	ldf8		f12 = [r33], 8
172	;;
173	xma.l		f38 = f13, f6, f0
174	xma.hu		f39 = f13, f6, f0
175	;;
176	ldf8		f13 = [r33], 8
177	;;
178	getf.sig	r24 = f36
179	xma.l		f32 = f10, f6, f0
180	xma.hu		f33 = f10, f6, f0
C Taken when another loop count remains (n >= 10); otherwise n == 6.
181	br.cloop.dptk	.grt6
182
C n == 6: no further loads; finish the feed-in and join the wind-down at
C .Lcj6.
183	getf.sig	r25 = f37
184	;;
185	getf.sig	r26 = f38
186	xma.l		f34 = f11, f6, f0
187	xma.hu		f35 = f11, f6, f0
188	;;
189	getf.sig	r27 = f39
190	;;
191	getf.sig	r20 = f32
192	xma.l		f36 = f12, f6, f0
193	xma.hu		f37 = f12, f6, f0
194	br		.Lcj6
195
C n >= 10: same sequence with f10 and f11 reloaded for the next round, then
C enter the main loop at .LL10.
196.grt6:	getf.sig	r25 = f37
197	ldf8		f10 = [r33], 8
198	;;
199	getf.sig	r26 = f38
200	xma.l		f34 = f11, f6, f0
201	xma.hu		f35 = f11, f6, f0
202	;;
203	getf.sig	r27 = f39
204	ldf8		f11 = [r33], 8
205	;;
206	getf.sig	r20 = f32
207	xma.l		f36 = f12, f6, f0
208	xma.hu		f37 = f12, f6, f0
209	br		.LL10
210
211
C Feed-in for n mod 4 == 3.  f9 holds the first limb; the second and third
C limbs are loaded into f12 and f13.  br.cloop falls through only when
C ar.lc == 0, i.e. n == 3.
212.Lb11:	ldf8		f12 = [r33], 8
213	;;
214	ldf8		f13 = [r33], 8
215	br.cloop.dptk	.grt3
216	;;
217
C n == 3: three products.  Words r22-r26 are fetched here; the last high
C word (r27 from f39) is fetched by the first instruction at .Lcj3.
218	xma.l		f34 = f9, f6, f0
219	xma.hu		f35 = f9, f6, f0
220	;;
221	xma.l		f36 = f12, f6, f0
222	xma.hu		f37 = f12, f6, f0
223	;;
224	getf.sig	r22 = f34
225	xma.l		f38 = f13, f6, f0
226	xma.hu		f39 = f13, f6, f0
227	;;
228	getf.sig	r23 = f35
229	;;
230	getf.sig	r24 = f36
231	;;
232	getf.sig	r25 = f37
233	;;
234	getf.sig	r26 = f38
235	br		.Lcj3
236
C n >= 7: load four more limbs (f10, f11, then f12 and f13 again once their
C old values have been multiplied) while starting the first four products.
237.grt3:	ldf8		f10 = [r33], 8
238	;;
239	xma.l		f34 = f9, f6, f0
240	xma.hu		f35 = f9, f6, f0
241	;;
242	ldf8		f11 = [r33], 8
243	;;
244	xma.l		f36 = f12, f6, f0
245	xma.hu		f37 = f12, f6, f0
246	;;
247	ldf8		f12 = [r33], 8
248	;;
249	getf.sig	r22 = f34
250	xma.l		f38 = f13, f6, f0
251	xma.hu		f39 = f13, f6, f0
252	;;
253	getf.sig	r23 = f35
254	ldf8		f13 = [r33], 8
255	;;
256	getf.sig	r24 = f36
257	xma.l		f32 = f10, f6, f0
258	xma.hu		f33 = f10, f6, f0
C Taken when another loop count remains (n >= 11); otherwise n == 7.
259	br.cloop.dptk	.grt7
260
C n == 7: no further loads; join the wind-down at .Lcj7.
261	getf.sig	r25 = f37
262	;;
263	getf.sig	r26 = f38
264	xma.l		f34 = f11, f6, f0
265	xma.hu		f35 = f11, f6, f0
266	br		.Lcj7
267
C n >= 11: same sequence with f10 reloaded for the next round, then enter
C the main loop at .LL11.
268.grt7:	getf.sig	r25 = f37
269	ldf8		f10 = [r33], 8
270	;;
271	getf.sig	r26 = f38
272	xma.l		f34 = f11, f6, f0
273	xma.hu		f35 = f11, f6, f0
274	br		.LL11
275
276
C Feed-in for n mod 4 == 0.  f9 holds the first limb; limbs two to four are
C loaded into f11, f12 and f13.  br.cloop falls through only when
C ar.lc == 0, i.e. n == 4.
277.Lb00:	ldf8		f11 = [r33], 8
278	;;
279	ldf8		f12 = [r33], 8
280	;;
281	ldf8		f13 = [r33], 8
282	br.cloop.dptk	.grt4
283	;;
284
C n == 4: four products.  Words r20-r24 are fetched here; r25, r26 and r27
C are fetched by the wind-down code starting at .Lcj4.
285	xma.l		f32 = f9, f6, f0
286	xma.hu		f33 = f9, f6, f0
287	;;
288	xma.l		f34 = f11, f6, f0
289	xma.hu		f35 = f11, f6, f0
290	;;
291	getf.sig	r20 = f32
292	xma.l		f36 = f12, f6, f0
293	xma.hu		f37 = f12, f6, f0
294	;;
295	getf.sig	r21 = f33
296	;;
297	getf.sig	r22 = f34
298	xma.l		f38 = f13, f6, f0
299	xma.hu		f39 = f13, f6, f0
300	;;
301	getf.sig	r23 = f35
302	;;
303	getf.sig	r24 = f36
304	br		.Lcj4
305
C n >= 8: start the first four products and reload f10-f13 with the next
C four limbs as each register becomes free, then start the fifth product.
306.grt4:	xma.l		f32 = f9, f6, f0
307	xma.hu		f33 = f9, f6, f0
308	;;
309	ldf8		f10 = [r33], 8
310	;;
311	xma.l		f34 = f11, f6, f0
312	xma.hu		f35 = f11, f6, f0
313	;;
314	ldf8		f11 = [r33], 8
315	;;
316	getf.sig	r20 = f32
317	xma.l		f36 = f12, f6, f0
318	xma.hu		f37 = f12, f6, f0
319	;;
320	getf.sig	r21 = f33
321	ldf8		f12 = [r33], 8
322	;;
323	getf.sig	r22 = f34
324	xma.l		f38 = f13, f6, f0
325	xma.hu		f39 = f13, f6, f0
326	;;
327	getf.sig	r23 = f35
328	ldf8		f13 = [r33], 8
329	;;
330	getf.sig	r24 = f36
331	xma.l		f32 = f10, f6, f0
332	xma.hu		f33 = f10, f6, f0
C Enter the main loop at .LL00 when another loop count remains (n >= 12);
C otherwise n == 8 and the wind-down is joined at .Lcj8.
333	br.cloop.dptk	.LL00
334	br		.Lcj8
335
336C *** MAIN LOOP START ***
C Four limbs per iteration, eight instruction groups.  Loads, multiplies and
C subtract steps belonging to different limbs are interleaved.  Per limb:
C   cmp.ltu p6, p7 = r15, lo     p6 set when r15 - lo borrows
C   sub     rX = r15, lo         rX is the limb stored through r32
C   sub     r15 = rX, hi [, 1]   the p6 form also subtracts the borrow
C The lo/hi pairs rotate through (r20,r21) (r22,r23) (r24,r25) (r26,r27),
C filled by getf.sig from (f32,f33) (f34,f35) (f36,f37) (f38,f39).  The
C stored limbs rotate through r16, r17, r18, r19.
C .LL00, .LL11, .LL10 and .LL01 are the entry points used by the feed-in
C code; each one starts with the cmp.ltu that defines p6 and p7.
337	ALIGN(32)
338.Ltop:
C p6 and p7 come from a single cmp.ltu, so exactly one of the two predicated
C subtracts writing r15 executes.
339	.pred.rel "mutex",p6,p7
340C	.mfi
341	getf.sig	r24 = f36
342	xma.l		f32 = f10, f6, f0
343  (p6)	sub		r15 = r19, r27, 1
344C	.mfi
345	st8		[r32] = r19, 8
346	xma.hu		f33 = f10, f6, f0
347  (p7)	sub		r15 = r19, r27
348	;;
349.LL00:
350C	.mfi
351	getf.sig	r25 = f37
352	nop.f 0
353	cmp.ltu		p6, p7 = r15, r20
354C	.mib
355	ldf8		f10 = [r33], 8
356	sub		r16 = r15, r20
357	nop.b 0
358	;;
359
360C	.mfi
361	getf.sig	r26 = f38
362	xma.l		f34 = f11, f6, f0
363  (p6)	sub		r15 = r16, r21, 1
364C	.mfi
365	st8		[r32] = r16, 8
366	xma.hu		f35 = f11, f6, f0
367  (p7)	sub		r15 = r16, r21
368	;;
369.LL11:
370C	.mfi
371	getf.sig	r27 = f39
372	nop.f 0
373	cmp.ltu		p6, p7 = r15, r22
374C	.mib
375	ldf8		f11 = [r33], 8
376	sub		r17 = r15, r22
377	nop.b 0
378	;;
379
380C	.mfi
381	getf.sig	r20 = f32
382	xma.l		f36 = f12, f6, f0
383  (p6)	sub		r15 = r17, r23, 1
384C	.mfi
385	st8		[r32] = r17, 8
386	xma.hu		f37 = f12, f6, f0
387  (p7)	sub		r15 = r17, r23
388	;;
389.LL10:
390C	.mfi
391	getf.sig	r21 = f33
392	nop.f 0
393	cmp.ltu		p6, p7 = r15, r24
394C	.mib
395	ldf8		f12 = [r33], 8
396	sub		r18 = r15, r24
397	nop.b 0
398	;;
399
400C	.mfi
401	getf.sig	r22 = f34
402	xma.l		f38 = f13, f6, f0
403  (p6)	sub		r15 = r18, r25, 1
404C	.mfi
405	st8		[r32] = r18, 8
406	xma.hu		f39 = f13, f6, f0
407  (p7)	sub		r15 = r18, r25
408	;;
409.LL01:
410C	.mfi
411	getf.sig	r23 = f35
412	nop.f 0
413	cmp.ltu		p6, p7 = r15, r26
414C	.mib
415	ldf8		f13 = [r33], 8
416	sub		r19 = r15, r26
C Loops back while ar.lc is nonzero, decrementing it each time.
417	br.cloop.sptk.few .Ltop
418C *** MAIN LOOP END ***
419	;;
420
C Wind-down.  No more limbs are loaded from here on.  The same per-limb step
C as in the main loop is repeated (cmp.ltu for the borrow, subtract the low
C word, store, subtract the high word and the borrow), with the remaining
C xma and getf.sig instructions thinning out towards the end.
C A feed-in path that branches to .LcjK still has K limbs to store.
C
C Fall-through from the main loop: complete the step for r19 and start the
C product of the limb in f10.
421	getf.sig	r24 = f36
422	xma.l		f32 = f10, f6, f0
423  (p6)	sub		r15 = r19, r27, 1
424	st8		[r32] = r19, 8
425	xma.hu		f33 = f10, f6, f0
426  (p7)	sub		r15 = r19, r27
427	;;
428.Lcj8:	getf.sig	r25 = f37
429	cmp.ltu		p6, p7 = r15, r20
430	sub		r16 = r15, r20
431	;;
432	getf.sig	r26 = f38
433	xma.l		f34 = f11, f6, f0
434  (p6)	sub		r15 = r16, r21, 1
435	st8		[r32] = r16, 8
436	xma.hu		f35 = f11, f6, f0
437  (p7)	sub		r15 = r16, r21
438	;;
439.Lcj7:	getf.sig	r27 = f39
440	cmp.ltu		p6, p7 = r15, r22
441	sub		r17 = r15, r22
442	;;
443	getf.sig	r20 = f32
444	xma.l		f36 = f12, f6, f0
445  (p6)	sub		r15 = r17, r23, 1
446	st8		[r32] = r17, 8
447	xma.hu		f37 = f12, f6, f0
448  (p7)	sub		r15 = r17, r23
449	;;
450.Lcj6:	getf.sig	r21 = f33
451	cmp.ltu		p6, p7 = r15, r24
452	sub		r18 = r15, r24
453	;;
454	getf.sig	r22 = f34
C Last multiply of the routine: the limb held in f13.
455	xma.l		f38 = f13, f6, f0
456  (p6)	sub		r15 = r18, r25, 1
457	st8		[r32] = r18, 8
458	xma.hu		f39 = f13, f6, f0
459  (p7)	sub		r15 = r18, r25
460	;;
C From .Lcj5 on, all products have been issued; only getf.sig transfers and
C the integer subtract chain remain.
461.Lcj5:	getf.sig	r23 = f35
462	cmp.ltu		p6, p7 = r15, r26
463	sub		r19 = r15, r26
464	;;
465	getf.sig	r24 = f36
466  (p6)	sub		r15 = r19, r27, 1
467	st8		[r32] = r19, 8
468  (p7)	sub		r15 = r19, r27
469	;;
470.Lcj4:	getf.sig	r25 = f37
471	cmp.ltu		p6, p7 = r15, r20
472	sub		r16 = r15, r20
473	;;
474	getf.sig	r26 = f38
475  (p6)	sub		r15 = r16, r21, 1
476	st8		[r32] = r16, 8
477  (p7)	sub		r15 = r16, r21
478	;;
C Last getf.sig of the routine: the high word of the final product.
479.Lcj3:	getf.sig	r27 = f39
480	cmp.ltu		p6, p7 = r15, r22
481	sub		r17 = r15, r22
482	;;
483  (p6)	sub		r15 = r17, r23, 1
484	st8		[r32] = r17, 8
485  (p7)	sub		r15 = r17, r23
486	;;
487.Lcj2:	cmp.ltu		p6, p7 = r15, r24
488	sub		r18 = r15, r24
489	;;
490  (p6)	sub		r15 = r18, r25, 1
491	st8		[r32] = r18, 8
492  (p7)	sub		r15 = r18, r25
493	;;
C Final limb: the store has no post-increment, and the last running value is
C written to r8 (the integer return register in the IA-64 software
C conventions) instead of r15.
494.Lcj1:	cmp.ltu		p6, p7 = r15, r26
495	sub		r19 = r15, r26
496	;;
497  (p6)	sub		r8 = r19, r27, 1
498	st8		[r32] = r19
499  (p7)	sub		r8 = r19, r27
C Restore the ar.lc value saved in r2 at entry, then return.
500	mov ar.lc = r2
501	br.ret.sptk.many b0
502EPILOGUE()
503ASM_END()
504