1dnl  IA-64 mpn_bdiv_dbm1.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2008, 2009 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C         cycles/limb
36C Itanium:    4
37C Itanium 2:  2
38
39C TODO
40C  * Optimize feed-in and wind-down code, both for speed and code size.
41
42C INPUT PARAMETERS
C Register aliases for the first four arguments, as they are used below:
C   rp = r32   destination pointer; every result limb is stored through it
C   up = r33   source pointer; limbs are fetched through it with ldf8
C   n  = r34   limb count; n - 1 and n mod 4 are derived from it
C   bd = r35   multiplier; copied into f6 with setf.sig
C Most of the code below names r32 and r33 directly rather than rp and up.
43define(`rp', `r32')
44define(`up', `r33')
45define(`n', `r34')
46define(`bd', `r35')
47
C mpn_bdiv_dbm1c
C
C Walks the n limbs at up towards higher addresses while keeping a running
C value h in r15, which starts out as a copy of r36.  For every limb u:
C   lo, hi = low and high words of u * bd  (xma.l and xma.hu with f6 = bd)
C   store h - lo through rp
C   h = (h - lo) - hi - borrow, where borrow is 1 when h < lo (unsigned)
C The last h is returned in r8.
C
C NOTE(review): r36 has no alias in the INPUT PARAMETERS list; it is presumably
C the fifth argument (the initial h) - confirm against the C prototype.
C
C This first part saves ar.lc, sets up the loop count and dispatches on n mod 4.
48ASM_START()
49PROLOGUE(mpn_bdiv_dbm1c)
50	.prologue
51	.save		ar.lc, r2
52	.body
53
C With the 32-bit ABI, widen both pointers with addp4 and zero-extend n.
54ifdef(`HAVE_ABI_32',
55`	addp4		rp = 0, rp		C M I
56	addp4		up = 0, up		C M I
57	zxt4		n = n			C I
58	;;
59')
C r15 = initial h; f9 = limb 0 (up is post-incremented by 8).
60{.mmb
61	mov		r15 = r36		C M I
62	ldf8		f9 = [up], 8		C M
63	nop.b		0			C B
64}
C .Lcommon is not referenced anywhere else in this routine.
65.Lcommon:
C r16 = n - 1, r14 = n mod 4; the incoming ar.lc is kept in r2 until return.
66{.mii
67	adds		r16 = -1, n		C M I
68	mov		r2 = ar.lc		C I0
69	and		r14 = 3, n		C M I
70	;;
71}
C f6 = bd, the multiplier for every xma below; r31 = (n - 1) / 4.
C p10 is set when n mod 4 is 0.
72{.mii
73	setf.sig	f6 = bd			C M2 M3
74	shr.u		r31 = r16, 2		C I0
75	cmp.eq		p10, p0 = 0, r14	C M I
76}
C p11 is set when n mod 4 is 2, p12 when it is 3.
77{.mii
78	nop.m		0			C M
79	cmp.eq		p11, p0 = 2, r14	C M I
80	cmp.eq		p12, p0 = 3, r14	C M I
81	;;
82}
C Testing r0 against itself for inequality clears p6 and p8 and sets p7 and p9.
C p8 and p9 are not referenced again in this routine.
C ar.lc = (n - 1) / 4; every br.cloop that is taken uses up one count.
83{.mii
84	cmp.ne		p6, p7 = r0, r0		C M I
85	mov.i		ar.lc = r31		C I0
86	cmp.ne		p8, p9 = r0, r0		C M I
87}
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11.  When none of the
C branches is taken (n mod 4 is 1) execution falls through to .Lb01.
88{.bbb
89  (p10)	br.dptk		.Lb00			C B
90  (p11)	br.dptk		.Lb10			C B
91  (p12)	br.dptk		.Lb11			C B
92	;;
93}
94
C Feed-in for n mod 4 = 1.  f9 holds limb 0.
C br.cloop branches (and decrements ar.lc) only while ar.lc is nonzero, so the
C first one falls through only for n = 1: form the single product, move it to
C r26/r27 and go straight to the last wind-down step.
95.Lb01:	br.cloop.dptk	.grt1
96	;;
97	xma.l		f38 = f9, f6, f0
98	xma.hu		f39 = f9, f6, f0
99	;;
100	getf.sig	r26 = f38
101	getf.sig	r27 = f39
102	br		.Lcj1
103
C n >= 5: load limbs 1-4 into f10-f13 and start the products of limbs 0 and 1
C (limb 0 -> f38/f39, limb 1 -> f32/f33).
104.grt1:	ldf8		f10 = [r33], 8
105	;;
106	ldf8		f11 = [r33], 8
107	;;
108	ldf8		f12 = [r33], 8
109	;;
110	xma.l		f38 = f9, f6, f0
111	xma.hu		f39 = f9, f6, f0
112	;;
113	ldf8		f13 = [r33], 8
114	;;
115	xma.l		f32 = f10, f6, f0
116	xma.hu		f33 = f10, f6, f0
C Taken when at least 9 limbs are present; otherwise n = 5 and we fall through.
117	br.cloop.dptk	.grt5
118
C n = 5: nothing left to load.  Issue the remaining products, move the first
C ones to integer registers and enter the wind-down with five limbs to store.
119	;;
120	getf.sig	r26 = f38
121	xma.l		f34 = f11, f6, f0
122	xma.hu		f35 = f11, f6, f0
123	;;
124	getf.sig	r27 = f39
125	;;
126	getf.sig	r20 = f32
127	xma.l		f36 = f12, f6, f0
128	xma.hu		f37 = f12, f6, f0
129	;;
130	getf.sig	r21 = f33
131	;;
132	getf.sig	r22 = f34
133	xma.l		f38 = f13, f6, f0
134	xma.hu		f39 = f13, f6, f0
135	br		.Lcj5
136
C n >= 9: same product schedule as above, but f10-f12 are refilled with the
C next limbs as they are consumed; then join the main loop at .LL01.
137.grt5:	ldf8		f10 = [r33], 8
138	;;
139	getf.sig	r26 = f38
140	xma.l		f34 = f11, f6, f0
141	xma.hu		f35 = f11, f6, f0
142	;;
143	getf.sig	r27 = f39
144	ldf8		f11 = [r33], 8
145	;;
146	getf.sig	r20 = f32
147	xma.l		f36 = f12, f6, f0
148	xma.hu		f37 = f12, f6, f0
149	;;
150	getf.sig	r21 = f33
151	ldf8		f12 = [r33], 8
152	;;
153	getf.sig	r22 = f34
154	xma.l		f38 = f13, f6, f0
155	xma.hu		f39 = f13, f6, f0
156	br		.LL01
157
C Feed-in for n mod 4 = 2.  f9 holds limb 0 and f13 receives limb 1.
C The first br.cloop falls through only when ar.lc is 0, that is for n = 2:
C both products are formed (limb 0 -> r24/r25, limb 1 -> r26/r27) and the
C wind-down is entered with two limbs to store.
158.Lb10:	ldf8		f13 = [r33], 8
159	br.cloop.dptk	.grt2
160	;;
161
162	xma.l		f36 = f9, f6, f0
163	xma.hu		f37 = f9, f6, f0
164	;;
165	xma.l		f38 = f13, f6, f0
166	xma.hu		f39 = f13, f6, f0
167	;;
168	getf.sig	r24 = f36
169	;;
170	getf.sig	r25 = f37
171	;;
172	getf.sig	r26 = f38
173	;;
174	getf.sig	r27 = f39
175	br		.Lcj2
176
C n >= 6: load limbs 2-5 into f10, f11, f12 and f13 (f13 is refilled once its
C first value has been multiplied) and start the products of limbs 0-2.
177.grt2:	ldf8		f10 = [r33], 8
178	;;
179	ldf8		f11 = [r33], 8
180	;;
181	xma.l		f36 = f9, f6, f0
182	xma.hu		f37 = f9, f6, f0
183	;;
184	ldf8		f12 = [r33], 8
185	;;
186	xma.l		f38 = f13, f6, f0
187	xma.hu		f39 = f13, f6, f0
188	;;
189	ldf8		f13 = [r33], 8
190	;;
191	getf.sig	r24 = f36
192	xma.l		f32 = f10, f6, f0
193	xma.hu		f33 = f10, f6, f0
C Taken when at least 10 limbs are present; otherwise n = 6 and we fall through.
194	br.cloop.dptk	.grt6
195
C n = 6: no more loads; enter the wind-down with six limbs to store.
196	getf.sig	r25 = f37
197	;;
198	getf.sig	r26 = f38
199	xma.l		f34 = f11, f6, f0
200	xma.hu		f35 = f11, f6, f0
201	;;
202	getf.sig	r27 = f39
203	;;
204	getf.sig	r20 = f32
205	xma.l		f36 = f12, f6, f0
206	xma.hu		f37 = f12, f6, f0
207	br		.Lcj6
208
C n >= 10: same schedule with f10 and f11 refilled, then join the main loop
C at .LL10.
209.grt6:	getf.sig	r25 = f37
210	ldf8		f10 = [r33], 8
211	;;
212	getf.sig	r26 = f38
213	xma.l		f34 = f11, f6, f0
214	xma.hu		f35 = f11, f6, f0
215	;;
216	getf.sig	r27 = f39
217	ldf8		f11 = [r33], 8
218	;;
219	getf.sig	r20 = f32
220	xma.l		f36 = f12, f6, f0
221	xma.hu		f37 = f12, f6, f0
222	br		.LL10
223
224
C Feed-in for n mod 4 = 3.  f9 holds limb 0; f12 and f13 receive limbs 1 and 2.
C The first br.cloop falls through only when ar.lc is 0, that is for n = 3:
C the three products go to f34/f35, f36/f37 and f38/f39 and the wind-down is
C entered with three limbs to store.
225.Lb11:	ldf8		f12 = [r33], 8
226	;;
227	ldf8		f13 = [r33], 8
228	br.cloop.dptk	.grt3
229	;;
230
231	xma.l		f34 = f9, f6, f0
232	xma.hu		f35 = f9, f6, f0
233	;;
234	xma.l		f36 = f12, f6, f0
235	xma.hu		f37 = f12, f6, f0
236	;;
237	getf.sig	r22 = f34
238	xma.l		f38 = f13, f6, f0
239	xma.hu		f39 = f13, f6, f0
240	;;
241	getf.sig	r23 = f35
242	;;
243	getf.sig	r24 = f36
244	;;
245	getf.sig	r25 = f37
246	;;
247	getf.sig	r26 = f38
248	br		.Lcj3
249
C n >= 7: load limbs 3-6 (f10, f11, then f12 and f13 refilled after their first
C values have been multiplied) and start the products of limbs 0-3.
250.grt3:	ldf8		f10 = [r33], 8
251	;;
252	xma.l		f34 = f9, f6, f0
253	xma.hu		f35 = f9, f6, f0
254	;;
255	ldf8		f11 = [r33], 8
256	;;
257	xma.l		f36 = f12, f6, f0
258	xma.hu		f37 = f12, f6, f0
259	;;
260	ldf8		f12 = [r33], 8
261	;;
262	getf.sig	r22 = f34
263	xma.l		f38 = f13, f6, f0
264	xma.hu		f39 = f13, f6, f0
265	;;
266	getf.sig	r23 = f35
267	ldf8		f13 = [r33], 8
268	;;
269	getf.sig	r24 = f36
270	xma.l		f32 = f10, f6, f0
271	xma.hu		f33 = f10, f6, f0
C Taken when at least 11 limbs are present; otherwise n = 7 and we fall through.
272	br.cloop.dptk	.grt7
273
C n = 7: no more loads; enter the wind-down with seven limbs to store.
274	getf.sig	r25 = f37
275	;;
276	getf.sig	r26 = f38
277	xma.l		f34 = f11, f6, f0
278	xma.hu		f35 = f11, f6, f0
279	br		.Lcj7
280
C n >= 11: same schedule with f10 refilled, then join the main loop at .LL11.
281.grt7:	getf.sig	r25 = f37
282	ldf8		f10 = [r33], 8
283	;;
284	getf.sig	r26 = f38
285	xma.l		f34 = f11, f6, f0
286	xma.hu		f35 = f11, f6, f0
287	br		.LL11
288
289
C Feed-in for n mod 4 = 0.  f9 holds limb 0; f11, f12 and f13 receive limbs
C 1, 2 and 3.
C The first br.cloop falls through only when ar.lc is 0, that is for n = 4:
C the four products go to f32/f33, f34/f35, f36/f37 and f38/f39 and the
C wind-down is entered with four limbs to store.
290.Lb00:	ldf8		f11 = [r33], 8
291	;;
292	ldf8		f12 = [r33], 8
293	;;
294	ldf8		f13 = [r33], 8
295	br.cloop.dptk	.grt4
296	;;
297
298	xma.l		f32 = f9, f6, f0
299	xma.hu		f33 = f9, f6, f0
300	;;
301	xma.l		f34 = f11, f6, f0
302	xma.hu		f35 = f11, f6, f0
303	;;
304	getf.sig	r20 = f32
305	xma.l		f36 = f12, f6, f0
306	xma.hu		f37 = f12, f6, f0
307	;;
308	getf.sig	r21 = f33
309	;;
310	getf.sig	r22 = f34
311	xma.l		f38 = f13, f6, f0
312	xma.hu		f39 = f13, f6, f0
313	;;
314	getf.sig	r23 = f35
315	;;
316	getf.sig	r24 = f36
317	br		.Lcj4
318
C n >= 8: load limbs 4-7 into f10-f13 (f11-f13 are refilled after their first
C values have been multiplied) and start the products of limbs 0-4.
319.grt4:	xma.l		f32 = f9, f6, f0
320	xma.hu		f33 = f9, f6, f0
321	;;
322	ldf8		f10 = [r33], 8
323	;;
324	xma.l		f34 = f11, f6, f0
325	xma.hu		f35 = f11, f6, f0
326	;;
327	ldf8		f11 = [r33], 8
328	;;
329	getf.sig	r20 = f32
330	xma.l		f36 = f12, f6, f0
331	xma.hu		f37 = f12, f6, f0
332	;;
333	getf.sig	r21 = f33
334	ldf8		f12 = [r33], 8
335	;;
336	getf.sig	r22 = f34
337	xma.l		f38 = f13, f6, f0
338	xma.hu		f39 = f13, f6, f0
339	;;
340	getf.sig	r23 = f35
341	ldf8		f13 = [r33], 8
342	;;
343	getf.sig	r24 = f36
344	xma.l		f32 = f10, f6, f0
345	xma.hu		f33 = f10, f6, f0
C Taken when at least 12 limbs are present: join the main loop at .LL00.
C Otherwise n = 8 and the wind-down is entered with eight limbs to store.
346	br.cloop.dptk	.LL00
347	br		.Lcj8
348
C Main loop: four limbs per pass.  .LL00, .LL11, .LL10 and .LL01 are the
C entry points used by the feed-in code, one per value of n mod 4.
C
C Register roles:
C   r15       running value h
C   r16-r19   h - lo for four consecutive limbs; these are the stored results
C   r20-r27   product words moved over from f32-f39 with getf.sig
C             (even register = low word, odd register = high word)
C   f10-f13   source limbs, refilled with ldf8 after each has been multiplied
C
C Every limb takes two instruction groups:
C   1. cmp.ltu sets p6 when h is below the low product word and p7 otherwise;
C      sub forms h - lo.
C   2. h - lo is stored through r32 with post-increment, and the next h is
C      (h - lo) - hi - 1 under p6 or (h - lo) - hi under p7.
C The getf.sig, xma and ldf8 instructions in the same groups work ahead on
C later limbs.
C
C p6 and p7 are complementary; .pred.rel mutex declares that to the assembler
C for the paired predicated writes of r15 within one instruction group.
349C *** MAIN LOOP START ***
350	ALIGN(32)
351.Ltop:
352	.pred.rel "mutex",p6,p7
353C	.mfi
354	getf.sig	r24 = f36
355	xma.l		f32 = f10, f6, f0
356  (p6)	sub		r15 = r19, r27, 1
357C	.mfi
358	st8		[r32] = r19, 8
359	xma.hu		f33 = f10, f6, f0
360  (p7)	sub		r15 = r19, r27
361	;;
C Entry for n mod 4 = 0: next limb uses the product in r20/r21.
362.LL00:
363C	.mfi
364	getf.sig	r25 = f37
365	nop.f 0
366	cmp.ltu		p6, p7 = r15, r20
367C	.mib
368	ldf8		f10 = [r33], 8
369	sub		r16 = r15, r20
370	nop.b 0
371	;;
372
373C	.mfi
374	getf.sig	r26 = f38
375	xma.l		f34 = f11, f6, f0
376  (p6)	sub		r15 = r16, r21, 1
377C	.mfi
378	st8		[r32] = r16, 8
379	xma.hu		f35 = f11, f6, f0
380  (p7)	sub		r15 = r16, r21
381	;;
C Entry for n mod 4 = 3: next limb uses the product in r22/r23.
382.LL11:
383C	.mfi
384	getf.sig	r27 = f39
385	nop.f 0
386	cmp.ltu		p6, p7 = r15, r22
387C	.mib
388	ldf8		f11 = [r33], 8
389	sub		r17 = r15, r22
390	nop.b 0
391	;;
392
393C	.mfi
394	getf.sig	r20 = f32
395	xma.l		f36 = f12, f6, f0
396  (p6)	sub		r15 = r17, r23, 1
397C	.mfi
398	st8		[r32] = r17, 8
399	xma.hu		f37 = f12, f6, f0
400  (p7)	sub		r15 = r17, r23
401	;;
C Entry for n mod 4 = 2: next limb uses the product in r24/r25.
402.LL10:
403C	.mfi
404	getf.sig	r21 = f33
405	nop.f 0
406	cmp.ltu		p6, p7 = r15, r24
407C	.mib
408	ldf8		f12 = [r33], 8
409	sub		r18 = r15, r24
410	nop.b 0
411	;;
412
413C	.mfi
414	getf.sig	r22 = f34
415	xma.l		f38 = f13, f6, f0
416  (p6)	sub		r15 = r18, r25, 1
417C	.mfi
418	st8		[r32] = r18, 8
419	xma.hu		f39 = f13, f6, f0
420  (p7)	sub		r15 = r18, r25
421	;;
C Entry for n mod 4 = 1: next limb uses the product in r26/r27.
422.LL01:
423C	.mfi
424	getf.sig	r23 = f35
425	nop.f 0
426	cmp.ltu		p6, p7 = r15, r26
427C	.mib
428	ldf8		f13 = [r33], 8
429	sub		r19 = r15, r26
C Loop back while ar.lc is nonzero; the store of r19 and the matching update
C of h happen at .Ltop, or in the code after the loop on the final exit.
430	br.cloop.sptk.few .Ltop
431C *** MAIN LOOP END ***
432	;;
433
C Wind-down.  No further limbs are loaded from here on.
C The fall-through from the loop first finishes the limb whose h - lo is
C already in r19.  Each entry point .Lcj8 down to .Lcj1 is reached with that
C many limbs still to be stored (8 at .Lcj8, 1 at .Lcj1).
C The per-limb pattern is the same as in the main loop: cmp.ltu and sub on the
C low product word, then a store and the predicated update of h (r15) with
C the high product word.  The xma pairs up to .Lcj6 multiply the limbs that
C are already sitting in f10-f13.
434	getf.sig	r24 = f36
435	xma.l		f32 = f10, f6, f0
436  (p6)	sub		r15 = r19, r27, 1
437	st8		[r32] = r19, 8
438	xma.hu		f33 = f10, f6, f0
439  (p7)	sub		r15 = r19, r27
440	;;
441.Lcj8:	getf.sig	r25 = f37
442	cmp.ltu		p6, p7 = r15, r20
443	sub		r16 = r15, r20
444	;;
445	getf.sig	r26 = f38
446	xma.l		f34 = f11, f6, f0
447  (p6)	sub		r15 = r16, r21, 1
448	st8		[r32] = r16, 8
449	xma.hu		f35 = f11, f6, f0
450  (p7)	sub		r15 = r16, r21
451	;;
452.Lcj7:	getf.sig	r27 = f39
453	cmp.ltu		p6, p7 = r15, r22
454	sub		r17 = r15, r22
455	;;
456	getf.sig	r20 = f32
457	xma.l		f36 = f12, f6, f0
458  (p6)	sub		r15 = r17, r23, 1
459	st8		[r32] = r17, 8
460	xma.hu		f37 = f12, f6, f0
461  (p7)	sub		r15 = r17, r23
462	;;
463.Lcj6:	getf.sig	r21 = f33
464	cmp.ltu		p6, p7 = r15, r24
465	sub		r18 = r15, r24
466	;;
467	getf.sig	r22 = f34
468	xma.l		f38 = f13, f6, f0
469  (p6)	sub		r15 = r18, r25, 1
470	st8		[r32] = r18, 8
471	xma.hu		f39 = f13, f6, f0
472  (p7)	sub		r15 = r18, r25
473	;;
C From .Lcj5 on all multiplies have been issued; only the transfers to the
C integer registers and the subtract and store chain remain.
474.Lcj5:	getf.sig	r23 = f35
475	cmp.ltu		p6, p7 = r15, r26
476	sub		r19 = r15, r26
477	;;
478	getf.sig	r24 = f36
479  (p6)	sub		r15 = r19, r27, 1
480	st8		[r32] = r19, 8
481  (p7)	sub		r15 = r19, r27
482	;;
483.Lcj4:	getf.sig	r25 = f37
484	cmp.ltu		p6, p7 = r15, r20
485	sub		r16 = r15, r20
486	;;
487	getf.sig	r26 = f38
488  (p6)	sub		r15 = r16, r21, 1
489	st8		[r32] = r16, 8
490  (p7)	sub		r15 = r16, r21
491	;;
492.Lcj3:	getf.sig	r27 = f39
493	cmp.ltu		p6, p7 = r15, r22
494	sub		r17 = r15, r22
495	;;
496  (p6)	sub		r15 = r17, r23, 1
497	st8		[r32] = r17, 8
498  (p7)	sub		r15 = r17, r23
499	;;
500.Lcj2:	cmp.ltu		p6, p7 = r15, r24
501	sub		r18 = r15, r24
502	;;
503  (p6)	sub		r15 = r18, r25, 1
504	st8		[r32] = r18, 8
505  (p7)	sub		r15 = r18, r25
506	;;
C Last limb: the store has no post-increment, and the final h is written to
C r8 as the return value instead of r15.  ar.lc is restored from r2 before
C returning through b0.
507.Lcj1:	cmp.ltu		p6, p7 = r15, r26
508	sub		r19 = r15, r26
509	;;
510  (p6)	sub		r8 = r19, r27, 1
511	st8		[r32] = r19
512  (p7)	sub		r8 = r19, r27
513	mov ar.lc = r2
514	br.ret.sptk.many b0
515EPILOGUE()
516ASM_END()
517