1dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
2dnl  unnormalized limb.
3
4dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C         cycles/limb
25C Itanium:    40-42
26C Itanium 2:  29-30
27
28C This was generated by gcc, then the loops were optimized.  The preinv entry
29C point was shoehorned into the file.  Lots of things outside the loops could
30C be streamlined.  It would probably be a good idea to merge the loops for
31C normalized and unnormalized divisor, since the shifting stuff is done for
32C free in parallel with other operations.  It would even be possible to merge
33C all loops, if the ld8 were made conditional.
34
35C TODO
36C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
37C    computing leading limb.
38C  * Inline and interleave limb inversion code with loop setup code.
39
40ASM_START()
41
42C HP's assembler requires these declarations for importing mpn_invert_limb
C mpn_invert_limb is the target of the two br.call instructions in
C mpn_divrem_1 (just ahead of .Lpn and .Lpu).  mpn_preinv_divrem_1 has no
C call to it.
43	.global	mpn_invert_limb
44	.type	mpn_invert_limb,@function
45
46C INPUT PARAMETERS
47C rp    = r32
48C qxn   = r33
49C up    = r34
50C n     = r35
51C vl    = r36
52C vlinv = r37  (preinv only)
53C cnt = r38    (preinv only)
54
C mpn_preinv_divrem_1 entry point.
C
C Takes the five arguments of mpn_divrem_1 plus an inverse (vlinv, r37) and a
C shift count (cnt, r38) supplied by the caller, so there is no call to
C mpn_invert_limb here.  This code only handles the most significant source
C limb and then branches into labels that live inside mpn_divrem_1:
C   .Lpn   when the top bit of vl is set
C   .Lpu   when the top bit of vl is clear
C   .L435  when no integer limbs are left and only fraction limbs remain
C
C Registers handed over to that shared code:
C   r8  = inverse          r36 = divisor (shifted left by cnt on the .Lpu path)
C   r38 = remainder so far r40 = cnt
C   r32 = quotient store pointer, moving downwards
C   r34 = source load pointer, moving downwards
C   r35 = limb counter
C
C NOTE(review): the high limb at up + n - 1 is loaded unconditionally, with no
C test for n == 0 visible here - presumably callers guarantee n >= 1; confirm.
55PROLOGUE(mpn_preinv_divrem_1)
56	.prologue
57	.save	ar.pfs, r42
58	alloc		r42 = ar.pfs, 7, 8, 1, 0	C 7 inputs (r32-r38), 8 locals, 1 output
59	.save	ar.lc, r44
60	mov		r44 = ar.lc			C saved loop count, restored at .Lret
61	.save	rp, r41
62	mov		r41 = b0			C saved return address, restored at .Lret
63	.body
64ifdef(`HAVE_ABI_32',
65`	addp4		r32 = 0, r32
66	sxt4		r33 = r33
67	addp4		r34 = 0, r34
68	sxt4		r35 = r35
69	;;
70')
71	mov		r40 = r38			C r40 = cnt
72	shladd		r34 = r35, 3, r34		C r34 = up + n
73	;;
74	adds		r34 = -8, r34			C r34 = up + n - 1
75	;;
76	ld8		r39 = [r34], -8			C r39 = high limb; r34 steps down one limb
77	;;
78
79	add		r15 = r35, r33			C r15 = n + qxn
80	;;
81	mov		r8 = r37			C r8 = vlinv
82	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
83	cmp.le		p8, p0 = 0, r36			C p8: vl >= 0 as signed, i.e. top bit clear
84	;;
85	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
86	cmp.leu		p6, p7 = r36, r39		C p6: vl <= high limb (unsigned), p7 otherwise
87   (p8)	br.cond.dpnt	.Lpunnorm
88	;;
89
C Top bit of vl is set.  The top quotient limb is 1 when vl <= high limb and
C 0 otherwise; the remainder starts as the high limb, less vl in the first case.
90   (p6)	addl		r15 = 1, r0
91   (p7)	mov		r15 = r0
92	;;
93   (p6)	sub		r38 = r39, r36			C r = high limb - vl
94   (p7)	mov		r38 = r39			C r = high limb
95	st8		[r32] = r15, -8			C store top quotient limb, step pointer down
96	adds		r35 = -2, r35		C un -= 2
97	br	.Lpn
98
99.Lpunnorm:
C Top bit of vl is clear.  p6/p7 still hold the vl <= high limb compare made
C before the branch, which used the unshifted vl.
100   (p6)	add		r34 = 8, r34			C undo the post-decrement so .Lpu reloads the high limb
101	mov		r38 = 0			C r = 0
102	shl		r36 = r36, r40			C vl <<= cnt
103   (p6)	br.cond.dptk	.Lpu
104	;;
C Here high limb < vl: the top quotient limb is 0 and the shifted high limb
C becomes the starting remainder.
105	shl		r38 = r39, r40		C r = ahigh << cnt
106	cmp.ne		p8, p0 = 1, r35			C p8: n != 1
107	st8		[r32] = r0, -8
108	adds		r35 = -1, r35		C un--
109   (p8)	br.cond.dpnt	.Lpu
110
C n == 1: no integer limbs remain.  Load f6 and f12 the same way .Lpn and .Lpu
C do, then go straight to the fraction limbs.
111	mov		r23 = 1
112	;;
113	setf.sig	f6 = r8
114	setf.sig	f12 = r23
115	br		.L435
116EPILOGUE()
117
118
C mpn_divrem_1 entry point.
C
C Divides the n-limb number at up by the single limb vl.  n + qxn quotient
C limbs are stored through r32, which starts at rp + n + qxn - 1 and moves
C downwards, so the most significant limb is written first.  The final qxn
C limbs are developed with zero numerator limbs (.Loop4).  The remainder is
C returned in r8.
C
C Layout of the code below:
C   entry .. .L179   top limb handling when the top bit of vl is set
C   .Lpn / .Loop1    integer limbs, normalized divisor
C   .Lunnorm/.L322   top limb handling and leading zero count otherwise
C   .Lpu / .Loop3    integer limbs, divisor and numerator shifted left by cnt
C   .Lend3           last integer limb of the shifted path
C   .L435 / .Loop4   fraction limbs
C   .Lend4 / .Lret   shift the remainder back down and return
C .Lpn, .Lpu and .L435 are also branch targets of mpn_preinv_divrem_1.
C
C Register roles in the loops:
C   f6  = inverse taken from r8      f12 = 1
C   f10 = divisor (r36)              f7, r38 = remainder so far (nh)
C   r18 = quotient limb              r40 = shift count cnt
C   r32 = quotient store pointer     r34 = source load pointer
119PROLOGUE(mpn_divrem_1)
120	.prologue
121	.save	ar.pfs, r42
122	alloc		r42 = ar.pfs, 5, 8, 1, 0	C 5 inputs (r32-r36), 8 locals, 1 output (r45)
123	.save	ar.lc, r44
124	mov		r44 = ar.lc			C saved loop count, restored at .Lret
125	.save	rp, r41
126	mov		r41 = b0			C saved return address, restored at .Lret
127	.body
128ifdef(`HAVE_ABI_32',
129`	addp4		r32 = 0, r32
130	sxt4		r33 = r33
131	addp4		r34 = 0, r34
132	sxt4		r35 = r35
133	;;
134')
135	mov		r38 = r0			C r = 0
136	add		r15 = r35, r33			C r15 = n + qxn
137	;;
138	cmp.ne		p6, p7 = 0, r15
139	;;
140   (p7)	mov		r8 = r0				C n + qxn == 0: return 0
141   (p7)	br.cond.dpnt	.Lret
142	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
143	cmp.le		p6, p7 = 0, r36			C p6: vl >= 0 as signed, i.e. top bit clear
144	;;
145	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
146   (p6)	br.cond.dpnt	.Lunnorm
C Top bit of vl is set from here to .Loop1.
147	cmp.eq		p6, p7 = 0, r35
148   (p6)	br.cond.dpnt	.L179			C n == 0: only fraction limbs to do
149	shladd		r14 = r35, 3, r34
150	;;
151	adds		r14 = -8, r14			C r14 = up + n - 1
152	adds		r35 = -1, r35
153	;;
154	ld8		r38 = [r14]			C r = high limb
155	;;
156	cmp.leu		p6, p7 = r36, r38		C p6: vl <= high limb
157	;;
158   (p6)	addl		r15 = 1, r0
159   (p7)	mov		r15 = r0
160	;;
161	st8		[r32] = r15, -8			C top quotient limb is 1 if vl <= high limb, else 0
162  (p6)	sub		r38 = r38, r36		C r -= vl when that limb is 1
163
164.L179:
165	mov		r45 = r36			C outgoing argument: the divisor
166	adds		r35 = -1, r35
167	br.call.sptk.many b0 = mpn_invert_limb
168	;;
169	shladd		r34 = r35, 3, r34		C r34 = up + r35, the next limb to load
C Entry from mpn_preinv_divrem_1 when the top bit of vl is set.  r8 holds the
C inverse: as left by mpn_invert_limb above, or vlinv on the preinv path.
170.Lpn:
171	mov		r23 = 1
172	;;
173	setf.sig	f6 = r8				C f6 = inverse
174	setf.sig	f12 = r23			C f12 = 1
175	cmp.le		p6, p7 = 0, r35
176	mov		r40 = 0				C cnt = 0 on this path
177   (p7)	br.cond.dpnt	.L435			C r35 < 0: no integer limbs left
178	setf.sig	f10 = r36			C f10 = divisor
179	mov		ar.lc = r35			C br.cloop then runs the loop r35 + 1 times
180	setf.sig	f7 = r38			C f7 = r
181	;;
C NOTE(review): r28 is rewritten by the speculative sub inside each loop
C before it is read, so the value computed here looks unused - confirm.
182	sub		r28 = -1, r36
183C Develop quotient limbs for normalized divisor
C One quotient limb per iteration.  On entry r38 and f7 hold the remainder so
C far (nh); the next source limb is loaded into r20 (nl).
C   q       = high(nh * f6) + nh       candidate quotient, kept in r18
C   r16:r15 = nh:nl - q * divisor      two-limb difference, borrow seen via p6
C Then up to three corrections follow, each adding 1 to q and subtracting the
C divisor from r15: two while the high part is still nonzero (p8), and a last
C one if r15 >= divisor (p7).  The final r15 becomes the new nh.
184.Loop1:		C 00				C q=r18 nh=r38/f7
185	ld8		r20 = [r34], -8			C nl = next source limb, step pointer down
186	xma.hu		f11 = f7, f6, f0		C f11 = high(nh * inverse)
187	;;	C 04
188	xma.l		f8 = f11, f12, f7	C q = q + nh
189	;;	C 08
190	getf.sig	r18 = f8
191	xma.hu		f9 = f8, f10, f0		C f9 = high(q * divisor)
192	xma.l		f8 = f8, f10, f0		C f8 = low(q * divisor)
193	;;	C 12
194	getf.sig	r16 = f9
195		C 13
196	getf.sig	r15 = f8
197	;;	C 18
198	cmp.ltu		p6, p7 = r20, r15		C p6: the low subtraction borrows
199	sub		r15 = r20, r15			C r15 = nl - low(q * divisor)
200	sub		r16 = r38, r16			C r16 = nh - high(q * divisor)
201	;;	C 19
202   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
203   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
204   (p6)	add		r16 = -1, r16
205   (p0)	cmp.ne.unc	p6, p7 = r0, r0			C preset p6 = 0, p7 = 1
206	;;	C 20
207   (p8)	cmp.ltu		p6, p7 = r15, r36
208   (p8)	sub		r15 = r15, r36
209   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
210	;;	C 21
211	.pred.rel "mutex",p6,p7
212   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
213   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
214	cmp.ltu		p6, p7 = r15, r36	C speculative
215	sub		r28 = r15, r36		C speculative, just for cmp
216	;;	C 22
217   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
218   (p8)	mov		r15 = r28
219   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
220	;;	C 23
221   (p6)	setf.sig	f7 = r15
222   (p7)	sub		r15 = r15, r36
223   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
224	;;	C 24
225   (p7)	setf.sig	f7 = r15
226	st8		[r32] = r18, -8			C store quotient limb, step pointer down
227	mov		r38 = r15			C nh = r
228	br.cloop.dptk	.Loop1
229		C 29/30
230	br.sptk		.L435
231	;;
C Top bit of vl is clear from here to .Lend3.
232.Lunnorm:
233	mux1		r16 = r36, @rev			C byte-reversed vl, consumed at .L322
234	cmp.eq		p6, p7 = 0, r35
235   (p6)	br.cond.dpnt	.L322			C n == 0
236	shladd		r34 = r35, 3, r34
237	;;
238	adds		r34 = -8, r34			C r34 = up + n - 1
239	;;
240	ld8		r39 = [r34]			C r39 = high limb
241	;;
242	cmp.leu		p6, p7 = r36, r39
243   (p6)	br.cond.dptk	.L322			C vl <= high limb: keep r = 0, r34 and r35 as they are
244	adds		r34 = -8, r34
245	;;
246	mov		r38 = r39			C high limb < vl: it becomes r
247	;;
248	cmp.ne		p6, p7 = 1, r15			C r15 is still n + qxn
249	st8		[r32] = r0, -8			C top quotient limb = 0
250	;;
251   (p7)	mov		r8 = r38			C n + qxn == 1: the high limb is the remainder
252   (p7)	br.cond.dpnt	.Lret
253	adds		r35 = -1, r35
C Count the leading zero bits of vl into r40: locate the top nonzero byte, then
C narrow to a nibble, a bit pair, and finally one bit.
254.L322:
255	sub		r14 = r0, r16
256	;;
257	or		r14 = r16, r14			C ones from the lowest set bit of r16 upwards
258	;;
259	mov		r16 = -8
260	czx1.l		r14 = r14			C leftmost zero byte index = significant bytes of vl
261	;;
262	shladd		r16 = r14, 3, r16		C r16 = bit offset of the top nonzero byte of vl
263	;;
264	shr.u		r14 = r36, r16			C r14 = that byte
265	;;
266	cmp.geu		p6, p7 = 15, r14
267	;;
268   (p7)	shr.u		r14 = r14, 4			C keep the upper nibble if it is nonzero
269   (p7)	adds		r16 = 4, r16
270	;;
271	cmp.geu		p6, p7 = 3, r14
272	;;
273   (p7)	shr.u		r14 = r14, 2			C keep the upper bit pair if it is nonzero
274   (p7)	adds		r16 = 2, r16
275	;;
276	tbit.nz		p6, p7 = r14, 1
277	;;
278	.pred.rel "mutex",p6,p7
279  (p6)	sub		r40 = 62, r16		C top set bit is at r16 + 1
280  (p7)	sub		r40 = 63, r16		C top set bit is at r16
281	;;
282	shl		r45 = r36, r40			C outgoing argument: vl << cnt
283	shl		r36 = r36, r40			C vl <<= cnt
284	shl		r38 = r38, r40			C r <<= cnt
285	br.call.sptk.many b0 = mpn_invert_limb
286	;;
C Entry from mpn_preinv_divrem_1 when the top bit of vl is clear.  Expects r36
C already shifted left by r40, and r35 = number of integer limbs still to do.
287.Lpu:
288	mov		r23 = 1
289	;;
290	setf.sig	f6 = r8				C f6 = inverse
291	setf.sig	f12 = r23			C f12 = 1
292	cmp.eq		p6, p7 = 0, r35
293   (p6)	br.cond.dpnt	.L435			C no integer limbs left
294	sub		r16 = 64, r40			C r16 = 64 - cnt
295	adds		r35 = -2, r35
296	;;
297	ld8		r39 = [r34], -8			C r39 = first limb to shift in
298	cmp.le		p6, p7 = 0, r35
299	;;
300	shr.u		r14 = r39, r16
301	;;
302	or		r38 = r14, r38			C r |= r39 >> (64 - cnt)
303   (p7)	br.cond.dpnt	.Lend3			C that was the only limb: skip the loop
304	;;
305	mov		r22 = r16			C r22 = 64 - cnt for use inside the loop
306	setf.sig	f10 = r36
307	setf.sig	f7 = r38
308	mov		ar.lc = r35
309	;;
310C Develop quotient limbs for unnormalized divisor
C Same step as .Loop1, except that nl is assembled from two adjacent source
C limbs: r20 = (r39 << cnt) | (r14 >> (64 - cnt)), where r39 is the limb loaded
C previously and r14 the one loaded in this iteration.
311.Loop3:
312	ld8		r14 = [r34], -8
313	xma.hu		f11 = f7, f6, f0
314	;;
315	xma.l		f8 = f11, f12, f7	C q = q + nh
316	;;
317	getf.sig	r18 = f8
318	xma.hu		f9 = f8, f10, f0
319	shl		r20 = r39, r40
320	xma.l		f8 = f8, f10, f0
321	shr.u		r24 = r14, r22
322	;;
323	getf.sig	r16 = f9
324	getf.sig	r15 = f8
325	or		r20 = r24, r20			C nl
326	;;
327	cmp.ltu		p6, p7 = r20, r15
328	sub		r15 = r20, r15
329	sub		r16 = r38, r16
330	;;
331   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
332   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
333   (p6)	add		r16 = -1, r16
334   (p0)	cmp.ne.unc	p6, p7 = r0, r0
335	;;
336   (p8)	cmp.ltu		p6, p7 = r15, r36
337   (p8)	sub		r15 = r15, r36
338   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
339	;;
340	.pred.rel "mutex",p6,p7
341   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
342   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
343	cmp.ltu		p6, p7 = r15, r36	C speculative
344	sub		r28 = r15, r36		C speculative, just for cmp
345	;;
346   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
347   (p8)	mov		r15 = r28
348   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
349	;;
350   (p6)	setf.sig	f7 = r15
351   (p7)	sub		r15 = r15, r36
352   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
353	;;
354   (p7)	setf.sig	f7 = r15
355	st8		[r32] = r18, -8
356	mov		r39 = r14			C current limb becomes the previous one
357	mov		r38 = r15
358	br.cloop.dptk	.Loop3
359	;;
C Last integer limb of the shifted path: one more step with nl = r39 << cnt,
C since there is no lower source limb to shift in.  Runs once, not as a loop.
360.Lend3:
361	setf.sig	f10 = r36
362	setf.sig	f7 = r38
363	;;
364	xma.hu		f11 = f7, f6, f0
365	;;
366	xma.l		f8 = f11, f12, f7	C q = q + nh
367	;;
368	getf.sig	r18 = f8
369	xma.hu		f9 = f8, f10, f0
370	shl		r20 = r39, r40
371	xma.l		f8 = f8, f10, f0
372	;;
373	getf.sig	r16 = f9
374	getf.sig	r15 = f8
375	;;
376	cmp.ltu		p6, p7 = r20, r15
377	sub		r15 = r20, r15
378	sub		r16 = r38, r16
379	;;
380   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
381   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
382   (p6)	add		r16 = -1, r16
383   (p0)	cmp.ne.unc	p6, p7 = r0, r0
384	;;
385   (p8)	cmp.ltu		p6, p7 = r15, r36
386   (p8)	sub		r15 = r15, r36
387   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
388	;;
389	.pred.rel "mutex",p6,p7
390   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
391   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
392	;;
393   (p8)	sub		r15 = r15, r36
394   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
395	;;
396	cmp.ltu		p6, p7 = r15, r36
397	;;
398   (p7)	sub		r15 = r15, r36
399   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
400	;;
401	st8		[r32] = r18, -8
402	mov		r38 = r15
C Fraction part: develop qxn further quotient limbs with nl = 0.  Reached from
C every path above and from mpn_preinv_divrem_1; expects f6, f12, r36, r38 and
C r40 to be set up.
403.L435:
404	adds		r35 = -1, r33			C r35 = qxn - 1
405	cmp.le		p6, p7 = 1, r33
406   (p7)	br.cond.dpnt	.Lend4			C qxn < 1: nothing more to develop
407	;;
408	setf.sig	f7 = r38
409	setf.sig	f10 = r36
410	mov		ar.lc = r35			C loop runs qxn times
411	;;
412.Loop4:
413	xma.hu		f11 = f7, f6, f0
414	;;
415	xma.l		f8 = f11, f12, f7	C q = q + nh
416	;;
417	getf.sig	r18 = f8
418	xma.hu		f9 = f8, f10, f0
419	xma.l		f8 = f8, f10, f0
420	;;
421	getf.sig	r16 = f9
422	getf.sig	r15 = f8
423	;;
424	cmp.ltu		p6, p7 = 0, r15			C nl is 0 here, so borrow iff low(q * divisor) != 0
425	sub		r15 = 0, r15
426	sub		r16 = r38, r16
427	;;
428   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
429   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
430   (p6)	add		r16 = -1, r16
431   (p0)	cmp.ne.unc	p6, p7 = r0, r0
432	;;
433   (p8)	cmp.ltu		p6, p7 = r15, r36
434   (p8)	sub		r15 = r15, r36
435   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
436	;;
437	.pred.rel "mutex",p6,p7
438   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
439   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
440	cmp.ltu		p6, p7 = r15, r36	C speculative
441	sub		r28 = r15, r36		C speculative, just for cmp
442	;;
443   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
444   (p8)	mov		r15 = r28
445   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
446	;;
447   (p6)	setf.sig	f7 = r15
448   (p7)	sub		r15 = r15, r36
449   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
450	;;
451   (p7)	setf.sig	f7 = r15
452	st8		[r32] = r18, -8
453	mov		r38 = r15
454	br.cloop.dptk	.Loop4
455	;;
456.Lend4:
457	shr.u		r8 = r38, r40			C return r >> cnt (cnt is 0 when coming through .Lpn)
C Common exit: put back ar.pfs, ar.lc and b0 as saved in the prologue.
458.Lret:
459	mov		ar.pfs = r42
460	mov		ar.lc = r44
461	mov		b0 = r41
462	br.ret.sptk.many b0
463EPILOGUE()
464ASM_END()
465