1dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C            cycles/limb
23C Itanium:      16
24C Itanium 2:     8
25
26C INPUT PARAMETERS
27define(`rp', `r32')
28define(`up', `r33')
29define(`n',  `r34')
30define(`divisor', `r35')
31
32define(`lshift', `r24')
33define(`rshift', `r25')
34
35C This code is a bit messy, and not as similar to mode1o.asm as desired.
36
37C The critical path during initialization is for computing the inverse of the
38C divisor.  Since odd divisors are probably common, we conditionally execute
39C the initial count_traling_zeros code and the downshift.
40
41C Possible improvement: Merge more of the feed-in code into the inverse
42C computation.
43
44ASM_START()
45	.text
46	.align	32
47.Ltab:
48data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
49data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
50data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
51data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
52data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
53data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
54data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
55data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
56data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
57data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
58data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
59data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
60data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
61data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
62data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
63data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
64
65
66PROLOGUE(mpn_divexact_1)
67	.prologue
68	.save		ar.lc, r2
69	.body
70
71 {.mmi;	add		r8 = -1, divisor	C M0
72	nop		0			C M1
73	tbit.z		p8, p9 = divisor, 0	C I0
74}
75ifdef(`HAVE_ABI_32',
76`	addp4		rp = 0, rp		C M2  rp extend
77	addp4		up = 0, up		C M3  up extend
78	sxt4		n = n')			C I1  size extend
79	;;
80.Lhere:
81 {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
82  (p8)	andcm		r8 = r8, divisor	C M1
83	mov		r15 = ip		C I0  .Lhere
84	;;
85}{.mii
86	.pred.rel "mutex", p8, p9
87  (p9)	mov		rshift = 0		C M0
88  (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
89	cmp.eq		p6, p10 = 1, n		C I1
90	;;
91}{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0
92  (p8)	shr.u		divisor = divisor, rshift C I0
93	nop		0			C I1
94	;;
95}{.mmi;	add		n = -4, n		C M0  size-1
96  (p10)	ld8		r21 = [up], 8		C M1  up[1]
97	mov		r14 = 2			C M1  2
98}{.mfi;	setf.sig	f6 = divisor		C M2  divisor
99	mov		f9 = f0			C M3  carry		FIXME
100	zxt1		r3 = divisor		C I1  divisor low byte
101	;;
102}{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
103	sub		r16 = 0, divisor	C M1  -divisor
104	mov		r2 = ar.lc		C I0
105}{.mmi;	sub		lshift = 64, rshift	C M2
106	setf.sig	f13 = r14		C M3  2 in significand
107	mov		r17 = -1		C I1  -1
108	;;
109}{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
110	nop		0			C M1
111	mov		ar.lc = n		C I0  size-1 loop count
112}{.mmi;	setf.sig	f12 = r16		C M2  -divisor
113	setf.sig	f8 = r17		C M3  -1
114	cmp.eq		p7, p0 = -2, n		C I1
115	;;
116}{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
117	cmp.eq		p8, p0 = -1, n		C M0
118	shr.u		r23 = r20, rshift	C I0
119	;;
120}
121
122	C f6	divisor
123	C f7	inverse, being calculated
124	C f8	-1, will be -inverse
125	C f9	carry
126	C f12	-divisor
127	C f13	2
128	C f14	scratch
129
130	xmpy.l		f14 = f13, f7		C Newton 2*i
131	xmpy.l		f7 = f7, f7		C Newton i*i
132	;;
133	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
134	;;
135	setf.sig	f10 = r23		C speculative, used iff n = 1
136	xmpy.l		f14 = f13, f7		C Newton 2*i
137	shl		r22 = r21, lshift	C speculative, used iff n > 1
138	xmpy.l		f7 = f7, f7		C Newton i*i
139	;;
140	or		r31 = r22, r23		C speculative, used iff n > 1
141	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
142	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
143	;;
144	setf.sig	f11 = r31		C speculative, used iff n > 1
145	xmpy.l		f14 = f13, f7		C Newton 2*i
146	xmpy.l		f7 = f7, f7		C Newton i*i
147	;;
148	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
149
150  (p7)	br.cond.dptk	.Ln2
151  (p10)	br.cond.dptk	.grt3
152	;;
153
154.Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
155	br		.Lx1
156
157.Ln2:
158	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
159	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
160	setf.sig	f11 = r23
161	br		.Lx2
162
163.grt3:
164	ld8		r21 = [up], 8		C up[2]
165	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
166	;;
167	shl		r22 = r21, lshift
168	;;
169	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
170	;;
171	or		r31 = r22, r23
172	shr.u		r23 = r21, rshift
173	;;
174	setf.sig	f11 = r31
175  (p8)	br.cond.dptk	.Lx3			C branch for n = 3
176	;;
177	ld8		r21 = [up], 8
178	br		.Lent
179
180.Loop:	ld8		r21 = [up], 8
181	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
182	;;
183.Lent:	add		r16 = 160, up
184	shl		r22 = r21, lshift
185	;;
186	stf8		[rp] = f12, 8
187	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
188	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
189	;;
190	or		r31 = r22, r23
191	shr.u		r23 = r21, rshift
192	;;
193	lfetch		[r16]
194	setf.sig	f11 = r31
195	br.cloop.sptk.few.clr .Loop
196
197
198	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
199	;;
200.Lx3:	stf8		[rp] = f12, 8
201	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
202	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
203	;;
204	setf.sig	f11 = r23
205	;;
206	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
207	;;
208.Lx2:	stf8		[rp] = f12, 8
209	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
210	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
211	;;
212	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
213	;;
214.Lx1:	stf8		[rp] = f12, 8
215	mov		ar.lc = r2		C I0
216	br.ret.sptk.many b0
217EPILOGUE()
218