dnl  diveby3.asm revision 1.1.1.2
dnl  Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.

dnl  Copyright 2004, 2005, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Pull in the shared m4 definitions.  ASM_START, DATASTART, DATAEND,
dnl  PROLOGUE, EPILOGUE, ASM_END, ALIGN, L and the C comment macro are used
dnl  below but not defined in this file, so they are presumably supplied
dnl  through config.m4 -- confirm against the build tree.
include(`../config.m4')

C      cycles/limb
C EV4:    22
C EV5:    11.5
C EV6:     6.3		Note that mpn_bdiv_dbm1c is faster

C TODO
C  * Remove the unops; they benefit just ev6, which no longer uses this file.
C  * Try prefetch for destination, using lds.
C  * Improve feed-in code by moving the initial mulq earlier; make the initial
C    load go to u0/u0 to save some copying.
C  * Combine u0 and u2, u1 and u3.

C INPUT PARAMETERS
C   rp  destination pointer; the quotient limbs are stored through it
C   up  source pointer; the dividend limbs are loaded through it
C   n   number of limbs; the entry code dispatches on n mod 4
C   cy  carry-in; the entry code distinguishes the values 1 and 2 from
C       everything else
define(`rp',	`r16')
define(`up',	`r17')
define(`n',	`r18')
define(`cy',	`r19')

ASM_START()

C Constant table read once at function entry (offsets 0, 8 and 16):
C   0xAAAAAAAAAAAAAAAB  inverse of 3 modulo 2^64 (3 times it is 2^65 + 1)
C   0x5555555555555555  (2^64-1)/3, which is also minus the inverse mod 2^64
C   0xAAAAAAAAAAAAAAAA  2*(2^64-1)/3
C The last two also serve as the thresholds the quotient limb is compared
C against to derive the carry into the next limb.
DATASTART(L(LC),8)
	.quad	0xAAAAAAAAAAAAAAAB
	.quad	0x5555555555555555
	.quad	0xAAAAAAAAAAAAAAAA
DATAEND()

C REGISTER ALIASES
C   xAAAA..AB, x5555..55, xAAAA..AA  registers holding the three table constants
C   u0..u3  the four source limbs in flight; r0 is also used as a scratch
C           pointer at entry and receives the return value at exit
C   q0, q1  products limb * 0xAAAAAAAAAAAAAAAB, alternating from stage to stage
C   l0      carry * 0x5555555555555555, added to the next product
C   x       quotient limb being formed; r8 is also the n mod 4 scratch at entry
C   p6, p7  0/1 results of comparing x against the two thresholds
C   t0, t1  p6 and p7 widened to masks and ANDed with 0x5555555555555555
C   cymask  0 or all ones, from the borrow; r28 first holds the initial limb
C
C NOTE(review): the three constant aliases are presumably defined only after
C the data section on purpose.  m4 reads 0xAAAAAAAAAAAAAAAB as the character 0
C followed by the word xAAAAAAAAAAAAAAAB, so the .quad lines would be rewritten
C if the alias already existed there.  Confirm before moving these defines.
define(`xAAAAAAAAAAAAAAAB',	`r20')
define(`x5555555555555555',	`r21')
define(`xAAAAAAAAAAAAAAAA',	`r22')
define(`u0',	`r0')	define(`u1',	`r1')
define(`u2',	`r2')	define(`u3',	`r3')
define(`l0',	`r25')	define(`x',	`r8')
define(`q0',	`r4')	define(`q1',	`r5')
define(`p6',	`r6')	define(`p7',	`r7')
define(`t0',	`r23')	define(`t1',	`r24')
define(`cymask',`r28')


C mpn_divexact_by3c (rp, up, n, cy)
C
C Stores n quotient limbs through rp, computed from the n limbs read through
C up and the carry-in cy, and returns the final carry in r0.
C
C For a limb u with carry-in c the quotient limb is
C	x = u*0xAAAAAAAAAAAAAAAB + c*0x5555555555555555		(mod 2^64)
C which is (u - c)/3 mod 2^64, because 3*0xAAAAAAAAAAAAAAAB == 1 and
C 0x5555555555555555 == -0xAAAAAAAAAAAAAAAB, both mod 2^64.  The carry-out is
C	(u < c) + (x > 0x5555555555555555) + (x > 0xAAAAAAAAAAAAAAAA).
C l0 holds c*0x5555555555555555 for the limb that is finished next.
C
C The loop is unrolled four ways and software pipelined: each stage finishes
C one limb, issues the mulq for the following limb and the ldq for the limb
C after that.  The feed-in code selects the entry stage from n mod 4 and
C biases up and rp so that the fixed displacements of that stage address the
C first limb.
C
C n must be at least 1.  With n == 0 the (n mod 4) == 0 feed-in is taken and
C limbs are loaded and stored regardless.
C
C NOTE(review): the "C 0" .. "C 5" markers and the L0/U1/L1/U0 tags look like
C the intended issue cycle and integer pipe of each instruction; the unops
C pad that schedule.  Keep the instruction order when editing.
PROLOGUE(mpn_divexact_by3c,gp)

	ldq	r28, 0(up)			C load first limb early

C Put magic constants in registers.  r0 is only a scratch pointer here; every
C path overwrites it later, as u0 and finally as the return value.
	lda	r0, L(LC)
	ldq	xAAAAAAAAAAAAAAAB, 0(r0)
	ldq	x5555555555555555, 8(r0)
	ldq	xAAAAAAAAAAAAAAAA, 16(r0)

C Compute initial l0 value: 0x5555555555555555 if cy == 1, 0xAAAAAAAAAAAAAAAA
C if cy == 2, and 0 for any other cy.
	cmpeq	cy, 1, p6
	cmpeq	cy, 2, p7
	negq	p6, p6
	and	p6, x5555555555555555, l0
	cmovne	p7, xAAAAAAAAAAAAAAAA, l0

C Feed-in depending on (n mod 4).  n is biased by -3: "blt n" below then means
C fewer than 3 limbs in total, so only wind-down code is needed, and the main
C loop can count down by 4 and branch on the sign.  r8, r4 and r5 are free as
C scratch here; they are x, q0 and q1 once the stages run.
	and	n, 3, r8
	lda	n, -3(n)
	cmpeq	r8, 1, r4
	cmpeq	r8, 2, r5
	bne	r4, $Lb01
	bne	r5, $Lb10
	beq	r8, $Lb00

C n mod 4 == 3: first limb becomes u2, second u3; enter at $L11.  The -24 bias
C makes 40(up) the third limb and 24(rp) the first result slot.
$Lb11:	ldq	u3, 8(up)
	lda	up, -24(up)
	lda	rp, -24(rp)
	mulq	r28, xAAAAAAAAAAAAAAAB, q0
	mov	r28, u2
	br	r31, $L11

C n mod 4 == 0: first limb becomes u1, second u2; enter at $L00.  The -16 bias
C makes 32(up) the third limb and 16(rp) the first result slot.
$Lb00:	ldq	u2, 8(up)
	lda	up, -16(up)
	lda	rp, -16(rp)
	mulq	r28, xAAAAAAAAAAAAAAAB, q1
	mov	r28, u1
	br	r31, $L00

C n mod 4 == 1: first limb becomes u0.  A single limb goes straight to the
C final stage $Lcj1, which stores at 8(rp), hence the -8 bias on rp.
C Otherwise load the second limb as u1 and enter at $L01.
$Lb01:	lda	rp, -8(rp)
	mulq	r28, xAAAAAAAAAAAAAAAB, q0
	mov	r28, u0
	blt	n, $Lcj1
	ldq	u1, 8(up)
	lda	up, -8(up)
	br	r31, $L01

C n mod 4 == 2: first limb becomes u3, second u0.  No pointer bias is needed.
C Exactly two limbs go to the wind-down at $Lend, otherwise fall into $Ltop.
$Lb10:	ldq	u0, 8(up)
	mulq	r28, xAAAAAAAAAAAAAAAB, q1
	mov	r28, u3
	blt	n, $Lend

C Every stage below does, for the limb u being finished and its product q
C from the previous stage:
C	cy     = u < cy			borrow out of u - carry
C	x      = q + l0			quotient limb
C	cymask = -cy			all ones iff borrow
C	p6     = x > 0x5555555555555555
C	p7     = x > 0xAAAAAAAAAAAAAAAA
C	cy     = cy + p6 + p7		carry into the next limb
C	l0     = sum of the constant 0x5555555555555555 masked by cymask,
C		 by -p6 and by -p7, i.e. the new cy times that constant
C and stores x.

	ALIGN(16)
C Stage 0: finish u3 (product q1), multiply u0 into q0, load u1.
$Ltop:
C 0
	cmpult	u3, cy, cy			C L0
	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
	ldq	u1, 16(up)			C L1
	addq	q1, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 0(rp)			C L1
	unop
C Stage 1: finish u0 (product q0), multiply u1 into q1, load u2.
$L01:
C 0
	cmpult	u0, cy, cy			C L0
	mulq	u1, xAAAAAAAAAAAAAAAB, q1	C U1
	ldq	u2, 24(up)			C L1
	addq	q0, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 8(rp)			C L1
	unop
C Stage 2: finish u1 (product q1), multiply u2 into q0, load u3.
$L00:
C 0
	cmpult	u1, cy, cy			C L0
	mulq	u2, xAAAAAAAAAAAAAAAB, q0	C U1
	ldq	u3, 32(up)			C L1
	addq	q1, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 16(rp)			C L1
	unop
C Stage 3: finish u2 (product q0), multiply u3 into q1, load u0.  This stage
C also counts n down by 4, advances both pointers by 32 bytes and closes the
C loop.
$L11:
C 0
	cmpult	u2, cy, cy			C L0
	mulq	u3, xAAAAAAAAAAAAAAAB, q1	C U1
	ldq	u0, 40(up)			C L1
	addq	q0, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	lda	n, -4(n)			C L1 bookkeeping
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 24(rp)			C L1
	lda	up, 32(up)
C
	ldl	r31, 256(up)			C prefetch
	unop
	lda	rp, 32(rp)
	bge	n, $Ltop			C U1
C *** MAIN LOOP END ***
C Wind-down, two limbs left: same as stage 0 but without a load, since u0 is
C the last limb.
$Lend:

	cmpult	u3, cy, cy			C L0
	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
	unop
	addq	q1, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 0(rp)			C L1
	unop
C Last limb: finish u0 (product q0).  No new l0 is needed; the carry
C (u0 < cy) + p6 + p7 is formed directly in r0 as the return value.
$Lcj1:
	cmpult	u0, cy, cy			C L0
	addq	q0, l0, x			C U0
	cmpult	x5555555555555555, x, p6	C U0
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	addq	p6, cy, cy
	addq	p7, cy, r0
	stq	x, 8(rp)			C L1

	ret	r31,(r26),1
EPILOGUE()
ASM_END()

C This is useful for playing with various schedules.
C Expand as: one(0)one(1)one(2)one(3)
C
C one(k), k = 0..3, produces the text of stage k of the unrolled loop with the
C limb and product registers spelled numerically:
C	$((k+3)%4)	limb being finished
C	$k		limb being multiplied
C	$((k+1)%4)	limb being loaded, from displacement 8*k+16 off up
C	$(4 + k%2)	product written by this stage
C	$(4 + (k+1)%2)	product consumed by this stage
C and the quotient limb stored at displacement 8*k off rp.
C
C The template does not contain the n decrement, the pointer updates, the
C prefetch or the loop branch carried by the last stage of the hand-expanded
C loop.  It is defined after ASM_END and no expansion of it appears in this
C file.  The body is one quoted m4 string, so any quote character added to it
C must be balanced.
define(`one',`
C 0
	cmpult	`$'eval(($1+3)%4), cy, cy		C L0
	mulq	`$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
	ldq	`$'eval(($1+1)%4), eval($1*8+16)(up)	C L1
	addq	`$'eval(4+($1+1)%2), l0, x		C U0
C 1
	negq	cy, cymask				C L0
	unop						C U1
	unop						C L1
	cmpult	x5555555555555555, x, p6		C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7		C U1
	unop
	unop
	negq	p6, t0					C L0
C 3
	negq	p7, t1					C L0
	and	cymask, x5555555555555555, l0		C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, eval($1*8)(rp)			C L1
	unop
')
333