dnl  diveby3.asm revision 1.1.1.1
dnl  Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.

dnl  Copyright 2004, 2005, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:    22
C EV5:    11.5
C EV6:     6.3		Note that mpn_bdiv_dbm1c is faster

C TODO
C  * Remove the unops, they benefit just ev6, which no longer uses this file.
C  * Try prefetch for destination, using lds.
C  * Improve feed-in code, by moving initial mulq earlier; make initial load
C    to u0/u0 to save some copying.
C  * Combine u0 and u2, u1 and u3.

C INPUT PARAMETERS
C   rp  (r16)  destination pointer; quotient limbs are stored through it
C   up  (r17)  source pointer; dividend limbs are loaded through it
C   n   (r18)  limb count; its value mod 4 selects the feed-in path
C   cy  (r19)  incoming carry; the values 1 and 2 select a nonzero initial
C              addend, any other value selects zero
define(`rp',	`r16')
define(`up',	`r17')
define(`n',	`r18')
define(`cy',	`r19')

ASM_START()

C Constant table read once at function entry.
C   0xAAAAAAAAAAAAAAAB  multiplier applied to every source limb; three times
C                       this value is 1 modulo 2^64
C   0x5555555555555555  first comparison threshold for a quotient limb, and
C                       the amount added to l0 per unit of carry
C   0xAAAAAAAAAAAAAAAA  second comparison threshold for a quotient limb, and
C                       the initial l0 when cy is 2
DATASTART(L(LC))
	.quad	0xAAAAAAAAAAAAAAAB
	.quad	0x5555555555555555
	.quad	0xAAAAAAAAAAAAAAAA
DATAEND()

C Register roles used by mpn_divexact_by3c.

C The three constants from the LC table
define(`xAAAAAAAAAAAAAAAB',	`r20')
define(`x5555555555555555',	`r21')
define(`xAAAAAAAAAAAAAAAA',	`r22')
C Source limbs in flight, one per stage of the 4-way unrolled loop
define(`u0',	`r0')	define(`u1',	`r1')
define(`u2',	`r2')	define(`u3',	`r3')
C l0 is the carry-derived addend, x the finished quotient limb to be stored
define(`l0',	`r25')	define(`x',	`r8')
C Alternating mulq results that still need l0 added
define(`q0',	`r4')	define(`q1',	`r5')
C 0/1 results of comparing x against the two thresholds
define(`p6',	`r6')	define(`p7',	`r7')
C Zero or all-ones masks obtained by negating p6 and p7
define(`t0',	`r23')	define(`t1',	`r24')
C Zero or all-ones mask obtained by negating cy
define(`cymask',`r28')


C mpn_divexact_by3c
C
C Inputs:   rp, up, n, cy (see INPUT PARAMETERS).
C Outputs:  n limbs stored through rp; the final carry is returned in r0.
C Writes:   r0-r8, r20-r25, r28, and the argument registers r16-r19.
C
C Method, for each source limb u with incoming carry c:
C   x      = u * 0xAAAAAAAAAAAAAAAB + l0                 (mod 2^64)
C   c_next = (u < c) + (x > 0x5555555555555555) + (x > 0xAAAAAAAAAAAAAAAA)
C   l0     = c_next * 0x5555555555555555                 (mod 2^64)
C x is the quotient limb.  l0 is rebuilt each stage by summing one
C mask-selected copy of 0x5555555555555555 per carry contribution.
C
C The loop is unrolled four ways and software pipelined: while one limb is
C being finished, the mulq for the next limb is issued and the limb after
C that is loaded.  q0/q1 and u0-u3 rotate between the stages.
C
C NOTE(review): the first limb is loaded from 0(up) unconditionally and the
C n mod 4 == 0 path also loads 8(up), so n is presumably required to be at
C least 1 -- confirm against callers.
PROLOGUE(mpn_divexact_by3c,gp)

	ldq	r28, 0(up)			C load first limb early

C Put magic constants in registers
	lda	r0, L(LC)
	ldq	xAAAAAAAAAAAAAAAB, 0(r0)
	ldq	x5555555555555555, 8(r0)
	ldq	xAAAAAAAAAAAAAAAA, 16(r0)

C Compute initial l0 value
C l0 = 0x5555555555555555 if cy is 1, 0xAAAAAAAAAAAAAAAA if cy is 2, else 0
	cmpeq	cy, 1, p6
	cmpeq	cy, 2, p7
	negq	p6, p6
	and	p6, x5555555555555555, l0
	cmovne	p7, xAAAAAAAAAAAAAAAA, l0

C Feed-in depending on (n mod 4)
C n is biased by -3 here and by -4 per loop pass, so the loop continues
C while n stays non-negative.
	and	n, 3, r8
	lda	n, -3(n)
	cmpeq	r8, 1, r4
	cmpeq	r8, 2, r5
	bne	r4, $Lb01
	bne	r5, $Lb10
	beq	r8, $Lb00

C Each feed-in path multiplies the first limb (still in r28), copies it into
C the u register its target stage compares against cy, and backs up and rp
C off so that the fixed displacements of that stage address limb 0.

C n mod 4 == 3: enter at the last stage
$Lb11:	ldq	u3, 8(up)
	lda	up, -24(up)
	lda	rp, -24(rp)
	mulq	r28, xAAAAAAAAAAAAAAAB, q0
	mov	r28, u2
	br	r31, $L11

C n mod 4 == 0: enter at the third stage
$Lb00:	ldq	u2, 8(up)
	lda	up, -16(up)
	lda	rp, -16(rp)
	mulq	r28, xAAAAAAAAAAAAAAAB, q1
	mov	r28, u1
	br	r31, $L00

C n mod 4 == 1: enter at the second stage, or go straight to the final limb
C code when the biased n is negative
$Lb01:	lda	rp, -8(rp)
	mulq	r28, xAAAAAAAAAAAAAAAB, q0
	mov	r28, u0
	blt	n, $Lcj1
	ldq	u1, 8(up)
	lda	up, -8(up)
	br	r31, $L01

C n mod 4 == 2: fall into the first stage, or skip the loop and go to the
C wind-down code when the biased n is negative
$Lb10:	ldq	u0, 8(up)
	mulq	r28, xAAAAAAAAAAAAAAAB, q1
	mov	r28, u3
	blt	n, $Lend

	ALIGN(16)
C Stage 0: finish the limb in u3 (product in q1), start the product of u0,
C load u1, store at 0(rp).
$Ltop:
C 0
	cmpult	u3, cy, cy			C L0
	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
	ldq	u1, 16(up)			C L1
	addq	q1, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 0(rp)			C L1
	unop
C Stage 1: finish the limb in u0 (product in q0), start the product of u1,
C load u2, store at 8(rp).
$L01:
C 0
	cmpult	u0, cy, cy			C L0
	mulq	u1, xAAAAAAAAAAAAAAAB, q1	C U1
	ldq	u2, 24(up)			C L1
	addq	q0, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 8(rp)			C L1
	unop
C Stage 2: finish the limb in u1 (product in q1), start the product of u2,
C load u3, store at 16(rp).
$L00:
C 0
	cmpult	u1, cy, cy			C L0
	mulq	u2, xAAAAAAAAAAAAAAAB, q0	C U1
	ldq	u3, 32(up)			C L1
	addq	q1, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 16(rp)			C L1
	unop
C Stage 3: finish the limb in u2 (product in q0), start the product of u3,
C load u0, store at 24(rp).  This stage also carries the loop bookkeeping:
C n is reduced by 4 and both pointers advance by 32 bytes.
$L11:
C 0
	cmpult	u2, cy, cy			C L0
	mulq	u3, xAAAAAAAAAAAAAAAB, q1	C U1
	ldq	u0, 40(up)			C L1
	addq	q0, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	lda	n, -4(n)			C L1 bookkeeping
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 24(rp)			C L1
	lda	up, 32(up)
C
	ldl	r31, 256(up)			C prefetch
	unop
	lda	rp, 32(rp)
	bge	n, $Ltop			C U1
C *** MAIN LOOP END ***
C Wind-down: same as stage 0 except that the load slot holds an unop, so
C no further source limb is read.  Finishes the limb in u3 and starts the
C product of the last limb in u0.
$Lend:

	cmpult	u3, cy, cy			C L0
	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
	unop
	addq	q1, l0, x			C U0
C 1
	negq	cy, cymask			C L0
	unop					C U1
	unop					C L1
	cmpult	x5555555555555555, x, p6	C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	unop
	unop
	negq	p6, t0				C L0
C 3
	negq	p7, t1				C L0
	and	cymask, x5555555555555555, l0	C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, 0(rp)			C L1
	unop
C Final limb: form the last quotient limb and the return carry.  l0 is not
C rebuilt because no limb follows.
$Lcj1:
	cmpult	u0, cy, cy			C L0
	addq	q0, l0, x			C U0
	cmpult	x5555555555555555, x, p6	C U0
	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
	addq	p6, cy, cy
	addq	p7, cy, r0
	stq	x, 8(rp)			C L1

	ret	r31,(r26),1
EPILOGUE()
ASM_END()

C This is useful for playing with various schedules.
C Expand as: one(0)one(1)one(2)one(3)
C
C one(k), for k in 0..3, generates one stage of the unrolled loop with the
C same instruction sequence as the hand-written stages in the function.
C Registers are emitted by number: limb registers are numbers 0-3 and product
C registers are numbers 4 and 5, matching the numbers behind u0-u3 and q0/q1.
C The load displacement is 8*k+16 from up and the store displacement is 8*k
C from rp.
C The macro does not emit the loop bookkeeping that the hand-written last
C stage contains (n update, pointer advance, prefetch, branch).
C Nothing in this file expands the macro.
define(`one',`
C 0
	cmpult	`$'eval(($1+3)%4), cy, cy		C L0
	mulq	`$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
	ldq	`$'eval(($1+1)%4), eval($1*8+16)(up)	C L1
	addq	`$'eval(4+($1+1)%2), l0, x		C U0
C 1
	negq	cy, cymask				C L0
	unop						C U1
	unop						C L1
	cmpult	x5555555555555555, x, p6		C U0
C 2
	cmpult	xAAAAAAAAAAAAAAAA, x, p7		C U1
	unop
	unop
	negq	p6, t0					C L0
C 3
	negq	p7, t1					C L0
	and	cymask, x5555555555555555, l0		C U1
	addq	p6, cy, cy
	and	t0, x5555555555555555, t0
C 4
	and	t1, x5555555555555555, t1
	addq	p7, cy, cy
	unop
	addq	t0, l0, l0
C 5
	addq	t1, l0, l0
	unop
	stq	x, eval($1*8)(rp)			C L1
	unop
')
322