1dnl  Alpha mpn_bdiv_dbm1c.
2
3dnl  Copyright 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C      cycles/limb
23C EV4:     42
24C EV5:     18
25C EV6:      3
26
27C TODO
28C  * Try less unrolling, 2-way should give the same performance.
29C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
30C    code size.
31C  * This runs optimally given the algorithm; r8 is on a 3-operation recurrence
32C    path.  We have not tried very hard to find a better algorithm.  Perhaps
33C    it would be a good task for the GNU superoptimizer.
34
35C INPUT PARAMETERS
C  rp  r16  destination limb pointer
C  up  r17  source limb pointer
C  n   r18  limb count
C  bd  r19  multiplier applied to every source limb
C  cy  r20  initial running value, copied to r8 at function entry
C The function body names the registers directly, so these macros serve as
C documentation of the argument assignment.  cy used to be defined as r19,
C which is the multiplier register; the body reads the initial value from r20.
36define(`rp', `r16')
37define(`up', `r17')
38define(`n',  `r18')
39define(`bd', `r19')
40define(`cy', `r20')
41
42
43ASM_START()
C mpn_bdiv_dbm1c
C
C Register roles as used by the code below:
C   r16  destination pointer; one limb is stored per source limb
C   r17  source pointer
C   r18  limb count n on entry, afterwards a biased count-down
C   r19  multiplier applied to every source limb (read only)
C   r20  initial running value; copied to r8 on entry, then reused
C   r0   written with the final running value just before the return
C
C For each source limb u, in order of increasing address:
C   lo = low 64 bits of u * r19		mulq
C   hi = high 64 bits of u * r19		umulh
C   b  = 1 if r8 < lo unsigned, else 0	cmpult
C   r8 = r8 - lo, and r8 is stored to the destination
C   r8 = r8 - (hi + b)
C
C Products are kept in lo/hi register pairs r4/r20, r5/r21, r6/r22, r7/r23.
C Source limbs are staged in r0-r3, and in r24 for the very first limb.
C r28 first holds n mod 4 for the dispatch and is then reused for hi + b.
C
C The count must be at least 1: with n = 0 the dispatch takes L(b0) and
C processes four limbs.
C Registers written: r0-r8, r16-r18, r20-r24, r28.
44PROLOGUE(mpn_bdiv_dbm1c)
45	mov	r20, r8		C r8 = running value, seeded from r20
46
47	ldq	r24, 0(r17)	C r24 = source limb 0
48	and	r18, 3, r28	C r28 = n mod 4, selects the feed-in path
49	lda	r18, -4(r18)	C r18 = n - 4
50	beq	r28, L(b0)
51	cmpeq	r28, 1, r21
52	bne	r21, L(b1)
53	cmpeq	r28, 2, r21
54	bne	r21, L(b2)
55
56
C n mod 4 = 3.  Limbs 0-2 are in r24, r2, r3.
57L(b3):	ldq	r2, 8(r17)
58	ldq	r3, 16(r17)
59	bgt	r18, L(gt3)
60
C n = 3: form all three products, then bias r16 by -32 so that the stores
C at 32, 40 and 48(r16) from L(cj3) onwards hit destination limbs 0-2.
61	mulq	r24, r19, r5	C U1
62	umulh	r24, r19, r21	C U1
63	mulq	r2, r19, r6	C U1
64	umulh	r2, r19, r22	C U1
65	mulq	r3, r19, r7	C U1
66	umulh	r3, r19, r23	C U1
67	lda	r16, -32(r16)
68	br	L(cj3)
69
C n >= 7: form the products for limbs 0-2 while loading limbs 3-6 into
C r0-r3, and advance r17 past the seven limbs read.  r18 becomes n - 8.
C If limbs remain, enter the loop at L(L3); otherwise finish at L(cj7).
70L(gt3):	ldq	r0, 24(r17)
71	mulq	r24, r19, r5	C U1
72	umulh	r24, r19, r21	C U1
73	ldq	r1, 32(r17)
74	mulq	r2, r19, r6	C U1
75	umulh	r2, r19, r22	C U1
76	ldq	r2, 40(r17)
77	mulq	r3, r19, r7	C U1
78	umulh	r3, r19, r23	C U1
79	ldq	r3, 48(r17)
80	lda	r18, -4(r18)
81	lda	r17, 56(r17)
82	mulq	r0, r19, r4	C U1
83	bgt	r18, L(L3)
84
85	br	L(cj7)
86
87
C n mod 4 = 2.  Limbs 0-1 are in r24, r3.
88L(b2):	ldq	r3, 8(r17)
89	bgt	r18, L(gt2)
90
C n = 2: bias r16 by -40 so that the stores at 40 and 48(r16) from L(cj2)
C onwards hit destination limbs 0-1.
91	mulq	r24, r19, r6	C U1
92	umulh	r24, r19, r22	C U1
93	mulq	r3, r19, r7	C U1
94	umulh	r3, r19, r23	C U1
95	lda	r16, -40(r16)
96	br	L(cj2)
97
C n >= 6: load limbs 2-5 into r0-r3, advance r17 past the six limbs read,
C and bias r16 by -8 so that the store at 8(r16) in L(L2) or L(cj6) hits
C destination limb 0.  r18 becomes n - 8.
98L(gt2):	ldq	r0, 16(r17)
99	ldq	r1, 24(r17)
100	mulq	r24, r19, r6	C U1
101	umulh	r24, r19, r22	C U1
102	ldq	r2, 32(r17)
103	mulq	r3, r19, r7	C U1
104	umulh	r3, r19, r23	C U1
105	ldq	r3, 40(r17)
106	lda	r18, -4(r18)
107	lda	r17, 48(r17)
108	mulq	r0, r19, r4	C U1
109	umulh	r0, r19, r20	C U1
110	lda	r16, -8(r16)
111	bgt	r18, L(gt6)
112
113	mulq	r1, r19, r5	C U1
114	br	L(cj6)
115
C n >= 10: r0 has been consumed, so reload it with the next limb and
C enter the loop at L(L2).
116L(gt6):	ldq	r0, 0(r17)
117	mulq	r1, r19, r5	C U1
118	br	L(L2)
119
120
C n mod 4 = 1.  Limb 0 is in r24.
121L(b1):	bgt	r18, L(gt1)
122
C n = 1: bias r16 by -48 so that the store at 48(r16) at L(cj1) hits
C destination limb 0.
123	mulq	r24, r19, r7	C U1
124	umulh	r24, r19, r23	C U1
125	lda	r16, -48(r16)
126	br	L(cj1)
127
C n >= 5: load limbs 1-4 into r0-r3, advance r17 past the five limbs read,
C and bias r16 by -16 so that the store at 16(r16) in L(L1) or L(cj5) hits
C destination limb 0.  r18 becomes n - 8.
128L(gt1):	ldq	r0, 8(r17)
129	ldq	r1, 16(r17)
130	ldq	r2, 24(r17)
131	mulq	r24, r19, r7	C U1
132	umulh	r24, r19, r23	C U1
133	ldq	r3, 32(r17)
134	lda	r18, -4(r18)
135	lda	r17, 40(r17)
136	mulq	r0, r19, r4	C U1
137	umulh	r0, r19, r20	C U1
138	lda	r16, -16(r16)
139	bgt	r18, L(gt5)
140
141	mulq	r1, r19, r5	C U1
142	umulh	r1, r19, r21	C U1
143	mulq	r2, r19, r6	C U1
144	br	L(cj5)
145
C n >= 9: r0 and r1 have been consumed, so reload them with the next two
C limbs and enter the loop at L(L1).
146L(gt5):	ldq	r0, 0(r17)
147	mulq	r1, r19, r5	C U1
148	umulh	r1, r19, r21	C U1
149	ldq	r1, 8(r17)
150	mulq	r2, r19, r6	C U1
151	br	L(L1)
152
153
C n mod 4 = 0.  Limb 0 is in r24; load limbs 1-3 into r1-r3, advance r17
C past the four limbs read, and bias r16 by -24 so that the store at
C 24(r16) in L(L0) or L(cj4) hits destination limb 0.  r18 stays at n - 4.
154L(b0):	ldq	r1, 8(r17)
155	ldq	r2, 16(r17)
156	ldq	r3, 24(r17)
157	lda	r17, 32(r17)
158	lda	r16, -24(r16)
159	mulq	r24, r19, r4	C U1
160	umulh	r24, r19, r20	C U1
161	bgt	r18, L(gt4)
162
163	mulq	r1, r19, r5	C U1
164	umulh	r1, r19, r21	C U1
165	mulq	r2, r19, r6	C U1
166	umulh	r2, r19, r22	C U1
167	mulq	r3, r19, r7	C U1
168	br	L(cj4)
169
C n >= 8: r0-r2 are reloaded with the next three limbs as their previous
C contents are consumed; enter the loop at L(L0).
170L(gt4):	ldq	r0, 0(r17)
171	mulq	r1, r19, r5	C U1
172	umulh	r1, r19, r21	C U1
173	ldq	r1, 8(r17)
174	mulq	r2, r19, r6	C U1
175	umulh	r2, r19, r22	C U1
176	ldq	r2, 16(r17)
177	mulq	r3, r19, r7	C U1
178	br	L(L0)
179
C Four destination limbs are stored per iteration.  Each quarter of the body
C starts the low product of a staged limb (mulq, just before the entry
C label), forms its high product (umulh), reloads that staging register, and
C subtracts and stores a product whose halves were formed earlier.
C L(L3), L(L2), L(L1) and L(L0) are the entry points used by the feed-in
C code above.
180C *** MAIN LOOP START ***
181	ALIGN(16)
182L(top):	mulq	r0, r19, r4	C U1
183	subq	r8, r28, r8
184L(L3):	umulh	r0, r19, r20	C U1
185	cmpult	r8, r5, r28	C r28 = borrow out of r8 - lo
186	ldq	r0, 0(r17)
187	subq	r8, r5, r8	C r8 -= lo
188	addq	r21, r28, r28	C r28 = hi + borrow
189	stq	r8, 0(r16)	C store the result limb
190
191	mulq	r1, r19, r5	C U1
192	subq	r8, r28, r8	C r8 -= hi + borrow
193L(L2):	umulh	r1, r19, r21	C U1
194	cmpult	r8, r6, r28
195	ldq	r1, 8(r17)
196	subq	r8, r6, r8
197	addq	r22, r28, r28
198	stq	r8, 8(r16)
199
200	mulq	r2, r19, r6	C U1
201	subq	r8, r28, r8
202L(L1):	umulh	r2, r19, r22	C U1
203	cmpult	r8, r7, r28
204	ldq	r2, 16(r17)
205	subq	r8, r7, r8
206	addq	r23, r28, r28
207	stq	r8, 16(r16)
208
209	mulq	r3, r19, r7	C U1
210	subq	r8, r28, r8
211L(L0):	umulh	r3, r19, r23	C U1
212	cmpult	r8, r4, r28
213	ldq	r3, 24(r17)
214	subq	r8, r4, r8
215	addq	r20, r28, r28
216	stq	r8, 24(r16)
217
218	lda	r18, -4(r18)	C four more limbs accounted for
219	lda	r17, 32(r17)
220	lda	r16, 32(r16)
221	bgt	r18, L(top)
222C *** MAIN LOOP END ***
223
C Wind-down: the same sequence as the loop body without the loads and the
C pointer updates.  The limbs still staged in r0-r3 are multiplied and the
C pending products are subtracted and stored.  L(cjK) is entered with K
C destination limbs left to store, the last of them at 48(r16).
224	mulq	r0, r19, r4	C U1
225	subq	r8, r28, r8
226L(cj7):	umulh	r0, r19, r20	C U1
227	cmpult	r8, r5, r28
228	subq	r8, r5, r8
229	addq	r21, r28, r28
230	stq	r8, 0(r16)
231	mulq	r1, r19, r5	C U1
232	subq	r8, r28, r8
233L(cj6):	umulh	r1, r19, r21	C U1
234	cmpult	r8, r6, r28
235	subq	r8, r6, r8
236	addq	r22, r28, r28
237	stq	r8, 8(r16)
238	mulq	r2, r19, r6	C U1
239	subq	r8, r28, r8
240L(cj5):	umulh	r2, r19, r22	C U1
241	cmpult	r8, r7, r28
242	subq	r8, r7, r8
243	addq	r23, r28, r28
244	stq	r8, 16(r16)
245	mulq	r3, r19, r7	C U1
246	subq	r8, r28, r8
247L(cj4):	umulh	r3, r19, r23	C U1
248	cmpult	r8, r4, r28
249	subq	r8, r4, r8
250	addq	r20, r28, r28
251	stq	r8, 24(r16)
252	subq	r8, r28, r8
253L(cj3):	cmpult	r8, r5, r28
254	subq	r8, r5, r8
255	addq	r21, r28, r28
256	stq	r8, 32(r16)
257	subq	r8, r28, r8
258L(cj2):	cmpult	r8, r6, r28
259	subq	r8, r6, r8
260	addq	r22, r28, r28
261	stq	r8, 40(r16)
262	subq	r8, r28, r8
263L(cj1):	cmpult	r8, r7, r28
264	subq	r8, r7, r8
265	addq	r23, r28, r28
266	stq	r8, 48(r16)
267	subq	r8, r28, r0	C r0 = final running value
268	ret	r31, (r26), 1	C return through the address in r26
269
270EPILOGUE()
271ASM_END()
272