1dnl  Alpha mpn_bdiv_dbm1c.
2
3dnl  Copyright 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C      cycles/limb
34C EV4:     42
35C EV5:     18
36C EV6:      3
37
38C TODO
39C  * Try less unrolling, 2-way should give the same performance.
40C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
41C    code size.
42C  * This runs optimally given the algorithm; r8 is on a 3-operation recurrence
43C    path.  We have not tried very hard to find a better algorithm.  Perhaps
44C    it would be a good task for the GNU superoptimizer.
45
46C INPUT PARAMETERS
C The instructions in this file name registers directly; these macros only
C record which register carries which argument.  The function entry copies
C r20 into its running value, so cy is the argument that arrives in r20, the
C register after bd, and must not alias bd in r19.
47define(`rp', `r16')
48define(`up', `r17')
49define(`n',  `r18')
50define(`bd', `r19')
51define(`cy', `r20')
52
53
54ASM_START()
C mpn_bdiv_dbm1c
C
C Registers read on entry:
C   r16  destination pointer, one limb is stored for each source limb
C   r17  source pointer
C   r18  limb count
C   r19  multiplier, applied to every source limb with mulq and umulh
C   r20  initial running value, copied to r8
C Result:
C   r0   final running value
C
C For each source limb u, with the running value h kept in r8:
C   lo     = low  64 bits of u * r19      (mulq)
C   hi     = high 64 bits of u * r19      (umulh)
C   borrow = h < lo                       (cmpult, result in r28)
C   h      = h - lo, then h is stored to the destination
C   h      = h - (hi + borrow)            (addq into r28, then subq)
C
C Register roles inside the function:
C   r24         first source limb
C   r0-r3       source limbs waiting to be multiplied
C   r4-r7       lo products, r20-r23 the matching hi products; the pairs
C               (r5,r21) (r6,r22) (r7,r23) (r4,r20) are consumed at store
C               offsets 0, 8, 16, 24 of each 4-limb group
C   r28         count mod 4 during dispatch, afterwards borrow / hi+borrow
C   r21         also used as a scratch flag during dispatch
C
C The loop handles 4 limbs per iteration and is software pipelined: the
C multiplies for a limb are issued well before their results are consumed.
C The feed-in code for each value of count mod 4 starts the pipeline and
C jumps into the loop at L3, L2, L1 or L0, or straight into the wind-down
C code at one of cj7 .. cj1 when the count is too small for the loop.
C Because every entry point stores at a fixed offset from r16, each feed-in
C path first biases r16 so that its first store lands on the original r16.
55PROLOGUE(mpn_bdiv_dbm1c)
56	mov	r20, r8		C r8 = running value h
57
58	ldq	r24, 0(r17)	C first source limb, loaded unconditionally
59	and	r18, 3, r28	C r28 = count mod 4
60	lda	r18, -4(r18)	C r18 = count - 4
61	beq	r28, L(b0)
62	cmpeq	r28, 1, r21
63	bne	r21, L(b1)
64	cmpeq	r28, 2, r21
65	bne	r21, L(b2)
66
67
C count mod 4 = 3.  Limbs 0..2 end up in r24, r2, r3.
68L(b3):	ldq	r2, 8(r17)
69	ldq	r3, 16(r17)
70	bgt	r18, L(gt3)	C taken when count - 4 > 0
71
C count = 3: form all three products and join the wind-down at cj3, whose
C stores use offsets 32, 40 and 48, hence the -32 bias on r16.
72	mulq	r24, r19, r5	C U1
73	umulh	r24, r19, r21	C U1
74	mulq	r2, r19, r6	C U1
75	umulh	r2, r19, r22	C U1
76	mulq	r3, r19, r7	C U1
77	umulh	r3, r19, r23	C U1
78	lda	r16, -32(r16)
79	br	L(cj3)
80
C count >= 7: load limbs 3..6 into r0..r3 while multiplying limbs 0..2,
C advance r17 past the 7 loaded limbs and start the lo product of limb 3.
C The matching umulh is the first instruction at both L3 and cj7.
81L(gt3):	ldq	r0, 24(r17)
82	mulq	r24, r19, r5	C U1
83	umulh	r24, r19, r21	C U1
84	ldq	r1, 32(r17)
85	mulq	r2, r19, r6	C U1
86	umulh	r2, r19, r22	C U1
87	ldq	r2, 40(r17)
88	mulq	r3, r19, r7	C U1
89	umulh	r3, r19, r23	C U1
90	ldq	r3, 48(r17)
91	lda	r18, -4(r18)	C r18 = count - 8
92	lda	r17, 56(r17)
93	mulq	r0, r19, r4	C U1
94	bgt	r18, L(L3)	C more than the 7 loaded limbs: enter the loop
95
96	br	L(cj7)		C exactly 7 limbs: wind down
97
98
C count mod 4 = 2.  Limbs 0 and 1 end up in r24 and r3.
99L(b2):	ldq	r3, 8(r17)
100	bgt	r18, L(gt2)	C taken when count - 4 > 0
101
C count = 2: join the wind-down at cj2, which stores at offsets 40 and 48.
102	mulq	r24, r19, r6	C U1
103	umulh	r24, r19, r22	C U1
104	mulq	r3, r19, r7	C U1
105	umulh	r3, r19, r23	C U1
106	lda	r16, -40(r16)
107	br	L(cj2)
108
C count >= 6: load limbs 2..5 into r0..r3, multiply limbs 0..2.  The entry
C points L2 and cj6 store first at offset 8, hence the -8 bias on r16.
109L(gt2):	ldq	r0, 16(r17)
110	ldq	r1, 24(r17)
111	mulq	r24, r19, r6	C U1
112	umulh	r24, r19, r22	C U1
113	ldq	r2, 32(r17)
114	mulq	r3, r19, r7	C U1
115	umulh	r3, r19, r23	C U1
116	ldq	r3, 40(r17)
117	lda	r18, -4(r18)	C r18 = count - 8
118	lda	r17, 48(r17)
119	mulq	r0, r19, r4	C U1
120	umulh	r0, r19, r20	C U1
121	lda	r16, -8(r16)
122	bgt	r18, L(gt6)
123
C exactly 6 limbs: start the lo product of limb 3 and wind down.
124	mulq	r1, r19, r5	C U1
125	br	L(cj6)
126
C more than 6 limbs: fetch the next limb, then enter the loop at L2.
127L(gt6):	ldq	r0, 0(r17)
128	mulq	r1, r19, r5	C U1
129	br	L(L2)
130
131
C count mod 4 = 1.  Limb 0 is in r24.
132L(b1):	bgt	r18, L(gt1)	C taken when count - 4 > 0
133
C count = 1: join the wind-down at cj1, which stores at offset 48.
134	mulq	r24, r19, r7	C U1
135	umulh	r24, r19, r23	C U1
136	lda	r16, -48(r16)
137	br	L(cj1)
138
C count >= 5: load limbs 1..4 into r0..r3, multiply limbs 0 and 1.  The
C entry points L1 and cj5 store first at offset 16, hence the -16 bias.
139L(gt1):	ldq	r0, 8(r17)
140	ldq	r1, 16(r17)
141	ldq	r2, 24(r17)
142	mulq	r24, r19, r7	C U1
143	umulh	r24, r19, r23	C U1
144	ldq	r3, 32(r17)
145	lda	r18, -4(r18)	C r18 = count - 8
146	lda	r17, 40(r17)
147	mulq	r0, r19, r4	C U1
148	umulh	r0, r19, r20	C U1
149	lda	r16, -16(r16)
150	bgt	r18, L(gt5)
151
C exactly 5 limbs: finish limb 2, start the lo product of limb 3, wind down.
152	mulq	r1, r19, r5	C U1
153	umulh	r1, r19, r21	C U1
154	mulq	r2, r19, r6	C U1
155	br	L(cj5)
156
C more than 5 limbs: fetch the next two limbs, then enter the loop at L1.
157L(gt5):	ldq	r0, 0(r17)
158	mulq	r1, r19, r5	C U1
159	umulh	r1, r19, r21	C U1
160	ldq	r1, 8(r17)
161	mulq	r2, r19, r6	C U1
162	br	L(L1)
163
164
C count mod 4 = 0.  Load limbs 1..3 into r1..r3 and multiply limb 0.  The
C entry points L0 and cj4 store first at offset 24, hence the -24 bias.
C NOTE(review): a count of 0 also reaches this path and is processed as if
C it were 4 limbs; callers presumably never pass 0 -- TODO confirm.
165L(b0):	ldq	r1, 8(r17)
166	ldq	r2, 16(r17)
167	ldq	r3, 24(r17)
168	lda	r17, 32(r17)
169	lda	r16, -24(r16)
170	mulq	r24, r19, r4	C U1
171	umulh	r24, r19, r20	C U1
172	bgt	r18, L(gt4)	C taken when count - 4 > 0
173
C no further limbs: finish limbs 1 and 2, start limb 3, wind down.
174	mulq	r1, r19, r5	C U1
175	umulh	r1, r19, r21	C U1
176	mulq	r2, r19, r6	C U1
177	umulh	r2, r19, r22	C U1
178	mulq	r3, r19, r7	C U1
179	br	L(cj4)
180
C more limbs follow: fetch the next three, then enter the loop at L0.
181L(gt4):	ldq	r0, 0(r17)
182	mulq	r1, r19, r5	C U1
183	umulh	r1, r19, r21	C U1
184	ldq	r1, 8(r17)
185	mulq	r2, r19, r6	C U1
186	umulh	r2, r19, r22	C U1
187	ldq	r2, 16(r17)
188	mulq	r3, r19, r7	C U1
189	br	L(L0)
190
C Each of the four stages below does the same work on a rotated register
C set: start the lo product of a pending limb, finish the previous limb by
C subtracting hi+borrow from r8, start the hi product, compute the borrow of
C r8 - lo for the current limb, reload the source register just multiplied,
C subtract lo, fold the borrow into hi, and store r8.
191C *** MAIN LOOP START ***
192	ALIGN(16)
193L(top):	mulq	r0, r19, r4	C U1
194	subq	r8, r28, r8
195L(L3):	umulh	r0, r19, r20	C U1
196	cmpult	r8, r5, r28	C borrow of r8 - r5
197	ldq	r0, 0(r17)
198	subq	r8, r5, r8
199	addq	r21, r28, r28	C hi + borrow, subtracted in the next stage
200	stq	r8, 0(r16)
201
202	mulq	r1, r19, r5	C U1
203	subq	r8, r28, r8
204L(L2):	umulh	r1, r19, r21	C U1
205	cmpult	r8, r6, r28
206	ldq	r1, 8(r17)
207	subq	r8, r6, r8
208	addq	r22, r28, r28
209	stq	r8, 8(r16)
210
211	mulq	r2, r19, r6	C U1
212	subq	r8, r28, r8
213L(L1):	umulh	r2, r19, r22	C U1
214	cmpult	r8, r7, r28
215	ldq	r2, 16(r17)
216	subq	r8, r7, r8
217	addq	r23, r28, r28
218	stq	r8, 16(r16)
219
220	mulq	r3, r19, r7	C U1
221	subq	r8, r28, r8
222L(L0):	umulh	r3, r19, r23	C U1
223	cmpult	r8, r4, r28
224	ldq	r3, 24(r17)
225	subq	r8, r4, r8
226	addq	r20, r28, r28
227	stq	r8, 24(r16)
228
229	lda	r18, -4(r18)	C four more limbs accounted for
230	lda	r17, 32(r17)
231	lda	r16, 32(r16)
232	bgt	r18, L(top)
233C *** MAIN LOOP END ***
234
C Wind-down: the same stages as the loop without loads or pointer updates.
C Entry cjK is used when K limbs remain to be stored; the stores run through
C offsets 0 .. 48 of r16.  From cj4 on no new multiplies are started, only
C the products already in flight are consumed.
235	mulq	r0, r19, r4	C U1
236	subq	r8, r28, r8
237L(cj7):	umulh	r0, r19, r20	C U1
238	cmpult	r8, r5, r28
239	subq	r8, r5, r8
240	addq	r21, r28, r28
241	stq	r8, 0(r16)
242	mulq	r1, r19, r5	C U1
243	subq	r8, r28, r8
244L(cj6):	umulh	r1, r19, r21	C U1
245	cmpult	r8, r6, r28
246	subq	r8, r6, r8
247	addq	r22, r28, r28
248	stq	r8, 8(r16)
249	mulq	r2, r19, r6	C U1
250	subq	r8, r28, r8
251L(cj5):	umulh	r2, r19, r22	C U1
252	cmpult	r8, r7, r28
253	subq	r8, r7, r8
254	addq	r23, r28, r28
255	stq	r8, 16(r16)
256	mulq	r3, r19, r7	C U1
257	subq	r8, r28, r8
258L(cj4):	umulh	r3, r19, r23	C U1
259	cmpult	r8, r4, r28
260	subq	r8, r4, r8
261	addq	r20, r28, r28
262	stq	r8, 24(r16)
263	subq	r8, r28, r8
264L(cj3):	cmpult	r8, r5, r28
265	subq	r8, r5, r8
266	addq	r21, r28, r28
267	stq	r8, 32(r16)
268	subq	r8, r28, r8
269L(cj2):	cmpult	r8, r6, r28
270	subq	r8, r6, r8
271	addq	r22, r28, r28
272	stq	r8, 40(r16)
273	subq	r8, r28, r8
274L(cj1):	cmpult	r8, r7, r28
275	subq	r8, r7, r8
276	addq	r23, r28, r28
277	stq	r8, 48(r16)	C last destination limb
278	subq	r8, r28, r0	C result: final running value, in r0
279	ret	r31, (r26), 1
280
281EPILOGUE()
282ASM_END()
283