1dnl Alpha mpn_mod_1s_4p
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C TODO:
36C  * Optimise.  2.75 c/l should be possible.
37C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
38C  * Optimise feed-in code, starting the sw pipeline in switch code.
39C  * Shorten software pipeline.  The mul instructions are scheduled too far
40C    from their users.  Fixing this will allow us to use fewer registers.
41C  * If we cannot reduce register usage, perhaps write a small-n basecase.
42C  * Does this work for PIC?
43
44C      cycles/limb
45C EV4:     ?
46C EV5:    23
47C EV6:     3
48
49define(`ap',     `r16')
50define(`n',      `r17')
51define(`pl',     `r24')
52define(`ph',     `r25')
53define(`rl',     `r6')
54define(`rh',     `r7')
55define(`B1modb', `r1')
56define(`B2modb', `r2')
57define(`B3modb', `r3')
58define(`B4modb', `r4')
59define(`B5modb', `r5')
60
61ASM_START()
C mpn_mod_1s_4p -- reduce the n-limb operand at ap modulo b, consuming
C four limbs per main-loop pass.
C
C Inputs, as used by the code below:
C   r16  ap   pointer to the lowest-addressed limb; limbs at higher
C             addresses are given the higher multipliers
C   r17  n    limb count -- assumes n >= 1, TODO confirm against callers
C   r18  b    divisor used in the final reduction step
C   r19       pointer to a table of 64-bit words read at byte offsets
C               0       bi   multiplier for the final quotient estimate
C               8       cnt  shift count
C               16..48  B1modb, B2modb, B3modb, B4modb, B5modb
C             NOTE(review): presumably the table is the one written by
C             mpn_mod_1s_4p_cps -- confirm that b and the table agree
C Output:
C   r0        the residue, shifted right by cnt
C
C r9-r13 are saved in a 64-byte stack frame and reloaded before the ret.
C rh:rl (r7:r6) holds the running two-limb residue; pl/ph (r24/r25) are
C partial sums and are reused for cnt and -cnt in the tail.
62PROLOGUE(mpn_mod_1s_4p)
63	lda	r30, -64(r30)		C reserve a 64-byte frame
64	stq	r9, 8(r30)		C save r9 (stores interleaved with loads)
65	ldq	B1modb, 16(r19)
66	stq	r10, 16(r30)
67	ldq	B2modb, 24(r19)
68	stq	r11, 24(r30)
69	ldq	B3modb, 32(r19)
70	stq	r12, 32(r30)
71	ldq	B4modb, 40(r19)
72	stq	r13, 40(r30)
73	ldq	B5modb, 48(r19)
74	s8addq	n, ap, ap		C point ap at vector end
75
C Dispatch on n mod 4: the feed-in code consumes 1, 2, 3 or 4 top limbs
C so that the limbs left over form whole groups of four.  n is biased
C by -4 here; from L(com) on, n > 0 means another group of four remains.
76	and	n, 3, r0		C r0 = n mod 4
77	lda	n, -4(n)
78	beq	r0, L(b0)		C n mod 4 = 0
79	lda	r6, -2(r0)		C r6 is scratch here, rl is set later
80	blt	r6, L(b1)		C n mod 4 = 1
81	beq	r6, L(b2)		C n mod 4 = 2, else fall into L(b3)
82
C n mod 4 = 3: rh:rl = a[n-3] + a[n-2]*B1modb + a[n-1]*B2modb
83L(b3):	ldq	r21, -16(ap)
84	ldq	r22, -8(ap)
85	ldq	r20, -24(ap)
86	mulq	r21, B1modb, r8
87	umulh	r21, B1modb, r12
88	mulq	r22, B2modb, r9
89	umulh	r22, B2modb, r13
90	addq	r8, r20, pl
91	cmpult	pl, r8, r0		C r0 = carry out of the low-limb add
92	addq	r0, r12, ph
93	addq	r9, pl, rl
94	cmpult	rl, r9, r0
95	addq	r13, ph, ph
96	addq	r0, ph, rh
97	lda	ap, -56(ap)		C 0(ap)..24(ap) = the four limbs below
98	br	L(com)
99
C n mod 4 = 0:
C rh:rl = a[n-4] + a[n-3]*B1modb + a[n-2]*B2modb + a[n-1]*B3modb
100L(b0):	ldq	r21, -24(ap)
101	ldq	r22, -16(ap)
102	ldq	r23, -8(ap)
103	ldq	r20, -32(ap)
104	mulq	r21, B1modb, r8
105	umulh	r21, B1modb, r12
106	mulq	r22, B2modb, r9
107	umulh	r22, B2modb, r13
108	mulq	r23, B3modb, r10
109	umulh	r23, B3modb, r27
110	addq	r8, r20, pl
111	cmpult	pl, r8, r0
112	addq	r0, r12, ph
113	addq	r9, pl, pl
114	cmpult	pl, r9, r0
115	addq	r13, ph, ph
116	addq	r0, ph, ph
117	addq	r10, pl, rl
118	cmpult	rl, r10, r0
119	addq	r27, ph, ph
120	addq	r0, ph, rh
121	lda	ap, -64(ap)		C 0(ap)..24(ap) = the four limbs below
122	br	L(com)
123
C n mod 4 = 1: rh:rl = 0:a[n-1]
124L(b1):	bis	r31, r31, rh
125	ldq	rl, -8(ap)
126	lda	ap, -40(ap)		C 0(ap)..24(ap) = the four limbs below
127	br	L(com)
128
C n mod 4 = 2: rh:rl = a[n-1]:a[n-2]
129L(b2):	ldq	rh, -8(ap)
130	ldq	rl, -16(ap)
131	lda	ap, -48(ap)		C 0(ap)..24(ap) = the four limbs below
132
C Common feed-in.  Load the next group of four limbs (r20 lowest, r23
C highest) and start its products, together with rl*B4modb.  Low halves
C go to r8-r11, high halves to r12, r13, r27, r28.
133L(com):	ble	n, L(ed3)		C nothing left beyond the feed-in limbs
134	ldq	r21, 8(ap)
135	ldq	r22, 16(ap)
136	ldq	r23, 24(ap)
137	ldq	r20, 0(ap)
138	lda	n, -4(n)
139	lda	ap, -32(ap)
140	mulq	r21, B1modb, r8
141	umulh	r21, B1modb, r12
142	mulq	r22, B2modb, r9
143	umulh	r22, B2modb, r13
144	mulq	r23, B3modb, r10
145	umulh	r23, B3modb, r27
146	mulq	rl, B4modb, r11
147	umulh	rl, B4modb, r28
148	ble	n, L(ed2)		C that was the last group, just finish it
149
C Main loop.  Each pass completes, for the group loaded one pass earlier,
C   rh:rl = r20 + r21*B1modb + r22*B2modb + r23*B3modb
C           + rl*B4modb + rh*B5modb
C using the products already in r8-r13, r27, r28 plus rh*B5modb computed
C here, while loading the next group and starting its products.  After
C each low-limb addition cmpult puts the carry in r0, which is then
C added into ph.
150	ALIGN(16)
151L(top):	ldq	r21, 8(ap)
152	mulq	rh, B5modb, rl		C low half, old rl already used for r11:r28
153	addq	r8, r20, pl
154	ldq	r22, 16(ap)
155	cmpult	pl, r8, r0
156	umulh	rh, B5modb, rh		C high half replaces rh
157	ldq	r23, 24(ap)
158	addq	r0, r12, ph
159	addq	r9, pl, pl
160	mulq	r21, B1modb, r8
161	cmpult	pl, r9, r0
162	addq	r13, ph, ph
163	umulh	r21, B1modb, r12
164	lda	ap, -32(ap)
165	addq	r0, ph, ph
166	addq	r10, pl, pl
167	mulq	r22, B2modb, r9
168	cmpult	pl, r10, r0
169	addq	r27, ph, ph
170	addq	r11, pl, pl
171	umulh	r22, B2modb, r13
172	addq	r0, ph, ph
173	cmpult	pl, r11, r0
174	addq	r28, ph, ph
175	mulq	r23, B3modb, r10
176	ldq	r20, 32(ap)		C ap already stepped, this is the old 0(ap)
177	addq	pl, rl, rl
178	umulh	r23, B3modb, r27
179	addq	r0, ph, ph
180	cmpult	rl, pl, r0
181	mulq	rl, B4modb, r11		C new rl feeds the next accumulation
182	addq	ph, rh, rh
183	umulh	rl, B4modb, r28
184	addq	r0, rh, rh
185	lda	n, -4(n)
186	bgt	n, L(top)
187
C Wind-down: the same accumulation as in L(top) for the last loaded
C group, with no further loads or limb products.
188L(ed2):	mulq	rh, B5modb, rl
189	addq	r8, r20, pl
190	umulh	rh, B5modb, rh
191	cmpult	pl, r8, r0
192	addq	r0, r12, ph
193	addq	r9, pl, pl
194	cmpult	pl, r9, r0
195	addq	r13, ph, ph
196	addq	r0, ph, ph
197	addq	r10, pl, pl
198	cmpult	pl, r10, r0
199	addq	r27, ph, ph
200	addq	r11, pl, pl
201	addq	r0, ph, ph
202	cmpult	pl, r11, r0
203	addq	r28, ph, ph
204	addq	pl, rl, rl
205	addq	r0, ph, ph
206	cmpult	rl, pl, r0
207	addq	ph, rh, rh
208	addq	r0, rh, rh
209
C Fold the two-limb residue once more: rh:rl = rl + rh*B1modb.
210L(ed3):	mulq	rh, B1modb, r8
211	umulh	rh, B1modb, rh
212	addq	r8, rl, rl
213	cmpult	rl, r8, r0
214	addq	r0, rh, rh
215
C Shift rh:rl left by cnt.  r24/r25 are pl/ph, no longer needed as sums,
C and r2 (B2modb) is free to use as scratch from here on.
216	ldq	r24, 8(r19)		C cnt
217	sll	rh, r24, rh
218	subq	r31, r24, r25		C r25 = -cnt, right-shift count for rl
219	srl	rl, r25, r2		C NOTE(review): relies on the shift using
220	sll	rl, r24, rl		C only the low 6 bits, i.e. 64-cnt,
221	or	r2, rh, rh		C and on cnt being nonzero -- confirm
222
C Final reduction of rh:rl by b, using bi to estimate the quotient:
C   qh:ql = rh*bi + (rh+1):rl,  rl = rl - qh*b,
C followed by two conditional corrections by b.
223	ldq	r23, 0(r19)		C bi
224	mulq	rh, r23, r8
225	umulh	rh, r23, r9
226	addq	rh, 1, r7		C r7 is rh, so rh = rh + 1
227	addq	r8, rl, r8		C ql
228	cmpult	r8, rl, r0
229	addq	r9, r7, r9
230	addq	r0, r9, r9		C qh
231	mulq	r9, r18, r21		C qh * b
232	subq	rl, r21, rl
233	cmpult	r8, rl, r0		C rl > ql
234	negq	r0, r0			C 0 or all ones
235	and	r0, r18, r0		C 0 or b
236	addq	rl, r0, rl
237	cmpule	r18, rl, r0		C rl >= b
238	negq	r0, r0
239	and	r0, r18, r0
240	subq	rl, r0, rl
241
242	srl	rl, r24, r0		C result = rl >> cnt, left in r0
243
244	ldq	r9, 8(r30)		C reload r9-r13 and release the frame
245	ldq	r10, 16(r30)
246	ldq	r11, 24(r30)
247	ldq	r12, 32(r30)
248	ldq	r13, 40(r30)
249	lda	r30, 64(r30)
250	ret	r31, (r26), 1
251EPILOGUE()
252
C mpn_mod_1s_4p_cps -- fill the seven-word table whose offsets
C mpn_mod_1s_4p reads through r19.
C
C Inputs, as used by the code below:
C   r16  pointer to the table, kept in r11 across the call
C   r17  the divisor b
C Words stored, by byte offset from the table pointer:
C    0       value returned by mpn_invert_limb for b << cnt
C    8       cnt, the left shift applied to b before inversion
C   16..48   five successive residues, each shifted right by cnt; these
C            are the words mpn_mod_1s_4p loads as B1modb..B5modb
C No value is deliberately placed in r0 before returning.
C
C The body was compiler generated, as the TODO list at the top of the
C file notes.  r9 holds b << cnt, r10 holds cnt, r0 holds the inverse
C until the last round overwrites it.
253PROLOGUE(mpn_mod_1s_4p_cps,gp)
254	lda	r30, -32(r30)		C reserve a 32-byte frame
255	stq	r26, 0(r30)		C return address, clobbered by the jsr
256	stq	r9, 8(r30)
257	stq	r10, 16(r30)
258	stq	r11, 24(r30)
259	mov	r16, r11		C r11 = table pointer
C Compute cnt with two byte lookups in __clz_tab.
C NOTE(review): this looks like the usual table-driven count of leading
C zero bits of b -- confirm against the count_leading_zeros definition.
260	LEA(	r4, __clz_tab)
261	lda	r10, 65(r31)		C r10 = 65
262	cmpbge	r31, r17, r1		C bit i set where byte i of b is zero
263	srl	r1, 1, r1
264	xor	r1, 127, r1		C invert the 7 remaining mask bits
265	addq	r1, r4, r1
266	ldq_u	r2, 0(r1)
267	extbl	r2, r1, r2		C r2 = __clz_tab byte for that mask
268	s8subq	r2, 7, r2		C r2 = 8*r2 - 7, a bit offset
269	srl	r17, r2, r3		C b shifted down by that offset
270	subq	r10, r2, r10
271	addq	r3, r4, r3
272	ldq_u	r1, 0(r3)
273	extbl	r1, r3, r1		C second lookup, on the shifted b
274	subq	r10, r1, r10		C r10 = 65 - r2 - r1 = cnt
275	sll	r17, r10, r9		C r9 = b << cnt
276	mov	r9, r16
277	jsr	r26, mpn_invert_limb	C r0 = inverse of b << cnt
278	LDGP(	r29, 0(r26))
C First residue: -(b << cnt) * ((r0 >> -cnt) | (1 << cnt)).
279	subq	r31, r10, r2		C r2 = -cnt
280	lda	r1, 1(r31)
281	sll	r1, r10, r1		C r1 = 1 << cnt
282	subq	r31, r9, r3		C r3 = -(b << cnt)
283	srl	r0, r2, r2
284	ldq	r26, 0(r30)		C reload the return address
285	bis	r2, r1, r2
286	stq	r0, 0(r11)		C table word at 0: the inverse
287	stq	r10, 8(r11)		C table word at 8: cnt
288	mulq	r2, r3, r2		C r2 = first residue, still scaled
289	srl	r2, r10, r3
C Four further rounds follow, interleaved with the stores.  Each round
C takes the previous residue x and computes
C   t = (~hi(x*r0) - x) * (b << cnt);  if t > lo(x*r0) then t = t + (b << cnt)
C then stores t >> cnt.  The unshifted t is the x of the next round.
290	umulh	r2, r0, r1		C round 1: x = r2
291	stq	r3, 16(r11)		C table word at 16
292	mulq	r2, r0, r3
293	ornot	r31, r1, r1		C r1 = ~hi
294	subq	r1, r2, r1
295	mulq	r1, r9, r1
296	addq	r1, r9, r2		C candidate t + (b << cnt)
297	cmpule	r1, r3, r3		C r3 = 1 when t <= lo
298	cmoveq	r3, r2, r1		C take the candidate when t > lo
299	srl	r1, r10, r3
300	umulh	r1, r0, r2		C round 2: x = r1
301	stq	r3, 24(r11)		C table word at 24
302	mulq	r1, r0, r3
303	ornot	r31, r2, r2
304	subq	r2, r1, r2
305	mulq	r2, r9, r2
306	addq	r2, r9, r1
307	cmpule	r2, r3, r3
308	cmoveq	r3, r1, r2
309	srl	r2, r10, r1
310	umulh	r2, r0, r3		C round 3: x = r2
311	stq	r1, 32(r11)		C table word at 32
312	mulq	r2, r0, r1
313	ornot	r31, r3, r3
314	subq	r3, r2, r3
315	mulq	r3, r9, r3
316	addq	r3, r9, r2
317	cmpule	r3, r1, r1
318	cmoveq	r1, r2, r3
319	srl	r3, r10, r2
320	umulh	r3, r0, r1		C round 4: x = r3
321	stq	r2, 40(r11)		C table word at 40
322	mulq	r3, r0, r0		C last use of the inverse, r0 reused
323	ornot	r31, r1, r1
324	subq	r1, r3, r1
325	mulq	r1, r9, r1
326	addq	r1, r9, r9		C last use of b << cnt, r9 reused
327	cmpule	r1, r0, r0
328	cmoveq	r0, r9, r1
329	ldq	r9, 8(r30)		C reload r9-r11 between the final ops
330	srl	r1, r10, r1
331	ldq	r10, 16(r30)
332	stq	r1, 48(r11)		C table word at 48
333	ldq	r11, 24(r30)
334	lda	r30, 32(r30)
335	ret	r31, (r26), 1
336EPILOGUE()
337