dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl  Copyright 2002, 2003, 2005, 2006, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C                cycles/limb
C 603e:              -
C 604e:              -
C 75x (G3):          -
C 7400,7410 (G4):    1          simple load-use scheduling results in 0.75
C 744x,745x (G4+):   0.75
C ppc970:             0.75
C power4:             -
C power5:             -

C TODO
C  * Either start using the low-end masking constants, or remove them.
C  * Merge multiple feed-in cases into a parameterized code block.
C  * Reduce register usage.  It should be possible to almost halve it.
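
C  mpn_mod_34lsub1 returns a 32-bit value congruent to {up,n} modulo
C  2^24-1.  Since 2^24 == 1 (mod 2^24-1), the input may simply be split
C  into 24-bit fields which are then summed.  A minimal C sketch of the
C  intended semantics, assuming the gmp.h types with a 32-bit mp_limb_t
C  (ref_mod_34lsub1 is a hypothetical name, not GMP's generic code):
C
C	mp_limb_t
C	ref_mod_34lsub1 (const mp_limb_t *up, mp_size_t n)
C	{
C	  unsigned long long s = 0;	/* wide accumulator, cannot overflow */
C	  int b = 0;			/* bit offset of limb i: 32*i mod 24 */
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      unsigned long long v = (unsigned long long) up[i] << b;
C	      s += (v & 0xffffff) + (v >> 24);	/* v < 2^48: two fields */
C	      b = (b + 32) % 24;
C	    }
C	  while ((s >> 24) != 0)
C	    s = (s & 0xffffff) + (s >> 24);	/* fold below 2^24 */
C	  return s;
C	}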

define(`up', `r3')
define(`n', `r4')

define(`a0', `v3')
define(`a1', `v4')
define(`a2', `v5')
define(`c0', `v6')
define(`c1', `v7')
define(`c2', `v8')
define(`z',  `v9')
define(`x0', `v10')
define(`x1', `v11')
define(`x2', `v12')
define(`x3', `v13')
define(`pv', `v14')
define(`y0', `v0')
define(`y1', `v1')
define(`y2', `v2')
define(`y3', `v15')

ASM_START()
PROLOGUE(mpn_mod_34lsub1)
	cmpwi	cr0, n, 20		C tuned cutoff point
	bge	L(large)

	li	r9, 0			C result accumulator
	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
	beq	L(small_tail)
	mtctr	r10
	lwz	r6, 0(up)
	lwz	r7, 4(up)
	lwzu	r8, 8(up)
	subf	n, r10, n
	subf	n, r10, n
	subf	n, r10, n
	bdz	L(small_end)
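
C  Each iteration below folds three limbs (96 bits) into four 24-bit
C  fields which are added into r9; 2^24 == 1 (mod 2^24-1), so the fields
C  may simply be added.  At most floor(19/3) = 6 three-limb groups are
C  folded here (the last one at L(small_end)), each adding less than
C  2^26, so r9 cannot overflow.  One iteration in C (a sketch; u0, u1,
C  u2, s stand for r6, r7, r8, r9):
C
C	s += u0 & 0xffffff;			/* low 24 bits of u0   */
C	s += (u0 >> 24) | ((u1 & 0xffff) << 8);	/* 8b of u0, 16b of u1 */
C	s += (u1 >> 16) | ((u2 & 0xff) << 16);	/* 16b of u1, 8b of u2 */
C	s += u2 >> 8;				/* high 24 bits of u2  */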

	ALIGN(16)
L(los):	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	lwz	r6, 4(up)
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	lwz	r7, 8(up)
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	lwzu	r8, 12(up)
	add	r9, r9, r0		C add 24b from u2
	bdnz	L(los)
L(small_end):
	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	add	r9, r9, r0		C add 24b from u2

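C Step past the last limb consumed above, then fold r9 to under 2^25,
C r9 = (r9 >> 24) + (r9 & 0xffffff), so the tail additions cannot overflow.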
	addi	up, up, 4
	rlwinm	r0, r9, 0,8,31
	srwi	r9, r9, 24
	add	r9, r9, r0

L(small_tail):
	cmpwi	cr0, n, 1
	blt	L(ret)

	lwz	r6, 0(up)
	rlwinm	r0, r6, 0,8,31		C low 24 bits of u0
	srwi	r6, r6, 24		C high 8 bits of u0
	add	r9, r9, r0
	add	r9, r9, r6

	beq	L(ret)

	lwz	r6, 4(up)
	rlwinm	r0, r6, 8,8,23		C low 16 bits of u1, weight 2^8
	srwi	r6, r6, 16		C high 16 bits of u1, weight 2^0
	add	r9, r9, r0
	add	r9, r9, r6

L(ret):	mr	r3, r9
	blr


L(large):
	mfspr	r10, 256		C save VRSAVE
	oris	r0, r10, 0xffff		C set VRSAVE bits 0-15
	mtspr	256, r0

	andi.	r7, up, 15
	vxor	a0, v0, v0
	lis	r0, 0xaaaa
	vxor	a1, v0, v0
	ori	r0, r0, 0xaaab
	vxor	a2, v0, v0
	li	r5, 16
	vxor	c0, v0, v0
	li	r6, 32
	vxor	c1, v0, v0
	LEAL(	r11, cnsts)
	vxor	c2, v0, v0
	vxor	z, v0, v0

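C  The feed-in code below first aligns up to 16 bytes: the aligned block
C  containing up is loaded, the 1-3 limbs sitting below up are cleared by
C  the two vsldoi shifts, and n is biased (+9, +6 or +3) to compensate.
C  The iteration count divides by 12, the number of limbs consumed per
C  unrolled iteration, using the reciprocal 0xaaaaaaab = ceil(2^33/3)
C  built in r0 above.  A C sketch of that computation (variable names
C  are illustrative):
C
C	unsigned count = (unsigned) ((n * 0xaaaaaaabULL) >> 35);  /* floor(n/12)      */
C	unsigned left  = n - 12 * count;                          /* 0..11 tail limbs */
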
	beq	L(aligned16)

	cmpwi	cr7, r7, 8
	bge	cr7, L(na4)

	lvx	a2, 0, up
	addi	up, up, 16
	vsldoi	a2, a2, z, 4
	vsldoi	a2, z, a2, 12

	addi	n, n, 9
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(2)

L(na4):	bne	cr7, L(na8)

	lvx	a1, 0, up
	addi	up, up, -16
	vsldoi	a1, a1, z, 8
	vsldoi	a1, z, a1, 8

	addi	n, n, 6
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(1)

L(na8):
	lvx	a0, 0, up
	vsldoi	a0, a0, z, 12
	vsldoi	a0, z, a0, 4

	addi	n, n, 3
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(0)

L(aligned16):
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n

	lvx	a0, 0, up
L(0):	lvx	a1, r5, up
L(1):	lvx	a2, r6, up
	addi	up, up, 48
L(2):	bdz	L(end)
	li	r12, 256
	li	r9, 288
	ALIGN(32)
L(top):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up
	dcbt	up, r12
	dcbt	up, r9
	addi	up, up, 48
	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	bdnz	L(top)

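C  The loop above keeps the sums in carry-save form: vadduwm adds each
C  32-bit lane modulo 2^32 and vaddcuw captures the per-lane carry-out,
C  which is counted separately in c0/c1/c2, so no carry ever propagates
C  inside the loop.  Per lane this amounts to (a C sketch):
C
C	unsigned t = a + v;	/* vadduwm: modular lane add          */
C	c += t < a;		/* vaddcuw: carry-out of the lane add */
C	a = t;
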
L(end):
C n = 0...11
	cmpwi	cr0, n, 0
	beq	L(sum)
	cmpwi	cr0, n, 4
	ble	L(tail.1..4)
	cmpwi	cr0, n, 8
	ble	L(tail.5..8)

L(tail.9..11):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up

	addi	r8, r11, 96		C point r8 at the high-end masks
	rlwinm	r3, n, 4, 26, 27	C r3 = 16*(n mod 4)
	lvx	v11, r3, r8
	vand	v2, v2, v11		C mask off limbs beyond n

	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	b	L(sum)

L(tail.5..8):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up

	addi	r8, r11, 96		C point r8 at the high-end masks
	rlwinm	r3, n, 4, 26, 27	C r3 = 16*(n mod 4)
	lvx	v11, r3, r8
	vand	v1, v1, v11		C mask off limbs beyond n

	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10
	b	L(sum)

L(tail.1..4):
	lvx	v0, 0, up

	addi	r8, r11, 96		C point r8 at the high-end masks
	rlwinm	r3, n, 4, 26, 27	C r3 = 16*(n mod 4)
	lvx	v11, r3, r8
	vand	v0, v0, v11		C mask off limbs beyond n

	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

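C The vperm sequence below repacks the twelve 32-bit lane sums (a0-a2)
C and the twelve carry counts (c0-c2) into zero-extended 24-bit fields
C (see the diagrams after EPILOGUE); since 2^24 == 1 (mod 2^24-1) the
C fields can then be combined with plain word adds.  A carry lane weighs
C an extra 2^32, which mod 2^24-1 makes its field weights coincide with
C those of another accumulator's table; hence the pairings c2/a0, c0/a1
C and c1/a2 below.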
L(sum):	lvx	pv, 0, r11
	vperm	x0, a0, z, pv		C extract four 24-bit fields from a0
	vperm	y0, c2, z, pv		C extract four 24-bit fields from c2
	lvx	pv, r5, r11
	vperm	x1, a1, z, pv		C extract four 24-bit fields from a1
	vperm	y1, c0, z, pv		C extract four 24-bit fields from c0
	lvx	pv, r6, r11
	vperm	x2, a2, z, pv		C extract four 24-bit fields from a2
	vperm	y2, c1, z, pv		C extract four 24-bit fields from c1
	li	r9, 48			C use r9; r10 still holds VRSAVE
	lvx	pv, r9, r11
	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
	li	r9, 64
	lvx	pv, r9, r11
	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
	li	r9, 80
	lvx	pv, r9, r11
	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields

C Sum the eight vectors of repacked fields
	vadduwm	x0, x0, x1
	vadduwm	x2, x2, x3
	vadduwm	x0, x0, x2

	vadduwm	y0, y0, y1
	vadduwm	y2, y2, y3
	vadduwm	y0, y0, y2

	vadduwm	x0, x0, y0

C Sum the four 32-bit words of x0 (vsumsws puts the result in the last word)
	vsumsws	x0, x0, z

	li	r7, -16			C FIXME: do all ppc32 ABIs...
	stvx	x0, r7, r1		C FIXME: ...support storing below sp?
	lwz	r3, -4(r1)

	mtspr	256, r10		C restore VRSAVE
	blr
EPILOGUE()

C load	|      v0       |      v1       |      v2       |
C acc	|      a0       |      a1       |      a2       |
C carry	|      c0       |      c1       |      c2       |
C	| 0   1   2   3 | 4   5   6   7 | 8   9  10  11 |  128
C	|---|---|---|---|---|---|---|---|---|---|---|---|   32
C	|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |   24
C	|     |     |     |     |     |     |     |     |   48

C       $---------------$---------------$---------------$---------------$
C       |   .   .   .   .   .   .   .   .   .   .   .   .   .   .   .   |
C       |_______________________________________________________________|
C   |           |           |           |           |           |           |
C       <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->


DEF_OBJECT(cnsts,16)
C Permutation vectors in the order they are used above
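C An index byte 0x00-0x0f makes vperm pick that byte of its first source
C operand, 0x10-0x1f the corresponding byte of its second source.  The
C 0x10 entries thus pick byte 0 of z (zero) in the three whole-vector
C tables, zero-extending each 24-bit field to a 32-bit word; in the three
C "part" tables the 0x1x entries carry over bytes already placed in x3/y3.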
C #      00   01   02   03    04   05   06   07    08   09   0a   0b    0c   0d   0e   0f
 .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
 .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
 .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
 .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
 .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
 .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
C Masks for high end of number
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
C .byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C .byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)