dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl  Copyright 2002, 2003, 2005, 2006, 2007, 2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C                cycles/limb
C 603e:              -
C 604e:              -
C 75x (G3):          -
C 7400,7410 (G4):    1          simple load-use scheduling results in 0.75
C 744x,745x (G4+):   0.75
C ppc970:            0.75
C power4:            -
C power5:            -

C TODO
C  * Either start using the low-end masking constants, or remove them.
C  * Merge multiple feed-in cases into a parameterized code block.
C  * Reduce register usage.  It should be possible to almost halve it.

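C The function returns a value congruent to {up,n} mod 2^24-1, relying on
C 2^24 == 1 (mod 2^24-1): the operand is cut into 24-bit fields which are
C then summed.  A rough C-level sketch of the computation (an editorial
C illustration assuming 32-bit limbs, not GMP's reference code; the real
C function need not fully reduce its return value, and the 64-bit
C accumulator here ignores overflow for very large n):
C
C	unsigned long
C	mod_34lsub1_ref (const unsigned long *up, long n)
C	{
C	  unsigned long long s = 0;
C	  for (long i = 0; i < n; i++)        /* limb i has weight 2^(32i), */
C	    s += (unsigned long long) up[i]   /* and 2^(32i) == 2^(8(i%3))  */
C	         << (8 * (i % 3));            /* (mod 2^24-1)               */
C	  while (s >> 24)                     /* fold 24-bit fields         */
C	    s = (s & 0xffffff) + (s >> 24);
C	  return s;
C	}
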
define(`up', `r3')
define(`n', `r4')

define(`a0', `v3')
define(`a1', `v4')
define(`a2', `v5')
define(`c0', `v6')
define(`c1', `v7')
define(`c2', `v8')
define(`z',  `v9')
define(`x0', `v10')
define(`x1', `v11')
define(`x2', `v12')
define(`x3', `v13')
define(`pv', `v14')
define(`y0', `v0')
define(`y1', `v1')
define(`y2', `v2')
define(`y3', `v15')

ASM_START()
PROLOGUE(mpn_mod_34lsub1)
	cmpwi	cr0, n, 20		C tuned cutoff point
	bge	L(large)

	li	r9, 0			C result accumulator
	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
	beq	L(small_tail)
	mtctr	r10
	lwz	r6, 0(up)
	lwz	r7, 4(up)
	lwzu	r8, 8(up)
	subf	n, r10, n		C n -= 3 * floor(n/3),
	subf	n, r10, n		C ...leaving the 0..2
	subf	n, r10, n		C ...tail limbs
	bdz	L(small_end)

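C Small-operand loop: each iteration folds three limbs (96 bits, i.e. four
C 24-bit fields) into the single accumulator r9.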
	ALIGN(16)
L(los):	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	lwz	r6, 4(up)
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	lwz	r7, 8(up)
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	lwzu	r8, 12(up)
	add	r9, r9, r0		C add 24b from u2
	bdnz	L(los)
L(small_end):
	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	add	r9, r9, r0		C add 24b from u2

	addi	up, up, 4		C step past the last limb read
	rlwinm	r0, r9, 0,8,31		C fold the accumulator:
	srwi	r9, r9, 24		C ...r9 = (r9 & 0xffffff)
	add	r9, r9, r0		C ...     + (r9 >> 24)

L(small_tail):
	cmpwi	cr0, n, 1		C n = 0..2 limbs remain
	blt	L(ret)

	lwz	r6, 0(up)
	rlwinm	r0, r6, 0,8,31		C low 24 bits of u0
	srwi	r6, r6, 24		C high 8 bits of u0
	add	r9, r9, r0
	add	r9, r9, r6

	beq	L(ret)

	lwz	r6, 4(up)
	rlwinm	r0, r6, 8,8,23		C low 16 bits of u1, weight 2^8
	srwi	r6, r6, 16		C high 16 bits of u1, weight 2^24 == 1
	add	r9, r9, r0
	add	r9, r9, r6

L(ret):	mr	r3, r9
	blr


L(large):
	mfspr	r10, 256		C save VRSAVE
	oris	r0, r10, 0xffff		C set VRSAVE bits 0-15
	mtspr	256, r0
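
C The vector loop handles 12 limbs (three 16-byte vectors) per iteration,
C accumulating 32-bit word sums in a0-a2 and their carry-outs in c0-c2;
C see the layout diagrams at the end of the file.  Misaligned operands are
C handled by zero-padding the first vector and crediting n with the limbs
C the first loop pass does not really cover.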

	andi.	r7, up, 15		C r7 = up mod 16
	vxor	a0, v0, v0		C zero the accumulators...
	lis	r9, 0xaaaa
	vxor	a1, v0, v0
	ori	r9, r9, 0xaaab		C r9 = 0xaaaaaaab = (2^33+1)/3
	vxor	a2, v0, v0
	li	r5, 16			C r5, r6 = vector load offsets
	vxor	c0, v0, v0
	li	r6, 32
	vxor	c1, v0, v0
	LEAL(	r11, cnsts)		C CAUTION clobbers r0 for elf, darwin
	vxor	c2, v0, v0
	vxor	z, v0, v0		C ...and the zero vector

	beq	L(aligned16)

	cmpwi	cr7, r7, 8
	bge	cr7, L(na4)

C up = 4 (mod 16): the aligned load pulls in one bogus leading word
	lvx	a2, 0, up
	addi	up, up, 16
	vsldoi	a2, a2, z, 4		C zero the bogus word,
	vsldoi	a2, z, a2, 12		C ...keeping the 3 limbs in place

	addi	n, n, 9			C first pass covers 3 limbs, credit 9
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n		C n = remaining tail limbs
	b	L(2)

L(na4):	bne	cr7, L(na8)

C up = 8 (mod 16): the aligned load pulls in two bogus leading words
	lvx	a1, 0, up
	addi	up, up, -16		C bias up for the entry at L(1)
	vsldoi	a1, a1, z, 8		C zero the two bogus words,
	vsldoi	a1, z, a1, 8		C ...keeping the 2 limbs in place

	addi	n, n, 6			C first pass covers 6 limbs, credit 6
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n		C n = remaining tail limbs
	b	L(1)

L(na8):
C up = 12 (mod 16): the aligned load pulls in three bogus leading words
	lvx	a0, 0, up
	vsldoi	a0, a0, z, 12		C zero the three bogus words,
	vsldoi	a0, z, a0, 4		C ...keeping the single limb in place

	addi	n, n, 3			C first pass covers 9 limbs, credit 3
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n		C n = remaining tail limbs
	b	L(0)

L(aligned16):
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n		C n = remaining tail limbs

	lvx	a0, 0, up
L(0):	lvx	a1, r5, up
L(1):	lvx	a2, r6, up
	addi	up, up, 48
L(2):	bdz	L(end)
	li	r12, 256		C prefetch distances
	li	r9, 288
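
C Carry-save inner loop: vadduwm adds four 32-bit words at a time while
C vaddcuw captures the corresponding carry-outs, which accumulate in c0-c2
C (a carry bit has weight 2^32 == 2^8 mod 2^24-1) and are folded in after
C the loop.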
	ALIGN(32)
L(top):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up
	dcbt	up, r12
	dcbt	up, r9
	addi	up, up, 48
	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	bdnz	L(top)

L(end):
C n = 0..11 limbs remain
	cmpwi	cr0, n, 0
	beq	L(sum)
	cmpwi	cr0, n, 4
	ble	L(tail.1..4)
	cmpwi	cr0, n, 8
	ble	L(tail.5..8)

L(tail.9..11):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up

	addi	r8, r11, 96		C r8 = high-end masks
	rlwinm	r3, n, 4, 26,27		C r3 = 16 * (n mod 4)
	lvx	v11, r3, r8
	vand	v2, v2, v11		C mask the 1..3 valid limbs

	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	b	L(sum)

L(tail.5..8):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up

	addi	r8, r11, 96		C r8 = high-end masks
	rlwinm	r3, n, 4, 26,27		C r3 = 16 * (n mod 4)
	lvx	v11, r3, r8
	vand	v1, v1, v11		C mask the 1..4 valid limbs

	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10
	b	L(sum)

L(tail.1..4):
	lvx	v0, 0, up

	addi	r8, r11, 96		C r8 = high-end masks
	rlwinm	r3, n, 4, 26,27		C r3 = 16 * (n mod 4)
	lvx	v11, r3, r8
	vand	v0, v0, v11		C mask the 1..4 valid limbs

	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

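C Repack the word sums into zero-extended 24-bit fields using the vperm
C tables in cnsts.  Each carry accumulator uses the table of the next
C higher vector (c0 with a1's table, etc), since a carry out of a word
C has the weight of the following limb.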
L(sum):	lvx	pv, 0, r11
	vperm	x0, a0, z, pv		C extract 4 24-bit fields from a0
	vperm	y0, c2, z, pv		C extract 4 24-bit fields from c2
	lvx	pv, r5, r11
	vperm	x1, a1, z, pv		C extract 4 24-bit fields from a1
	vperm	y1, c0, z, pv		C extract 4 24-bit fields from c0
	lvx	pv, r6, r11
	vperm	x2, a2, z, pv		C extract 4 24-bit fields from a2
	vperm	y2, c1, z, pv		C extract 4 24-bit fields from c1
	li	r7, 48			C use r7; r10 still holds VRSAVE
	lvx	pv, r7, r11
	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
	li	r7, 64
	lvx	pv, r7, r11
	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
	li	r7, 80
	lvx	pv, r7, r11
	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields

C We now have eight 128-bit accumulators to sum: x0-x3 and y0-y3
	vadduwm	x0, x0, x1
	vadduwm	x2, x2, x3
	vadduwm	x0, x0, x2

	vadduwm	y0, y0, y1
	vadduwm	y2, y2, y3
	vadduwm	y0, y0, y2

	vadduwm	x0, x0, y0

C Sum the four 32-bit fields of x0 into its low word (word 3 of z is zero)
	vsumsws	x0, x0, z

	li	r7, -16			C FIXME: do all ppc32 ABIs...
	stvx	x0, r7, r1		C FIXME: ...support storing below sp?
	lwz	r3, -4(r1)		C low word of x0

	mtspr	256, r10		C restore VRSAVE
	blr
EPILOGUE()

C load	|      v0       |      v1       |      v2       |
C acc	|      a0       |      a1       |      a2       |
C carry	|      c0       |      c1       |      c2       |
C	| 0   1   2   3 | 4   5   6   7 | 8   9  10  11 |  128
C	|---|---|---|---|---|---|---|---|---|---|---|---|   32
C	|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |   24
C	|     |     |     |     |     |     |     |     |   48

C       $---------------$---------------$---------------$---------------$
C       |   .   .   .   .   .   .   .   .   .   .   .   .   .   .   .   |
C       |_______________________________________________________________|
C       |       |           |           |           |           |       |
C       <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->


DEF_OBJECT(cnsts,16)
C Permutation vectors in the order they are used above
C #      00   01   02   03    04   05   06   07    08   09   0a   0b    0c   0d   0e   0f
 .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
 .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
 .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
 .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
 .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
 .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
C Masks for the high end of the number
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for the low end of the number (currently unused, see TODO above)
C .byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C .byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)