1dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
2
3dnl  Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34
35C                cycles/limb
36C 603e:              -
37C 604e:              -
38C 75x (G3):          -
39C 7400,7410 (G4):    1          simple load-use scheduling results in 0.75
40C 744x,745x (G4+):   0.75
41C ppc970:            0.75
42C power4:            -
43C power5:            -
44
45C TODO
46C  * Either start using the low-end masking constants, or remove them.
47C  * Merge multiple feed-in cases into a parameterized code block.
48C  * Reduce register usage.  It should be possible to almost halve it.
49
C Symbolic register names used by mpn_mod_34lsub1 below.
C
C Integer arguments: up is the limb pointer, n the limb count.
50define(`up', `r3')
51define(`n', `r4')
52
C Vector path: a0..a2 accumulate limbs 0-3, 4-7 and 8-11 of each 12-limb
C group with modular 32-bit adds; c0..c2 accumulate the carry-outs of those
C adds, lane by lane.  z is kept all zero.
53define(`a0', `v3')
54define(`a1', `v4')
55define(`a2', `v5')
56define(`c0', `v6')
57define(`c1', `v7')
58define(`c2', `v8')
59define(`z',  `v9')
C x0..x3 and y0..y3 are only used under these names in the final summation,
C holding permuted pieces of a0..a2 and of c0..c2 respectively; pv holds the
C current permutation control vector.  Note the aliasing with raw register
C names used earlier in the code: v10 (x0) is the carry temporary of the
C main loop and tails, v11 (x1) the tail mask, and v0..v2 (y0..y2) the load
C destinations.
60define(`x0', `v10')
61define(`x1', `v11')
62define(`x2', `v12')
63define(`x3', `v13')
64define(`pv', `v14')
65define(`y0', `v0')
66define(`y1', `v1')
67define(`y2', `v2')
68define(`y3', `v15')
69
C mpn_mod_34lsub1 (up, n)
C
C In:   r3 = up, address of the limb with weight 2^0 (32-bit limbs)
C       r4 = n, number of limbs
C Out:  r3 = sum of the 24-bit fields of the number.  Since 2^24 == 1
C       (mod 2^24-1) this sum is congruent to the number modulo 2^24-1; it
C       is returned without being reduced below 2^24.
C
C n < 20:   scalar code, three limbs (four 24-bit fields) per iteration.
C n >= 20:  AltiVec code, twelve limbs per iteration, using three vector
C           accumulators a0..a2 and per-lane carry counts c0..c2.
C
C The vector path keeps the VRSAVE value found on entry in r10 and writes it
C back on exit, so r10 must not be used for anything else on that path.
70ASM_START()
71PROLOGUE(mpn_mod_34lsub1)
72	cmpwi	cr0, n, 20		C tuned cutoff point
73	bge	L(large)
74
C ---- scalar path ----
75	li	r9, 0			C result accumulator
76	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
77	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
78	beq	L(small_tail)
79	mtctr	r10
C Software pipelined: the first three limbs are loaded here, each loop
C iteration sums the three limbs in r6..r8 while loading the next three.
80	lwz	r6, 0(up)
81	lwz	r7, 4(up)
82	lwzu	r8, 8(up)
83	subf	n, r10, n
84	subf	n, r10, n
85	subf	n, r10, n		C n = n - 3*floor(n/3), limbs left for the tail
86	bdz	L(small_end)
87
88	ALIGN(16)
89L(los):	rlwinm	r0, r6, 0,8,31
90	add	r9, r9, r0		C add 24b from u0
91	srwi	r0, r6, 24
92	lwz	r6, 4(up)
93	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
94	add	r9, r9, r0		C add 8b from u0 and 16b from u1
95	srwi	r0, r7, 16
96	lwz	r7, 8(up)
97	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
98	add	r9, r9, r0		C add 16b from u1 and 8b from u2
99	srwi	r0, r8, 8		C --222222
100	lwzu	r8, 12(up)
101	add	r9, r9, r0		C add 24b from u2
102	bdnz	L(los)
C Last group of three: same arithmetic as the loop body, without loads.
103L(small_end):
104	rlwinm	r0, r6, 0,8,31
105	add	r9, r9, r0		C add 24b from u0
106	srwi	r0, r6, 24
107	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
108	add	r9, r9, r0		C add 8b from u0 and 16b from u1
109	srwi	r0, r7, 16
110	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
111	add	r9, r9, r0		C add 16b from u1 and 8b from u2
112	srwi	r0, r8, 8		C --222222
113	add	r9, r9, r0		C add 24b from u2
114
C up was left on the last limb loaded into r8; step past it.  Then fold the
C accumulator once: r9 = (r9 mod 2^24) + (r9 >> 24).
115	addi	up, up, 4
116	rlwinm	r0, r9, 0,8,31
117	srwi	r9, r9, 24
118	add	r9, r9, r0
119
C 0, 1 or 2 limbs remain.
120L(small_tail):
121	cmpi	cr0, n, 1
122	blt	L(ret)
123
124	lwz	r6, 0(up)
125	rlwinm	r0, r6, 0,8,31		C low 24 bits
126	srwi	r6, r6, 24		C high 8 bits, weight 2^24 == 1
127	add	r9, r9, r0
128	add	r9, r9, r6
129
130	beq	L(ret)			C cr0 still holds the compare of n with 1
131
132	lwz	r6, 4(up)
133	rlwinm	r0, r6, 8,8,23		C low 16 bits, weight 2^32 == 2^8
134	srwi	r6, r6, 16		C high 16 bits, weight 2^48 == 1
135	add	r9, r9, r0
136	add	r9, r9, r6
137
138L(ret):	mr	r3, r9
139	blr
140
141
C ---- AltiVec path ----
C Allocate a 32-byte frame (used for the final vector store) and mark
C v0-v15 as live in VRSAVE.  r10 keeps the previous VRSAVE until the exit.
142L(large):
143	stwu	r1, -32(r1)
144	mfspr	r10, 256
145	oris	r0, r10, 0xffff		C Set VRSAVE bit 0-15
146	mtspr	256, r0
147
C r7 = up mod 16, r9 = 0xaaaaaaab (multiplier for the division by 12 below),
C r5 = 16 and r6 = 32 (lvx offsets), r11 = address of cnsts.  Accumulators,
C carry counters and z are cleared.
148	andi.	r7, up, 15
149	vxor	a0, v0, v0
150	lis	r9, 0xaaaa
151	vxor	a1, v0, v0
152	ori	r9, r9, 0xaaab
153	vxor	a2, v0, v0
154	li	r5, 16
155	vxor	c0, v0, v0
156	li	r6, 32
157	vxor	c1, v0, v0
158	LEAL(	r11, cnsts)		C CAUTION clobbers r0 for elf, darwin
159	vxor	c2, v0, v0
160	vxor	z, v0, v0
161
162	beq	L(aligned16)
163
C Unaligned feed-in.  lvx ignores the low four address bits, so the first
C quadword holds some words below up followed by the first limbs.  Each case
C clears the words below up with a pair of vsldoi and places the vector in
C the accumulator where the first limb keeps a field-aligned weight, which
C amounts to prepending 9, 6 or 3 zero limbs (288, 192 or 96 bits, each a
C multiple of 24).  n is increased accordingly.
C The cases below assume up is at least 4-byte aligned.
164	cmpwi	cr7, r7, 8
165	bge	cr7, L(na4)
166
C up mod 16 = 4: one word below up, then three limbs.  Acts as a2.
167	lvx	a2, 0, up
168	addi	up, up, 16
169	vsldoi	a2, a2, z, 4
170	vsldoi	a2, z, a2, 12
171
172	addi	n, n, 9
173	mulhwu	r0, n, r9
174	srwi	r0, r0, 3		C r0 = floor(n/12)
175	mtctr	r0
176
177	mulli	r8, r0, 12
178	subf	n, r8, n
179	b	L(2)
180
181L(na4):	bne	cr7, L(na8)
182
C up mod 16 = 8: two words below up, then two limbs.  Acts as a1.  up is
C moved back by 16 so that the load at L(1), offset r6 = 32, fetches the
C next quadword.
183	lvx	a1, 0, up
184	addi	up, up, -16
185	vsldoi	a1, a1, z, 8
186	vsldoi	a1, z, a1, 8
187
188	addi	n, n, 6
189	mulhwu	r0, n, r9
190	srwi	r0, r0, 3		C r0 = floor(n/12)
191	mtctr	r0
192
193	mulli	r8, r0, 12
194	subf	n, r8, n
195	b	L(1)
196
C up mod 16 = 12: three words below up, then one limb.  Acts as a0.
197L(na8):
198	lvx	a0, 0, up
199	vsldoi	a0, a0, z, 12
200	vsldoi	a0, z, a0, 4
201
202	addi	n, n, 3
203	mulhwu	r0, n, r9
204	srwi	r0, r0, 3		C r0 = floor(n/12)
205	mtctr	r0
206
207	mulli	r8, r0, 12
208	subf	n, r8, n
209	b	L(0)
210
211L(aligned16):
212	mulhwu	r0, n, r9
213	srwi	r0, r0, 3		C r0 = floor(n/12)
214	mtctr	r0
215
216	mulli	r8, r0, 12
217	subf	n, r8, n		C n = limbs left after the 12-limb groups
218
C The first 12-limb group is loaded straight into the (zeroed) accumulators.
C ctr counts groups, so the bdz at L(2) accounts for this one.
219	lvx	a0, 0, up
220L(0):	lvx	a1, r5, up
221L(1):	lvx	a2, r6, up
222	addi	up, up, 48
223L(2):	bdz	L(end)
224	li	r12, 256		C dcbt prefetch offsets
225	li	r9, 288
226	ALIGN(32)
C Main loop: 12 limbs per iteration.  vaddcuw yields the carry-out of each
C 32-bit lane in v10, which is counted in c0..c2.
227L(top):
228	lvx	v0, 0, up
229	vaddcuw	v10, a0, v0
230	vadduwm	a0, a0, v0
231	vadduwm	c0, c0, v10
232
233	lvx	v1, r5, up
234	vaddcuw	v10, a1, v1
235	vadduwm	a1, a1, v1
236	vadduwm	c1, c1, v10
237
238	lvx	v2, r6, up
239	dcbt	up, r12
240	dcbt	up, r9
241	addi	up, up, 48
242	vaddcuw	v10, a2, v2
243	vadduwm	a2, a2, v2
244	vadduwm	c2, c2, v10
245	bdnz	L(top)
246
C Tail: the last, possibly partial, quadword is ANDed with a mask that keeps
C only the limbs belonging to the number.  The mask is fetched from cnsts+96
C at index r3 = (n mod 4)*16; reusing r3 is fine since up is not needed after
C the last load.
247L(end):
248C n = 0...11
249	cmpwi	cr0, n, 0
250	beq	L(sum)
251	cmpwi	cr0, n, 4
252	ble	L(tail.1..4)
253	cmpwi	cr0, n, 8
254	ble	L(tail.5..8)
255
256L(tail.9..11):
257	lvx	v0, 0, up
258	vaddcuw	v10, a0, v0
259	vadduwm	a0, a0, v0
260	vadduwm	c0, c0, v10
261
262	lvx	v1, r5, up
263	vaddcuw	v10, a1, v1
264	vadduwm	a1, a1, v1
265	vadduwm	c1, c1, v10
266
267	lvx	v2, r6, up
268
269	addi	r8, r11, 96
270	rlwinm	r3, n ,4,26,27
271	lvx	v11, r3, r8
272	vand	v2, v2, v11
273
274	vaddcuw	v10, a2, v2
275	vadduwm	a2, a2, v2
276	vadduwm	c2, c2, v10
277	b	L(sum)
278
279L(tail.5..8):
280	lvx	v0, 0, up
281	vaddcuw	v10, a0, v0
282	vadduwm	a0, a0, v0
283	vadduwm	c0, c0, v10
284
285	lvx	v1, r5, up
286
287	addi	r8, r11, 96
288	rlwinm	r3, n ,4,26,27
289	lvx	v11, r3, r8
290	vand	v1, v1, v11
291
292	vaddcuw	v10, a1, v1
293	vadduwm	a1, a1, v1
294	vadduwm	c1, c1, v10
295	b	L(sum)
296
297L(tail.1..4):
298	lvx	v0, 0, up
299
300	addi	r8, r11, 96
301	rlwinm	r3, n ,4,26,27
302	lvx	v11, r3, r8
303	vand	v0, v0, v11
304
305	vaddcuw	v10, a0, v0
306	vadduwm	a0, a0, v0
307	vadduwm	c0, c0, v10
308
C Final summation.  The permutation vectors at cnsts+0..80 cut a0, a1, a2
C into 24-bit fields placed in 32-bit lanes with a zero top byte (x0..x3).
C A carry out of the word at limb position p has the weight of position p+1,
C so the carry counters are cut with the pattern of the following
C accumulator: c0 like a1, c1 like a2 and c2 (wrapping around, 12 limbs being
C a multiple of 24 bits) like a0, giving y0..y3.
C The table offsets 48, 64, 80 go in r8, which is dead here.  r10 must not be
C used since it still holds the VRSAVE value to restore at the exit.
309L(sum):	lvx	pv, 0, r11
310	vperm	x0, a0, z, pv		C extract 4 24-bit field from a0
311	vperm	y0, c2, z, pv		C same pattern for carries out of a2
312	lvx	pv, r5, r11
313	vperm	x1, a1, z, pv		C extract 4 24-bit field from a1
314	vperm	y1, c0, z, pv		C same pattern for carries out of a0
315	lvx	pv, r6, r11
316	vperm	x2, a2, z, pv		C extract 4 24-bit field from a2
317	vperm	y2, c1, z, pv		C same pattern for carries out of a1
318	li	r8,  48
319	lvx	pv, r8, r11
320	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
321	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
322	li	r8,  64
323	lvx	pv, r8, r11
324	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
325	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
326	li	r8,  80
327	lvx	pv, r8, r11
328	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
329	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields
330
331C We now have 4 128-bit accumulators to sum
332	vadduwm	x0, x0, x1
333	vadduwm	x2, x2, x3
334	vadduwm	x0, x0, x2
335
336	vadduwm	y0, y0, y1
337	vadduwm	y2, y2, y3
338	vadduwm	y0, y0, y2
339
340	vadduwm	x0, x0, y0
341
342C Reduce 32-bit fields
343	vsumsws	x0, x0, z
344
C The total is in the last word of x0.  Store the vector in the frame at
C r1+16 and fetch that word, bytes 12-15 of the vector, from 28(r1).
345	li	r7, 16
346	stvx	x0, r7, r1
347	lwz	r3, 28(r1)
348
349	mtspr	256, r10		C restore VRSAVE as found on entry
350	addi	r1, r1, 32
351	blr
352EPILOGUE()
353
354C load	|      v0       |      v1       |      v2       |
355C acc	|      a0       |      a1       |      a2       |
356C carry	|      c0       |      c1       |      c2       |
357C	| 0   1   2   3 | 4   5   6   7 | 8   9  10  11 |  128
358C	|---|---|---|---|---|---|---|---|---|---|---|---|   32
359C	|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |   24
360C	|     |     |     |     |     |     |     |     |   48
361
362C       $---------------$---------------$---------------$---------------$
363C       |   .   .   .   .   .   .   .   .   .   .   .   .   .   .   .   |
364C       |_______________________________________________________________|
365C   |           |           |           |           |           |           |
366C       <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
367
368
C Constant table for the vector path of mpn_mod_34lsub1, addressed through
C r11.  Byte offsets 0..95 hold six vperm control vectors, offsets 96..159
C four masks.  All entries are fetched with lvx.
C
C In a vperm control byte, values 0x00-0x0f select a byte of the first
C source vector and 0x10-0x1f a byte of the second one.  The first three
C vectors are used with the zero vector as second source, so 0x10 there
C yields a zero byte (the top byte of each 24-bit field lane).  The three
C "part" vectors gather the bytes not covered by the first three: the first
C is again used with zero as second source, the other two with the partial
C result so far as second source, so 0x11-0x1b there pass earlier bytes on.
369DEF_OBJECT(cnsts,16)
370C Permutation vectors in the order they are used above
371C #      00   01   02   03    04   05   06   07    08   09   0a   0b    0c   0d   0e   0f
372 .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
373 .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
374 .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
375 .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
376 .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
377 .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
C The tail code indexes the following masks with (n mod 4)*16, n being the
C number of limbs left.  Entry 0 keeps all four words (n a multiple of 4),
C entries 1, 2 and 3 keep the first one, two and three words.
378C Masks for high end of number
379 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
380 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
381 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
382 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C The low-end masks below are commented out and not referenced by the code;
C the unaligned feed-in clears the leading words with vsldoi instead.
383C Masks for low end of number
384C .byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
385C .byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
386C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
387C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
388END_OBJECT(cnsts)
389