dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl  Copyright 2002, 2003, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

dnl  Presumably supplies the ASM_START, PROLOGUE, EPILOGUE, L and C macros
dnl  used by the rest of this file -- none of them is defined here.
include(`../config.m4')


C                cycles/limb
C 603e:            ?
C 604e:            3
C 75x (G3):        3
C 7400,7410 (G4):  3
C 744x,745x (G4+): 3
C power4/ppc970:   2.5
C power5:          2.5

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C There seems to be no need to schedule the loads back, the code is still
C 3.0 c/l on 750/7400 no matter where they're placed.
C
C Alternatives:
C
C Fetching halfwords would allow plain add for accumulating, instead of
C adde and its serialization.  An outer loop would be required though, since
C 2^16 halfwords can overflow.  lhz+add would be 2.0 c/l, but if there's
C also a bdz or bdnz for each and a pointer update say every three limbs
C then the total would be 2.67 c/l which isn't much faster than the current
C simpler code.

ASM_START()
PROLOGUE(mpn_mod_34lsub1)

	C Entry:
	C r3	src
	C r4	size, must be at least 1: bdz decrements CTR before testing
	C	it, so a count of zero would wrap instead of terminating
	C
	C Exit:
	C r3	a value congruent to {src,size} mod 2^24-1, not fully reduced
	C	(for size 1 the limb itself is returned)
	C
	C Scratch: r0, r4, r5, r6, r7, CTR and CA.
	C
	C Method: limb i is added with carry into accumulator i mod 3, so the
	C three accumulators have weights 2^0, 2^32 and 2^64.  Because
	C 2^24 == 1 mod 2^24-1 those weights are equivalent to 2^0, 2^8 and
	C 2^16, and the result is formed by adding up the 24-bit aligned
	C pieces of the accumulators plus the final carry.

	mtctr	r4			C CTR = limbs still to fetch
	addic	r6, r3, 8		C &src[2], and clear CA (src+8 not wrapping)

	lwz	r3, 0(r3)		C acc0 = src[0]
	bdz	L(done)			C size 1, return src[0] as is

	lwz	r4, -4(r6)		C acc1 = src[1]
	bdz	L(two)

	lwz	r5, 0(r6)		C acc2 = src[2]
	lis	r7, 0			C no carry if just three limbs

	bdz	L(three)
	lis	r7, 1			C 0x10000 carry pos

L(top):
	C r3	acc0
	C r4	acc1
	C r5	acc2
	C r6	src, incrementing
	C r7	carry pos
	C CA	carry from the previous adde, or 0 from the addic on entry
	C
	C Nothing between the adde instructions alters CA, so the carry out
	C of each accumulator feeds the next one, and acc2 feeds acc0 on the
	C following pass (2^96 == 1 mod 2^24-1).

	lwz	r0, 4(r6)
	adde	r3, r3, r0
	bdz	L(end0)

	lwz	r0, 8(r6)
	adde	r4, r4, r0
	bdz	L(end1)

	lwzu	r0, 12(r6)		C also steps r6 on by three limbs
	adde	r5, r5, r0
	bdnz	L(top)

	C The carry still pending in CA is worth 2^8, 2^16 or 1 mod 2^24-1
	C according to whether acc0, acc1 or acc2 was the last one added to.
	C r7 is moved from 0x10000 to the matching position.

	srwi	r7, r7, 8		C acc2 last: two shifts in all, giving 1
L(end0):
	srwi	r7, r7, 8		C acc0 last: one shift, giving 0x100
L(end1):
	subfe	r0, r0, r0		C -1 if not CA

	andc	r7, r7, r0		C final carry, 0x10000, 0x100, 1 or 0
L(three):
	C r7 is the running total, starting from the carry contribution.

	rlwinm	r6, r3, 0,8,31		C acc0 low

	add	r7, r7, r6
	rlwinm	r6, r3, 8,24,31		C acc0 high

	add	r7, r7, r6
	rlwinm	r6, r4, 8,8,23		C acc1 low

	add	r7, r7, r6
	rlwinm	r6, r4, 16,16,31	C acc1 high

	add	r7, r7, r6
	rlwinm	r6, r5, 16,8,15		C acc2 low

	add	r7, r7, r6
	rlwinm	r6, r5, 24,8,31		C acc2 high

	add	r3, r7, r6

L(done):
	blr

L(two):
	C r3	acc0
	C r4	acc1
	C
	C Two limbs only: no acc2 and no carry, since nothing was added.

	rlwinm	r5, r3, 8,24,31		C acc0 high
	rlwinm	r3, r3, 0,8,31		C acc0 low

	add	r3, r3, r5		C acc0 high + low
	rlwinm	r5, r4, 16,16,31	C acc1 high

	add	r3, r3, r5		C add acc1 high
	rlwinm	r5, r4, 8,8,23		C acc1 low

	add	r3, r3, r5		C add acc1 low

	blr

EPILOGUE()