1dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
2
3dnl  Copyright 2002, 2003, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C                cycles/limb
35C 603e:            ?
36C 604e:            3
37C 75x (G3):        3
38C 7400,7410 (G4):  3
39C 744x,745x (G4+): 3
40C power4/ppc970:   2.5
41C power5:          2.5
42
43C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
44C
45C There seems no need to schedule the loads back, the code is still 3.0 c/l
46C on 750/7400 no matter where they're placed.
47C
48C Alternatives:
49C
50C Fetching halfwords would allow plain add for accumulating, instead of
51C adde and its serialization.  An outer loop would be required though, since
52C 2^16 halfwords can overflow.  lhz+add would be 2.0 c/l, but if there's
53C also a bdz or bdnz for each and a pointer update say every three limbs
54C then the total would be 2.67 c/l which isn't much faster than the current
55C simpler code.
56
57ASM_START()
58PROLOGUE(mpn_mod_34lsub1)
59
	C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
	C
	C Return a value congruent to {src,size} modulo 2^24-1.  The result is
	C not fully reduced: for size==1 it is simply src[0], otherwise it is a
	C sum of pieces of at most 24 bits each.
	C
	C Limbs are handled as 32-bit words.  Limbs 0,3,6,... are summed into
	C acc0, limbs 1,4,7,... into acc1 and limbs 2,5,8,... into acc2.  Since
	C 2^24 == 1 mod 2^24-1, the accumulators have weights 1, 2^8 and 2^16
	C (2^32 == 2^8, 2^64 == 2^16, 2^96 == 1), and the tail code splits each
	C one at the bit where its weighted value reaches 2^24.
	C
	C size is assumed to be at least 1: src[0] is loaded unconditionally and
	C each bdz tests ctr only after decrementing it.
	C
	C Writes r0, r3-r7, ctr and CA; no memory is written.

60	C r3	src
61	C r4	size
62
63	mtctr	r4			C ctr = size, limbs still to be fetched
	C addic sets CA from the carry out of src+8, so CA ends up clear as
	C long as that address computation does not wrap.
64	addic	r6, r3, 8		C &src[2], and clear CA
65
66	lwz	r3, 0(r3)		C acc0 = src[0]
67	bdz	L(done)			C size == 1, return src[0] as is
68
69	lwz	r4, -4(r6)		C acc1 = src[1]
70	bdz	L(two)			C size == 2
71
72	lwz	r5, 0(r6)		C acc2 = src[2]
73	lis	r7, 0			C no carry if just three limbs
74
75	bdz	L(three)		C size == 3
	C Only reached when size >= 4.  r7 starts at the position a carry out
	C of acc1 would have (2^64 == 2^16) and is shifted down at the loop
	C exits to match whichever accumulator was added to last.
76	lis	r7, 1			C 0x10000 carry pos
77
78L(top):
79	C r3	acc0
80	C r4	acc1
81	C r5	acc2
82	C r6	src, incrementing
83	C r7	carry pos
84
	C Each pass adds up to three more limbs.  CA links all the addes into
	C one chain: acc0 -> acc1 -> acc2 and, across the bdnz, acc2 -> acc0
	C again, which is right since 2^96 == 1.  The loads and the ctr
	C branches in between do not alter CA.

85	lwz	r0, 4(r6)		C next limb for acc0
86	adde	r3, r3, r0
87	bdz	L(end0)
88
89	lwz	r0, 8(r6)		C next limb for acc1
90	adde	r4, r4, r0
91	bdz	L(end1)
92
93	lwzu	r0, 12(r6)		C next limb for acc2, and src += 12
94	adde	r5, r5, r0
95	bdnz	L(top)
96
97
	C Fell out of the loop after adding into acc2: the pending CA has
	C weight 2^96 == 1, so the carry position is shifted down twice, to 1.
98	srwi	r7, r7, 8
99L(end0):
	C Last add was into acc0: the pending CA has weight 2^32 == 2^8, so one
	C shift gives 0x100.
100	srwi	r7, r7, 8
101L(end1):
	C Last add was into acc1: the pending CA has weight 2^64 == 2^16 and r7
	C is already 0x10000.  srwi does not touch CA, so on every path CA still
	C holds the carry out of the final adde.
102	subfe	r0, r0, r0		C -1 if not CA
103
104	andc	r7, r7, r0		C final carry, 0x10000, 0x100, 1 or 0
105L(three):
	C r7 = carry term (0 when size == 3).  Fold the accumulators into it:
	C each is split at the bit where its weighted value reaches 2^24, the
	C low part is shifted up by the weight (0, 8 or 16 bits) and the high
	C part is moved down to start at bit 0.  The seven terms total less
	C than 2^27, so the adds cannot overflow.

106	rlwinm	r6, r3, 0,8,31		C acc0 low
107
108	add	r7, r7, r6
109	rlwinm	r6, r3, 8,24,31		C acc0 high
110
111	add	r7, r7, r6
112	rlwinm	r6, r4, 8,8,23		C acc1 low
113
114	add	r7, r7, r6
115	rlwinm	r6, r4, 16,16,31	C acc1 high
116
117	add	r7, r7, r6
118	rlwinm	r6, r5, 16,8,15		C acc2 low
119
120	add	r7, r7, r6
121	rlwinm	r6, r5, 24,8,31		C acc2 high
122
123	add	r3, r7, r6		C return value
124
125L(done):
	C Also the direct target for size == 1, where r3 is still src[0].
126	blr
127
128L(two):
129	C r3	acc0
130	C r4	acc1
131
	C size == 2: no adde has been executed, so there is no carry term and
	C no acc2, just the four pieces of acc0 and acc1.

132	rlwinm	r5, r3, 8,24,31		C acc0 high
133	rlwinm	r3, r3, 0,8,31		C acc0 low
134
135	add	r3, r3, r5		C acc0 high + low
136	rlwinm	r5, r4, 16,16,31	C acc1 high
137
138	add	r3, r3, r5		C add acc1 high
139	rlwinm	r5, r4, 8,8,23		C acc1 low
140
141	add	r3, r3, r5		C add acc1 low
142
143	blr
144
145EPILOGUE()
146