dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
dnl  logical operations.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C               and,ior,andn,nior,xor    iorn,xnor         nand
C                   cycles/limb         cycles/limb    cycles/limb
C 7400,7410 (G4):       1.39                 ?              ?
C 744x,745x (G4+):      1.14                1.39           1.39
C 970:                  1.7                 2.0            2.0

C STATUS
C  * Works for all sizes and alignments for 32-bit limbs.
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
C  * Current performance makes this pointless for the 970.

C TODO
C  * Might want to make variants when just one of the source operands needs
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
C  * Idea: If the source operands are equally aligned, we could do the logops
C    first, then vperm before storing!  That means we never need more than one
C    vperm, ever!  (See the sketch after this list.)
C  * Perhaps align `rp' after the initial alignment loop?
C  * Instead of having scalar code at the beginning and end, consider using
C    read-modify-write vector code.
C  * Software pipeline?  Hopefully not too important, this is hairy enough
C    already.
C  * At least be more clever about operand loading, i.e., load v operands before
C    u operands, since v operands are sometimes negated.
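C
C    A rough sketch of that equal-alignment idea, in the pseudocode style of
C    the notes at the end of this file (hypothetical, not implemented here;
C    `sperm' and `t_prev' are made-up names).  It relies on a bitwise logop
C    commuting with a byte permutation that is applied to both of its inputs:
C
C	if ((up & 15) == (vp & 15))
C	  sperm = lvsl (0, up)			C one permute vector serves both
C	  for each aligned 16-byte block
C	    u = lvx from up			C raw loads, no input vperm
C	    v = lvx from vp
C	    t = logop (u, v)
C	    stvx vperm (t_prev, t, sperm) to rp	C single vperm, on the store
C	    t_prev = t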

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
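C With 32-bit limbs these evaluate to GMP_LIMB_BYTES = 4, LIMBS_PER_VR = 4 and
C LIMBS_PER_2VR = 8; with 64-bit limbs they are 8, 2 and 4 respectively, i.e.,
C one 16-byte vector register holds 4 or 2 limbs.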

define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null

ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',	`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')
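C Classic AltiVec provides vand, vandc, vor, vnor and vxor, but no single
C instruction for nand, orc (iorn) or eqv (xnor).  Those three are therefore
C synthesised from a plain logop plus a vnor-based complement: vnegb
C complements the v operand before the logop (iorn, xnor), while vnega
C complements the result after it (nand).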

ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`vp',	`r5')
define(`n',	`r6')
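C The registers above carry the usual mpn logop arguments, roughly
C
C	void func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C
C computing rp[i] = up[i] logop vp[i] for i = 0 .. n-1, with n >= 1 assumed
C as is customary for mpn calls.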

define(`us',	`v8')
define(`vs',	`v9')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
PROLOGUE(func)

LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)
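C The cut-offs above are two vector registers' worth of limbs (8 32-bit limbs
C or 4 64-bit limbs); anything smaller is handled by the scalar loop below.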

	mtctr	n

LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)		')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C Set VRSAVE bits 0-13 FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This executes at most
C once on 64-bit machines and at most three times on 32-bit machines.
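C For example, with 32-bit limbs and rp 4 bytes past a 16-byte boundary,
C r0 = 1 below, so r7 = LIMBS_PER_VR - r0 = 3 limbs are handled here before
C the vector loop takes over at the next boundary.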

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR
LIMB32(`li	r10, 0		')
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)		')

	addi	rp, rp, 16		C update rp, but preserve its alignment
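C The low four bits of rp may remain set here because lvx/stvx ignore the low
C four bits of the effective address; adding 16 moves rp past the limbs just
C stored while the following stvx still hits the intended 16-byte boundary.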

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count: whole vectors in n
LIMB32(`srwi	r7, n, 2	')	C loop count: whole vectors in n
	mtctr	r7			C copy vector count to count register

	li	r10, 16
	lvsl	us, 0, up
	lvsl	vs, 0, vp
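C lvsl builds the permute control vectors from the low four bits of up and vp;
C the vperms below use them to shift each pair of raw 16-byte loads so that
C the limb data lines up with the 16-byte aligned stores through rp.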

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4
	b	L(tail)
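C The seemingly short advance of rp by 4 here (and by 20 in the epilogue
C below) is deliberate: L(tail) rounds rp up to a 16-byte boundary before the
C scalar cleanup, so rp only has to be nudged into the 16-byte block that ends
C where the next unwritten limb begins.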

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)

	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4
	b	L(tail)

L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
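C rp was left pointing into (or at the end of) the last 16-byte block already
C stored; adding 15 and clearing the low four bits rounds it up to the next
C boundary, which is where the 1-3 (32-bit) or single (64-bit) remaining
C limbs belong.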
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()

C This works for 64-bit PowerPC, since a limb ptr can only be aligned
C in 2 relevant ways, which means we can always find a pair of aligned
C pointers of rp, up, and vp.
C process limbs until rp is 16-byte aligned
C if (((up | vp) & 15) == 0)
C   process with VMX without any vperm
C else if ((up & 15) != 0 && (vp & 15) != 0)
C   process with VMX using vperm on store data
C else if ((up & 15) != 0)
C   process with VMX using vperm on up data
C else
C   process with VMX using vperm on vp data
C
C	rlwinm.	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7