dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
dnl  logical operations.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C               and,ior,andn,nior,xor    iorn,xnor         nand
C                   cycles/limb         cycles/limb    cycles/limb
C 7400,7410 (G4):       1.39                 ?              ?
C 744x,745x (G4+):      1.14                1.39           1.39
C 970:                  1.7                 2.0            2.0

C STATUS
C  * Works for all sizes and alignments for 32-bit limbs.
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
C  * Current performance makes this pointless for 970.

C TODO
C  * Might want to make variants when just one of the source operands needs
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
C  * Idea: If the source operands are equally aligned, we could do the logops
C    first, then vperm before storing!  That means we never need more than one
C    vperm, ever!  (See the sketch after this list.)
C  * Perhaps align `rp' after initial alignment loop?
C  * Instead of having scalar code in the beginning and end, consider using
C    read-modify-write vector code.
C  * Software pipeline?  Hopefully not too important, this is hairy enough
C    already.
C  * At least be more clever about operand loading, i.e., load v operands before
C    u operands, since v operands are sometimes negated.
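C
C  A rough sketch of the equal-alignment idea above (untested, not what the
C  code below does; `perm', `w0', `w1' are placeholder names, and vnegb/vnega
C  are elided).  When (up & 15) == (vp & 15), both raw aligned loads are
C  shifted by the same amount, and since the logops work bytewise they commute
C  with the shuffle, so a single vperm per result vector would suffice:
C
C	lvsl	perm, 0, up		C shared misalignment as permute control
C  then, per 16-byte block, with w0 holding the previous block's logop result:
C	lvx	v0, 0, up		C raw aligned block of up
C	lvx	v1, 0, vp		C raw aligned block of vp
C	logop(	w1, v0, v1)		C operate on unshuffled data
C	vperm	v6, w0, w1, perm	C one shuffle aligns the result
C	stvx	v6, 0, rp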

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
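C A VMX register holds 16 bytes, so LIMBS_PER_VR is 4 and LIMBS_PER_2VR is 8
C with 32-bit limbs, and 2 resp. 4 with 64-bit limbs.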

define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null

ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')
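C Each operation is built from a scalar instruction (logopS), a vector
C instruction (logop), and an optional vector complement: the VMX level
C targeted here has no nand, orc, or eqv, so nand is vand plus vnega
C (complement after the op), while iorn and xnor complement the vp operand
C with vnegb and then use vor resp. vxor.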

ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`vp',	`r5')
define(`n',	`r6')
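C These map the arguments of the common mpn logops prototype,
C void func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n),
C which computes rp[i] = up[i] <op> vp[i] for 0 <= i < n.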

define(`us',	`v8')
define(`vs',	`v9')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
PROLOGUE(func)

LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)
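C Operands below the vector threshold (8 limbs with 32-bit limbs, 4 with
C 64-bit limbs) are handled entirely by the scalar loop below; the 32-bit
C variant peels the first limb so the loop can use update-form loads/stores.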

	mtctr	n

LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)		')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C Set VRSAVE bits 0-13 FIXME
	mtspr	256, r0
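C SPR 256 is VRSAVE; the caller's value is kept in r12 and restored at L(ret)
C after the bits for the vector registers used here have been set.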

C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.
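C In C terms: r7 = LIMBS_PER_VR - ((rp / GMP_LIMB_BYTES) % LIMBS_PER_VR) when
C rp is not yet 16-byte aligned; n -= r7; then those r7 limbs are processed
C with scalar operations.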

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR
LIMB32(`li	r10, 0		')
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)		')

	addi	rp, rp, 16		C update rp, but preserve its alignment
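C The stvx stores below ignore the low four bits of their effective address,
C so rp may keep its original low bits here; the vector stores still hit the
C right 16-byte blocks, and the tail code recovers the exact next-limb address
C by rounding rp up to a 16-byte boundary.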

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count: n / LIMBS_PER_VR
LIMB32(`srwi	r7, n, 2	')	C loop count: n / LIMBS_PER_VR
	mtctr	r7			C copy loop count to count register

	li	r10, 16
	lvsl	us, 0, up
	lvsl	vs, 0, vp
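C lvx/lvsl/vperm together implement unaligned vector loads: lvx fetches the
C aligned 16-byte block containing its effective address, lvsl turns the low
C four address bits into a permute control, and vperm of two consecutive
C aligned blocks with that control yields the 16 unaligned source bytes.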

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)

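C Fall-through exit: an even number of vectors has been stored and v2/v3 hold
C the last aligned blocks loaded, so one more vperm/logop/store drains them.
C The following aligned block (v0/v1) is loaded only if the corresponding
C source pointer is misaligned (its data spills into that block); otherwise
C zeros are substituted to avoid reading past the end of the operand.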
	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4
	b	L(tail)

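C Exit taken mid-iteration via bdz: v0/v1 hold the blocks just loaded, and the
C drain store goes to rp+16.  As above, the following aligned block (v2/v3) is
C loaded only for a misaligned source pointer, and zeroed otherwise.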
L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20

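C The remaining n mod LIMBS_PER_VR limbs are handled with scalar code.
C Rounding rp up to the next 16-byte boundary (add 15, clear the low bits),
C together with the +4/+20 adjustments above, yields the address of the first
C limb not yet written, compensating for the low rp bits that stvx ignored.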
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()

C This works for 64-bit PowerPC, since a limb ptr can only be aligned
C in 2 relevant ways, which means we can always find a pair of equally
C aligned pointers among rp, up, and vp.
C process words until rp is 16-byte aligned
C if (((up | vp) & 15) == 0)
C   process with VMX without any vperm
C else if ((up & 15) != 0 && (vp & 15) != 0)
C   process with VMX using vperm on store data
C else if ((up & 15) != 0)
C   process with VMX using vperm on up data
C else
C   process with VMX using vperm on vp data
C
C	rlwinm.	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7