1dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n, 2dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise 3dnl logical operations. 4 5dnl Copyright 2006 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of either: 11dnl 12dnl * the GNU Lesser General Public License as published by the Free 13dnl Software Foundation; either version 3 of the License, or (at your 14dnl option) any later version. 15dnl 16dnl or 17dnl 18dnl * the GNU General Public License as published by the Free Software 19dnl Foundation; either version 2 of the License, or (at your option) any 20dnl later version. 21dnl 22dnl or both in parallel, as here. 23dnl 24dnl The GNU MP Library is distributed in the hope that it will be useful, but 25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27dnl for more details. 28dnl 29dnl You should have received copies of the GNU General Public License and the 30dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31dnl see https://www.gnu.org/licenses/. 32 33include(`../config.m4') 34 35 36C and,ior,andn,nior,xor iorn,xnor nand 37C cycles/limb cycles/limb cycles/limb 38C 7400,7410 (G4): 1.39 ? ? 39C 744x,745x (G4+): 1.14 1.39 1.39 40C 970: 1.7 2.0 2.0 41 42C STATUS 43C * Works for all sizes and alignment for 32-bit limbs. 44C * Works for n >= 4 for 64-bit limbs; untested for smaller operands. 45C * Current performance makes this pointless for 970 46 47C TODO 48C * Might want to make variants when just one of the source operands needs 49C vperm, and when neither needs it. The latter runs 50% faster on 7400. 50C * Idea: If the source operands are equally aligned, we could do the logops 51C first, then vperm before storing! 
C   That means we never need more than one
C   vperm, ever!
C * Perhaps align `rp' after initial alignment loop?
C * Instead of having scalar code in the beginning and end, consider using
C   read-modify-write vector code.
C * Software pipeline?  Hopefully not too important, this is hairy enough
C   already.
C * At least be more clever about operand loading, i.e., load v operands before
C   u operands, since v operands are sometimes negated.

C Limb/vector geometry derived from the configured limb size.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))		C bytes per limb (4 or 8)
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))	C limbs held in one 16-byte VR
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))	C limbs held in two VRs

C vnegb/vnega are hooks that complement a vector operand before,
C respectively the result after, the main logical op.  They default to
C empty and are overridden below for the complemented operations.
define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null

C Per-operation selection: `func' is the exported function name, `logopS'
C the scalar (fixed-point) instruction used for head/tail limbs, `logop'
C the VMX instruction used for the bulk.  nand has no direct VMX form, so
C it is vand + neg-after; iorn/xnor complement the v operand first.
ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')

C LIMB32(x)/LIMB64(x) expand their argument only when building for the
C matching limb size, so one source serves both 32- and 64-bit limbs.
ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS (standard mpn calling convention)
define(`rp',	`r3')		C destination limb pointer
define(`up',	`r4')		C first source limb pointer
define(`vp',	`r5')		C second source limb pointer
define(`n',	`r6')		C limb count

C Permute-control vectors produced by lvsl from the low bits of up/vp,
C used by vperm to realign unaligned source data.
define(`us',	`v8')
define(`vs',	`v9')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
PROLOGUE(func)

C Small n: plain scalar loop, skipping all VMX setup.
LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)

	mtctr	n			C ctr = n, counts scalar iterations

C 32-bit limbs peel the first limb so the loop can use update-form loads.
LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)	')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

C Large n: VMX path.  Save caller's VRSAVE (SPR 256) and mark v0-v13 live.
L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C Set VRSAVE bit 0-13 FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR	C r7 = limbs needed to align rp
LIMB32(`li	r10, 0	')
	subf	n, r7, n		C n -= alignment limbs
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)	')

	addi	rp, rp, 16		C update rp, but preserve its alignment

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, 16			C constant offset for second-VR accesses
	lvsl	us, 0, up		C permute control for up alignment
	lvsl	vs, 0, vp		C permute control for vp alignment

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
C Exactly one vector: combine the two straddling loads, store, go to tail.
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

C Main loop, 2-way unrolled: v2/v3 (resp. v0/v1) carry the previous loads
C so each vperm can splice two consecutive 16-byte loads into aligned data.
L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)

C Loop fell out after the even half.  Load the final straddling vectors,
C but only where the pointer is actually unaligned — an aligned pointer
C needs no extra load (use zero instead; vperm then ignores those bytes).
	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4
	b	L(tail)

C Loop exited after the odd half; same final-vector handling at offset r10.
L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20

C Scalar tail: handle the n mod LIMBS_PER_VR leftover limbs.
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15		C round rp down to 16-byte boundary
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12		C restore caller's VRSAVE
	blr
EPILOGUE()

C Design sketch for a future variant (see TODO above), not assembled:
C This works for 64-bit PowerPC, since a limb ptr can only be aligned
C in 2 relevant ways, which means we can always find a pair of aligned
C pointers of rp, up, and vp.
C process words until rp is 16-byte aligned
C if (((up | vp) & 15) == 0)
C   process with VMX without any vperm
C else if ((up & 15) != 0 && (vp & 15) != 0)
C   process with VMX using vperm on store data
C else if ((up & 15) != 0)
C   process with VMX using vperm on up data
C else
C   process with VMX using vperm on vp data
C
C	rlwinm,	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7