1dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n, 2dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise 3dnl logical operations. 4 5dnl Copyright 2006 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of the GNU Lesser General Public License as published 11dnl by the Free Software Foundation; either version 3 of the License, or (at 12dnl your option) any later version. 13 14dnl The GNU MP Library is distributed in the hope that it will be useful, but 15dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 16dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 17dnl License for more details. 18 19dnl You should have received a copy of the GNU Lesser General Public License 20dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 21 22include(`../config.m4') 23 24 25C and,ior,andn,nior,xor iorn,xnor nand 26C cycles/limb cycles/limb cycles/limb 27C 7400,7410 (G4): 1.39 ? ? 28C 744x,745x (G4+): 1.14 1.39 1.39 29C 970: 1.7 2.0 2.0 30 31C STATUS 32C * Works for all sizes and alignment for 32-bit limbs. 33C * Works for n >= 4 for 64-bit limbs; untested for smaller operands. 34C * Current performance makes this pointless for 970 35 36C TODO 37C * Might want to make variants when just one of the source operands needs 38C vperm, and when neither needs it. The latter runs 50% faster on 7400. 39C * Idea: If the source operands are equally aligned, we could do the logops 40C first, then vperm before storing! That means we never need more than one 41C vperm, ever! 42C * Perhaps align `rp' after initial alignment loop? 43C * Instead of having scalar code in the beginning and end, consider using 44C read-modify-write vector code. 45C * Software pipeline? Hopefully not too important, this is hairy enough 46C already. 
C * At least be more clever about operand loading, i.e., load v operands before
C   u operands, since v operands are sometimes negated.

C Limb-size derived constants: bytes per limb, and the number of limbs held
C by one and by two 16-byte vector registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

C Complement hooks: vnegb is applied to the v operand before the vector
C logical operation, vnega to the result after it.  Both expand to nothing
C unless overridden by one of the operations below (nand, iorn, xnor).
define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null

C For each operation, logopS is the scalar (GPR) instruction used in the
C head/tail code, and logop is the vector (VMX) instruction used in the
C main loop.  Operations lacking a direct VMX equivalent are synthesized
C with a vnegb/vnega complement around vand/vor/vxor.
ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',	`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')

C Select exactly one of LIMB32/LIMB64 to expand its argument, so the code
C below can give alternative instruction sequences for 32- and 64-bit limbs.
ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')		C result pointer
define(`up',	`r4')		C first source pointer
define(`vp',	`r5')		C second source pointer
define(`n',	`r6')		C limb count

C Permute control vectors (from lvsl) for the misalignment of up and vp.
define(`us',	`v8')
define(`vs',	`v9')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
ASM_START()
C func(rp, up, vp, n) -- apply the selected bitwise logical operation to
C {up,n} and {vp,n}, storing the n-limb result at {rp,n}.
PROLOGUE(func)

C Small operands: below 8 limbs (32-bit) / 4 limbs (64-bit) use a plain
C scalar loop; the VMX setup cost would not pay off.
LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)

	mtctr	n

LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)	')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

C Large operands: enable the vector unit.  SPR 256 is VRSAVE; save the
C caller's value in r12 (restored at L(ret)) and mark v0-v13 as in use.
L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C Set VRSAVE bit 0-13 FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR	C r7 = limbs until rp is 16-byte aligned
LIMB32(`li	r10, 0	')
	subf	n, r7, n		C deduct the alignment limbs from n
C Scalar alignment loop; the 64-bit variant needs no loop control since it
C runs at most once.
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)	')

	addi	rp, rp, 16	C update rp, but preserve its alignment

C Main vector section.  up/vp may still be unaligned; lvx fetches from the
C enclosing aligned 16-byte blocks and vperm (driven by the lvsl masks in
C us/vs) splices consecutive fetches into the limbs actually wanted.
L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7		C copy n to count register

	li	r10, 16
	lvsl	us, 0, up	C permute mask for up's misalignment
	lvsl	vs, 0, vp	C permute mask for vp's misalignment

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
C Exactly one vector iteration: process it here, then go finish up.
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4	C bias rp; L(tail) rounds it up to the next 16
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

C Two-way unrolled main loop; v0/v1 and v2/v3 alternate as the freshly
C loaded and previously loaded source vectors for the vperm splice.
L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)

C Loop exited after the first (even) half.  Fetch the final partial source
C vectors, but only if the pointer is misaligned -- an aligned pointer means
C everything needed was already loaded, and a further lvx could fault;
C zeros are spliced in instead.
	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4	C bias rp; L(tail) rounds it up to the next 16
	b	L(tail)

C Loop exited after the second (odd) half; same final splice, other parity.
L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20	C bias rp; L(tail) rounds it up to the next 16

C Handle the 0 to LIMBS_PER_VR-1 leftover limbs with scalar code.
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15	C round the biased rp up to a 16-byte boundary,
LIMB32(`rlwinm	rp, rp, 0,0,27	')	C which is just past the last vector store
LIMB64(`rldicr	rp, rp, 0,59	')
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12	C restore the caller's VRSAVE
	blr
EPILOGUE()

C This works for 64-bit PowerPC, since a limb ptr can only be aligned
C in 2 relevant ways, which means we can always find a pair of aligned
C pointers of rp, up, and vp.
C   process words until rp is 16-byte aligned
C   if (((up | vp) & 15) == 0)
C     process with VMX without any vperm
C   else if ((up & 15) != 0 && (vp & 15) != 0)
C     process with VMX using vperm on store data
C   else if ((up & 15) != 0)
C     process with VMX using vperm on up data
C   else
C     process with VMX using vperm on vp data
C
C	rlwinm	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7