1dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount. 2 3dnl Copyright 2006, 2010 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C 7400,7410 (G4): ? 35C 744x,745x (G4+): 1.125 36C 970 (G5): 2.25 37 38C TODO 39C * Rewrite the awkward huge n outer loop code. 40C * Two lvx, two vperm, and two vxor could make us a similar hamdist. 41C * Compress cnsts table in 64-bit mode, only half the values are needed. 

C Limb geometry derived from GMP_LIMB_BITS: bytes per limb, limbs held by one
C 16-byte vector register, and limbs held by a pair of vector registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

define(`OPERATION_popcount')

C INPUT PARAMETERS
define(`ap',	`r3')
define(`n',	`r4')

C Vector registers holding constants for the whole function.
define(`rtab',	`v10')
define(`cnt4',	`v11')

C LIMB32(insn) emits insn only when limbs are 32 bits wide, LIMB64(insn) only
C otherwise.  The non-matching macro expands to nothing.
ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C The bit counts are accumulated with vsum4ubs.  To stay clear of overflow in
C those accumulators, large operands are processed in chunks on 64-bit
C targets.  The two constants below are the upper 16 bits of the real values,
C since they are only ever used as lis/addis immediates:
C   chunk size      = LIMBS_PER_CHUNK   * 2^16 = 2^28 limbs = 2^34 bits
C   chunk threshold = LIMBS_CHUNK_THRES * 2^16
define(`LIMBS_PER_CHUNK', 0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)

C mpn_popcount
C
C In:	ap (r3)	pointer to the limb vector
C	n  (r4)	number of limbs
C Out:	r3	sum of the per-word bit counts
C
C Method: each data byte is split into its two nibbles.  Both nibbles are used
C as vperm indices into rtab, whose 16 bytes are the bit counts of 0..15.  The
C two per-nibble counts are added bytewise and then folded into 32-bit word
C accumulators by vsum4ubs.
C
C Loads are done with lvx from the 16-byte block containing ap.  Bytes in the
C first block that precede ap, and bytes in the last block that follow the
C operand, are cleared with masks taken from cnsts before counting.
C
C Register usage:
C	r0	VRSAVE scratch, then chunk threshold (64-bit limbs only)
C	r6	byte offset selecting a mask within cnsts
C	r7	scratch: table address, loop count, word read back from stack
C	r8	limbs skipped by rounding ap down, then grand total count
C	r9	limbs remaining after the current chunk (64-bit limbs only)
C	r10	caller value of VRSAVE, restored before return
C	r11	address of cnsts
C	r12	constant 16, offset of the second vector in a pair
C	v0,v1	data vectors
C	v2-v5	per-nibble counts
C	v6,v7	per-byte counts
C	v8,v9	masks, then data shifted right by 4
C	v12,v13	word accumulators

ASM_START()
PROLOGUE(mpn_popcount,toc)
	mfspr	r10, 256		C save VRSAVE
	oris	r0, r10, 0xfffc		C Set VRSAVE bit 0-13
	mtspr	256, r0

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	LEAL(	r11, cnsts)
	li	r12, 16
	vspltisb cnt4, 4		C 0x0404...04 used as shift count

	lvx	rtab, 0, r11		C nibble bit-count table

C Compare n against the chunk threshold now.  cr7 is consumed further down,
C both before the inner loop and in the outer loop.
LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
LIMB64(`cmpd	cr7, n, r0		')

C Fetch the 16-byte block containing ap and clear the bytes that precede ap.
C r6 = 4 * (ap mod 16) indexes the low-end masks, which start at cnsts+80.
	lvx	v0, 0, ap
	addi	r7, r11, 80
	rlwinm	r6, ap, 2,26,29
	lvx	v8, r7, r6
	vand	v0, v0, v8

C r8 = number of limbs between the start of that 16-byte block and ap.
LIMB32(`rlwinm	r8, ap, 30,30,31	')
LIMB64(`rlwinm	r8, ap, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down `ap'

	vxor	v1, v1, v1
	li	r8, 0			C grand total count

	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

C At most one vector in total: only v0 is live, go mask its high end.
	addic.	n, n, -LIMBS_PER_VR
	ble	L(sum)

C At most two vectors in total: fetch the second one and mask its high end.
	addic.	n, n, -LIMBS_PER_VR
	ble	L(lsum)

C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	') C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')

C One inner-loop iteration consumes 32 bytes.  The extra iteration accounts for
C entering at L(ent) with the already loaded and masked first vector in v0.
	ALIGN(16)
L(small):
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	addi	r7, r7, 1
	mtctr	r7			C copy n to count register
	b	L(ent)

	ALIGN(16)
L(top):
	lvx	v0, 0, ap
L(ent):	lvx	v1, r12, ap
	addi	ap, ap, 32
	vsrb	v8, v0, cnt4		C high nibble of each byte
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0	C counts for low nibbles
	vperm	v3, rtab, rtab, v8	C counts for high nibbles
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3		C per-byte counts
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12		C accumulate into words
	vsum4ubs v13, v7, v13
	bdnz	L(top)

C Limbs left over after the last full pair of vectors.
	andi.	n, n, eval(LIMBS_PER_2VR-1)
	beq	L(rt)

	lvx	v0, 0, ap
	vxor	v1, v1, v1
	cmpwi	n, LIMBS_PER_VR
	ble	L(sum)
C Two vectors remain: the first is complete, the second gets the high-end mask.
L(lsum):
	vor	v1, v0, v0
	lvx	v0, r12, ap
C v0 holds the last vector of the operand.  r6 = 16 * (n mod LIMBS_PER_VR)
C indexes the high-end masks, which start at cnsts+16.
L(sum):
LIMB32(`rlwinm	r6, n, 4,26,27	')
LIMB64(`rlwinm	r6, n, 5,26,26	')
	addi	r7, r11, 16
	lvx	v8, r7, r6
	vand	v0, v0, v8
	vsrb	v8, v0, cnt4
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13

C Combine the two accumulators, spill the four words just below the stack
C pointer and add them into the grand total in r8.  The lwz offsets match the
C stvx address only if r1 is 16-byte aligned, which is assumed here.
	ALIGN(16)
L(rt):	vadduwm	v3, v12, v13
	li	r7, -16			C FIXME: does all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?

	lwz	r7, -16(r1)
	add	r8, r8, r7
	lwz	r7, -12(r1)
	add	r8, r8, r7
	lwz	r7, -8(r1)
	add	r8, r8, r7
	lwz	r7, -4(r1)
	add	r8, r8, r7

C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count
	mr	n, r9
	cmpd	cr7, n, r0
	ble	cr7, L(2)
	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	b	L(top)
')

	ALIGN(16)
L(ret):	mr	r3, r8			C return grand total
	mtspr	256, r10		C restore VRSAVE
	blr
EPILOGUE()

C Constant table, 16-byte aligned for lvx.  Layout in bytes from cnsts:
C	  0	bit counts of the nibble values 0..15
C	 16	four high-end masks, 16 bytes each
C	 80	four low-end masks, 16 bytes each
DEF_OBJECT(cnsts,16)
C Counts for vperm
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()