popcount.asm revision 1.1.1.2
dnl  ARM64 Neon mpn_popcount -- mpn bit population count.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C Cortex-A53     2.5
C Cortex-A57     1.14
C X-Gene         3

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(blah)

C INPUT PARAMETERS
define(`ap', x0)
define(`n',  x1)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C 8(2^16-1)/64 = 0x1fff limbs (rounded down).
C We use a chunksize close to that, but which allows the huge count code to
C jump deep into the code (at L(chu)).
C
C The 0x1fff limb bound assumes the bits are spread evenly over the 8 final
C 16-bit counters.  They are not: the odd-limb load (ld1 {v0.1d}) feeds only
C the 4 low counters.  With all bits set, each low counter of v4+v5 reaches
C
C     32*floor(n/4) + 16*(bit 1 of n) + 16*(bit 0 of n)
C
C which for n = 0x1fff is 32*2047 + 32 = 2^16, so the 16-bit add at L(sum)
C wraps to zero.  For n = 0x1ffe the worst case is 32*2047 + 16 = 65520, hence
C maxsize is 0x1ffe.  chunksize must be a multiple of 8, since the chunk code
C enters the 8-limb loop directly and skips the 1/2/4 limb fixups.

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

C mpn_popcount
C
C In:     ap (x0) = pointer to the limb array
C         n  (x1) = number of 64-bit limbs
C Out:    x0      = total number of set bits in the n limbs at ap
C Clobb:  x1, x11, v0-v7, flags
C         huge counts (n > maxsize) additionally use x4, x5, x7, x8, x9, x10;
C         x30 is saved in x8 around the internal bl calls and restored.
C
C Structure: the low three bits of n are peeled off with 1, 2 and 4 limb
C loads, then an 8 limbs/iteration software-pipelined loop runs.  Byte
C popcounts (cnt) are accumulated pairwise into 16-bit lanes (uadalp) of v4
C and v5, which are merged and widened at L(sum)/L(e0).

ASM_START()
PROLOGUE(mpn_popcount)

        mov     x11, #maxsize
        cmp     n, x11
        b.hi    L(gt8k)

L(lt8k):
        movi    v4.16b, #0                      C clear summation register
        movi    v5.16b, #0                      C clear summation register

        tbz     n, #0, L(xx0)
        sub     n, n, #1
        ld1     {v0.1d}, [ap], #8               C load 1 limb
        cnt     v6.16b, v0.16b
        uadalp  v4.8h,  v6.16b                  C could also splat

L(xx0): tbz     n, #1, L(x00)
        sub     n, n, #2
        ld1     {v0.2d}, [ap], #16              C load 2 limbs
        cnt     v6.16b, v0.16b
        uadalp  v4.8h,  v6.16b

L(x00): tbz     n, #2, L(000)
        subs    n, n, #4
        ld1     {v0.2d,v1.2d}, [ap], #32        C load 4 limbs
        b.ls    L(sum)

L(gt4): ld1     {v2.2d,v3.2d}, [ap], #32        C load 4 limbs
        sub     n, n, #4
        cnt     v6.16b, v0.16b
        cnt     v7.16b, v1.16b
        b       L(mid)

L(000): subs    n, n, #8
        b.lo    L(e0)

L(chu): ld1     {v2.2d,v3.2d}, [ap], #32        C load 4 limbs
        ld1     {v0.2d,v1.2d}, [ap], #32        C load 4 limbs
        cnt     v6.16b, v2.16b
        cnt     v7.16b, v3.16b
        subs    n, n, #8
        b.lo    L(end)

L(top): ld1     {v2.2d,v3.2d}, [ap], #32        C load 4 limbs
        uadalp  v4.8h,  v6.16b
        cnt     v6.16b, v0.16b
        uadalp  v5.8h,  v7.16b
        cnt     v7.16b, v1.16b
L(mid): ld1     {v0.2d,v1.2d}, [ap], #32        C load 4 limbs
        subs    n, n, #8
        uadalp  v4.8h,  v6.16b
        cnt     v6.16b, v2.16b
        uadalp  v5.8h,  v7.16b
        cnt     v7.16b, v3.16b
        b.hs    L(top)

L(end): uadalp  v4.8h,  v6.16b
        uadalp  v5.8h,  v7.16b
L(sum): cnt     v6.16b, v0.16b
        cnt     v7.16b, v1.16b
        uadalp  v4.8h,  v6.16b
        uadalp  v5.8h,  v7.16b
        add     v4.8h, v4.8h, v5.8h             C 16-bit add, cannot wrap for n <= maxsize
                                                C we have 8 16-bit counts
L(e0):  uaddlp  v4.4s,  v4.8h                   C we have 4 32-bit counts
        uaddlp  v4.2d,  v4.4s                   C we have 2 64-bit counts
        mov     x0, v4.d[0]
        mov     x1, v4.d[1]
        add     x0, x0, x1
        ret

C Code for count > maxsize.  Splits operand and calls above code.
C Each pass counts exactly chunksize limbs through L(chu); the loop stops once
C at most maxsize limbs remain, and those go through L(lt8k).
define(`ap2', x5)                               C caller-saves reg not used above
L(gt8k):
        mov     x8, x30                         C save return address across bl
        mov     x7, n                           C full count (caller-saves reg not used above)
        mov     x4, #0                          C total sum  (caller-saves reg not used above)
        mov     x9, #chunksize*8                C caller-saves reg not used above
        mov     x10, #chunksize                 C caller-saves reg not used above

1:      add     ap2, ap, x9                     C point at subsequent block
        mov     n, #chunksize-8                 C count for this invocation, adjusted for entry pt
        movi    v4.16b, #0                      C clear chunk summation register
        movi    v5.16b, #0                      C clear chunk summation register
        bl      L(chu)                          C jump deep inside code
        add     x4, x4, x0
        mov     ap, ap2                         C put chunk pointer in place for calls
        sub     x7, x7, x10
        cmp     x7, x11
        b.hi    1b

        mov     n, x7                           C count for final invocation
        bl      L(lt8k)
        add     x0, x4, x0
        mov     x30, x8                         C restore return address
        ret
EPILOGUE()