1dnl ARM Neon mpn_popcount -- mpn bit population count. 2 3dnl Copyright 2013 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C StrongARM: - 35C XScale - 36C Cortex-A7 ? 37C Cortex-A8 ? 38C Cortex-A9 1.125 39C Cortex-A15 0.56 40 41C TODO 42C * Explore using vldr and vldm. Does it help on A9? (These loads do 43C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for 44C popcount. Except perhaps also for popcount for the edge loads.) 45C * Arrange to align the pointer, if that helps performance. Use the same 46C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry 47C valgrind!) 48C * Explore if explicit align directives, e.g., "[ptr:128]" help. 49C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. 50 51C INPUT PARAMETERS 52define(`ap', r0) 53define(`n', r1) 54 55C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end 56C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or 57C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which 58C can be represented as a 8-bit ARM constant. 59C 60define(`chunksize',0x3f80) 61 62ASM_START() 63PROLOGUE(mpn_popcount) 64 65 cmp n, #chunksize 66 bhi L(gt16k) 67 68L(lt16k): 69 vmov.i64 q8, #0 C clear summation register 70 vmov.i64 q9, #0 C clear summation register 71 72 tst n, #1 73 beq L(xxx0) 74 vmov.i64 d0, #0 75 sub n, n, #1 76 vld1.32 {d0[0]}, [ap]! C load 1 limb 77 vcnt.8 d24, d0 78 vpadal.u8 d16, d24 C d16/q8 = 0; could just splat 79 80L(xxx0):tst n, #2 81 beq L(xx00) 82 sub n, n, #2 83 vld1.32 {d0}, [ap]! C load 2 limbs 84 vcnt.8 d24, d0 85 vpadal.u8 d16, d24 86 87L(xx00):tst n, #4 88 beq L(x000) 89 sub n, n, #4 90 vld1.32 {q0}, [ap]! C load 4 limbs 91 vcnt.8 q12, q0 92 vpadal.u8 q8, q12 93 94L(x000):tst n, #8 95 beq L(0000) 96 97 subs n, n, #8 98 vld1.32 {q0,q1}, [ap]! C load 8 limbs 99 bls L(sum) 100 101L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs 102 sub n, n, #8 103 vcnt.8 q12, q0 104 vcnt.8 q13, q1 105 b L(mid) 106 107L(0000):subs n, n, #16 108 blo L(e0) 109 110 vld1.32 {q2,q3}, [ap]! C load 8 limbs 111 vld1.32 {q0,q1}, [ap]! C load 8 limbs 112 vcnt.8 q12, q2 113 vcnt.8 q13, q3 114 subs n, n, #16 115 blo L(end) 116 117L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs 118 vpadal.u8 q8, q12 119 vcnt.8 q12, q0 120 vpadal.u8 q9, q13 121 vcnt.8 q13, q1 122L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs 123 subs n, n, #16 124 vpadal.u8 q8, q12 125 vcnt.8 q12, q2 126 vpadal.u8 q9, q13 127 vcnt.8 q13, q3 128 bhs L(top) 129 130L(end): vpadal.u8 q8, q12 131 vpadal.u8 q9, q13 132L(sum): vcnt.8 q12, q0 133 vcnt.8 q13, q1 134 vpadal.u8 q8, q12 135 vpadal.u8 q9, q13 136 vadd.i16 q8, q8, q9 137 C we have 8 16-bit counts 138L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts 139 vpaddl.u32 q8, q8 C we have 2 64-bit counts 140 vmov.32 r0, d16[0] 141 vmov.32 r1, d17[0] 142 add r0, r0, r1 143 bx lr 144 145C Code for large count. Splits operand and calls above code. 146define(`ap2', r2) C caller-saves reg not used above 147L(gt16k): 148 push {r4,r14} 149 mov ap2, ap 150 mov r3, n C full count 151 mov r4, #0 C total sum 152 1531: mov n, #chunksize C count for this invocation 154 bl L(lt16k) C could jump deep inside code 155 add ap2, ap2, #chunksize*4 C point at next chunk 156 add r4, r4, r0 157 mov ap, ap2 C put chunk pointer in place for call 158 sub r3, r3, #chunksize 159 cmp r3, #chunksize 160 bhi 1b 161 162 mov n, r3 C count for final invocation 163 bl L(lt16k) 164 add r0, r4, r0 165 pop {r4,pc} 166EPILOGUE() 167