1dnl AMD64 SSSE3 mpn_popcount -- population count. 2 3dnl Copyright 2010-2017 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31 32include(`../config.m4') 33 34C cycles/limb good for cpu? 35C AMD K8,K9 n/a 36C AMD K10 n/a 37C AMD bd1 1.79-1.91 n 38C AMD bd2 1.73-1.85 n 39C AMD bd3 ? 40C AMD bd4 1.73-1.85 n 41C AMD zen 1.47 n 42C AMD bobcat 8.0 n 43C AMD jaguar 4.78 n 44C Intel P4 n/a 45C Intel CNR 3.75 46C Intel PNR 2.61 y 47C Intel NHM 2.03 n 48C Intel SBR 1.87 n 49C Intel IBR 1.52-1.58 n 50C Intel HWL 1.52-1.58 n 51C Intel BWL 1.52-1.58 n 52C Intel SKL 1.51 n 53C Intel atom 12.3 n 54C Intel SLM 9.1 n 55C VIA nano ? 56 57C TODO 58C * This was hand-written without too much thought about optimal insn 59C selection; check to see if it can be improved. 60C * Consider doing some instruction scheduling. 
C mpn_popcount: count the set bits in the n 64-bit limbs starting at up.
C
C In:   up (rdi) = pointer to the first limb
C       n  (rsi) = number of limbs
C Out:  rax      = total number of set bits
C Uses: rax, r9, xmm0-xmm8, flags; up and n are modified.
C
C NOTE(review): n = 0 dispatches to L(top), which loads 64 bytes before n is
C tested, so the code assumes n >= 1 -- confirm against callers.
C
C Method: each byte of a 16-byte block is split into its low and high nibble,
C both nibbles are looked up with pshufb in a 16-entry table of per-nibble bit
C counts, and the results are accumulated per byte.  The loop body handles 8
C limbs (four 16-byte blocks); the remainder n mod 8 is handled by jumping
C into the middle of the first pass through the body.
C
C Register roles:
C   xmm7  nibble bit-count table (constant)
C   xmm6  0x0f byte mask (constant)
C   xmm5  zero, second operand for psadbw
C   xmm4  per-byte running counts
C   xmm8  two 64-bit running totals
C   xmm0-xmm3  scratch
C   r9    address of L(cnsts)

define(`up',	`%rdi')
define(`n',	`%rsi')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)
	lea	L(cnsts)(%rip), %r9

C OFF1 and OFF2 are the byte offsets from L(cnsts) to the two 16-byte
C constants that follow the 8 jump table entries.  The PIC values correspond
C to 4-byte entries and the non-PIC values to 8-byte entries, matching the
C index scales used by the two dispatch sequences below.
ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
	     `define(`OFF1',64) define(`OFF2',80)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble bit-count table
	movdqa	OFF2`'(%r9), %xmm6	C 0x0f in every byte
	pxor	%xmm4, %xmm4		C clear per-byte counts
	pxor	%xmm5, %xmm5		C zero operand for psadbw
	pxor	%xmm8, %xmm8		C clear 64-bit totals

C Dispatch on n mod 8.  Table entry 0 is L(top), entry k is L(k).  In the PIC
C case the entries are 32-bit values that are sign-extended and added to the
C table address; otherwise they are used directly as 64-bit jump targets.
	mov	R32(n), R32(%rax)
	and	$7, R32(%rax)
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')

C Entry stubs for n mod 8 = 1..7.  Odd remainders first load a single limb
C with movq, which leaves the upper half of xmm1 zero.  up is then biased so
C that the fixed displacements in the loop body address the remaining limbs
C and so that up points at the next full 8-limb group once the add $64 at
C L(e2) has been executed (L(1) enters after that add, hence its add $8).
L(1):	movq	(up), %xmm1
	add	$8, up
	jmp	L(e1)

L(2):	add	$-48, up
	jmp	L(e2)

L(3):	movq	(up), %xmm1
	add	$-40, up
	jmp	L(e3)

L(4):	add	$-32, up
	jmp	L(e4)

L(5):	movq	(up), %xmm1
	add	$-24, up
	jmp	L(e5)

L(6):	add	$-16, up
	jmp	L(e6)

L(7):	movq	(up), %xmm1
	add	$-8, up
	jmp	L(e7)

C Main loop, 8 limbs per iteration.  Each of the four sections adds at most 8
C to every byte of its result, so xmm4 holds at most 32 per byte (the carried
C block plus three more) before it is flushed at L(e1); the byte adds cannot
C overflow.
	ALIGN(32)
L(top):	lddqu	(up), %xmm1		C limbs 0-1, unaligned load
L(e7):	movdqa	%xmm6, %xmm0		C copy mask register
	movdqa	%xmm7, %xmm2		C copy count register
	movdqa	%xmm7, %xmm3		C copy count register
	pand	%xmm1, %xmm0		C low nibble of every byte
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1		C high nibble of every byte
	pshufb	%xmm0, %xmm2		C table lookup for low nibbles
	pshufb	%xmm1, %xmm3		C table lookup for high nibbles
	paddb	%xmm2, %xmm3		C bit count of every byte
	paddb	%xmm3, %xmm4		C add to per-byte counts
L(e6):	lddqu	16(up), %xmm1		C limbs 2-3
L(e5):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e4):	lddqu	32(up), %xmm1		C limbs 4-5
L(e3):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e2):	lddqu	48(up), %xmm1		C limbs 6-7
	add	$64, up			C advance to the next 8-limb group
L(e1):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	psadbw	%xmm5, %xmm4		C sum the 8 byte counts of each 64-bit half
	paddb	%xmm2, %xmm3
	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
	movdqa	%xmm3, %xmm4		C counts of this block start the next byte accumulation
	sub	$8, n
	jg	L(top)			C loop while limbs remain

C Flush the last per-byte counts, then add the two 64-bit halves of xmm8.
	psadbw	%xmm5, %xmm4
	paddq	%xmm4, %xmm8
	pshufd	$14, %xmm8, %xmm0	C high 64 bits of xmm8 to low 64 bits of xmm0
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax		C return value
	ret
EPILOGUE()

C Constant object: 8 jump table entries indexed by n mod 8, followed by the
C 16-byte nibble bit-count table (loaded into xmm7) and the 16-byte 0x0f mask
C (loaded into xmm6).  Both constants are read with movdqa, which needs
C 16-byte alignment; the 16 passed to DEF_OBJECT is presumably that alignment
C -- TODO confirm.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(top), L(cnsts))
	JMPENT(	L(1), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(3), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(5), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	JMPENT(	L(7), L(cnsts))
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03	C bit counts of 0..7
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04	C bit counts of 8..15
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f	C nibble mask
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))