dnl  AMD64 mpn_popcount -- population count.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library. If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C                   cycles/limb
C AMD K8,K9              n/a
C AMD K10                1.39
C AMD bd1                4
C AMD bd2                4
C AMD bd3                ?
C AMD bd4                ?
C AMD zen                0.72
C AMD bobcat             5.78
C AMD jaguar             1.27
C Intel P4               n/a
C Intel core2            n/a
C Intel NHM              1.04
C Intel SBR              1.02
C Intel IBR              1.0
C Intel HWL              1.0
C Intel BWL              1.0
C Intel SKL              1.0
C Intel atom             n/a
C Intel SLM              1.34
C VIA nano               n/a

C TODO
C  * We could approach 0.5 c/l for AMD Zen with more unrolling. That would
C    not cause any additional feed-in overhead as we already use a jump table.
C  * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
C    Intel hardware. Perhaps mix such a loop with popcnt instructions.
C  * The random placement of the L0, L1, L2, etc blocks is due to branch
C    shortening.
C mpn_popcount: sums the popcnt of n consecutive 64-bit limbs starting at
C up and leaves the total in %rax at the ret.
C
C Register roles, as set up by the defines below and used throughout:
C   up  (%rdi)   address of the limbs; advanced as limbs are consumed
C   n   (%rsi)   limb count; reduced by 8 for every block of eight limbs
C   %rax, %rcx   the two running sums, merged at L(x2)
C   %r8..%r11    popcnt results that have not yet been added to a sum
C
C Every popcnt is emitted as raw .byte data, with the intended instruction
C given in the comment on the same line.
C NOTE(review): presumably this is so that assemblers lacking the popcnt
C mnemonic can still build the file -- confirm.
C
C FUNC_ENTRY, FUNC_EXIT, PROLOGUE, EPILOGUE, R32, JMPENT and JUMPTABSECT are
C not defined in this file.
C NOTE(review): presumably they come from the included m4 configuration, and
C FUNC_ENTRY/FUNC_EXIT presumably adapt the DOS64 calling convention --
C confirm.

define(`up',		`%rdi')
define(`n',		`%rsi')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)
	FUNC_ENTRY(2)

C Dispatch on the low three bits of n.  The first limb is counted into %rax
C before the dispatch, whatever the value of n, and %rcx is cleared so that
C it can serve as the second running sum.
C NOTE(review): since (up) is read unconditionally, and since the entry for
C n mod 8 = 0 counts eight limbs before n is first tested, n is presumably
C required to be at least 1 -- confirm against the callers.
	mov	R32(n), R32(%r8)
	and	$7, R32(%r8)		C r8 = low three bits of n, the table index

	.byte	0xf3,0x48,0x0f,0xb8,0x07	C popcnt (up), %rax
	xor	R32(%rcx), R32(%rcx)

C With PIC defined the table entry is loaded as a sign-extended 32-bit value
C and added to the table address to form the target.  Otherwise an 8-byte
C table entry is used as the jump target directly.
	lea	L(tab)(%rip), %r9
ifdef(`PIC',`
	movslq	(%r9,%r8,4), %r8
	add	%r9, %r8
	jmp	*%r8
',`
	jmp	*(%r9,%r8,8)
')

C n mod 8 = 3: count limbs 1 and 2, step up past the three limbs done.  If
C limbs remain after reducing n by 8, enter the loop at L(e34) with the pair
C still pending in r10/r11.  Otherwise n was 3: add the pair to %rax and
C return through L(s1).
L(3):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x08	C popcnt 8(up), %r10
	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x10	C popcnt 16(up), %r11
	add	$24, up
	sub	$8, n
	jg	L(e34)
	add	%r10, %rax
	add	%r11, %rax
L(s1):	FUNC_EXIT()
	ret

C n mod 8 = 1: when n was exactly 1 the sum in %rax is already complete, so
C leave through L(s1).  Otherwise count limbs 1 and 2 into r8/r9 and bias up
C by one limb so that the 0x10 displacement at L(e12) addresses limb 3.
L(1):	sub	$8, n
	jle	L(s1)
	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x08	C popcnt 8(up), %r8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x10	C popcnt 16(up), %r9
	add	$8, up
	jmp	L(e12)

C n mod 8 = 7: count limbs 1 and 2 into r10/r11, then move up back by one
C limb so that the 0x20 displacement at L(e07) addresses limb 3.
L(7):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x08	C popcnt 0x8(%rdi),%r10
	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x10	C popcnt 0x10(%rdi),%r11
	add	$-8, up
	jmp	L(e07)

C n mod 8 = 0: count limbs 1, 2 and 3 into rcx, r10 and r11.  No pointer
C adjustment is needed, the 0x20 displacement at L(e07) addresses limb 4.
L(0):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
	jmp	L(e07)

C n mod 8 = 4: count limbs 1, 2 and 3 into rcx, r10 and r11 and step up
C past the four limbs done.  When n was exactly 4 go straight to the
C wind-down at L(x4), otherwise fall through into the loop.
L(4):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
	add	$32, up
	sub	$8, n
	jle	L(x4)

C Main loop, eight limbs per pass.  Each pair of popcnt results is added to
C the sums only after the following pair has been issued: r8 and r10 are
C added to %rcx, r9 and r11 are added to %rax.  L(e34), L(e12), L(e07) and
C L(e56) are the feed-in points used by the entry blocks.  The loop repeats
C while n is still positive after being reduced by 8.
	ALIGN(16)
L(top):
L(e34):	.byte	0xf3,0x4c,0x0f,0xb8,0x07	C popcnt (%rdi),%r8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%r9
	add	%r10, %rcx
	add	%r11, %rax
L(e12):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
	add	%r8, %rcx
	add	%r9, %rax
L(e07):	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x20	C popcnt 0x20(%rdi),%r8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x28	C popcnt 0x28(%rdi),%r9
	add	%r10, %rcx
	add	%r11, %rax
L(e56):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x30	C popcnt 0x30(%rdi),%r10
	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x38	C popcnt 0x38(%rdi),%r11
	add	$64, up
	add	%r8, %rcx
	add	%r9, %rax
	sub	$8, n
	jg	L(top)

C Wind-down: L(x4) adds the last pending pair from r10/r11, L(x2) merges the
C two running sums into %rax.
L(x4):	add	%r10, %rcx
	add	%r11, %rax
L(x2):	add	%rcx, %rax

	FUNC_EXIT()
	ret

C n mod 8 = 2: count limb 1 into %rcx.  When n was exactly 2 only the merge
C at L(x2) remains.  Otherwise count limbs 2 and 3 into r8/r9 and step up by
C two limbs so that the 0x10 displacement at L(e12) addresses limb 4.
L(2):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
	sub	$8, n
	jle	L(x2)
	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x10	C popcnt 0x10(%rdi),%r8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x18	C popcnt 0x18(%rdi),%r9
	add	$16, up
	jmp	L(e12)

C n mod 8 = 5: count limbs 1 and 2 into r8/r9, then move up back by three
C limbs so that the 0x30 displacement at L(e56) addresses limb 3.
L(5):	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x08	C popcnt 0x8(%rdi),%r8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x10	C popcnt 0x10(%rdi),%r9
	add	$-24, up
	jmp	L(e56)

C n mod 8 = 6: count limbs 1, 2 and 3 into rcx, r8 and r9, then move up back
C by two limbs so that the 0x30 displacement at L(e56) addresses limb 4.
L(6):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x10	C popcnt 0x10(%rdi),%r8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x18	C popcnt 0x18(%rdi),%r9
	add	$-16, up
	jmp	L(e56)
EPILOGUE()
C Jump table indexed by the low three bits of n; entry k leads to L(k).
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(4), L(tab))
	JMPENT(	L(5), L(tab))
	JMPENT(	L(6), L(tab))
	JMPENT(	L(7), L(tab))