dnl  AMD64 mpn_hamdist -- hamming distance.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C		    cycles/limb
C AMD K8,K9		n/a
C AMD K10		 3.26
C AMD bd1		 4.2
C AMD bd2		 4.2
C AMD bd3		 ?
C AMD bd4		 ?
C AMD zen		 1.15
C AMD bobcat		 7.29
C AMD jaguar		 2.53
C Intel P4		n/a
C Intel core2		n/a
C Intel NHM		 2.03
C Intel SBR		 1.66
C Intel IBR		 1.62
C Intel HWL		 1.50
C Intel BWL		 1.50
C Intel SKL		 1.50
C Intel atom		n/a
C Intel SLM		 2.55
C VIA nano		n/a

C TODO
C  * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
C    Intel hardware.  Perhaps mix such a loop with popcnt instructions.
C  * The random placement of the L0, L1, L2, etc blocks is due to branch
C    shortening.  More work could be done there.
C  * Combine the accumulators rax and rcx into one register to save some
C    bookkeeping and a push/pop pair.
C    Unfortunately this causes a slight slowdown for at least NHM and SBR.

C mpn_hamdist -- Hamming distance of two limb vectors.
C
C In:	up (%rdi)	first operand, read as n consecutive 64-bit limbs
C	vp (%rsi)	second operand, read as n consecutive 64-bit limbs
C	n  (%rdx)	limb count.  Limb 0 of both operands is loaded before n
C			is examined, so n >= 1 is presumably a precondition --
C			TODO confirm against callers.
C Out:	%rax		sum over all limbs i of popcount(up[i] xor vp[i])
C
C Scratch: rcx, r8, r9, r10, r11 are overwritten; rbx and rbp are used as
C popcount temporaries and are saved and restored with push/pop.
C
C Structure: limb 0 is counted straight into rax, then n mod 4 selects one of
C four feed-in blocks through L(tab).  Each block preloads a few limbs and
C enters the 4-limbs-per-iteration loop at the matching entry label, or goes
C directly to the wind-down code when no full iteration remains.  Counts are
C accumulated alternately into rax and rcx, which are added together at L(x2).

C Register names for the three arguments.  FUNC_ENTRY(3) below presumably maps
C the DOS64 argument registers onto these; the macro is not defined in this
C file.
define(`up',		`%rdi')
define(`vp',		`%rsi')
define(`n',		`%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C sum(src,dst) adds src into dst.  Two alternative expansions are listed; the
C second define replaces the first, so the add form is the one in effect.
define(`sum',	`lea	($1,$2), $2')
define(`sum',	`add	$1, $2')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_hamdist)
	FUNC_ENTRY(3)
	push	%rbx			C popcount temporary feeding rax
	push	%rbp			C popcount temporary feeding rcx

	mov	(up), %r10
	xor	(vp), %r10		C r10 = up[0] xor vp[0]

	mov	R32(n), R32(%r8)
	and	$3, R32(%r8)		C r8 = n mod 4, the index into L(tab)

	xor	R32(%rcx), R32(%rcx)	C second accumulator starts at zero
	.byte	0xf3,0x49,0x0f,0xb8,0xc2	C popcnt %r10,%rax

	lea	L(tab)(%rip), %r9
C With PIC the table is read as 4-byte entries that are sign-extended and
C added to the table address; otherwise it is read as 8-byte jump targets.
ifdef(`PIC',`
	movslq	(%r9,%r8,4), %r8
	add	%r9, %r8
	jmp	*%r8
',`
	jmp	*(%r9,%r8,8)
')

C n mod 4 = 3.  Limbs 1 and 2 are xored into r10 and r11.
L(3):	mov	8(up), %r10
	mov	16(up), %r11
	xor	8(vp), %r10
	xor	16(vp), %r11
	xor	R32(%rbp), R32(%rbp)	C so the first sum of rbp at L(e3)/L(x3) adds 0
	sub	$4, n
	jle	L(x3)			C n = 3: only r10 and r11 remain to be counted
	mov	24(up), %r8		C limbs 3 and 4 of up; the vp xor is done
	mov	32(up), %r9		C right after L(e3)
	add	$24, up
	add	$24, vp
	jmp	L(e3)

C n mod 4 = 0.  Limbs 1, 2 and 3 are xored into r9, r10 and r11.
L(0):	mov	8(up), %r9
	xor	8(vp), %r9
	mov	16(up), %r10
	mov	24(up), %r11
	xor	R32(%rbx), R32(%rbx)	C so the first sum of rbx at L(top)/L(x4) adds 0
	xor	16(vp), %r10
	xor	24(vp), %r11
	add	$32, up
	add	$32, vp
	sub	$4, n
	jle	L(x4)			C n = 4: no full loop iteration needed

C Main loop, four limbs per iteration.  Limbs are counted in the register
C order r9, r10, r11, r8.  Counts produced in rbx are added to rax and counts
C produced in rbp are added to rcx, each one step after it was produced.
C Note that up is advanced before L(e1) and vp after it; the feed-in blocks
C compensate for this in their pointer adjustments.
	ALIGN(16)
L(top):
L(e0):	.byte	0xf3,0x49,0x0f,0xb8,0xe9	C popcnt %r9,%rbp
	mov	(up), %r8
	mov	8(up), %r9
	sum(	%rbx, %rax)
L(e3):	.byte	0xf3,0x49,0x0f,0xb8,0xda	C popcnt %r10,%rbx
	xor	(vp), %r8
	xor	8(vp), %r9
	sum(	%rbp, %rcx)
L(e2):	.byte	0xf3,0x49,0x0f,0xb8,0xeb	C popcnt %r11,%rbp
	mov	16(up), %r10
	mov	24(up), %r11
	add	$32, up
	sum(	%rbx, %rax)
L(e1):	.byte	0xf3,0x49,0x0f,0xb8,0xd8	C popcnt %r8,%rbx
	xor	16(vp), %r10
	xor	24(vp), %r11
	add	$32, vp
	sum(	%rbp, %rcx)
	sub	$4, n
	jg	L(top)

C Wind-down: count the limbs still held in r9, r10 and r11 and fold in the
C pending rbx and rbp counts.
L(x4):	.byte	0xf3,0x49,0x0f,0xb8,0xe9	C popcnt %r9,%rbp
	sum(	%rbx, %rax)
L(x3):	.byte	0xf3,0x49,0x0f,0xb8,0xda	C popcnt %r10,%rbx
	sum(	%rbp, %rcx)
	.byte	0xf3,0x49,0x0f,0xb8,0xeb	C popcnt %r11,%rbp
	sum(	%rbx, %rax)
	sum(	%rbp, %rcx)
L(x2):	add	%rcx, %rax		C combine the two accumulators into the result
L(x1):	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

C n mod 4 = 2.  Limb 1 is xored into r11.
L(2):	mov	8(up), %r11
	xor	8(vp), %r11
	sub	$2, n
	jle	L(n2)			C n = 2: limb 1 is the last one
	mov	16(up), %r8
	mov	24(up), %r9
	xor	R32(%rbx), R32(%rbx)	C so the first sum of rbx after L(e2) adds 0
	xor	16(vp), %r8
	xor	24(vp), %r9
	add	$16, up
	add	$16, vp
	jmp	L(e2)
C n = 2: count limb 1 straight into rcx (still zero) and join at the final add.
L(n2):	.byte	0xf3,0x49,0x0f,0xb8,0xcb	C popcnt %r11,%rcx
	jmp	L(x2)

C n mod 4 = 1.
L(1):	dec	n
	jle	L(x1)			C n = 1: rax already holds the whole result
	mov	8(up), %r8
	mov	16(up), %r9
	xor	8(vp), %r8
	xor	16(vp), %r9
	xor	R32(%rbp), R32(%rbp)	C so the first sum of rbp after L(e1) adds 0
	mov	24(up), %r10		C limbs 3 and 4 of up; the vp xor is done
	mov	32(up), %r11		C right after L(e1) via 16(vp) and 24(vp)
	add	$40, up			C L(e1) lies after the loop add of 32 to up ...
	add	$8, vp			C ... but before the one to vp, hence 40 versus 8
	jmp	L(e1)

EPILOGUE()
	JUMPTABSECT
	ALIGN(8)
C Dispatch table indexed by n mod 4.
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))