dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C		    cycles/limb	  good for cpu?
C AMD K8,K9		n/a
C AMD K10		n/a
C AMD bd1	     1.51-2.0		y
C AMD bd2	     1.50-1.9		y
C AMD bd3		 ?
C AMD bd4		 ?
C AMD zen		n/a
C AMD bobcat		n/a
C AMD jaguar		n/a
C Intel P4		n/a
C Intel PNR		n/a
C Intel NHM		n/a
C Intel SBR		n/a
C Intel IBR		n/a
C Intel HWL		n/a
C Intel BWL		n/a
C Intel SKL		n/a
C Intel atom		n/a
C Intel SLM		n/a
C VIA nano		n/a

C TODO
C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
C    intend to support old systems.

C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
C We fall back to the core2 code.
ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
MULFUNC_PROLOGUE(mpn_hamdist)
include_mpn(`x86_64/core2/hamdist.asm')
',`

define(`up',		`%rdi')
define(`vp',		`%rsi')
define(`n',		`%rdx')

C mpn_hamdist (up, vp, n)
C
C Sums the population count of up[i] xor vp[i] over n 64-bit limbs, that is
C the number of bit positions in which the two operands differ, and returns
C the total in rax.
C
C Arguments (three, see FUNC_ENTRY below):
C   up  rdi  first operand, read with lddqu and plain 64-bit loads
C   vp  rsi  second operand, also used directly as an SSE memory operand
C   n   rdx  limb count.  n = 0 is not handled: the small-operand path reads
C            one limb before it tests the count.
C
C Register use on the vector path:
C   r9    base of L(cnsts) until the jump table dispatch, scratch afterwards
C   r10   running count from the scalar popcnt instructions
C   r8    scratch
C   xmm5  0x0f in every byte (nibble mask)
C   xmm6  -4 in every byte (vpshlb shift counts)
C   xmm7  16-entry table of nibble population counts
C   xmm4  per-byte partial counts, folded into xmm8 once per loop pass
C   xmm8  two 64-bit partial totals
C   xmm0-xmm3  scratch
C
C NOTE(review): xmm6, xmm7 and xmm8 are written without being saved.  The
C Microsoft x64 convention treats xmm6-xmm15 as callee-saved and this file
C declares ABI_SUPPORT(DOS64); confirm whether that is accounted for elsewhere.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_hamdist)
	FUNC_ENTRY(3)
	cmp	$5, n
	jl	L(sma)			C fewer than 5 limbs: plain popcnt loop

	lea	L(cnsts)(%rip), %r9

C The vector code uses (vp) as a pxor memory operand, which must be 16-byte
C aligned.  If bit 3 of vp is set, handle one limb with popcnt first so that
C vp advances to a 16-byte boundary (assumes vp is at least 8-byte aligned).
	xor	R32(%r10), R32(%r10)
	test	$8, R8(vp)
	jz	L(ali)
	mov	(up), %r8
	xor	(vp), %r8
	add	$8, up
	add	$8, vp
	dec	n
	popcnt	%r8, %r10
L(ali):

C The 16-byte constants follow the four jump table entries.  Under PIC the
C entries are read below as 4-byte offsets (16 bytes in total), otherwise as
C 8-byte addresses (32 bytes in total), hence the two sets of offsets.
ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
	movdqa	OFF3`'(%r9), %xmm5	C masks
	pxor	%xmm4, %xmm4
	pxor	%xmm8, %xmm8		C grand total count

C Each pass of L(top) consumes 8 limbs.  Enter the loop body part way
C through according to n mod 8 with the low bit ignored (0, 2, 4 or 6 limbs),
C after biasing up and vp so that the fixed displacements used inside the
C body land on the first unprocessed limb.  An odd leftover limb is handled
C after the loop.
	mov	R32(n), R32(%rax)
	and	$6, R32(%rax)
	lea	-64(up,%rax,8), up
	lea	-64(vp,%rax,8), vp
ifdef(`PIC',`
	movslq	(%r9,%rax,2), %r11
	add	%r9, %r11
	jmp	*%r11
',`
	jmp	*(%r9,%rax,4)
')

C Entry for n mod 8 = 0 or 1: undo the pointer bias and pre-adjust n so that
C the sub of 8 at the loop bottom leaves the same low bit and sign as for the
C other entry points.
L(0):	add	$64, up
	add	$64, vp
	sub	$2, n

C Each 16-byte group: xor the operands, split every byte into its low nibble
C (pand) and high nibble (vpshlb by -4, then pand), look both nibbles up in
C the count table with vpperm, and add the byte counts into xmm4.
	ALIGN(32)
L(top):	lddqu	(up), %xmm0
	pxor	(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(6):	lddqu	16(up), %xmm0
	pxor	16(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
C Third group: the byte counts gathered so far in xmm4 are first summed into
C two 64-bit lanes (vphaddubq into xmm0, whose data has been consumed by
C then), and xmm4 is restarted from the counts of this group.
L(4):	lddqu	32(up), %xmm0
	pxor	32(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
C NOTE(review): xmm3 is rewritten by vpperm at L(top) before it is read
C again, so the next add appears to have no effect on the result.  It is kept
C to leave the tuned instruction sequence untouched.
	paddb	%xmm2, %xmm3
	paddb	%xmm2, %xmm4
	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
C Last two limbs of the pass go through scalar popcnt into r10.  From here
C on r9 is scratch; the constants were loaded before the dispatch.
L(2):	mov	48(up), %r8
	mov	56(up), %r9
	add	$64, up
	xor	48(vp), %r8
	xor	56(vp), %r9
	add	$64, vp
	popcnt	%r8, %r8
	popcnt	%r9, %r9
	add	%r8, %r10
	add	%r9, %r10
	sub	$8, n
	jg	L(top)

C The low bit of n has been preserved by the even adjustments above; if it
C is set, one limb remains at (up).
	test	$1, R8(n)
	jz	L(x)
	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %r8
	add	%r8, %r10
C Fold the remaining byte counts into xmm8, add its two 64-bit lanes, and
C combine with the scalar count.
L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	paddq	%xmm0, %xmm8
	pshufd	$14, %xmm8, %xmm0
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax
	add	%r10, %rax
	FUNC_EXIT()
	ret

C Small operands (n < 5): one popcnt per limb, accumulated directly in rax.
C The first limb is processed before n is tested.
L(sma):	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %rax
	dec	n
	jz	L(ed)
L(tp):	mov	8(up), %r8
	add	$8, up
	xor	8(vp), %r8
	add	$8, vp
	popcnt	%r8, %r8
	add	%r8, %rax
	dec	n
	jnz	L(tp)
L(ed):	FUNC_EXIT()
	ret
EPILOGUE()
C Jump table for the loop entry points, followed by three 16-byte constants:
C the nibble population counts, the vpshlb shift counts and the nibble mask.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(0), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
')