dnl AMD64 SSSE3 mpn_hamdist -- hamming distance.

dnl Copyright 2010-2017 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.


include(`../config.m4')

C		    cycles/limb	 good for cpu?
C AMD K8,K9		n/a
C AMD K10		n/a
C AMD bd1		 ?
C AMD bd2		 ?
C AMD bd3		 ?
C AMD bd4		 ?
C AMD zen		 ?
C AMD bobcat		 ?
C AMD jaguar		 ?
C Intel P4		n/a
C Intel CNR		 4.50		y
C Intel PNR		 3.28		y
C Intel NHM		 ?
C Intel SBR		 ?
C Intel IBR		 ?
C Intel HWL		 ?
C Intel BWL		 ?
C Intel SKL		 ?
C Intel atom		 ?
C Intel SLM		 ?
C VIA nano		 ?

C TODO
C  * This was hand-written without too much thought about optimal insn
C    selection; check to see if it can be improved.
C  * Consider doing some instruction scheduling.
61 62define(`up', `%rdi') 63define(`vp', `%rsi') 64define(`n', `%rdx') 65 66ASM_START() 67 TEXT 68 ALIGN(32) 69PROLOGUE(mpn_hamdist) 70 lea L(cnsts)(%rip), %r9 71 72ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)', 73 `define(`OFF1',64) define(`OFF2',80)') 74 movdqa OFF1`'(%r9), %xmm7 75 movdqa OFF2`'(%r9), %xmm6 76 pxor %xmm4, %xmm4 77 pxor %xmm5, %xmm5 78 pxor %xmm8, %xmm8 79 80 mov R32(n), R32(%rax) 81 and $7, R32(%rax) 82ifdef(`PIC',` 83 movslq (%r9,%rax,4), %rax 84 add %r9, %rax 85 jmp *%rax 86',` 87 jmp *(%r9,%rax,8) 88') 89 90L(1): movq (up), %xmm1 91 add $8, up 92 movq (vp), %xmm10 93 add $8, vp 94 pxor %xmm10, %xmm1 95 jmp L(e1) 96 97L(2): add $-48, up 98 add $-48, vp 99 jmp L(e2) 100 101L(3): movq (up), %xmm1 102 add $-40, up 103 movq (vp), %xmm10 104 add $-40, vp 105 pxor %xmm10, %xmm1 106 jmp L(e3) 107 108L(4): add $-32, up 109 add $-32, vp 110 jmp L(e4) 111 112L(5): movq (up), %xmm1 113 add $-24, up 114 movq (vp), %xmm10 115 add $-24, vp 116 pxor %xmm10, %xmm1 117 jmp L(e5) 118 119L(6): add $-16, up 120 add $-16, vp 121 jmp L(e6) 122 123L(7): movq (up), %xmm1 124 add $-8, up 125 movq (vp), %xmm10 126 add $-8, vp 127 pxor %xmm10, %xmm1 128 jmp L(e7) 129 130 ALIGN(32) 131L(top): lddqu (up), %xmm1 132 lddqu (vp), %xmm10 133 pxor %xmm10, %xmm1 134L(e7): movdqa %xmm6, %xmm0 C copy mask register 135 movdqa %xmm7, %xmm2 C copy count register 136 movdqa %xmm7, %xmm3 C copy count register 137 pand %xmm1, %xmm0 138 psrlw $4, %xmm1 139 pand %xmm6, %xmm1 140 pshufb %xmm0, %xmm2 141 pshufb %xmm1, %xmm3 142 paddb %xmm2, %xmm3 143 paddb %xmm3, %xmm4 144L(e6): lddqu 16(up), %xmm1 145 lddqu 16(vp), %xmm10 146 pxor %xmm10, %xmm1 147L(e5): movdqa %xmm6, %xmm0 148 movdqa %xmm7, %xmm2 149 movdqa %xmm7, %xmm3 150 pand %xmm1, %xmm0 151 psrlw $4, %xmm1 152 pand %xmm6, %xmm1 153 pshufb %xmm0, %xmm2 154 pshufb %xmm1, %xmm3 155 paddb %xmm2, %xmm3 156 paddb %xmm3, %xmm4 157L(e4): lddqu 32(up), %xmm1 158 lddqu 32(vp), %xmm10 159 pxor %xmm10, %xmm1 160L(e3): movdqa %xmm6, %xmm0 
161 movdqa %xmm7, %xmm2 162 movdqa %xmm7, %xmm3 163 pand %xmm1, %xmm0 164 psrlw $4, %xmm1 165 pand %xmm6, %xmm1 166 pshufb %xmm0, %xmm2 167 pshufb %xmm1, %xmm3 168 paddb %xmm2, %xmm3 169 paddb %xmm3, %xmm4 170L(e2): lddqu 48(up), %xmm1 171 add $64, up 172 lddqu 48(vp), %xmm10 173 add $64, vp 174 pxor %xmm10, %xmm1 175L(e1): movdqa %xmm6, %xmm0 176 movdqa %xmm7, %xmm2 177 movdqa %xmm7, %xmm3 178 pand %xmm1, %xmm0 179 psrlw $4, %xmm1 180 pand %xmm6, %xmm1 181 pshufb %xmm0, %xmm2 182 pshufb %xmm1, %xmm3 183 psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts 184 paddb %xmm2, %xmm3 185 paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts 186 movdqa %xmm3, %xmm4 187 sub $8, n 188 jg L(top) 189 190 psadbw %xmm5, %xmm4 191 paddq %xmm4, %xmm8 192 pshufd $14, %xmm8, %xmm0 193 paddq %xmm8, %xmm0 194 movq %xmm0, %rax 195 ret 196EPILOGUE() 197DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') 198 JMPENT( L(top), L(cnsts)) 199 JMPENT( L(1), L(cnsts)) 200 JMPENT( L(2), L(cnsts)) 201 JMPENT( L(3), L(cnsts)) 202 JMPENT( L(4), L(cnsts)) 203 JMPENT( L(5), L(cnsts)) 204 JMPENT( L(6), L(cnsts)) 205 JMPENT( L(7), L(cnsts)) 206 .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 207 .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 208 .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f 209 .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f 210END_OBJECT(L(cnsts)) 211