popham.asm revision 1.1.1.4
dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.

dnl  Copyright 2004, 2005, 2007, 2010-2012, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C		     popcount	      hamdist
C		    cycles/limb	    cycles/limb
C AMD K8,K9		 6		 7
C AMD K10		 6		 7
C Intel P4		12		14.3
C Intel core2		 7		 8
C Intel corei		 ?		 7.3
C Intel atom		16.5		17.5
C VIA nano		 8.75		10.4

C TODO
C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
C    hamdist for K8/K9.
48 49 50ifdef(`OPERATION_popcount',` 51 define(`func',`mpn_popcount') 52 define(`up', `%rdi') 53 define(`n', `%rsi') 54 define(`h55555555', `%r10') 55 define(`h33333333', `%r11') 56 define(`h0f0f0f0f', `%rcx') 57 define(`h01010101', `%rdx') 58 define(`POP', `$1') 59 define(`HAM', `dnl') 60') 61ifdef(`OPERATION_hamdist',` 62 define(`func',`mpn_hamdist') 63 define(`up', `%rdi') 64 define(`vp', `%rsi') 65 define(`n', `%rdx') 66 define(`h55555555', `%r10') 67 define(`h33333333', `%r11') 68 define(`h0f0f0f0f', `%rcx') 69 define(`h01010101', `%r12') 70 define(`POP', `dnl') 71 define(`HAM', `$1') 72') 73 74 75MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) 76 77ABI_SUPPORT(DOS64) 78ABI_SUPPORT(STD64) 79 80ASM_START() 81 TEXT 82 ALIGN(32) 83PROLOGUE(func) 84 POP(` FUNC_ENTRY(2) ') 85 HAM(` FUNC_ENTRY(3) ') 86 push %rbx 87 mov $0x5555555555555555, h55555555 88 push %rbp 89 mov $0x3333333333333333, h33333333 90 HAM(` push %r12 ') 91 lea (up,n,8), up 92 mov $0x0f0f0f0f0f0f0f0f, h0f0f0f0f 93 HAM(` lea (vp,n,8), vp ') 94 neg n 95 mov $0x0101010101010101, h01010101 96 xor R32(%rax), R32(%rax) 97 test $1, R8(n) 98 jz L(top) 99 100 mov (up,n,8), %r8 101 HAM(` xor (vp,n,8), %r8 ') 102 103 mov %r8, %r9 104 shr %r8 105 and h55555555, %r8 106 sub %r8, %r9 107 108 mov %r9, %r8 109 shr $2, %r9 110 and h33333333, %r8 111 and h33333333, %r9 112 add %r8, %r9 C 16 4-bit fields (0..4) 113 114 dec n 115 jmp L(mid) 116 117 ALIGN(16) 118L(top): mov (up,n,8), %r8 119 mov 8(up,n,8), %rbx 120 HAM(` xor (vp,n,8), %r8 ') 121 HAM(` xor 8(vp,n,8), %rbx ') 122 123 mov %r8, %r9 124 mov %rbx, %rbp 125 shr %r8 126 shr %rbx 127 and h55555555, %r8 128 and h55555555, %rbx 129 sub %r8, %r9 130 sub %rbx, %rbp 131 132 mov %r9, %r8 133 mov %rbp, %rbx 134 shr $2, %r9 135 shr $2, %rbp 136 and h33333333, %r8 137 and h33333333, %r9 138 and h33333333, %rbx 139 and h33333333, %rbp 140 add %r8, %r9 C 16 4-bit fields (0..4) 141 add %rbx, %rbp C 16 4-bit fields (0..4) 142 143 add %rbp, %r9 C 16 4-bit fields (0..8) 144L(mid): 
mov %r9, %r8 145 shr $4, %r9 146 and h0f0f0f0f, %r8 147 and h0f0f0f0f, %r9 148 add %r8, %r9 C 8 8-bit fields (0..16) 149 150 imul h01010101, %r9 C sum the 8 fields in high 8 bits 151 shr $56, %r9 152 153 add %r9, %rax C add to total 154 add $2, n 155 jnc L(top) 156 157L(end): 158 HAM(` pop %r12 ') 159 pop %rbp 160 pop %rbx 161 FUNC_EXIT() 162 ret 163EPILOGUE() 164