popham.asm revision 1.1.1.2
dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.

dnl  Copyright 2004, 2005, 2007, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C		     popcount	      hamdist
C		    cycles/limb	    cycles/limb
C AMD K8,K9		 6		 7
C AMD K10		 6		 7
C Intel P4		12		14.3
C Intel core2		 7		 8
C Intel corei		 ?		 7.3
C Intel atom		16.5		17.5
C VIA nano		 8.75		10.4

C TODO
C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
C    hamdist for K8/K9.
C Per-operation configuration.  One of OPERATION_popcount and
C OPERATION_hamdist is expected to be defined by the build (TODO confirm
C against the build rules).  The selected block names the entry point,
C assigns the registers, and decides which of the two line filters POP and
C HAM passes its argument through; the other one expands to dnl and drops
C the line it is used on.

ifdef(`OPERATION_popcount',`
  define(`func',`mpn_popcount')
  define(`up',		`%rdi')
  define(`n',		`%rsi')
  define(`h55555555',	`%r10')
  define(`h33333333',	`%r11')
  define(`h0f0f0f0f',	`%rcx')
  define(`h01010101',	`%rdx')
  define(`POP',		`$1')
  define(`HAM',		`dnl')
')
ifdef(`OPERATION_hamdist',`
  define(`func',`mpn_hamdist')
  define(`up',		`%rdi')
  define(`vp',		`%rsi')
  define(`n',		`%rdx')
  define(`h55555555',	`%r10')
  define(`h33333333',	`%r11')
  define(`h0f0f0f0f',	`%rcx')
  define(`h01010101',	`%r14')
  define(`POP',		`dnl')
  define(`HAM',		`$1')
')


MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ---------------------------------------------------------------------------
C mpn_popcount (up, n)      -- number of set bits in the n limbs at up
C mpn_hamdist (up, vp, n)   -- number of set bits in up[i] xor vp[i], i < n
C
C Syntax:  AT&T, preprocessed by m4.  Limbs are 64 bits wide.
C In:      up (and vp for hamdist) = limb pointers, n = limb count
C Out:     %rax = bit total.  A zero count returns 0.
C Saved:   %r12, %r13 and, for hamdist only, %r14 are pushed on entry and
C          popped again at L(end).
C Entry:   FUNC_ENTRY and FUNC_EXIT come from the included configuration;
C          presumably they map the DOS64 argument registers onto the STD64
C          ones used here -- confirm against their definitions.
C
C Register roles inside the function:
C   up, vp         advanced to one past the last limb
C   n              negated count, used as an index that counts up to zero
C   %rax           running total
C   %r8, %r9       odd leading limb, and the first limb of each pair
C   %r12, %r13     second limb of each pair
C   h55555555, h33333333, h0f0f0f0f, h01010101
C                  the mask and multiplier constants loaded below
C
C Per-limb method, for a 64-bit value x:
C   x - ((x >> 1) & 0x55..)                   32 2-bit counts
C   (x & 0x33..) + ((x >> 2) & 0x33..)        16 4-bit counts
C   (x & 0x0f..) + ((x >> 4) & 0x0f..)         8 8-bit counts
C   (x * 0x0101..) >> 56                       the sum of those 8 counts
C The main loop handles two limbs per iteration and adds their 4-bit counts
C together before the last two steps, so those steps run once per pair.
C ---------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(func)
 POP(`	FUNC_ENTRY(2)		')
 HAM(`	FUNC_ENTRY(3)		')
	push	%r12
	push	%r13
 HAM(`	push	%r14		')

	mov	$0x5555555555555555, h55555555
	mov	$0x3333333333333333, h33333333
	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
	mov	$0x0101010101010101, h01010101

	lea	(up,n,8), up
 HAM(`	lea	(vp,n,8), vp	')

	xor	R32(%rax), R32(%rax)		C total = 0, done first so the flags of neg survive
	neg	n				C index runs from -n up to 0; ZF set iff n was 0
	jz	L(end)				C empty operand: return 0 without touching memory

	bt	$0, R32(n)			C odd count: peel one limb so the loop sees pairs
	jnc	L(top)

	mov	(up,n,8), %r8
 HAM(`	xor	(vp,n,8), %r8	')

	mov	%r8, %r9
	shr	%r8
	and	h55555555, %r8
	sub	%r8, %r9			C 32 2-bit fields (0..2)

	mov	%r9, %r8
	shr	$2, %r9
	and	h33333333, %r8
	and	h33333333, %r9
	add	%r8, %r9			C 16 4-bit fields (0..4)

	mov	%r9, %r8
	shr	$4, %r9
	and	h0f0f0f0f, %r8
	and	h0f0f0f0f, %r9
	add	%r8, %r9			C 8 8-bit fields (0..8)

	imul	h01010101, %r9			C sum the 8 fields in high 8 bits
	shr	$56, %r9

	mov	%r9, %rax			C total was 0, so this starts it
	add	$1, n
	jz	L(end)				C that was the only limb

	ALIGN(16)
L(top):	mov	(up,n,8), %r8
	mov	8(up,n,8), %r12
 HAM(`	xor	(vp,n,8), %r8	')
 HAM(`	xor	8(vp,n,8), %r12	')

	mov	%r8, %r9
	mov	%r12, %r13
	shr	%r8
	shr	%r12
	and	h55555555, %r8
	and	h55555555, %r12
	sub	%r8, %r9			C 32 2-bit fields (0..2)
	sub	%r12, %r13			C 32 2-bit fields (0..2)

	mov	%r9, %r8
	mov	%r13, %r12
	shr	$2, %r9
	shr	$2, %r13
	and	h33333333, %r8
	and	h33333333, %r9
	and	h33333333, %r12
	and	h33333333, %r13
	add	%r8, %r9			C 16 4-bit fields (0..4)
	add	%r12, %r13			C 16 4-bit fields (0..4)

	add	%r13, %r9			C 16 4-bit fields (0..8)
	mov	%r9, %r8
	shr	$4, %r9
	and	h0f0f0f0f, %r8
	and	h0f0f0f0f, %r9
	add	%r8, %r9			C 8 8-bit fields (0..16)

	imul	h01010101, %r9			C sum the 8 fields in high 8 bits
	shr	$56, %r9

	add	%r9, %rax			C add to total
	add	$2, n				C n is even and negative here; carry out means it hit 0
	jnc	L(top)

L(end):
 HAM(`	pop	%r14		')
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()