dnl  popham.asm revision 1.1.1.1

dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.

dnl  Copyright 2004, 2005, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C		     popcount	      hamdist
C		    cycles/limb	    cycles/limb
C K8,K9:		 6		 7
C K10:			 6		 7
C P4:			12		14.3
C P6-15:		 7		 8

C TODO
C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
C    hamdist for K8/K9.
34 35 36ifdef(`OPERATION_popcount',` 37 define(`func',`mpn_popcount') 38 define(`up', `%rdi') 39 define(`n', `%rsi') 40 define(`h55555555', `%r10') 41 define(`h33333333', `%r11') 42 define(`h0f0f0f0f', `%rcx') 43 define(`h01010101', `%rdx') 44 define(`HAM', `dnl') 45') 46ifdef(`OPERATION_hamdist',` 47 define(`func',`mpn_hamdist') 48 define(`up', `%rdi') 49 define(`vp', `%rsi') 50 define(`n', `%rdx') 51 define(`h55555555', `%r10') 52 define(`h33333333', `%r11') 53 define(`h0f0f0f0f', `%rcx') 54 define(`h01010101', `%r14') 55 define(`HAM', `$1') 56') 57 58 59MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) 60 61ASM_START() 62 TEXT 63 ALIGN(32) 64PROLOGUE(func) 65 66 pushq %r12 67 pushq %r13 68 HAM(` pushq %r14 ') 69 70 movq $0x5555555555555555, h55555555 71 movq $0x3333333333333333, h33333333 72 movq $0x0f0f0f0f0f0f0f0f, h0f0f0f0f 73 movq $0x0101010101010101, h01010101 74 75 leaq (up,n,8), up 76 HAM(` leaq (vp,n,8), vp ') 77 negq n 78 79 xorl %eax, %eax 80 81 btq $0, n 82 jnc L(oop) 83 84 movq (up,n,8), %r8 85 HAM(` xorq (vp,n,8), %r8 ') 86 87 movq %r8, %r9 88 shrq %r8 89 andq h55555555, %r8 90 subq %r8, %r9 91 92 movq %r9, %r8 93 shrq $2, %r9 94 andq h33333333, %r8 95 andq h33333333, %r9 96 addq %r8, %r9 C 16 4-bit fields (0..4) 97 98 movq %r9, %r8 99 shrq $4, %r9 100 andq h0f0f0f0f, %r8 101 andq h0f0f0f0f, %r9 102 addq %r8, %r9 C 8 8-bit fields (0..16) 103 104 imulq h01010101, %r9 C sum the 8 fields in high 8 bits 105 shrq $56, %r9 106 107 addq %r9, %rax C add to total 108 addq $1, n 109 jz L(done) 110 111 ALIGN(16) 112L(oop): movq (up,n,8), %r8 113 movq 8(up,n,8), %r12 114 HAM(` xorq (vp,n,8), %r8 ') 115 HAM(` xorq 8(vp,n,8), %r12 ') 116 117 movq %r8, %r9 118 movq %r12, %r13 119 shrq %r8 120 shrq %r12 121 andq h55555555, %r8 122 andq h55555555, %r12 123 subq %r8, %r9 124 subq %r12, %r13 125 126 movq %r9, %r8 127 movq %r13, %r12 128 shrq $2, %r9 129 shrq $2, %r13 130 andq h33333333, %r8 131 andq h33333333, %r9 132 andq h33333333, %r12 133 andq h33333333, %r13 134 addq 
%r8, %r9 C 16 4-bit fields (0..4) 135 addq %r12, %r13 C 16 4-bit fields (0..4) 136 137 addq %r13, %r9 C 16 4-bit fields (0..8) 138 movq %r9, %r8 139 shrq $4, %r9 140 andq h0f0f0f0f, %r8 141 andq h0f0f0f0f, %r9 142 addq %r8, %r9 C 8 8-bit fields (0..16) 143 144 imulq h01010101, %r9 C sum the 8 fields in high 8 bits 145 shrq $56, %r9 146 147 addq %r9, %rax C add to total 148 addq $2, n 149 jnc L(oop) 150 151L(done): 152 HAM(` popq %r14 ') 153 popq %r13 154 popq %r12 155 ret 156 157EPILOGUE() 158