dnl  Intel P5 mpn_hamdist -- mpn hamming distance.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 14.0 cycles/limb


C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
C
C It might be possible to shave 1 cycle from the loop, and hence 2
C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
C would be 1, if the right schedule could be found (not found so far).
C Wanting to avoid potential cache bank clashes makes it tricky.

C The slightly strange quoting here helps the renaming done by tune/many.pl.
deflit(TABLE_NAME,
m4_assert_defined(`GSYM_PREFIX')
GSYM_PREFIX`'mpn_popcount``'_table')

C TABLE_NAME is GSYM_PREFIX followed by mpn_popcount_table.  The table is not
C defined in this file; the loop below indexes it with a byte value and adds
C the byte it reads back to the running total.

C Stack offsets of the three arguments at function entry.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC1, 4)

	TEXT
	ALIGN(8)

C mpn_hamdist (src1, src2, size)
C
C XOR the two operands together a byte at a time, look each resulting byte
C up in TABLE_NAME, and sum the table values.  The sum is left in eax at the
C ret.
C
C Each pass of the loop handles two bytes from each operand, so the counter
C is set to 2*size and 4*size bytes of each operand are read in total.
C
C size must be at least 1: the counter is only tested at the bottom of the
C loop, after the first pass has already been made.
C
C esi, edi, ebx (and ebp when PIC) are pushed on entry and popped before
C returning.  FRAME_pushl and FRAME_popl come from the included m4 files and
C appear to keep the PARAM_ offsets in step with the pushes and pops --
C confirm there.

PROLOGUE(mpn_hamdist)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%esi		FRAME_pushl()

	shll	%ecx		C size in byte pairs
	pushl	%edi		FRAME_pushl()

ifdef(`PIC',`
	C PIC: ebp is needed to hold the table address, so it is saved too.
	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

	C The call pushes the address of L(here); it is popped into ebp below
	C and then adjusted to the address of the global offset table.
	call	L(here)		FRAME_pushl()
L(here):
	movl	PARAM_SRC1, %esi
	popl	%ebp		FRAME_popl()

	movl	PARAM_SRC2, %edi
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp

	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte

	C Replace the GOT address in ebp by the table address from its GOT entry.
	movl	TABLE_NAME@GOT(%ebp), %ebp
	xorl	%eax, %eax	C total
define(TABLE,`(%ebp,$1)')

',`
dnl non-PIC
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi

	xorl	%eax, %eax	C total
	pushl	%ebx		FRAME_pushl()

	xorl	%edx, %edx	C byte
	xorl	%ebx, %ebx	C byte

	C Non-PIC: the table is addressed directly by its symbol.
define(TABLE,`TABLE_NAME($1)')
')


	C The nop after the xorb seems necessary.  Although a movb might be
	C expected to go down the V pipe in the second cycle of the xorb, it
	C doesn't and costs an extra 2 cycles.
	C
	C The loop is software pipelined: the two table values fetched at the
	C bottom of one pass are added to the total at the top of the next.
	C ebx and edx are zero on entry, so the first pass adds nothing, and
	C the values from the last pass are added after the loop.
	C
	C Only bl and dl are written after ebx and edx are cleared, so their
	C upper 24 bits stay zero and the full registers can be used both as
	C table indices and as 32-bit addends.
L(top):
	C eax	total
	C ebx	byte
	C ecx	counter, 2*size to 1
	C edx	byte
	C esi	src1
	C edi	src2
	C ebp	[PIC] table

	addl	%ebx, %eax		C add table value from previous pass
	movb	-1(%esi,%ecx,2), %bl	C src1 byte at offset 2*ecx-1

	addl	%edx, %eax		C add table value from previous pass
	movb	-1(%edi,%ecx,2), %dl	C src2 byte at offset 2*ecx-1

	xorb	%dl, %bl		C bl = src1 byte ^ src2 byte
	movb	-2(%esi,%ecx,2), %dl	C src1 byte at offset 2*ecx-2

	xorb	-2(%edi,%ecx,2), %dl	C dl = src1 byte ^ src2 byte
	nop

	movb	TABLE(%ebx), %bl	C table value for the first xor byte
	decl	%ecx

	movb	TABLE(%edx), %dl	C table value for the second xor byte
	jnz	L(top)			C flags from decl; movb leaves them alone


	C Restore the saved registers in reverse push order, adding in the
	C two table values still pending from the final pass.
ifdef(`PIC',`
	popl	%ebp
')
	addl	%ebx, %eax
	popl	%ebx

	addl	%edx, %eax
	popl	%edi

	popl	%esi

	ret

EPILOGUE()