dnl  Intel P5 mpn_hamdist -- mpn hamming distance.

dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 14.0 cycles/limb


C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
C
C It might be possible to shave 1 cycle from the loop, and hence 2
C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
C would be 1, if the right schedule could be found (not found so far).
C Wanting to avoid potential cache bank clashes makes it tricky.

C TABLE_NAME is the symbol of the byte lookup table indexed in the main loop:
C GSYM_PREFIX followed by mpn_popcount_table.
C The slightly strange quoting here helps the renaming done by tune/many.pl.
deflit(TABLE_NAME,
m4_assert_defined(`GSYM_PREFIX')
GSYM_PREFIX`'mpn_popcount``'_table')

C FIXME: referencing popcount.asm's table is incorrect as it hurts incremental
C linking.

C mpn_hamdist -- hamming distance between {src1,size} and {src2,size}.
C
C Stack parameters, declared by the defframe lines below:
C	PARAM_SRC1	first source pointer
C	PARAM_SRC2	second source pointer
C	PARAM_SIZE	length, in units of 4 bytes
C Result: the accumulated total, returned in eax.
C
C Method: corresponding bytes of the two sources are XORed and the result
C is used as an index into TABLE_NAME; the byte fetched from the table is
C added to the total.  (Presumably the table holds the bit count of each
C byte value, as its name suggests -- it is defined outside this file.)
C
C The loop is entered without testing the count and is closed by decl/jnz,
C so size must be non-zero on entry.
C
C esi, edi, ebx (and ebp when PIC) are saved on entry and restored on exit.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC1, 4)

	TEXT
	ALIGN(8)

PROLOGUE(mpn_hamdist)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%esi		FRAME_pushl()

	shll	%ecx		C size in byte pairs
	pushl	%edi		FRAME_pushl()

ifdef(`PIC',`
	C PIC: ebp is used as the table base, so it is saved as well as ebx.
	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()
ifdef(`DARWIN',`
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi
	LEA(	TABLE_NAME, %ebp)
	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte
	xorl	%eax, %eax	C total
',`
	C call/pop pair: the return address pushed by the call is popped
	C straight into ebp, giving the run-time address of L(here).
	call	L(here)		FRAME_pushl()
L(here):
	movl	PARAM_SRC1, %esi
	popl	%ebp		FRAME_popl()

	movl	PARAM_SRC2, %edi
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp	C ebp = GOT

	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte

	movl	TABLE_NAME@GOT(%ebp), %ebp	C ebp = table address
	xorl	%eax, %eax	C total
')
define(TABLE,`(%ebp,$1)')
',`
dnl non-PIC
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi

	xorl	%eax, %eax	C total
	pushl	%ebx		FRAME_pushl()

	xorl	%edx, %edx	C byte
	xorl	%ebx, %ebx	C byte

define(TABLE,`TABLE_NAME($1)')
')


	C The nop after the xorb seems necessary.  Although a movb might be
	C expected to go down the V pipe in the second cycle of the xorb, it
	C doesn't and costs an extra 2 cycles.
	C
	C Each pass handles two bytes from each source, working from the
	C high end of the operands down to offset 0.  The table results of
	C one pass are added to the total at the top of the next pass (ebx
	C and edx start at zero, so the first pass adds nothing); the results
	C of the last pass are added after the loop.
	C
	C Only bl and dl are ever written after ebx and edx are cleared, so
	C the upper 24 bits of each stay zero and the full registers can be
	C used directly as table indices and as addends.
	C
	C The instruction order is hand-tuned (see the note on the nop above)
	C and should not be rearranged without re-measuring.
L(top):
	C eax	total
	C ebx	byte
	C ecx	counter, 2*size down to 1
	C edx	byte
	C esi	src1
	C edi	src2
	C ebp	[PIC] table

	addl	%ebx, %eax
	movb	-1(%esi,%ecx,2), %bl

	addl	%edx, %eax
	movb	-1(%edi,%ecx,2), %dl

	xorb	%dl, %bl
	movb	-2(%esi,%ecx,2), %dl

	xorb	-2(%edi,%ecx,2), %dl
	nop

	movb	TABLE(%ebx), %bl
	decl	%ecx

	movb	TABLE(%edx), %dl
	jnz	L(top)


	C Add the two table results still pending from the final pass while
	C restoring the saved registers in reverse order of the pushes.
ifdef(`PIC',`
	popl	%ebp
')
	addl	%ebx, %eax
	popl	%ebx

	addl	%edx, %eax
	popl	%edi

	popl	%esi

	ret

EPILOGUE()
ASM_END()