dnl  Intel P5 mpn_hamdist -- mpn hamming distance.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 14.0 cycles/limb


C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
C
C It might be possible to shave 1 cycle from the loop, and hence 2
C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
C would be 1, if the right schedule could be found (not found so far).
C Wanting to avoid potential cache bank clashes makes it tricky.

C TABLE_NAME is the symbol of the byte lookup table indexed in the loop:
C the expansion of GSYM_PREFIX followed by mpn_popcount_table.  The table
C itself is not defined in this file; presumably it holds the bit count of
C each byte value (TODO confirm against the popcount source).
C
C The slightly strange quoting here helps the renaming done by tune/many.pl.
deflit(TABLE_NAME,
m4_assert_defined(`GSYM_PREFIX')
GSYM_PREFIX`'mpn_popcount``'_table')

C Stack parameters, in the order of the prototype above.  The numbers are
C presumably byte offsets from the return address at function entry, with
C defframe and FRAME (defined outside this file) adjusting for later pushes.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC1, 4)

	TEXT
	ALIGN(8)

C mpn_hamdist -- count the bit positions at which src1 and src2 differ.
C
C Stack arguments: PARAM_SRC1 and PARAM_SRC2 (operand pointers) and
C PARAM_SIZE (length in limbs).  The total is returned in eax.
C
C Each operand is read as 4*size bytes.  Every loop pass xors two pairs of
C bytes, looks both results up in the byte table TABLE_NAME and adds the
C table entries to the total.  The two adds are deferred to the start of
C the next pass (adding zero on the first pass) and are completed after the
C loop.
C
C size must be nonzero: the loop body runs once before the counter is
C tested.
C
C Saved and restored here: esi, edi, ebx, plus ebp in the PIC case.
C
C The instruction order is a hand schedule for the P5 (see the notes at
C the loop), so it should not be rearranged casually.
PROLOGUE(mpn_hamdist)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%esi	FRAME_pushl()

	shll	%ecx		C size in byte pairs
	pushl	%edi	FRAME_pushl()

ifdef(`PIC',`
	C ebp is needed to hold the table address so it is saved too.
	pushl	%ebx	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	C call then pop leaves the address of the here label in ebp.  FRAME
	C is adjusted around the pair so that PARAM_SRC1 is fetched with the
	C right offset while the return address is still on the stack.
	call	L(here)	FRAME_pushl()
L(here):
	movl	PARAM_SRC1, %esi
	popl	%ebp	FRAME_popl()

	C Turn the address in ebp into the GOT base.
	movl	PARAM_SRC2, %edi
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp

	C Only bl and dl are written from here on so the upper bits of ebx
	C and edx stay zero.
	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte

	C Replace the GOT base by the table address loaded from the GOT.
	movl	TABLE_NAME@GOT(%ebp), %ebp
	xorl	%eax, %eax	C total
define(TABLE,`(%ebp,$1)')

',`
dnl non-PIC
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi

	xorl	%eax, %eax	C total
	pushl	%ebx	FRAME_pushl()

	C Only bl and dl are written from here on so the upper bits of ebx
	C and edx stay zero.
	xorl	%edx, %edx	C byte
	xorl	%ebx, %ebx	C byte

define(TABLE,`TABLE_NAME($1)')
')


	C The nop after the xorb seems necessary.  Although a movb might be
	C expected to go down the V pipe in the second cycle of the xorb, it
	C doesn't and costs an extra 2 cycles.
L(top):
	C eax	total
	C ebx	byte (table entry from the previous pass, zero on the first)
	C ecx	counter, 2*size down to 1
	C edx	byte (table entry from the previous pass, zero on the first)
	C esi	src1
	C edi	src2
	C ebp	[PIC] table
	C
	C With counter value c this pass handles the bytes at offsets 2*c-1
	C and 2*c-2 of each operand.

	addl	%ebx, %eax
	movb	-1(%esi,%ecx,2), %bl

	addl	%edx, %eax
	movb	-1(%edi,%ecx,2), %dl

	xorb	%dl, %bl
	movb	-2(%esi,%ecx,2), %dl

	xorb	-2(%edi,%ecx,2), %dl
	nop

	C Replace each xored byte by its table entry.  The movb between decl
	C and jnz does not alter the flags that jnz tests.
	movb	TABLE(%ebx), %bl
	decl	%ecx

	movb	TABLE(%edx), %dl
	jnz	L(top)


	C Restore the saved registers in reverse push order, interleaved with
	C the two adds still pending from the last pass.
ifdef(`PIC',`
	popl	%ebp
')
	addl	%ebx, %eax
	popl	%ebx

	addl	%edx, %eax
	popl	%edi

	popl	%esi

	ret

EPILOGUE()
