1dnl  Intel P5 mpn_hamdist -- mpn hamming distance.
2
3dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: 14.0 cycles/limb
35
36
37C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
38C
39C It might be possible to shave 1 cycle from the loop, and hence 2
40C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
41C would be 1, if the right schedule could be found (not found so far).
42C Wanting to avoid potential cache bank clashes makes it tricky.
43
44C The slightly strange quoting here helps the renaming done by tune/many.pl.
C
C TABLE_NAME is the symbol of the lookup table indexed by the loop below.
C Its text is the value of GSYM_PREFIX followed by mpn_popcount and the
C suffix _table, so this file uses a table it does not define itself.
C NOTE(review): deflit and m4_assert_defined come from outside this file;
C presumably deflit defines TABLE_NAME as a plain text macro and
C m4_assert_defined stops the build when GSYM_PREFIX is undefined -- confirm
C against the m4 definitions pulled in through config.m4.
45deflit(TABLE_NAME,
46m4_assert_defined(`GSYM_PREFIX')
47GSYM_PREFIX`'mpn_popcount``'_table')
48
49C FIXME: referencing popcount.asm's table is incorrect as it hurts incremental
50C linking.
51
52defframe(PARAM_SIZE,12)
53defframe(PARAM_SRC2, 8)
54defframe(PARAM_SRC1, 4)
C The three defframe lines above name the stack arguments.  The offsets 4, 8
C and 12 follow the argument order of the C prototype: src1, src2, size.
C NOTE(review): defframe and the FRAME bookkeeping macros are defined outside
C this file; presumably each FRAME_pushl() records a push so that the PARAM_
C names keep addressing the same arguments -- confirm.
55
56	TEXT
57	ALIGN(8)
58
C mpn_hamdist
C
C Returns in eax the sum, over every byte position of the two operands, of
C the table entry indexed by (src1 byte XOR src2 byte).
C NOTE(review): the table is presumably the 256-entry bit-count table of
C mpn_popcount, which would make the result the number of differing bits;
C the table contents are not visible in this file -- confirm.
C
C Inputs (stack):  PARAM_SRC1, PARAM_SRC2 operand pointers, PARAM_SIZE count.
C Output:          eax = accumulated total.
C Saved/restored:  esi, edi, ebx, plus ebp in the PIC build.
C Also written:    ecx, edx and the flags.
C
C The loop body runs 2*size times and reads two bytes of each operand per
C pass, i.e. 4*size bytes per operand, from the highest address downwards.
C There is no test before the first pass, so a size of 0 is not handled.
59PROLOGUE(mpn_hamdist)
60deflit(`FRAME',0)
61
	C ecx = size, doubled below to become the loop count.
62	movl	PARAM_SIZE, %ecx
63	pushl	%esi	FRAME_pushl()
64
65	shll	%ecx		C size in byte pairs
66	pushl	%edi	FRAME_pushl()
67
68ifdef(`PIC',`
	C PIC build: ebp will hold the table address, so ebx and ebp are both
	C saved here.
69	pushl	%ebx	FRAME_pushl()
70	pushl	%ebp	FRAME_pushl()
71ifdef(`DARWIN',`
	C Darwin: the LEA macro puts the table address in ebp.
72	movl	PARAM_SRC1, %esi
73	movl	PARAM_SRC2, %edi
74	LEA(	TABLE_NAME, %ebp)
75	xorl	%ebx, %ebx	C byte
76	xorl	%edx, %edx	C byte
77	xorl	%eax, %eax	C total
78',`
	C Other PIC targets: call/pop gives the address of the here label,
	C the addl turns that into the GOT address, and the table address is
	C then loaded from its GOT entry.  The call pushes a return address
	C and the popl removes it, hence the FRAME_pushl/FRAME_popl pair
	C around the PARAM_SRC1 load.
79	call	L(here)	FRAME_pushl()
80L(here):
81	movl	PARAM_SRC1, %esi
82	popl	%ebp	FRAME_popl()
83
84	movl	PARAM_SRC2, %edi
85	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
86
87	xorl	%ebx, %ebx	C byte
88	xorl	%edx, %edx	C byte
89
90	movl	TABLE_NAME@GOT(%ebp), %ebp
91	xorl	%eax, %eax	C total
92')
	C PIC table access: base register ebp plus the index register.
93define(TABLE,`(%ebp,$1)')
94',`
95dnl non-PIC
	C Non-PIC build: ebp is not used, so only ebx is saved in addition to
	C esi and edi.  The table is addressed by symbol plus index register.
96	movl	PARAM_SRC1, %esi
97	movl	PARAM_SRC2, %edi
98
99	xorl	%eax, %eax	C total
100	pushl	%ebx	FRAME_pushl()
101
102	xorl	%edx, %edx	C byte
103	xorl	%ebx, %ebx	C byte
104
105define(TABLE,`TABLE_NAME($1)')
106')
107
108
109	C The nop after the xorb seems necessary.  Although a movb might be
110	C expected to go down the V pipe in the second cycle of the xorb, it
111	C doesn't and costs an extra 2 cycles.
	C
	C ebx and edx were cleared in the prologue and are afterwards written
	C only through bl and dl, so their upper 24 bits stay zero and they
	C can be used directly as table indices.
	C
	C The two table results of one pass are added to the total at the
	C start of the next pass (both are zero on the first pass); the
	C results of the last pass are added after the loop.
	C
	C NOTE(review): the two-line groups separated by blank lines
	C presumably mark intended instruction pairs -- keep the order as is.
112L(top):
113	C eax	total
114	C ebx	byte
115	C ecx	counter, 2*size down to 1
116	C edx	byte
117	C esi	src1
118	C edi	src2
119	C ebp	[PIC] table
120
121	addl	%ebx, %eax		C total += first result of previous pass
122	movb	-1(%esi,%ecx,2), %bl	C src1 byte at offset 2*ecx-1
123
124	addl	%edx, %eax		C total += second result of previous pass
125	movb	-1(%edi,%ecx,2), %dl	C src2 byte at offset 2*ecx-1
126
127	xorb	%dl, %bl		C bl = XOR of the bytes at 2*ecx-1
128	movb	-2(%esi,%ecx,2), %dl	C src1 byte at offset 2*ecx-2
129
130	xorb	-2(%edi,%ecx,2), %dl	C dl = XOR of the bytes at 2*ecx-2
131	nop
132
133	movb	TABLE(%ebx), %bl	C bl = table entry for the first XOR
134	decl	%ecx			C sets the flags tested by the jnz
135
136	movb	TABLE(%edx), %dl	C movb leaves the decl flags intact
137	jnz	L(top)
138
139
140ifdef(`PIC',`
141	popl	%ebp	C pushed only in the PIC prologue
142')
	C Add the two results of the final pass while restoring the saved
	C registers in reverse push order.
143	addl	%ebx, %eax
144	popl	%ebx
145
146	addl	%edx, %eax
147	popl	%edi
148
149	popl	%esi
150
151	ret
152
153EPILOGUE()
154ASM_END()
155