1dnl  AMD64 mpn_hamdist -- hamming distance.
2
3dnl  Copyright 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34C		    cycles/limb
35C AMD K8,K9		 n/a
36C AMD K10		 3.26
37C AMD bd1		 4.2
38C AMD bd2		 4.2
39C AMD bd3		 ?
40C AMD bd4		 ?
41C AMD zen		 1.15
42C AMD bobcat		 7.29
43C AMD jaguar		 2.53
44C Intel P4		 n/a
45C Intel core2		 n/a
46C Intel NHM		 2.03
47C Intel SBR		 1.66
48C Intel IBR		 1.62
49C Intel HWL		 1.50
50C Intel BWL		 1.50
51C Intel SKL		 1.50
52C Intel atom		 n/a
53C Intel SLM		 2.55
54C VIA nano		 n/a
55
56C TODO
57C  * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
58C    Intel hardware.  Perhaps mix such a loop with popcnt instructions.
59C  * The random placement of the L0, L1, L2, etc blocks is due to branch
60C    shortening.  More work could be done there.
61C  * Combine the accumulators rax and rcx into one register to save some
62C    bookkeeping and a push/pop pair.  Unfortunately this causes a slight
63C    slowdown for at least NHM and SBR.
64
C Symbolic names for the three argument registers.  The code below refers to
C the operands only through these names.
65define(`up',		`%rdi')
66define(`vp',		`%rsi')
67define(`n',		`%rdx')
68
C ABI_SUPPORT is defined outside this file; these two lines presumably declare
C that the routine handles both the DOS64 and STD64 conventions -- confirm in
C the macro definition.
69ABI_SUPPORT(DOS64)
70ABI_SUPPORT(STD64)
71
C sum(a, b) adds a into b.  Two alternative expansions are given.  In m4 a
C later define replaces an earlier one, so the add form on the second line is
C the one in effect; the lea form is kept as a ready alternative.  Note that
C the add form modifies the flags while the lea form does not.
72define(`sum', `lea	($1,$2), $2')
73define(`sum', `add	$1, $2')
74
C mpn_hamdist (up, vp, n)
C
C Hamming distance of the n-limb operands at up and vp: the sum over all limbs
C of popcnt(up[i] xor vp[i]), left in rax on return.
C
C Register use in this routine:
C   up, vp    operand pointers, advanced as limbs are consumed
C   n         limbs remaining, reduced by 4 per loop pass
C   r8-r11    xored limb pairs awaiting popcnt
C   rbx, rbp  popcnt results awaiting accumulation (saved and restored here)
C   rax, rcx  two partial sums, added together at L(x2)
C
C The first limb is read before n is examined, so n is presumably required to
C be at least 1 -- confirm against the caller contract.
C
C Control flow: n mod 4 picks one of the entry sequences L(0)..L(3) through
C L(tab).  Each entry sequence preloads a few limbs, zeroes whichever of
C rbx/rbp the loop will add before writing, biases up and vp so the fixed loop
C offsets line up, and enters the 4-way unrolled loop at L(e0)..L(e3).  For
C small n (up to 4) the entry sequences go directly to the tail or the exit.
75ASM_START()
76	TEXT
77	ALIGN(32)
78PROLOGUE(mpn_hamdist)
79	FUNC_ENTRY(3)
C rbx and rbp are used as scratch below; they are restored at L(x1).
80	push	%rbx
81	push	%rbp
82
C r10 = up[0] xor vp[0]
83	mov	(up), %r10
84	xor	(vp), %r10
85
C r8 = n mod 4, the index into the dispatch table.
86	mov	R32(n), R32(%r8)
87	and	$3, R32(%r8)
88
C Start the accumulators: rcx = 0, rax = bit count of limb 0.
89	xor	R32(%rcx), R32(%rcx)
90	.byte	0xf3,0x49,0x0f,0xb8,0xc2	C popcnt %r10,%rax
91
C Dispatch on n mod 4.  The PIC variant loads a sign-extended 32-bit table
C entry and adds the table address to it; the other variant jumps through an
C 8-byte table entry.
92	lea	L(tab)(%rip), %r9
93ifdef(`PIC',`
94	movslq	(%r9,%r8,4), %r8
95	add	%r9, %r8
96	jmp	*%r8
97',`
98	jmp	*(%r9,%r8,8)
99')
100
C Entry for n mod 4 = 3.  Limbs 1 and 2 are xored into r10 and r11.  rbp is
C zeroed because the next sum on either path adds rbp into rcx before any
C popcnt has written rbp.  If n is 3, only the tail at L(x3) remains.
C Otherwise up[3] and up[4] are loaded (their xor with vp happens after
C L(e3)) and both pointers are biased by 24 bytes to match the loop offsets.
101L(3):	mov	8(up), %r10
102	mov	16(up), %r11
103	xor	8(vp), %r10
104	xor	16(vp), %r11
105	xor	R32(%rbp), R32(%rbp)
106	sub	$4, n
107	jle	L(x3)
108	mov	24(up), %r8
109	mov	32(up), %r9
110	add	$24, up
111	add	$24, vp
112	jmp	L(e3)
113
C Entry for n mod 4 = 0.  Limbs 1, 2 and 3 are xored into r9, r10 and r11.
C rbx is zeroed because the next sum, at L(e0) or L(x4), adds rbx into rax
C before any popcnt has written rbx.  If no limbs remain after these four,
C go to the tail at L(x4); otherwise fall into the loop.
114L(0):	mov	8(up), %r9
115	xor	8(vp), %r9
116	mov	16(up), %r10
117	mov	24(up), %r11
118	xor	R32(%rbx), R32(%rbx)
119	xor	16(vp), %r10
120	xor	24(vp), %r11
121	add	$32, up
122	add	$32, vp
123	sub	$4, n
124	jle	L(x4)
125
C Main loop, four limbs per pass.  Each of the four stages issues one popcnt
C on a limb xored earlier, adds the previous popcnt result to an accumulator
C and loads or xors limbs for later stages.  Results alternate between rbp
C (added to rcx) and rbx (added to rax).  L(e0)..L(e3) are the entry points
C used by the sequences for n mod 4 = 0..3.
126	ALIGN(16)
127L(top):
128L(e0):	.byte	0xf3,0x49,0x0f,0xb8,0xe9	C popcnt %r9,%rbp
129	mov	(up), %r8
130	mov	8(up), %r9
131	sum(	%rbx, %rax)
132L(e3):	.byte	0xf3,0x49,0x0f,0xb8,0xda	C popcnt %r10,%rbx
133	xor	(vp), %r8
134	xor	8(vp), %r9
135	sum(	%rbp, %rcx)
136L(e2):	.byte	0xf3,0x49,0x0f,0xb8,0xeb	C popcnt %r11,%rbp
137	mov	16(up), %r10
138	mov	24(up), %r11
139	add	$32, up
140	sum(	%rbx, %rax)
141L(e1):	.byte	0xf3,0x49,0x0f,0xb8,0xd8	C popcnt %r8,%rbx
142	xor	16(vp), %r10
143	xor	24(vp), %r11
144	add	$32, vp
145	sum(	%rbp, %rcx)
146	sub	$4, n
147	jg	L(top)
148
C Tail.  No more limbs are loaded.  L(x4) counts the limbs still held in r9,
C r10 and r11; L(x3) counts only r10 and r11.  Pending rbx and rbp values are
C folded into rax and rcx, L(x2) merges the two accumulators into rax, and
C L(x1) restores the saved registers in reverse push order and returns.
149L(x4):	.byte	0xf3,0x49,0x0f,0xb8,0xe9	C popcnt %r9,%rbp
150	sum(	%rbx, %rax)
151L(x3):	.byte	0xf3,0x49,0x0f,0xb8,0xda	C popcnt %r10,%rbx
152	sum(	%rbp, %rcx)
153	.byte	0xf3,0x49,0x0f,0xb8,0xeb	C popcnt %r11,%rbp
154	sum(	%rbx, %rax)
155	sum(	%rbp, %rcx)
156L(x2):	add	%rcx, %rax
157L(x1):	pop	%rbp
158	pop	%rbx
159	FUNC_EXIT()
160	ret
161
C Entry for n mod 4 = 2.  Limb 1 is xored into r11.  If n is 2, L(n2) puts
C its bit count straight into rcx and joins the exit at L(x2).  Otherwise
C limbs 2 and 3 are xored into r8 and r9, rbx is zeroed because the sum after
C L(e2) adds rbx into rax before rbx is written, and both pointers are biased
C by 16 bytes to match the loop offsets at L(e2).
162L(2):	mov	8(up), %r11
163	xor	8(vp), %r11
164	sub	$2, n
165	jle	L(n2)
166	mov	16(up), %r8
167	mov	24(up), %r9
168	xor	R32(%rbx), R32(%rbx)
169	xor	16(vp), %r8
170	xor	24(vp), %r9
171	add	$16, up
172	add	$16, vp
173	jmp	L(e2)
174L(n2):	.byte	0xf3,0x49,0x0f,0xb8,0xcb	C popcnt %r11,%rcx
175	jmp	L(x2)
176
C Entry for n mod 4 = 1.  If n is 1, rax already holds the result, so go
C directly to the exit at L(x1).  Otherwise limbs 1 and 2 are xored into r8
C and r9, rbp is zeroed because the sum after L(e1) adds rbp into rcx, and
C up[3] and up[4] are loaded into r10 and r11 (their xor with vp happens after
C L(e1)).  The pointer biases differ, 40 for up and 8 for vp, because at
C L(e1) the loop has already done its loads from up and its add of 32 to up
C for this pass, while the xors from 16(vp) and 24(vp) and the add of 32 to
C vp are still to come.
177L(1):	dec	n
178	jle	L(x1)
179	mov	8(up), %r8
180	mov	16(up), %r9
181	xor	8(vp), %r8
182	xor	16(vp), %r9
183	xor	R32(%rbp), R32(%rbp)
184	mov	24(up), %r10
185	mov	32(up), %r11
186	add	$40, up
187	add	$8, vp
188	jmp	L(e1)
189
190EPILOGUE()
C Dispatch table for mpn_hamdist, indexed by n mod 4: entry k refers to the
C entry sequence L(k).  The PIC dispatch code reads 4-byte entries and adds
C the address of L(tab), while the non-PIC code jumps through 8-byte entries,
C so JMPENT presumably emits the matching entry form for each build --
C confirm in the macro definition.
191	JUMPTABSECT
192	ALIGN(8)
193L(tab):	JMPENT(	L(0), L(tab))
194	JMPENT(	L(1), L(tab))
195	JMPENT(	L(2), L(tab))
196	JMPENT(	L(3), L(tab))
197