dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C		    cycles/limb	  good for cpu?
C AMD K8,K9		n/a
C AMD K10		n/a
C AMD bd1	     1.51-2.0		y
C AMD bd2	     1.50-1.9		y
C AMD bd3		 ?
C AMD bd4		 ?
C AMD zen		n/a
C AMD bobcat		n/a
C AMD jaguar		n/a
C Intel P4		n/a
C Intel PNR		n/a
C Intel NHM		n/a
C Intel SBR		n/a
C Intel IBR		n/a
C Intel HWL		n/a
C Intel BWL		n/a
C Intel SKL		n/a
C Intel atom		n/a
C Intel SLM		n/a
C VIA nano		n/a

C TODO
C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
C    intend to support old systems.  (The three XOP instructions are already
C    emitted with .byte below; popcnt is still written as a mnemonic.)

C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
C We fall back to the core2 code.
ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
MULFUNC_PROLOGUE(mpn_hamdist)
include_mpn(`x86_64/core2/hamdist.asm')
',`

C Operand names for the XOP implementation.  n counts 8-byte limbs.
define(`up',		`%rdi')	C first source operand pointer
define(`vp',		`%rsi')	C second source operand pointer
define(`n',		`%rdx')	C number of limbs in each operand

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C mpn_hamdist (up, vp, n) -- count the bit positions in which the n-limb
C operands at up and vp differ, i.e. the sum of popcount(up[i] ^ vp[i]).
C
C In:   up, vp = operand pointers, n = limb count (8-byte limbs)
C Out:  %rax = number of differing bits
C Uses: %r8-%r11, %xmm0-%xmm8, flags
C
C n < 5 runs a one-limb-at-a-time popcnt loop.  That loop reads the first
C limb before it tests n, so n is assumed to be at least 1.
C
C Otherwise one limb is consumed if needed so that vp sits on a 16-byte
C boundary (pxor reads vp as an aligned memory operand; this assumes vp is
C limb aligned).  Each pass of the main loop then covers 8 limbs: 6 through
C the XOP nibble-table lookup and 2 through scalar popcnt.  The loop is
C entered through a jump table indexed by n & 6, so the first pass covers
C n & 6 limbs (or a full 8 when n & 6 is zero).  A final odd limb is handled
C after the loop.
C
C FUNC_ENTRY, FUNC_EXIT and IFDOS are not defined in this file; IFDOS is
C assumed to expand its argument only when building for the DOS64 ABI.
PROLOGUE(mpn_hamdist)
	FUNC_ENTRY(3)
	cmp	$5, n
	jl	L(sma)			C n < 5: scalar loop, no xmm use

	lea	L(cnsts)(%rip), %r9	C r9 = jump table + vector constants

	xor	R32(%r10), R32(%r10)	C r10 = scalar popcnt accumulator
	test	$8, R8(vp)		C is vp on a 16-byte boundary?
	jz	L(ali)
	mov	(up), %r8		C no: count one limb to align vp
	xor	(vp), %r8
	add	$8, up
	add	$8, vp
	dec	n
	popcnt	%r8, %r10
L(ali):

C xmm6, xmm7 and xmm8 are overwritten below.  The Windows x64 convention
C treats xmm6-xmm15 as callee-saved, so under the DOS64 ABI spill them here
C and reload them before FUNC_EXIT.  The small-operand path branches away
C before this point and never touches xmm registers.  movdqu is used so that
C no assumption about stack alignment is needed, and lea adjusts the stack
C pointer without touching flags.
IFDOS(`	lea	-48(%rsp), %rsp	')
IFDOS(`	movdqu	%xmm6, (%rsp)	')
IFDOS(`	movdqu	%xmm7, 16(%rsp)	')
IFDOS(`	movdqu	%xmm8, 32(%rsp)	')

C The jump table occupies 16 bytes with PIC (4-byte entries) and 32 bytes
C otherwise (8-byte entries); the vector constants follow it.
ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
	movdqa	OFF3`'(%r9), %xmm5	C masks
	pxor	%xmm4, %xmm4		C per-byte running counts
	pxor	%xmm8, %xmm8		C grand total count

	mov	R32(n), R32(%rax)
	and	$6, R32(%rax)		C rax = n & 6 selects the loop entry
	lea	-64(up,%rax,8), up	C bias pointers so that the chosen
	lea	-64(vp,%rax,8), vp	C entry reads the first pending limb
ifdef(`PIC',`
	movslq	(%r9,%rax,2), %r11	C table holds offsets from L(cnsts)
	add	%r9, %r11
	jmp	*%r11
',`
	jmp	*(%r9,%rax,4)
')

C Entry for n & 6 = 0: undo the pointer bias and run a full 8-limb pass.
C Subtracting 2 gives n the same even bias the other entries end up with,
C so the loop test and the final odd-limb test work for every entry.
L(0):	add	$64, up
	add	$64, vp
	sub	$2, n

	ALIGN(32)
L(top):	lddqu	(up), %xmm0
	pxor	(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0			C low nibbles
	pand	%xmm5, %xmm1			C high nibbles
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(6):	lddqu	16(up), %xmm0
	pxor	16(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
C In the third chunk the byte counts gathered so far in xmm4 are folded into
C two 64-bit sums, and xmm4 is then restarted from the counts of this chunk.
L(4):	lddqu	32(up), %xmm0
	pxor	32(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
C NOTE(review): xmm3 is not read again before L(top) overwrites it, so the
C next paddb looks like a leftover; kept to leave the schedule unchanged.
	paddb	%xmm2, %xmm3
	paddb	%xmm2, %xmm4
	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
L(2):	mov	48(up), %r8		C last two limbs of the pass: scalar
	mov	56(up), %r9		C r9 is free, constants are loaded
	add	$64, up
	xor	48(vp), %r8
	xor	56(vp), %r9
	add	$64, vp
	popcnt	%r8, %r8
	popcnt	%r9, %r9
	add	%r8, %r10
	add	%r9, %r10
	sub	$8, n
	jg	L(top)

	test	$1, R8(n)		C bias on n is even: bit 0 still
	jz	L(x)			C tells whether one limb remains
	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %r8
	add	%r8, %r10
L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	paddq	%xmm0, %xmm8
	pshufd	$14, %xmm8, %xmm0	C high qword of xmm8 to low qword
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax
	add	%r10, %rax		C vector total + scalar total
IFDOS(`	movdqu	(%rsp), %xmm6	')
IFDOS(`	movdqu	16(%rsp), %xmm7	')
IFDOS(`	movdqu	32(%rsp), %xmm8	')
IFDOS(`	lea	48(%rsp), %rsp	')
	FUNC_EXIT()
	ret

C Small operands: one limb per iteration, accumulating directly in rax.
L(sma):	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %rax
	dec	n
	jz	L(ed)
L(tp):	mov	8(up), %r8
	add	$8, up
	xor	8(vp), %r8
	add	$8, vp
	popcnt	%r8, %r8
	add	%r8, %rax
	dec	n
	jnz	L(tp)
L(ed):	FUNC_EXIT()
	ret
EPILOGUE()
C Constant pool addressed through %r9 by mpn_hamdist, 16-byte aligned so the
C vector constants can be fetched with movdqa:
C   - four jump table entries for n & 6 = 0, 2, 4, 6
C   - popcount of each nibble value 0..15, used as the vpperm lookup table
C   - sixteen byte shift counts of -4 for vpshlb (a negative count shifts
C     right, per the XOP definition)
C   - sixteen 0x0f bytes, the nibble mask
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(0), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
')
207244541Sbrooks