dnl  AMD64 SSSE3 mpn_hamdist -- hamming distance.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C		    cycles/limb	  good for cpu?
C AMD K8,K9		n/a
C AMD K10		n/a
C AMD bd1		 ?
C AMD bd2		 ?
C AMD bd3		 ?
C AMD bd4		 ?
C AMD zen		 ?
C AMD bobcat		 ?
C AMD jaguar		 ?
C Intel P4		n/a
C Intel CNR		 4.50		y
C Intel PNR		 3.28		y
C Intel NHM		 ?
C Intel SBR		 ?
C Intel IBR		 ?
C Intel HWL		 ?
C Intel BWL		 ?
C Intel SKL		 ?
C Intel atom		 ?
C Intel SLM		 ?
C VIA nano		 ?

C TODO
C  * This was hand-written without too much thought about optimal insn
C    selection; check to see if it can be improved.
C  * Consider doing some instruction scheduling.

C Operand registers.  The code below refers to them only by these names.
C   up  pointer to the first source vector
C   vp  pointer to the second source vector
C   n   vector length, counted in 8-byte limbs
define(`up',		`%rdi')
define(`vp',		`%rsi')
define(`n',		`%rdx')

ASM_START()
	TEXT
	ALIGN(32)
C mpn_hamdist -- count the bit positions in which two limb vectors differ.
C
C In:   up (%rdi), vp (%rsi)  the two source vectors, only read
C       n (%rdx)              length of both vectors in 8-byte limbs
C Out:  %rax                  number of set bits in up XOR vp
C Uses: %rax, %r9, %xmm0-%xmm8, %xmm10, flags; up, vp and n are modified
C
C NOTE(review): n is presumably >= 1.  With n = 0 the dispatch below enters
C L(top), so a full block of 8 limbs would be read and counted.
C NOTE(review): %xmm6-%xmm8 and %xmm10 are written without being saved and
C the operands are taken from %rdi, %rsi, %rdx, so this looks like System V
C only code -- confirm it is never selected for the Windows ABI.
C
C Method: every 16-byte chunk of up XOR vp is split into its low and high
C nibbles, each nibble is replaced by its bit count through a pshufb lookup
C in a 16-entry table, and the resulting byte counts are summed.  The loop
C handles 8 limbs, i.e. four chunks, per pass.
C
C Register roles:
C   %r9     base of L(cnsts): jump table followed by the vector constants
C   %xmm7   bit count of each nibble value, the pshufb lookup table
C   %xmm6   0x0f in every byte, the nibble mask
C   %xmm5   zero, second operand of psadbw
C   %xmm4   per-byte counts not yet folded into the totals
C   %xmm8   2 x 64-bit running totals
C   %xmm1   chunk under test, %xmm10 the vp operand of the XOR
C   %xmm0, %xmm2, %xmm3  scratch
PROLOGUE(mpn_hamdist)
	lea	L(cnsts)(%rip), %r9

C The two vector constants follow the 8-entry jump table.  The table is
C indexed with scale 4 when PIC and with scale 8 otherwise, so the constants
C sit 32 and 48, respectively 64 and 80, bytes past the base.
ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
	     `define(`OFF1',64) define(`OFF2',80)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble bit-count table
	movdqa	OFF2`'(%r9), %xmm6	C nibble mask
	pxor	%xmm4, %xmm4		C no pending byte counts yet
	pxor	%xmm5, %xmm5		C constant zero for psadbw
	pxor	%xmm8, %xmm8		C totals start at zero

C Dispatch on n mod 8: remainder 0 enters at L(top), remainder k at L(k).
C The PIC table holds offsets relative to L(cnsts), the non-PIC table holds
C absolute addresses.
	mov	R32(n), R32(%rax)
	and	$7, R32(%rax)
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')

C Entry stubs for n mod 8 = 1..7.  An odd remainder first loads one limb
C with movq, which clears the upper half of the register, and XORs it.  The
C pointers are then biased so that the fixed displacements in the loop body
C address the remaining limbs of the partial pass, and so that the add of 64
C at L(e2) leaves up and vp just past the limbs consumed.  L(1) joins after
C that add and therefore advances the pointers by one limb itself.

L(1):	movq	(up), %xmm1
	add	$8, up
	movq	(vp), %xmm10
	add	$8, vp
	pxor	%xmm10, %xmm1
	jmp	L(e1)

L(2):	add	$-48, up
	add	$-48, vp
	jmp	L(e2)

L(3):	movq	(up), %xmm1
	add	$-40, up
	movq	(vp), %xmm10
	add	$-40, vp
	pxor	%xmm10, %xmm1
	jmp	L(e3)

L(4):	add	$-32, up
	add	$-32, vp
	jmp	L(e4)

L(5):	movq	(up), %xmm1
	add	$-24, up
	movq	(vp), %xmm10
	add	$-24, vp
	pxor	%xmm10, %xmm1
	jmp	L(e5)

L(6):	add	$-16, up
	add	$-16, vp
	jmp	L(e6)

L(7):	movq	(up), %xmm1
	add	$-8, up
	movq	(vp), %xmm10
	add	$-8, vp
	pxor	%xmm10, %xmm1
	jmp	L(e7)

C Main loop, 8 limbs per pass.  The even labels L(e6), L(e4), L(e2) sit in
C front of a 16-byte load, the odd labels L(e7), L(e5), L(e3), L(e1) sit
C behind it and expect the XORed data in %xmm1 already.
	ALIGN(32)
L(top):	lddqu	(up), %xmm1		C limbs 0-1
	lddqu	(vp), %xmm10
	pxor	%xmm10, %xmm1		C bits that differ
L(e7):	movdqa	%xmm6, %xmm0		C copy mask register
	movdqa	%xmm7, %xmm2		C copy count register
	movdqa	%xmm7, %xmm3		C copy count register
	pand	%xmm1, %xmm0		C low nibble of every byte
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1		C high nibble of every byte
	pshufb	%xmm0, %xmm2		C bit counts of the low nibbles
	pshufb	%xmm1, %xmm3		C bit counts of the high nibbles
	paddb	%xmm2, %xmm3		C bit count per byte, at most 8
	paddb	%xmm3, %xmm4		C add to the pending byte counts
L(e6):	lddqu	16(up), %xmm1		C limbs 2-3
	lddqu	16(vp), %xmm10
	pxor	%xmm10, %xmm1
L(e5):	movdqa	%xmm6, %xmm0		C same nibble count sequence as above
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e4):	lddqu	32(up), %xmm1		C limbs 4-5
	lddqu	32(vp), %xmm10
	pxor	%xmm10, %xmm1
L(e3):	movdqa	%xmm6, %xmm0		C same nibble count sequence as above
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e2):	lddqu	48(up), %xmm1		C limbs 6-7
	add	$64, up			C advance to the next block of 8 limbs
	lddqu	48(vp), %xmm10
	add	$64, vp
	pxor	%xmm10, %xmm1
C Last chunk of the pass.  Before its counts are stored, the pending byte
C counts are folded into the 64-bit totals.  %xmm4 holds the sum of at most
C four chunks at that point, so no byte exceeds 32 and paddb cannot wrap.
L(e1):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	psadbw	%xmm5, %xmm4		C byte sums, one per 64-bit half
	paddb	%xmm2, %xmm3
	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
	movdqa	%xmm3, %xmm4		C this chunk becomes the pending counts
	sub	$8, n
	jg	L(top)			C loop while limbs remain

C Fold the counts of the final chunk, then add the two 64-bit halves.
	psadbw	%xmm5, %xmm4
	paddq	%xmm4, %xmm8
	pshufd	$14, %xmm8, %xmm0	C upper half of the totals to the low half
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax		C result
	ret
EPILOGUE()
C Data for mpn_hamdist: an 8-entry jump table indexed by n mod 8, followed
C by two 16-byte vector constants.  The code loads the constants with movdqa
C at fixed offsets from L(cnsts), 32 and 48 when PIC and 64 and 80 otherwise,
C so the number and order of the entries below must not change.
C NOTE(review): the 16 passed to DEF_OBJECT is presumably the alignment that
C those movdqa loads rely on -- confirm against the macro definition.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(top), L(cnsts))	C n mod 8 = 0
	JMPENT(	L(1), L(cnsts))		C n mod 8 = 1
	JMPENT(	L(2), L(cnsts))		C n mod 8 = 2
	JMPENT(	L(3), L(cnsts))		C n mod 8 = 3
	JMPENT(	L(4), L(cnsts))		C n mod 8 = 4
	JMPENT(	L(5), L(cnsts))		C n mod 8 = 5
	JMPENT(	L(6), L(cnsts))		C n mod 8 = 6
	JMPENT(	L(7), L(cnsts))		C n mod 8 = 7
C Number of set bits in each nibble value 0..15, the pshufb lookup table.
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Nibble mask, 0x0f in every byte.
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
