1dnl  AMD64 SSSE3 mpn_popcount -- population count.
2
3dnl  Copyright 2010-2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34C		    cycles/limb	  good for cpu?
35C AMD K8,K9		n/a
36C AMD K10		n/a
37C AMD bd1	     1.79-1.91		n
38C AMD bd2	     1.73-1.85		n
39C AMD bd3		 ?
40C AMD bd4	     1.73-1.85		n
41C AMD zen		 1.47		n
42C AMD bobcat		 8.0		n
43C AMD jaguar		 4.78		n
44C Intel P4		n/a
45C Intel CNR		 3.75
46C Intel PNR		 2.61		y
47C Intel NHM		 2.03		n
48C Intel SBR		 1.87		n
49C Intel IBR	     1.52-1.58		n
50C Intel HWL	     1.52-1.58		n
51C Intel BWL	     1.52-1.58		n
52C Intel SKL		 1.51		n
53C Intel atom		12.3		n
54C Intel SLM		 9.1		n
55C VIA nano		 ?
56
57C TODO
C  * This was hand-written without too much thought about optimal insn
C    selection; check to see if it can be improved.
60C  * Consider doing some instruction scheduling.
61
C Argument registers, both used and modified directly by mpn_popcount.
C   up   base address for every limb load
C   n    count of 8-byte limbs; its low 3 bits pick the loop entry point and
C        it is decremented by 8 for each pass through the loop body
define(`up',		`%rdi')
define(`n',		`%rsi')
64
C mpn_popcount -- return the number of set bits in the n limbs at up.
C
C  In:    up   address of the first limb; limbs are read with movq and lddqu,
C              so no particular alignment is needed
C         n    number of 8-byte limbs
C  Out:   rax  total count of 1 bits
C  Uses:  rax, r9, xmm0-xmm8 and the flags; up and n are modified as well
C
C Method: every 16-byte chunk is split into its low and high nibbles.  pshufb
C uses each nibble as an index into a 16-entry table of bit counts, and the
C two lookup results are added to give a bit count per byte.  These byte
C counts are collected in xmm4, and once per pass psadbw folds them into the
C two 64-bit totals kept in xmm8.
C
C One pass through the loop body handles 4 chunks, i.e. 8 limbs.  The n mod 8
C leftover limbs are handled first, by jumping into the middle of the body
C through the table at L(cnsts).
C
C Register roles:
C   r9         address of L(cnsts)
C   xmm7       bit counts for nibble values 0..15
C   xmm6       0x0f in every byte, the nibble mask
C   xmm5       zero, the second operand of psadbw
C   xmm4       byte-wise counts not yet added to xmm8
C   xmm8       two 64-bit running totals
C   xmm1       chunk currently being counted
C   xmm0,2,3   scratch
C
C NOTE(review): n = 0 dispatches to L(top) and counts 8 limbs, so the code
C relies on n >= 1 -- presumably guaranteed by callers, confirm.
C NOTE(review): arguments come in rdi/rsi and xmm6-xmm8 are overwritten with
C no save visible here, which fits the System V AMD64 convention only --
C confirm this file is never built for the Microsoft x64 ABI.
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)
	lea	L(cnsts)(%rip), %r9		C r9 = base of jump table and constants

C OFF1 and OFF2 are the byte offsets from L(cnsts) of the two 16-byte
C constants that follow the 8 jump table entries.  The PIC dispatch below reads
C 4-byte entries and the non-PIC dispatch reads 8-byte entries, hence 32/48
C versus 64/80.
ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
	     `define(`OFF1',64) define(`OFF2',80)')
	movdqa	OFF1`'(%r9), %xmm7		C nibble bit-count table
	movdqa	OFF2`'(%r9), %xmm6		C nibble mask
	pxor	%xmm4, %xmm4			C byte-wise accumulator = 0
	pxor	%xmm5, %xmm5			C constant zero for psadbw
	pxor	%xmm8, %xmm8			C 64-bit totals = 0

C Dispatch on n mod 8.  Only the low 3 bits of n matter, so a 32-bit copy is
C enough.  With PIC the table entry is a signed 32-bit value that is added to
C the table base r9; without PIC the entry is used directly as the target
C address.
	mov	R32(n), R32(%rax)
	and	$7, R32(%rax)
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')

C Entry stubs for n mod 8 = 1..7.  For odd counts the first limb is loaded
C with movq, which leaves the upper half of xmm1 zero so that it adds nothing
C to the count.  up is then biased so that the fixed displacements 16, 32 and
C 48 in the loop body address the remaining leftover limbs, and so that the
C add of 64 before L(e1) leaves up at the first limb of the next group of 8.
C L(1) enters after that add and therefore just steps over its single limb.
L(1):	movq	(up), %xmm1
	add	$8, up
	jmp	L(e1)

L(2):	add	$-48, up
	jmp	L(e2)

L(3):	movq	(up), %xmm1
	add	$-40, up
	jmp	L(e3)

L(4):	add	$-32, up
	jmp	L(e4)

L(5):	movq	(up), %xmm1
	add	$-24, up
	jmp	L(e5)

L(6):	add	$-16, up
	jmp	L(e6)

L(7):	movq	(up), %xmm1
	add	$-8, up
	jmp	L(e7)

C Main loop, 8 limbs per pass.  The table bytes are at most 4, so one chunk
C adds at most 8 to any byte of xmm4.  xmm4 holds at most 4 chunks worth of
C counts when it is folded into xmm8, i.e. at most 32 per byte, so the paddb
C accumulation cannot wrap.
	ALIGN(32)
L(top):	lddqu	(up), %xmm1		C chunk 0, unaligned load
L(e7):	movdqa	%xmm6, %xmm0		C copy mask register
	movdqa	%xmm7, %xmm2		C copy count register
	movdqa	%xmm7, %xmm3		C copy count register
	pand	%xmm1, %xmm0		C xmm0 = low nibble of each byte
	psrlw	$4, %xmm1		C word shift; stray bits are masked off next
	pand	%xmm6, %xmm1		C xmm1 = high nibble of each byte
	pshufb	%xmm0, %xmm2		C xmm2 = bit counts of the low nibbles
	pshufb	%xmm1, %xmm3		C xmm3 = bit counts of the high nibbles
	paddb	%xmm2, %xmm3		C xmm3 = bit count of each byte
	paddb	%xmm3, %xmm4		C add to the byte-wise accumulator
L(e6):	lddqu	16(up), %xmm1		C chunk 1
L(e5):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e4):	lddqu	32(up), %xmm1		C chunk 2
L(e3):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e2):	lddqu	48(up), %xmm1		C chunk 3
	add	$64, up			C advance to the next group of 8 limbs
L(e1):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	psadbw	%xmm5, %xmm4		C sum the bytes of xmm4 into 2 x 64-bit lanes
	paddb	%xmm2, %xmm3		C xmm3 = byte counts of chunk 3
	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
	movdqa	%xmm3, %xmm4		C chunk 3 counts start the next accumulation
	sub	$8, n
	jg	L(top)			C signed test: loop while limbs remain

C Fold the counts still pending in xmm4, then add the two 64-bit halves of
C xmm8 to form the return value.
	psadbw	%xmm5, %xmm4
	paddq	%xmm4, %xmm8
	pshufd	$14, %xmm8, %xmm0	C high quadword of xmm8 to low quadword of xmm0
	paddq	%xmm8, %xmm0		C low quadword of xmm0 = grand total
	movq	%xmm0, %rax
	ret
EPILOGUE()
C Data block addressed through r9 by mpn_popcount: an 8-entry jump table
C indexed by n mod 8, followed by the two 16-byte constants used for the
C nibble lookup.  The offsets OFF1 and OFF2 defined in mpn_popcount hard-code
C this layout, so entries must not be added, removed or reordered without
C updating them.
C NOTE(review): the 16 is presumably the alignment of the object, which the
C movdqa loads of the constants depend on -- confirm against DEF_OBJECT.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(top), L(cnsts))		C n mod 8 = 0: start with a full pass
	JMPENT(	L(1), L(cnsts))			C n mod 8 = 1
	JMPENT(	L(2), L(cnsts))			C n mod 8 = 2
	JMPENT(	L(3), L(cnsts))			C n mod 8 = 3
	JMPENT(	L(4), L(cnsts))			C n mod 8 = 4
	JMPENT(	L(5), L(cnsts))			C n mod 8 = 5
	JMPENT(	L(6), L(cnsts))			C n mod 8 = 6
	JMPENT(	L(7), L(cnsts))			C n mod 8 = 7
C Number of set bits in each nibble value 0..15, loaded into xmm7 and used as
C the pshufb lookup table.
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Mask selecting the low nibble of every byte, loaded into xmm6.
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
186