1dnl  AMD64 mpn_popcount -- population count.
2
3dnl  Copyright 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34C		    cycles/limb
35C AMD K8,K9		 n/a
36C AMD K10		 1.39
37C AMD bd1		 4
38C AMD bd2		 4
39C AMD bd3		 ?
40C AMD bd4		 ?
41C AMD zen		 0.72
42C AMD bobcat		 5.78
43C AMD jaguar		 1.27
44C Intel P4		 n/a
45C Intel core2		 n/a
46C Intel NHM		 1.04
47C Intel SBR		 1.02
48C Intel IBR		 1.0
49C Intel HWL		 1.0
50C Intel BWL		 1.0
51C Intel SKL		 1.0
52C Intel atom		 n/a
53C Intel SLM		 1.34
54C VIA nano		 n/a
55
56C TODO
57C  * We could approach 0.5 c/l for AMD Zen with more unrolling.  That would
58C    not cause any additional feed-in overhead as we already use a jump table.
59C  * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
60C    Intel hardware.  Perhaps mix such a loop with popcnt instructions.
61C  * The random placement of the L0, L1, L2, etc. blocks is due to branch
62C    shortening.
63
C Symbolic names for the two operands used by the code below: up is the base
C register of every popcnt memory operand, and n is the counter that is
C decremented by 8 for each group of eight limbs.
64define(`up',		`%rdi')
65define(`n',		`%rsi')
66
C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros supplied by
C the included m4 configuration, not by this file.  Presumably they declare the
C calling conventions this file can be built for, select the code section, and
C request 32-byte alignment for the entry point that follows -- confirm there.
67ABI_SUPPORT(DOS64)
68ABI_SUPPORT(STD64)
69
70ASM_START()
71	TEXT
72	ALIGN(32)
C mpn_popcount -- sum of the population counts of the n limbs starting at up.
C
C In:    up (%rdi) = address of the first 8-byte limb
C        n  (%rsi) = number of limbs
C Out:   %rax = total count
C Uses:  %rcx, %r8, %r9, %r10, %r11 as scratch; up and n are modified.
C
C Method: the count of limb 0 is always loaded into %rax first.  The low three
C bits of n then select one of eight feed-in blocks L(0)..L(7) through the jump
C table L(tab).  Each feed-in block loads the counts of the next few limbs,
C adjusts up so that the fixed offsets used inside the loop address the limbs
C that follow, and enters the 8-limb loop at the matching entry point (e34, e12,
C e07, e56), or finishes directly when no further limbs remain.
C
C Two running sums are kept, %rcx and %rax.  %r8..%r11 hold counts that have
C been loaded but not yet added; each add pair in the loop folds the two counts
C produced by the preceding popcnt pair.
C
C The popcnt instructions are emitted as raw bytes; the comment on each .byte
C line gives the mnemonic form.
C
C NOTE(review): n = 0 is not special-cased.  It dispatches to L(0), which reads
C eight limbs, so the code relies on n >= 1 -- confirm the caller contract.
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
C presumably they are the argument/register glue for the DOS64 build.
73PROLOGUE(mpn_popcount)
74	FUNC_ENTRY(2)
75
76	mov	R32(n), R32(%r8)
77	and	$7, R32(%r8)		C r8 = low three bits of n = index into L(tab)
78
79	.byte	0xf3,0x48,0x0f,0xb8,0x07	C popcnt (up), %rax
80	xor	R32(%rcx), R32(%rcx)	C second running sum starts at zero
81
C Computed jump to L(r8).  With PIC defined the table is read as 32-bit entries
C that are sign-extended and added to the table address; otherwise it is read
C as 8-byte entries that are jumped through directly.
82	lea	L(tab)(%rip), %r9
83ifdef(`PIC',`
84	movslq	(%r9,%r8,4), %r8
85	add	%r9, %r8
86	jmp	*%r8
87',`
88	jmp	*(%r9,%r8,8)
89')
90
C n mod 8 = 3: limbs 1 and 2 become the pending pair r10/r11 and up moves past
C the three limbs handled so far.  If limbs remain, enter the loop at its top,
C where that pair is folded in; otherwise add the pair to rax and return.
91L(3):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x08	C popcnt 8(up), %r10
92	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x10	C popcnt 16(up), %r11
93	add	$24, up
94	sub	$8, n
95	jg	L(e34)
96	add	%r10, %rax
97	add	%r11, %rax
C Shared early exit: rax already holds the complete result here.
98L(s1):	FUNC_EXIT()
99	ret
100
C n mod 8 = 1: when n - 8 <= 0 limb 0 was the only limb, so return its count.
C Otherwise limbs 1 and 2 become the pending pair r8/r9, and up advances by one
C limb so that 0x10(up) at e12 addresses limb 3.
101L(1):	sub	$8, n
102	jle	L(s1)
103	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x08	C popcnt 8(up), %r8
104	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x10	C popcnt 16(up), %r9
105	add	$8, up
106	jmp	L(e12)
107
C n mod 8 = 7: limbs 1 and 2 become the pending pair r10/r11; up is moved back
C one limb so that 0x20(up) at e07 addresses limb 3.  n is first decremented at
C the bottom of the loop.
108L(7):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x08	C popcnt 0x8(%rdi),%r10
109	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x10	C popcnt 0x10(%rdi),%r11
110	add	$-8, up
111	jmp	L(e07)
112
C n mod 8 = 0: limb 1 goes straight into the rcx sum, limbs 2 and 3 become the
C pending pair r10/r11; up is unchanged because 0x20(up) at e07 is limb 4.
113L(0):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
114	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
115	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
116	jmp	L(e07)
117
C n mod 8 = 4: limb 1 goes into rcx, limbs 2 and 3 become the pending pair
C r10/r11, and up moves past the four limbs handled.  With nothing left
C (n - 8 <= 0) finish at x4; otherwise fall through into the loop.
118L(4):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
119	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
120	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
121	add	$32, up
122	sub	$8, n
123	jle	L(x4)
124
C Main loop: eight limbs per iteration at offsets 0..0x38 from up, in four
C groups of two.  On entry to every group the pair named in its add
C instructions is pending from the previous group or from a feed-in block.
125	ALIGN(16)
126L(top):
127L(e34):	.byte	0xf3,0x4c,0x0f,0xb8,0x07	C popcnt (%rdi),%r8
128	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%r9
129	add	%r10, %rcx
130	add	%r11, %rax
131L(e12):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
132	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
133	add	%r8, %rcx
134	add	%r9, %rax
135L(e07):	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x20	C popcnt 0x20(%rdi),%r8
136	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x28	C popcnt 0x28(%rdi),%r9
137	add	%r10, %rcx
138	add	%r11, %rax
139L(e56):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x30	C popcnt 0x30(%rdi),%r10
140	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x38	C popcnt 0x38(%rdi),%r11
141	add	$64, up
142	add	%r8, %rcx
143	add	%r9, %rax
144	sub	$8, n
145	jg	L(top)
146
C Wind-down: x4 folds the still-pending pair r10/r11, x2 merges the two running
C sums into the result in rax.
147L(x4):	add	%r10, %rcx
148	add	%r11, %rax
149L(x2):	add	%rcx, %rax
150
151	FUNC_EXIT()
152	ret
153
C n mod 8 = 2: limb 1 goes into rcx.  With nothing left (n - 8 <= 0) finish at
C x2.  Otherwise limbs 2 and 3 become the pending pair r8/r9, and up advances
C by two limbs so that 0x10(up) at e12 addresses limb 4.
154L(2):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
155	sub	$8, n
156	jle	L(x2)
157	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x10	C popcnt 0x10(%rdi),%r8
158	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x18	C popcnt 0x18(%rdi),%r9
159	add	$16, up
160	jmp	L(e12)
161
C n mod 8 = 5: limbs 1 and 2 become the pending pair r8/r9; up is moved back
C three limbs so that 0x30(up) at e56 addresses limb 3.
162L(5):	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x08	C popcnt 0x8(%rdi),%r8
163	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x10	C popcnt 0x10(%rdi),%r9
164	add	$-24, up
165	jmp	L(e56)
166
C n mod 8 = 6: limb 1 goes into rcx, limbs 2 and 3 become the pending pair
C r8/r9; up is moved back two limbs so that 0x30(up) at e56 addresses limb 4.
167L(6):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
168	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x10	C popcnt 0x10(%rdi),%r8
169	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x18	C popcnt 0x18(%rdi),%r9
170	add	$-16, up
171	jmp	L(e56)
172EPILOGUE()
C Dispatch table for the computed jump at the start of mpn_popcount: entry k
C names the feed-in block L(k) that is used when n mod 8 = k, so the order of
C the entries must not change.  The entry code reads this table with a 4-byte
C stride and adds the table address when PIC is defined, and with an 8-byte
C stride otherwise.
C NOTE(review): JUMPTABSECT and JMPENT are defined outside this file;
C presumably JMPENT emits a table-relative 32-bit entry or an absolute 8-byte
C entry to match the two dispatch sequences -- confirm there.
173	JUMPTABSECT
174	ALIGN(8)
175L(tab):	JMPENT(	L(0), L(tab))
176	JMPENT(	L(1), L(tab))
177	JMPENT(	L(2), L(tab))
178	JMPENT(	L(3), L(tab))
179	JMPENT(	L(4), L(tab))
180	JMPENT(	L(5), L(tab))
181	JMPENT(	L(6), L(tab))
182	JMPENT(	L(7), L(tab))
183