1dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
2
3dnl  Copyright 2004, 2005, 2007, 2010-2012, 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34
35C		     popcount	      hamdist
36C		    cycles/limb	    cycles/limb
37C AMD K8,K9		 6		 7
38C AMD K10		 6		 7
39C Intel P4		12		14.3
40C Intel core2		 7		 8
41C Intel corei		 ?		 7.3
42C Intel atom		16.5		17.5
43C VIA nano		 8.75		10.4
44
45C TODO
46C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
47C    hamdist for K8/K9.
48
49
C Settings used when OPERATION_popcount is defined.
C   func      -- the emitted function is named mpn_popcount
C   up, n     -- limb pointer in %rdi and limb count in %rsi
C   h55555555, h33333333, h0f0f0f0f, h01010101
C             -- registers that the function body loads with the 64-bit
C                constant of the matching hex pattern
C   POP(x)    -- expands to x, so POP-wrapped lines are kept
C   HAM(x)    -- expands to dnl, so HAM-wrapped lines are discarded
50ifdef(`OPERATION_popcount',`
51  define(`func',`mpn_popcount')
52  define(`up',		`%rdi')
53  define(`n',		`%rsi')
54  define(`h55555555',	`%r10')
55  define(`h33333333',	`%r11')
56  define(`h0f0f0f0f',	`%rcx')
57  define(`h01010101',	`%rdx')
58  define(`POP',		`$1')
59  define(`HAM',		`dnl')
60')
C Settings used when OPERATION_hamdist is defined.
C   func      -- the emitted function is named mpn_hamdist
C   up, vp, n -- two limb pointers in %rdi and %rsi, limb count in %rdx
C   h55555555, h33333333, h0f0f0f0f, h01010101
C             -- registers that the function body loads with the 64-bit
C                constant of the matching hex pattern.  With %rdx taken by n,
C                h01010101 is placed in %r12, which the function body pushes
C                at entry and pops before returning.
C   POP(x)    -- expands to dnl, so POP-wrapped lines are discarded
C   HAM(x)    -- expands to x, so HAM-wrapped lines are kept
61ifdef(`OPERATION_hamdist',`
62  define(`func',`mpn_hamdist')
63  define(`up',		`%rdi')
64  define(`vp',		`%rsi')
65  define(`n',		`%rdx')
66  define(`h55555555',	`%r10')
67  define(`h33333333',	`%r11')
68  define(`h0f0f0f0f',	`%rcx')
69  define(`h01010101',	`%r12')
70  define(`POP',		`dnl')
71  define(`HAM',		`$1')
72')
73
74
C NOTE(review): every macro used in this group is defined outside this file,
C presumably through the included config.m4.  The remarks below are inferred
C from the macro names and arguments only -- confirm against the definitions.
C
C The next line lists the two function names this one source can be built as.
75MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
76
C Calling conventions this file is marked as supporting.
77ABI_SUPPORT(DOS64)
78ABI_SUPPORT(STD64)
79
C Start of assembler output, code section, and an alignment request that
C applies to the function entry that follows.
80ASM_START()
81	TEXT
82	ALIGN(32)
C ---------------------------------------------------------------------------
C func (up, n)       when built as mpn_popcount
C func (up, vp, n)   when built as mpn_hamdist
C
C Returns in %rax the number of set bits in the n limbs at up (popcount), or
C in the limb-by-limb XOR of the n limbs at up and at vp (hamdist).
C
C Register use:
C   %r8, %r9    first limb of a pair and its partial sums
C   %rbx, %rbp  second limb of a pair and its partial sums
C   %rax        running total
C   h55555555, h33333333, h0f0f0f0f, h01010101  constants loaded at entry
C %rbx and %rbp, plus %r12 in the hamdist build, are pushed at entry and
C popped before returning.
C
C Addressing: up (and vp) are moved to just past the last limb and n is
C negated, so (up,n,8) walks forward while n counts up towards zero.  The
C main loop consumes two limbs per pass.  When n is odd, one limb is reduced
C on its own first and joins the loop at L(mid).
C
C NOTE(review): with n = 0 the code falls into L(top) and reads two limbs,
C and the carry test at the loop bottom does not stop it.  Presumably callers
C guarantee n >= 1 -- confirm.
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
C presumably they adapt arguments for the DOS64 build, with the argument
C being the parameter count -- confirm.
C ---------------------------------------------------------------------------
83PROLOGUE(func)
84 POP(`	FUNC_ENTRY(2)		')
85 HAM(`	FUNC_ENTRY(3)		')
C Save the registers restored at L(end), interleaved with loading the
C constants and setting up the end-relative addressing.
86	push	%rbx
87	mov	$0x5555555555555555, h55555555
88	push	%rbp
89	mov	$0x3333333333333333, h33333333
90 HAM(`	push	%r12		')
91	lea	(up,n,8), up
92	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
93 HAM(`	lea	(vp,n,8), vp	')
94	neg	n
95	mov	$0x0101010101010101, h01010101
96	xor	R32(%rax), R32(%rax)
C Even limb count: go straight to the two-limb loop.  Odd: peel one limb.
97	test	$1, R8(n)
98	jz	L(top)
99
C Odd-count peel: fetch the single leading limb.
100	mov	(up,n,8), %r8
101 HAM(`	xor	(vp,n,8), %r8	')
102
C x - ((x >> 1) & 0x5555...) leaves 32 2-bit fields, each the bit count of
C the corresponding bit pair (0..2).
103	mov	%r8, %r9
104	shr	%r8
105	and	h55555555, %r8
106	sub	%r8, %r9
107
C Add neighbouring 2-bit fields into 4-bit fields.
108	mov	%r9, %r8
109	shr	$2, %r9
110	and	h33333333, %r8
111	and	h33333333, %r9
112	add	%r8, %r9		C 16 4-bit fields (0..4)
113
C L(mid) ends with an add of 2 to n, so step n back by one here to make the
C net advance a single limb.
114	dec	n
115	jmp	L(mid)
116
117	ALIGN(16)
C Main loop: two limbs per pass, the two reductions interleaved.
118L(top):	mov	(up,n,8), %r8
119	mov	8(up,n,8), %rbx
120 HAM(`	xor	(vp,n,8), %r8	')
121 HAM(`	xor	8(vp,n,8), %rbx	')
122
C Pairwise bit counts into 2-bit fields, for both limbs.
123	mov	%r8, %r9
124	mov	%rbx, %rbp
125	shr	%r8
126	shr	%rbx
127	and	h55555555, %r8
128	and	h55555555, %rbx
129	sub	%r8, %r9
130	sub	%rbx, %rbp
131
C 2-bit fields into 4-bit fields, for both limbs.
132	mov	%r9, %r8
133	mov	%rbp, %rbx
134	shr	$2, %r9
135	shr	$2, %rbp
136	and	h33333333, %r8
137	and	h33333333, %r9
138	and	h33333333, %rbx
139	and	h33333333, %rbp
140	add	%r8, %r9		C 16 4-bit fields (0..4)
141	add	%rbx, %rbp		C 16 4-bit fields (0..4)
142
C Merge the two limbs while the fields still have headroom, so the remaining
C steps run once per pair instead of once per limb.
143	add	%rbp, %r9		C 16 4-bit fields (0..8)
C Common tail, also entered from the odd-count peel with fields in 0..4.
144L(mid):	mov	%r9, %r8
145	shr	$4, %r9
146	and	h0f0f0f0f, %r8
147	and	h0f0f0f0f, %r9
148	add	%r8, %r9		C 8 8-bit fields (0..16)
149
C The eight byte fields sum to at most 128, so the total fits the top byte.
150	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
151	shr	$56, %r9
152
153	add	%r9, %rax		C add to total
C n is negative and even here; the add carries out only on the step that
C brings it to zero, which ends the loop.
154	add	$2, n
155	jnc	L(top)
156
C Restore the saved registers in reverse order of the pushes and return the
C total in %rax.
157L(end):
158 HAM(`	pop	%r12		')
159	pop	%rbp
160	pop	%rbx
161	FUNC_EXIT()
162	ret
163EPILOGUE()
164