popham.asm revision 1.1.1.2
1dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
2
3dnl  Copyright 2004, 2005, 2007, 2010, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21include(`../config.m4')
22
23
24C		     popcount	      hamdist
25C		    cycles/limb	    cycles/limb
26C AMD K8,K9		 6		 7
27C AMD K10		 6		 7
28C Intel P4		12		14.3
29C Intel core2		 7		 8
30C Intel corei		 ?		 7.3
31C Intel atom		16.5		17.5
32C VIA nano		 8.75		10.4
33
34C TODO
35C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
36C    hamdist for K8/K9.
37
38
C Build-time selection for mpn_popcount: the arguments are up and n.
C The four hNNNNNNNN names are the registers that hold the bit masks and the
C byte-summing multiplier loaded at the start of the function body.
C POP(x) expands to x, while HAM is defined as dnl, so lines wrapped in HAM
C produce no code in this build.
39ifdef(`OPERATION_popcount',`
40  define(`func',`mpn_popcount')
41  define(`up',		`%rdi')
42  define(`n',		`%rsi')
43  define(`h55555555',	`%r10')
44  define(`h33333333',	`%r11')
45  define(`h0f0f0f0f',	`%rcx')
46  define(`h01010101',	`%rdx')
47  define(`POP',		`$1')
48  define(`HAM',		`dnl')
49')
C Build-time selection for mpn_hamdist: the arguments are up, vp and n.
C n occupies %rdx in this build, so the h01010101 multiplier is kept in %r14
C instead; the function body pushes and pops %r14 on HAM-only lines.
C HAM(x) expands to x, while POP is defined as dnl, so lines wrapped in POP
C produce no code in this build.
50ifdef(`OPERATION_hamdist',`
51  define(`func',`mpn_hamdist')
52  define(`up',		`%rdi')
53  define(`vp',		`%rsi')
54  define(`n',		`%rdx')
55  define(`h55555555',	`%r10')
56  define(`h33333333',	`%r11')
57  define(`h0f0f0f0f',	`%rcx')
58  define(`h01010101',	`%r14')
59  define(`POP',		`dnl')
60  define(`HAM',		`$1')
61')
62
63
64MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
65
66ABI_SUPPORT(DOS64)
67ABI_SUPPORT(STD64)
68
69ASM_START()
70	TEXT
71	ALIGN(32)
C func is mpn_popcount (up, n) or mpn_hamdist (up, vp, n), depending on which
C OPERATION_ symbol is defined.
C
C Result in %rax: the number of 1 bits in the n 64-bit limbs at up (popcount),
C or in the limb-by-limb XOR of the n limbs at up and at vp (hamdist).
C
C Method: each limb is reduced with shift/mask/add steps to 2-bit, then
C 4-bit, then 8-bit partial counts, and a multiply by 0x0101010101010101
C followed by a shift right of 56 sums the eight byte counts.
C
C Register use: %r8, %r9, %r12, %r13 are scratch, %rax accumulates the total.
C %r12 and %r13 (plus %r14 in the hamdist build) are saved on entry and
C restored before returning.
C
C There is no check for n = 0: with n = 0 the parity test below is not taken
C as odd, control reaches L(top) and two limbs are loaded.  Callers are
C assumed to pass n >= 1 -- NOTE(review): confirm against the mpn interface.
72PROLOGUE(func)
C FUNC_ENTRY and FUNC_EXIT are defined outside this file; presumably they
C adapt the DOS64 calling convention for the given argument count -- verify.
73 POP(`	FUNC_ENTRY(2)		')
74 HAM(`	FUNC_ENTRY(3)		')
75	push	%r12
76	push	%r13
77 HAM(`	push	%r14		')
78
C Load the three field masks and the byte-summing multiplier.
79	mov	$0x5555555555555555, h55555555
80	mov	$0x3333333333333333, h33333333
81	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
82	mov	$0x0101010101010101, h01010101
83
C Point up (and vp) just past the last limb and make n a negative index, so
C that (up,n,8) walks forward through the operand as n counts up towards 0.
84	lea	(up,n,8), up
85 HAM(`	lea	(vp,n,8), vp	')
86	neg	n
87
88	xor	R32(%rax), R32(%rax)
89
C Negation keeps the low bit, so CF set here means the limb count is odd.
C An even count goes straight to the two-limbs-per-iteration loop.
90	bt	$0, R32(n)
91	jnc	L(top)
92
C Odd count: handle one limb on its own first.
93	mov	(up,n,8), %r8
94 HAM(`	xor	(vp,n,8), %r8	')
95
C x - ((x >> 1) & 0x55...) leaves 32 2-bit fields, each the count (0..2) of
C the two bits it replaced.
96	mov	%r8, %r9
97	shr	%r8
98	and	h55555555, %r8
99	sub	%r8, %r9
100
101	mov	%r9, %r8
102	shr	$2, %r9
103	and	h33333333, %r8
104	and	h33333333, %r9
105	add	%r8, %r9		C 16 4-bit fields (0..4)
106
107	mov	%r9, %r8
108	shr	$4, %r9
109	and	h0f0f0f0f, %r8
110	and	h0f0f0f0f, %r9
111	add	%r8, %r9		C 8 8-bit fields (0..8)
112
113	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
114	shr	$56, %r9
115
116	mov	%r9, %rax		C total = count of the odd limb (rax was 0)
117	add	$1, n
118	jz	L(end)
119
C Main loop, two limbs per iteration.  Both limbs are reduced separately to
C 4-bit fields (0..4); those are then added together (0..8 still fits in
C 4 bits), so one byte fold and one multiply serve both limbs.
120	ALIGN(16)
121L(top):	mov	(up,n,8), %r8
122	mov	8(up,n,8), %r12
123 HAM(`	xor	(vp,n,8), %r8	')
124 HAM(`	xor	8(vp,n,8), %r12	')
125
126	mov	%r8, %r9
127	mov	%r12, %r13
128	shr	%r8
129	shr	%r12
130	and	h55555555, %r8
131	and	h55555555, %r12
132	sub	%r8, %r9
133	sub	%r12, %r13
134
135	mov	%r9, %r8
136	mov	%r13, %r12
137	shr	$2, %r9
138	shr	$2, %r13
139	and	h33333333, %r8
140	and	h33333333, %r9
141	and	h33333333, %r12
142	and	h33333333, %r13
143	add	%r8, %r9		C 16 4-bit fields (0..4)
144	add	%r12, %r13		C 16 4-bit fields (0..4)
145
146	add	%r13, %r9		C 16 4-bit fields (0..8)
147	mov	%r9, %r8
148	shr	$4, %r9
149	and	h0f0f0f0f, %r8
150	and	h0f0f0f0f, %r9
151	add	%r8, %r9		C 8 8-bit fields (0..16)
152
153	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
154	shr	$56, %r9
155
156	add	%r9, %rax		C add to total
C n is negative and even here; the add carries out only when n wraps to 0,
C which ends the loop.
157	add	$2, n
158	jnc	L(top)
159
C Restore the saved registers in reverse order of the pushes.
160L(end):
161 HAM(`	pop	%r14		')
162	pop	%r13
163	pop	%r12
164	FUNC_EXIT()
165	ret
166EPILOGUE()
167