popham.asm revision 1.1.1.1
1dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
2
3dnl  Copyright 2004, 2005, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21include(`../config.m4')
22
23
24C		     popcount	      hamdist
25C		    cycles/limb	    cycles/limb
26C K8,K9:		 6		 7
27C K10:			 6		 7
28C P4:			12		14.3
29C P6-15:		 7		 8
30
31C TODO
32C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
33C    hamdist for K8/K9.
34
35
dnl  Bindings used when this file is built as mpn_popcount.  The operands are
dnl  up (%rdi) and n (%rsi); the four bit masks sit in %r10, %r11, %rcx and
dnl  %rdx.  HAM is bound to dnl, so every HAM(...) line in the body is dropped.
ifdef(`OPERATION_popcount',`
  define(`func',	`mpn_popcount')
  define(`HAM',		`dnl')
  define(`up',		`%rdi')
  define(`n',		`%rsi')
  define(`h01010101',	`%rdx')
  define(`h0f0f0f0f',	`%rcx')
  define(`h33333333',	`%r11')
  define(`h55555555',	`%r10')
')
dnl  Bindings used when this file is built as mpn_hamdist.  The operands are
dnl  up (%rdi), vp (%rsi) and n (%rdx).  Because n occupies %rdx here, the
dnl  0x01 byte mask is kept in %r14 instead.  HAM is bound to its own first
dnl  argument, so every HAM(...) line in the body is emitted as written.
ifdef(`OPERATION_hamdist',`
  define(`func',	`mpn_hamdist')
  define(`HAM',		`$1')
  define(`up',		`%rdi')
  define(`vp',		`%rsi')
  define(`n',		`%rdx')
  define(`h01010101',	`%r14')
  define(`h0f0f0f0f',	`%rcx')
  define(`h33333333',	`%r11')
  define(`h55555555',	`%r10')
')
57
58
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)

C func -- mpn_popcount or mpn_hamdist, selected by the OPERATION_* define.
C
C popcount:  count the set bits in the n limbs at up.
C hamdist:   count the set bits in (up[i] XOR vp[i]) over n limbs.
C
C In:     up in %rdi; popcount has n in %rsi; hamdist has vp in %rsi and
C         n in %rdx (AT and T syntax, 64-bit limbs).
C Out:    %rax = bit count.
C Saved:  %r12, %r13 (and %r14 for hamdist) are pushed on entry and popped
C         before returning.
C Clobb:  %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11, flags.
C
C Method: each limb is reduced with the usual parallel bit-count steps
C         (2-bit fields, then 4-bit fields, then 8-bit fields) and the eight
C         byte fields are summed with one multiply by 0x0101010101010101.
C         An odd limb count is handled by one single-limb step; the main
C         loop then consumes two limbs per iteration, merging them at the
C         4-bit stage so only one multiply is needed per pair.
C
C n = 0 returns 0 without touching memory.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(func)

	pushq	%r12
	pushq	%r13
 HAM(`	pushq	%r14		')

	movq	$0x5555555555555555, h55555555
	movq	$0x3333333333333333, h33333333
	movq	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
	movq	$0x0101010101010101, h01010101

	leaq	(up,n,8), up		C point past the last limb
 HAM(`	leaq	(vp,n,8), vp	')

	xorl	%eax, %eax		C total = 0, done first to keep negq flags
	negq	n			C index runs from -n up to 0
	jz	L(done)			C no limbs at all: return 0

	btq	$0, n
	jnc	L(oop)			C even count: straight to the paired loop

	C Odd count: fold one limb on its own first.
	movq	(up,n,8), %r8
 HAM(`	xorq	(vp,n,8), %r8	')

	movq	%r8, %r9
	shrq	%r8
	andq	h55555555, %r8
	subq	%r8, %r9		C 32 2-bit fields (0..2)

	movq	%r9, %r8
	shrq	$2, %r9
	andq	h33333333, %r8
	andq	h33333333, %r9
	addq	%r8, %r9		C 16 4-bit fields (0..4)

	movq	%r9, %r8
	shrq	$4, %r9
	andq	h0f0f0f0f, %r8
	andq	h0f0f0f0f, %r9
	addq	%r8, %r9		C 8 8-bit fields (0..8)

	imulq	h01010101, %r9		C sum the 8 fields in high 8 bits
	shrq	$56, %r9

	addq	%r9, %rax		C add to total
	addq	$1, n
	jz	L(done)			C that was the only limb

	ALIGN(16)
L(oop):	movq	(up,n,8), %r8
	movq	8(up,n,8), %r12
 HAM(`	xorq	(vp,n,8), %r8	')
 HAM(`	xorq	8(vp,n,8), %r12	')

	movq	%r8, %r9
	movq	%r12, %r13
	shrq	%r8
	shrq	%r12
	andq	h55555555, %r8
	andq	h55555555, %r12
	subq	%r8, %r9		C 32 2-bit fields (0..2)
	subq	%r12, %r13		C 32 2-bit fields (0..2)

	movq	%r9, %r8
	movq	%r13, %r12
	shrq	$2, %r9
	shrq	$2, %r13
	andq	h33333333, %r8
	andq	h33333333, %r9
	andq	h33333333, %r12
	andq	h33333333, %r13
	addq	%r8, %r9		C 16 4-bit fields (0..4)
	addq	%r12, %r13		C 16 4-bit fields (0..4)

	addq	%r13, %r9		C 16 4-bit fields (0..8), both limbs merged
	movq	%r9, %r8
	shrq	$4, %r9
	andq	h0f0f0f0f, %r8
	andq	h0f0f0f0f, %r9
	addq	%r8, %r9		C 8 8-bit fields (0..16)

	imulq	h01010101, %r9		C sum the 8 fields in high 8 bits
	shrq	$56, %r9

	addq	%r9, %rax		C add to total
	addq	$2, n
	jnc	L(oop)			C carry out means the index reached zero

L(done):
 HAM(`	popq	%r14		')
	popq	%r13
	popq	%r12
	ret

EPILOGUE()
158