dnl  popham.asm revision 1.1.1.3
1dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
2
3dnl  Copyright 2004, 2005, 2007, 2010-2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34
35C		     popcount	      hamdist
36C		    cycles/limb	    cycles/limb
37C AMD K8,K9		 6		 7
38C AMD K10		 6		 7
39C Intel P4		12		14.3
40C Intel core2		 7		 8
41C Intel corei		 ?		 7.3
42C Intel atom		16.5		17.5
43C VIA nano		 8.75		10.4
44
45C TODO
46C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
47C    hamdist for K8/K9.
48
49
dnl  Macro set for the mpn_popcount build (selected by OPERATION_popcount).
dnl
dnl  POP(text) expands to its argument, while HAM(text) expands to dnl, so
dnl  any line wrapped in HAM() is dropped from this build.
dnl
dnl    up          %rdi   source pointer
dnl    n           %rsi   limb count
dnl    h55555555   %r10   \
dnl    h33333333   %r11    > registers holding the 64-bit bit-count masks
dnl    h0f0f0f0f   %rcx   /
dnl    h01010101   %rdx   register holding the byte-summing multiplier
ifdef(`OPERATION_popcount',`
  define(`POP',		`$1')
  define(`HAM',		`dnl')
  define(`func',	`mpn_popcount')
  define(`n',		`%rsi')
  define(`up',		`%rdi')
  define(`h01010101',	`%rdx')
  define(`h0f0f0f0f',	`%rcx')
  define(`h33333333',	`%r11')
  define(`h55555555',	`%r10')
')
dnl  Macro set for the mpn_hamdist build (selected by OPERATION_hamdist).
dnl
dnl  HAM(text) expands to its argument, while POP(text) expands to dnl, so
dnl  any line wrapped in POP() is dropped from this build.
dnl
dnl    up          %rdi   first source pointer
dnl    vp          %rsi   second source pointer
dnl    n           %rdx   limb count
dnl    h55555555   %r10   \
dnl    h33333333   %r11    > registers holding the 64-bit bit-count masks
dnl    h0f0f0f0f   %rcx   /
dnl    h01010101   %r14   register holding the byte-summing multiplier; the
dnl                       function body pushes and pops %r14 inside HAM()
ifdef(`OPERATION_hamdist',`
  define(`HAM',		`$1')
  define(`POP',		`dnl')
  define(`func',	`mpn_hamdist')
  define(`n',		`%rdx')
  define(`vp',		`%rsi')
  define(`up',		`%rdi')
  define(`h01010101',	`%r14')
  define(`h0f0f0f0f',	`%rcx')
  define(`h33333333',	`%r11')
  define(`h55555555',	`%r10')
')
73
74
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C func is mpn_popcount (up, n) or mpn_hamdist (up, vp, n), depending on
C which of OPERATION_popcount / OPERATION_hamdist is defined.
C
C Returns in %rax the number of 1 bits in the n 64-bit limbs at up
C (popcount), or in the limb by limb XOR of the n limbs at up and the n
C limbs at vp (hamdist).  A count of zero returns 0 without reading memory.
C
C Method, applied to each limb x:
C   2-bit fields:  x - ((x >> 1) & 0x5555...)
C   4-bit fields:  (x & 0x3333...) + ((x >> 2) & 0x3333...)
C   8-bit fields:  (x & 0x0f0f...) + ((x >> 4) & 0x0f0f...)
C   limb total:    (x * 0x0101...) >> 56
C The main loop takes two limbs per pass and adds their 4-bit field vectors
C together (0..4 each, so 0..8 combined) before the 8-bit step, so a single
C multiply serves both limbs.  An odd count is made even by one single-limb
C pass ahead of the loop.
C
C Register use:
C   up, vp       advanced to point just past the last limb
C   n            negated count, stepped up towards zero and used as index
C   %rax         running total
C   %r8, %r9     scratch for the first limb of a pair
C   %r12, %r13   scratch for the second limb of a pair, saved and restored
C   h55555555, h33333333, h0f0f0f0f, h01010101
C                mask and multiplier constants; h01010101 is %r14 in the
C                hamdist build and is saved and restored there
C Flags are clobbered.  FUNC_ENTRY and FUNC_EXIT are defined outside this
C file; presumably they adapt the DOS64 argument registers -- confirm
C against the included m4 definitions.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(func)
 POP(`	FUNC_ENTRY(2)		')
 HAM(`	FUNC_ENTRY(3)		')
	push	%r12
	push	%r13
 HAM(`	push	%r14		')

	xor	R32(%rax), R32(%rax)	C total = 0, done before neg sets flags

	lea	(up,n,8), up		C up = one past the last limb
 HAM(`	lea	(vp,n,8), vp	')
	neg	n			C n = -count, ZF set when count is 0
	jz	L(end)			C nothing to count: return 0

	mov	$0x5555555555555555, h55555555
	mov	$0x3333333333333333, h33333333
	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
	mov	$0x0101010101010101, h01010101

	bt	$0, R32(n)		C CF = low bit of the count
	jnc	L(top)			C even count: go straight to the loop

C Odd count: account for the first limb alone so an even number remains.
	mov	(up,n,8), %r8
 HAM(`	xor	(vp,n,8), %r8	')

	mov	%r8, %r9
	shr	%r8
	and	h55555555, %r8
	sub	%r8, %r9		C 32 2-bit fields (0..2)

	mov	%r9, %r8
	shr	$2, %r9
	and	h33333333, %r8
	and	h33333333, %r9
	add	%r8, %r9		C 16 4-bit fields (0..4)

	mov	%r9, %r8
	shr	$4, %r9
	and	h0f0f0f0f, %r8
	and	h0f0f0f0f, %r9
	add	%r8, %r9		C 8 8-bit fields (0..8)

	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
	shr	$56, %r9

	mov	%r9, %rax		C total so far is just this limb
	add	$1, n
	jz	L(end)			C that was the only limb

C Main loop: two limbs per pass, n is even and negative on entry.
	ALIGN(16)
L(top):	mov	(up,n,8), %r8
	mov	8(up,n,8), %r12
 HAM(`	xor	(vp,n,8), %r8	')
 HAM(`	xor	8(vp,n,8), %r12	')

	mov	%r8, %r9
	mov	%r12, %r13
	shr	%r8
	shr	%r12
	and	h55555555, %r8
	and	h55555555, %r12
	sub	%r8, %r9		C 32 2-bit fields (0..2)
	sub	%r12, %r13		C 32 2-bit fields (0..2)

	mov	%r9, %r8
	mov	%r13, %r12
	shr	$2, %r9
	shr	$2, %r13
	and	h33333333, %r8
	and	h33333333, %r9
	and	h33333333, %r12
	and	h33333333, %r13
	add	%r8, %r9		C 16 4-bit fields (0..4)
	add	%r12, %r13		C 16 4-bit fields (0..4)

	add	%r13, %r9		C 16 4-bit fields (0..8)
	mov	%r9, %r8
	shr	$4, %r9
	and	h0f0f0f0f, %r8
	and	h0f0f0f0f, %r9
	add	%r8, %r9		C 8 8-bit fields (0..16)

	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
	shr	$56, %r9

	add	%r9, %rax		C add to total
	add	$2, n			C carry out only when n reaches zero
	jnc	L(top)

L(end):
 HAM(`	pop	%r14		')
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()
178