1dnl  Intel P5 mpn_popcount -- mpn bit population count.
2
3dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: 8.0 cycles/limb
35
36
37C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
38C
39C An arithmetic approach has been found to be slower than the table lookup,
40C due to needing too many instructions.
41
42C The slightly strange quoting here helps the renaming done by tune/many.pl.
C TABLE_NAME is defined as GSYM_PREFIX followed by mpn_popcount_table.  It
C is the symbol under which the byte count table is emitted and through
C which the code looks entries up.  The m4_assert_defined call presumably
C turns a missing GSYM_PREFIX into an error rather than an unprefixed name
C (that macro is defined outside this file -- confirm there).
43deflit(TABLE_NAME,
44m4_assert_defined(`GSYM_PREFIX')
45GSYM_PREFIX`'mpn_popcount``'_table')
46
47C FIXME: exporting the table to hamdist is incorrect as it hurts incremental
48C linking.
49
C Byte lookup table: one .byte per value of i from 0 to 255, each holding
C m4_popcount(i), so the entry at index b is the count for byte value b.
C It is placed in the read-only data section with 8 byte alignment and made
C global under TABLE_NAME.
50	RODATA
51	ALIGN(8)
52	GLOBL	TABLE_NAME
53TABLE_NAME:
54forloop(i,0,255,
55`	.byte	m4_popcount(i)
56')
57
C mpn_popcount (src, size)
C
C Sums TABLE_NAME lookups over every byte of the size limbs at src and
C returns the total in eax.  Two bytes are handled per loop pass and the
C counter starts at 2*size, so limbs are taken to be 4 bytes each.  The loop
C has no exit test ahead of its first pass, so size is expected to be at
C least 1.
C
C PARAM_SRC and PARAM_SIZE are set up by defframe with offsets 4 and 8,
C presumably relative to esp on entry, with FRAME_pushl keeping them valid
C after each push (both macros come from outside this file -- confirm there).
58defframe(PARAM_SIZE,8)
59defframe(PARAM_SRC, 4)
60
61	TEXT
62	ALIGN(8)
63
64PROLOGUE(mpn_popcount)
65deflit(`FRAME',0)
66
C ecx = size in limbs.  esi is saved because it becomes the src pointer.
67	movl	PARAM_SIZE, %ecx
68	pushl	%esi	FRAME_pushl()
69
70ifdef(`PIC',`
C PIC: ebx is saved for use as a byte register and ebp to hold the table
C address.
71	pushl	%ebx	FRAME_pushl()
72	pushl	%ebp	FRAME_pushl()
73ifdef(`DARWIN',`
C Darwin: the LEA macro (defined elsewhere) presumably leaves the address
C of the table in ebp.  ecx is doubled, esi = src, and eax, ebx, edx start
C at zero.
74	shll	%ecx		C size in byte pairs
75	LEA(	TABLE_NAME, %ebp)
76	movl	PARAM_SRC, %esi
77	xorl	%eax, %eax	C total
78	xorl	%ebx, %ebx	C byte
79	xorl	%edx, %edx	C byte
80',`
C Other PIC systems: call/popl puts the address of L(here) in ebp, the addl
C turns that into the GOT address, and the table address is then loaded
C from its GOT slot.  The unrelated setup instructions are interleaved.
81	call	L(here)
82L(here):
83	popl	%ebp
84	shll	%ecx		C size in byte pairs
85
86	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
87	movl	PARAM_SRC, %esi
88
89	xorl	%eax, %eax	C total
90	xorl	%ebx, %ebx	C byte
91
92	movl	TABLE_NAME@GOT(%ebp), %ebp
93	xorl	%edx, %edx	C byte
94')
C PIC form of TABLE: the table byte at ebp plus the given register.
95define(TABLE,`(%ebp,$1)')
96',`
97dnl non-PIC
C The table is addressed directly by name, so ebp is left untouched and
C only ebx needs saving in addition to esi.
98	shll	%ecx		C size in byte pairs
99	movl	PARAM_SRC, %esi
100
101	pushl	%ebx	FRAME_pushl()
102	xorl	%eax, %eax	C total
103
104	xorl	%ebx, %ebx	C byte
105	xorl	%edx, %edx	C byte
106
C non-PIC form of TABLE: the table byte at TABLE_NAME plus the given
C register.
107define(TABLE,`TABLE_NAME`'($1)')
108')
109
110
C Main loop.  Each pass first adds to eax the two counts looked up on the
C previous pass (both zero on the first pass), then fetches two source bytes
C into bl and dl and replaces each with its table entry.  Only bl and dl are
C ever written after the initial xorl, so the upper bits of ebx and edx stay
C zero and the full registers serve as table index and as count.  Bytes are
C taken from the high end of src downwards: offsets 2*ecx-1 and 2*ecx-2.
C decl sets the flags tested by jnz; the movb between them leaves the flags
C alone.  The instructions appear to be laid out in pairs for the P5, so the
C order should be kept as is.
111	ALIGN(8)	C necessary on P55 for claimed speed
112L(top):
113	C eax	total
114	C ebx	byte
115	C ecx	counter, 2*size to 1
116	C edx	byte
117	C esi	src
118	C edi
119	C ebp	[PIC] table
120
121	addl	%ebx, %eax
122	movb	-1(%esi,%ecx,2), %bl
123
124	addl	%edx, %eax
125	movb	-2(%esi,%ecx,2), %dl
126
127	movb	TABLE(%ebx), %bl
128	decl	%ecx
129
130	movb	TABLE(%edx), %dl
131	jnz	L(top)
132
133
C Add in the two counts from the final pass while restoring the saved
C registers in the reverse order of the pushes.  The total is returned in eax.
134ifdef(`PIC',`
135	popl	%ebp
136')
137	addl	%ebx, %eax
138	popl	%ebx
139
140	addl	%edx, %eax
141	popl	%esi
142
143	ret
144
145EPILOGUE()
146ASM_END()
147