popcount.asm revision 1.1.1.1
1dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
2
3dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C Cortex-A53	 ?
35C Cortex-A57	 ?
36
37C TODO
38C  * Consider greater unrolling.
39C  * Arrange to align the pointer, if that helps performance.  Use the same
40C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
41C    valgrind!)
42C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
43C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
44
C Move the m4 comment delimiter away from the hash sign, so that macros written
C right after a hash sign in immediates (maxsize, chunksize) get expanded.
changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`n',  x1)

C We sum into 16 16-bit counters in v4,v5, and at the end we add them lane by
C lane and end up with 8 16-bit counters.  None of these 8 sums may exceed
C 2^16-1.
C
C The 2-limb and 4-limb loads spread their bits evenly over the 8 counters (at
C most 8 bits per limb and counter).  The 1-limb load done for odd n is
C different: it fills just the low half of v0, so its 64 bits go to counters
C 0-3 only, 16 bits each.  Counters 0-3 can therefore reach 8n for even n but
C 8(n+1) for odd n.  For n = 0x1fff that is 8*0x2000 = 2^16, which wraps to 0
C in the 16-bit add at L(sum).  The largest safe n is thus 0x1ffe, giving at
C most 8*0x1ffe = 65520 per counter.
C
C We use a chunksize close to that, a multiple of 8, which allows the huge
C count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

C mpn_popcount
C
C In:	ap (x0) = pointer to the limbs, 8 bytes each
C	n  (x1) = number of limbs
C Out:	x0 = number of set bits in the n limbs
C
C Register use:
C	v0-v3	limbs just loaded
C	v6,v7	per-byte bit counts of the limbs being summed
C	v4,v5	16-bit running sums
C	x11	maxsize
C	x4,x5,x7-x10  used by the large count code only, see L(gt8k)

ASM_START()
PROLOGUE(mpn_popcount)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)				C too many limbs for one pass

C Single pass code, requires n <= maxsize.  Also called from L(gt8k) for the
C last part of a large operand.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

C Peel off 1, 2, and 4 limbs as directed by the low bits of n, so that the
C remaining count is a multiple of 8.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	b.ls	L(sum)				C nothing left, sum v0,v1 and finish

C At least 8 more limbs follow.  Load 4 of them and enter the loop in the
C middle, with the 4 limbs loaded above as the pending counts.
L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)				C fewer than 8 left, i.e., none

C Entry point for the large count code, which arrives here with v4,v5 cleared
C and n = chunksize-8, as if the subs at L(000) had already been done.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  Loads run ahead of the counting, and the
C counts in v6,v7 are added to the sums one step after they are computed.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

C Wind down: add the pending counts, then count and add the last 4 limbs,
C which are still waiting in v0,v1.
L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h		C 16-bit add, kept in range by maxsize
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
C
C Full chunks of chunksize limbs are summed by calling L(chu), and the
C remainder, which is at most maxsize limbs, by calling L(lt8k).  The bl
C instructions overwrite x30, so the return address is kept in x8 meanwhile.
define(`ap2', x5)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b			C loop while more than maxsize limbs remain

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8			C restore return address
	ret
EPILOGUE()
157