1dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
2
3dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C Cortex-A53	 2.5
35C Cortex-A57	 1.14
36C X-Gene	 3
37
38C TODO
39C  * Consider greater unrolling.
40C  * Arrange to align the pointer, if that helps performance.  Use the same
41C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
42C    valgrind!)
43C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
44C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
45
C The code below writes immediate operands as a hash sign directly followed
C by an m4 macro name.  Move the m4 comment delimiter away from the hash sign
C so that those macro names are still expanded.
changecom(blah)

C INPUT PARAMETERS
C   ap  pointer to the limbs to be counted; advanced by the post-indexed loads
C   n   number of limbs; decremented as limbs are consumed
define(`ap', x0)
define(`n',  x1)
51
C Overflow analysis.  We sum into 16 16-bit counters in v4,v5, and at the end
C we add them pairwise and end up with 8 16-bit counters.  Every 128-bit load
C can add at most 16 to any one of these final counters.  An odd limb is
C loaded on its own and adds up to 16 to the four low counters only, so the
C bits are not spread evenly over the counters.  For n limbs a low counter can
C therefore reach 16*ceil(n/2), which fits in 16 bits only for n <= 0x1ffe.
C With n = 0x1fff and all bits set, the low counters would reach exactly 2^16
C and wrap to zero.  We use a chunksize close to the limit, but which allows
C the huge count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

C mpn_popcount -- return in x0 the number of set bits in the n limbs at ap.
C
C Register use:
C   v0-v3	limb data loaded from ap
C   v6,v7	per-byte bit counts produced by cnt
C   v4,v5	16-bit accumulators, cleared at L(lt8k) or by the chunk loop
C   x11		maxsize, also read by the chunk loop at L(gt8k)
C   x4,x5,x7-x10  used only by the code for n > maxsize

ASM_START()
PROLOGUE(mpn_popcount)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)				C too large for 16-bit counters

C Code for n <= maxsize.  Also called (with bl) for the final block of a
C larger operand.  Bits 0, 1 and 2 of n are peeled off first, so that the
C main loop only sees multiples of 8 limbs.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	b.ls	L(sum)				C those were the last 4 limbs

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)				C enter loop with 8 limbs loaded

L(000):	subs	n, n, #8
	b.lo	L(e0)				C nothing left, v5 is still zero

C Entry point for the chunk loop at L(gt8k).  Expects v4,v5 cleared and n
C already reduced by 8, as done by the subs just above.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  Loads, cnt and uadalp of different
C limbs are interleaved; v0,v2 always feed v4 and v1,v3 always feed v5.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h,  v6.16b			C flush counts still in v6,v7
	uadalp	v5.8h,  v7.16b
L(sum):	cnt	v6.16b, v0.16b			C count the last 4 loaded limbs
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h		C cannot wrap for n <= maxsize
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
C The link register is kept in x8 since bl overwrites x30.  In the loop, x7
C is the number of limbs not yet counted and x4 is the sum so far.
define(`ap2', x5)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address across the bl calls
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b			C loop while more than maxsize limbs remain

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8			C restore return address
	ret
EPILOGUE()
158