1dnl  ARM mpn_popcount and mpn_hamdist.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C		     popcount	      hamdist
36C		    cycles/limb	    cycles/limb
37C StrongARM		 -
38C XScale		 -
39C Cortex-A7		 ?
40C Cortex-A8		 ?
41C Cortex-A9		 8.94		 9.47
42C Cortex-A15		 5.67		 6.44
43
44C Architecture requirements:
45C v5	-
46C v5t	-
47C v5te	ldrd strd
48C v6	usada8
49C v6t2	-
50C v7a	-
51
C Register assignment for the popcount build.  This group only takes effect
C when OPERATION_popcount is defined (presumably by the build; the place
C where it gets defined is not visible in this file).
C
C   ap, n		r0 and r1, read by the function body as its inputs
C   a0, a1		r2 and r3, the pair filled by the ldrd in the main loop
C   s			r5, the per-byte partial counts between iterations.
C			NOTE: the function body also writes r5 directly as a
C			scratch register, so s and that scratch value share
C			one register in this build.
C   b_01010101		r6, loaded with 0x55555555 at function entry
C   b_00110011		r7, loaded with 0x33333333 at function entry
C   b_00001111		r8, loaded with 0x0f0f0f0f at function entry
C   zero		r9, set to 0 at entry and passed to usada8
C
C   POPC expands to its argument, so POPC lines are emitted.
C   HAMD expands to dnl, so the rest of any HAMD line is discarded.
52ifdef(`OPERATION_popcount',`
53  define(`func',`mpn_popcount')
54  define(`ap',		`r0')
55  define(`n',		`r1')
56  define(`a0',		`r2')
57  define(`a1',		`r3')
58  define(`s',		`r5')
59  define(`b_01010101',	`r6')
60  define(`b_00110011',	`r7')
61  define(`b_00001111',	`r8')
62  define(`zero',	`r9')
63  define(`POPC',	`$1')
64  define(`HAMD',	`dnl')
65')
C Register assignment for the hamdist build.  This group only takes effect
C when OPERATION_hamdist is defined (presumably by the build; the place
C where it gets defined is not visible in this file).
C
C   ap, bp, n		r0, r1 and r2, read by the function body as its inputs
C   a0, a1		r6 and r7, the pair filled by the first ldrd in the loop
C   b0, b1		r4 and r5, the pair filled by the second ldrd.
C			NOTE: the function body also writes r4 and r5 directly
C			as scratch registers; b0 and b1 are consumed by the eor
C			instructions before that happens.
C   s			r11, the per-byte partial counts between iterations
C   b_01010101		r8, loaded with 0x55555555 at function entry
C   b_00110011		r9, loaded with 0x33333333 at function entry
C   b_00001111		r10, loaded with 0x0f0f0f0f at function entry
C   zero		r3, set to 0 at entry and passed to usada8
C
C   HAMD expands to its argument, so HAMD lines are emitted.
C   POPC expands to dnl, so the rest of any POPC line is discarded.
66ifdef(`OPERATION_hamdist',`
67  define(`func',`mpn_hamdist')
68  define(`ap',		`r0')
69  define(`bp',		`r1')
70  define(`n',		`r2')
71  define(`a0',		`r6')
72  define(`a1',		`r7')
73  define(`b0',		`r4')
74  define(`b1',		`r5')
75  define(`s',		`r11')
76  define(`b_01010101',	`r8')
77  define(`b_00110011',	`r9')
78  define(`b_00001111',	`r10')
79  define(`zero',	`r3')
80  define(`POPC',	`dnl')
81  define(`HAMD',	`$1')
82')
83
84MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
85
C func: shared body, assembled as mpn_popcount or mpn_hamdist.
C
C   popcount build: counts the 1 bits in the n 32-bit words at ap.
C   hamdist build:  counts the 1 bits in ap[i] xor bp[i] over n words.
C
C In:      ap, n, and bp for hamdist, in the registers picked by the defines.
C Out:     r0 = total bit count.
C Saved:   r4-r9 for popcount or r4-r11 for hamdist are pushed on entry and
C          popped before returning.  r12 holds the running total and is
C          used without being saved.
C Flags:   the subs on n is the only flag-setting instruction on each path
C          into L(mid), so the bne at the loop bottom tests that subs.
C
C Method:  each word is reduced in parallel from 1-bit fields to 2-bit,
C          4-bit and finally 8-bit field counts, held in s.  The four bytes
C          of s are folded into r12 by usada8 at the start of the next
C          iteration, and once more into r0 after the loop.
C
C NOTE(review): n = 0 does not look supported.  It would take the even path,
C load two words and leave the loop only when n reaches exactly zero.
C Presumably callers guarantee n >= 1 - confirm.
86ASM_START()
87PROLOGUE(func)
C Save the registers this build overwrites beyond r0-r3 and r12.
88POPC(`	push	{ r4-r9 }	')
89HAMD(`	push	{ r4-r11 }	')
90
C Load the three field masks, clear the running total r12, and clear the
C zero register that serves as the second usada8 operand.
91	ldr	b_01010101, =0x55555555
92	mov	r12, #0
93	ldr	b_00110011, =0x33333333
94	mov	zero, #0
95	ldr	b_00001111, =0x0f0f0f0f
96
C An odd n is handled by processing a single word first, an even n goes
C straight to the two-words-per-iteration loop.
97	tst	n, #1
98	beq	L(evn)
99
C Odd entry: reduce one word to 4-bit field counts in r5, then join the
C loop at L(mid) where it is folded to byte counts in s.
100L(odd):	ldr	a1, [ap], #4		C 1 x 32 1-bit accumulators, 0-1
101HAMD(`	ldr	b1, [bp], #4	')	C 1 x 32 1-bit accumulators, 0-1
102HAMD(`	eor	a1, a1, b1	')
C x - ((x >> 1) & 0x55555555) leaves the count of each bit pair in place.
103	and	r4, b_01010101, a1, lsr #1
104	sub	a1, a1, r4
C Add the even and odd 2-bit fields to get 4-bit field counts.
105	and	r4, a1, b_00110011
106	bic	r5, a1, b_00110011
107	add	r5, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
C n becomes even here.  If it was 1, Z is set and the bne after L(mid)
C falls through to the exit.
108	subs	n, n, #1
109	b	L(mid)
110
C Even entry: start with s = 0 so the first usada8 in the loop adds nothing.
111L(evn):	mov	s, #0
112
C Main loop, two words per iteration.
113L(top):	ldrd	a0, a1, [ap], #8	C 2 x 32 1-bit accumulators, 0-1
114HAMD(`	ldrd	b0, b1, [bp], #8')
115HAMD(`	eor	a0, a0, b0	')
116HAMD(`	eor	a1, a1, b1	')
C Flags from this subs stay live until the bne at the bottom of the loop.
117	subs	n, n, #2
C Fold the byte counts left in s by the previous pass into r12.  With a zero
C second operand usada8 adds the four bytes of s to its accumulator.  This
C must happen before r5 is overwritten below, since s is r5 for popcount.
118	usada8	r12, s, zero, r12
C Bit pairs: x - ((x >> 1) & 0x55555555) for both words.
119	and	r4, b_01010101, a0, lsr #1
120	sub	a0, a0, r4
121	and	r4, b_01010101, a1, lsr #1
122	sub	a1, a1, r4
C 2-bit fields to 4-bit fields, first word.
123	and	r4, a0, b_00110011
124	bic	r5, a0, b_00110011
125	add	a0, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
C 2-bit fields to 4-bit fields, second word.
126	and	r4, a1, b_00110011
127	bic	r5, a1, b_00110011
128	add	a1, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
C Combine both words while each 4-bit field still cannot overflow.
129	add	r5, a0, a1		C 8 4-bit accumulators, 0-8
C Shared tail: add the low and high nibble of every byte of r5 to form the
C four byte counts in s, then loop while the last subs left n non-zero.
130L(mid):	and	r4, r5, b_00001111
131	bic	r5, r5, b_00001111
132	add	s, r4, r5, lsr #4	C 4 8-bit accumulators
133	bne	L(top)
134
C Fold the final byte counts into the total and leave the result in r0.
135	usada8	r0, s, zero, r12
136POPC(`	pop	{ r4-r9 }	')
137HAMD(`	pop	{ r4-r11 }	')
C Return through the link register.
138	bx	r14
139EPILOGUE()
140