dnl  ARM Neon mpn_lshiftc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C StrongARM	 -		 -
C XScale	 -		 -
C Cortex-A7	 ?		 ?
C Cortex-A8	 ?		 ?
C Cortex-A9	 3.5		 3.5				Y
C Cortex-A15	 1.75		 1.75				Y

C We read 64 bits at a time at 32-bit aligned addresses, and except for the
C first and last store, we write using 64-bit aligned addresses.  All shifting
C is done on 64-bit words in 'extension' registers.
C
C It should also be possible to read using 64-bit alignment, by manipulating
C the shift count for unaligned operands.  This is not done, since it does not
C seem to matter for A9 or A15.
C
C This will not work in big-endian mode.
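C
C For reference, the limb-level effect of this routine can be sketched in C
C roughly as follows (a non-authoritative sketch assuming full 32-bit limbs,
C no nails, and 1 <= cnt <= 31; the name lshiftc_ref is illustrative only):
C
C	mp_limb_t
C	lshiftc_ref (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned int cnt)
C	{
C	  unsigned int tnc = 32 - cnt;
C	  mp_limb_t retval = ap[n - 1] >> tnc;	/* bits shifted out */
C	  for (mp_size_t i = n - 1; i > 0; i--)
C	    rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> tnc));
C	  rp[0] = ~(ap[0] << cnt);
C	  return retval;
C	}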

C TODO
C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
C    which might make it tricky.
C  * Clean up and simplify.
C  * Consider sharing most of the code for lshift and rshift, since the feed-in
C    code, the loop, and most of the wind-down code are identical.
C  * Replace the basecase code with code using 'extension' registers.
C  * Optimise.  It is not clear that this loop insn permutation is optimal for
C    either A9 or A15.

C INPUT PARAMETERS
define(`rp',  `r0')
define(`ap',  `r1')
define(`n',   `r2')
define(`cnt', `r3')

ASM_START(neon)
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
	mov	r12, n, lsl #2
	add	rp, rp, r12
	add	ap, ap, r12

	cmp	n, #4			C SIMD code n limit
	ble	L(base)

	vdup.32	d6, r3			C left shift count is positive
	sub	r3, r3, #64		C right shift count is negative
	vdup.32	d7, r3
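C Note: vshl.u64 with the negative count in d7 performs a logical right shift
C by 64-cnt bits.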
	mov	r12, #-8		C lshift pointer update offset

	sub	ap, ap, #8
	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
	vshl.u64 d18, d19, d7		C retval

	tst	rp, #4			C is rp 64-bit aligned already?
	beq	L(rp_aligned)		C yes, skip
	vmvn	 d19, d19
	add	ap, ap, #4		C move back ap pointer
	vshl.u64 d4, d19, d6
	sub	n, n, #1		C first limb handled
	sub	 rp, rp, #4
	vst1.32	 {d4[1]}, [rp]		C store first limb, rp gets aligned
	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]

L(rp_aligned):
	sub	rp, rp, #8
	subs	n, n, #6
	vmvn	 d19, d19
	blt	L(two_or_three_more)
	tst	n, #2
	beq	L(2)

L(1):	vld1.32	 {d17}, [ap], r12
	vshl.u64 d5, d19, d6
	vmvn	 d17, d17
	vld1.32	 {d16}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	sub	n, n, #2
	b	 L(mid)

L(2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vmvn	 d16, d16
	vld1.32	 {d17}, [ap], r12
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	subs	n, n, #4
	blt	L(end)

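C Main loop: processes 4 limbs (two 64-bit pairs) per iteration, software
C pipelined in the two symmetric halves L(top) and L(mid).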
L(top):	vmvn	 d17, d17
	vld1.32	 {d16}, [ap], r12
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	 {d2}, [rp:64], r12
L(mid):	vmvn	 d16, d16
	vld1.32	 {d17}, [ap], r12
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	subs	n, n, #4
	bge	L(top)

L(end):	tst	 n, #1
	beq	 L(evn)

	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
	b	 L(cj1)

L(evn):	vmvn	 d17, d17
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	 {d2}, [rp:64], r12
	vmov.u8	 d17, #255
	vorr	 d2, d5, d0
	vshl.u64 d0, d17, d7
	vorr	 d3, d4, d0
	b	 L(cj2)

C Load last 2 - 3 limbs, store last 4 - 5 limbs
L(two_or_three_more):
	tst	n, #1
	beq	L(l2)

L(l3):	vshl.u64 d5, d19, d6
	vld1.32	 {d17}, [ap], r12
L(cj1):	vmov.u8	 d16, #0
	add	 ap, ap, #4
	vmvn	 d17, d17
	vld1.32	 {d16[1]}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vmvn	 d16, d16
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
	add	 rp, rp, #4
	vst1.32	 {d5[1]}, [rp]
	vmov.32	 r0, d18[0]
	bx	lr

L(l2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vmvn	 d16, d16
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vmov.u8	 d17, #255
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vorr	 d3, d5, d0
L(cj2):	vst1.32	 {d2}, [rp:64], r12
	vst1.32	 {d3}, [rp]
	vmov.32	 r0, d18[0]
	bx	lr


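C Basecase for n <= 4, using plain core registers.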
define(`tnc', `r12')
L(base):
	push	{r4, r6, r7, r8}
	ldr	r4, [ap, #-4]!
	rsb	tnc, cnt, #32
	mvn	r6, r4

	mov	r7, r6, lsl cnt
	tst	n, #1
	beq	L(ev)			C n even

L(od):	subs	n, n, #2
	bcc	L(ed1)			C n = 1
	ldr	r8, [ap, #-4]!
	mvn	r8, r8
	b	L(md)			C n = 3

L(ev):	ldr	r6, [ap, #-4]!
	mvn	r6, r6
	subs	n, n, #2
	beq	L(ed)			C n = 2
					C n = 4
L(tp):	ldr	r8, [ap, #-4]!
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mvn	r8, r8
	mov	r7, r6, lsl cnt
L(md):	ldr	r6, [ap, #-4]!
	orr	r7, r7, r8, lsr tnc
	str	r7, [rp, #-4]!
	mvn	r6, r6
	mov	r7, r8, lsl cnt

L(ed):	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r6, lsl cnt
L(ed1):	mvn	r6, #0
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]
	mov	r0, r4, lsr tnc
	pop	{r4, r6, r7, r8}
	bx	r14
EPILOGUE()
