/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */


/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14
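
/* Editor's note (not in the original source): every register used above
 * (x0-x14) is an argument or caller-saved temporary register under the
 * AArch64 procedure call standard, so nothing needs to be saved or
 * restored across these routines.  */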



.align 6
.globl _bcopy
_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); */
	mov	x3, x0
	mov	x0, x1
	mov	x1, x3
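
	/* Illustrative sketch (editor's addition, not in the original
	 * source): bcopy takes its pointers in (src, dst) order while
	 * memcpy takes (dst, src), so the three moves above swap x0 and
	 * x1 through x3 and then fall straight through into _memcpy.
	 * Roughly equivalent C:
	 *
	 *	void bcopy(const void *src, void *dst, size_t len)
	 *	{
	 *		memcpy(dst, src, len);
	 *	}
	 */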

.globl _memcpy
_memcpy:
	mov	dst, dstin
	cmp	count, #64
	b.ge	.Lcpy_not_short
	cmp	count, #15
	b.le	.Ltail15tiny

	/* Deal with small copies quickly by dropping straight into the
	 * exit block.  */
.Ltail63:
	/* Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
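
	/* Illustrative note (editor's addition): tmp1 = count & 0x30 is 0,
	 * 16, 32 or 48.  Both pointers were advanced by tmp1 up front, and
	 * the LDP/STP pairs above copy at offsets -48/-32/-16, so the code
	 * falls through the right number of 16-byte copies: tmp1 == 48 runs
	 * all three, 32 enters at label 1, 16 enters at label 2.  The
	 * remaining count & 15 bytes are handled by .Ltail15 below.  */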

.Ltail15:
	ands	count, count, #15
	b.eq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	ret

.Ltail15tiny:
	/* Copy up to 15 bytes of data.  Never touches bytes outside the
	   remaining data to be copied.  */
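	/* Illustrative note (editor's addition): count is at most 15 here,
	 * so each set bit of count triggers exactly one copy below -- bit 3
	 * an 8-byte copy, bit 2 a 4-byte copy, bit 1 a 2-byte copy, bit 0 a
	 * single byte -- with post-indexed addressing advancing both
	 * pointers as it goes.  */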
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lcpy_not_short:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	 * around copying sub-Quadword quantities.  We know that
	 * it can't overrun.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
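
	/* Illustrative example (editor's addition): if SRC ends in ...3,
	 * then tmp2 = (-src) & 15 = 13.  The unconditional LDP/STP above
	 * copies bytes 0-15, then both pointers advance by 13, so bytes
	 * 13-15 are simply copied a second time by the aligned code that
	 * follows.  This is safe because count >= 64 on entry to
	 * .Lcpy_not_short, and it leaves SRC 16-byte aligned.  */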
2:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
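	/* Illustrative note (editor's addition): the loop below is software
	 * pipelined.  The prologue loads the first 64 bytes into A-D; each
	 * iteration then stores the 64 bytes loaded previously while loading
	 * the next 64.  DST is pre-biased by -16 and SRC finishes the
	 * prologue 16 bytes short of the next block, so loads and stores in
	 * the body share the same #16..#64 offsets, with writeback only on
	 * the final pair of each.  */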
	.p2align 6
.Lcpy_body_large:
	/* There are at least 128 bytes to copy.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 48 (64 minus the pre-bias).  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

.align 6
.globl _memmove
_memmove:
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	_memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up
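
	/* Illustrative sketch (editor's addition, not in the original
	 * source): the dispatch above is roughly
	 *
	 *	if (dst < src)			// destination below source
	 *		goto downwards;
	 *	if (dst >= src + count)		// no overlap at all
	 *		return memcpy(dst, src, count);
	 *	// dst overlaps the tail of src: move backwards, starting
	 *	// one byte past the end of both buffers.
	 */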

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Never touches bytes outside the
	 * remaining data to be moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
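
	/* Illustrative note (editor's addition): on entry to
	 * .Lmov_not_short_up, SRC points one byte past the remaining data,
	 * so tmp2 = src & 15 is the number of bytes to move downwards before
	 * SRC becomes 16-byte aligned.  Because the buffers may overlap,
	 * memcpy's trick of over-copying 16 bytes is not available; instead
	 * each set bit of tmp2 (8, 4, 2, 1) drives exactly one
	 * pre-decremented move in the chain above.  */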

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is at least 16 bytes below SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	_memcpy		/* May overlap, but not critically.  */
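
	/* Illustrative note (editor's addition): the 16-byte margin covers
	 * memcpy's habit of copying some bytes twice -- its SRC-alignment
	 * prologue re-reads up to 15 bytes it has already stored.  With DST
	 * at least one LDP/STP width (16 bytes) below SRC, no store can land
	 * on source bytes that are still to be read, so the forward copy is
	 * safe despite the overlap.  */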

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Never touches bytes outside the
	   remaining data to be moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 48 (64 minus the pre-bias).  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
