1/*	$NetBSD: memcpy_arm.S,v 1.1 2003/10/14 07:51:45 scw Exp $	*/
2
3/*-
4 * Copyright (c) 1997 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Neil A. Carson and Mark Brinicombe
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <machine/asm.h>
33__FBSDID("$FreeBSD$");
/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it:
 *
 * The core code is implemented here with simple stubs for memcpy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix, a label starting with f is used in the forward copy
 * code while a label starting with b is used in the backwards copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * (The memcpy() below only ever copies forwards.)
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy:
 *   - unaligned source address
 *   - unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses throughout, i.e. dest, src, len.
 * Additional registers are preserved prior to use, i.e. r4, r5 & lr.
 *
 * Apologies for the state of the comments ;-)
 */
/*
 * memcpy - copy len bytes from src to dst, walking upwards through memory.
 *
 * In:	r0 = dst, r1 = src, r2 = len
 * Out:	r0 = dst as passed in (pushed at entry, reloaded by every return)
 * Uses:	r3, r12 (ip) and lr as scratch.  r4 and r5 are pushed before they
 *	are used and popped again as soon as the loop that borrowed them
 *	is finished.
 *
 * r2 is not a plain byte count.  Each stage subtracts the size it is about
 * to test for and a later stage adds it back, so at every label r2 holds
 * "bytes remaining minus a bias".  The bias is noted at each label.
 *
 * The unaligned-source paths read whole aligned words, so bytes of the
 * first and last source word that lie outside [src, src + len) are loaded
 * but never stored.
 *
 * NOTE(review): every length test uses a signed condition (blt/bge), so a
 * len with bit 31 set takes the "less than 4 bytes" path.
 */
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4		/* r2 = len - 4 */
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3		/* r12 = dst & 3 */
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3		/* r12 = src & 3 */
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination; r2 = remaining - 4 */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time; r2 = remaining - 32 */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10		/* ge <=> at least 16 bytes remain */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	/* r2 = remaining - 32 on entry, remaining - 12 after the add */
	adds	r2, r2, #0x14

	/*
	 * blat 12 bytes at a time.
	 * The flags come from the adds above on the first pass and from the
	 * subges on later passes; when they say "lt" the whole body is
	 * skipped and r2 is left untouched.
	 */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	/* r2 = remaining - 12 on entry, remaining - 4 after the add */
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	/* 4..11 bytes remain: copy one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4		/* second word; flags are left alone */

.Lmemcpy_l4:
	/* less than 4 bytes to go; r2 = remaining - 4 on entry */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
	/*
	 * NOTE(review): only this early return has an APCS-26 form; the
	 * return at the end of the byte tail below does not.  The macro is
	 * spelled with a single trailing underscore here - presumably
	 * __APCS_26__ was intended; confirm before relying on it.
	 */
#ifdef __APCS_26_
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/*
	 * copy the crud byte at a time.
	 * ldrb/strb leave the flags from the cmp alone: the first byte is
	 * unconditional, ge adds a second (r2 >= 2), gt adds a third
	 * (r2 == 3).
	 */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination; r12 = dst & 3 (1..3), r2 = len - 4 */
.Lmemcpy_destul:
	rsb	r12, r12, #4		/* r12 = bytes needed to align dst */
	cmp	r12, #2

	/* align destination with byte copies: 1, 2 (ge) or 3 (gt) of them */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12		/* r2 = remaining - 4 */
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3		/* r12 = src & 3 */
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * On entry dst is word aligned, r12 = src & 3 (1..3) and
	 * r2 = remaining - 4.  r1 is rounded down to a word boundary and lr
	 * is primed with the word holding the first source bytes.  Each
	 * output word is then built from the tail of the previous source
	 * word (kept in lr) and the head of the next one.  The shift
	 * directions swap under __ARMEB__ because the lowest-addressed byte
	 * then sits in the most significant bits of a loaded word.
	 */
.Lmemcpy_srcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3		/* src & 3 == 3 */
	beq	.Lmemcpy_srcul2		/* src & 3 == 2 */
	/* src & 3 == 1: three bytes of each loaded word are carried over */
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4	/* fewer than 16 bytes remain */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

	/* 16 bytes per pass: one carried word plus four freshly loaded ones */
.Lmemcpy_srcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_srcul1l4

	/* one word per pass */
.Lmemcpy_srcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	/* r1 is one word past lr's address; step back to the next unread byte */
	sub	r1, r1, #3
	b	.Lmemcpy_l4

	/* src & 3 == 2: two bytes of each loaded word are carried over */
.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4	/* fewer than 16 bytes remain */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	/* step r1 back to the next unread source byte */
	sub	r1, r1, #2
	b	.Lmemcpy_l4

	/* src & 3 == 3: one byte of each loaded word is carried over */
.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4	/* fewer than 16 bytes remain */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	/* step r1 back to the next unread source byte */
	sub	r1, r1, #1
	b	.Lmemcpy_l4
333