/* memcpy_arm.S revision 129202 */
1/*	$NetBSD: memcpy_arm.S,v 1.1 2003/10/14 07:51:45 scw Exp $	*/
2
3/*-
4 * Copyright (c) 1997 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Neil A. Carson and Mark Brinicombe
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *        This product includes software developed by the NetBSD
21 *        Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 *    contributors may be used to endorse or promote products derived
24 *    from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39#include <machine/asm.h>
40__FBSDID("$FreeBSD: head/lib/libc/arm/string/memcpy_arm.S 129202 2004-05-14 12:04:31Z cognet $");
41/*
42 * This is one fun bit of code ...
43 * Some easy listening music is suggested while trying to understand this
44 * code e.g. Iron Maiden
45 *
46 * For anyone attempting to understand it :
47 *
48 * The core code is implemented here with simple stubs for memcpy().
49 *
50 * All local labels are prefixed with Lmemcpy_
51 * Following the prefix a label starting f is used in the forward copy code
52 * while a label using b is used in the backwards copy code
53 * The source and destination addresses determine whether a forward or
54 * backward copy is performed.
55 * Separate bits of code are used to deal with the following situations
56 * for both the forward and backwards copy.
57 * unaligned source address
58 * unaligned destination address
59 * Separate copy routines are used to produce an optimised result for each
60 * of these cases.
61 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
62 * a time where possible.
63 *
64 * Note: r12 (aka ip) can be trashed during the function along with
65 * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
66 * Additional registers are preserved prior to use i.e. r4, r5 & lr
67 *
68 * Apologies for the state of the comments ;-)
69 */
70/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
71ENTRY(memcpy)
72	/* save leaf functions having to store this away */
73	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
74
75	subs	r2, r2, #4
76	blt	.Lmemcpy_l4		/* less than 4 bytes */
77	ands	r12, r0, #3
78	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
79	ands	r12, r1, #3
80	bne	.Lmemcpy_srcul		/* oh unaligned source addr */
81
82.Lmemcpy_t8:
83	/* We have aligned source and destination */
84	subs	r2, r2, #8
85	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
86	subs	r2, r2, #0x14
87	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
88	stmdb	sp!, {r4}		/* borrow r4 */
89
90	/* blat 32 bytes at a time */
91	/* XXX for really big copies perhaps we should use more registers */
92.Lmemcpy_loop32:
93	ldmia	r1!, {r3, r4, r12, lr}
94	stmia	r0!, {r3, r4, r12, lr}
95	ldmia	r1!, {r3, r4, r12, lr}
96	stmia	r0!, {r3, r4, r12, lr}
97	subs	r2, r2, #0x20
98	bge	.Lmemcpy_loop32
99
100	cmn	r2, #0x10
101	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
102	stmgeia	r0!, {r3, r4, r12, lr}
103	subge	r2, r2, #0x10
104	ldmia	sp!, {r4}		/* return r4 */
105
106.Lmemcpy_l32:
107	adds	r2, r2, #0x14
108
109	/* blat 12 bytes at a time */
110.Lmemcpy_loop12:
111	ldmgeia	r1!, {r3, r12, lr}
112	stmgeia	r0!, {r3, r12, lr}
113	subges	r2, r2, #0x0c
114	bge	.Lmemcpy_loop12
115
116.Lmemcpy_l12:
117	adds	r2, r2, #8
118	blt	.Lmemcpy_l4
119
120	subs	r2, r2, #4
121	ldrlt	r3, [r1], #4
122	strlt	r3, [r0], #4
123	ldmgeia	r1!, {r3, r12}
124	stmgeia	r0!, {r3, r12}
125	subge	r2, r2, #4
126
127.Lmemcpy_l4:
128	/* less than 4 bytes to go */
129	adds	r2, r2, #4
130#ifdef __APCS_26_
131	ldmeqia sp!, {r0, pc}^		/* done */
132#else
133	ldmeqia	sp!, {r0, pc}		/* done */
134#endif
135	/* copy the crud byte at a time */
136	cmp	r2, #2
137	ldrb	r3, [r1], #1
138	strb	r3, [r0], #1
139	ldrgeb	r3, [r1], #1
140	strgeb	r3, [r0], #1
141	ldrgtb	r3, [r1], #1
142	strgtb	r3, [r0], #1
143	ldmia	sp!, {r0, pc}
144
145	/* erg - unaligned destination */
146.Lmemcpy_destul:
147	rsb	r12, r12, #4
148	cmp	r12, #2
149
150	/* align destination with byte copies */
151	ldrb	r3, [r1], #1
152	strb	r3, [r0], #1
153	ldrgeb	r3, [r1], #1
154	strgeb	r3, [r0], #1
155	ldrgtb	r3, [r1], #1
156	strgtb	r3, [r0], #1
157	subs	r2, r2, r12
158	blt	.Lmemcpy_l4		/* less the 4 bytes */
159
160	ands	r12, r1, #3
161	beq	.Lmemcpy_t8		/* we have an aligned source */
162
163	/* erg - unaligned source */
164	/* This is where it gets nasty ... */
165.Lmemcpy_srcul:
166	bic	r1, r1, #3
167	ldr	lr, [r1], #4
168	cmp	r12, #2
169	bgt	.Lmemcpy_srcul3
170	beq	.Lmemcpy_srcul2
171	cmp	r2, #0x0c
172	blt	.Lmemcpy_srcul1loop4
173	sub	r2, r2, #0x0c
174	stmdb	sp!, {r4, r5}
175
176.Lmemcpy_srcul1loop16:
177#ifdef __ARMEB__
178	mov	r3, lr, lsl #8
179#else
180	mov	r3, lr, lsr #8
181#endif
182	ldmia	r1!, {r4, r5, r12, lr}
183#ifdef __ARMEB__
184	orr	r3, r3, r4, lsr #24
185	mov	r4, r4, lsl #8
186	orr	r4, r4, r5, lsr #24
187	mov	r5, r5, lsl #8
188	orr	r5, r5, r12, lsr #24
189	mov	r12, r12, lsl #8
190	orr	r12, r12, lr, lsr #24
191#else
192	orr	r3, r3, r4, lsl #24
193	mov	r4, r4, lsr #8
194	orr	r4, r4, r5, lsl #24
195	mov	r5, r5, lsr #8
196	orr	r5, r5, r12, lsl #24
197	mov	r12, r12, lsr #8
198	orr	r12, r12, lr, lsl #24
199#endif
200	stmia	r0!, {r3-r5, r12}
201	subs	r2, r2, #0x10
202	bge	.Lmemcpy_srcul1loop16
203	ldmia	sp!, {r4, r5}
204	adds	r2, r2, #0x0c
205	blt	.Lmemcpy_srcul1l4
206
207.Lmemcpy_srcul1loop4:
208#ifdef __ARMEB__
209	mov	r12, lr, lsl #8
210#else
211	mov	r12, lr, lsr #8
212#endif
213	ldr	lr, [r1], #4
214#ifdef __ARMEB__
215	orr	r12, r12, lr, lsr #24
216#else
217	orr	r12, r12, lr, lsl #24
218#endif
219	str	r12, [r0], #4
220	subs	r2, r2, #4
221	bge	.Lmemcpy_srcul1loop4
222
223.Lmemcpy_srcul1l4:
224	sub	r1, r1, #3
225	b	.Lmemcpy_l4
226
227.Lmemcpy_srcul2:
228	cmp	r2, #0x0c
229	blt	.Lmemcpy_srcul2loop4
230	sub	r2, r2, #0x0c
231	stmdb	sp!, {r4, r5}
232
233.Lmemcpy_srcul2loop16:
234#ifdef __ARMEB__
235	mov	r3, lr, lsl #16
236#else
237	mov	r3, lr, lsr #16
238#endif
239	ldmia	r1!, {r4, r5, r12, lr}
240#ifdef __ARMEB__
241	orr	r3, r3, r4, lsr #16
242	mov	r4, r4, lsl #16
243	orr	r4, r4, r5, lsr #16
244	mov	r5, r5, lsl #16
245	orr	r5, r5, r12, lsr #16
246	mov	r12, r12, lsl #16
247	orr	r12, r12, lr, lsr #16
248#else
249	orr	r3, r3, r4, lsl #16
250	mov	r4, r4, lsr #16
251	orr	r4, r4, r5, lsl #16
252	mov	r5, r5, lsr #16
253	orr	r5, r5, r12, lsl #16
254	mov	r12, r12, lsr #16
255	orr	r12, r12, lr, lsl #16
256#endif
257	stmia	r0!, {r3-r5, r12}
258	subs	r2, r2, #0x10
259	bge	.Lmemcpy_srcul2loop16
260	ldmia	sp!, {r4, r5}
261	adds	r2, r2, #0x0c
262	blt	.Lmemcpy_srcul2l4
263
264.Lmemcpy_srcul2loop4:
265#ifdef __ARMEB__
266	mov	r12, lr, lsl #16
267#else
268	mov	r12, lr, lsr #16
269#endif
270	ldr	lr, [r1], #4
271#ifdef __ARMEB__
272	orr	r12, r12, lr, lsr #16
273#else
274	orr	r12, r12, lr, lsl #16
275#endif
276	str	r12, [r0], #4
277	subs	r2, r2, #4
278	bge	.Lmemcpy_srcul2loop4
279
280.Lmemcpy_srcul2l4:
281	sub	r1, r1, #2
282	b	.Lmemcpy_l4
283
284.Lmemcpy_srcul3:
285	cmp	r2, #0x0c
286	blt	.Lmemcpy_srcul3loop4
287	sub	r2, r2, #0x0c
288	stmdb	sp!, {r4, r5}
289
290.Lmemcpy_srcul3loop16:
291#ifdef __ARMEB__
292	mov	r3, lr, lsl #24
293#else
294	mov	r3, lr, lsr #24
295#endif
296	ldmia	r1!, {r4, r5, r12, lr}
297#ifdef __ARMEB__
298	orr	r3, r3, r4, lsr #8
299	mov	r4, r4, lsl #24
300	orr	r4, r4, r5, lsr #8
301	mov	r5, r5, lsl #24
302	orr	r5, r5, r12, lsr #8
303	mov	r12, r12, lsl #24
304	orr	r12, r12, lr, lsr #8
305#else
306	orr	r3, r3, r4, lsl #8
307	mov	r4, r4, lsr #24
308	orr	r4, r4, r5, lsl #8
309	mov	r5, r5, lsr #24
310	orr	r5, r5, r12, lsl #8
311	mov	r12, r12, lsr #24
312	orr	r12, r12, lr, lsl #8
313#endif
314	stmia	r0!, {r3-r5, r12}
315	subs	r2, r2, #0x10
316	bge	.Lmemcpy_srcul3loop16
317	ldmia	sp!, {r4, r5}
318	adds	r2, r2, #0x0c
319	blt	.Lmemcpy_srcul3l4
320
321.Lmemcpy_srcul3loop4:
322#ifdef __ARMEB__
323	mov	r12, lr, lsl #24
324#else
325	mov	r12, lr, lsr #24
326#endif
327	ldr	lr, [r1], #4
328#ifdef __ARMEB__
329	orr	r12, r12, lr, lsr #8
330#else
331	orr	r12, r12, lr, lsl #8
332#endif
333	str	r12, [r0], #4
334	subs	r2, r2, #4
335	bge	.Lmemcpy_srcul3loop4
336
337.Lmemcpy_srcul3l4:
338	sub	r1, r1, #1
339	b	.Lmemcpy_l4
340