memcpy_arm.S revision 129202
/*	$NetBSD: memcpy_arm.S,v 1.1 2003/10/14 07:51:45 scw Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
/* Revision identification string for this file. */
__FBSDID("$FreeBSD: head/lib/libc/arm/string/memcpy_arm.S 129202 2004-05-14 12:04:31Z cognet $");
/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy().
 *
 * All local labels are prefixed with .Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * (The memcpy() below only ever copies forwards: it has no backward-copy
 * path and its labels carry no f/b marker.)
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses, i.e. dest, src, len, throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5 & lr.
 *
 * Apologies for the state of the comments ;-)
 */
/*
 * memcpy: copy len bytes from src to dst, always walking forwards.
 *
 * In:	r0 = dst, r1 = src, r2 = len
 * Out:	r0 = the dst value passed in (reloaded from the stack on exit)
 *
 * Scratch registers: r3, r12 (ip) and lr.  lr is free because it is pushed
 * on entry.  r4 and r5 are pushed immediately before the loops that need
 * them and popped again as soon as those loops finish.
 *
 * r2 is kept *biased*: at each stage it holds (bytes remaining) minus the
 * size of the chunk that stage moves, so the sign of the result of a SUBS
 * tells whether another chunk still fits.  The bias in force is noted at
 * each label.
 *
 * All length tests use signed conditions (blt/bge), so the code relies on
 * len being below 2^31.
 *
 * Layout:
 *	.Lmemcpy_t8 .. .Lmemcpy_l12	both pointers word aligned
 *	.Lmemcpy_l4			final 0-3 bytes, byte at a time
 *	.Lmemcpy_destul			byte copies until dst is aligned
 *	.Lmemcpy_srcul{1,2,3}		dst aligned, src off by 1/2/3 bytes
 */
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	/*
	 * Stash the return value (dst) together with lr.  Every exit below
	 * pops this pair straight into {r0, pc}.
	 */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4		/* r2 = len - 4 */
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3		/* r12 = dst & 3 */
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3		/* r12 = src & 3 */
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination; r2 = remaining - 4 */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14		/* r2 = remaining - 32 */
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}	/* first 16 bytes */
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}	/* second 16 bytes */
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20		/* still r2 = remaining - 32 */
	bge	.Lmemcpy_loop32		/* another full 32 bytes available */

	/* Here -32 <= r2 < 0.  r2 + 16 >= 0 means 16 or more bytes are left. */
	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10		/* keep the "remaining - 32" bias */
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	/* r2 = remaining - 32 on entry; rebias and set the flags for below */
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */

	/*
	 * blat 12 bytes at a time
	 * Every instruction in the loop is conditional on ge, so when fewer
	 * than 12 bytes remain on entry nothing executes and control simply
	 * falls through with r2 and the flags untouched.
	 */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	/* r2 = remaining - 12, and fewer than 12 bytes are left */
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	.Lmemcpy_l4		/* fewer than 4: bytes only */

	/*
	 * 4 to 11 bytes left.  Move one word (4-7 left) or two words
	 * (8-11 left); either way r2 ends up as "remaining - 4", which is
	 * what .Lmemcpy_l4 expects.
	 */
	subs	r2, r2, #4		/* r2 = remaining - 8 */
	ldrlt	r3, [r1], #4		/* lt: a single word */
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}		/* ge: two words */
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go; r2 = remaining - 4 on entry */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
	/*
	 * NOTE(review): the guard below is spelled with a single trailing
	 * underscore.  The compiler-provided macro is presumably
	 * __APCS_26__, in which case the first branch is never assembled.
	 * Left exactly as found; confirm before changing, and note that the
	 * unconditional return further down has no matching variant.
	 */
#ifdef __APCS_26_
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/*
	 * copy the crud byte at a time
	 * r2 is 1, 2 or 3 here.  One compare drives all three copies:
	 * always one byte, a second if r2 >= 2, a third if r2 > 2.
	 * The byte loads/stores in between do not alter the flags.
	 */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}		/* return the original dst */

	/*
	 * erg - unaligned destination
	 * On entry r12 = dst & 3 (1..3) and r2 = len - 4 >= 0, so at least
	 * four bytes are available and copying up to three cannot overrun.
	 */
.Lmemcpy_destul:
	rsb	r12, r12, #4		/* r12 = bytes needed to align dst */
	cmp	r12, #2

	/* align destination with byte copies (1, >= 2, > 2 as in the tail) */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12		/* r2 = remaining - 4 again */
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3		/* r12 = src & 3 */
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/*
	 * erg - unaligned source
	 * This is where it gets nasty ...
	 *
	 * On entry: dst is word aligned, r12 = src & 3 (1, 2 or 3) and
	 * r2 = remaining - 4 >= 0.
	 *
	 * The source is only ever read as whole aligned words.  lr always
	 * holds the most recently loaded source word, part of which has not
	 * been stored yet; each output word is built from the unconsumed
	 * part of one source word and the leading part of the next, using a
	 * pair of shifts whose amounts depend on the misalignment.  The very
	 * first load therefore also fetches the 1-3 bytes that precede src
	 * in the same word; they are shifted out and never stored.
	 *
	 * Under __ARMEB__ (big-endian) the shift directions are mirrored.
	 */
.Lmemcpy_srcul:
	bic	r1, r1, #3		/* round src down to a word boundary */
	ldr	lr, [r1], #4		/* lr = word containing first src bytes */
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3		/* src & 3 == 3 */
	beq	.Lmemcpy_srcul2		/* src & 3 == 2 */
	/* fall through: src & 3 == 1, three useful bytes in each pending word */
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4	/* fewer than 16 bytes: word loop only */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */

	/* 16 bytes per iteration: 3 pending bytes + 1 byte of the next word */
.Lmemcpy_srcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}	/* next four source words */
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}	/* four realigned words out */
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}		/* return r4, r5 */
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_srcul1l4	/* no whole word left */

	/* One output word per iteration, same merge as above */
.Lmemcpy_srcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	/*
	 * r1 has been advanced past the word held in lr, but three bytes
	 * of that word are still uncopied.  Step back so that r1 points at
	 * the first uncopied byte, then finish byte-wise.
	 */
	sub	r1, r1, #3
	b	.Lmemcpy_l4

	/* src & 3 == 2: two useful bytes in each pending word, 16-bit shifts */
.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4	/* fewer than 16 bytes: word loop only */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */

.Lmemcpy_srcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}	/* next four source words */
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}	/* four realigned words out */
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}		/* return r4, r5 */
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_srcul2l4	/* no whole word left */

.Lmemcpy_srcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	/* two bytes of the word in lr are still uncopied: back r1 up by 2 */
	sub	r1, r1, #2
	b	.Lmemcpy_l4

	/* src & 3 == 3: one useful byte in each pending word, 24/8-bit shifts */
.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4	/* fewer than 16 bytes: word loop only */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */

.Lmemcpy_srcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}	/* next four source words */
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}	/* four realigned words out */
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}		/* return r4, r5 */
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_srcul3l4	/* no whole word left */

.Lmemcpy_srcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	/* one byte of the word in lr is still uncopied: back r1 up by 1 */
	sub	r1, r1, #1
	b	.Lmemcpy_l4
340