support.S revision 175255
/*-
 * Copyright (c) 2004 Olivier Houchard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/asmacros.h>
__FBSDID("$FreeBSD: head/sys/arm/arm/support.S 175255 2008-01-12 21:11:43Z cognet $");

#include "assym.s"

.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)
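/*
 * These literals hold the addresses of the _arm_memcpy/_arm_bzero function
 * pointers and of the _min_memcpy_size/_min_bzero_size thresholds.  bzero()
 * and memcpy() below load them through this pool and, when a hook is
 * installed and the request is large enough, call the platform-supplied
 * routine (e.g. a DMA engine driver) instead of the inline loops.
 */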
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
	ldr	r3, .L_arm_bzero
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal0
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2
	blt	.Lnormal0
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0
	mov	lr, pc
	mov	pc, r3
	cmp	r0, #0
	ldmfd	sp!, {r0, r1, lr}
	RETeq
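	/*
	 * The "mov lr, pc; mov pc, r3" pair above is the classic pre-ARMv5
	 * indirect call idiom: pc reads as the address of the mov plus 8,
	 * so lr ends up pointing at the cmp after the call.  The hook
	 * appears to return 0 on success, in which case the saved
	 * registers are restored and we return; otherwise we fall through
	 * to the ordinary software loop.
	 */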
.Lnormal0:
	mov	r3, #0x00
	b	do_memset

/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
#ifdef _ARM_ARCH_5E
	tst	ip, #0x04		/* Quad-align for armv5e */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
#ifdef _ARM_ARCH_5E
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
#endif
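	/*
	 * On ARMv5E the strd stores used below require a 64-bit aligned
	 * address, hence the extra single word store above to quad-align
	 * the pointer before entering the doubleword loops.
	 */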
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

#ifdef _ARM_ARCH_5E
	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2
#else
	cmp	r1, #-2
#endif
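	/*
	 * At this point 1-3 bytes remain.  In the non-ARMv5E path r1 still
	 * carries the loop4 over-subtraction, so the remaining count is
	 * r1 + 4 and the comparison is against #-2; the ARMv5E path has
	 * already added the 4 back and compares against #2 directly.
	 */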

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strgtb	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

ENTRY(bcmp)
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eornes	r3, ip, r1
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/* Word-align the addresses, if necessary */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop
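	/*
	 * Computed jump: r3 = 3 * ((r1 - 5) & 3), and since pc reads as the
	 * address of the add plus 8, "addne pc, pc, r3, lsl #3" advances by
	 * r3 * 8 = 24 * ((r1 - 5) & 3) bytes -- one six-instruction compare
	 * block per step (RETne/RETeq each assemble to one instruction).
	 * The nop makes pc + 8 coincide with the first block, so a source
	 * needing 3, 2 or 1 leading byte compares enters at the right
	 * point, while an already word-aligned source skips all three
	 * blocks straight into the word loop.
	 */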

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET

ENTRY(bcopy)
	/* switch the source and destination registers */
	eor     r0, r1, r0
	eor     r1, r0, r1
	eor     r0, r1, r0
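	/*
	 * The three eors exchange r0 and r1 without a scratch register, so
	 * bcopy(src, dst, len) falls straight through into memmove(dst,
	 * src, len) with its arguments in the right order.
	 */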
ENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcs	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcc	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcs	PIC_SYM(_C_LABEL(memcpy), PLT)
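	/*
	 * r3 = |dst - src|; when that distance is at least len the regions
	 * cannot overlap and the plain forward memcpy above is safe.
	 * Otherwise fall through and pick a direction below: ascending
	 * when dst < src, descending when dst > src.
	 */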

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
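	/*
	 * The source is not word aligned relative to the (now aligned)
	 * destination.  The loops below read whole words from the aligned-
	 * down source and reassemble each output word from two neighbours
	 * with shift-and-orr pairs (lsr/lsl little-endian, lsl/lsr
	 * big-endian), one variant per possible byte offset.
	 */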
.Lmemmove_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemmove_fl4

.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemmove_fl4

.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemmove_fl4

.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
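	/* Both pointers now sit one byte past the end; copy descending. */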
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #8
#else
	mov	lr, r3, lsl #8
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #8
#else
	mov	r12, r3, lsl #8
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #24
#else
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemmove_bl4

.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #16
#else
	mov	lr, r3, lsl #16
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #16
#else
	mov	r12, r3, lsl #16
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #16
#else
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemmove_bl4

.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
	mov	lr, r3, lsr #24
#else
	mov	lr, r3, lsl #24
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #24
#else
	mov	r12, r3, lsl #24
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #8
#else
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemmove_bl4

#if !defined(_ARM_ARCH_5E)
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	/* Do not check arm_memcpy if we're running from flash */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq

.Lnormal:
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_srcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_l4
#else
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	pld	[r1]
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq
.Lnormal:
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrleb	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strleb	ip, [r3], #0x01
	ldrltb	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strltb	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
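	/*
	 * The loads and strd stores below are interleaved so each strd has
	 * its pair of words loaded several instructions ahead of it, and
	 * the pld instructions keep the prefetch running about 0x20 bytes
	 * in front of the current read pointer.
	 */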
1242	ldr	r4, [r1], #0x04		/* LD:00-03 */
1243	ldr	r5, [r1], #0x04		/* LD:04-07 */
1244	pld	[r1, #0x18]		/* Prefetch 0x20 */
1245	ldr	r6, [r1], #0x04		/* LD:08-0b */
1246	ldr	r7, [r1], #0x04		/* LD:0c-0f */
1247	ldr	r8, [r1], #0x04		/* LD:10-13 */
1248	ldr	r9, [r1], #0x04		/* LD:14-17 */
1249	strd	r4, [r3], #0x08		/* ST:00-07 */
1250	ldr	r4, [r1], #0x04		/* LD:18-1b */
1251	ldr	r5, [r1], #0x04		/* LD:1c-1f */
1252	strd	r6, [r3], #0x08		/* ST:08-0f */
1253	ldr	r6, [r1], #0x04		/* LD:20-23 */
1254	ldr	r7, [r1], #0x04		/* LD:24-27 */
1255	pld	[r1, #0x18]		/* Prefetch 0x40 */
1256	strd	r8, [r3], #0x08		/* ST:10-17 */
1257	ldr	r8, [r1], #0x04		/* LD:28-2b */
1258	ldr	r9, [r1], #0x04		/* LD:2c-2f */
1259	strd	r4, [r3], #0x08		/* ST:18-1f */
1260	ldr	r4, [r1], #0x04		/* LD:30-33 */
1261	ldr	r5, [r1], #0x04		/* LD:34-37 */
1262	strd	r6, [r3], #0x08		/* ST:20-27 */
1263	ldr	r6, [r1], #0x04		/* LD:38-3b */
1264	ldr	r7, [r1], #0x04		/* LD:3c-3f */
1265	strd	r8, [r3], #0x08		/* ST:28-2f */
1266	ldr	r8, [r1], #0x04		/* LD:40-43 */
1267	ldr	r9, [r1], #0x04		/* LD:44-47 */
1268	pld	[r1, #0x18]		/* Prefetch 0x60 */
1269	strd	r4, [r3], #0x08		/* ST:30-37 */
1270	ldr	r4, [r1], #0x04		/* LD:48-4b */
1271	ldr	r5, [r1], #0x04		/* LD:4c-4f */
1272	strd	r6, [r3], #0x08		/* ST:38-3f */
1273	ldr	r6, [r1], #0x04		/* LD:50-53 */
1274	ldr	r7, [r1], #0x04		/* LD:54-57 */
1275	strd	r8, [r3], #0x08		/* ST:40-47 */
1276	ldr	r8, [r1], #0x04		/* LD:58-5b */
1277	ldr	r9, [r1], #0x04		/* LD:5c-5f */
1278	strd	r4, [r3], #0x08		/* ST:48-4f */
1279	ldr	r4, [r1], #0x04		/* LD:60-63 */
1280	ldr	r5, [r1], #0x04		/* LD:64-67 */
1281	pld	[r1, #0x18]		/* Prefetch 0x80 */
1282	strd	r6, [r3], #0x08		/* ST:50-57 */
1283	ldr	r6, [r1], #0x04		/* LD:68-6b */
1284	ldr	r7, [r1], #0x04		/* LD:6c-6f */
1285	strd	r8, [r3], #0x08		/* ST:58-5f */
1286	ldr	r8, [r1], #0x04		/* LD:70-73 */
1287	ldr	r9, [r1], #0x04		/* LD:74-77 */
1288	strd	r4, [r3], #0x08		/* ST:60-67 */
1289	ldr	r4, [r1], #0x04		/* LD:78-7b */
1290	ldr	r5, [r1], #0x04		/* LD:7c-7f */
1291	strd	r6, [r3], #0x08		/* ST:68-6f */
1292	strd	r8, [r3], #0x08		/* ST:70-77 */
1293	subs	r2, r2, #0x80
1294	strd	r4, [r3], #0x08		/* ST:78-7f */
1295	bge	.Lmemcpy_w_loop128
1296
1297.Lmemcpy_w_lessthan128:
1298	adds	r2, r2, #0x80		/* Adjust for extra sub */
1299	ldmeqfd	sp!, {r4-r9}
1300	RETeq			/* Return now if done */
1301	subs	r2, r2, #0x20
1302	blt	.Lmemcpy_w_lessthan32
1303
1304	/* Copy 32 bytes at a time */
1305.Lmemcpy_w_loop32:
1306	ldr	r4, [r1], #0x04
1307	ldr	r5, [r1], #0x04
1308	pld	[r1, #0x18]
1309	ldr	r6, [r1], #0x04
1310	ldr	r7, [r1], #0x04
1311	ldr	r8, [r1], #0x04
1312	ldr	r9, [r1], #0x04
1313	strd	r4, [r3], #0x08
1314	ldr	r4, [r1], #0x04
1315	ldr	r5, [r1], #0x04
1316	strd	r6, [r3], #0x08
1317	strd	r8, [r3], #0x08
1318	subs	r2, r2, #0x20
1319	strd	r4, [r3], #0x08
1320	bge	.Lmemcpy_w_loop32
1321
1322.Lmemcpy_w_lessthan32:
1323	adds	r2, r2, #0x20		/* Adjust for extra sub */
1324	ldmeqfd	sp!, {r4-r9}
1325	RETeq			/* Return now if done */
1326
1327	and	r4, r2, #0x18
1328	rsbs	r4, r4, #0x18
1329	addne	pc, pc, r4, lsl #1
1330	nop
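	/*
	 * Computed jump: r4 = 0x18 - (r2 & 0x18), and pc reads as the
	 * address of the add plus 8, so advancing by r4 * 2 bytes skips one
	 * four-instruction (16-byte) copy block per 8 bytes already
	 * accounted for.  With 24 bytes left the rsbs sets Z, the addne is
	 * skipped, and all three blocks run.
	 */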

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	RETeq			/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 */
.Lmemcpy_bad_align:
	stmfd	sp!, {r4-r7}
	bic	r1, r1, #0x03
	cmp	ip, #2
	ldr	ip, [r1], #0x04
	bgt	.Lmemcpy_bad3
	beq	.Lmemcpy_bad2
	b	.Lmemcpy_bad1
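	/*
	 * As in the misaligned memmove cases, each output word below is
	 * assembled from two adjacent aligned source words with shift-and-
	 * orr pairs; bad1/bad2/bad3 handle source byte offsets of 1, 2
	 * and 3 respectively.
	 */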
1380
1381.Lmemcpy_bad1_loop16:
1382#ifdef __ARMEB__
1383	mov	r4, ip, lsl #8
1384#else
1385	mov	r4, ip, lsr #8
1386#endif
1387	ldr	r5, [r1], #0x04
1388	pld	[r1, #0x018]
1389	ldr	r6, [r1], #0x04
1390	ldr	r7, [r1], #0x04
1391	ldr	ip, [r1], #0x04
1392#ifdef __ARMEB__
1393	orr	r4, r4, r5, lsr #24
1394	mov	r5, r5, lsl #8
1395	orr	r5, r5, r6, lsr #24
1396	mov	r6, r6, lsl #8
1397	orr	r6, r6, r7, lsr #24
1398	mov	r7, r7, lsl #8
1399	orr	r7, r7, ip, lsr #24
1400#else
1401	orr	r4, r4, r5, lsl #24
1402	mov	r5, r5, lsr #8
1403	orr	r5, r5, r6, lsl #24
1404	mov	r6, r6, lsr #8
1405	orr	r6, r6, r7, lsl #24
1406	mov	r7, r7, lsr #8
1407	orr	r7, r7, ip, lsl #24
1408#endif
1409	str	r4, [r3], #0x04
1410	str	r5, [r3], #0x04
1411	str	r6, [r3], #0x04
1412	str	r7, [r3], #0x04
1413.Lmemcpy_bad1:
1414	subs	r2, r2, #0x10
1415	bge	.Lmemcpy_bad1_loop16
1416
1417	adds	r2, r2, #0x10
1418	ldmeqfd	sp!, {r4-r7}
1419	RETeq			/* Return now if done */
1420	subs	r2, r2, #0x04
1421	sublt	r1, r1, #0x03
1422	blt	.Lmemcpy_bad_done
1423
1424.Lmemcpy_bad1_loop4:
1425#ifdef __ARMEB__
1426	mov	r4, ip, lsl #8
1427#else
1428	mov	r4, ip, lsr #8
1429#endif
1430	ldr	ip, [r1], #0x04
1431	subs	r2, r2, #0x04
1432#ifdef __ARMEB__
1433	orr	r4, r4, ip, lsr #24
1434#else
1435	orr	r4, r4, ip, lsl #24
1436#endif
1437	str	r4, [r3], #0x04
1438	bge	.Lmemcpy_bad1_loop4
1439	sub	r1, r1, #0x03
1440	b	.Lmemcpy_bad_done
1441
1442.Lmemcpy_bad2_loop16:
1443#ifdef __ARMEB__
1444	mov	r4, ip, lsl #16
1445#else
1446	mov	r4, ip, lsr #16
1447#endif
1448	ldr	r5, [r1], #0x04
1449	pld	[r1, #0x018]
1450	ldr	r6, [r1], #0x04
1451	ldr	r7, [r1], #0x04
1452	ldr	ip, [r1], #0x04
1453#ifdef __ARMEB__
1454	orr	r4, r4, r5, lsr #16
1455	mov	r5, r5, lsl #16
1456	orr	r5, r5, r6, lsr #16
1457	mov	r6, r6, lsl #16
1458	orr	r6, r6, r7, lsr #16
1459	mov	r7, r7, lsl #16
1460	orr	r7, r7, ip, lsr #16
1461#else
1462	orr	r4, r4, r5, lsl #16
1463	mov	r5, r5, lsr #16
1464	orr	r5, r5, r6, lsl #16
1465	mov	r6, r6, lsr #16
1466	orr	r6, r6, r7, lsl #16
1467	mov	r7, r7, lsr #16
1468	orr	r7, r7, ip, lsl #16
1469#endif
1470	str	r4, [r3], #0x04
1471	str	r5, [r3], #0x04
1472	str	r6, [r3], #0x04
1473	str	r7, [r3], #0x04
1474.Lmemcpy_bad2:
1475	subs	r2, r2, #0x10
1476	bge	.Lmemcpy_bad2_loop16
1477
1478	adds	r2, r2, #0x10
1479	ldmeqfd	sp!, {r4-r7}
1480	RETeq			/* Return now if done */
1481	subs	r2, r2, #0x04
1482	sublt	r1, r1, #0x02
1483	blt	.Lmemcpy_bad_done
1484
1485.Lmemcpy_bad2_loop4:
1486#ifdef __ARMEB__
1487	mov	r4, ip, lsl #16
1488#else
1489	mov	r4, ip, lsr #16
1490#endif
1491	ldr	ip, [r1], #0x04
1492	subs	r2, r2, #0x04
1493#ifdef __ARMEB__
1494	orr	r4, r4, ip, lsr #16
1495#else
1496	orr	r4, r4, ip, lsl #16
1497#endif
1498	str	r4, [r3], #0x04
1499	bge	.Lmemcpy_bad2_loop4
1500	sub	r1, r1, #0x02
1501	b	.Lmemcpy_bad_done
1502
1503.Lmemcpy_bad3_loop16:
1504#ifdef __ARMEB__
1505	mov	r4, ip, lsl #24
1506#else
1507	mov	r4, ip, lsr #24
1508#endif
1509	ldr	r5, [r1], #0x04
1510	pld	[r1, #0x018]
1511	ldr	r6, [r1], #0x04
1512	ldr	r7, [r1], #0x04
1513	ldr	ip, [r1], #0x04
1514#ifdef __ARMEB__
1515	orr	r4, r4, r5, lsr #8
1516	mov	r5, r5, lsl #24
1517	orr	r5, r5, r6, lsr #8
1518	mov	r6, r6, lsl #24
1519	orr	r6, r6, r7, lsr #8
1520	mov	r7, r7, lsl #24
1521	orr	r7, r7, ip, lsr #8
1522#else
1523	orr	r4, r4, r5, lsl #8
1524	mov	r5, r5, lsr #24
1525	orr	r5, r5, r6, lsl #8
1526	mov	r6, r6, lsr #24
1527	orr	r6, r6, r7, lsl #8
1528	mov	r7, r7, lsr #24
1529	orr	r7, r7, ip, lsl #8
1530#endif
1531	str	r4, [r3], #0x04
1532	str	r5, [r3], #0x04
1533	str	r6, [r3], #0x04
1534	str	r7, [r3], #0x04
1535.Lmemcpy_bad3:
1536	subs	r2, r2, #0x10
1537	bge	.Lmemcpy_bad3_loop16
1538
1539	adds	r2, r2, #0x10
1540	ldmeqfd	sp!, {r4-r7}
1541	RETeq			/* Return now if done */
1542	subs	r2, r2, #0x04
1543	sublt	r1, r1, #0x01
1544	blt	.Lmemcpy_bad_done
1545
1546.Lmemcpy_bad3_loop4:
1547#ifdef __ARMEB__
1548	mov	r4, ip, lsl #24
1549#else
1550	mov	r4, ip, lsr #24
1551#endif
1552	ldr	ip, [r1], #0x04
1553	subs	r2, r2, #0x04
1554#ifdef __ARMEB__
1555	orr	r4, r4, ip, lsr #8
1556#else
1557	orr	r4, r4, ip, lsl #8
1558#endif
1559	str	r4, [r3], #0x04
1560	bge	.Lmemcpy_bad3_loop4
1561	sub	r1, r1, #0x01
1562
.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04
	RETeq
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
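/*
 * The dispatch below is a computed goto: reading pc yields the address
 * of the current instruction + 8, so "add pc, pc, r2, lsl #2" lands on
 * the branch-table entry for length r2.  The nop fills the slot that
 * the implicit +8 skips over; a length of zero hits the RET directly.
 */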
.Lmemcpy_short:
	add	pc, pc, r2, lsl #2
	nop
	RET			/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrneb	ip, [r1], #0x01
	bne	1b
	RET

/******************************************************************************
 * Special case for 4 byte copies
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
.Lmemcpy_4:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
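/*
 * Dispatch on the combined alignment: r2 = (dst & 3) << 2 | (src & 3).
 * The "sub r3, pc, #0x14" points r3 back at .Lmemcpy_4 itself (pc reads
 * as the instruction's address + 8), so each non-zero index jumps into
 * its own 64-byte (1 << LMEMCPY_4_LOG2) slot below, while index 0 falls
 * through to the fully aligned case.
 */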

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = .012 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD


/******************************************************************************
 * Special case for 6 byte copies
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
.Lmemcpy_6:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2
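/* Same alignment dispatch as .Lmemcpy_4 above. */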

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	mov	r3, r3, lsl #8		/* r3 = 234. */
	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
#endif
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r3, r3, lsl #8		/* r3 = 123. */
	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
#else
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
#endif
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	strb	r1, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .234 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsl #8		/* r3 = .01. */
	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
	strh	r3, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r3, r1, lsr #24
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
#ifdef __ARMEB__
	ldr	r2, [r1]		/* r2 = 0123 */
	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
	mov	r1, r2, lsr #16		/* r1 = ..01 */
	orr	r3, r3, r2, lsl #16	/* r3 = 2345 */
	strh	r1, [r0]
	str	r3, [r0, #0x02]
#else
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #8		/* r2 = .345 */
	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
#else
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
#endif
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = ..0. */
	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
#endif
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD


/******************************************************************************
 * Special case for 8 byte copies
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
.Lmemcpy_8:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
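/* Dispatch as in .Lmemcpy_4; each case again fits one 64-byte slot. */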

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
#endif
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
	mov	r2, r2, lsl #24		/* r2 = 4... */
	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r3, lsr #8		/* r1 = .012 */
	strb	r2, [r0, #0x07]
	mov	r3, r3, lsl #24		/* r3 = 3... */
	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
#else
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
#endif
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #8		/* ip = .01. */
	orr	ip, ip, r3, lsr #24	/* ip = .012 */
	strb	r1, [r0, #0x07]
	mov	r3, r3, lsl #8		/* r3 = 345. */
	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
#else
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
#endif
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
#ifdef __ARMEB__
	strh	r3, [r0, #0x01]
	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
#else
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
#endif
	str	r2, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, r3, lsr #16		/* r1 = ..45 */
	orr	r2, r1, r2, lsl #16	/* r2 = 2345 */
#else
	strh	r2, [r0]
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
#endif
	str	r2, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r1, r2, lsl #24		/* r1 = 2... */
	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
#else
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
#endif
	str	r1, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	ip, [r1, #0x02]
	ldrh	r3, [r1, #0x06]
	strh	r2, [r0]
	str	ip, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
	mov	r2, r2, lsr #24		/* r2 = ...1 */
	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
#else
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
#endif
	str	r3, [r0, #0x02]
	strh	r2, [r0]
	RET
	LMEMCPY_8_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
#ifdef __ARMEB__
	strb	r3, [r0, #0x07]
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
	str	r2, [r0, #0x01]
#endif
	RET
	LMEMCPY_8_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
#ifdef __ARMEB__
	strh	ip, [r0, #0x05]
	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
#else
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #24		/* ip = 1... */
	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
	strb	r1, [r0, #0x07]
	mov	r1, r1, lsr #8		/* r1 = ...6 */
	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
#else
	strb	r2, [r0]
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
#endif
	str	ip, [r0, #0x01]
	strh	r1, [r0, #0x05]
	RET
	LMEMCPY_8_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	ip, [r1, #0x01]
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	str	ip, [r0, #0x01]
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/******************************************************************************
 * Special case for 12 byte copies
 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
	LMEMCPY_C_PAD
.Lmemcpy_c:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
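/*
 * Dispatch as in .Lmemcpy_4, but with 128-byte (1 << LMEMCPY_C_LOG2)
 * slots: the 12-byte cases need more instructions than fit in 64 bytes.
 */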

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	ldr	r1, [r1, #0x08]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
#ifdef __ARMEB__
	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsr #24		/* r2 = ...7 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
	mov	r1, r1, lsl #8		/* r1 = 012. */
	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
#else
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x04]
	str	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #16		/* r3 = 45.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #24		/* r3 = 4... */
	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r2, lsl #24		/* r1 = 3... */
	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
	mov	r1, r3, lsl #24		/* r1 = 7... */
	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
#else
	strb	r2, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
#endif
	str	r2, [r0, #0x03]
	str	r1, [r0, #0x07]
	strb	ip, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	strb	r2, [r0]
	ldr	r2, [r1, #0x07]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, ror #8		/* r2 = 1..0 */
	strb	r2, [r0]
	mov	r2, r2, lsr #16		/* r2 = ..1. */
	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsl #8		/* r2 = 345. */
	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
	mov	r2, ip, lsl #8		/* r2 = 789. */
	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
#endif
	str	r3, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strb	r2, [r0]
#ifdef __ARMEB__
	mov	r2, r3, lsr #16		/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsl #16		/* r3 = 34.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
	mov	ip, ip, lsl #16		/* ip = 78.. */
	orr	ip, ip, r1, lsr #16	/* ip = 789A */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
#else
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
#endif
	str	r3, [r0, #0x03]
	str	ip, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, ip, lsl #16		/* r1 = 23.. */
	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
	mov	r3, r3, lsl #16		/* r3 = 67.. */
	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
#else
	strh	ip, [r0]
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
#endif
	str	r1, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r2, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	strh	ip, [r0]
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 2... */
	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
	mov	r3, r3, lsl #24		/* r3 = 6... */
	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
#endif
	str	r2, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	ldr	ip, [r1, #0x06]
	ldrh	r1, [r1, #0x0a]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	str	ip, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #24		/* r2 = ...9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
	mov	r1, r1, lsl #8		/* r1 = ..0. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
#else
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x06]
	str	r3, [r0, #0x02]
	strh	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
	mov	r2, ip, lsl #8		/* r2 = 567. */
	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
	str	r2, [r0, #0x05]
	mov	r2, r1, lsr #8		/* r2 = .89A */
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	str	r3, [r0, #0x01]
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	str	r3, [r0, #0x05]
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
#ifdef __ARMEB__
	strh	r3, [r0, #0x09]
	mov	r3, r3, lsr #16		/* r3 = ..78 */
	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
	mov	ip, ip, lsr #16		/* ip = ..34 */
	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
	mov	r1, r1, lsr #16		/* r1 = ..x0 */
#else
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
#endif
	str	r3, [r0, #0x05]
	str	ip, [r0, #0x01]
	strb	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
	ldr	ip, [r1, #0x06]		/* ip = 6789 */
	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
	ldrh	r1, [r1]		/* r1 = ..01 */
	strb	r2, [r0, #0x0b]
	mov	r2, r2, lsr #8		/* r2 = ...A */
	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
	mov	ip, ip, lsr #8		/* ip = .678 */
	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
	mov	r3, r3, lsr #8		/* r3 = .234 */
	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
	mov	r1, r1, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
#else
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r2, [r0, #0x01]
	str	r3, [r0, #0x05]
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldr	ip, [r1, #0x05]
	strb	r2, [r0]
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
#endif /* _ARM_ARCH_5E */

#ifdef GPROF
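/*
 * Placeholder entry points for the profiling build; presumably these
 * labels let the gprof machinery classify samples as user, trap or
 * interrupt time.
 */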

ENTRY(user)
	nop
ENTRY(btrap)
	nop
ENTRY(etrap)
	nop
ENTRY(bintr)
	nop
ENTRY(eintr)
	nop

#endif
2958