/*-
 * Copyright (c) 2004 Olivier Houchard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/asmacros.h>
__FBSDID("$FreeBSD: head/sys/arm/arm/support.S 175255 2008-01-12 21:11:43Z cognet $");

#include "assym.s"

.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)
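
/*
 * The .L_* literals above hold the addresses of optional platform hook
 * pointers (for example, a DMA-assisted copy engine) plus the minimum
 * lengths for which the hooks are worthwhile.  The entry points below
 * consult them roughly as in this C sketch (illustrative only; the
 * meaning of the trailing zero argument passed to the hook is an
 * assumption, not spelled out in this file):
 *
 *	if (_arm_memcpy != NULL && len >= _min_memcpy_size &&
 *	    _arm_memcpy(dst, src, len, 0) == 0)
 *		return dst;	-- hook handled the copy
 *	-- otherwise fall through to the software loops below
 */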
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
	ldr	r3, .L_arm_bzero
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal0
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2
	blt	.Lnormal0
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0
	mov	lr, pc
	mov	pc, r3
	cmp	r0, #0
	ldmfd	sp!, {r0, r1, lr}
	RETeq
.Lnormal0:
	mov	r3, #0x00
	b	do_memset

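/*
 * memset proper follows.  The word-fill path first replicates the byte
 * value across all four byte lanes; the two "orr ..., lsl" steps below
 * compute, in C terms:
 *
 *	v |= v << 8;	-- value in both halves of the low 16 bits
 *	v |= v << 16;	-- value in all four bytes
 *
 * On ARMv5E the destination is also aligned to 8 bytes first, so the
 * 64-bit strd stores used by the big loops stay naturally aligned.
 */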
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
#ifdef _ARM_ARCH_5E
	tst	ip, #0x04		/* Quad-align for armv5e */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
#ifdef _ARM_ARCH_5E
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
#endif
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

#ifdef _ARM_ARCH_5E
	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2
#else
	cmp	r1, #-2
#endif

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strgtb	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

ENTRY(bcmp)
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eornes	r3, ip, r1
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/* Word-align the addresses, if necessary */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop
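	/*
	 * Computed branch: each unrolled "compare up to N bytes" block
	 * below is exactly six instructions (24 bytes) and r3 was scaled
	 * by 3 * 8 above.  Since pc reads as the current instruction + 8,
	 * the addne lands on the block that compares just enough leading
	 * bytes to word-align both pointers.
	 */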

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
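	/*
	 * Illustrative C equivalent of the scheduled code below (a sketch
	 * only, not part of the original source):
	 *
	 *	int bcmp6(const unsigned char *a, const unsigned char *b) {
	 *		int i, d;
	 *		for (i = 0; i < 6; i++)
	 *			if ((d = a[i] - b[i]) != 0)
	 *				return d;
	 *		return 0;
	 *	}
	 *
	 * The loads for byte N+1 are issued before the subtract for byte N,
	 * so no subs stalls on the load immediately preceding it.
	 */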
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET

ENTRY(bcopy)
	/* switch the source and destination registers */
	eor     r0, r1, r0
	eor     r1, r0, r1
	eor     r0, r1, r0
ENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)
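	/*
	 * In C terms, the intent of the test above is (sketch only):
	 *
	 *	if (dst == src)
	 *		return dst;
	 *	delta = (dst > src) ? dst - src : src - dst;
	 *	if (delta >= len)
	 *		return memcpy(dst, src, len);	-- no overlap
	 *	-- otherwise fall through and pick a safe copy direction
	 */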

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
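	/*
	 * The loops below never load from a misaligned address.  They
	 * fetch aligned words and splice neighbouring words together with
	 * shifts.  C sketch of the byte-offset-1, little-endian case
	 * (names illustrative only):
	 *
	 *	w = *p++;		-- aligned word holding bytes 1..3
	 *	while (n >= 4) {
	 *		next = *p++;
	 *		*q++ = (w >> 8) | (next << 24);
	 *		w = next; n -= 4;
	 *	}
	 *
	 * The three entry points handle source offsets of 1, 2 and 3
	 * bytes; big-endian swaps the shift directions.
	 */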
.Lmemmove_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemmove_fl4

.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemmove_fl4

.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemmove_fl4

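	/*
	 * Backwards copy: taken when dst lies above src, so an overlapping
	 * region is copied from the high end downwards and every byte is
	 * read before it can be overwritten.
	 */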
.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #8
#else
	mov	lr, r3, lsl #8
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #8
#else
	mov	r12, r3, lsl #8
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #24
#else
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemmove_bl4

.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #16
#else
	mov	lr, r3, lsl #16
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #16
#else
	mov	r12, r3, lsl #16
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #16
#else
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemmove_bl4

.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
	mov	lr, r3, lsr #24
#else
	mov	lr, r3, lsl #24
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #24
#else
	mov	r12, r3, lsl #24
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #8
#else
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemmove_bl4

#if !defined(_ARM_ARCH_5E)
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	/* Do not check arm_memcpy if we're running from flash */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq

.Lnormal:
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_srcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_l4
#else
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	pld	[r1]
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq
.Lnormal:
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrleb	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strleb	ip, [r3], #0x01
	ldrltb	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strltb	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
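	/*
	 * The schedule below interleaves loads and stores: each strd
	 * writes a register pair (r4/r5, r6/r7 or r8/r9) loaded several
	 * instructions earlier, hiding load-use latency, while the pld
	 * hints keep the prefetcher about a cache line ahead of the loads.
	 */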
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128

.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, [r3], #0x08
	strd	r8, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */

	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop
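	/*
	 * Computed branch: r4 = (0x18 - (len & 0x18)) * 2.  Each 8-byte
	 * copy block below is four instructions (16 bytes), so the addne
	 * (pc reads as the current instruction + 8) skips straight to the
	 * block that copies the remaining multiple of 8 bytes.
	 */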

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	RETeq			/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 */
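/*
 * The fix-up loops below reuse the word-splicing shift technique
 * described for memmove's unaligned-source cases above.
 */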
.Lmemcpy_bad_align:
	stmfd	sp!, {r4-r7}
	bic	r1, r1, #0x03
	cmp	ip, #2
	ldr	ip, [r1], #0x04
	bgt	.Lmemcpy_bad3
	beq	.Lmemcpy_bad2
	b	.Lmemcpy_bad1

.Lmemcpy_bad1_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad1:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad1_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x03
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad1_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #24
#else
	orr	r4, r4, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad1_loop4
	sub	r1, r1, #0x03
	b	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad2:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad2_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02
	b	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad3:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad3_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01

.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04
	RETeq
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
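/*
 * Dispatch: the "add pc, pc, r2, lsl #2" below indexes the branch table
 * at entry r2.  pc reads as the current instruction + 8, which is the
 * RET slot for length 0, so each length 0..12 selects its own handler.
 */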
.Lmemcpy_short:
	add	pc, pc, r2, lsl #2
	nop
	RET			/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrneb	ip, [r1], #0x01
	bne	1b
	RET

/******************************************************************************
 * Special case for 4 byte copies
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
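/*
 * Each alignment case below is padded to 64 bytes.  The prologue forms
 * a 4-bit selector, in C terms roughly:
 *
 *	switch (((dst & 3) << 2) | (src & 3))
 *
 * "sub r3, pc, #0x14" recovers the address of .Lmemcpy_4 itself (pc
 * reads 8 bytes ahead and the label lies 0x14 bytes back), so case n
 * lives at .Lmemcpy_4 + n * 64; case 0 simply falls through the addne.
 */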
	LMEMCPY_4_PAD
.Lmemcpy_4:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD


/******************************************************************************
 * Special case for 6 byte copies
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
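/* The alignment dispatch below works exactly as in .Lmemcpy_4 above. */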
	LMEMCPY_6_PAD
.Lmemcpy_6:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
1937#ifdef __ARMEB__
1938	mov	r2, r2, lsl #24		/* r2 = 0... */
1939	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
1940	mov	r3, r3, lsl #8		/* r3 = 234. */
1941	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
1942#else
1943	mov	r2, r2, lsr #24		/* r2 = ...0 */
1944	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1945	mov	r1, r1, lsl #8		/* r1 = xx5. */
1946	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
1947#endif
1948	str	r2, [r0]
1949	strh	r1, [r0, #0x04]
1950	RET
1951	LMEMCPY_6_PAD
1952
1953/*
1954 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1955 */
1956	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
1957	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
1958	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1959	strh	r1, [r0, #0x01]
1960#ifdef __ARMEB__
1961	mov	r1, r3, lsr #24		/* r1 = ...0 */
1962	strb	r1, [r0]
1963	mov	r3, r3, lsl #8		/* r3 = 123. */
1964	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
1965#else
1966	strb	r3, [r0]
1967	mov	r3, r3, lsr #24		/* r3 = ...3 */
1968	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
1969	mov	r2, r2, lsr #8		/* r2 = ...5 */
1970#endif
1971	strh	r3, [r0, #0x03]
1972	strb	r2, [r0, #0x05]
1973	RET
1974	LMEMCPY_6_PAD
1975
1976/*
1977 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1978 */
1979	ldrb	r2, [r1]
1980	ldrh	r3, [r1, #0x01]
1981	ldrh	ip, [r1, #0x03]
1982	ldrb	r1, [r1, #0x05]
1983	strb	r2, [r0]
1984	strh	r3, [r0, #0x01]
1985	strh	ip, [r0, #0x03]
1986	strb	r1, [r0, #0x05]
1987	RET
1988	LMEMCPY_6_PAD
1989
1990/*
1991 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1992 */
1993	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1994	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
1995#ifdef __ARMEB__
1996	mov	r3, r2, lsr #8		/* r3 = ...0 */
1997	strb	r3, [r0]
1998	strb	r1, [r0, #0x05]
1999	mov	r3, r1, lsr #8		/* r3 = .234 */
2000	strh	r3, [r0, #0x03]
2001	mov	r3, r2, lsl #8		/* r3 = .01. */
2002	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
2003	strh	r3, [r0, #0x01]
2004#else
2005	strb	r2, [r0]
2006	mov	r3, r1, lsr #24
2007	strb	r3, [r0, #0x05]
2008	mov	r3, r1, lsr #8		/* r3 = .543 */
2009	strh	r3, [r0, #0x03]
2010	mov	r3, r2, lsr #8		/* r3 = ...1 */
2011	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
2012	strh	r3, [r0, #0x01]
2013#endif
2014	RET
2015	LMEMCPY_6_PAD
2016
2017/*
2018 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2019 */
2020	ldrb	r2, [r1]
2021	ldrh	r3, [r1, #0x01]
2022	ldrh	ip, [r1, #0x03]
2023	ldrb	r1, [r1, #0x05]
2024	strb	r2, [r0]
2025	strh	r3, [r0, #0x01]
2026	strh	ip, [r0, #0x03]
2027	strb	r1, [r0, #0x05]
2028	RET
2029	LMEMCPY_6_PAD
2030
2031/*
2032 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2033 */
2034#ifdef __ARMEB__
2035	ldr	r2, [r1]		/* r2 = 0123 */
2036	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
2037	mov	r1, r2, lsr #16		/* r1 = ..01 */
2038	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
2039	strh	r1, [r0]
2040	str	r3, [r0, #0x02]
2041#else
2042	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
2043	ldr	r3, [r1]		/* r3 = 3210 */
2044	mov	r2, r2, lsl #16		/* r2 = 54.. */
2045	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
2046	strh	r3, [r0]
2047	str	r2, [r0, #0x02]
2048#endif
2049	RET
2050	LMEMCPY_6_PAD
2051
2052/*
2053 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2054 */
2055	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
2056	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
2057	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
2058#ifdef __ARMEB__
2059	mov	r2, r2, lsr #8		/* r2 = .345 */
2060	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
2061#else
2062	mov	r2, r2, lsl #8		/* r2 = 543. */
2063	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
2064#endif
2065	strh	r1, [r0]
2066	str	r2, [r0, #0x02]
2067	RET
2068	LMEMCPY_6_PAD
2069
2070/*
2071 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2072 */
2073	ldrh	r2, [r1]
2074	ldr	r3, [r1, #0x02]
2075	strh	r2, [r0]
2076	str	r3, [r0, #0x02]
2077	RET
2078	LMEMCPY_6_PAD
2079
2080/*
2081 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2082 */
2083	ldrb	r3, [r1]		/* r3 = ...0 */
2084	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2085	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
2086#ifdef __ARMEB__
2087	mov	r3, r3, lsl #8		/* r3 = ..0. */
2088	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
2089	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
2090#else
2091	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
2092	mov	r1, r1, lsl #24		/* r1 = 5... */
2093	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
2094#endif
2095	strh	r3, [r0]
2096	str	r1, [r0, #0x02]
2097	RET
2098	LMEMCPY_6_PAD
2099
2100/*
2101 * 1100: dst is 8-bit aligned, src is 32-bit aligned
2102 */
2103	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2104	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
2105#ifdef __ARMEB__
2106	mov	r3, r2, lsr #24		/* r3 = ...0 */
2107	strb	r3, [r0]
2108	mov	r2, r2, lsl #8		/* r2 = 123. */
2109	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
2110#else
2111	strb	r2, [r0]
2112	mov	r2, r2, lsr #8		/* r2 = .321 */
2113	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
2114	mov	r1, r1, lsr #8		/* r1 = ...5 */
2115#endif
2116	str	r2, [r0, #0x01]
2117	strb	r1, [r0, #0x05]
2118	RET
2119	LMEMCPY_6_PAD
2120
2121/*
2122 * 1101: dst is 8-bit aligned, src is 8-bit aligned
2123 */
2124	ldrb	r2, [r1]
2125	ldrh	r3, [r1, #0x01]
2126	ldrh	ip, [r1, #0x03]
2127	ldrb	r1, [r1, #0x05]
2128	strb	r2, [r0]
2129	strh	r3, [r0, #0x01]
2130	strh	ip, [r0, #0x03]
2131	strb	r1, [r0, #0x05]
2132	RET
2133	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD


/******************************************************************************
 * Special case for 8 byte copies
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
.Lmemcpy_8:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
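/*
 * Editor's note (sketch, not original commentary): the five instructions
 * above index a table of 16 alignment handlers.  In rough C:
 *
 *	idx = ((dst & 3) << 2) | (src & 3);
 *	if (idx != 0)
 *		goto .Lmemcpy_8 + (idx << LMEMCPY_8_LOG2);
 *	// idx == 0 falls straight through to the 0000 handler
 *
 * "sub r3, pc, #0x14" recovers the address of .Lmemcpy_8 itself: the ARM
 * pc reads as the current instruction + 8, and the sub sits 12 bytes past
 * the label, so pc - 0x14 == .Lmemcpy_8.  Each handler below is padded to
 * 1 << LMEMCPY_8_LOG2 (64) bytes, so handler i begins at
 * .Lmemcpy_8 + 64 * i.
 */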

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
#endif
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
	mov	r2, r2, lsl #24		/* r2 = 4... */
	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r3, lsr #8		/* r1 = .012 */
	strb	r2, [r0, #0x07]
	mov	r3, r3, lsl #24		/* r3 = 3... */
	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
#else
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
#endif
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #8		/* ip = .01. */
	orr	ip, ip, r3, lsr #24	/* ip = .012 */
	strb	r1, [r0, #0x07]
	mov	r3, r3, lsl #8		/* r3 = 345. */
	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
#else
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
#endif
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
#ifdef __ARMEB__
	strh	r3, [r0, #0x01]
	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
#else
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
#endif
	str	r2, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, r3, lsr #16		/* r1 = ..45 */
	orr	r2, r1, r2, lsl #16	/* r2 = 2345 */
#else
	strh	r2, [r0]
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
#endif
	str	r2, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r1, r2, lsl #24		/* r1 = 2... */
	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
#else
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
#endif
	str	r1, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	ip, [r1, #0x02]
	ldrh	r3, [r1, #0x06]
	strh	r2, [r0]
	str	ip, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
	mov	r2, r2, lsr #24		/* r2 = ...1 */
	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
#else
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
#endif
	str	r3, [r0, #0x02]
	strh	r2, [r0]
	RET
	LMEMCPY_8_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
#ifdef __ARMEB__
	strb	r3, [r0, #0x07]
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
	str	r2, [r0, #0x01]
#endif
	RET
	LMEMCPY_8_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
#ifdef __ARMEB__
	strh	ip, [r0, #0x05]
	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
#else
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #24		/* ip = 1... */
	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
	strb	r1, [r0, #0x07]
	mov	r1, r1, lsr #8		/* r1 = ...6 */
	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
#else
	strb	r2, [r0]
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
#endif
	str	ip, [r0, #0x01]
	strh	r1, [r0, #0x05]
	RET
	LMEMCPY_8_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	ip, [r1, #0x01]
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	str	ip, [r0, #0x01]
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/******************************************************************************
 * Special case for 12 byte copies
 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
	LMEMCPY_C_PAD
.Lmemcpy_c:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
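/*
 * Editor's note: same jump-table dispatch as the 6- and 8-byte cases
 * above ("sub r3, pc, #0x14" again recovers the label address, here
 * .Lmemcpy_c), but each of the 16 alignment handlers below is padded to
 * 1 << LMEMCPY_C_LOG2 (128) bytes, since a 12-byte copy needs more
 * instructions per case.
 */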

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	ldr	r1, [r1, #0x08]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
#ifdef __ARMEB__
	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsr #24		/* r2 = ...7 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
	mov	r1, r1, lsl #8		/* r1 = 012. */
	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
#else
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x04]
	str	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #16		/* r3 = 45.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #24		/* r3 = 4... */
	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r2, lsl #24		/* r1 = 3... */
	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
	mov	r1, r3, lsl #24		/* r1 = 7... */
	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
#else
	strb	r2, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
#endif
	str	r2, [r0, #0x03]
	str	r1, [r0, #0x07]
	strb	ip, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	strb	r2, [r0]
	ldr	r2, [r1, #0x07]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, ror #8		/* r2 = 1..0 */
	strb	r2, [r0]
	mov	r2, r2, lsr #16		/* r2 = ..1. */
	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsl #8		/* r2 = 345. */
	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
	mov	r2, ip, lsl #8		/* r2 = 789. */
	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
#endif
	str	r3, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strb	r2, [r0]
#ifdef __ARMEB__
	mov	r2, r3, lsr #16		/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsl #16		/* r3 = 34.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
	mov	ip, ip, lsl #16		/* ip = 78.. */
	orr	ip, ip, r1, lsr #16	/* ip = 789A */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
#else
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
#endif
	str	r3, [r0, #0x03]
	str	ip, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, ip, lsl #16		/* r1 = 23.. */
	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
	mov	r3, r3, lsl #16		/* r3 = 67.. */
	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
#else
	strh	ip, [r0]
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
#endif
	str	r1, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r2, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	strh	ip, [r0]
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 2... */
	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
	mov	r3, r3, lsl #24		/* r3 = 6... */
	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
#endif
	str	r2, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	ldr	ip, [r1, #0x06]
	ldrh	r1, [r1, #0x0a]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	str	ip, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #24		/* r2 = ...9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
	mov	r1, r1, lsl #8		/* r1 = ..0. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
#else
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x06]
	str	r3, [r0, #0x02]
	strh	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
	mov	r2, ip, lsl #8		/* r2 = 567. */
	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
	str	r2, [r0, #0x05]
	mov	r2, r1, lsr #8		/* r2 = .89A */
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	str	r3, [r0, #0x01]
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	str	r3, [r0, #0x05]
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
#ifdef __ARMEB__
	strh	r3, [r0, #0x09]
	mov	r3, r3, lsr #16		/* r3 = ..78 */
	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
	mov	ip, ip, lsr #16		/* ip = ..34 */
	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
	mov	r1, r1, lsr #16		/* r1 = ..x0 */
#else
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
#endif
	str	r3, [r0, #0x05]
	str	ip, [r0, #0x01]
	strb	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
	ldr	ip, [r1, #0x06]		/* ip = 6789 */
	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
	ldrh	r1, [r1]		/* r1 = ..01 */
	strb	r2, [r0, #0x0b]
	mov	r2, r2, lsr #8		/* r2 = ...A */
	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
	mov	ip, ip, lsr #8		/* ip = .678 */
	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
	mov	r3, r3, lsr #8		/* r3 = .234 */
	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
	mov	r1, r1, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
#else
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r2, [r0, #0x01]
	str	r3, [r0, #0x05]
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldr	ip, [r1, #0x05]
	strb	r2, [r0]
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
#endif /* _ARM_ARCH_5E */

#ifdef GPROF

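/*
 * Editor's note: these nop-only entry points are, presumably, the
 * boundary symbols the kernel gprof machinery expects (time attributed
 * to user mode and to the trap/interrupt entry and exit regions); they
 * exist only so the labels resolve.
 */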
ENTRY(user)
	nop
ENTRY(btrap)
	nop
ENTRY(etrap)
	nop
ENTRY(bintr)
	nop
ENTRY(eintr)
	nop

#endif
