/*-
 * Copyright (c) 2004 Olivier Houchard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#include "assym.s"

.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
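/*
 * For reference, a rough (non-authoritative) C sketch of the bzero()
 * entry logic below: _arm_bzero and _min_bzero_size are the kernel
 * globals loaded through the literal pool above, and the third argument
 * is a flags word that this path passes as 0.
 *
 *	void bzero(void *dst, size_t len)
 *	{
 *		if (_arm_bzero && len >= _min_bzero_size &&
 *		    _arm_bzero(dst, len, 0) == 0)
 *			return;
 *		// otherwise fall into the shared do_memset path, r3 = 0
 *	}
 */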
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
	ldr	r3, .L_arm_bzero
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal0
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2
	blt	.Lnormal0
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0
	mov	lr, pc
	mov	pc, r3
	cmp	r0, #0
	ldmfd	sp!, {r0, r1, lr}
	RETeq
.Lnormal0:
	mov	r3, #0x00
	b	do_memset
EEND(bzero)
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
#ifdef _ARM_ARCH_5E
	tst	ip, #0x04		/* Quad-align for armv5e */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
#ifdef _ARM_ARCH_5E
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
#endif
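	/*
	 * Note: on _ARM_ARCH_5E the block above stores one extra word when
	 * needed so that ip becomes 8-byte aligned; the strd used in the
	 * loops below requires a doubleword-aligned address. Pre-v5E cores
	 * fall back to stm, which only needs word alignment.
	 */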
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

#ifdef _ARM_ARCH_5E
	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2
#else
	cmp	r1, #-2
#endif

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strgtb	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */
END(memset)

ENTRY(bcmp)
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eornes	r3, ip, r1
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/* Word-align the addresses, if necessary */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop
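	/*
	 * The computed branch above depends on each compare block below
	 * being exactly six instructions (24 bytes) long: r3 holds
	 * 3 * ((src - 5) & 3), and "r3, lsl #3" scales that to a multiple
	 * of 24, skipping however many byte compares are not needed to
	 * bring the addresses up to word alignment.
	 */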

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
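	/*
	 * Semantically the block below is just this C loop (a rough
	 * sketch, for reference), fully unrolled with the loads hoisted
	 * so that each result is ready by the time it is consumed:
	 *
	 *	for (i = 0; i < 6; i++)
	 *		if ((d = b1[i] - b2[i]) != 0)
	 *			return (d);
	 *	return (0);
	 */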
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET
END(bcmp)

ENTRY(bcopy)
	/* switch the source and destination registers */
	eor     r0, r1, r0
	eor     r1, r0, r1
	eor     r0, r1, r0
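	/*
	 * The three eors above are the classic XOR swap: they exchange
	 * r0 and r1 without a scratch register, turning bcopy's
	 * (src, dst, len) argument order into memmove's (dst, src, len).
	 */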
EENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)
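	/*
	 * Rough sketch of the intent here: identical pointers return at
	 * once, buffers that cannot interfere are delegated to the
	 * optimized memcpy(), and the remaining cases pick a direction
	 * that never reads a byte after it has been overwritten:
	 *
	 *	if (dst == src)
	 *		return (dst);
	 *	if (dst < src)
	 *		copy forwards;
	 *	else
	 *		copy backwards;
	 */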

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemmove_fl4

.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemmove_fl4

.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemmove_fl4

.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #8
#else
	mov	lr, r3, lsl #8
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #8
#else
	mov	r12, r3, lsl #8
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #24
#else
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemmove_bl4

.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #16
#else
	mov	lr, r3, lsl #16
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #16
#else
	mov	r12, r3, lsl #16
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #16
#else
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemmove_bl4

.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
	mov	lr, r3, lsr #24
#else
	mov	lr, r3, lsl #24
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #24
#else
	mov	r12, r3, lsl #24
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #8
#else
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemmove_bl4
EEND(memmove)
END(bcopy)

#if !defined(_ARM_ARCH_5E)
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	/* Do not check arm_memcpy if we're running from flash */
#if defined(FLASHADDR) && defined(PHYSADDR)
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq

.Lnormal:
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_srcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_l4
END(memcpy)

#else
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	pld	[r1]
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq
.Lnormal:
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrleb	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strleb	ip, [r3], #0x01
	ldrltb	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strltb	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128

.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, [r3], #0x08
	strd	r8, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */

	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	RETeq			/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 */
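/*
 * The loops that follow use the usual shift-and-merge technique: read
 * whole words from the word-aligned address below src, then combine
 * adjacent words. As a rough little-endian C sketch, with k the source
 * byte offset within a word (1, 2 or 3):
 *
 *	cur = *p++;
 *	while (more words to store) {
 *		next = *p++;
 *		*dst++ = (cur >> (8 * k)) | (next << (32 - 8 * k));
 *		cur = next;
 *	}
 */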
.Lmemcpy_bad_align:
	stmfd	sp!, {r4-r7}
	bic	r1, r1, #0x03
	cmp	ip, #2
	ldr	ip, [r1], #0x04
	bgt	.Lmemcpy_bad3
	beq	.Lmemcpy_bad2
	b	.Lmemcpy_bad1

.Lmemcpy_bad1_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad1:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad1_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x03
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad1_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #24
#else
	orr	r4, r4, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad1_loop4
	sub	r1, r1, #0x03
	b	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad2:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad2_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02
	b	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad3:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad3_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01

.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04
	RETeq
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
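/*
 * The "add pc, pc, r2, lsl #2" below is a jump table: pc reads as the
 * address of the first entry (the RET for length 0) and each entry is a
 * single 4-byte branch, so adding len * 4 dispatches on the length.
 * Roughly, in C:
 *
 *	switch (len) {
 *	case 0:			return (dst);
 *	case 4: case 6:
 *	case 8: case 12:	use the matching special case below;
 *	default:		copy byte by byte;
 *	}
 */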
.Lmemcpy_short:
	add	pc, pc, r2, lsl #2
	nop
	RET			/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrneb	ip, [r1], #0x01
	bne	1b
	RET

/******************************************************************************
 * Special case for 4 byte copies
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
.Lmemcpy_4:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
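/*
 * r2 now packs both alignments into one nibble: bits 3-2 are the low two
 * bits of dst, bits 1-0 the low two bits of src, matching the 0000-1111
 * labels below. Each case is padded to 1 << LMEMCPY_4_LOG2 bytes, so
 * "r2, lsl #LMEMCPY_4_LOG2" indexes it directly; "sub r3, pc, #0x14"
 * rewinds to the 64-byte-aligned base, so case N sits at base + N * 64
 * (case 0000 is reached by falling through). The 6- and 8-byte
 * dispatchers further down work the same way.
 */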

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD


/******************************************************************************
 * Special case for 6 byte copies
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
.Lmemcpy_6:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
1935#ifdef __ARMEB__
1936	mov	r2, r2, lsl #24		/* r2 = 0... */
1937	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
1938	mov	r3, r3, lsl #8		/* r3 = 234. */
1939	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
1940#else
1941	mov	r2, r2, lsr #24		/* r2 = ...0 */
1942	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1943	mov	r1, r1, lsl #8		/* r1 = xx5. */
1944	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
1945#endif
1946	str	r2, [r0]
1947	strh	r1, [r0, #0x04]
1948	RET
1949	LMEMCPY_6_PAD
1950
1951/*
1952 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1953 */
1954	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
1955	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
1956	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1957	strh	r1, [r0, #0x01]
1958#ifdef __ARMEB__
1959	mov	r1, r3, lsr #24		/* r1 = ...0 */
1960	strb	r1, [r0]
1961	mov	r3, r3, lsl #8		/* r3 = 123. */
1962	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
1963#else
1964	strb	r3, [r0]
1965	mov	r3, r3, lsr #24		/* r3 = ...3 */
1966	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
1967	mov	r2, r2, lsr #8		/* r2 = ...5 */
1968#endif
1969	strh	r3, [r0, #0x03]
1970	strb	r2, [r0, #0x05]
1971	RET
1972	LMEMCPY_6_PAD
1973
1974/*
1975 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1976 */
1977	ldrb	r2, [r1]
1978	ldrh	r3, [r1, #0x01]
1979	ldrh	ip, [r1, #0x03]
1980	ldrb	r1, [r1, #0x05]
1981	strb	r2, [r0]
1982	strh	r3, [r0, #0x01]
1983	strh	ip, [r0, #0x03]
1984	strb	r1, [r0, #0x05]
1985	RET
1986	LMEMCPY_6_PAD
1987
1988/*
1989 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1990 */
1991	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1992	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
1993#ifdef __ARMEB__
1994	mov	r3, r2, lsr #8		/* r3 = ...0 */
1995	strb	r3, [r0]
1996	strb	r1, [r0, #0x05]
1997	mov	r3, r1, lsr #8		/* r3 = .234 */
1998	strh	r3, [r0, #0x03]
1999	mov	r3, r2, lsl #8		/* r3 = .01. */
2000	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
2001	strh	r3, [r0, #0x01]
2002#else
2003	strb	r2, [r0]
2004	mov	r3, r1, lsr #24
2005	strb	r3, [r0, #0x05]
2006	mov	r3, r1, lsr #8		/* r3 = .543 */
2007	strh	r3, [r0, #0x03]
2008	mov	r3, r2, lsr #8		/* r3 = ...1 */
2009	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
2010	strh	r3, [r0, #0x01]
2011#endif
2012	RET
2013	LMEMCPY_6_PAD
2014
2015/*
2016 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2017 */
2018	ldrb	r2, [r1]
2019	ldrh	r3, [r1, #0x01]
2020	ldrh	ip, [r1, #0x03]
2021	ldrb	r1, [r1, #0x05]
2022	strb	r2, [r0]
2023	strh	r3, [r0, #0x01]
2024	strh	ip, [r0, #0x03]
2025	strb	r1, [r0, #0x05]
2026	RET
2027	LMEMCPY_6_PAD
2028
2029/*
2030 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2031 */
2032#ifdef __ARMEB__
2033	ldr	r2, [r1]		/* r2 = 0123 */
2034	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
2035	mov	r1, r2, lsr #16		/* r1 = ..01 */
2036	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
2037	strh	r1, [r0]
2038	str	r3, [r0, #0x02]
2039#else
2040	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
2041	ldr	r3, [r1]		/* r3 = 3210 */
2042	mov	r2, r2, lsl #16		/* r2 = 54.. */
2043	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
2044	strh	r3, [r0]
2045	str	r2, [r0, #0x02]
2046#endif
2047	RET
2048	LMEMCPY_6_PAD
2049
2050/*
2051 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2052 */
2053	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
2054	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
2055	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
2056#ifdef __ARMEB__
2057	mov	r2, r2, lsr #8		/* r2 = .345 */
2058	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
2059#else
2060	mov	r2, r2, lsl #8		/* r2 = 543. */
2061	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
2062#endif
2063	strh	r1, [r0]
2064	str	r2, [r0, #0x02]
2065	RET
2066	LMEMCPY_6_PAD
2067
2068/*
2069 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2070 */
2071	ldrh	r2, [r1]
2072	ldr	r3, [r1, #0x02]
2073	strh	r2, [r0]
2074	str	r3, [r0, #0x02]
2075	RET
2076	LMEMCPY_6_PAD
2077
2078/*
2079 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2080 */
2081	ldrb	r3, [r1]		/* r3 = ...0 */
2082	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2083	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
2084#ifdef __ARMEB__
2085	mov	r3, r3, lsl #8		/* r3 = ..0. */
2086	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
2087	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
2088#else
2089	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
2090	mov	r1, r1, lsl #24		/* r1 = 5... */
2091	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
2092#endif
2093	strh	r3, [r0]
2094	str	r1, [r0, #0x02]
2095	RET
2096	LMEMCPY_6_PAD
2097
/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/******************************************************************************
 * Special case for 8 byte copies
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
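/*
 * The entry code below computes a handler index from the low two
 * alignment bits of src (r1) and dst (r0); in C:
 *
 *	index = ((dst & 3) << 2) | (src & 3);
 *
 * Each of the 16 handlers is padded to 2^LMEMCPY_8_LOG2 (64) bytes,
 * so the branch target is simply .Lmemcpy_8 + (index << LMEMCPY_8_LOG2).
 * "sub r3, pc, #0x14" recovers the address of .Lmemcpy_8 itself (pc
 * reads as the current instruction + 8), and index 0000 falls through
 * into the first handler without branching.
 */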
	LMEMCPY_8_PAD
.Lmemcpy_8:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
#endif
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
	mov	r2, r2, lsl #24		/* r2 = 4... */
	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r3, lsr #8		/* r1 = .012 */
	strb	r2, [r0, #0x07]
	mov	r3, r3, lsl #24		/* r3 = 3... */
	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
#else
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
#endif
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #8		/* ip = .01. */
	orr	ip, ip, r3, lsr #24	/* ip = .012 */
	strb	r1, [r0, #0x07]
	mov	r3, r3, lsl #8		/* r3 = 345. */
	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
#else
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
#endif
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
#ifdef __ARMEB__
	strh	r3, [r0, #0x01]
	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
#else
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
#endif
	str	r2, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, r3, lsr #16		/* r1 = ..45 */
	orr	r2, r1, r2, lsl #16	/* r2 = 2345 */
#else
	strh	r2, [r0]
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
#endif
	str	r2, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r1, r2, lsl #24		/* r1 = 2... */
	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
#else
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
#endif
	str	r1, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	ip, [r1, #0x02]
	ldrh	r3, [r1, #0x06]
	strh	r2, [r0]
	str	ip, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
	mov	r2, r2, lsr #24		/* r2 = ...1 */
	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
#else
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
#endif
	str	r3, [r0, #0x02]
	strh	r2, [r0]
	RET
	LMEMCPY_8_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
#ifdef __ARMEB__
	strb	r3, [r0, #0x07]
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
	str	r2, [r0, #0x01]
#endif
	RET
	LMEMCPY_8_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
#ifdef __ARMEB__
	strh	ip, [r0, #0x05]
	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
#else
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #24		/* ip = 1... */
	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
	strb	r1, [r0, #0x07]
	mov	r1, r1, lsr #8		/* r1 = ...6 */
	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
#else
	strb	r2, [r0]
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
#endif
	str	ip, [r0, #0x01]
	strh	r1, [r0, #0x05]
	RET
	LMEMCPY_8_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	ip, [r1, #0x01]
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	str	ip, [r0, #0x01]
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/******************************************************************************
 * Special case for 12 byte copies
 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
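/*
 * Same alignment-dispatch trick as .Lmemcpy_8 above:
 * index = ((dst & 3) << 2) | (src & 3), with each of the 16
 * handlers padded to 2^LMEMCPY_C_LOG2 (128) bytes.
 */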
	LMEMCPY_C_PAD
.Lmemcpy_c:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	ldr	r1, [r1, #0x08]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
#ifdef __ARMEB__
	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsr #24		/* r2 = ...7 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
	mov	r1, r1, lsl #8		/* r1 = 012. */
	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
#else
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x04]
	str	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #16		/* r3 = 45.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #24		/* r3 = 4... */
	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r2, lsl #24		/* r1 = 3... */
	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
	mov	r1, r3, lsl #24		/* r1 = 7... */
	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
#else
	strb	r2, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
#endif
	str	r2, [r0, #0x03]
	str	r1, [r0, #0x07]
	strb	ip, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	strb	r2, [r0]
	ldr	r2, [r1, #0x07]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, ror #8		/* r2 = 1..0 */
	strb	r2, [r0]
	mov	r2, r2, lsr #16		/* r2 = ..1. */
	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsl #8		/* r2 = 345. */
	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
	mov	r2, ip, lsl #8		/* r2 = 789. */
	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
#endif
	str	r3, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strb	r2, [r0]
#ifdef __ARMEB__
	mov	r2, r3, lsr #16		/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsl #16		/* r3 = 34.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
	mov	ip, ip, lsl #16		/* ip = 78.. */
	orr	ip, ip, r1, lsr #16	/* ip = 789A */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
#else
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
#endif
	str	r3, [r0, #0x03]
	str	ip, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, ip, lsl #16		/* r1 = 23.. */
	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
	mov	r3, r3, lsl #16		/* r3 = 67.. */
	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
#else
	strh	ip, [r0]
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
#endif
	str	r1, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r2, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	strh	ip, [r0]
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 2... */
	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
	mov	r3, r3, lsl #24		/* r3 = 6... */
	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
#endif
	str	r2, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	ldr	ip, [r1, #0x06]
	ldrh	r1, [r1, #0x0a]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	str	ip, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #24		/* r2 = ...9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
	mov	r1, r1, lsl #8		/* r1 = ..0. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
#else
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x06]
	str	r3, [r0, #0x02]
	strh	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
	mov	r2, ip, lsl #8		/* r2 = 567. */
	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
	str	r2, [r0, #0x05]
	mov	r2, r1, lsr #8		/* r2 = .89A */
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	str	r3, [r0, #0x01]
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	str	r3, [r0, #0x05]
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
#ifdef __ARMEB__
	strh	r3, [r0, #0x09]
	mov	r3, r3, lsr #16		/* r3 = ..78 */
	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
	mov	ip, ip, lsr #16		/* ip = ..34 */
	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
	mov	r1, r1, lsr #16		/* r1 = ..x0 */
#else
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
#endif
	str	r3, [r0, #0x05]
	str	ip, [r0, #0x01]
	strb	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
	ldr	ip, [r1, #0x06]		/* ip = 6789 */
	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
	ldrh	r1, [r1]		/* r1 = ..01 */
	strb	r2, [r0, #0x0b]
	mov	r2, r2, lsr #8		/* r2 = ...A */
	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
	mov	ip, ip, lsr #8		/* ip = .678 */
	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
	mov	r3, r3, lsr #8		/* r3 = .234 */
	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
	mov	r1, r1, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
#else
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r2, [r0, #0x01]
	str	r3, [r0, #0x05]
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#endif
	RET
	LMEMCPY_C_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldr	ip, [r1, #0x05]
	strb	r2, [r0]
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
END(memcpy)
#endif /* _ARM_ARCH_5E */

#ifdef GPROF
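/*
 * Dummy entry points for the kernel profiler: the labels only need
 * to exist as distinct symbols so that profiling ticks can be
 * attributed to user, trap, and interrupt time.
 */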

ENTRY(user)
	nop
END(user)
ENTRY(btrap)
	nop
END(btrap)
ENTRY(etrap)
	nop
END(etrap)
ENTRY(bintr)
	nop
END(bintr)
ENTRY(eintr)
	nop
END(eintr)
#endif
