1/*-
2 * Copyright (c) 2004 Olivier Houchard
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26/*
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
29 *
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *      This product includes software developed for the NetBSD Project by
43 *      Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 *    or promote products derived from this software without specific prior
46 *    written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
59 */
60/*
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
63 *
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
66 *
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
69 * are met:
70 * 1. Redistributions of source code must retain the above copyright
71 *    notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 *    notice, this list of conditions and the following disclaimer in the
74 *    documentation and/or other materials provided with the distribution.
75 *
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
87 */
88
89#include <machine/asm.h>
90#include <machine/asmacros.h>
91__FBSDID("$FreeBSD$");
92
93#include "assym.s"
94
/*
 * Literal pool: addresses of the optional platform-accelerated block
 * copy/zero hooks and their minimum-size thresholds.  Each .word holds
 * the address of a kernel variable; the code below loads the variable
 * and treats a NULL function pointer as "no accelerated routine".
 */
.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)
/*
 * memset: Sets a block of memory to the specified value
 * bzero:  Sets a block of memory to zero (shares the tail of memset)
 *
 * On entry (memset):
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On entry (bzero):
 *   r0 - dest address
 *   r1 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
114/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
	ldr	r3, .L_arm_bzero	/* r3 = &_arm_bzero (hook pointer) */
	ldr	r3, [r3]
	cmp	r3, #0			/* accelerated bzero installed? */
	beq	.Lnormal0		/* no - use the generic path */
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2			/* r1 = len; below hook threshold? */
	blt	.Lnormal0		/* too small to be worth the call */
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0			/* third (flags) argument = 0 */
	mov	lr, pc
	mov	pc, r3			/* call _arm_bzero(buf, len, 0) */
	cmp	r0, #0			/* hook returns 0 on success */
	ldmfd	sp!, {r0, r1, lr}	/* restore args; ldm keeps flags */
	RETeq			/* done if the hook handled it */
.Lnormal0:
	mov	r3, #0x00		/* fill byte = 0, fall into memset */
	b	do_memset

/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2			/* r1 = count, as on the bzero path */
do_memset:
	/* Common tail: r0 = dest (preserved), r1 = count, r3 = fill byte */
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0			/* ip = write cursor; r0 is returned */
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
#ifdef _ARM_ARCH_5E
	tst	ip, #0x04		/* Quad-align for armv5e */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
#ifdef _ARM_ARCH_5E
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
#endif
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80		/* flags drive the ge stores below */
#ifdef _ARM_ARCH_5E
	/* strd stores the r2/r3 pair: 8 bytes per instruction on v5E */
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	/* stm of {r2,r3}: 8 bytes per instruction */
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

#ifdef _ARM_ARCH_5E
	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2
#else
	cmp	r1, #-2			/* r1 is count-4 here: -3..-1 left */
#endif

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	/* r2 = ip & 3; store 4 - r2 bytes to reach word alignment */
	rsb	r2, r2, #0x004
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2		/* account for alignment bytes */
	strgtb	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */
END(bzero)
END(memset)
282
/*
 * bcmp(b1, b2, len)
 * In:  r0 = b1, r1 = b2, r2 = len
 * Out: r0 = 0 if the buffers match, otherwise the (memcmp-style)
 *      difference of the first mismatching bytes.
 * Clobbers r2, r3, ip.
 */
ENTRY(bcmp)
	mov	ip, r0			/* ip = b1 cursor; r0 holds result */
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes		/* dedicated 6-byte fast path */
	mov	r0, #0x00		/* default result: equal */

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00		/* Z set if len == 0 ... */
	eornes	r3, ip, r1		/* ... else Z set if b1 == b2 */
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03		/* alignments differ mod 4? */
	subne	r2, r2, #0x01		/* pre-bias len for bytewise loop */
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/*
	 * Word-align the addresses, if necessary.
	 * r3 = ((b2 - 5) & 3) * 3; the addne skips r3 * 24 bytes, i.e.
	 * whole 6-instruction "compare one byte" chunks below, so exactly
	 * (b2 & 3 ? 4 - (b2 & 3) : 0) leading bytes get compared.
	 * NB: the chunk size (24 bytes) is hard-wired into the shift.
	 */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise	/* fewer than 4 bytes remain */
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04		/* C clear once we run past the end */
	cmpcs	r0, r3			/* only compare while len remains */
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03		/* undo bias; loop below wants len-1 */
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01		/* C clear when count exhausted */
	cmpcs	r0, r3			/* compare only while in range */
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET
END(bcmp)
392
/*
 * bcopy(src, dst, len): historical interface with (src, dst) argument
 * order; swaps r0/r1 with the xor trick and falls into memmove.
 */
ENTRY(bcopy)
	/* switch the source and destination registers */
	eor     r0, r1, r0
	eor     r1, r0, r1
	eor     r0, r1, r0
/*
 * memmove(dst, src, len)
 * In:  r0 = dst, r1 = src, r2 = len.  Out: r0 = dst.
 * Handles overlap by choosing copy direction; delegates to memcpy
 * when the buffers do not overlap.
 */
ENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards	/* src < dst: copy descending */

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10		/* at least 16 bytes left? */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14		/* undo bias; ge if >= 12 left */

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* exactly one word remains */
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}		/* two words remain */
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes to alignment */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less the 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/*
	 * erg - unaligned source.
	 * This is where it gets nasty: read aligned words and shift the
	 * bytes into place.  r12 = src & 3 selects one of three cases;
	 * lr always carries the partial word from the previous read.
	 */
.Lmemmove_fsrcul:
	bic	r1, r1, #3		/* round src down to a word */
	ldr	lr, [r1], #4		/* prime lr with the first word */
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* src & 3 == 1: shift by 8/24 bits, 16 bytes per iteration */
.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

	/* same shift, one word at a time */
.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3		/* back src up to the true byte pos */
	b	.Lmemmove_fl4

	/* src & 3 == 2: shift by 16/16 bits */
.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2		/* back src up to the true byte pos */
	b	.Lmemmove_fl4

	/* src & 3 == 3: shift by 24/8 bits */
.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1		/* back src up to the true byte pos */
	b	.Lmemmove_fl4

	/*
	 * Backwards (descending) copy for the src < dst overlap case.
	 * Cursors start one past the end and pre-decrement.
	 */
.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10		/* at least 16 bytes left? */
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!		/* exactly one word remains */
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}		/* two words remain */
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2		/* r12 = dst & 3 = bytes to copy */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/*
	 * erg - unaligned source.
	 * This is where it gets nasty: mirror image of the forward
	 * unaligned-source cases, reading descending aligned words;
	 * r3 carries the partial word from the previous read.
	 */
.Lmemmove_bsrcul:
	bic	r1, r1, #3		/* round src down to a word */
	ldr	r3, [r1, #0]		/* prime r3 with the first word */
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* src & 3 == 3: shift by 8/24 bits, 16 bytes per iteration */
.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #8
#else
	mov	lr, r3, lsl #8
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #8
#else
	mov	r12, r3, lsl #8
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #24
#else
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3		/* restore src to the true byte pos */
	b	.Lmemmove_bl4

	/* src & 3 == 2: shift by 16/16 bits */
.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #16
#else
	mov	lr, r3, lsl #16
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #16
#else
	mov	r12, r3, lsl #16
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #16
#else
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2		/* restore src to the true byte pos */
	b	.Lmemmove_bl4

	/* src & 3 == 1: shift by 24/8 bits */
.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
	mov	lr, r3, lsr #24
#else
	mov	lr, r3, lsl #24
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #24
#else
	mov	r12, r3, lsl #24
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #8
#else
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1		/* restore src to the true byte pos */
	b	.Lmemmove_bl4
END(bcopy)
END(memmove)
937
938#if !defined(_ARM_ARCH_5E)
/*
 * memcpy(dst, src, len) - generic (pre-ARMv5E) version.
 * In:  r0 = dst, r1 = src, r2 = len.  Out: r0 = dst.
 * May dispatch to the platform _arm_memcpy hook for large copies,
 * unless executing from flash (hook code may live in RAM being copied).
 * Buffers must not overlap (memmove handles that case).
 */
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	/* Do not check arm_memcpy if we're running from flash */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal		/* pc >= FLASHADDR: in flash */
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal		/* pc < FLASHADDR: in flash */
#endif
#endif
	ldr	r3, .L_arm_memcpy	/* accelerated memcpy installed? */
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3			/* len below hook threshold? */
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0			/* flags argument = 0 */
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]		/* call _arm_memcpy(dst,src,len,0) */
	cmp	r0, #0			/* hook returns 0 on success */
	ldmfd	sp!, {r0-r2, r4, lr}	/* restore; flags survive ldm */
	RETeq

.Lnormal:
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10		/* at least 16 bytes left? */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14		/* undo bias; ge if >= 12 left */

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* exactly one word remains */
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}		/* two words remain */
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26_
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4		/* r12 = bytes to alignment */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less the 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/*
	 * erg - unaligned source.
	 * This is where it gets nasty: read aligned words and shift the
	 * bytes into place.  r12 = src & 3 selects one of three cases;
	 * lr carries the partial word from the previous read.
	 * NB: this copy of the code is little-endian only (no __ARMEB__
	 * variants, unlike the memmove版 above) -- presumably this
	 * !_ARM_ARCH_5E configuration is LE-only; confirm before reuse.
	 */
.Lmemcpy_srcul:
	bic	r1, r1, #3		/* round src down to a word */
	ldr	lr, [r1], #4		/* prime lr with the first word */
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* src & 3 == 1: shift by 8/24 bits, 16 bytes per iteration */
.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3		/* back src up to the true byte pos */
	b	.Lmemcpy_l4

	/* src & 3 == 2: shift by 16/16 bits */
.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2		/* back src up to the true byte pos */
	b	.Lmemcpy_l4

	/* src & 3 == 3: shift by 24/8 bits */
.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1		/* back src up to the true byte pos */
	b	.Lmemcpy_l4
END(memcpy)
1173
1174#else
1175/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	/*
	 * void *memcpy(void *dst /* r0 */, const void *src /* r1 */,
	 *		size_t len /* r2 */)
	 * Returns dst in r0.  Regions must not overlap.
	 */
	pld	[r1]			/* Start prefetching the source */
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
#ifdef FLASHADDR
	/* If executing from flash, skip the (RAM-resident) DMA hook below */
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	/*
	 * If a platform copy hook (_arm_memcpy, e.g. a DMA engine) is
	 * installed and len >= _min_memcpy_size, try it first.
	 */
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0			/* Hook installed? */
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3			/* Big enough to be worth it? */
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0			/* 4th hook arg; presumably a flags word — confirm against _arm_memcpy */
	ldr	r4, .L_arm_memcpy
	mov	lr, pc			/* Pre-ARMv5 indirect call: lr = return address, */
	ldr	pc, [r4]		/* then jump through the hook pointer */
	cmp	r0, #0			/* Hook returns 0 on success */
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq			/* Done if the hook handled the copy */
.Lnormal:
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer (copy 1-3 leading bytes) */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02		/* Sets le/lt for the conditional copies below */
	ldrb	ip, [r1], #0x01		/* 1st byte: always needed */
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrleb	ip, [r1], #0x01		/* 2nd byte: only if dst&3 <= 2 */
	suble	r2, r2, #0x01
	strleb	ip, [r3], #0x01
	ldrltb	ip, [r1], #0x01		/* 3rd byte: only if dst&3 == 1 */
	sublt	r2, r2, #0x01
	strltb	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer (strd below needs 8-byte alignment) */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80		/* Bias count by 128 for the big loop */
	blt	.Lmemcpy_w_lessthan128
1239
1240	/* Copy 128 bytes at a time */
	/*
	 * Copy 128 bytes at a time.  Loads and stores are software-pipelined
	 * (stores interleaved two loads behind) and prefetches are issued one
	 * cache line ahead; do not reorder.  strd stores the {r4,r5}, {r6,r7}
	 * and {r8,r9} pairs and relies on dst (r3) being 8-byte aligned.
	 * r2 = bytes remaining minus 128.
	 */
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128
1296
/*
 * Fewer than 128 bytes remain (dst still 8-byte aligned, src word
 * aligned).  Drain in 32-byte chunks, then 8-byte chunks via a
 * computed jump, then 0-7 trailing bytes.
 */
.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, [r3], #0x08
	strd	r8, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */

	/*
	 * Computed jump over the 8-byte copy chunks below.  Each chunk is
	 * four instructions (16 bytes); r4 = 0x18 - (r2 & 0x18) is the
	 * number of 8-byte chunks to SKIP, scaled x2 (lsl #1) to convert
	 * 8-byte data units into 16-byte code chunks.  pc reads as the
	 * address of the addne plus 8, i.e. the first chunk.  Layout of
	 * the three chunks must not change.
	 */
	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04		/* One last whole word if >= 4 left */
	strge	ip, [r3], #0x04
	RETeq			/* Return now if done */
	addlt	r2, r2, #0x04		/* r2 = 1..3 trailing bytes */
	ldrb	ip, [r1], #0x01		/* 1st byte: always present here */
	cmp	r2, #0x02		/* ge: >= 2 bytes, gt: 3 bytes */
	ldrgeb	r2, [r1], #0x01		/* 2nd byte (count no longer needed) */
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]		/* 3rd byte */
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET
1366
1367
/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 * Round src down to a word boundary, pre-load the first word into ip,
 * and dispatch on the original misalignment (ip = src & 3, set above).
 */
.Lmemcpy_bad_align:
	stmfd	sp!, {r4-r7}
	bic	r1, r1, #0x03		/* Word-align src downwards */
	cmp	ip, #2
	ldr	ip, [r1], #0x04		/* Prime ip with the first (partial) word */
	bgt	.Lmemcpy_bad3		/* src & 3 == 3 */
	beq	.Lmemcpy_bad2		/* src & 3 == 2 */
	b	.Lmemcpy_bad1		/* src & 3 == 1 */
1380
/*
 * src & 3 == 1: each output word takes 3 bytes from the previously
 * fetched word (ip) and 1 byte from the next.  The __ARMEB__ variants
 * mirror the shift directions for big-endian byte order.
 */
.Lmemcpy_bad1_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04		/* ip carries 3 bytes to the next pass */
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad1:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad1_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x03		/* Point src back at the unconsumed byte */
	blt	.Lmemcpy_bad_done

/* Same realignment, one word at a time */
.Lmemcpy_bad1_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #24
#else
	orr	r4, r4, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad1_loop4
	sub	r1, r1, #0x03		/* Point src back at the unconsumed byte */
	b	.Lmemcpy_bad_done
1441
/*
 * src & 3 == 2: each output word takes 2 bytes from the previously
 * fetched word (ip) and 2 from the next.  Structure mirrors
 * .Lmemcpy_bad1 with 16-bit shifts.
 */
.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04		/* ip carries 2 bytes to the next pass */
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad2:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad2_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02		/* Point src back at the unconsumed bytes */
	blt	.Lmemcpy_bad_done

/* Same realignment, one word at a time */
.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02		/* Point src back at the unconsumed bytes */
	b	.Lmemcpy_bad_done
1502
/*
 * src & 3 == 3: each output word takes 1 byte from the previously
 * fetched word (ip) and 3 from the next.  Structure mirrors
 * .Lmemcpy_bad1 with 24/8-bit shifts.
 */
.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04		/* ip carries 1 byte to the next pass */
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad3:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad3_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01		/* Point src back at the unconsumed bytes */
	blt	.Lmemcpy_bad_done

/* Same realignment, one word at a time */
.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01		/* Point src back at the unconsumed bytes */
1562
/*
 * Common tail for the misaligned-source paths: restore the scratch
 * registers and byte-copy the final 0-3 bytes.  r2 holds the trailing
 * count minus 4 on entry.
 */
.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04		/* r2 = 0..3 bytes remaining */
	RETeq
	ldrb	ip, [r1], #0x01		/* 1st byte: always present here */
	cmp	r2, #0x02		/* ge: >= 2 bytes, gt: 3 bytes */
	ldrgeb	r2, [r1], #0x01		/* 2nd byte (count no longer needed) */
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]		/* 3rd byte */
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET
1575
1576
/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 *
 * Dispatch: pc reads as the address of the add plus 8, i.e. the RET
 * below, so entry N of the branch table is reached for len == N.
 * Lengths 4, 6, 8 and 12 get dedicated routines; the rest go bytewise.
 * The table layout (one 4-byte slot per length) must not change.
 */
.Lmemcpy_short:
	add	pc, pc, r2, lsl #2
	nop
	RET			/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
/* Fallback: simple byte-at-a-time copy (r2 >= 1 on entry) */
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrneb	ip, [r1], #0x01
	bne	1b
	RET
1606
/******************************************************************************
 * Special case for 4 byte copies
 *
 * Each of the 16 alignment cases below is padded to exactly 64 bytes
 * (LMEMCPY_4_PAD) and selected by a computed jump: r2 is built as
 * ((dst & 3) << 2) | (src & 3), and r3 = pc - 0x14 resolves to the
 * address of .Lmemcpy_4 itself (pc reads as current instruction + 8).
 * Case 0 falls through when r2 == 0.  Do not change any case's size.
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
.Lmemcpy_4:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_4 */
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 * (word loads below word-round the address; the x bytes are discarded)
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD
1869
1870
/******************************************************************************
 * Special case for 6 byte copies
 *
 * Same 16-way, 64-byte-per-case dispatch scheme as .Lmemcpy_4:
 * r2 = ((dst & 3) << 2) | (src & 3); r3 = pc - 0x14 = .Lmemcpy_6
 * (pc reads as current instruction + 8).  Case sizes must not change.
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
.Lmemcpy_6:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_6 */
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	mov	r3, r3, lsl #8		/* r3 = 234. */
	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
#endif
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r3, r3, lsl #8		/* r3 = 123. */
	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
#else
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
#endif
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	strb	r1, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .234 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsl #8		/* r3 = .01. */
	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
	strh	r3, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r3, r1, lsr #24
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
#ifdef __ARMEB__
	ldr	r2, [r1]		/* r2 = 0123 */
	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
	mov	r1, r2, lsr #16		/* r1 = ..01 */
	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
	strh	r1, [r0]
	str	r3, [r0, #0x02]
#else
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #8		/* r2 = .345 */
	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
#else
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
#endif
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = ..0. */
	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
#endif
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 * (interior word load/store at +1 is fine: only byte alignment known)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD
2167
2168
2169/******************************************************************************
2170 * Special case for 8 byte copies
2171 */
2172#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
2173#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
2174	LMEMCPY_8_PAD
2175.Lmemcpy_8:
2176	and	r2, r1, #0x03
2177	orr	r2, r2, r0, lsl #2
2178	ands	r2, r2, #0x0f
2179	sub	r3, pc, #0x14
2180	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
2181
2182/*
2183 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2184 */
2185	ldr	r2, [r1]
2186	ldr	r3, [r1, #0x04]
2187	str	r2, [r0]
2188	str	r3, [r0, #0x04]
2189	RET
2190	LMEMCPY_8_PAD
2191
2192/*
2193 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2194 */
2195	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
2196	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
2197	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2198#ifdef __ARMEB__
2199	mov	r3, r3, lsl #8		/* r3 = 012. */
2200	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
2201	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
2202#else
2203	mov	r3, r3, lsr #8		/* r3 = .210 */
2204	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
2205	mov	r1, r1, lsl #24		/* r1 = 7... */
2206	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
2207#endif
2208	str	r3, [r0]
2209	str	r2, [r0, #0x04]
2210	RET
2211	LMEMCPY_8_PAD
2212
2213/*
2214 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2215 */
2216	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2217	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2218	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2219#ifdef __ARMEB__
2220	mov	r2, r2, lsl #16		/* r2 = 01.. */
2221	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2222	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
2223#else
2224	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2225	mov	r3, r3, lsr #16		/* r3 = ..54 */
2226	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
2227#endif
2228	str	r2, [r0]
2229	str	r3, [r0, #0x04]
2230	RET
2231	LMEMCPY_8_PAD
2232
2233/*
2234 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2235 */
2236	ldrb	r3, [r1]		/* r3 = ...0 */
2237	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2238	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
2239#ifdef __ARMEB__
2240	mov	r3, r3, lsl #24		/* r3 = 0... */
2241	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
2242	mov	r2, r2, lsl #24		/* r2 = 4... */
2243	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
2244#else
2245	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
2246	mov	r2, r2, lsr #24		/* r2 = ...4 */
2247	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
2248#endif
2249	str	r3, [r0]
2250	str	r2, [r0, #0x04]
2251	RET
2252	LMEMCPY_8_PAD
2253
2254/*
2255 * 0100: dst is 8-bit aligned, src is 32-bit aligned
2256 */
2257	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
2258	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
2259#ifdef __ARMEB__
2260	mov	r1, r3, lsr #24		/* r1 = ...0 */
2261	strb	r1, [r0]
2262	mov	r1, r3, lsr #8		/* r1 = .012 */
2263	strb	r2, [r0, #0x07]
2264	mov	r3, r3, lsl #24		/* r3 = 3... */
2265	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
2266#else
2267	strb	r3, [r0]
2268	mov	r1, r2, lsr #24		/* r1 = ...7 */
2269	strb	r1, [r0, #0x07]
2270	mov	r1, r3, lsr #8		/* r1 = .321 */
2271	mov	r3, r3, lsr #24		/* r3 = ...3 */
2272	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
2273#endif
2274	strh	r1, [r0, #0x01]
2275	str	r3, [r0, #0x03]
2276	RET
2277	LMEMCPY_8_PAD
2278
2279/*
2280 * 0101: dst is 8-bit aligned, src is 8-bit aligned
2281 */
2282	ldrb	r2, [r1]
2283	ldrh	r3, [r1, #0x01]
2284	ldr	ip, [r1, #0x03]
2285	ldrb	r1, [r1, #0x07]
2286	strb	r2, [r0]
2287	strh	r3, [r0, #0x01]
2288	str	ip, [r0, #0x03]
2289	strb	r1, [r0, #0x07]
2290	RET
2291	LMEMCPY_8_PAD
2292
2293/*
2294 * 0110: dst is 8-bit aligned, src is 16-bit aligned
2295 */
2296	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2297	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2298	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2299#ifdef __ARMEB__
2300	mov	ip, r2, lsr #8		/* ip = ...0 */
2301	strb	ip, [r0]
2302	mov	ip, r2, lsl #8		/* ip = .01. */
2303	orr	ip, ip, r3, lsr #24	/* ip = .012 */
2304	strb	r1, [r0, #0x07]
2305	mov	r3, r3, lsl #8		/* r3 = 345. */
2306	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
2307#else
2308	strb	r2, [r0]		/* 0 */
2309	mov	ip, r1, lsr #8		/* ip = ...7 */
2310	strb	ip, [r0, #0x07]		/* 7 */
2311	mov	ip, r2, lsr #8		/* ip = ...1 */
2312	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2313	mov	r3, r3, lsr #8		/* r3 = .543 */
2314	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
2315#endif
2316	strh	ip, [r0, #0x01]
2317	str	r3, [r0, #0x03]
2318	RET
2319	LMEMCPY_8_PAD
2320
2321/*
2322 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2323 */
2324	ldrb	r3, [r1]		/* r3 = ...0 */
2325	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2326	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
2327	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2328	strb	r3, [r0]
2329	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
2330#ifdef __ARMEB__
2331	strh	r3, [r0, #0x01]
2332	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
2333#else
2334	strh	ip, [r0, #0x01]
2335	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
2336#endif
2337	str	r2, [r0, #0x03]
2338	strb	r1, [r0, #0x07]
2339	RET
2340	LMEMCPY_8_PAD
2341
2342/*
2343 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2344 */
2345	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2346	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2347	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2348#ifdef __ARMEB__
2349	strh	r1, [r0]
2350	mov	r1, r3, lsr #16		/* r1 = ..45 */
2351	orr	r2, r1 ,r2, lsl #16	/* r2 = 2345 */
2352#else
2353	strh	r2, [r0]
2354	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
2355	mov	r3, r3, lsr #16		/* r3 = ..76 */
2356#endif
2357	str	r2, [r0, #0x02]
2358	strh	r3, [r0, #0x06]
2359	RET
2360	LMEMCPY_8_PAD
2361
2362/*
2363 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2364 */
2365	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2366	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2367	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
2368	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
2369	strh	r1, [r0]
2370#ifdef __ARMEB__
2371	mov	r1, r2, lsl #24		/* r1 = 2... */
2372	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
2373	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
2374#else
2375	mov	r1, r2, lsr #24		/* r1 = ...2 */
2376	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
2377	mov	r3, r3, lsr #24		/* r3 = ...6 */
2378	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
2379#endif
2380	str	r1, [r0, #0x02]
2381	strh	r3, [r0, #0x06]
2382	RET
2383	LMEMCPY_8_PAD
2384
2385/*
2386 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2387 */
2388	ldrh	r2, [r1]
2389	ldr	ip, [r1, #0x02]
2390	ldrh	r3, [r1, #0x06]
2391	strh	r2, [r0]
2392	str	ip, [r0, #0x02]
2393	strh	r3, [r0, #0x06]
2394	RET
2395	LMEMCPY_8_PAD
2396
2397/*
2398 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2399 */
2400	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
2401	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2402	ldrb	ip, [r1]		/* ip = ...0 */
2403	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
2404	strh	r1, [r0, #0x06]
2405#ifdef __ARMEB__
2406	mov	r3, r3, lsr #24		/* r3 = ...5 */
2407	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
2408	mov	r2, r2, lsr #24		/* r2 = ...1 */
2409	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
2410#else
2411	mov	r3, r3, lsl #24		/* r3 = 5... */
2412	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
2413	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
2414#endif
2415	str	r3, [r0, #0x02]
2416	strh	r2, [r0]
2417	RET
2418	LMEMCPY_8_PAD
2419
2420/*
2421 * 1100: dst is 8-bit aligned, src is 32-bit aligned
2422 */
2423	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2424	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2425	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
2426	strh	r1, [r0, #0x05]
2427#ifdef __ARMEB__
2428	strb	r3, [r0, #0x07]
2429	mov	r1, r2, lsr #24		/* r1 = ...0 */
2430	strb	r1, [r0]
2431	mov	r2, r2, lsl #8		/* r2 = 123. */
2432	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
2433	str	r2, [r0, #0x01]
2434#else
2435	strb	r2, [r0]
2436	mov	r1, r3, lsr #24		/* r1 = ...7 */
2437	strb	r1, [r0, #0x07]
2438	mov	r2, r2, lsr #8		/* r2 = .321 */
2439	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
2440	str	r2, [r0, #0x01]
2441#endif
2442	RET
2443	LMEMCPY_8_PAD
2444
2445/*
2446 * 1101: dst is 8-bit aligned, src is 8-bit aligned
2447 */
2448	ldrb	r3, [r1]		/* r3 = ...0 */
2449	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
2450	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2451	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2452	strb	r3, [r0]
2453	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
2454#ifdef __ARMEB__
2455	strh	ip, [r0, #0x05]
2456	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
2457#else
2458	strh	r3, [r0, #0x05]
2459	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
2460#endif
2461	str	r2, [r0, #0x01]
2462	strb	r1, [r0, #0x07]
2463	RET
2464	LMEMCPY_8_PAD
2465
2466/*
2467 * 1110: dst is 8-bit aligned, src is 16-bit aligned
2468 */
2469	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2470	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2471	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2472#ifdef __ARMEB__
2473	mov	ip, r2, lsr #8		/* ip = ...0 */
2474	strb	ip, [r0]
2475	mov	ip, r2, lsl #24		/* ip = 1... */
2476	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
2477	strb	r1, [r0, #0x07]
2478	mov	r1, r1, lsr #8		/* r1 = ...6 */
2479	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
2480#else
2481	strb	r2, [r0]
2482	mov	ip, r2, lsr #8		/* ip = ...1 */
2483	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2484	mov	r2, r1, lsr #8		/* r2 = ...7 */
2485	strb	r2, [r0, #0x07]
2486	mov	r1, r1, lsl #8		/* r1 = .76. */
2487	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
2488#endif
2489	str	ip, [r0, #0x01]
2490	strh	r1, [r0, #0x05]
2491	RET
2492	LMEMCPY_8_PAD
2493
2494/*
2495 * 1111: dst is 8-bit aligned, src is 8-bit aligned
2496 */
2497	ldrb	r2, [r1]
2498	ldr	ip, [r1, #0x01]
2499	ldrh	r3, [r1, #0x05]
2500	ldrb	r1, [r1, #0x07]
2501	strb	r2, [r0]
2502	str	ip, [r0, #0x01]
2503	strh	r3, [r0, #0x05]
2504	strb	r1, [r0, #0x07]
2505	RET
2506	LMEMCPY_8_PAD
2507
2508/******************************************************************************
2509 * Special case for 12 byte copies
2510 */
2511#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
2512#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
2513	LMEMCPY_C_PAD
2514.Lmemcpy_c:
2515	and	r2, r1, #0x03
2516	orr	r2, r2, r0, lsl #2
2517	ands	r2, r2, #0x0f
2518	sub	r3, pc, #0x14
2519	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
2520
2521/*
2522 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2523 */
2524	ldr	r2, [r1]
2525	ldr	r3, [r1, #0x04]
2526	ldr	r1, [r1, #0x08]
2527	str	r2, [r0]
2528	str	r3, [r0, #0x04]
2529	str	r1, [r0, #0x08]
2530	RET
2531	LMEMCPY_C_PAD
2532
2533/*
2534 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2535 */
2536	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
2537	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2538	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2539	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2540#ifdef __ARMEB__
2541	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
2542	str	r2, [r0, #0x08]
2543	mov	r2, ip, lsr #24		/* r2 = ...7 */
2544	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
2545	mov	r1, r1, lsl #8		/* r1 = 012. */
2546	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
2547#else
2548	mov	r2, r2, lsl #24		/* r2 = B... */
2549	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
2550	str	r2, [r0, #0x08]
2551	mov	r2, ip, lsl #24		/* r2 = 7... */
2552	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
2553	mov	r1, r1, lsr #8		/* r1 = .210 */
2554	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
2555#endif
2556	str	r2, [r0, #0x04]
2557	str	r1, [r0]
2558	RET
2559	LMEMCPY_C_PAD
2560
2561/*
2562 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2563 */
2564	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2565	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2566	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2567	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2568#ifdef __ARMEB__
2569	mov	r2, r2, lsl #16		/* r2 = 01.. */
2570	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2571	str	r2, [r0]
2572	mov	r3, r3, lsl #16		/* r3 = 45.. */
2573	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
2574	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
2575#else
2576	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2577	str	r2, [r0]
2578	mov	r3, r3, lsr #16		/* r3 = ..54 */
2579	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
2580	mov	r1, r1, lsl #16		/* r1 = BA.. */
2581	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
2582#endif
2583	str	r3, [r0, #0x04]
2584	str	r1, [r0, #0x08]
2585	RET
2586	LMEMCPY_C_PAD
2587
2588/*
2589 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2590 */
2591	ldrb	r2, [r1]		/* r2 = ...0 */
2592	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2593	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2594	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2595#ifdef __ARMEB__
2596	mov	r2, r2, lsl #24		/* r2 = 0... */
2597	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
2598	str	r2, [r0]
2599	mov	r3, r3, lsl #24		/* r3 = 4... */
2600	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
2601	mov	r1, r1, lsr #8		/* r1 = .9AB */
2602	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
2603#else
2604	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
2605	str	r2, [r0]
2606	mov	r3, r3, lsr #24		/* r3 = ...4 */
2607	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
2608	mov	r1, r1, lsl #8		/* r1 = BA9. */
2609	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
2610#endif
2611	str	r3, [r0, #0x04]
2612	str	r1, [r0, #0x08]
2613	RET
2614	LMEMCPY_C_PAD
2615
2616/*
2617 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2618 */
2619	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2620	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2621	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
2622	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
2623	strh	r1, [r0, #0x01]
2624#ifdef __ARMEB__
2625	mov	r1, r2, lsr #24		/* r1 = ...0 */
2626	strb	r1, [r0]
2627	mov	r1, r2, lsl #24		/* r1 = 3... */
2628	orr	r2, r1, r3, lsr #8	/* r1 = 3456 */
2629	mov	r1, r3, lsl #24		/* r1 = 7... */
2630	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
2631#else
2632	strb	r2, [r0]
2633	mov	r1, r2, lsr #24		/* r1 = ...3 */
2634	orr	r2, r1, r3, lsl #8	/* r1 = 6543 */
2635	mov	r1, r3, lsr #24		/* r1 = ...7 */
2636	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
2637	mov	ip, ip, lsr #24		/* ip = ...B */
2638#endif
2639	str	r2, [r0, #0x03]
2640	str	r1, [r0, #0x07]
2641	strb	ip, [r0, #0x0b]
2642	RET
2643	LMEMCPY_C_PAD
2644
2645/*
2646 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2647 */
2648	ldrb	r2, [r1]
2649	ldrh	r3, [r1, #0x01]
2650	ldr	ip, [r1, #0x03]
2651	strb	r2, [r0]
2652	ldr	r2, [r1, #0x07]
2653	ldrb	r1, [r1, #0x0b]
2654	strh	r3, [r0, #0x01]
2655	str	ip, [r0, #0x03]
2656	str	r2, [r0, #0x07]
2657	strb	r1, [r0, #0x0b]
2658	RET
2659	LMEMCPY_C_PAD
2660
2661/*
2662 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2663 */
2664	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2665	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2666	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2667	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2668#ifdef __ARMEB__
2669	mov	r2, r2, ror #8		/* r2 = 1..0 */
2670	strb	r2, [r0]
2671	mov	r2, r2, lsr #16		/* r2 = ..1. */
2672	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
2673	strh	r2, [r0, #0x01]
2674	mov	r2, r3, lsl #8		/* r2 = 345. */
2675	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
2676	mov	r2, ip, lsl #8		/* r2 = 789. */
2677	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
2678#else
2679	strb	r2, [r0]
2680	mov	r2, r2, lsr #8		/* r2 = ...1 */
2681	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2682	strh	r2, [r0, #0x01]
2683	mov	r2, r3, lsr #8		/* r2 = .543 */
2684	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
2685	mov	r2, ip, lsr #8		/* r2 = .987 */
2686	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
2687	mov	r1, r1, lsr #8		/* r1 = ...B */
2688#endif
2689	str	r3, [r0, #0x03]
2690	str	r2, [r0, #0x07]
2691	strb	r1, [r0, #0x0b]
2692	RET
2693	LMEMCPY_C_PAD
2694
2695/*
2696 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2697 */
2698	ldrb	r2, [r1]
2699	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2700	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2701	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2702	strb	r2, [r0]
2703#ifdef __ARMEB__
2704	mov	r2, r3, lsr #16		/* r2 = ..12 */
2705	strh	r2, [r0, #0x01]
2706	mov	r3, r3, lsl #16		/* r3 = 34.. */
2707	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
2708	mov	ip, ip, lsl #16		/* ip = 78.. */
2709	orr	ip, ip, r1, lsr #16	/* ip = 789A */
2710	mov	r1, r1, lsr #8		/* r1 = .9AB */
2711#else
2712	strh	r3, [r0, #0x01]
2713	mov	r3, r3, lsr #16		/* r3 = ..43 */
2714	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
2715	mov	ip, ip, lsr #16		/* ip = ..87 */
2716	orr	ip, ip, r1, lsl #16	/* ip = A987 */
2717	mov	r1, r1, lsr #16		/* r1 = ..xB */
2718#endif
2719	str	r3, [r0, #0x03]
2720	str	ip, [r0, #0x07]
2721	strb	r1, [r0, #0x0b]
2722	RET
2723	LMEMCPY_C_PAD
2724
2725/*
2726 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2727 */
2728	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
2729	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2730	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
2731	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2732#ifdef __ARMEB__
2733	strh	r1, [r0]
2734	mov	r1, ip, lsl #16		/* r1 = 23.. */
2735	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
2736	mov	r3, r3, lsl #16		/* r3 = 67.. */
2737	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
2738#else
2739	strh	ip, [r0]
2740	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
2741	mov	r3, r3, lsr #16		/* r3 = ..76 */
2742	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
2743	mov	r2, r2, lsr #16		/* r2 = ..BA */
2744#endif
2745	str	r1, [r0, #0x02]
2746	str	r3, [r0, #0x06]
2747	strh	r2, [r0, #0x0a]
2748	RET
2749	LMEMCPY_C_PAD
2750
2751/*
2752 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2753 */
2754	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2755	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2756	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
2757	strh	ip, [r0]
2758	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2759	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
2760#ifdef __ARMEB__
2761	mov	r2, r2, lsl #24		/* r2 = 2... */
2762	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
2763	mov	r3, r3, lsl #24		/* r3 = 6... */
2764	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
2765	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
2766#else
2767	mov	r2, r2, lsr #24		/* r2 = ...2 */
2768	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
2769	mov	r3, r3, lsr #24		/* r3 = ...6 */
2770	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
2771	mov	r1, r1, lsl #8		/* r1 = ..B. */
2772	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
2773#endif
2774	str	r2, [r0, #0x02]
2775	str	r3, [r0, #0x06]
2776	strh	r1, [r0, #0x0a]
2777	RET
2778	LMEMCPY_C_PAD
2779
2780/*
2781 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2782 */
2783	ldrh	r2, [r1]
2784	ldr	r3, [r1, #0x02]
2785	ldr	ip, [r1, #0x06]
2786	ldrh	r1, [r1, #0x0a]
2787	strh	r2, [r0]
2788	str	r3, [r0, #0x02]
2789	str	ip, [r0, #0x06]
2790	strh	r1, [r0, #0x0a]
2791	RET
2792	LMEMCPY_C_PAD
2793
2794/*
2795 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2796 */
2797	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
2798	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
2799	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
2800	strh	ip, [r0, #0x0a]
2801	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2802	ldrb	r1, [r1]		/* r1 = ...0 */
2803#ifdef __ARMEB__
2804	mov	r2, r2, lsr #24		/* r2 = ...9 */
2805	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
2806	mov	r3, r3, lsr #24		/* r3 = ...5 */
2807	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
2808	mov	r1, r1, lsl #8		/* r1 = ..0. */
2809	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
2810#else
2811	mov	r2, r2, lsl #24		/* r2 = 9... */
2812	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
2813	mov	r3, r3, lsl #24		/* r3 = 5... */
2814	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
2815	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
2816#endif
2817	str	r2, [r0, #0x06]
2818	str	r3, [r0, #0x02]
2819	strh	r1, [r0]
2820	RET
2821	LMEMCPY_C_PAD
2822
2823/*
2824 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2825 */
2826	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2827	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
2828	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
2829#ifdef __ARMEB__
2830	mov	r3, r2, lsr #24		/* r3 = ...0 */
2831	strb	r3, [r0]
2832	mov	r2, r2, lsl #8		/* r2 = 123. */
2833	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
2834	str	r2, [r0, #0x01]
2835	mov	r2, ip, lsl #8		/* r2 = 567. */
2836	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
2837	str	r2, [r0, #0x05]
2838	mov	r2, r1, lsr #8		/* r2 = ..9A */
2839	strh	r2, [r0, #0x09]
2840	strb	r1, [r0, #0x0b]
2841#else
2842	strb	r2, [r0]
2843	mov	r3, r2, lsr #8		/* r3 = .321 */
2844	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
2845	str	r3, [r0, #0x01]
2846	mov	r3, ip, lsr #8		/* r3 = .765 */
2847	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
2848	str	r3, [r0, #0x05]
2849	mov	r1, r1, lsr #8		/* r1 = .BA9 */
2850	strh	r1, [r0, #0x09]
2851	mov	r1, r1, lsr #16		/* r1 = ...B */
2852	strb	r1, [r0, #0x0b]
2853#endif
2854	RET
2855	LMEMCPY_C_PAD
2856
2857/*
2858 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2859 */
2860	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
2861	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
2862	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2863	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2864	strb	r2, [r0, #0x0b]
2865#ifdef __ARMEB__
2866	strh	r3, [r0, #0x09]
2867	mov	r3, r3, lsr #16		/* r3 = ..78 */
2868	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
2869	mov	ip, ip, lsr #16		/* ip = ..34 */
2870	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
2871	mov	r1, r1, lsr #16		/* r1 = ..x0 */
2872#else
2873	mov	r2, r3, lsr #16		/* r2 = ..A9 */
2874	strh	r2, [r0, #0x09]
2875	mov	r3, r3, lsl #16		/* r3 = 87.. */
2876	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
2877	mov	ip, ip, lsl #16		/* ip = 43.. */
2878	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
2879	mov	r1, r1, lsr #8		/* r1 = .210 */
2880#endif
2881	str	r3, [r0, #0x05]
2882	str	ip, [r0, #0x01]
2883	strb	r1, [r0]
2884	RET
2885	LMEMCPY_C_PAD
2886
2887/*
2888 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2889 */
2890#ifdef __ARMEB__
2891	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
2892	ldr	ip, [r1, #0x06]		/* ip = 6789 */
2893	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
2894	ldrh	r1, [r1]		/* r1 = ..01 */
2895	strb	r2, [r0, #0x0b]
2896	mov	r2, r2, lsr #8		/* r2 = ...A */
2897	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
2898	mov	ip, ip, lsr #8		/* ip = .678 */
2899	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
2900	mov	r3, r3, lsr #8		/* r3 = .234 */
2901	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
2902	mov	r1, r1, lsr #8		/* r1 = ...0 */
2903	strb	r1, [r0]
2904	str	r3, [r0, #0x01]
2905	str	ip, [r0, #0x05]
2906	strh	r2, [r0, #0x09]
2907#else
2908	ldrh	r2, [r1]		/* r2 = ..10 */
2909	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
2910	ldr	ip, [r1, #0x06]		/* ip = 9876 */
2911	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
2912	strb	r2, [r0]
2913	mov	r2, r2, lsr #8		/* r2 = ...1 */
2914	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2915	mov	r3, r3, lsr #24		/* r3 = ...5 */
2916	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
2917	mov	ip, ip, lsr #24		/* ip = ...9 */
2918	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
2919	mov	r1, r1, lsr #8		/* r1 = ...B */
2920	str	r2, [r0, #0x01]
2921	str	r3, [r0, #0x05]
2922	strh	ip, [r0, #0x09]
2923	strb	r1, [r0, #0x0b]
2924#endif
2925	RET
2926	LMEMCPY_C_PAD
2927
2928/*
2929 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2930 */
2931	ldrb	r2, [r1]
2932	ldr	r3, [r1, #0x01]
2933	ldr	ip, [r1, #0x05]
2934	strb	r2, [r0]
2935	ldrh	r2, [r1, #0x09]
2936	ldrb	r1, [r1, #0x0b]
2937	str	r3, [r0, #0x01]
2938	str	ip, [r0, #0x05]
2939	strh	r2, [r0, #0x09]
2940	strb	r1, [r0, #0x0b]
2941	RET
2942END(memcpy)
2943#endif /* _ARM_ARCH_5E */
2944
#ifdef GPROF

/*
 * Profiling marker entry points, each a single nop.  NOTE(review):
 * these appear to be the boundary labels the gprof support code uses
 * to classify PC samples (user mode, begin/end of trap handling,
 * begin/end of interrupt handling) — confirm against the code that
 * references them; their semantics are not visible in this file.
 */
ENTRY(user)
	nop
ENTRY(btrap)
	nop
ENTRY(etrap)
	nop
ENTRY(bintr)
	nop
ENTRY(eintr)
	nop

#endif
2959