/*	$NetBSD: memcpy_xscale.S,v 1.1 2003/10/14 07:51:45 scw Exp $	*/

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD: head/lib/libc/arm/string/memcpy_xscale.S 270882 2014-08-31 17:21:51Z ian $");

/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	pld	[r1]
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
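	/*
	 * ip = dst & 3 (1, 2 or 3).  The conditional ldrb/strb below copy
	 * 4 - ip bytes: one always, a second if ip <= 2 (le) and a third
	 * only if ip == 1 (lt), leaving the destination word aligned.
	 */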
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrleb	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strleb	ip, [r3], #0x01
	ldrltb	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strltb	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
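	/*
	 * The destination is now 8-byte aligned, so the strd instructions
	 * below are legal; each strd stores an even/odd register pair
	 * (r4/r5, r6/r7, r8/r9).  Loads and stores are interleaved and the
	 * source is prefetched ahead with pld (see the Prefetch comments).
	 */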
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128

.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	bxeq	lr			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, [r3], #0x08
	strd	r8, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	bxeq	lr			/* Return now if done */

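	/*
	 * Computed branch into the unrolled tail copy below.  Each
	 * "At least N bytes remaining" block is four instructions (16
	 * bytes), so with r4 = 0x18 - (r2 & 0x18) the add lands on the
	 * right block: 24 bytes remaining falls through, 16 skips one
	 * block, 8 skips two and 0 skips all three.
	 */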
	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	bxeq	lr			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	bxeq	lr			/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	bx	lr


/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
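 *
 * Realign by reading whole words from the word-aligned source pointer
 * (r1 is first rounded down with bic) and reassembling each output word
 * from two successive input words with shifts and ORs.  Three variants
 * are used, selected on the source offset within its word (ip = 1, 2 or
 * 3); the .Lmemcpy_badN labels enter each 16-byte loop at its count
 * check, with the loop body placed above the label.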
 */
.Lmemcpy_bad_align:
	stmfd	sp!, {r4-r7}
	bic	r1, r1, #0x03
	cmp	ip, #2
	ldr	ip, [r1], #0x04
	bgt	.Lmemcpy_bad3
	beq	.Lmemcpy_bad2
	b	.Lmemcpy_bad1

.Lmemcpy_bad1_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad1:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad1_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	bxeq	lr			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x03
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad1_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #24
#else
	orr	r4, r4, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad1_loop4
	sub	r1, r1, #0x03
	b	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad2:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad2_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	bxeq	lr			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02
	b	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad3:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad3_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	bxeq	lr			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01

.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04
	bxeq	lr
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	bx	lr


/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
.Lmemcpy_short:
#ifndef _STANDALONE
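	/*
	 * Jump table indexed by the length (0..12 bytes).  Reading pc here
	 * yields the address two instructions ahead, so adding r2 * 4 lands
	 * on entry r2 of the table that follows the nop: length 0 hits the
	 * "bx lr" slot, the other lengths branch to their handlers.
	 */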
	add	pc, pc, r2, lsl #2
	nop
	bx	lr			/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
#endif
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrneb	ip, [r1], #0x01
	bne	1b
	bx	lr

#ifndef _STANDALONE
/******************************************************************************
 * Special case for 4 byte copies
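 *
 * The 4, 6, 8 and 12 byte copies each use a table of 16 handlers indexed
 * by ((dst & 3) << 2) | (src & 3).  Each handler is padded to a fixed
 * power-of-two size (64 bytes here, 128 bytes for the 12 byte case), so
 * the dispatch below can compute the handler address as
 * .Lmemcpy_N + index * size; an index of zero simply falls through to
 * the first (fully aligned) handler.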
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
.Lmemcpy_4:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	bx	lr
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	bx	lr
	LMEMCPY_4_PAD


/******************************************************************************
 * Special case for 6 byte copies
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
.Lmemcpy_6:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	bx	lr
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	mov	r3, r3, lsl #8		/* r3 = 234. */
	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
#endif
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r3, r3, lsl #8		/* r3 = 123. */
	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
#else
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
#endif
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	strb	r1, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .234 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsl #8		/* r3 = .01. */
	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
	strh	r3, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r3, r1, lsr #24
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
#endif
	bx	lr
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
#ifdef __ARMEB__
	ldr	r2, [r1]		/* r2 = 0123 */
	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
	mov	r1, r2, lsr #16		/* r1 = ..01 */
	orr	r3, r3, r2, lsl #16	/* r3 = 2345 */
	strh	r1, [r0]
	str	r3, [r0, #0x02]
#else
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
#endif
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #8		/* r2 = .345 */
	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
#else
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
#endif
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = ..0. */
	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
#endif
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	bx	lr
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	bx	lr
	LMEMCPY_6_PAD

/******************************************************************************
 * Special case for 8 byte copies
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
.Lmemcpy_8:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
#endif
	str	r2, [r0]
	str	r3, [r0, #0x04]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
	mov	r2, r2, lsl #24		/* r2 = 4... */
	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r3, lsr #8		/* r1 = .012 */
	strb	r2, [r0, #0x07]
	mov	r3, r3, lsl #24		/* r3 = 3... */
	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
#else
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
#endif
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #8		/* ip = .01. */
	orr	ip, ip, r3, lsr #24	/* ip = .012 */
	strb	r1, [r0, #0x07]
	mov	r3, r3, lsl #8		/* r3 = 345. */
	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
#else
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
#endif
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
#ifdef __ARMEB__
	strh	r3, [r0, #0x01]
	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
#else
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
#endif
	str	r2, [r0, #0x03]
	strb	r1, [r0, #0x07]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, r3, lsr #16		/* r1 = ..45 */
	orr	r2, r1, r2, lsl #16	/* r2 = 2345 */
#else
	strh	r2, [r0]
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
#endif
	str	r2, [r0, #0x02]
	strh	r3, [r0, #0x06]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r1, r2, lsl #24		/* r1 = 2... */
	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
#else
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
#endif
	str	r1, [r0, #0x02]
	strh	r3, [r0, #0x06]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	ip, [r1, #0x02]
	ldrh	r3, [r1, #0x06]
	strh	r2, [r0]
	str	ip, [r0, #0x02]
	strh	r3, [r0, #0x06]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
	mov	r2, r2, lsr #24		/* r2 = ...1 */
	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
#else
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
#endif
	str	r3, [r0, #0x02]
	strh	r2, [r0]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
#ifdef __ARMEB__
	strb	r3, [r0, #0x07]
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
	str	r2, [r0, #0x01]
#endif
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
#ifdef __ARMEB__
	strh	ip, [r0, #0x05]
	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
#else
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x07]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #24		/* ip = 1... */
	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
	strb	r1, [r0, #0x07]
	mov	r1, r1, lsr #8		/* r1 = ...6 */
	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
#else
	strb	r2, [r0]
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
#endif
	str	ip, [r0, #0x01]
	strh	r1, [r0, #0x05]
	bx	lr
	LMEMCPY_8_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	ip, [r1, #0x01]
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	str	ip, [r0, #0x01]
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]
	bx	lr
	LMEMCPY_8_PAD

/******************************************************************************
 * Special case for 12 byte copies
 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
	LMEMCPY_C_PAD
.Lmemcpy_c:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	ldr	r1, [r1, #0x08]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
#ifdef __ARMEB__
	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsr #24		/* r2 = ...7 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
	mov	r1, r1, lsl #8		/* r1 = 012. */
	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
#else
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x04]
	str	r1, [r0]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #16		/* r3 = 45.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	str	r2, [r0]
	mov	r3, r3, lsl #24		/* r3 = 4... */
	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
#else
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
#endif
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r2, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r2, lsl #24		/* r1 = 3... */
	orr	r2, r1, r3, lsr #8	/* r1 = 3456 */
	mov	r1, r3, lsl #24		/* r1 = 7... */
	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
#else
	strb	r2, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r1 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
#endif
	str	r2, [r0, #0x03]
	str	r1, [r0, #0x07]
	strb	ip, [r0, #0x0b]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	strb	r2, [r0]
	ldr	r2, [r1, #0x07]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
#ifdef __ARMEB__
	mov	r2, r2, ror #8		/* r2 = 1..0 */
	strb	r2, [r0]
	mov	r2, r2, lsr #16		/* r2 = ..1. */
	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsl #8		/* r2 = 345. */
	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
	mov	r2, ip, lsl #8		/* r2 = 789. */
	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
#endif
	str	r3, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strb	r2, [r0]
#ifdef __ARMEB__
	mov	r2, r3, lsr #16		/* r2 = ..12 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsl #16		/* r3 = 34.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
	mov	ip, ip, lsl #16		/* ip = 78.. */
	orr	ip, ip, r1, lsr #16	/* ip = 789A */
	mov	r1, r1, lsr #8		/* r1 = .9AB */
#else
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
#endif
	str	r3, [r0, #0x03]
	str	ip, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
#ifdef __ARMEB__
	strh	r1, [r0]
	mov	r1, ip, lsl #16		/* r1 = 23.. */
	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
	mov	r3, r3, lsl #16		/* r3 = 67.. */
	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
#else
	strh	ip, [r0]
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
#endif
	str	r1, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r2, [r0, #0x0a]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	strh	ip, [r0]
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 2... */
	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
	mov	r3, r3, lsl #24		/* r3 = 6... */
	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
#endif
	str	r2, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	ldr	ip, [r1, #0x06]
	ldrh	r1, [r1, #0x0a]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	str	ip, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #24		/* r2 = ...9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
	mov	r1, r1, lsl #8		/* r1 = ..0. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
#else
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
#endif
	str	r2, [r0, #0x06]
	str	r3, [r0, #0x02]
	strh	r1, [r0]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
	str	r2, [r0, #0x01]
	mov	r2, ip, lsl #8		/* r2 = 567. */
	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
	str	r2, [r0, #0x05]
	mov	r2, r1, lsr #8		/* r2 = ..9A */
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	str	r3, [r0, #0x01]
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	str	r3, [r0, #0x05]
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]
#endif
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
#ifdef __ARMEB__
	strh	r3, [r0, #0x09]
	mov	r3, r3, lsr #16		/* r3 = ..78 */
	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
	mov	ip, ip, lsr #16		/* ip = ..34 */
	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
	mov	r1, r1, lsr #16		/* r1 = ..x0 */
#else
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
#endif
	str	r3, [r0, #0x05]
	str	ip, [r0, #0x01]
	strb	r1, [r0]
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
	ldr	ip, [r1, #0x06]		/* ip = 6789 */
	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
	ldrh	r1, [r1]		/* r1 = ..01 */
	strb	r2, [r0, #0x0b]
	mov	r2, r2, lsr #8		/* r2 = ...A */
	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
	mov	ip, ip, lsr #8		/* ip = .678 */
	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
	mov	r3, r3, lsr #8		/* r3 = .234 */
	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
	mov	r1, r1, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
#else
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r2, [r0, #0x01]
	str	r3, [r0, #0x05]
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]
#endif
	bx	lr
	LMEMCPY_C_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldr	ip, [r1, #0x05]
	strb	r2, [r0]
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	bx	lr
#endif	/* !_STANDALONE */
END(memcpy)