bcopyinout_xscale.S revision 239268
1/*	$NetBSD: bcopyinout_xscale.S,v 1.3 2003/12/15 09:27:18 scw Exp $	*/
2
3/*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39__FBSDID("$FreeBSD: head/sys/arm/arm/bcopyinout_xscale.S 239268 2012-08-15 03:03:03Z gonzo $");
40
41	.text
42	.align	0
43
/*
 * GET_PCB(tmp): leave in tmp the address of the current-PCB pointer slot
 * (a base plus PC_CURPCB).  The callers in this file follow it with
 * "ldr tmp, [tmp]" to fetch the pointer stored in that slot.
 *
 * With _ARM_ARCH_6 defined the base is read from CP15 c13, c0, 4
 * (presumably the per-CPU data pointer -- TODO confirm against the pcpu
 * setup code).  Otherwise the address is the literal word
 * __pcpu + PC_CURPCB kept at .Lcurpcb and loaded PC-relative.
 */
44#ifdef _ARM_ARCH_6
45#define GET_PCB(tmp) \
46	mrc p15, 0, tmp, c13, c0, 4; \
47	add	tmp, tmp, #(PC_CURPCB)
48#else
49.Lcurpcb:
50	.word	_C_LABEL(__pcpu) + PC_CURPCB
51#define GET_PCB(tmp) \
52	ldr	tmp, .Lcurpcb
53#endif
54
55/*
56 * r0 = user space address
57 * r1 = kernel space address
58 * r2 = length
59 *
60 * Copies bytes from user space to kernel space
 *
 * Result in r0: 0 when the length is <= 0 (signed test) or the copy
 * completes; EFAULT when control reaches .Lcopyin_fault, whose address
 * is stored in PCB_ONFAULT for the duration of the copy (presumably the
 * abort handler resumes there when a user access faults -- TODO confirm).
 *
 * If the function pointer reached through .L_arm_memcpy is non-NULL and
 * the length is not below the value reached through .L_min_memcpy_size
 * (both literals are defined outside this view), that hook is tried
 * first; a non-zero result from it falls back to the normal copy below.
61 */
62ENTRY(copyin)
63	cmp	r2, #0x00
64	movle	r0, #0x00
65	movle	pc, lr			/* Bail early if length is <= 0 */
66
67	ldr	r3, .L_arm_memcpy	/* r3 = address of the hook pointer */
68	ldr	r3, [r3]		/* r3 = hook pointer itself */
69	cmp	r3, #0
70	beq	.Lnormal		/* No hook installed */
71	ldr	r3, .L_min_memcpy_size
72	ldr	r3, [r3]		/* r3 = minimum length for the hook */
73	cmp	r2, r3
74	blt	.Lnormal		/* Too short (signed compare) */
	/*
	 * Save the original arguments so they can be restored if the hook
	 * reports failure.
	 * NOTE(review): five words are pushed, so sp is not 8-byte aligned
	 * at the indirect call if it was on entry -- confirm the callee's
	 * ABI tolerates this.
	 */
75	stmfd	sp!, {r0-r2, r4, lr}
76	mov     r3, r0			/* Swap r0/r1: hook takes (dst, src, len, flag) */
77	mov     r0, r1
78	mov     r1, r3
79	mov     r3, #2 /* SRC_IS_USER */
80	ldr	r4, .L_arm_memcpy
81	mov	lr, pc			/* pc reads 8 ahead: lr = the cmp below */
82	ldr	pc, [r4]		/* Indirect call through the hook pointer */
83	cmp     r0, #0			/* Hook result; flags survive the ldmfd */
84	ldmfd   sp!, {r0-r2, r4, lr}	/* Restore the original arguments */
85	moveq	r0, #0
86	RETeq				/* Hook returned 0: done */
	/* Non-zero hook result: fall through and copy the normal way */
87
88.Lnormal:
89	stmfd	sp!, {r10-r11, lr}
90
91	GET_PCB(r10)
92	ldr	r10, [r10]		/* r10 = pointer used as PCB base below */
93
94	mov	r3, #0x00		/* Fault handler: nothing extra on the stack */
95	adr	ip, .Lcopyin_fault
96	ldr	r11, [r10, #PCB_ONFAULT]	/* r11 = previous handler, kept for restore */
97	str	ip, [r10, #PCB_ONFAULT]	/* Install our fault handler */
98	bl	.Lcopyin_guts
99	str	r11, [r10, #PCB_ONFAULT]	/* Put the previous handler back */
100	mov	r0, #0x00		/* Success */
101	ldmfd	sp!, {r10-r11, pc}
102
/*
 * Fault exit for copyin; its address is what .Lnormal stores in
 * PCB_ONFAULT.  Expects r10 = PCB base, r11 = previous PCB_ONFAULT value
 * and r3 = stack state left by the copy code: 0 means nothing extra was
 * pushed, > 0 means r4-r7 are on the stack, < 0 means r4-r9 are.
 * Returns EFAULT in r0 to copyin's caller by popping the frame pushed at
 * .Lnormal.
 */
103.Lcopyin_fault:
104	ldr	r0, =EFAULT
105	str	r11, [r10, #PCB_ONFAULT]	/* Put the previous handler back */
106	cmp	r3, #0x00
107	ldmgtfd	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
108	ldmltfd	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
109	ldmfd	sp!, {r10-r11, pc}
110
/*
 * Copy loop for copyin, entered with bl from .Lnormal.
 *   r0 = source (user) pointer, read with ldrt/ldrbt, i.e. loads made
 *        with user-mode permissions
 *   r1 = destination (kernel) pointer
 *   r2 = bytes remaining (> 0 on entry)
 *   r3 = stack state for .Lcopyin_fault (0 on entry); it is kept in step
 *        with every push/pop of r4-r9 or r4-r7 below
 *   ip = scratch
 * Returns with RET once everything has been copied.
 */
111.Lcopyin_guts:
112	pld	[r0]
113	/* Word-align the destination buffer */
114	ands	ip, r1, #0x03		/* Already word aligned? */
115	beq	.Lcopyin_wordaligned	/* Yup */
116	rsb	ip, ip, #0x04		/* ip = bytes needed to align (1..3) */
117	cmp	r2, ip			/* Enough bytes left to align it? */
118	blt	.Lcopyin_l4_2		/* Nope. Just copy bytewise */
119	sub	r2, r2, ip
	/*
	 * ip = 3 - bytes needed = number of ldrbt/strb pairs to skip.
	 * Each pair is 8 bytes and pc reads as the first ldrbt, hence the
	 * shift by 3; with ip == 0 the add is skipped and all three run.
	 */
120	rsbs	ip, ip, #0x03
121	addne	pc, pc, ip, lsl #3
122	nop
123	ldrbt	ip, [r0], #0x01
124	strb	ip, [r1], #0x01
125	ldrbt	ip, [r0], #0x01
126	strb	ip, [r1], #0x01
127	ldrbt	ip, [r0], #0x01
128	strb	ip, [r1], #0x01
129	cmp	r2, #0x00		/* All done? */
130	RETeq
131
132	/* Destination buffer is now word aligned */
133.Lcopyin_wordaligned:
134	ands	ip, r0, #0x03		/* Is src also word-aligned? */
135	bne	.Lcopyin_bad_align	/* Nope. Things just got bad */
136	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
137	blt	.Lcopyin_w_less_than8
138
139	/*
	 * Quad-align the destination buffer (strd is used on it below).
	 * r1 is already word aligned, so at most one word is moved.  This
	 * happens before r4-r9 are pushed, while r3 is still 0.
	 */
140	tst	r1, #0x07		/* Already quad aligned? */
141	ldrnet	ip, [r0], #0x04
142	strne	ip, [r1], #0x04
143	subne	r2, r2, #0x04
144	stmfd	sp!, {r4-r9}		/* Free up some registers */
145	mov	r3, #-1			/* Signal restore r4-r9 */
146
147	/* Destination buffer quad aligned, source is word aligned */
148	subs	r2, r2, #0x80		/* r2 is biased by -0x80 inside the loop */
149	blt	.Lcopyin_w_lessthan128
150
151	/*
	 * Copy 128 bytes at a time.  Loads and stores are interleaved so
	 * each strd writes a register pair loaded a few instructions
	 * earlier; the LD/ST comments give the byte offsets in the chunk.
	 */
152.Lcopyin_w_loop128:
153	ldrt	r4, [r0], #0x04		/* LD:00-03 */
154	ldrt	r5, [r0], #0x04		/* LD:04-07 */
155	pld	[r0, #0x18]		/* Prefetch 0x20 */
156	ldrt	r6, [r0], #0x04		/* LD:08-0b */
157	ldrt	r7, [r0], #0x04		/* LD:0c-0f */
158	ldrt	r8, [r0], #0x04		/* LD:10-13 */
159	ldrt	r9, [r0], #0x04		/* LD:14-17 */
160	strd	r4, [r1], #0x08		/* ST:00-07 */
161	ldrt	r4, [r0], #0x04		/* LD:18-1b */
162	ldrt	r5, [r0], #0x04		/* LD:1c-1f */
163	strd	r6, [r1], #0x08		/* ST:08-0f */
164	ldrt	r6, [r0], #0x04		/* LD:20-23 */
165	ldrt	r7, [r0], #0x04		/* LD:24-27 */
166	pld	[r0, #0x18]		/* Prefetch 0x40 */
167	strd	r8, [r1], #0x08		/* ST:10-17 */
168	ldrt	r8, [r0], #0x04		/* LD:28-2b */
169	ldrt	r9, [r0], #0x04		/* LD:2c-2f */
170	strd	r4, [r1], #0x08		/* ST:18-1f */
171	ldrt	r4, [r0], #0x04		/* LD:30-33 */
172	ldrt	r5, [r0], #0x04		/* LD:34-37 */
173	strd	r6, [r1], #0x08		/* ST:20-27 */
174	ldrt	r6, [r0], #0x04		/* LD:38-3b */
175	ldrt	r7, [r0], #0x04		/* LD:3c-3f */
176	strd	r8, [r1], #0x08		/* ST:28-2f */
177	ldrt	r8, [r0], #0x04		/* LD:40-43 */
178	ldrt	r9, [r0], #0x04		/* LD:44-47 */
179	pld	[r0, #0x18]		/* Prefetch 0x60 */
180	strd	r4, [r1], #0x08		/* ST:30-37 */
181	ldrt	r4, [r0], #0x04		/* LD:48-4b */
182	ldrt	r5, [r0], #0x04		/* LD:4c-4f */
183	strd	r6, [r1], #0x08		/* ST:38-3f */
184	ldrt	r6, [r0], #0x04		/* LD:50-53 */
185	ldrt	r7, [r0], #0x04		/* LD:54-57 */
186	strd	r8, [r1], #0x08		/* ST:40-47 */
187	ldrt	r8, [r0], #0x04		/* LD:58-5b */
188	ldrt	r9, [r0], #0x04		/* LD:5c-5f */
189	strd	r4, [r1], #0x08		/* ST:48-4f */
190	ldrt	r4, [r0], #0x04		/* LD:60-63 */
191	ldrt	r5, [r0], #0x04		/* LD:64-67 */
192	pld	[r0, #0x18]		/* Prefetch 0x80 */
193	strd	r6, [r1], #0x08		/* ST:50-57 */
194	ldrt	r6, [r0], #0x04		/* LD:68-6b */
195	ldrt	r7, [r0], #0x04		/* LD:6c-6f */
196	strd	r8, [r1], #0x08		/* ST:58-5f */
197	ldrt	r8, [r0], #0x04		/* LD:70-73 */
198	ldrt	r9, [r0], #0x04		/* LD:74-77 */
199	strd	r4, [r1], #0x08		/* ST:60-67 */
200	ldrt	r4, [r0], #0x04		/* LD:78-7b */
201	ldrt	r5, [r0], #0x04		/* LD:7c-7f */
202	strd	r6, [r1], #0x08		/* ST:68-6f */
203	strd	r8, [r1], #0x08		/* ST:70-77 */
204	subs	r2, r2, #0x80		/* Flags survive the strd for the bge */
205	strd	r4, [r1], #0x08		/* ST:78-7f */
206	bge	.Lcopyin_w_loop128
207
208.Lcopyin_w_lessthan128:
209	adds	r2, r2, #0x80		/* Adjust for extra sub */
210	ldmeqfd	sp!, {r4-r9}
211	RETeq				/* Return now if done */
212	subs	r2, r2, #0x20		/* r2 is biased by -0x20 inside the loop */
213	blt	.Lcopyin_w_lessthan32
214
215	/* Copy 32 bytes at a time */
216.Lcopyin_w_loop32:
217	ldrt	r4, [r0], #0x04
218	ldrt	r5, [r0], #0x04
219	pld	[r0, #0x18]
220	ldrt	r6, [r0], #0x04
221	ldrt	r7, [r0], #0x04
222	ldrt	r8, [r0], #0x04
223	ldrt	r9, [r0], #0x04
224	strd	r4, [r1], #0x08
225	ldrt	r4, [r0], #0x04
226	ldrt	r5, [r0], #0x04
227	strd	r6, [r1], #0x08
228	strd	r8, [r1], #0x08
229	subs	r2, r2, #0x20
230	strd	r4, [r1], #0x08
231	bge	.Lcopyin_w_loop32
232
233.Lcopyin_w_lessthan32:
234	adds	r2, r2, #0x20		/* Adjust for extra sub */
235	ldmeqfd	sp!, {r4-r9}
236	RETeq				/* Return now if done */
237
	/*
	 * 1..31 bytes remain.  r4 = bytes to move in 8-byte chunks (0, 8,
	 * 16 or 24); r2 keeps the remaining 0..7.  Jump into the chunk
	 * slots below: each slot is four instructions (16 bytes, the nop
	 * inside is padding), so skipping r5/8 slots means adding r5 * 2.
	 * The flags set by the subs are still valid at the RETeq after the
	 * slots because nothing in between changes them.
	 */
238	and	r4, r2, #0x18
239	rsb	r5, r4, #0x18
240	subs	r2, r2, r4
241	add	pc, pc, r5, lsl #1
242	nop
243
244	/* At least 24 bytes remaining */
245	ldrt	r4, [r0], #0x04
246	ldrt	r5, [r0], #0x04
247	nop
248	strd	r4, [r1], #0x08
249
250	/* At least 16 bytes remaining */
251	ldrt	r4, [r0], #0x04
252	ldrt	r5, [r0], #0x04
253	nop
254	strd	r4, [r1], #0x08
255
256	/* At least 8 bytes remaining */
257	ldrt	r4, [r0], #0x04
258	ldrt	r5, [r0], #0x04
259	nop
260	strd	r4, [r1], #0x08
261
262	/* Less than 8 bytes remaining */
263	ldmfd	sp!, {r4-r9}
264	RETeq				/* Return now if done */
265	mov	r3, #0x00		/* r4-r9 popped: nothing extra on the stack */
266
	/* 1..7 bytes remain; both pointers are word aligned */
267.Lcopyin_w_less_than8:
268	subs	r2, r2, #0x04
269	ldrget	ip, [r0], #0x04		/* At least 4: move one word */
270	strge	ip, [r1], #0x04
271	RETeq				/* Return now if done */
272	addlt	r2, r2, #0x04		/* Fewer than 4: undo the sub */
	/*
	 * 1..3 bytes remain.  The flags from the cmp pick how many are
	 * moved: always one, a second if r2 >= 2, a third if r2 > 2.  r2 is
	 * reused as a data register once the compare has been made.
	 */
273	ldrbt	ip, [r0], #0x01
274	cmp	r2, #0x02
275	ldrgebt	r2, [r0], #0x01
276	strb	ip, [r1], #0x01
277	ldrgtbt	ip, [r0]
278	strgeb	r2, [r1], #0x01
279	strgtb	ip, [r1]
280	RET
281
282/*
283 * At this point, it has not been possible to word align both buffers.
284 * The destination buffer (r1) is word aligned, but the source buffer
285 * (r0) is not.
 *
 * On entry ip = r0 & 3 (1..3), as computed at .Lcopyin_wordaligned.
 * The source is read as whole aligned words with ldrt; each output word
 * is assembled by shifting the tail of the previous source word together
 * with the head of the next one.  ip always carries the most recently
 * loaded source word.  There is one variant per misalignment: bad1,
 * bad2 and bad3.  The __ARMEB__ branches hold the shift directions for
 * big-endian builds.
 *
 * r4-r7 are pushed for the duration and r3 is set to 1 so that
 * .Lcopyin_fault pops them.
286 */
287.Lcopyin_bad_align:
288	stmfd	sp!, {r4-r7}
289	mov	r3, #0x01		/* Signal restore r4-r7 */
290	bic	r0, r0, #0x03		/* Round the source down to a word */
291	cmp	ip, #2			/* Flags survive the load below */
292	ldrt	ip, [r0], #0x04		/* First source word; replaces the offset in ip */
293	bgt	.Lcopyin_bad3
294	beq	.Lcopyin_bad2
295	b	.Lcopyin_bad1
296
	/* Source offset 1: 16 bytes per pass, 3 bytes taken from the old word */
297.Lcopyin_bad1_loop16:
298#ifdef __ARMEB__
299	mov	r4, ip, lsl #8
300#else
301	mov	r4, ip, lsr #8
302#endif
303	ldrt	r5, [r0], #0x04
304	pld	[r0, #0x018]
305	ldrt	r6, [r0], #0x04
306	ldrt	r7, [r0], #0x04
307	ldrt	ip, [r0], #0x04
308#ifdef __ARMEB__
309	orr	r4, r4, r5, lsr #24
310	mov	r5, r5, lsl #8
311	orr	r5, r5, r6, lsr #24
312	mov	r6, r6, lsl #8
313	orr	r6, r6, r7, lsr #24
314	mov	r7, r7, lsl #8
315	orr	r7, r7, ip, lsr #24
316#else
317	orr	r4, r4, r5, lsl #24
318	mov	r5, r5, lsr #8
319	orr	r5, r5, r6, lsl #24
320	mov	r6, r6, lsr #8
321	orr	r6, r6, r7, lsl #24
322	mov	r7, r7, lsr #8
323	orr	r7, r7, ip, lsl #24
324#endif
325	str	r4, [r1], #0x04
326	str	r5, [r1], #0x04
327	str	r6, [r1], #0x04
328	str	r7, [r1], #0x04
329.Lcopyin_bad1:
330	subs	r2, r2, #0x10
331	bge	.Lcopyin_bad1_loop16
332
333	adds	r2, r2, #0x10		/* Undo the extra sub */
334	ldmeqfd	sp!, {r4-r7}
335	RETeq				/* Return now if done */
336	subs	r2, r2, #0x04
337	sublt	r0, r0, #0x03		/* Back up to the first uncopied byte */
338	blt	.Lcopyin_l4
339
	/* Source offset 1: one word per pass */
340.Lcopyin_bad1_loop4:
341#ifdef __ARMEB__
342	mov	r4, ip, lsl #8
343#else
344	mov	r4, ip, lsr #8
345#endif
346	ldrt	ip, [r0], #0x04
347	subs	r2, r2, #0x04		/* Flags survive until the bge */
348#ifdef __ARMEB__
349	orr	r4, r4, ip, lsr #24
350#else
351	orr	r4, r4, ip, lsl #24
352#endif
353	str	r4, [r1], #0x04
354	bge	.Lcopyin_bad1_loop4
355	sub	r0, r0, #0x03		/* Back up to the first uncopied byte */
356	b	.Lcopyin_l4
357
	/* Source offset 2: 16 bytes per pass, 2 bytes taken from the old word */
358.Lcopyin_bad2_loop16:
359#ifdef __ARMEB__
360	mov	r4, ip, lsl #16
361#else
362	mov	r4, ip, lsr #16
363#endif
364	ldrt	r5, [r0], #0x04
365	pld	[r0, #0x018]
366	ldrt	r6, [r0], #0x04
367	ldrt	r7, [r0], #0x04
368	ldrt	ip, [r0], #0x04
369#ifdef __ARMEB__
370	orr	r4, r4, r5, lsr #16
371	mov	r5, r5, lsl #16
372	orr	r5, r5, r6, lsr #16
373	mov	r6, r6, lsl #16
374	orr	r6, r6, r7, lsr #16
375	mov	r7, r7, lsl #16
376	orr	r7, r7, ip, lsr #16
377#else
378	orr	r4, r4, r5, lsl #16
379	mov	r5, r5, lsr #16
380	orr	r5, r5, r6, lsl #16
381	mov	r6, r6, lsr #16
382	orr	r6, r6, r7, lsl #16
383	mov	r7, r7, lsr #16
384	orr	r7, r7, ip, lsl #16
385#endif
386	str	r4, [r1], #0x04
387	str	r5, [r1], #0x04
388	str	r6, [r1], #0x04
389	str	r7, [r1], #0x04
390.Lcopyin_bad2:
391	subs	r2, r2, #0x10
392	bge	.Lcopyin_bad2_loop16
393
394	adds	r2, r2, #0x10		/* Undo the extra sub */
395	ldmeqfd	sp!, {r4-r7}
396	RETeq				/* Return now if done */
397	subs	r2, r2, #0x04
398	sublt	r0, r0, #0x02		/* Back up to the first uncopied byte */
399	blt	.Lcopyin_l4
400
	/* Source offset 2: one word per pass */
401.Lcopyin_bad2_loop4:
402#ifdef __ARMEB__
403	mov	r4, ip, lsl #16
404#else
405	mov	r4, ip, lsr #16
406#endif
407	ldrt	ip, [r0], #0x04
408	subs	r2, r2, #0x04		/* Flags survive until the bge */
409#ifdef __ARMEB__
410	orr	r4, r4, ip, lsr #16
411#else
412	orr	r4, r4, ip, lsl #16
413#endif
414	str	r4, [r1], #0x04
415	bge	.Lcopyin_bad2_loop4
416	sub	r0, r0, #0x02		/* Back up to the first uncopied byte */
417	b	.Lcopyin_l4
418
	/* Source offset 3: 16 bytes per pass, 1 byte taken from the old word */
419.Lcopyin_bad3_loop16:
420#ifdef __ARMEB__
421	mov	r4, ip, lsl #24
422#else
423	mov	r4, ip, lsr #24
424#endif
425	ldrt	r5, [r0], #0x04
426	pld	[r0, #0x018]
427	ldrt	r6, [r0], #0x04
428	ldrt	r7, [r0], #0x04
429	ldrt	ip, [r0], #0x04
430#ifdef __ARMEB__
431	orr	r4, r4, r5, lsr #8
432	mov	r5, r5, lsl #24
433	orr	r5, r5, r6, lsr #8
434	mov	r6, r6, lsl #24
435	orr	r6, r6, r7, lsr #8
436	mov	r7, r7, lsl #24
437	orr	r7, r7, ip, lsr #8
438#else
439	orr	r4, r4, r5, lsl #8
440	mov	r5, r5, lsr #24
441	orr	r5, r5, r6, lsl #8
442	mov	r6, r6, lsr #24
443	orr	r6, r6, r7, lsl #8
444	mov	r7, r7, lsr #24
445	orr	r7, r7, ip, lsl #8
446#endif
447	str	r4, [r1], #0x04
448	str	r5, [r1], #0x04
449	str	r6, [r1], #0x04
450	str	r7, [r1], #0x04
451.Lcopyin_bad3:
452	subs	r2, r2, #0x10
453	bge	.Lcopyin_bad3_loop16
454
455	adds	r2, r2, #0x10		/* Undo the extra sub */
456	ldmeqfd	sp!, {r4-r7}
457	RETeq				/* Return now if done */
458	subs	r2, r2, #0x04
459	sublt	r0, r0, #0x01		/* Back up to the first uncopied byte */
460	blt	.Lcopyin_l4
461
	/* Source offset 3: one word per pass */
462.Lcopyin_bad3_loop4:
463#ifdef __ARMEB__
464	mov	r4, ip, lsl #24
465#else
466	mov	r4, ip, lsr #24
467#endif
468	ldrt	ip, [r0], #0x04
469	subs	r2, r2, #0x04		/* Flags survive until the bge */
470#ifdef __ARMEB__
471	orr	r4, r4, ip, lsr #8
472#else
473	orr	r4, r4, ip, lsl #8
474#endif
475	str	r4, [r1], #0x04
476	bge	.Lcopyin_bad3_loop4
477	sub	r0, r0, #0x01		/* Back up to the first uncopied byte */
478
	/* Common tail: r2 is the remaining count minus 4 */
479.Lcopyin_l4:
480	ldmfd	sp!, {r4-r7}
481	mov	r3, #0x00		/* r4-r7 popped: nothing extra on the stack */
482	adds	r2, r2, #0x04		/* r2 = bytes remaining (0..3) */
483	RETeq
	/*
	 * Copy the last r2 (1..3) bytes.  Also entered directly from
	 * .Lcopyin_guts when too few bytes exist to align the destination.
	 * r2 becomes the number of 8-byte ldrbt/strb pairs to skip; with
	 * r2 == 3 the add is skipped and all three pairs run.
	 */
484.Lcopyin_l4_2:
485	rsbs	r2, r2, #0x03
486	addne	pc, pc, r2, lsl #3
487	nop
488	ldrbt	ip, [r0], #0x01
489	strb	ip, [r1], #0x01
490	ldrbt	ip, [r0], #0x01
491	strb	ip, [r1], #0x01
492	ldrbt	ip, [r0]
493	strb	ip, [r1]
494	RET
495
496
497/*
498 * r0 = kernel space address
499 * r1 = user space address
500 * r2 = length
501 *
502 * Copies bytes from kernel space to user space
 *
 * Result in r0: 0 when the length is <= 0 (signed test) or the copy
 * completes; EFAULT when control reaches .Lcopyout_fault, whose address
 * is stored in PCB_ONFAULT for the duration of the copy (presumably the
 * abort handler resumes there when a user access faults -- TODO confirm).
 *
 * If the function pointer reached through .L_arm_memcpy is non-NULL and
 * the length is not below the value reached through .L_min_memcpy_size
 * (both literals are defined outside this view), that hook is tried
 * first; a non-zero result from it falls back to the normal copy below.
503 */
504ENTRY(copyout)
505	cmp	r2, #0x00
506	movle	r0, #0x00
507	movle	pc, lr			/* Bail early if length is <= 0 */
508
509	ldr	r3, .L_arm_memcpy	/* r3 = address of the hook pointer */
510	ldr	r3, [r3]		/* r3 = hook pointer itself */
511	cmp	r3, #0
512	beq	.Lnormale		/* No hook installed */
513	ldr	r3, .L_min_memcpy_size
514	ldr	r3, [r3]		/* r3 = minimum length for the hook */
515	cmp	r2, r3
516	blt	.Lnormale		/* Too short (signed compare) */
	/*
	 * Save the original arguments so they can be restored if the hook
	 * reports failure.
	 * NOTE(review): five words are pushed, so sp is not 8-byte aligned
	 * at the indirect call if it was on entry -- confirm the callee's
	 * ABI tolerates this.
	 */
517	stmfd	sp!, {r0-r2, r4, lr}
518	mov     r3, r0			/* Swap r0/r1 before calling the hook */
519	mov     r0, r1
520	mov     r1, r3
521	mov     r3, #1 /* DST_IS_USER */
522	ldr	r4, .L_arm_memcpy
523	mov	lr, pc			/* pc reads 8 ahead: lr = the cmp below */
524	ldr	pc, [r4]		/* Indirect call through the hook pointer */
525	cmp     r0, #0			/* Hook result; flags survive the ldmfd */
526	ldmfd   sp!, {r0-r2, r4, lr}	/* Restore the original arguments */
527	moveq	r0, #0
528	RETeq				/* Hook returned 0: done */
	/* Non-zero hook result: fall through and copy the normal way */
529
530.Lnormale:
531	stmfd	sp!, {r10-r11, lr}
532
533	GET_PCB(r10)
534	ldr	r10, [r10]		/* r10 = pointer used as PCB base below */
535
536	mov	r3, #0x00		/* Fault handler: nothing extra on the stack */
537	adr	ip, .Lcopyout_fault
538	ldr	r11, [r10, #PCB_ONFAULT]	/* r11 = previous handler, kept for restore */
539	str	ip, [r10, #PCB_ONFAULT]	/* Install our fault handler */
540	bl	.Lcopyout_guts
541	str	r11, [r10, #PCB_ONFAULT]	/* Put the previous handler back */
542	mov	r0, #0x00		/* Success */
543	ldmfd	sp!, {r10-r11, pc}
544
/*
 * Fault exit for copyout; its address is what .Lnormale stores in
 * PCB_ONFAULT.  Expects r10 = PCB base, r11 = previous PCB_ONFAULT value
 * and r3 = stack state left by the copy code: 0 means nothing extra was
 * pushed, > 0 means r4-r7 are on the stack, < 0 means r4-r9 are.
 * Returns EFAULT in r0 to copyout's caller by popping the frame pushed
 * at .Lnormale.
 */
545.Lcopyout_fault:
546	ldr	r0, =EFAULT
547	str	r11, [r10, #PCB_ONFAULT]	/* Put the previous handler back */
548	cmp	r3, #0x00
549	ldmgtfd	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
550	ldmltfd	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
551	ldmfd	sp!, {r10-r11, pc}
552
/*
 * Copy loop for copyout, entered with bl from .Lnormale.
 *   r0 = source (kernel) pointer
 *   r1 = destination (user) pointer, written with strt/strbt, i.e.
 *        stores made with user-mode permissions
 *   r2 = bytes remaining (> 0 on entry)
 *   r3 = stack state for .Lcopyout_fault (0 on entry); it is kept in
 *        step with every push/pop of r4-r9 or r4-r7 below
 *   ip = scratch
 * Returns with RET once everything has been copied.
 */
553.Lcopyout_guts:
554	pld	[r0]
555	/* Word-align the destination buffer */
556	ands	ip, r1, #0x03		/* Already word aligned? */
557	beq	.Lcopyout_wordaligned	/* Yup */
558	rsb	ip, ip, #0x04		/* ip = bytes needed to align (1..3) */
559	cmp	r2, ip			/* Enough bytes left to align it? */
560	blt	.Lcopyout_l4_2		/* Nope. Just copy bytewise */
561	sub	r2, r2, ip
	/*
	 * ip = 3 - bytes needed = number of ldrb/strbt pairs to skip.
	 * Each pair is 8 bytes and pc reads as the first ldrb, hence the
	 * shift by 3; with ip == 0 the add is skipped and all three run.
	 */
562	rsbs	ip, ip, #0x03
563	addne	pc, pc, ip, lsl #3
564	nop
565	ldrb	ip, [r0], #0x01
566	strbt	ip, [r1], #0x01
567	ldrb	ip, [r0], #0x01
568	strbt	ip, [r1], #0x01
569	ldrb	ip, [r0], #0x01
570	strbt	ip, [r1], #0x01
571	cmp	r2, #0x00		/* All done? */
572	RETeq
573
574	/* Destination buffer is now word aligned */
575.Lcopyout_wordaligned:
576	ands	ip, r0, #0x03		/* Is src also word-aligned? */
577	bne	.Lcopyout_bad_align	/* Nope. Things just got bad */
578	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
579	blt	.Lcopyout_w_less_than8
580
581	/*
	 * Quad-align the source buffer (ldrd is used on it below).  r0 is
	 * already word aligned, so at most one word is moved.  This happens
	 * before r4-r9 are pushed, while r3 is still 0.
	 */
582	tst	r0, #0x07		/* Already quad aligned? */
583	ldrne	ip, [r0], #0x04
584	subne	r2, r2, #0x04
585	strnet	ip, [r1], #0x04
586
587	stmfd	sp!, {r4-r9}		/* Free up some registers */
588	mov	r3, #-1			/* Signal restore r4-r9 */
589
590	/* Destination buffer word aligned, source is quad aligned */
591	subs	r2, r2, #0x80		/* r2 is biased by -0x80 inside the loop */
592	blt	.Lcopyout_w_lessthan128
593
594	/*
	 * Copy 128 bytes at a time.  Loads and stores are interleaved so
	 * each pair of strt writes registers loaded a few instructions
	 * earlier; the LD/ST comments give the byte offsets in the chunk.
	 */
595.Lcopyout_w_loop128:
596	ldrd	r4, [r0], #0x08		/* LD:00-07 */
597	pld	[r0, #0x18]		/* Prefetch 0x20 */
598	ldrd	r6, [r0], #0x08		/* LD:08-0f */
599	ldrd	r8, [r0], #0x08		/* LD:10-17 */
600	strt	r4, [r1], #0x04		/* ST:00-03 */
601	strt	r5, [r1], #0x04		/* ST:04-07 */
602	ldrd	r4, [r0], #0x08		/* LD:18-1f */
603	strt	r6, [r1], #0x04		/* ST:08-0b */
604	strt	r7, [r1], #0x04		/* ST:0c-0f */
605	ldrd	r6, [r0], #0x08		/* LD:20-27 */
606	pld	[r0, #0x18]		/* Prefetch 0x40 */
607	strt	r8, [r1], #0x04		/* ST:10-13 */
608	strt	r9, [r1], #0x04		/* ST:14-17 */
609	ldrd	r8, [r0], #0x08		/* LD:28-2f */
610	strt	r4, [r1], #0x04		/* ST:18-1b */
611	strt	r5, [r1], #0x04		/* ST:1c-1f */
612	ldrd	r4, [r0], #0x08		/* LD:30-37 */
613	strt	r6, [r1], #0x04		/* ST:20-23 */
614	strt	r7, [r1], #0x04		/* ST:24-27 */
615	ldrd	r6, [r0], #0x08		/* LD:38-3f */
616	strt	r8, [r1], #0x04		/* ST:28-2b */
617	strt	r9, [r1], #0x04		/* ST:2c-2f */
618	ldrd	r8, [r0], #0x08		/* LD:40-47 */
619	pld	[r0, #0x18]		/* Prefetch 0x60 */
620	strt	r4, [r1], #0x04		/* ST:30-33 */
621	strt	r5, [r1], #0x04		/* ST:34-37 */
622	ldrd	r4, [r0], #0x08		/* LD:48-4f */
623	strt	r6, [r1], #0x04		/* ST:38-3b */
624	strt	r7, [r1], #0x04		/* ST:3c-3f */
625	ldrd	r6, [r0], #0x08		/* LD:50-57 */
626	strt	r8, [r1], #0x04		/* ST:40-43 */
627	strt	r9, [r1], #0x04		/* ST:44-47 */
628	ldrd	r8, [r0], #0x08		/* LD:58-5f */
629	strt	r4, [r1], #0x04		/* ST:48-4b */
630	strt	r5, [r1], #0x04		/* ST:4c-4f */
631	ldrd	r4, [r0], #0x08		/* LD:60-67 */
632	pld	[r0, #0x18]		/* Prefetch 0x80 */
633	strt	r6, [r1], #0x04		/* ST:50-53 */
634	strt	r7, [r1], #0x04		/* ST:54-57 */
635	ldrd	r6, [r0], #0x08		/* LD:68-6f */
636	strt	r8, [r1], #0x04		/* ST:58-5b */
637	strt	r9, [r1], #0x04		/* ST:5c-5f */
638	ldrd	r8, [r0], #0x08		/* LD:70-77 */
639	strt	r4, [r1], #0x04		/* ST:60-63 */
640	strt	r5, [r1], #0x04		/* ST:64-67 */
641	ldrd	r4, [r0], #0x08		/* LD:78-7f */
642	strt	r6, [r1], #0x04		/* ST:68-6b */
643	strt	r7, [r1], #0x04		/* ST:6c-6f */
644	strt	r8, [r1], #0x04		/* ST:70-73 */
645	strt	r9, [r1], #0x04		/* ST:74-77 */
646	subs	r2, r2, #0x80		/* Flags survive the strt pair for the bge */
647	strt	r4, [r1], #0x04		/* ST:78-7b */
648	strt	r5, [r1], #0x04		/* ST:7c-7f */
649	bge	.Lcopyout_w_loop128
650
651.Lcopyout_w_lessthan128:
652	adds	r2, r2, #0x80		/* Adjust for extra sub */
653	ldmeqfd	sp!, {r4-r9}
654	RETeq				/* Return now if done */
655	subs	r2, r2, #0x20		/* r2 is biased by -0x20 inside the loop */
656	blt	.Lcopyout_w_lessthan32
657
658	/* Copy 32 bytes at a time */
659.Lcopyout_w_loop32:
660	ldrd	r4, [r0], #0x08
661	pld	[r0, #0x18]
662	ldrd	r6, [r0], #0x08
663	ldrd	r8, [r0], #0x08
664	strt	r4, [r1], #0x04
665	strt	r5, [r1], #0x04
666	ldrd	r4, [r0], #0x08
667	strt	r6, [r1], #0x04
668	strt	r7, [r1], #0x04
669	strt	r8, [r1], #0x04
670	strt	r9, [r1], #0x04
671	subs	r2, r2, #0x20
672	strt	r4, [r1], #0x04
673	strt	r5, [r1], #0x04
674	bge	.Lcopyout_w_loop32
675
676.Lcopyout_w_lessthan32:
677	adds	r2, r2, #0x20		/* Adjust for extra sub */
678	ldmeqfd	sp!, {r4-r9}
679	RETeq				/* Return now if done */
680
	/*
	 * 1..31 bytes remain.  r4 = bytes to move in 8-byte chunks (0, 8,
	 * 16 or 24); r2 keeps the remaining 0..7.  Jump into the chunk
	 * slots below: each slot is four instructions (16 bytes, the nop
	 * inside is padding), so skipping r5/8 slots means adding r5 * 2.
	 * The flags set by the subs are still valid at the RETeq after the
	 * slots because nothing in between changes them.
	 */
681	and	r4, r2, #0x18
682	rsb	r5, r4, #0x18
683	subs	r2, r2, r4
684	add	pc, pc, r5, lsl #1
685	nop
686
687	/* At least 24 bytes remaining */
688	ldrd	r4, [r0], #0x08
689	strt	r4, [r1], #0x04
690	strt	r5, [r1], #0x04
691	nop
692
693	/* At least 16 bytes remaining */
694	ldrd	r4, [r0], #0x08
695	strt	r4, [r1], #0x04
696	strt	r5, [r1], #0x04
697	nop
698
699	/* At least 8 bytes remaining */
700	ldrd	r4, [r0], #0x08
701	strt	r4, [r1], #0x04
702	strt	r5, [r1], #0x04
703	nop
704
705	/* Less than 8 bytes remaining */
706	ldmfd	sp!, {r4-r9}
707	RETeq				/* Return now if done */
708	mov	r3, #0x00		/* r4-r9 popped: nothing extra on the stack */
709
	/* 1..7 bytes remain; both pointers are word aligned */
710.Lcopyout_w_less_than8:
711	subs	r2, r2, #0x04
712	ldrge	ip, [r0], #0x04		/* At least 4: move one word */
713	strget	ip, [r1], #0x04
714	RETeq				/* Return now if done */
715	addlt	r2, r2, #0x04		/* Fewer than 4: undo the sub */
	/*
	 * 1..3 bytes remain.  The flags from the cmp pick how many are
	 * moved: always one, a second if r2 >= 2, a third if r2 > 2.  r2 is
	 * reused as a data register once the compare has been made.
	 */
716	ldrb	ip, [r0], #0x01
717	cmp	r2, #0x02
718	ldrgeb	r2, [r0], #0x01
719	strbt	ip, [r1], #0x01
720	ldrgtb	ip, [r0]
721	strgebt	r2, [r1], #0x01
722	strgtbt	ip, [r1]
723	RET
724
725/*
726 * At this point, it has not been possible to word align both buffers.
727 * The destination buffer (r1) is word aligned, but the source buffer
728 * (r0) is not.
 *
 * On entry ip = r0 & 3 (1..3), as computed at .Lcopyout_wordaligned.
 * The source is read as whole aligned words with ldr; each output word
 * is assembled by shifting the tail of the previous source word together
 * with the head of the next one, then written with strt.  ip always
 * carries the most recently loaded source word.  There is one variant
 * per misalignment: bad1, bad2 and bad3.  The __ARMEB__ branches hold
 * the shift directions for big-endian builds.
 *
 * r4-r7 are pushed for the duration and r3 is set to 1 so that
 * .Lcopyout_fault pops them.
729 */
730.Lcopyout_bad_align:
731	stmfd	sp!, {r4-r7}
732	mov	r3, #0x01		/* Signal restore r4-r7 */
733	bic	r0, r0, #0x03		/* Round the source down to a word */
734	cmp	ip, #2			/* Flags survive the load below */
735	ldr	ip, [r0], #0x04		/* First source word; replaces the offset in ip */
736	bgt	.Lcopyout_bad3
737	beq	.Lcopyout_bad2
738	b	.Lcopyout_bad1
739
	/* Source offset 1: 16 bytes per pass, 3 bytes taken from the old word */
740.Lcopyout_bad1_loop16:
741#ifdef	__ARMEB__
742	mov	r4, ip, lsl #8
743#else
744	mov	r4, ip, lsr #8
745#endif
746	ldr	r5, [r0], #0x04
747	pld	[r0, #0x018]
748	ldr	r6, [r0], #0x04
749	ldr	r7, [r0], #0x04
750	ldr	ip, [r0], #0x04
751#ifdef	__ARMEB__
752	orr	r4, r4, r5, lsr #24
753	mov	r5, r5, lsl #8
754	orr	r5, r5, r6, lsr #24
755	mov	r6, r6, lsl #8
756	orr	r6, r6, r7, lsr #24
757	mov	r7, r7, lsl #8
758	orr	r7, r7, ip, lsr #24
759#else
760	orr	r4, r4, r5, lsl #24
761	mov	r5, r5, lsr #8
762	orr	r5, r5, r6, lsl #24
763	mov	r6, r6, lsr #8
764	orr	r6, r6, r7, lsl #24
765	mov	r7, r7, lsr #8
766	orr	r7, r7, ip, lsl #24
767#endif
768	strt	r4, [r1], #0x04
769	strt	r5, [r1], #0x04
770	strt	r6, [r1], #0x04
771	strt	r7, [r1], #0x04
772.Lcopyout_bad1:
773	subs	r2, r2, #0x10
774	bge	.Lcopyout_bad1_loop16
775
776	adds	r2, r2, #0x10		/* Undo the extra sub */
777	ldmeqfd	sp!, {r4-r7}
778	RETeq				/* Return now if done */
779	subs	r2, r2, #0x04
780	sublt	r0, r0, #0x03		/* Back up to the first uncopied byte */
781	blt	.Lcopyout_l4
782
	/* Source offset 1: one word per pass */
783.Lcopyout_bad1_loop4:
784#ifdef __ARMEB__
785	mov	r4, ip, lsl #8
786#else
787	mov	r4, ip, lsr #8
788#endif
789	ldr	ip, [r0], #0x04
790	subs	r2, r2, #0x04		/* Flags survive until the bge */
791#ifdef __ARMEB__
792	orr	r4, r4, ip, lsr #24
793#else
794	orr	r4, r4, ip, lsl #24
795#endif
796	strt	r4, [r1], #0x04
797	bge	.Lcopyout_bad1_loop4
798	sub	r0, r0, #0x03		/* Back up to the first uncopied byte */
799	b	.Lcopyout_l4
800
	/* Source offset 2: 16 bytes per pass, 2 bytes taken from the old word */
801.Lcopyout_bad2_loop16:
802#ifdef __ARMEB__
803	mov	r4, ip, lsl #16
804#else
805	mov	r4, ip, lsr #16
806#endif
807	ldr	r5, [r0], #0x04
808	pld	[r0, #0x018]
809	ldr	r6, [r0], #0x04
810	ldr	r7, [r0], #0x04
811	ldr	ip, [r0], #0x04
812#ifdef __ARMEB__
813	orr	r4, r4, r5, lsr #16
814	mov	r5, r5, lsl #16
815	orr	r5, r5, r6, lsr #16
816	mov	r6, r6, lsl #16
817	orr	r6, r6, r7, lsr #16
818	mov	r7, r7, lsl #16
819	orr	r7, r7, ip, lsr #16
820#else
821	orr	r4, r4, r5, lsl #16
822	mov	r5, r5, lsr #16
823	orr	r5, r5, r6, lsl #16
824	mov	r6, r6, lsr #16
825	orr	r6, r6, r7, lsl #16
826	mov	r7, r7, lsr #16
827	orr	r7, r7, ip, lsl #16
828#endif
829	strt	r4, [r1], #0x04
830	strt	r5, [r1], #0x04
831	strt	r6, [r1], #0x04
832	strt	r7, [r1], #0x04
833.Lcopyout_bad2:
834	subs	r2, r2, #0x10
835	bge	.Lcopyout_bad2_loop16
836
837	adds	r2, r2, #0x10		/* Undo the extra sub */
838	ldmeqfd	sp!, {r4-r7}
839	RETeq				/* Return now if done */
840	subs	r2, r2, #0x04
841	sublt	r0, r0, #0x02		/* Back up to the first uncopied byte */
842	blt	.Lcopyout_l4
843
	/* Source offset 2: one word per pass */
844.Lcopyout_bad2_loop4:
845#ifdef __ARMEB__
846	mov	r4, ip, lsl #16
847#else
848	mov	r4, ip, lsr #16
849#endif
850	ldr	ip, [r0], #0x04
851	subs	r2, r2, #0x04		/* Flags survive until the bge */
852#ifdef __ARMEB__
853	orr	r4, r4, ip, lsr #16
854#else
855	orr	r4, r4, ip, lsl #16
856#endif
857	strt	r4, [r1], #0x04
858	bge	.Lcopyout_bad2_loop4
859	sub	r0, r0, #0x02		/* Back up to the first uncopied byte */
860	b	.Lcopyout_l4
861
	/* Source offset 3: 16 bytes per pass, 1 byte taken from the old word */
862.Lcopyout_bad3_loop16:
863#ifdef __ARMEB__
864	mov	r4, ip, lsl #24
865#else
866	mov	r4, ip, lsr #24
867#endif
868	ldr	r5, [r0], #0x04
869	pld	[r0, #0x018]
870	ldr	r6, [r0], #0x04
871	ldr	r7, [r0], #0x04
872	ldr	ip, [r0], #0x04
873#ifdef __ARMEB__
874	orr	r4, r4, r5, lsr #8
875	mov	r5, r5, lsl #24
876	orr	r5, r5, r6, lsr #8
877	mov	r6, r6, lsl #24
878	orr	r6, r6, r7, lsr #8
879	mov	r7, r7, lsl #24
880	orr	r7, r7, ip, lsr #8
881#else
882	orr	r4, r4, r5, lsl #8
883	mov	r5, r5, lsr #24
884	orr	r5, r5, r6, lsl #8
885	mov	r6, r6, lsr #24
886	orr	r6, r6, r7, lsl #8
887	mov	r7, r7, lsr #24
888	orr	r7, r7, ip, lsl #8
889#endif
890	strt	r4, [r1], #0x04
891	strt	r5, [r1], #0x04
892	strt	r6, [r1], #0x04
893	strt	r7, [r1], #0x04
894.Lcopyout_bad3:
895	subs	r2, r2, #0x10
896	bge	.Lcopyout_bad3_loop16
897
898	adds	r2, r2, #0x10		/* Undo the extra sub */
899	ldmeqfd	sp!, {r4-r7}
900	RETeq				/* Return now if done */
901	subs	r2, r2, #0x04
902	sublt	r0, r0, #0x01		/* Back up to the first uncopied byte */
903	blt	.Lcopyout_l4
904
	/* Source offset 3: one word per pass */
905.Lcopyout_bad3_loop4:
906#ifdef __ARMEB__
907	mov	r4, ip, lsl #24
908#else
909	mov	r4, ip, lsr #24
910#endif
911	ldr	ip, [r0], #0x04
912	subs	r2, r2, #0x04		/* Flags survive until the bge */
913#ifdef __ARMEB__
914	orr	r4, r4, ip, lsr #8
915#else
916	orr	r4, r4, ip, lsl #8
917#endif
918	strt	r4, [r1], #0x04
919	bge	.Lcopyout_bad3_loop4
920	sub	r0, r0, #0x01		/* Back up to the first uncopied byte */
921
	/* Common tail: r2 is the remaining count minus 4 */
922.Lcopyout_l4:
923	ldmfd	sp!, {r4-r7}
924	mov	r3, #0x00		/* r4-r7 popped: nothing extra on the stack */
925	adds	r2, r2, #0x04		/* r2 = bytes remaining (0..3) */
926	RETeq
	/*
	 * Copy the last r2 (1..3) bytes.  Also entered directly from
	 * .Lcopyout_guts when too few bytes exist to align the destination.
	 * r2 becomes the number of 8-byte ldrb/strbt pairs to skip; with
	 * r2 == 3 the add is skipped and all three pairs run.
	 */
927.Lcopyout_l4_2:
928	rsbs	r2, r2, #0x03
929	addne	pc, pc, r2, lsl #3
930	nop
931	ldrb	ip, [r0], #0x01
932	strbt	ip, [r1], #0x01
933	ldrb	ip, [r0], #0x01
934	strbt	ip, [r1], #0x01
935	ldrb	ip, [r0]
936	strbt	ip, [r1]
937	RET
938