bcopyinout_xscale.S revision 275767
1/*	$NetBSD: bcopyinout_xscale.S,v 1.3 2003/12/15 09:27:18 scw Exp $	*/
2
3/*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39__FBSDID("$FreeBSD: stable/10/sys/arm/arm/bcopyinout_xscale.S 275767 2014-12-14 16:28:53Z andrew $");
40
/*
 * Assembler setup for this file: unified ARM syntax, code placed in .text.
 * NOTE(review): on ARM GNU as the .align operand is a power-of-two exponent,
 * so ".align 0" requests no extra alignment — confirm the section is already
 * word aligned where this file is assembled/included.
 */
41	.syntax	unified
42	.text
43	.align	0
44
/*
 * GET_PCB(tmp): leave in tmp the ADDRESS of the word that holds the current
 * PCB pointer.  Both users in this file follow it with "ldr r10, [r10]" to
 * fetch the PCB pointer itself.
 *
 *  - _ARM_ARCH_6 variant: read CP15 c13, c0, 4 into tmp and add TD_PCB.
 *    (Presumably that register holds the current thread pointer and TD_PCB
 *    is the offset of its pcb field — confirm against the assym definitions.)
 *  - Otherwise: load tmp from the literal .Lcurpcb, which holds the address
 *    __pcpu + PC_CURPCB.
 */
45#ifdef _ARM_ARCH_6
46#define GET_PCB(tmp) \
47	mrc p15, 0, tmp, c13, c0, 4; \
48	add	tmp, tmp, #(TD_PCB)
49#else
50.Lcurpcb:
51	.word	_C_LABEL(__pcpu) + PC_CURPCB
52#define GET_PCB(tmp) \
53	ldr	tmp, .Lcurpcb
54#endif
55
56/*
57 * r0 = user space address
58 * r1 = kernel space address
59 * r2 = length
60 *
61 * Copies bytes from user space to kernel space
62 */
/*
 * copyin contract, as implemented below:
 *
 *   Returns r0 = 0 on success (a length <= 0 is treated as success), or
 *   r0 = EFAULT when control reaches .Lcopyin_fault.
 *
 *   Source (user) bytes are read with the ldrt/ldrbt forms; destination
 *   (kernel) bytes are written with plain strb/str/strd.
 *
 *   While .Lcopyin_guts runs:
 *     r10 = PCB pointer, r11 = previous PCB_ONFAULT value
 *     r3  = unwind state consumed by .Lcopyin_fault:
 *             r3 == 0  nothing extra pushed by the guts
 *             r3 >  0  r4-r7 are pushed
 *             r3 <  0  r4-r9 are pushed
 *
 *   Several paths use computed jumps ("add pc, pc, ...").  Reading pc yields
 *   the address of the current instruction + 8, so instruction counts after
 *   those jumps must not change.
 */
63ENTRY(copyin)
64	cmp	r2, #0x00
65	movle	r0, #0x00
66	movle	pc, lr			/* Bail early if length is <= 0 */
67
	/*
	 * Optional fast path: if the word referenced through .L_arm_memcpy
	 * is non-zero and the length is at least the value referenced through
	 * .L_min_memcpy_size, call that routine first.  Both literals are
	 * defined outside this view.
	 */
68	ldr	r3, .L_arm_memcpy
69	ldr	r3, [r3]
70	cmp	r3, #0
71	beq	.Lnormal
72	ldr	r3, .L_min_memcpy_size
73	ldr	r3, [r3]
74	cmp	r2, r3
75	blt	.Lnormal
76	stmfd	sp!, {r0-r2, r4, lr}
	/* Swap r0/r1 so the hook is called as (dst = kernel, src = user, len) */
77	mov     r3, r0
78	mov     r0, r1
79	mov     r1, r3
80	mov     r3, #2 /* SRC_IS_USER */
81	ldr	r4, .L_arm_memcpy
	/* Indirect call: lr = address of the cmp below (pc reads as . + 8) */
82	mov	lr, pc
83	ldr	pc, [r4]
	/* Hook returned 0: done.  Otherwise restore args and copy in software */
84	cmp     r0, #0
85	ldmfd   sp!, {r0-r2, r4, lr}
86	moveq	r0, #0
87	RETeq
88
89.Lnormal:
90	stmfd	sp!, {r10-r11, lr}
91
92	GET_PCB(r10)
93	ldr	r10, [r10]
94
	/* r3 = 0: guts has pushed nothing yet.  Install the fault handler. */
95	mov	r3, #0x00
96	adr	ip, .Lcopyin_fault
97	ldr	r11, [r10, #PCB_ONFAULT]
98	str	ip, [r10, #PCB_ONFAULT]
99	bl	.Lcopyin_guts
	/* Normal completion: put back the previous onfault value, return 0 */
100	str	r11, [r10, #PCB_ONFAULT]
101	mov	r0, #0x00
102	ldmfd	sp!, {r10-r11, pc}
103
	/*
	 * Fault path.  Restores the previous onfault value, pops whatever the
	 * guts had pushed (selected by the sign of r3), then returns EFAULT
	 * to copyin's caller.  How control arrives here is outside this file;
	 * presumably the abort handler jumps to the PCB_ONFAULT address.
	 */
104.Lcopyin_fault:
105	ldr	r0, =EFAULT
106	str	r11, [r10, #PCB_ONFAULT]
107	cmp	r3, #0x00
108	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
109	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
110	ldmfd	sp!, {r10-r11, pc}
111
112.Lcopyin_guts:
113	pld	[r0]
114	/* Word-align the destination buffer */
115	ands	ip, r1, #0x03		/* Already word aligned? */
116	beq	.Lcopyin_wordaligned	/* Yup */
	/* ip = 4 - (dst & 3) = number of bytes (1..3) needed to align dst */
117	rsb	ip, ip, #0x04
118	cmp	r2, ip			/* Enough bytes left to align it? */
119	blt	.Lcopyin_l4_2		/* Nope. Just copy bytewise */
120	sub	r2, r2, ip
	/*
	 * ip = 3 - needed = number of 8-byte ldrbt/strb pairs to skip.
	 * Zero: fall through the nop and copy all three bytes.
	 */
121	rsbs	ip, ip, #0x03
122	addne	pc, pc, ip, lsl #3
123	nop
124	ldrbt	ip, [r0], #0x01
125	strb	ip, [r1], #0x01
126	ldrbt	ip, [r0], #0x01
127	strb	ip, [r1], #0x01
128	ldrbt	ip, [r0], #0x01
129	strb	ip, [r1], #0x01
130	cmp	r2, #0x00		/* All done? */
131	RETeq
132
133	/* Destination buffer is now word aligned */
134.Lcopyin_wordaligned:
135	ands	ip, r0, #0x03		/* Is src also word-aligned? */
136	bne	.Lcopyin_bad_align	/* Nope. Things just got bad */
137	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
138	blt	.Lcopyin_w_less_than8
139
140	/* Quad-align the destination buffer */
141	tst	r1, #0x07		/* Already quad aligned? */
142	ldrtne	ip, [r0], #0x04
143	strne	ip, [r1], #0x04
144	subne	r2, r2, #0x04
145	stmfd	sp!, {r4-r9}		/* Free up some registers */
146	mov	r3, #-1			/* Signal restore r4-r9 */
147
148	/* Destination buffer quad aligned, source is word aligned */
	/* r2 is kept biased by -0x80 while the 128-byte loop runs */
149	subs	r2, r2, #0x80
150	blt	.Lcopyin_w_lessthan128
151
152	/* Copy 128 bytes at a time */
153.Lcopyin_w_loop128:
154	ldrt	r4, [r0], #0x04		/* LD:00-03 */
155	ldrt	r5, [r0], #0x04		/* LD:04-07 */
156	pld	[r0, #0x18]		/* Prefetch 0x20 */
157	ldrt	r6, [r0], #0x04		/* LD:08-0b */
158	ldrt	r7, [r0], #0x04		/* LD:0c-0f */
159	ldrt	r8, [r0], #0x04		/* LD:10-13 */
160	ldrt	r9, [r0], #0x04		/* LD:14-17 */
161	strd	r4, [r1], #0x08		/* ST:00-07 */
162	ldrt	r4, [r0], #0x04		/* LD:18-1b */
163	ldrt	r5, [r0], #0x04		/* LD:1c-1f */
164	strd	r6, [r1], #0x08		/* ST:08-0f */
165	ldrt	r6, [r0], #0x04		/* LD:20-23 */
166	ldrt	r7, [r0], #0x04		/* LD:24-27 */
167	pld	[r0, #0x18]		/* Prefetch 0x40 */
168	strd	r8, [r1], #0x08		/* ST:10-17 */
169	ldrt	r8, [r0], #0x04		/* LD:28-2b */
170	ldrt	r9, [r0], #0x04		/* LD:2c-2f */
171	strd	r4, [r1], #0x08		/* ST:18-1f */
172	ldrt	r4, [r0], #0x04		/* LD:30-33 */
173	ldrt	r5, [r0], #0x04		/* LD:34-37 */
174	strd	r6, [r1], #0x08		/* ST:20-27 */
175	ldrt	r6, [r0], #0x04		/* LD:38-3b */
176	ldrt	r7, [r0], #0x04		/* LD:3c-3f */
177	strd	r8, [r1], #0x08		/* ST:28-2f */
178	ldrt	r8, [r0], #0x04		/* LD:40-43 */
179	ldrt	r9, [r0], #0x04		/* LD:44-47 */
180	pld	[r0, #0x18]		/* Prefetch 0x60 */
181	strd	r4, [r1], #0x08		/* ST:30-37 */
182	ldrt	r4, [r0], #0x04		/* LD:48-4b */
183	ldrt	r5, [r0], #0x04		/* LD:4c-4f */
184	strd	r6, [r1], #0x08		/* ST:38-3f */
185	ldrt	r6, [r0], #0x04		/* LD:50-53 */
186	ldrt	r7, [r0], #0x04		/* LD:54-57 */
187	strd	r8, [r1], #0x08		/* ST:40-47 */
188	ldrt	r8, [r0], #0x04		/* LD:58-5b */
189	ldrt	r9, [r0], #0x04		/* LD:5c-5f */
190	strd	r4, [r1], #0x08		/* ST:48-4f */
191	ldrt	r4, [r0], #0x04		/* LD:60-63 */
192	ldrt	r5, [r0], #0x04		/* LD:64-67 */
193	pld	[r0, #0x18]		/* Prefetch 0x80 */
194	strd	r6, [r1], #0x08		/* ST:50-57 */
195	ldrt	r6, [r0], #0x04		/* LD:68-6b */
196	ldrt	r7, [r0], #0x04		/* LD:6c-6f */
197	strd	r8, [r1], #0x08		/* ST:58-5f */
198	ldrt	r8, [r0], #0x04		/* LD:70-73 */
199	ldrt	r9, [r0], #0x04		/* LD:74-77 */
200	strd	r4, [r1], #0x08		/* ST:60-67 */
201	ldrt	r4, [r0], #0x04		/* LD:78-7b */
202	ldrt	r5, [r0], #0x04		/* LD:7c-7f */
203	strd	r6, [r1], #0x08		/* ST:68-6f */
204	strd	r8, [r1], #0x08		/* ST:70-77 */
	/* Flags from this subs feed the bge; the strd in between leaves them */
205	subs	r2, r2, #0x80
206	strd	r4, [r1], #0x08		/* ST:78-7f */
207	bge	.Lcopyin_w_loop128
208
209.Lcopyin_w_lessthan128:
210	adds	r2, r2, #0x80		/* Adjust for extra sub */
211	ldmfdeq	sp!, {r4-r9}
212	RETeq
213	subs	r2, r2, #0x20
214	blt	.Lcopyin_w_lessthan32
215
216	/* Copy 32 bytes at a time */
217.Lcopyin_w_loop32:
218	ldrt	r4, [r0], #0x04
219	ldrt	r5, [r0], #0x04
220	pld	[r0, #0x18]
221	ldrt	r6, [r0], #0x04
222	ldrt	r7, [r0], #0x04
223	ldrt	r8, [r0], #0x04
224	ldrt	r9, [r0], #0x04
225	strd	r4, [r1], #0x08
226	ldrt	r4, [r0], #0x04
227	ldrt	r5, [r0], #0x04
228	strd	r6, [r1], #0x08
229	strd	r8, [r1], #0x08
230	subs	r2, r2, #0x20
231	strd	r4, [r1], #0x08
232	bge	.Lcopyin_w_loop32
233
234.Lcopyin_w_lessthan32:
235	adds	r2, r2, #0x20		/* Adjust for extra sub */
236	ldmfdeq	sp!, {r4-r9}
237	RETeq				/* Return now if done */
238
	/*
	 * r4 = bytes to move in whole 8-byte units (0, 8, 16 or 24).
	 * r5 = 0x18 - r4; each unit below is four instructions (16 bytes),
	 * hence "lsl #1": the jump skips the units that are not needed.
	 * The subs leaves Z set when no sub-8-byte tail remains; nothing up
	 * to the RETeq after the final unit modifies the flags.
	 */
239	and	r4, r2, #0x18
240	rsb	r5, r4, #0x18
241	subs	r2, r2, r4
242	add	pc, pc, r5, lsl #1
243	nop
244
245	/* At least 24 bytes remaining */
246	ldrt	r4, [r0], #0x04
247	ldrt	r5, [r0], #0x04
248	nop
249	strd	r4, [r1], #0x08
250
251	/* At least 16 bytes remaining */
252	ldrt	r4, [r0], #0x04
253	ldrt	r5, [r0], #0x04
254	nop
255	strd	r4, [r1], #0x08
256
257	/* At least 8 bytes remaining */
258	ldrt	r4, [r0], #0x04
259	ldrt	r5, [r0], #0x04
260	nop
261	strd	r4, [r1], #0x08
262
263	/* Less than 8 bytes remaining */
264	ldmfd	sp!, {r4-r9}
265	RETeq				/* Return now if done */
	/* r4-r9 are popped: tell the fault handler nothing is left to restore */
266	mov	r3, #0x00
267
	/* Tail of 1..7 bytes, both pointers word aligned, nothing pushed */
268.Lcopyin_w_less_than8:
269	subs	r2, r2, #0x04
270	ldrtge	ip, [r0], #0x04
271	strge	ip, [r1], #0x04
272	RETeq				/* Return now if done */
273	addlt	r2, r2, #0x04
	/* r2 is now 1..3: copy one byte, a second if r2 >= 2, a third if r2 > 2 */
274	ldrbt	ip, [r0], #0x01
275	cmp	r2, #0x02
276	ldrbtge	r2, [r0], #0x01
277	strb	ip, [r1], #0x01
278	ldrbtgt	ip, [r0]
279	strbge	r2, [r1], #0x01
280	strbgt	ip, [r1]
281	RET
282
283/*
284 * At this point, it has not been possible to word align both buffers.
285 * The destination buffer (r1) is word aligned, but the source buffer
286 * (r0) is not.
287 */
288.Lcopyin_bad_align:
289	stmfd	sp!, {r4-r7}
290	mov	r3, #0x01
	/*
	 * Round the source down to a word boundary and preload that word into
	 * ip.  ip held (src & 3) on entry; the cmp result (taken before ip is
	 * overwritten by the load) selects the 1-, 2- or 3-byte-offset variant.
	 * Each variant builds destination words by merging the unused bytes of
	 * the previous source word with bytes of the next one.
	 */
291	bic	r0, r0, #0x03
292	cmp	ip, #2
293	ldrt	ip, [r0], #0x04
294	bgt	.Lcopyin_bad3
295	beq	.Lcopyin_bad2
296	b	.Lcopyin_bad1
297
298.Lcopyin_bad1_loop16:
299#ifdef __ARMEB__
300	mov	r4, ip, lsl #8
301#else
302	mov	r4, ip, lsr #8
303#endif
304	ldrt	r5, [r0], #0x04
305	pld	[r0, #0x018]
306	ldrt	r6, [r0], #0x04
307	ldrt	r7, [r0], #0x04
308	ldrt	ip, [r0], #0x04
309#ifdef __ARMEB__
310	orr	r4, r4, r5, lsr #24
311	mov	r5, r5, lsl #8
312	orr	r5, r5, r6, lsr #24
313	mov	r6, r6, lsl #8
314	orr	r6, r6, r7, lsr #24
315	mov	r7, r7, lsl #8
316	orr	r7, r7, ip, lsr #24
317#else
318	orr	r4, r4, r5, lsl #24
319	mov	r5, r5, lsr #8
320	orr	r5, r5, r6, lsl #24
321	mov	r6, r6, lsr #8
322	orr	r6, r6, r7, lsl #24
323	mov	r7, r7, lsr #8
324	orr	r7, r7, ip, lsl #24
325#endif
326	str	r4, [r1], #0x04
327	str	r5, [r1], #0x04
328	str	r6, [r1], #0x04
329	str	r7, [r1], #0x04
	/* Entry point for a source offset of 1: loop while >= 16 bytes remain */
330.Lcopyin_bad1:
331	subs	r2, r2, #0x10
332	bge	.Lcopyin_bad1_loop16
333
334	adds	r2, r2, #0x10
335	ldmfdeq	sp!, {r4-r7}
336	RETeq				/* Return now if done */
337	subs	r2, r2, #0x04
	/* Three bytes of ip are still unread: step r0 back onto the first one */
338	sublt	r0, r0, #0x03
339	blt	.Lcopyin_l4
340
341.Lcopyin_bad1_loop4:
342#ifdef __ARMEB__
343	mov	r4, ip, lsl #8
344#else
345	mov	r4, ip, lsr #8
346#endif
347	ldrt	ip, [r0], #0x04
348	subs	r2, r2, #0x04
349#ifdef __ARMEB__
350	orr	r4, r4, ip, lsr #24
351#else
352	orr	r4, r4, ip, lsl #24
353#endif
354	str	r4, [r1], #0x04
355	bge	.Lcopyin_bad1_loop4
356	sub	r0, r0, #0x03
357	b	.Lcopyin_l4
358
359.Lcopyin_bad2_loop16:
360#ifdef __ARMEB__
361	mov	r4, ip, lsl #16
362#else
363	mov	r4, ip, lsr #16
364#endif
365	ldrt	r5, [r0], #0x04
366	pld	[r0, #0x018]
367	ldrt	r6, [r0], #0x04
368	ldrt	r7, [r0], #0x04
369	ldrt	ip, [r0], #0x04
370#ifdef __ARMEB__
371	orr	r4, r4, r5, lsr #16
372	mov	r5, r5, lsl #16
373	orr	r5, r5, r6, lsr #16
374	mov	r6, r6, lsl #16
375	orr	r6, r6, r7, lsr #16
376	mov	r7, r7, lsl #16
377	orr	r7, r7, ip, lsr #16
378#else
379	orr	r4, r4, r5, lsl #16
380	mov	r5, r5, lsr #16
381	orr	r5, r5, r6, lsl #16
382	mov	r6, r6, lsr #16
383	orr	r6, r6, r7, lsl #16
384	mov	r7, r7, lsr #16
385	orr	r7, r7, ip, lsl #16
386#endif
387	str	r4, [r1], #0x04
388	str	r5, [r1], #0x04
389	str	r6, [r1], #0x04
390	str	r7, [r1], #0x04
	/* Entry point for a source offset of 2 */
391.Lcopyin_bad2:
392	subs	r2, r2, #0x10
393	bge	.Lcopyin_bad2_loop16
394
395	adds	r2, r2, #0x10
396	ldmfdeq	sp!, {r4-r7}
397	RETeq				/* Return now if done */
398	subs	r2, r2, #0x04
	/* Two bytes of ip are still unread */
399	sublt	r0, r0, #0x02
400	blt	.Lcopyin_l4
401
402.Lcopyin_bad2_loop4:
403#ifdef __ARMEB__
404	mov	r4, ip, lsl #16
405#else
406	mov	r4, ip, lsr #16
407#endif
408	ldrt	ip, [r0], #0x04
409	subs	r2, r2, #0x04
410#ifdef __ARMEB__
411	orr	r4, r4, ip, lsr #16
412#else
413	orr	r4, r4, ip, lsl #16
414#endif
415	str	r4, [r1], #0x04
416	bge	.Lcopyin_bad2_loop4
417	sub	r0, r0, #0x02
418	b	.Lcopyin_l4
419
420.Lcopyin_bad3_loop16:
421#ifdef __ARMEB__
422	mov	r4, ip, lsl #24
423#else
424	mov	r4, ip, lsr #24
425#endif
426	ldrt	r5, [r0], #0x04
427	pld	[r0, #0x018]
428	ldrt	r6, [r0], #0x04
429	ldrt	r7, [r0], #0x04
430	ldrt	ip, [r0], #0x04
431#ifdef __ARMEB__
432	orr	r4, r4, r5, lsr #8
433	mov	r5, r5, lsl #24
434	orr	r5, r5, r6, lsr #8
435	mov	r6, r6, lsl #24
436	orr	r6, r6, r7, lsr #8
437	mov	r7, r7, lsl #24
438	orr	r7, r7, ip, lsr #8
439#else
440	orr	r4, r4, r5, lsl #8
441	mov	r5, r5, lsr #24
442	orr	r5, r5, r6, lsl #8
443	mov	r6, r6, lsr #24
444	orr	r6, r6, r7, lsl #8
445	mov	r7, r7, lsr #24
446	orr	r7, r7, ip, lsl #8
447#endif
448	str	r4, [r1], #0x04
449	str	r5, [r1], #0x04
450	str	r6, [r1], #0x04
451	str	r7, [r1], #0x04
	/* Entry point for a source offset of 3 */
452.Lcopyin_bad3:
453	subs	r2, r2, #0x10
454	bge	.Lcopyin_bad3_loop16
455
456	adds	r2, r2, #0x10
457	ldmfdeq	sp!, {r4-r7}
458	RETeq				/* Return now if done */
459	subs	r2, r2, #0x04
	/* One byte of ip is still unread */
460	sublt	r0, r0, #0x01
461	blt	.Lcopyin_l4
462
463.Lcopyin_bad3_loop4:
464#ifdef __ARMEB__
465	mov	r4, ip, lsl #24
466#else
467	mov	r4, ip, lsr #24
468#endif
469	ldrt	ip, [r0], #0x04
470	subs	r2, r2, #0x04
471#ifdef __ARMEB__
472	orr	r4, r4, ip, lsr #8
473#else
474	orr	r4, r4, ip, lsl #8
475#endif
476	str	r4, [r1], #0x04
477	bge	.Lcopyin_bad3_loop4
478	sub	r0, r0, #0x01
479
	/*
	 * Common tail for the misaligned-source paths: r2 arrives biased by
	 * -4 and r0 points at the next unread source byte.  Pop r4-r7, clear
	 * r3 so the fault handler pops nothing more, and undo the bias.
	 */
480.Lcopyin_l4:
481	ldmfd	sp!, {r4-r7}
482	mov	r3, #0x00
483	adds	r2, r2, #0x04
484	RETeq
	/*
	 * Bytewise copy of the final 1..3 bytes.  r2 = 3 - remaining is the
	 * number of 8-byte ldrbt/strb pairs to skip; zero falls through the
	 * nop and copies all three.
	 */
485.Lcopyin_l4_2:
486	rsbs	r2, r2, #0x03
487	addne	pc, pc, r2, lsl #3
488	nop
489	ldrbt	ip, [r0], #0x01
490	strb	ip, [r1], #0x01
491	ldrbt	ip, [r0], #0x01
492	strb	ip, [r1], #0x01
493	ldrbt	ip, [r0]
494	strb	ip, [r1]
495	RET
496END(copyin)
497
498/*
499 * r0 = kernel space address
500 * r1 = user space address
501 * r2 = length
502 *
503 * Copies bytes from kernel space to user space
504 */
/*
 * copyout contract, as implemented below:
 *
 *   Returns r0 = 0 on success (a length <= 0 is treated as success), or
 *   r0 = EFAULT when control reaches .Lcopyout_fault.
 *
 *   Source (kernel) bytes are read with plain ldrb/ldr/ldrd; destination
 *   (user) bytes are written with the strt/strbt forms.
 *
 *   While .Lcopyout_guts runs:
 *     r10 = PCB pointer, r11 = previous PCB_ONFAULT value
 *     r3  = unwind state consumed by .Lcopyout_fault:
 *             r3 == 0  nothing extra pushed by the guts
 *             r3 >  0  r4-r7 are pushed
 *             r3 <  0  r4-r9 are pushed
 *
 *   Several paths use computed jumps ("add pc, pc, ...").  Reading pc yields
 *   the address of the current instruction + 8, so instruction counts after
 *   those jumps must not change.
 */
505ENTRY(copyout)
506	cmp	r2, #0x00
507	movle	r0, #0x00
508	movle	pc, lr			/* Bail early if length is <= 0 */
509
	/*
	 * Optional fast path: if the word referenced through .L_arm_memcpy
	 * is non-zero and the length is at least the value referenced through
	 * .L_min_memcpy_size, call that routine first.  Both literals are
	 * defined outside this view.
	 */
510	ldr	r3, .L_arm_memcpy
511	ldr	r3, [r3]
512	cmp	r3, #0
513	beq	.Lnormale
514	ldr	r3, .L_min_memcpy_size
515	ldr	r3, [r3]
516	cmp	r2, r3
517	blt	.Lnormale
518	stmfd	sp!, {r0-r2, r4, lr}
	/* Swap r0/r1 so the hook is called as (dst = user, src = kernel, len) */
519	mov     r3, r0
520	mov     r0, r1
521	mov     r1, r3
522	mov     r3, #1 /* DST_IS_USER */
523	ldr	r4, .L_arm_memcpy
	/* Indirect call: lr = address of the cmp below (pc reads as . + 8) */
524	mov	lr, pc
525	ldr	pc, [r4]
	/* Hook returned 0: done.  Otherwise restore args and copy in software */
526	cmp     r0, #0
527	ldmfd   sp!, {r0-r2, r4, lr}
528	moveq	r0, #0
529	RETeq
530
531.Lnormale:
532	stmfd	sp!, {r10-r11, lr}
533
534	GET_PCB(r10)
535	ldr	r10, [r10]
536
	/* r3 = 0: guts has pushed nothing yet.  Install the fault handler. */
537	mov	r3, #0x00
538	adr	ip, .Lcopyout_fault
539	ldr	r11, [r10, #PCB_ONFAULT]
540	str	ip, [r10, #PCB_ONFAULT]
541	bl	.Lcopyout_guts
	/* Normal completion: put back the previous onfault value, return 0 */
542	str	r11, [r10, #PCB_ONFAULT]
543	mov	r0, #0x00
544	ldmfd	sp!, {r10-r11, pc}
545
	/*
	 * Fault path.  Restores the previous onfault value, pops whatever the
	 * guts had pushed (selected by the sign of r3), then returns EFAULT
	 * to copyout's caller.  How control arrives here is outside this file;
	 * presumably the abort handler jumps to the PCB_ONFAULT address.
	 */
546.Lcopyout_fault:
547	ldr	r0, =EFAULT
548	str	r11, [r10, #PCB_ONFAULT]
549	cmp	r3, #0x00
550	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
551	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
552	ldmfd	sp!, {r10-r11, pc}
553
554.Lcopyout_guts:
555	pld	[r0]
556	/* Word-align the destination buffer */
557	ands	ip, r1, #0x03		/* Already word aligned? */
558	beq	.Lcopyout_wordaligned	/* Yup */
	/* ip = 4 - (dst & 3) = number of bytes (1..3) needed to align dst */
559	rsb	ip, ip, #0x04
560	cmp	r2, ip			/* Enough bytes left to align it? */
561	blt	.Lcopyout_l4_2		/* Nope. Just copy bytewise */
562	sub	r2, r2, ip
	/*
	 * ip = 3 - needed = number of 8-byte ldrb/strbt pairs to skip.
	 * Zero: fall through the nop and copy all three bytes.
	 */
563	rsbs	ip, ip, #0x03
564	addne	pc, pc, ip, lsl #3
565	nop
566	ldrb	ip, [r0], #0x01
567	strbt	ip, [r1], #0x01
568	ldrb	ip, [r0], #0x01
569	strbt	ip, [r1], #0x01
570	ldrb	ip, [r0], #0x01
571	strbt	ip, [r1], #0x01
572	cmp	r2, #0x00		/* All done? */
573	RETeq
574
575	/* Destination buffer is now word aligned */
576.Lcopyout_wordaligned:
577	ands	ip, r0, #0x03		/* Is src also word-aligned? */
578	bne	.Lcopyout_bad_align	/* Nope. Things just got bad */
579	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
580	blt	.Lcopyout_w_less_than8
581
582	/* Quad-align the source buffer (r0), which the ldrd loops read from */
583	tst	r0, #0x07		/* Already quad aligned? */
584	ldrne	ip, [r0], #0x04
585	subne	r2, r2, #0x04
586	strtne	ip, [r1], #0x04
587
588	stmfd	sp!, {r4-r9}		/* Free up some registers */
589	mov	r3, #-1			/* Signal restore r4-r9 */
590
591	/* Destination buffer word aligned, source is quad aligned */
	/* r2 is kept biased by -0x80 while the 128-byte loop runs */
592	subs	r2, r2, #0x80
593	blt	.Lcopyout_w_lessthan128
594
595	/* Copy 128 bytes at a time */
596.Lcopyout_w_loop128:
597	ldrd	r4, [r0], #0x08		/* LD:00-07 */
598	pld	[r0, #0x18]		/* Prefetch 0x20 */
599	ldrd	r6, [r0], #0x08		/* LD:08-0f */
600	ldrd	r8, [r0], #0x08		/* LD:10-17 */
601	strt	r4, [r1], #0x04		/* ST:00-03 */
602	strt	r5, [r1], #0x04		/* ST:04-07 */
603	ldrd	r4, [r0], #0x08		/* LD:18-1f */
604	strt	r6, [r1], #0x04		/* ST:08-0b */
605	strt	r7, [r1], #0x04		/* ST:0c-0f */
606	ldrd	r6, [r0], #0x08		/* LD:20-27 */
607	pld	[r0, #0x18]		/* Prefetch 0x40 */
608	strt	r8, [r1], #0x04		/* ST:10-13 */
609	strt	r9, [r1], #0x04		/* ST:14-17 */
610	ldrd	r8, [r0], #0x08		/* LD:28-2f */
611	strt	r4, [r1], #0x04		/* ST:18-1b */
612	strt	r5, [r1], #0x04		/* ST:1c-1f */
613	ldrd	r4, [r0], #0x08		/* LD:30-37 */
614	strt	r6, [r1], #0x04		/* ST:20-23 */
615	strt	r7, [r1], #0x04		/* ST:24-27 */
616	ldrd	r6, [r0], #0x08		/* LD:38-3f */
617	strt	r8, [r1], #0x04		/* ST:28-2b */
618	strt	r9, [r1], #0x04		/* ST:2c-2f */
619	ldrd	r8, [r0], #0x08		/* LD:40-47 */
620	pld	[r0, #0x18]		/* Prefetch 0x60 */
621	strt	r4, [r1], #0x04		/* ST:30-33 */
622	strt	r5, [r1], #0x04		/* ST:34-37 */
623	ldrd	r4, [r0], #0x08		/* LD:48-4f */
624	strt	r6, [r1], #0x04		/* ST:38-3b */
625	strt	r7, [r1], #0x04		/* ST:3c-3f */
626	ldrd	r6, [r0], #0x08		/* LD:50-57 */
627	strt	r8, [r1], #0x04		/* ST:40-43 */
628	strt	r9, [r1], #0x04		/* ST:44-47 */
629	ldrd	r8, [r0], #0x08		/* LD:58-5f */
630	strt	r4, [r1], #0x04		/* ST:48-4b */
631	strt	r5, [r1], #0x04		/* ST:4c-4f */
632	ldrd	r4, [r0], #0x08		/* LD:60-67 */
633	pld	[r0, #0x18]		/* Prefetch 0x80 */
634	strt	r6, [r1], #0x04		/* ST:50-53 */
635	strt	r7, [r1], #0x04		/* ST:54-57 */
636	ldrd	r6, [r0], #0x08		/* LD:68-6f */
637	strt	r8, [r1], #0x04		/* ST:58-5b */
638	strt	r9, [r1], #0x04		/* ST:5c-5f */
639	ldrd	r8, [r0], #0x08		/* LD:70-77 */
640	strt	r4, [r1], #0x04		/* ST:60-63 */
641	strt	r5, [r1], #0x04		/* ST:64-67 */
642	ldrd	r4, [r0], #0x08		/* LD:78-7f */
643	strt	r6, [r1], #0x04		/* ST:68-6b */
644	strt	r7, [r1], #0x04		/* ST:6c-6f */
645	strt	r8, [r1], #0x04		/* ST:70-73 */
646	strt	r9, [r1], #0x04		/* ST:74-77 */
	/* Flags from this subs feed the bge; the strt pair leaves them alone */
647	subs	r2, r2, #0x80
648	strt	r4, [r1], #0x04		/* ST:78-7b */
649	strt	r5, [r1], #0x04		/* ST:7c-7f */
650	bge	.Lcopyout_w_loop128
651
652.Lcopyout_w_lessthan128:
653	adds	r2, r2, #0x80		/* Adjust for extra sub */
654	ldmfdeq	sp!, {r4-r9}
655	RETeq				/* Return now if done */
656	subs	r2, r2, #0x20
657	blt	.Lcopyout_w_lessthan32
658
659	/* Copy 32 bytes at a time */
660.Lcopyout_w_loop32:
661	ldrd	r4, [r0], #0x08
662	pld	[r0, #0x18]
663	ldrd	r6, [r0], #0x08
664	ldrd	r8, [r0], #0x08
665	strt	r4, [r1], #0x04
666	strt	r5, [r1], #0x04
667	ldrd	r4, [r0], #0x08
668	strt	r6, [r1], #0x04
669	strt	r7, [r1], #0x04
670	strt	r8, [r1], #0x04
671	strt	r9, [r1], #0x04
672	subs	r2, r2, #0x20
673	strt	r4, [r1], #0x04
674	strt	r5, [r1], #0x04
675	bge	.Lcopyout_w_loop32
676
677.Lcopyout_w_lessthan32:
678	adds	r2, r2, #0x20		/* Adjust for extra sub */
679	ldmfdeq	sp!, {r4-r9}
680	RETeq				/* Return now if done */
681
	/*
	 * r4 = bytes to move in whole 8-byte units (0, 8, 16 or 24).
	 * r5 = 0x18 - r4; each unit below is four instructions (16 bytes,
	 * padded with a nop), hence "lsl #1": the jump skips the units that
	 * are not needed.  The subs leaves Z set when no sub-8-byte tail
	 * remains; nothing up to the RETeq after the final unit modifies the
	 * flags.
	 */
682	and	r4, r2, #0x18
683	rsb	r5, r4, #0x18
684	subs	r2, r2, r4
685	add	pc, pc, r5, lsl #1
686	nop
687
688	/* At least 24 bytes remaining */
689	ldrd	r4, [r0], #0x08
690	strt	r4, [r1], #0x04
691	strt	r5, [r1], #0x04
692	nop
693
694	/* At least 16 bytes remaining */
695	ldrd	r4, [r0], #0x08
696	strt	r4, [r1], #0x04
697	strt	r5, [r1], #0x04
698	nop
699
700	/* At least 8 bytes remaining */
701	ldrd	r4, [r0], #0x08
702	strt	r4, [r1], #0x04
703	strt	r5, [r1], #0x04
704	nop
705
706	/* Less than 8 bytes remaining */
707	ldmfd	sp!, {r4-r9}
708	RETeq				/* Return now if done */
	/* r4-r9 are popped: tell the fault handler nothing is left to restore */
709	mov	r3, #0x00
710
	/* Tail of 1..7 bytes, both pointers word aligned, nothing pushed */
711.Lcopyout_w_less_than8:
712	subs	r2, r2, #0x04
713	ldrge	ip, [r0], #0x04
714	strtge	ip, [r1], #0x04
715	RETeq				/* Return now if done */
716	addlt	r2, r2, #0x04
	/* r2 is now 1..3: copy one byte, a second if r2 >= 2, a third if r2 > 2 */
717	ldrb	ip, [r0], #0x01
718	cmp	r2, #0x02
719	ldrbge	r2, [r0], #0x01
720	strbt	ip, [r1], #0x01
721	ldrbgt	ip, [r0]
722	strbtge	r2, [r1], #0x01
723	strbtgt	ip, [r1]
724	RET
725
726/*
727 * At this point, it has not been possible to word align both buffers.
728 * The destination buffer (r1) is word aligned, but the source buffer
729 * (r0) is not.
730 */
731.Lcopyout_bad_align:
732	stmfd	sp!, {r4-r7}
733	mov	r3, #0x01
	/*
	 * Round the source down to a word boundary and preload that word into
	 * ip.  ip held (src & 3) on entry; the cmp result (taken before ip is
	 * overwritten by the load) selects the 1-, 2- or 3-byte-offset variant.
	 * Each variant builds destination words by merging the unused bytes of
	 * the previous source word with bytes of the next one.
	 */
734	bic	r0, r0, #0x03
735	cmp	ip, #2
736	ldr	ip, [r0], #0x04
737	bgt	.Lcopyout_bad3
738	beq	.Lcopyout_bad2
739	b	.Lcopyout_bad1
740
741.Lcopyout_bad1_loop16:
742#ifdef	__ARMEB__
743	mov	r4, ip, lsl #8
744#else
745	mov	r4, ip, lsr #8
746#endif
747	ldr	r5, [r0], #0x04
748	pld	[r0, #0x018]
749	ldr	r6, [r0], #0x04
750	ldr	r7, [r0], #0x04
751	ldr	ip, [r0], #0x04
752#ifdef	__ARMEB__
753	orr	r4, r4, r5, lsr #24
754	mov	r5, r5, lsl #8
755	orr	r5, r5, r6, lsr #24
756	mov	r6, r6, lsl #8
757	orr	r6, r6, r7, lsr #24
758	mov	r7, r7, lsl #8
759	orr	r7, r7, ip, lsr #24
760#else
761	orr	r4, r4, r5, lsl #24
762	mov	r5, r5, lsr #8
763	orr	r5, r5, r6, lsl #24
764	mov	r6, r6, lsr #8
765	orr	r6, r6, r7, lsl #24
766	mov	r7, r7, lsr #8
767	orr	r7, r7, ip, lsl #24
768#endif
769	strt	r4, [r1], #0x04
770	strt	r5, [r1], #0x04
771	strt	r6, [r1], #0x04
772	strt	r7, [r1], #0x04
	/* Entry point for a source offset of 1: loop while >= 16 bytes remain */
773.Lcopyout_bad1:
774	subs	r2, r2, #0x10
775	bge	.Lcopyout_bad1_loop16
776
777	adds	r2, r2, #0x10
778	ldmfdeq	sp!, {r4-r7}
779	RETeq				/* Return now if done */
780	subs	r2, r2, #0x04
	/* Three bytes of ip are still unread: step r0 back onto the first one */
781	sublt	r0, r0, #0x03
782	blt	.Lcopyout_l4
783
784.Lcopyout_bad1_loop4:
785#ifdef __ARMEB__
786	mov	r4, ip, lsl #8
787#else
788	mov	r4, ip, lsr #8
789#endif
790	ldr	ip, [r0], #0x04
791	subs	r2, r2, #0x04
792#ifdef __ARMEB__
793	orr	r4, r4, ip, lsr #24
794#else
795	orr	r4, r4, ip, lsl #24
796#endif
797	strt	r4, [r1], #0x04
798	bge	.Lcopyout_bad1_loop4
799	sub	r0, r0, #0x03
800	b	.Lcopyout_l4
801
802.Lcopyout_bad2_loop16:
803#ifdef __ARMEB__
804	mov	r4, ip, lsl #16
805#else
806	mov	r4, ip, lsr #16
807#endif
808	ldr	r5, [r0], #0x04
809	pld	[r0, #0x018]
810	ldr	r6, [r0], #0x04
811	ldr	r7, [r0], #0x04
812	ldr	ip, [r0], #0x04
813#ifdef __ARMEB__
814	orr	r4, r4, r5, lsr #16
815	mov	r5, r5, lsl #16
816	orr	r5, r5, r6, lsr #16
817	mov	r6, r6, lsl #16
818	orr	r6, r6, r7, lsr #16
819	mov	r7, r7, lsl #16
820	orr	r7, r7, ip, lsr #16
821#else
822	orr	r4, r4, r5, lsl #16
823	mov	r5, r5, lsr #16
824	orr	r5, r5, r6, lsl #16
825	mov	r6, r6, lsr #16
826	orr	r6, r6, r7, lsl #16
827	mov	r7, r7, lsr #16
828	orr	r7, r7, ip, lsl #16
829#endif
830	strt	r4, [r1], #0x04
831	strt	r5, [r1], #0x04
832	strt	r6, [r1], #0x04
833	strt	r7, [r1], #0x04
	/* Entry point for a source offset of 2 */
834.Lcopyout_bad2:
835	subs	r2, r2, #0x10
836	bge	.Lcopyout_bad2_loop16
837
838	adds	r2, r2, #0x10
839	ldmfdeq	sp!, {r4-r7}
840	RETeq				/* Return now if done */
841	subs	r2, r2, #0x04
	/* Two bytes of ip are still unread */
842	sublt	r0, r0, #0x02
843	blt	.Lcopyout_l4
844
845.Lcopyout_bad2_loop4:
846#ifdef __ARMEB__
847	mov	r4, ip, lsl #16
848#else
849	mov	r4, ip, lsr #16
850#endif
851	ldr	ip, [r0], #0x04
852	subs	r2, r2, #0x04
853#ifdef __ARMEB__
854	orr	r4, r4, ip, lsr #16
855#else
856	orr	r4, r4, ip, lsl #16
857#endif
858	strt	r4, [r1], #0x04
859	bge	.Lcopyout_bad2_loop4
860	sub	r0, r0, #0x02
861	b	.Lcopyout_l4
862
863.Lcopyout_bad3_loop16:
864#ifdef __ARMEB__
865	mov	r4, ip, lsl #24
866#else
867	mov	r4, ip, lsr #24
868#endif
869	ldr	r5, [r0], #0x04
870	pld	[r0, #0x018]
871	ldr	r6, [r0], #0x04
872	ldr	r7, [r0], #0x04
873	ldr	ip, [r0], #0x04
874#ifdef __ARMEB__
875	orr	r4, r4, r5, lsr #8
876	mov	r5, r5, lsl #24
877	orr	r5, r5, r6, lsr #8
878	mov	r6, r6, lsl #24
879	orr	r6, r6, r7, lsr #8
880	mov	r7, r7, lsl #24
881	orr	r7, r7, ip, lsr #8
882#else
883	orr	r4, r4, r5, lsl #8
884	mov	r5, r5, lsr #24
885	orr	r5, r5, r6, lsl #8
886	mov	r6, r6, lsr #24
887	orr	r6, r6, r7, lsl #8
888	mov	r7, r7, lsr #24
889	orr	r7, r7, ip, lsl #8
890#endif
891	strt	r4, [r1], #0x04
892	strt	r5, [r1], #0x04
893	strt	r6, [r1], #0x04
894	strt	r7, [r1], #0x04
	/* Entry point for a source offset of 3 */
895.Lcopyout_bad3:
896	subs	r2, r2, #0x10
897	bge	.Lcopyout_bad3_loop16
898
899	adds	r2, r2, #0x10
900	ldmfdeq	sp!, {r4-r7}
901	RETeq				/* Return now if done */
902	subs	r2, r2, #0x04
	/* One byte of ip is still unread */
903	sublt	r0, r0, #0x01
904	blt	.Lcopyout_l4
905
906.Lcopyout_bad3_loop4:
907#ifdef __ARMEB__
908	mov	r4, ip, lsl #24
909#else
910	mov	r4, ip, lsr #24
911#endif
912	ldr	ip, [r0], #0x04
913	subs	r2, r2, #0x04
914#ifdef __ARMEB__
915	orr	r4, r4, ip, lsr #8
916#else
917	orr	r4, r4, ip, lsl #8
918#endif
919	strt	r4, [r1], #0x04
920	bge	.Lcopyout_bad3_loop4
921	sub	r0, r0, #0x01
922
	/*
	 * Common tail for the misaligned-source paths: r2 arrives biased by
	 * -4 and r0 points at the next unread source byte.  Pop r4-r7, clear
	 * r3 so the fault handler pops nothing more, and undo the bias.
	 */
923.Lcopyout_l4:
924	ldmfd	sp!, {r4-r7}
925	mov	r3, #0x00
926	adds	r2, r2, #0x04
927	RETeq
	/*
	 * Bytewise copy of the final 1..3 bytes.  r2 = 3 - remaining is the
	 * number of 8-byte ldrb/strbt pairs to skip; zero falls through the
	 * nop and copies all three.
	 */
928.Lcopyout_l4_2:
929	rsbs	r2, r2, #0x03
930	addne	pc, pc, r2, lsl #3
931	nop
932	ldrb	ip, [r0], #0x01
933	strbt	ip, [r1], #0x01
934	ldrb	ip, [r0], #0x01
935	strbt	ip, [r1], #0x01
936	ldrb	ip, [r0]
937	strbt	ip, [r1]
938	RET
939END(copyout)
940
941