1/*	$NetBSD: bcopyinout_xscale.S,v 1.3 2003/12/15 09:27:18 scw Exp $	*/
2
3/*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39__FBSDID("$FreeBSD$");
40
41	.syntax	unified
42	.text
43	.align	2
44
/*
 * GET_PCB(tmp) leaves in tmp the address of the word that holds the
 * current PCB pointer; every user in this file follows the macro with
 * "ldr rX, [rX]" to fetch the pointer itself.
 *
 *  - __ARM_ARCH >= 6: tmp is read from CP15 c13, c0, opcode2 4 and TD_PCB
 *    is added to it (presumably that register holds the current thread
 *    pointer and TD_PCB is the offset of its pcb field -- confirm against
 *    the definitions, which are not in this file).
 *  - otherwise: tmp is loaded from the literal .Lcurpcb, which holds the
 *    address __pcpu + PC_CURPCB.
 */
45#if __ARM_ARCH >= 6
46#define GET_PCB(tmp) \
47	mrc p15, 0, tmp, c13, c0, 4; \
48	add	tmp, tmp, #(TD_PCB)
49#else
50.Lcurpcb:
51	.word	_C_LABEL(__pcpu) + PC_CURPCB
52#define GET_PCB(tmp) \
53	ldr	tmp, .Lcurpcb
54#endif
55
56/*
57 * r0 = user space address
58 * r1 = kernel space address
59 * r2 = length
60 *
61 * Copies bytes from user space to kernel space
62 */
63ENTRY(copyin)
	/*
	 * Argument checks.  The length is tested as a signed value: a zero
	 * or "negative" length returns 0 without copying anything.
	 * NOTE(review): a length >= 2^31 is therefore also reported as
	 * success; the rest of the routine relies on signed compares of r2,
	 * so this is left unchanged here.
	 */
64	cmp	r2, #0x00
65	movle	r0, #0x00
66	movle	pc, lr			/* Bail early if length is <= 0 */
67
	/* r3 = uaddr + len; a carry out means the range wraps past 2^32 */
68	adds	r3, r0, r2
69	movcs	r0, #EFAULT
70	RETc(cs)
71
	/* Fail if the end of the range is >= VM_MAXUSER_ADDRESS + 1 (unsigned) */
72	ldr	r12, =(VM_MAXUSER_ADDRESS + 1)
73	cmp	r3, r12
74	movcs	r0, #EFAULT
75	RETc(cs)
76
	/*
	 * Optional accelerated copy.  .L_arm_memcpy and .L_min_memcpy_size
	 * (not defined in this part of the file) are literals holding the
	 * addresses of a function pointer and of a minimum length.  The
	 * function is used only if the pointer is non-zero and len is at
	 * least the minimum (signed compare); otherwise go to .Lnormal.
	 */
77	ldr	r3, .L_arm_memcpy
78	ldr	r3, [r3]
79	cmp	r3, #0
80	beq	.Lnormal
81	ldr	r3, .L_min_memcpy_size
82	ldr	r3, [r3]
83	cmp	r2, r3
84	blt	.Lnormal
	/*
	 * Save the arguments so they can be restored if the call fails.
	 * r4 (callee-saved) holds the address of the function pointer.
	 * r5 is pushed purely as padding: six words (24 bytes) keep sp
	 * 8-byte aligned at the call, as the AAPCS requires at a public
	 * interface; pushing only five words left it misaligned by 4.
	 */
85	stmfd	sp!, {r0-r2, r4-r5, lr}
86	mov     r3, r0
87	mov     r0, r1			/* r0 = destination (kernel address) */
88	mov     r1, r3			/* r1 = source (user address) */
89	mov     r3, #2 /* SRC_IS_USER */
90	ldr	r4, .L_arm_memcpy
91	mov	lr, pc			/* lr = instruction after the ldr pc */
92	ldr	pc, [r4]
93	cmp     r0, #0			/* a zero result is treated as success */
94	ldmfd   sp!, {r0-r2, r4-r5, lr}	/* ldm leaves the flags alone */
95	moveq	r0, #0
96	RETeq
	/* Non-zero result: fall through to .Lnormal with r0-r2 restored */
97
/*
 * Generic copyin path: run .Lcopyin_guts with .Lcopyin_fault stored in
 * the PCB_ONFAULT slot of the current PCB (presumably the address at
 * which the abort handler resumes if a user access faults; that code is
 * not in this file).  r10 and r11 stay live across the call so that the
 * fault path can restore the slot.
 */
98.Lnormal:
99	stmfd	sp!, {r10-r11, lr}
100
101	GET_PCB(r10)
102	ldr	r10, [r10]		/* r10 = PCB pointer */
103
	/*
	 * r3 tells .Lcopyin_fault what .Lcopyin_guts currently has on the
	 * stack: 0 = nothing, > 0 = r4-r7, < 0 = r4-r9.
	 */
104	mov	r3, #0x00
105	adr	ip, .Lcopyin_fault
106	ldr	r11, [r10, #PCB_ONFAULT]	/* r11 = previous slot value */
107	str	ip, [r10, #PCB_ONFAULT]
108	bl	.Lcopyin_guts
109	str	r11, [r10, #PCB_ONFAULT]	/* put the previous value back */
110	mov	r0, #0x00		/* copy completed: return 0 */
111	ldmfd	sp!, {r10-r11, pc}
112
/*
 * Address stored in PCB_ONFAULT by .Lnormal.  Expects r10 = PCB pointer,
 * r11 = previous PCB_ONFAULT value and r3 = stack marker set by
 * .Lcopyin_guts.  Restores the slot, unwinds whatever the copy code had
 * pushed, and returns EFAULT to copyin's caller using the frame pushed
 * by .Lnormal.
 */
113.Lcopyin_fault:
114	ldr	r0, =EFAULT
115	str	r11, [r10, #PCB_ONFAULT]
116	cmp	r3, #0x00
117	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
118	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
119	ldmfd	sp!, {r10-r11, pc}
120
/*
 * The copy proper, called with bl from .Lnormal.
 *   r0 = user source, r1 = kernel destination, r2 = length (> 0 after
 *   the checks at the top of copyin), r3 = 0.
 * User memory is read only with the "T" load forms (ldrt/ldrbt).
 * Uses r0-r3 and ip as scratch; r4-r9 are pushed before use and r3 is
 * kept in step with what is on the stack (see .Lcopyin_fault).
 */
121.Lcopyin_guts:
122	pld	[r0]
123	/* Word-align the destination buffer */
124	ands	ip, r1, #0x03		/* Already word aligned? */
125	beq	.Lcopyin_wordaligned	/* Yup */
	/* ip = number of bytes (1..3) needed to reach a word boundary */
126	rsb	ip, ip, #0x04
127	cmp	r2, ip			/* Enough bytes left to align it? */
128	blt	.Lcopyin_l4_2		/* Nope. Just copy bytewise */
129	sub	r2, r2, ip
	/*
	 * Computed jump into the three byte copies below.  ip becomes
	 * 3 - ip, the number of ldrbt/strb pairs to skip.  pc reads as the
	 * address of the add plus 8, i.e. the first ldrbt, and each pair
	 * occupies 8 bytes, hence "lsl #3".  When ip is 0 the add is not
	 * executed and all three pairs run.  Do not insert or remove
	 * instructions between the add and the last strb.
	 */
130	rsbs	ip, ip, #0x03
131	addne	pc, pc, ip, lsl #3
132	nop
133	ldrbt	ip, [r0], #0x01
134	strb	ip, [r1], #0x01
135	ldrbt	ip, [r0], #0x01
136	strb	ip, [r1], #0x01
137	ldrbt	ip, [r0], #0x01
138	strb	ip, [r1], #0x01
139	cmp	r2, #0x00		/* All done? */
140	RETeq
141
142	/* Destination buffer is now word aligned */
143.Lcopyin_wordaligned:
	/* ip = r0 & 3 is consumed by .Lcopyin_bad_align to pick its variant */
144	ands	ip, r0, #0x03		/* Is src also word-aligned? */
145	bne	.Lcopyin_bad_align	/* Nope. Things just got bad */
146	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
147	blt	.Lcopyin_w_less_than8
148
149	/* Quad-align the destination buffer */
	/* (copy one word if r1 is not yet 8-byte aligned, for the strd stores) */
150	tst	r1, #0x07		/* Already quad aligned? */
151	ldrtne	ip, [r0], #0x04
152	strne	ip, [r1], #0x04
153	subne	r2, r2, #0x04
154	stmfd	sp!, {r4-r9}		/* Free up some registers */
155	mov	r3, #-1			/* Signal restore r4-r9 */
156
157	/* Destination buffer quad aligned, source is word aligned */
	/* r2 is kept biased by -0x80 while the 128-byte loop runs */
158	subs	r2, r2, #0x80
159	blt	.Lcopyin_w_lessthan128
160
161	/* Copy 128 bytes at a time */
/*
 * 128 bytes per iteration.  On entry r2 = (bytes remaining - 0x80) >= 0,
 * r4-r9 are saved on the stack and r3 = -1.  Words are read from user
 * space with ldrt into r4-r9 and written as the register pairs r4/r5,
 * r6/r7 and r8/r9 with strd; loads for later pairs are interleaved with
 * stores of earlier ones.  The LD:/ST: notes give byte offsets within
 * the current 128-byte chunk.
 */
162.Lcopyin_w_loop128:
163	ldrt	r4, [r0], #0x04		/* LD:00-03 */
164	ldrt	r5, [r0], #0x04		/* LD:04-07 */
165	pld	[r0, #0x18]		/* Prefetch 0x20 */
166	ldrt	r6, [r0], #0x04		/* LD:08-0b */
167	ldrt	r7, [r0], #0x04		/* LD:0c-0f */
168	ldrt	r8, [r0], #0x04		/* LD:10-13 */
169	ldrt	r9, [r0], #0x04		/* LD:14-17 */
170	strd	r4, [r1], #0x08		/* ST:00-07 */
171	ldrt	r4, [r0], #0x04		/* LD:18-1b */
172	ldrt	r5, [r0], #0x04		/* LD:1c-1f */
173	strd	r6, [r1], #0x08		/* ST:08-0f */
174	ldrt	r6, [r0], #0x04		/* LD:20-23 */
175	ldrt	r7, [r0], #0x04		/* LD:24-27 */
176	pld	[r0, #0x18]		/* Prefetch 0x40 */
177	strd	r8, [r1], #0x08		/* ST:10-17 */
178	ldrt	r8, [r0], #0x04		/* LD:28-2b */
179	ldrt	r9, [r0], #0x04		/* LD:2c-2f */
180	strd	r4, [r1], #0x08		/* ST:18-1f */
181	ldrt	r4, [r0], #0x04		/* LD:30-33 */
182	ldrt	r5, [r0], #0x04		/* LD:34-37 */
183	strd	r6, [r1], #0x08		/* ST:20-27 */
184	ldrt	r6, [r0], #0x04		/* LD:38-3b */
185	ldrt	r7, [r0], #0x04		/* LD:3c-3f */
186	strd	r8, [r1], #0x08		/* ST:28-2f */
187	ldrt	r8, [r0], #0x04		/* LD:40-43 */
188	ldrt	r9, [r0], #0x04		/* LD:44-47 */
189	pld	[r0, #0x18]		/* Prefetch 0x60 */
190	strd	r4, [r1], #0x08		/* ST:30-37 */
191	ldrt	r4, [r0], #0x04		/* LD:48-4b */
192	ldrt	r5, [r0], #0x04		/* LD:4c-4f */
193	strd	r6, [r1], #0x08		/* ST:38-3f */
194	ldrt	r6, [r0], #0x04		/* LD:50-53 */
195	ldrt	r7, [r0], #0x04		/* LD:54-57 */
196	strd	r8, [r1], #0x08		/* ST:40-47 */
197	ldrt	r8, [r0], #0x04		/* LD:58-5b */
198	ldrt	r9, [r0], #0x04		/* LD:5c-5f */
199	strd	r4, [r1], #0x08		/* ST:48-4f */
200	ldrt	r4, [r0], #0x04		/* LD:60-63 */
201	ldrt	r5, [r0], #0x04		/* LD:64-67 */
202	pld	[r0, #0x18]		/* Prefetch 0x80 */
203	strd	r6, [r1], #0x08		/* ST:50-57 */
204	ldrt	r6, [r0], #0x04		/* LD:68-6b */
205	ldrt	r7, [r0], #0x04		/* LD:6c-6f */
206	strd	r8, [r1], #0x08		/* ST:58-5f */
207	ldrt	r8, [r0], #0x04		/* LD:70-73 */
208	ldrt	r9, [r0], #0x04		/* LD:74-77 */
209	strd	r4, [r1], #0x08		/* ST:60-67 */
210	ldrt	r4, [r0], #0x04		/* LD:78-7b */
211	ldrt	r5, [r0], #0x04		/* LD:7c-7f */
212	strd	r6, [r1], #0x08		/* ST:68-6f */
213	strd	r8, [r1], #0x08		/* ST:70-77 */
	/* The flags set here are consumed by the bge; strd leaves them alone */
214	subs	r2, r2, #0x80
215	strd	r4, [r1], #0x08		/* ST:78-7f */
216	bge	.Lcopyin_w_loop128
217
/*
 * Fewer than 128 bytes remain.  r2 arrives biased by -0x80; undo that,
 * return if nothing is left, then copy 32 bytes per iteration while
 * possible.  r4-r9 are still on the stack and r3 is still -1.
 */
218.Lcopyin_w_lessthan128:
219	adds	r2, r2, #0x80		/* Adjust for extra sub */
220	ldmfdeq	sp!, {r4-r9}
221	RETeq
	/* r2 is kept biased by -0x20 while the 32-byte loop runs */
222	subs	r2, r2, #0x20
223	blt	.Lcopyin_w_lessthan32
224
225	/* Copy 32 bytes at a time */
226.Lcopyin_w_loop32:
227	ldrt	r4, [r0], #0x04
228	ldrt	r5, [r0], #0x04
229	pld	[r0, #0x18]
230	ldrt	r6, [r0], #0x04
231	ldrt	r7, [r0], #0x04
232	ldrt	r8, [r0], #0x04
233	ldrt	r9, [r0], #0x04
234	strd	r4, [r1], #0x08
235	ldrt	r4, [r0], #0x04
236	ldrt	r5, [r0], #0x04
237	strd	r6, [r1], #0x08
238	strd	r8, [r1], #0x08
	/* The flags set here are consumed by the bge; strd leaves them alone */
239	subs	r2, r2, #0x20
240	strd	r4, [r1], #0x08
241	bge	.Lcopyin_w_loop32
242
/*
 * Fewer than 32 bytes remain (r2 arrives biased by -0x20).  Copies the
 * remaining whole 8-byte groups (0 to 3 of them) through a computed
 * jump, then hands any final 1..7 bytes to .Lcopyin_w_less_than8.
 */
243.Lcopyin_w_lessthan32:
244	adds	r2, r2, #0x20		/* Adjust for extra sub */
245	ldmfdeq	sp!, {r4-r9}
246	RETeq				/* Return now if done */
247
	/*
	 * r4 = bytes in whole 8-byte groups (0, 8, 16 or 24) and
	 * r5 = 0x18 - r4.  The subs leaves Z set when nothing remains after
	 * those groups; that flag survives until the RETeq below because
	 * none of ldrt, strd, nop or ldmfd changes the flags.  Each group
	 * below is exactly four instructions (16 bytes; the nop is padding),
	 * and pc reads as the first ldrt, so adding r5 * 2 skips
	 * (0x18 - r4) / 8 groups.  Do not change the instruction count.
	 */
248	and	r4, r2, #0x18
249	rsb	r5, r4, #0x18
250	subs	r2, r2, r4
251	add	pc, pc, r5, lsl #1
252	nop
253
254	/* At least 24 bytes remaining */
255	ldrt	r4, [r0], #0x04
256	ldrt	r5, [r0], #0x04
257	nop
258	strd	r4, [r1], #0x08
259
260	/* At least 16 bytes remaining */
261	ldrt	r4, [r0], #0x04
262	ldrt	r5, [r0], #0x04
263	nop
264	strd	r4, [r1], #0x08
265
266	/* At least 8 bytes remaining */
267	ldrt	r4, [r0], #0x04
268	ldrt	r5, [r0], #0x04
269	nop
270	strd	r4, [r1], #0x08
271
272	/* Less than 8 bytes remaining */
273	ldmfd	sp!, {r4-r9}
274	RETeq				/* Return now if done */
	/* r4-r9 are off the stack again, so clear the marker for the fault path */
275	mov	r3, #0x00
276
/*
 * Both buffers word aligned, 1..7 bytes left, nothing extra on the
 * stack (r3 = 0).  Copies one word if at least 4 bytes remain, then the
 * last 1..3 bytes individually.
 */
277.Lcopyin_w_less_than8:
278	subs	r2, r2, #0x04
279	ldrtge	ip, [r0], #0x04
280	strge	ip, [r1], #0x04
281	RETeq				/* Return now if done */
282	addlt	r2, r2, #0x04
	/*
	 * r2 is now 1..3.  After the cmp its value is no longer needed, so
	 * r2 is reused to carry the second byte; the ge/gt conditions all
	 * refer to that single compare with 2.
	 */
283	ldrbt	ip, [r0], #0x01
284	cmp	r2, #0x02
285	ldrbtge	r2, [r0], #0x01
286	strb	ip, [r1], #0x01
287	ldrbtgt	ip, [r0]
288	strbge	r2, [r1], #0x01
289	strbgt	ip, [r1]
290	RET
291
292/*
293 * At this point, it has not been possible to word align both buffers.
294 * The destination buffer (r1) is word aligned, but the source buffer
295 * (r0) is not.
296 */
297.Lcopyin_bad_align:
	/* Entered with ip = r0 & 3 (1, 2 or 3) from .Lcopyin_wordaligned */
298	stmfd	sp!, {r4-r7}
299	mov	r3, #0x01		/* marker for the fault path: r4-r7 pushed */
300	bic	r0, r0, #0x03		/* round the source down to a word */
	/*
	 * The flags from this compare select the variant below; the ldrt in
	 * between overwrites ip with the first aligned source word but does
	 * not change the flags.
	 */
301	cmp	ip, #2
302	ldrt	ip, [r0], #0x04
303	bgt	.Lcopyin_bad3
304	beq	.Lcopyin_bad2
305	b	.Lcopyin_bad1
306
/*
 * Source starts 1 byte past a word boundary.  ip always holds the most
 * recently loaded aligned source word, of which 3 bytes have not been
 * stored yet; each output word is those 3 bytes merged with 1 byte of
 * the next source word (the shift directions depend on __ARMEB__).
 * Execution enters at .Lcopyin_bad1, not at the top of the loop.
 */
307.Lcopyin_bad1_loop16:
308#ifdef __ARMEB__
309	mov	r4, ip, lsl #8
310#else
311	mov	r4, ip, lsr #8
312#endif
313	ldrt	r5, [r0], #0x04
314	pld	[r0, #0x018]
315	ldrt	r6, [r0], #0x04
316	ldrt	r7, [r0], #0x04
317	ldrt	ip, [r0], #0x04
318#ifdef __ARMEB__
319	orr	r4, r4, r5, lsr #24
320	mov	r5, r5, lsl #8
321	orr	r5, r5, r6, lsr #24
322	mov	r6, r6, lsl #8
323	orr	r6, r6, r7, lsr #24
324	mov	r7, r7, lsl #8
325	orr	r7, r7, ip, lsr #24
326#else
327	orr	r4, r4, r5, lsl #24
328	mov	r5, r5, lsr #8
329	orr	r5, r5, r6, lsl #24
330	mov	r6, r6, lsr #8
331	orr	r6, r6, r7, lsl #24
332	mov	r7, r7, lsr #8
333	orr	r7, r7, ip, lsl #24
334#endif
335	str	r4, [r1], #0x04
336	str	r5, [r1], #0x04
337	str	r6, [r1], #0x04
338	str	r7, [r1], #0x04
339.Lcopyin_bad1:
	/* Loop while at least 16 bytes remain */
340	subs	r2, r2, #0x10
341	bge	.Lcopyin_bad1_loop16
342
343	adds	r2, r2, #0x10
344	ldmfdeq	sp!, {r4-r7}
345	RETeq				/* Return now if done */
	/*
	 * Fewer than 4 bytes left: r0 points just past the word held in ip,
	 * so step it back over that word's 3 unconsumed bytes and finish
	 * bytewise.  r2 stays biased by -4; .Lcopyin_l4 undoes that.
	 */
346	subs	r2, r2, #0x04
347	sublt	r0, r0, #0x03
348	blt	.Lcopyin_l4
349
	/* One output word per iteration while at least 4 bytes remain */
350.Lcopyin_bad1_loop4:
351#ifdef __ARMEB__
352	mov	r4, ip, lsl #8
353#else
354	mov	r4, ip, lsr #8
355#endif
356	ldrt	ip, [r0], #0x04
357	subs	r2, r2, #0x04
358#ifdef __ARMEB__
359	orr	r4, r4, ip, lsr #24
360#else
361	orr	r4, r4, ip, lsl #24
362#endif
363	str	r4, [r1], #0x04
364	bge	.Lcopyin_bad1_loop4
365	sub	r0, r0, #0x03
366	b	.Lcopyin_l4
367
/*
 * Source starts 2 bytes past a word boundary.  Same scheme as the
 * 1-byte variant, except that 2 bytes of the word in ip are merged with
 * 2 bytes of the next source word.  Execution enters at .Lcopyin_bad2.
 */
368.Lcopyin_bad2_loop16:
369#ifdef __ARMEB__
370	mov	r4, ip, lsl #16
371#else
372	mov	r4, ip, lsr #16
373#endif
374	ldrt	r5, [r0], #0x04
375	pld	[r0, #0x018]
376	ldrt	r6, [r0], #0x04
377	ldrt	r7, [r0], #0x04
378	ldrt	ip, [r0], #0x04
379#ifdef __ARMEB__
380	orr	r4, r4, r5, lsr #16
381	mov	r5, r5, lsl #16
382	orr	r5, r5, r6, lsr #16
383	mov	r6, r6, lsl #16
384	orr	r6, r6, r7, lsr #16
385	mov	r7, r7, lsl #16
386	orr	r7, r7, ip, lsr #16
387#else
388	orr	r4, r4, r5, lsl #16
389	mov	r5, r5, lsr #16
390	orr	r5, r5, r6, lsl #16
391	mov	r6, r6, lsr #16
392	orr	r6, r6, r7, lsl #16
393	mov	r7, r7, lsr #16
394	orr	r7, r7, ip, lsl #16
395#endif
396	str	r4, [r1], #0x04
397	str	r5, [r1], #0x04
398	str	r6, [r1], #0x04
399	str	r7, [r1], #0x04
400.Lcopyin_bad2:
	/* Loop while at least 16 bytes remain */
401	subs	r2, r2, #0x10
402	bge	.Lcopyin_bad2_loop16
403
404	adds	r2, r2, #0x10
405	ldmfdeq	sp!, {r4-r7}
406	RETeq				/* Return now if done */
	/*
	 * Fewer than 4 bytes left: step r0 back over the 2 unconsumed bytes
	 * of the word in ip and finish bytewise (r2 stays biased by -4).
	 */
407	subs	r2, r2, #0x04
408	sublt	r0, r0, #0x02
409	blt	.Lcopyin_l4
410
	/* One output word per iteration while at least 4 bytes remain */
411.Lcopyin_bad2_loop4:
412#ifdef __ARMEB__
413	mov	r4, ip, lsl #16
414#else
415	mov	r4, ip, lsr #16
416#endif
417	ldrt	ip, [r0], #0x04
418	subs	r2, r2, #0x04
419#ifdef __ARMEB__
420	orr	r4, r4, ip, lsr #16
421#else
422	orr	r4, r4, ip, lsl #16
423#endif
424	str	r4, [r1], #0x04
425	bge	.Lcopyin_bad2_loop4
426	sub	r0, r0, #0x02
427	b	.Lcopyin_l4
428
/*
 * Source starts 3 bytes past a word boundary.  Same scheme as the other
 * two variants, except that 1 byte of the word in ip is merged with
 * 3 bytes of the next source word.  Execution enters at .Lcopyin_bad3,
 * and the tail falls through into .Lcopyin_l4.
 */
429.Lcopyin_bad3_loop16:
430#ifdef __ARMEB__
431	mov	r4, ip, lsl #24
432#else
433	mov	r4, ip, lsr #24
434#endif
435	ldrt	r5, [r0], #0x04
436	pld	[r0, #0x018]
437	ldrt	r6, [r0], #0x04
438	ldrt	r7, [r0], #0x04
439	ldrt	ip, [r0], #0x04
440#ifdef __ARMEB__
441	orr	r4, r4, r5, lsr #8
442	mov	r5, r5, lsl #24
443	orr	r5, r5, r6, lsr #8
444	mov	r6, r6, lsl #24
445	orr	r6, r6, r7, lsr #8
446	mov	r7, r7, lsl #24
447	orr	r7, r7, ip, lsr #8
448#else
449	orr	r4, r4, r5, lsl #8
450	mov	r5, r5, lsr #24
451	orr	r5, r5, r6, lsl #8
452	mov	r6, r6, lsr #24
453	orr	r6, r6, r7, lsl #8
454	mov	r7, r7, lsr #24
455	orr	r7, r7, ip, lsl #8
456#endif
457	str	r4, [r1], #0x04
458	str	r5, [r1], #0x04
459	str	r6, [r1], #0x04
460	str	r7, [r1], #0x04
461.Lcopyin_bad3:
	/* Loop while at least 16 bytes remain */
462	subs	r2, r2, #0x10
463	bge	.Lcopyin_bad3_loop16
464
465	adds	r2, r2, #0x10
466	ldmfdeq	sp!, {r4-r7}
467	RETeq				/* Return now if done */
	/*
	 * Fewer than 4 bytes left: step r0 back over the 1 unconsumed byte
	 * of the word in ip and finish bytewise (r2 stays biased by -4).
	 */
468	subs	r2, r2, #0x04
469	sublt	r0, r0, #0x01
470	blt	.Lcopyin_l4
471
	/* One output word per iteration while at least 4 bytes remain */
472.Lcopyin_bad3_loop4:
473#ifdef __ARMEB__
474	mov	r4, ip, lsl #24
475#else
476	mov	r4, ip, lsr #24
477#endif
478	ldrt	ip, [r0], #0x04
479	subs	r2, r2, #0x04
480#ifdef __ARMEB__
481	orr	r4, r4, ip, lsr #8
482#else
483	orr	r4, r4, ip, lsl #8
484#endif
485	str	r4, [r1], #0x04
486	bge	.Lcopyin_bad3_loop4
487	sub	r0, r0, #0x01
488
/*
 * Common tail of the misaligned-source variants.  r4-r7 are still on
 * the stack, r2 = (bytes remaining - 4) and r0 points at the next source
 * byte to copy.
 */
489.Lcopyin_l4:
490	ldmfd	sp!, {r4-r7}
491	mov	r3, #0x00		/* nothing left on the stack for the fault path */
492	adds	r2, r2, #0x04
493	RETeq
/*
 * Copy the last r2 (1..3) bytes one at a time.  Also reached directly
 * from .Lcopyin_guts when the whole request is too short to word-align
 * the destination.  r2 becomes 3 - r2, the number of load/store pairs to
 * skip; pc reads as the first ldrbt and each pair is 8 bytes, hence
 * "lsl #3".  When the result is 0 the add is not executed and all three
 * pairs run.  Do not change the instruction count after the add.
 */
494.Lcopyin_l4_2:
495	rsbs	r2, r2, #0x03
496	addne	pc, pc, r2, lsl #3
497	nop
498	ldrbt	ip, [r0], #0x01
499	strb	ip, [r1], #0x01
500	ldrbt	ip, [r0], #0x01
501	strb	ip, [r1], #0x01
502	ldrbt	ip, [r0]
503	strb	ip, [r1]
504	RET
505END(copyin)
506
507/*
508 * r0 = kernel space address
509 * r1 = user space address
510 * r2 = length
511 *
512 * Copies bytes from kernel space to user space
513 */
514ENTRY(copyout)
	/*
	 * Argument checks.  The length is tested as a signed value: a zero
	 * or "negative" length returns 0 without copying anything.
	 * NOTE(review): a length >= 2^31 is therefore also reported as
	 * success; the rest of the routine relies on signed compares of r2,
	 * so this is left unchanged here.
	 */
515	cmp	r2, #0x00
516	movle	r0, #0x00
517	movle	pc, lr			/* Bail early if length is <= 0 */
518
	/* r3 = uaddr + len; a carry out means the range wraps past 2^32 */
519	adds	r3, r1, r2
520	movcs	r0, #EFAULT
521	RETc(cs)
522
	/* Fail if the end of the range is >= VM_MAXUSER_ADDRESS + 1 (unsigned) */
523	ldr	r12, =(VM_MAXUSER_ADDRESS + 1)
524	cmp	r3, r12
525	movcs	r0, #EFAULT
526	RETc(cs)
527
	/*
	 * Optional accelerated copy.  .L_arm_memcpy and .L_min_memcpy_size
	 * (not defined in this part of the file) are literals holding the
	 * addresses of a function pointer and of a minimum length.  The
	 * function is used only if the pointer is non-zero and len is at
	 * least the minimum (signed compare); otherwise go to .Lnormale.
	 */
528	ldr	r3, .L_arm_memcpy
529	ldr	r3, [r3]
530	cmp	r3, #0
531	beq	.Lnormale
532	ldr	r3, .L_min_memcpy_size
533	ldr	r3, [r3]
534	cmp	r2, r3
535	blt	.Lnormale
	/*
	 * Save the arguments so they can be restored if the call fails.
	 * r4 (callee-saved) holds the address of the function pointer.
	 * r5 is pushed purely as padding: six words (24 bytes) keep sp
	 * 8-byte aligned at the call, as the AAPCS requires at a public
	 * interface; pushing only five words left it misaligned by 4.
	 */
536	stmfd	sp!, {r0-r2, r4-r5, lr}
537	mov     r3, r0
538	mov     r0, r1			/* r0 = destination (user address) */
539	mov     r1, r3			/* r1 = source (kernel address) */
540	mov     r3, #1 /* DST_IS_USER */
541	ldr	r4, .L_arm_memcpy
542	mov	lr, pc			/* lr = instruction after the ldr pc */
543	ldr	pc, [r4]
544	cmp     r0, #0			/* a zero result is treated as success */
545	ldmfd   sp!, {r0-r2, r4-r5, lr}	/* ldm leaves the flags alone */
546	moveq	r0, #0
547	RETeq
	/* Non-zero result: fall through to .Lnormale with r0-r2 restored */
548
/*
 * Generic copyout path: run .Lcopyout_guts with .Lcopyout_fault stored
 * in the PCB_ONFAULT slot of the current PCB (presumably the address at
 * which the abort handler resumes if a user access faults; that code is
 * not in this file).  r10 and r11 stay live across the call so that the
 * fault path can restore the slot.
 */
549.Lnormale:
550	stmfd	sp!, {r10-r11, lr}
551
552	GET_PCB(r10)
553	ldr	r10, [r10]		/* r10 = PCB pointer */
554
	/*
	 * r3 tells .Lcopyout_fault what .Lcopyout_guts currently has on the
	 * stack: 0 = nothing, > 0 = r4-r7, < 0 = r4-r9.
	 */
555	mov	r3, #0x00
556	adr	ip, .Lcopyout_fault
557	ldr	r11, [r10, #PCB_ONFAULT]	/* r11 = previous slot value */
558	str	ip, [r10, #PCB_ONFAULT]
559	bl	.Lcopyout_guts
560	str	r11, [r10, #PCB_ONFAULT]	/* put the previous value back */
561	mov	r0, #0x00		/* copy completed: return 0 */
562	ldmfd	sp!, {r10-r11, pc}
563
/*
 * Address stored in PCB_ONFAULT by .Lnormale.  Expects r10 = PCB
 * pointer, r11 = previous PCB_ONFAULT value and r3 = stack marker set by
 * .Lcopyout_guts.  Restores the slot, unwinds whatever the copy code had
 * pushed, and returns EFAULT to copyout's caller using the frame pushed
 * by .Lnormale.
 */
564.Lcopyout_fault:
565	ldr	r0, =EFAULT
566	str	r11, [r10, #PCB_ONFAULT]
567	cmp	r3, #0x00
568	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
569	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
570	ldmfd	sp!, {r10-r11, pc}
571
/*
 * The copy proper, called with bl from .Lnormale.
 *   r0 = kernel source, r1 = user destination, r2 = length (> 0 after
 *   the checks at the top of copyout), r3 = 0.
 * User memory is written only with the "T" store forms (strt/strbt).
 * Uses r0-r3 and ip as scratch; r4-r9 are pushed before use and r3 is
 * kept in step with what is on the stack (see .Lcopyout_fault).
 */
572.Lcopyout_guts:
573	pld	[r0]
574	/* Word-align the destination buffer */
575	ands	ip, r1, #0x03		/* Already word aligned? */
576	beq	.Lcopyout_wordaligned	/* Yup */
	/* ip = number of bytes (1..3) needed to reach a word boundary */
577	rsb	ip, ip, #0x04
578	cmp	r2, ip			/* Enough bytes left to align it? */
579	blt	.Lcopyout_l4_2		/* Nope. Just copy bytewise */
580	sub	r2, r2, ip
	/*
	 * Computed jump into the three byte copies below.  ip becomes
	 * 3 - ip, the number of ldrb/strbt pairs to skip.  pc reads as the
	 * address of the add plus 8, i.e. the first ldrb, and each pair
	 * occupies 8 bytes, hence "lsl #3".  When ip is 0 the add is not
	 * executed and all three pairs run.  Do not insert or remove
	 * instructions between the add and the last strbt.
	 */
581	rsbs	ip, ip, #0x03
582	addne	pc, pc, ip, lsl #3
583	nop
584	ldrb	ip, [r0], #0x01
585	strbt	ip, [r1], #0x01
586	ldrb	ip, [r0], #0x01
587	strbt	ip, [r1], #0x01
588	ldrb	ip, [r0], #0x01
589	strbt	ip, [r1], #0x01
590	cmp	r2, #0x00		/* All done? */
591	RETeq
592
593	/* Destination buffer is now word aligned */
594.Lcopyout_wordaligned:
	/* ip = r0 & 3 is consumed by .Lcopyout_bad_align to pick its variant */
595	ands	ip, r0, #0x03		/* Is src also word-aligned? */
596	bne	.Lcopyout_bad_align	/* Nope. Things just got bad */
597	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
598	blt	.Lcopyout_w_less_than8
599
600	/* Quad-align the source buffer */
	/* (copy one word if r0 is not yet 8-byte aligned, for the ldrd loads) */
601	tst	r0, #0x07		/* Already quad aligned? */
602	ldrne	ip, [r0], #0x04
603	subne	r2, r2, #0x04
604	strtne	ip, [r1], #0x04
605
606	stmfd	sp!, {r4-r9}		/* Free up some registers */
607	mov	r3, #-1			/* Signal restore r4-r9 */
608
609	/* Destination buffer word aligned, source is quad aligned */
	/* r2 is kept biased by -0x80 while the 128-byte loop runs */
610	subs	r2, r2, #0x80
611	blt	.Lcopyout_w_lessthan128
612
613	/* Copy 128 bytes at a time */
/*
 * 128 bytes per iteration.  On entry r2 = (bytes remaining - 0x80) >= 0,
 * r4-r9 are saved on the stack and r3 = -1.  The kernel source is read
 * with ldrd into the register pairs r4/r5, r6/r7 and r8/r9, and each
 * register is written to user space with its own strt; loads for later
 * pairs are interleaved with stores of earlier ones.  The LD:/ST: notes
 * give byte offsets within the current 128-byte chunk.
 */
614.Lcopyout_w_loop128:
615	ldrd	r4, [r0], #0x08		/* LD:00-07 */
616	pld	[r0, #0x18]		/* Prefetch 0x20 */
617	ldrd	r6, [r0], #0x08		/* LD:08-0f */
618	ldrd	r8, [r0], #0x08		/* LD:10-17 */
619	strt	r4, [r1], #0x04		/* ST:00-03 */
620	strt	r5, [r1], #0x04		/* ST:04-07 */
621	ldrd	r4, [r0], #0x08		/* LD:18-1f */
622	strt	r6, [r1], #0x04		/* ST:08-0b */
623	strt	r7, [r1], #0x04		/* ST:0c-0f */
624	ldrd	r6, [r0], #0x08		/* LD:20-27 */
625	pld	[r0, #0x18]		/* Prefetch 0x40 */
626	strt	r8, [r1], #0x04		/* ST:10-13 */
627	strt	r9, [r1], #0x04		/* ST:14-17 */
628	ldrd	r8, [r0], #0x08		/* LD:28-2f */
629	strt	r4, [r1], #0x04		/* ST:18-1b */
630	strt	r5, [r1], #0x04		/* ST:1c-1f */
631	ldrd	r4, [r0], #0x08		/* LD:30-37 */
632	strt	r6, [r1], #0x04		/* ST:20-23 */
633	strt	r7, [r1], #0x04		/* ST:24-27 */
634	ldrd	r6, [r0], #0x08		/* LD:38-3f */
635	strt	r8, [r1], #0x04		/* ST:28-2b */
636	strt	r9, [r1], #0x04		/* ST:2c-2f */
637	ldrd	r8, [r0], #0x08		/* LD:40-47 */
638	pld	[r0, #0x18]		/* Prefetch 0x60 */
639	strt	r4, [r1], #0x04		/* ST:30-33 */
640	strt	r5, [r1], #0x04		/* ST:34-37 */
641	ldrd	r4, [r0], #0x08		/* LD:48-4f */
642	strt	r6, [r1], #0x04		/* ST:38-3b */
643	strt	r7, [r1], #0x04		/* ST:3c-3f */
644	ldrd	r6, [r0], #0x08		/* LD:50-57 */
645	strt	r8, [r1], #0x04		/* ST:40-43 */
646	strt	r9, [r1], #0x04		/* ST:44-47 */
647	ldrd	r8, [r0], #0x08		/* LD:58-5f */
648	strt	r4, [r1], #0x04		/* ST:48-4b */
649	strt	r5, [r1], #0x04		/* ST:4c-4f */
650	ldrd	r4, [r0], #0x08		/* LD:60-67 */
651	pld	[r0, #0x18]		/* Prefetch 0x80 */
652	strt	r6, [r1], #0x04		/* ST:50-53 */
653	strt	r7, [r1], #0x04		/* ST:54-57 */
654	ldrd	r6, [r0], #0x08		/* LD:68-6f */
655	strt	r8, [r1], #0x04		/* ST:58-5b */
656	strt	r9, [r1], #0x04		/* ST:5c-5f */
657	ldrd	r8, [r0], #0x08		/* LD:70-77 */
658	strt	r4, [r1], #0x04		/* ST:60-63 */
659	strt	r5, [r1], #0x04		/* ST:64-67 */
660	ldrd	r4, [r0], #0x08		/* LD:78-7f */
661	strt	r6, [r1], #0x04		/* ST:68-6b */
662	strt	r7, [r1], #0x04		/* ST:6c-6f */
663	strt	r8, [r1], #0x04		/* ST:70-73 */
664	strt	r9, [r1], #0x04		/* ST:74-77 */
	/* The flags set here are consumed by the bge; strt leaves them alone */
665	subs	r2, r2, #0x80
666	strt	r4, [r1], #0x04		/* ST:78-7b */
667	strt	r5, [r1], #0x04		/* ST:7c-7f */
668	bge	.Lcopyout_w_loop128
669
/*
 * Fewer than 128 bytes remain.  r2 arrives biased by -0x80; undo that,
 * return if nothing is left, then copy 32 bytes per iteration while
 * possible.  r4-r9 are still on the stack and r3 is still -1.
 */
670.Lcopyout_w_lessthan128:
671	adds	r2, r2, #0x80		/* Adjust for extra sub */
672	ldmfdeq	sp!, {r4-r9}
673	RETeq				/* Return now if done */
	/* r2 is kept biased by -0x20 while the 32-byte loop runs */
674	subs	r2, r2, #0x20
675	blt	.Lcopyout_w_lessthan32
676
677	/* Copy 32 bytes at a time */
678.Lcopyout_w_loop32:
679	ldrd	r4, [r0], #0x08
680	pld	[r0, #0x18]
681	ldrd	r6, [r0], #0x08
682	ldrd	r8, [r0], #0x08
683	strt	r4, [r1], #0x04
684	strt	r5, [r1], #0x04
685	ldrd	r4, [r0], #0x08
686	strt	r6, [r1], #0x04
687	strt	r7, [r1], #0x04
688	strt	r8, [r1], #0x04
689	strt	r9, [r1], #0x04
	/* The flags set here are consumed by the bge; strt leaves them alone */
690	subs	r2, r2, #0x20
691	strt	r4, [r1], #0x04
692	strt	r5, [r1], #0x04
693	bge	.Lcopyout_w_loop32
694
/*
 * Fewer than 32 bytes remain (r2 arrives biased by -0x20).  Copies the
 * remaining whole 8-byte groups (0 to 3 of them) through a computed
 * jump, then hands any final 1..7 bytes to .Lcopyout_w_less_than8.
 */
695.Lcopyout_w_lessthan32:
696	adds	r2, r2, #0x20		/* Adjust for extra sub */
697	ldmfdeq	sp!, {r4-r9}
698	RETeq				/* Return now if done */
699
	/*
	 * r4 = bytes in whole 8-byte groups (0, 8, 16 or 24) and
	 * r5 = 0x18 - r4.  The subs leaves Z set when nothing remains after
	 * those groups; that flag survives until the RETeq below because
	 * none of ldrd, strt, nop or ldmfd changes the flags.  Each group
	 * below is exactly four instructions (16 bytes; the nop is padding),
	 * and pc reads as the first ldrd, so adding r5 * 2 skips
	 * (0x18 - r4) / 8 groups.  Do not change the instruction count.
	 */
700	and	r4, r2, #0x18
701	rsb	r5, r4, #0x18
702	subs	r2, r2, r4
703	add	pc, pc, r5, lsl #1
704	nop
705
706	/* At least 24 bytes remaining */
707	ldrd	r4, [r0], #0x08
708	strt	r4, [r1], #0x04
709	strt	r5, [r1], #0x04
710	nop
711
712	/* At least 16 bytes remaining */
713	ldrd	r4, [r0], #0x08
714	strt	r4, [r1], #0x04
715	strt	r5, [r1], #0x04
716	nop
717
718	/* At least 8 bytes remaining */
719	ldrd	r4, [r0], #0x08
720	strt	r4, [r1], #0x04
721	strt	r5, [r1], #0x04
722	nop
723
724	/* Less than 8 bytes remaining */
725	ldmfd	sp!, {r4-r9}
726	RETeq				/* Return now if done */
	/* r4-r9 are off the stack again, so clear the marker for the fault path */
727	mov	r3, #0x00
728
/*
 * Both buffers word aligned, 1..7 bytes left, nothing extra on the
 * stack (r3 = 0).  Copies one word if at least 4 bytes remain, then the
 * last 1..3 bytes individually.
 */
729.Lcopyout_w_less_than8:
730	subs	r2, r2, #0x04
731	ldrge	ip, [r0], #0x04
732	strtge	ip, [r1], #0x04
733	RETeq				/* Return now if done */
734	addlt	r2, r2, #0x04
	/*
	 * r2 is now 1..3.  After the cmp its value is no longer needed, so
	 * r2 is reused to carry the second byte; the ge/gt conditions all
	 * refer to that single compare with 2.
	 */
735	ldrb	ip, [r0], #0x01
736	cmp	r2, #0x02
737	ldrbge	r2, [r0], #0x01
738	strbt	ip, [r1], #0x01
739	ldrbgt	ip, [r0]
740	strbtge	r2, [r1], #0x01
741	strbtgt	ip, [r1]
742	RET
743
744/*
745 * At this point, it has not been possible to word align both buffers.
746 * The destination buffer (r1) is word aligned, but the source buffer
747 * (r0) is not.
748 */
749.Lcopyout_bad_align:
	/* Entered with ip = r0 & 3 (1, 2 or 3) from .Lcopyout_wordaligned */
750	stmfd	sp!, {r4-r7}
751	mov	r3, #0x01		/* marker for the fault path: r4-r7 pushed */
752	bic	r0, r0, #0x03		/* round the source down to a word */
	/*
	 * The flags from this compare select the variant below; the ldr in
	 * between overwrites ip with the first aligned source word but does
	 * not change the flags.
	 */
753	cmp	ip, #2
754	ldr	ip, [r0], #0x04
755	bgt	.Lcopyout_bad3
756	beq	.Lcopyout_bad2
757	b	.Lcopyout_bad1
758
/*
 * Source starts 1 byte past a word boundary.  ip always holds the most
 * recently loaded aligned source word, of which 3 bytes have not been
 * stored yet; each output word is those 3 bytes merged with 1 byte of
 * the next source word (the shift directions depend on __ARMEB__).
 * Execution enters at .Lcopyout_bad1, not at the top of the loop.
 */
759.Lcopyout_bad1_loop16:
760#ifdef	__ARMEB__
761	mov	r4, ip, lsl #8
762#else
763	mov	r4, ip, lsr #8
764#endif
765	ldr	r5, [r0], #0x04
766	pld	[r0, #0x018]
767	ldr	r6, [r0], #0x04
768	ldr	r7, [r0], #0x04
769	ldr	ip, [r0], #0x04
770#ifdef	__ARMEB__
771	orr	r4, r4, r5, lsr #24
772	mov	r5, r5, lsl #8
773	orr	r5, r5, r6, lsr #24
774	mov	r6, r6, lsl #8
775	orr	r6, r6, r7, lsr #24
776	mov	r7, r7, lsl #8
777	orr	r7, r7, ip, lsr #24
778#else
779	orr	r4, r4, r5, lsl #24
780	mov	r5, r5, lsr #8
781	orr	r5, r5, r6, lsl #24
782	mov	r6, r6, lsr #8
783	orr	r6, r6, r7, lsl #24
784	mov	r7, r7, lsr #8
785	orr	r7, r7, ip, lsl #24
786#endif
787	strt	r4, [r1], #0x04
788	strt	r5, [r1], #0x04
789	strt	r6, [r1], #0x04
790	strt	r7, [r1], #0x04
791.Lcopyout_bad1:
	/* Loop while at least 16 bytes remain */
792	subs	r2, r2, #0x10
793	bge	.Lcopyout_bad1_loop16
794
795	adds	r2, r2, #0x10
796	ldmfdeq	sp!, {r4-r7}
797	RETeq				/* Return now if done */
	/*
	 * Fewer than 4 bytes left: r0 points just past the word held in ip,
	 * so step it back over that word's 3 unconsumed bytes and finish
	 * bytewise.  r2 stays biased by -4; .Lcopyout_l4 undoes that.
	 */
798	subs	r2, r2, #0x04
799	sublt	r0, r0, #0x03
800	blt	.Lcopyout_l4
801
	/* One output word per iteration while at least 4 bytes remain */
802.Lcopyout_bad1_loop4:
803#ifdef __ARMEB__
804	mov	r4, ip, lsl #8
805#else
806	mov	r4, ip, lsr #8
807#endif
808	ldr	ip, [r0], #0x04
809	subs	r2, r2, #0x04
810#ifdef __ARMEB__
811	orr	r4, r4, ip, lsr #24
812#else
813	orr	r4, r4, ip, lsl #24
814#endif
815	strt	r4, [r1], #0x04
816	bge	.Lcopyout_bad1_loop4
817	sub	r0, r0, #0x03
818	b	.Lcopyout_l4
819
/*
 * Source starts 2 bytes past a word boundary.  Same scheme as the
 * 1-byte variant, except that 2 bytes of the word in ip are merged with
 * 2 bytes of the next source word.  Execution enters at .Lcopyout_bad2.
 */
820.Lcopyout_bad2_loop16:
821#ifdef __ARMEB__
822	mov	r4, ip, lsl #16
823#else
824	mov	r4, ip, lsr #16
825#endif
826	ldr	r5, [r0], #0x04
827	pld	[r0, #0x018]
828	ldr	r6, [r0], #0x04
829	ldr	r7, [r0], #0x04
830	ldr	ip, [r0], #0x04
831#ifdef __ARMEB__
832	orr	r4, r4, r5, lsr #16
833	mov	r5, r5, lsl #16
834	orr	r5, r5, r6, lsr #16
835	mov	r6, r6, lsl #16
836	orr	r6, r6, r7, lsr #16
837	mov	r7, r7, lsl #16
838	orr	r7, r7, ip, lsr #16
839#else
840	orr	r4, r4, r5, lsl #16
841	mov	r5, r5, lsr #16
842	orr	r5, r5, r6, lsl #16
843	mov	r6, r6, lsr #16
844	orr	r6, r6, r7, lsl #16
845	mov	r7, r7, lsr #16
846	orr	r7, r7, ip, lsl #16
847#endif
848	strt	r4, [r1], #0x04
849	strt	r5, [r1], #0x04
850	strt	r6, [r1], #0x04
851	strt	r7, [r1], #0x04
852.Lcopyout_bad2:
	/* Loop while at least 16 bytes remain */
853	subs	r2, r2, #0x10
854	bge	.Lcopyout_bad2_loop16
855
856	adds	r2, r2, #0x10
857	ldmfdeq	sp!, {r4-r7}
858	RETeq				/* Return now if done */
	/*
	 * Fewer than 4 bytes left: step r0 back over the 2 unconsumed bytes
	 * of the word in ip and finish bytewise (r2 stays biased by -4).
	 */
859	subs	r2, r2, #0x04
860	sublt	r0, r0, #0x02
861	blt	.Lcopyout_l4
862
	/* One output word per iteration while at least 4 bytes remain */
863.Lcopyout_bad2_loop4:
864#ifdef __ARMEB__
865	mov	r4, ip, lsl #16
866#else
867	mov	r4, ip, lsr #16
868#endif
869	ldr	ip, [r0], #0x04
870	subs	r2, r2, #0x04
871#ifdef __ARMEB__
872	orr	r4, r4, ip, lsr #16
873#else
874	orr	r4, r4, ip, lsl #16
875#endif
876	strt	r4, [r1], #0x04
877	bge	.Lcopyout_bad2_loop4
878	sub	r0, r0, #0x02
879	b	.Lcopyout_l4
880
/*
 * Source starts 3 bytes past a word boundary.  Same scheme as the other
 * two variants, except that 1 byte of the word in ip is merged with
 * 3 bytes of the next source word.  Execution enters at .Lcopyout_bad3,
 * and the tail falls through into .Lcopyout_l4.
 */
881.Lcopyout_bad3_loop16:
882#ifdef __ARMEB__
883	mov	r4, ip, lsl #24
884#else
885	mov	r4, ip, lsr #24
886#endif
887	ldr	r5, [r0], #0x04
888	pld	[r0, #0x018]
889	ldr	r6, [r0], #0x04
890	ldr	r7, [r0], #0x04
891	ldr	ip, [r0], #0x04
892#ifdef __ARMEB__
893	orr	r4, r4, r5, lsr #8
894	mov	r5, r5, lsl #24
895	orr	r5, r5, r6, lsr #8
896	mov	r6, r6, lsl #24
897	orr	r6, r6, r7, lsr #8
898	mov	r7, r7, lsl #24
899	orr	r7, r7, ip, lsr #8
900#else
901	orr	r4, r4, r5, lsl #8
902	mov	r5, r5, lsr #24
903	orr	r5, r5, r6, lsl #8
904	mov	r6, r6, lsr #24
905	orr	r6, r6, r7, lsl #8
906	mov	r7, r7, lsr #24
907	orr	r7, r7, ip, lsl #8
908#endif
909	strt	r4, [r1], #0x04
910	strt	r5, [r1], #0x04
911	strt	r6, [r1], #0x04
912	strt	r7, [r1], #0x04
913.Lcopyout_bad3:
	/* Loop while at least 16 bytes remain */
914	subs	r2, r2, #0x10
915	bge	.Lcopyout_bad3_loop16
916
917	adds	r2, r2, #0x10
918	ldmfdeq	sp!, {r4-r7}
919	RETeq				/* Return now if done */
	/*
	 * Fewer than 4 bytes left: step r0 back over the 1 unconsumed byte
	 * of the word in ip and finish bytewise (r2 stays biased by -4).
	 */
920	subs	r2, r2, #0x04
921	sublt	r0, r0, #0x01
922	blt	.Lcopyout_l4
923
	/* One output word per iteration while at least 4 bytes remain */
924.Lcopyout_bad3_loop4:
925#ifdef __ARMEB__
926	mov	r4, ip, lsl #24
927#else
928	mov	r4, ip, lsr #24
929#endif
930	ldr	ip, [r0], #0x04
931	subs	r2, r2, #0x04
932#ifdef __ARMEB__
933	orr	r4, r4, ip, lsr #8
934#else
935	orr	r4, r4, ip, lsl #8
936#endif
937	strt	r4, [r1], #0x04
938	bge	.Lcopyout_bad3_loop4
939	sub	r0, r0, #0x01
940
/*
 * Common tail of the misaligned-source variants.  r4-r7 are still on
 * the stack, r2 = (bytes remaining - 4) and r0 points at the next source
 * byte to copy.
 */
941.Lcopyout_l4:
942	ldmfd	sp!, {r4-r7}
943	mov	r3, #0x00		/* nothing left on the stack for the fault path */
944	adds	r2, r2, #0x04
945	RETeq
/*
 * Copy the last r2 (1..3) bytes one at a time.  Also reached directly
 * from .Lcopyout_guts when the whole request is too short to word-align
 * the destination.  r2 becomes 3 - r2, the number of load/store pairs to
 * skip; pc reads as the first ldrb and each pair is 8 bytes, hence
 * "lsl #3".  When the result is 0 the add is not executed and all three
 * pairs run.  Do not change the instruction count after the add.
 */
946.Lcopyout_l4_2:
947	rsbs	r2, r2, #0x03
948	addne	pc, pc, r2, lsl #3
949	nop
950	ldrb	ip, [r0], #0x01
951	strbt	ip, [r1], #0x01
952	ldrb	ip, [r0], #0x01
953	strbt	ip, [r1], #0x01
954	ldrb	ip, [r0]
955	strbt	ip, [r1]
956	RET
957END(copyout)
958
959