bcopyinout_xscale.S revision 284264
1/*	$NetBSD: bcopyinout_xscale.S,v 1.3 2003/12/15 09:27:18 scw Exp $	*/
2
3/*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39__FBSDID("$FreeBSD: head/sys/arm/arm/bcopyinout_xscale.S 284264 2015-06-11 13:54:18Z andrew $");
40
41#include <machine/acle-compat.h>
42
43	.syntax	unified
44	.text
45	.align	2
46
/*
 * GET_PCB(tmp): leave in `tmp` the address of the word that holds the
 * current PCB pointer.  The users visible in this file (copyin and
 * copyout) follow the macro with `ldr tmp, [tmp]` to fetch the PCB
 * pointer itself.
 *
 * __ARM_ARCH >= 6: read CP15 c13, c0, 4 and add TD_PCB to the result
 *   (presumably that register holds the current thread pointer and
 *   TD_PCB is the offset of its PCB field -- TODO confirm).
 * Otherwise: load the literal word stored at .Lcurpcb, which is the
 *   address __pcpu + PC_CURPCB.
 */
47#if __ARM_ARCH >= 6
48#define GET_PCB(tmp) \
49	mrc p15, 0, tmp, c13, c0, 4; \
50	add	tmp, tmp, #(TD_PCB)
51#else
52.Lcurpcb:
53	.word	_C_LABEL(__pcpu) + PC_CURPCB
54#define GET_PCB(tmp) \
55	ldr	tmp, .Lcurpcb
56#endif
57
58/*
59 * r0 = user space address
60 * r1 = kernel space address
61 * r2 = length
62 *
63 * Copies bytes from user space to kernel space
64 */
65ENTRY(copyin)
	/*
	 * Result in r0: 0 on success, EFAULT if a fault is taken while
	 * copying (see .Lcopyin_fault below).
	 *
	 * NOTE(review): the length test is signed, so a length with bit 31
	 * set also takes the early "success" return without copying.
	 */
66	cmp	r2, #0x00
67	movle	r0, #0x00
68	movle	pc, lr			/* Bail early if length is <= 0 */
69
	/*
	 * Optional fast path.  If the function pointer reached through
	 * .L_arm_memcpy is non-NULL and the length is at least the value
	 * reached through .L_min_memcpy_size, hand the copy to that
	 * function as (r0 = dst, r1 = src, r2 = len, r3 = flags).
	 * A zero return from it means the copy is done; anything else
	 * restores the original arguments and falls into .Lnormal.
	 */
70	ldr	r3, .L_arm_memcpy
71	ldr	r3, [r3]
72	cmp	r3, #0
73	beq	.Lnormal
74	ldr	r3, .L_min_memcpy_size
75	ldr	r3, [r3]
76	cmp	r2, r3
77	blt	.Lnormal
	/*
	 * Push an even number of words so that sp keeps the 8-byte
	 * alignment the AAPCS requires at a public interface (assuming sp
	 * was 8-byte aligned on entry).  r3 holds nothing we need; it is
	 * saved purely as padding.
	 */
78	stmfd	sp!, {r0-r4, lr}
79	mov     r3, r0
80	mov     r0, r1
81	mov     r1, r3
82	mov     r3, #2 /* SRC_IS_USER */
83	ldr	r4, .L_arm_memcpy
	/* pc reads as this instruction + 8, so lr = the cmp after the call */
84	mov	lr, pc
85	ldr	pc, [r4]
86	cmp     r0, #0
	/* ldmfd leaves the flags from the cmp above untouched */
87	ldmfd   sp!, {r0-r4, lr}
88	moveq	r0, #0
89	RETeq
90
91.Lnormal:
92	stmfd	sp!, {r10-r11, lr}
93
	/* r10 = PCB pointer, kept for the whole copy */
94	GET_PCB(r10)
95	ldr	r10, [r10]
96
	/*
	 * r3 = 0: .Lcopyin_guts has nothing extra on the stack yet.
	 * Save the previous PCB_ONFAULT value in r11 and install our own
	 * fault entry point around the call to .Lcopyin_guts.
	 */
97	mov	r3, #0x00
98	adr	ip, .Lcopyin_fault
99	ldr	r11, [r10, #PCB_ONFAULT]
100	str	ip, [r10, #PCB_ONFAULT]
101	bl	.Lcopyin_guts
102	str	r11, [r10, #PCB_ONFAULT]
103	mov	r0, #0x00
104	ldmfd	sp!, {r10-r11, pc}
105
	/*
	 * Fault exit.  This address is stored in PCB_ONFAULT while
	 * .Lcopyin_guts runs; presumably the abort handler resumes here,
	 * with the registers as they were at the faulting access -- that
	 * mechanism is not visible in this file.  r10/r11 are still the PCB
	 * pointer and the saved PCB_ONFAULT value, and the sign of r3 says
	 * which registers .Lcopyin_guts still has pushed.
	 */
106.Lcopyin_fault:
107	ldr	r0, =EFAULT
108	str	r11, [r10, #PCB_ONFAULT]
109	cmp	r3, #0x00
110	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
111	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
112	ldmfd	sp!, {r10-r11, pc}
113
/*
 * .Lcopyin_guts: the copy proper, entered with `bl` from copyin.
 *
 * In:	r0 = user source, r1 = kernel destination, r2 = length (> 0),
 *	r3 = 0, lr = return address.
 * Every load from the user buffer uses the unprivileged forms
 * (ldrt/ldrbt); stores to the kernel buffer are plain strb/str/strd.
 * Scratch: ip, plus r4-r9 which are pushed while in use.  r3 records
 * what is currently pushed (-1: r4-r9, 0: nothing) for .Lcopyin_fault.
 * Returns with RET; r0-r2 and ip are clobbered.
 */
114.Lcopyin_guts:
115	pld	[r0]
116	/* Word-align the destination buffer */
117	ands	ip, r1, #0x03		/* Already word aligned? */
118	beq	.Lcopyin_wordaligned	/* Yup */
119	rsb	ip, ip, #0x04
120	cmp	r2, ip			/* Enough bytes left to align it? */
121	blt	.Lcopyin_l4_2		/* Nope. Just copy bytewise */
122	sub	r2, r2, ip
	/*
	 * Computed jump: ip becomes 3 - (bytes needed to align).  In the
	 * add, pc reads as that instruction + 8, i.e. the first ldrbt, and
	 * each ldrbt/strb pair is 8 bytes, so ip pairs are skipped.  When
	 * ip == 0 the add is not executed and all three pairs run.
	 */
123	rsbs	ip, ip, #0x03
124	addne	pc, pc, ip, lsl #3
125	nop
126	ldrbt	ip, [r0], #0x01
127	strb	ip, [r1], #0x01
128	ldrbt	ip, [r0], #0x01
129	strb	ip, [r1], #0x01
130	ldrbt	ip, [r0], #0x01
131	strb	ip, [r1], #0x01
132	cmp	r2, #0x00		/* All done? */
133	RETeq
134
135	/* Destination buffer is now word aligned */
136.Lcopyin_wordaligned:
137	ands	ip, r0, #0x03		/* Is src also word-aligned? */
138	bne	.Lcopyin_bad_align	/* Nope. Things just got bad */
139	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
140	blt	.Lcopyin_w_less_than8
141
142	/* Quad-align the destination buffer */
	/* (copy one word if needed so the strd stores below are 8-byte aligned) */
143	tst	r1, #0x07		/* Already quad aligned? */
144	ldrtne	ip, [r0], #0x04
145	strne	ip, [r1], #0x04
146	subne	r2, r2, #0x04
147	stmfd	sp!, {r4-r9}		/* Free up some registers */
148	mov	r3, #-1			/* Signal restore r4-r9 */
149
150	/* Destination buffer quad aligned, source is word aligned */
	/* r2 is kept biased by -0x80 while the 128-byte loop runs */
151	subs	r2, r2, #0x80
152	blt	.Lcopyin_w_lessthan128
153
154	/* Copy 128 bytes at a time */
155.Lcopyin_w_loop128:
156	ldrt	r4, [r0], #0x04		/* LD:00-03 */
157	ldrt	r5, [r0], #0x04		/* LD:04-07 */
158	pld	[r0, #0x18]		/* Prefetch 0x20 */
159	ldrt	r6, [r0], #0x04		/* LD:08-0b */
160	ldrt	r7, [r0], #0x04		/* LD:0c-0f */
161	ldrt	r8, [r0], #0x04		/* LD:10-13 */
162	ldrt	r9, [r0], #0x04		/* LD:14-17 */
163	strd	r4, [r1], #0x08		/* ST:00-07 */
164	ldrt	r4, [r0], #0x04		/* LD:18-1b */
165	ldrt	r5, [r0], #0x04		/* LD:1c-1f */
166	strd	r6, [r1], #0x08		/* ST:08-0f */
167	ldrt	r6, [r0], #0x04		/* LD:20-23 */
168	ldrt	r7, [r0], #0x04		/* LD:24-27 */
169	pld	[r0, #0x18]		/* Prefetch 0x40 */
170	strd	r8, [r1], #0x08		/* ST:10-17 */
171	ldrt	r8, [r0], #0x04		/* LD:28-2b */
172	ldrt	r9, [r0], #0x04		/* LD:2c-2f */
173	strd	r4, [r1], #0x08		/* ST:18-1f */
174	ldrt	r4, [r0], #0x04		/* LD:30-33 */
175	ldrt	r5, [r0], #0x04		/* LD:34-37 */
176	strd	r6, [r1], #0x08		/* ST:20-27 */
177	ldrt	r6, [r0], #0x04		/* LD:38-3b */
178	ldrt	r7, [r0], #0x04		/* LD:3c-3f */
179	strd	r8, [r1], #0x08		/* ST:28-2f */
180	ldrt	r8, [r0], #0x04		/* LD:40-43 */
181	ldrt	r9, [r0], #0x04		/* LD:44-47 */
182	pld	[r0, #0x18]		/* Prefetch 0x60 */
183	strd	r4, [r1], #0x08		/* ST:30-37 */
184	ldrt	r4, [r0], #0x04		/* LD:48-4b */
185	ldrt	r5, [r0], #0x04		/* LD:4c-4f */
186	strd	r6, [r1], #0x08		/* ST:38-3f */
187	ldrt	r6, [r0], #0x04		/* LD:50-53 */
188	ldrt	r7, [r0], #0x04		/* LD:54-57 */
189	strd	r8, [r1], #0x08		/* ST:40-47 */
190	ldrt	r8, [r0], #0x04		/* LD:58-5b */
191	ldrt	r9, [r0], #0x04		/* LD:5c-5f */
192	strd	r4, [r1], #0x08		/* ST:48-4f */
193	ldrt	r4, [r0], #0x04		/* LD:60-63 */
194	ldrt	r5, [r0], #0x04		/* LD:64-67 */
195	pld	[r0, #0x18]		/* Prefetch 0x80 */
196	strd	r6, [r1], #0x08		/* ST:50-57 */
197	ldrt	r6, [r0], #0x04		/* LD:68-6b */
198	ldrt	r7, [r0], #0x04		/* LD:6c-6f */
199	strd	r8, [r1], #0x08		/* ST:58-5f */
200	ldrt	r8, [r0], #0x04		/* LD:70-73 */
201	ldrt	r9, [r0], #0x04		/* LD:74-77 */
202	strd	r4, [r1], #0x08		/* ST:60-67 */
203	ldrt	r4, [r0], #0x04		/* LD:78-7b */
204	ldrt	r5, [r0], #0x04		/* LD:7c-7f */
205	strd	r6, [r1], #0x08		/* ST:68-6f */
206	strd	r8, [r1], #0x08		/* ST:70-77 */
207	subs	r2, r2, #0x80
208	strd	r4, [r1], #0x08		/* ST:78-7f */
209	bge	.Lcopyin_w_loop128
210
211.Lcopyin_w_lessthan128:
212	adds	r2, r2, #0x80		/* Adjust for extra sub */
213	ldmfdeq	sp!, {r4-r9}
214	RETeq
	/* r2 is kept biased by -0x20 while the 32-byte loop runs */
215	subs	r2, r2, #0x20
216	blt	.Lcopyin_w_lessthan32
217
218	/* Copy 32 bytes at a time */
219.Lcopyin_w_loop32:
220	ldrt	r4, [r0], #0x04
221	ldrt	r5, [r0], #0x04
222	pld	[r0, #0x18]
223	ldrt	r6, [r0], #0x04
224	ldrt	r7, [r0], #0x04
225	ldrt	r8, [r0], #0x04
226	ldrt	r9, [r0], #0x04
227	strd	r4, [r1], #0x08
228	ldrt	r4, [r0], #0x04
229	ldrt	r5, [r0], #0x04
230	strd	r6, [r1], #0x08
231	strd	r8, [r1], #0x08
232	subs	r2, r2, #0x20
233	strd	r4, [r1], #0x08
234	bge	.Lcopyin_w_loop32
235
236.Lcopyin_w_lessthan32:
237	adds	r2, r2, #0x20		/* Adjust for extra sub */
238	ldmfdeq	sp!, {r4-r9}
239	RETeq				/* Return now if done */
240
	/*
	 * Computed jump into the three 8-byte copy blocks below.
	 * r4 = remaining length rounded down to a multiple of 8 (0, 8, 16
	 * or 24); r2 becomes the 0-7 bytes left after that, and the subs
	 * leaves Z set when nothing will remain (tested by the RETeq after
	 * the blocks; nothing in between changes the flags).
	 * Each block is four instructions (16 bytes) thanks to its nop, so
	 * adding r5 << 1 = (0x18 - r4) * 2 to pc skips the unneeded blocks.
	 */
241	and	r4, r2, #0x18
242	rsb	r5, r4, #0x18
243	subs	r2, r2, r4
244	add	pc, pc, r5, lsl #1
245	nop
246
247	/* At least 24 bytes remaining */
248	ldrt	r4, [r0], #0x04
249	ldrt	r5, [r0], #0x04
250	nop
251	strd	r4, [r1], #0x08
252
253	/* At least 16 bytes remaining */
254	ldrt	r4, [r0], #0x04
255	ldrt	r5, [r0], #0x04
256	nop
257	strd	r4, [r1], #0x08
258
259	/* At least 8 bytes remaining */
260	ldrt	r4, [r0], #0x04
261	ldrt	r5, [r0], #0x04
262	nop
263	strd	r4, [r1], #0x08
264
265	/* Less than 8 bytes remaining */
266	ldmfd	sp!, {r4-r9}
267	RETeq				/* Return now if done */
268	mov	r3, #0x00		/* Nothing is pushed any more */
269
	/*
	 * Fewer than 8 bytes left, both pointers word aligned, nothing
	 * pushed (r3 == 0).  Copy one word if at least 4 bytes remain,
	 * then the last one to three bytes.
	 */
270.Lcopyin_w_less_than8:
271	subs	r2, r2, #0x04
272	ldrtge	ip, [r0], #0x04
273	strge	ip, [r1], #0x04
274	RETeq				/* Return now if done */
275	addlt	r2, r2, #0x04		/* No word copied: undo the sub */
276	ldrbt	ip, [r0], #0x01
277	cmp	r2, #0x02
278	ldrbtge	r2, [r0], #0x01		/* 2nd byte; r2 reused as data */
279	strb	ip, [r1], #0x01
280	ldrbtgt	ip, [r0]		/* 3rd byte */
281	strbge	r2, [r1], #0x01
282	strbgt	ip, [r1]
283	RET
284
285/*
286 * At this point, it has not been possible to word align both buffers.
287 * The destination buffer (r1) is word aligned, but the source buffer
288 * (r0) is not.
 *
 * On entry ip = r0 & 3 (1, 2 or 3).  r0 is rounded down to a word
 * boundary and whole words are fetched with ldrt; each output word is
 * assembled from the unused bytes of the previously loaded word (kept
 * in ip) and the leading bytes of the next one, using shifts whose
 * direction depends on __ARMEB__.  There is one loop pair per source
 * offset: .Lcopyin_bad1/2/3.  r4-r7 are pushed and r3 is set to 1 so
 * that .Lcopyin_fault knows to pop them.
289 */
290.Lcopyin_bad_align:
291	stmfd	sp!, {r4-r7}
292	mov	r3, #0x01		/* Signal restore r4-r7 */
293	bic	r0, r0, #0x03
294	cmp	ip, #2			/* Test offset before ip is reused */
295	ldrt	ip, [r0], #0x04		/* ip = first (partial) word */
296	bgt	.Lcopyin_bad3
297	beq	.Lcopyin_bad2
298	b	.Lcopyin_bad1
299
	/* Source offset 1: 16 bytes per iteration */
300.Lcopyin_bad1_loop16:
301#ifdef __ARMEB__
302	mov	r4, ip, lsl #8
303#else
304	mov	r4, ip, lsr #8
305#endif
306	ldrt	r5, [r0], #0x04
307	pld	[r0, #0x018]
308	ldrt	r6, [r0], #0x04
309	ldrt	r7, [r0], #0x04
310	ldrt	ip, [r0], #0x04
311#ifdef __ARMEB__
312	orr	r4, r4, r5, lsr #24
313	mov	r5, r5, lsl #8
314	orr	r5, r5, r6, lsr #24
315	mov	r6, r6, lsl #8
316	orr	r6, r6, r7, lsr #24
317	mov	r7, r7, lsl #8
318	orr	r7, r7, ip, lsr #24
319#else
320	orr	r4, r4, r5, lsl #24
321	mov	r5, r5, lsr #8
322	orr	r5, r5, r6, lsl #24
323	mov	r6, r6, lsr #8
324	orr	r6, r6, r7, lsl #24
325	mov	r7, r7, lsr #8
326	orr	r7, r7, ip, lsl #24
327#endif
328	str	r4, [r1], #0x04
329	str	r5, [r1], #0x04
330	str	r6, [r1], #0x04
331	str	r7, [r1], #0x04
332.Lcopyin_bad1:
333	subs	r2, r2, #0x10
334	bge	.Lcopyin_bad1_loop16
335
336	adds	r2, r2, #0x10
337	ldmfdeq	sp!, {r4-r7}
338	RETeq				/* Return now if done */
339	subs	r2, r2, #0x04
	/* Under 4 bytes left: point r0 back at the 3 unused bytes of ip */
340	sublt	r0, r0, #0x03
341	blt	.Lcopyin_l4
342
	/* Source offset 1: one word per iteration */
343.Lcopyin_bad1_loop4:
344#ifdef __ARMEB__
345	mov	r4, ip, lsl #8
346#else
347	mov	r4, ip, lsr #8
348#endif
349	ldrt	ip, [r0], #0x04
350	subs	r2, r2, #0x04
351#ifdef __ARMEB__
352	orr	r4, r4, ip, lsr #24
353#else
354	orr	r4, r4, ip, lsl #24
355#endif
356	str	r4, [r1], #0x04
357	bge	.Lcopyin_bad1_loop4
358	sub	r0, r0, #0x03
359	b	.Lcopyin_l4
360
	/* Source offset 2: 16 bytes per iteration */
361.Lcopyin_bad2_loop16:
362#ifdef __ARMEB__
363	mov	r4, ip, lsl #16
364#else
365	mov	r4, ip, lsr #16
366#endif
367	ldrt	r5, [r0], #0x04
368	pld	[r0, #0x018]
369	ldrt	r6, [r0], #0x04
370	ldrt	r7, [r0], #0x04
371	ldrt	ip, [r0], #0x04
372#ifdef __ARMEB__
373	orr	r4, r4, r5, lsr #16
374	mov	r5, r5, lsl #16
375	orr	r5, r5, r6, lsr #16
376	mov	r6, r6, lsl #16
377	orr	r6, r6, r7, lsr #16
378	mov	r7, r7, lsl #16
379	orr	r7, r7, ip, lsr #16
380#else
381	orr	r4, r4, r5, lsl #16
382	mov	r5, r5, lsr #16
383	orr	r5, r5, r6, lsl #16
384	mov	r6, r6, lsr #16
385	orr	r6, r6, r7, lsl #16
386	mov	r7, r7, lsr #16
387	orr	r7, r7, ip, lsl #16
388#endif
389	str	r4, [r1], #0x04
390	str	r5, [r1], #0x04
391	str	r6, [r1], #0x04
392	str	r7, [r1], #0x04
393.Lcopyin_bad2:
394	subs	r2, r2, #0x10
395	bge	.Lcopyin_bad2_loop16
396
397	adds	r2, r2, #0x10
398	ldmfdeq	sp!, {r4-r7}
399	RETeq				/* Return now if done */
400	subs	r2, r2, #0x04
	/* Under 4 bytes left: point r0 back at the 2 unused bytes of ip */
401	sublt	r0, r0, #0x02
402	blt	.Lcopyin_l4
403
	/* Source offset 2: one word per iteration */
404.Lcopyin_bad2_loop4:
405#ifdef __ARMEB__
406	mov	r4, ip, lsl #16
407#else
408	mov	r4, ip, lsr #16
409#endif
410	ldrt	ip, [r0], #0x04
411	subs	r2, r2, #0x04
412#ifdef __ARMEB__
413	orr	r4, r4, ip, lsr #16
414#else
415	orr	r4, r4, ip, lsl #16
416#endif
417	str	r4, [r1], #0x04
418	bge	.Lcopyin_bad2_loop4
419	sub	r0, r0, #0x02
420	b	.Lcopyin_l4
421
	/* Source offset 3: 16 bytes per iteration */
422.Lcopyin_bad3_loop16:
423#ifdef __ARMEB__
424	mov	r4, ip, lsl #24
425#else
426	mov	r4, ip, lsr #24
427#endif
428	ldrt	r5, [r0], #0x04
429	pld	[r0, #0x018]
430	ldrt	r6, [r0], #0x04
431	ldrt	r7, [r0], #0x04
432	ldrt	ip, [r0], #0x04
433#ifdef __ARMEB__
434	orr	r4, r4, r5, lsr #8
435	mov	r5, r5, lsl #24
436	orr	r5, r5, r6, lsr #8
437	mov	r6, r6, lsl #24
438	orr	r6, r6, r7, lsr #8
439	mov	r7, r7, lsl #24
440	orr	r7, r7, ip, lsr #8
441#else
442	orr	r4, r4, r5, lsl #8
443	mov	r5, r5, lsr #24
444	orr	r5, r5, r6, lsl #8
445	mov	r6, r6, lsr #24
446	orr	r6, r6, r7, lsl #8
447	mov	r7, r7, lsr #24
448	orr	r7, r7, ip, lsl #8
449#endif
450	str	r4, [r1], #0x04
451	str	r5, [r1], #0x04
452	str	r6, [r1], #0x04
453	str	r7, [r1], #0x04
454.Lcopyin_bad3:
455	subs	r2, r2, #0x10
456	bge	.Lcopyin_bad3_loop16
457
458	adds	r2, r2, #0x10
459	ldmfdeq	sp!, {r4-r7}
460	RETeq				/* Return now if done */
461	subs	r2, r2, #0x04
	/* Under 4 bytes left: point r0 back at the 1 unused byte of ip */
462	sublt	r0, r0, #0x01
463	blt	.Lcopyin_l4
464
	/* Source offset 3: one word per iteration */
465.Lcopyin_bad3_loop4:
466#ifdef __ARMEB__
467	mov	r4, ip, lsl #24
468#else
469	mov	r4, ip, lsr #24
470#endif
471	ldrt	ip, [r0], #0x04
472	subs	r2, r2, #0x04
473#ifdef __ARMEB__
474	orr	r4, r4, ip, lsr #8
475#else
476	orr	r4, r4, ip, lsl #8
477#endif
478	str	r4, [r1], #0x04
479	bge	.Lcopyin_bad3_loop4
480	sub	r0, r0, #0x01
481
	/*
	 * Common tail of the misaligned paths: r0 again addresses the next
	 * uncopied source byte and r2 is (bytes left - 4).  Pop r4-r7,
	 * clear r3 to match, and finish the last 0-3 bytes bytewise.
	 */
482.Lcopyin_l4:
483	ldmfd	sp!, {r4-r7}
484	mov	r3, #0x00
485	adds	r2, r2, #0x04
486	RETeq
	/*
	 * Copy the final r2 (1-3) bytes.  r2 becomes 3 - r2 and, as pc
	 * reads as the add + 8 (the first ldrbt) and each ldrbt/strb pair
	 * is 8 bytes, that many pairs are skipped; when it is 0 the add is
	 * not executed and all three pairs run.
	 */
487.Lcopyin_l4_2:
488	rsbs	r2, r2, #0x03
489	addne	pc, pc, r2, lsl #3
490	nop
491	ldrbt	ip, [r0], #0x01
492	strb	ip, [r1], #0x01
493	ldrbt	ip, [r0], #0x01
494	strb	ip, [r1], #0x01
495	ldrbt	ip, [r0]
496	strb	ip, [r1]
497	RET
498END(copyin)
499
500/*
501 * r0 = kernel space address
502 * r1 = user space address
503 * r2 = length
504 *
505 * Copies bytes from kernel space to user space
506 */
507ENTRY(copyout)
	/*
	 * Result in r0: 0 on success, EFAULT if a fault is taken while
	 * copying (see .Lcopyout_fault below).
	 *
	 * NOTE(review): the length test is signed, so a length with bit 31
	 * set also takes the early "success" return without copying.
	 */
508	cmp	r2, #0x00
509	movle	r0, #0x00
510	movle	pc, lr			/* Bail early if length is <= 0 */
511
	/*
	 * Optional fast path.  If the function pointer reached through
	 * .L_arm_memcpy is non-NULL and the length is at least the value
	 * reached through .L_min_memcpy_size, hand the copy to that
	 * function as (r0 = dst, r1 = src, r2 = len, r3 = flags).
	 * A zero return from it means the copy is done; anything else
	 * restores the original arguments and falls into .Lnormale.
	 */
512	ldr	r3, .L_arm_memcpy
513	ldr	r3, [r3]
514	cmp	r3, #0
515	beq	.Lnormale
516	ldr	r3, .L_min_memcpy_size
517	ldr	r3, [r3]
518	cmp	r2, r3
519	blt	.Lnormale
	/*
	 * Push an even number of words so that sp keeps the 8-byte
	 * alignment the AAPCS requires at a public interface (assuming sp
	 * was 8-byte aligned on entry).  r3 holds nothing we need; it is
	 * saved purely as padding.
	 */
520	stmfd	sp!, {r0-r4, lr}
521	mov     r3, r0
522	mov     r0, r1
523	mov     r1, r3
524	mov     r3, #1 /* DST_IS_USER */
525	ldr	r4, .L_arm_memcpy
	/* pc reads as this instruction + 8, so lr = the cmp after the call */
526	mov	lr, pc
527	ldr	pc, [r4]
528	cmp     r0, #0
	/* ldmfd leaves the flags from the cmp above untouched */
529	ldmfd   sp!, {r0-r4, lr}
530	moveq	r0, #0
531	RETeq
532
533.Lnormale:
534	stmfd	sp!, {r10-r11, lr}
535
	/* r10 = PCB pointer, kept for the whole copy */
536	GET_PCB(r10)
537	ldr	r10, [r10]
538
	/*
	 * r3 = 0: .Lcopyout_guts has nothing extra on the stack yet.
	 * Save the previous PCB_ONFAULT value in r11 and install our own
	 * fault entry point around the call to .Lcopyout_guts.
	 */
539	mov	r3, #0x00
540	adr	ip, .Lcopyout_fault
541	ldr	r11, [r10, #PCB_ONFAULT]
542	str	ip, [r10, #PCB_ONFAULT]
543	bl	.Lcopyout_guts
544	str	r11, [r10, #PCB_ONFAULT]
545	mov	r0, #0x00
546	ldmfd	sp!, {r10-r11, pc}
547
	/*
	 * Fault exit.  This address is stored in PCB_ONFAULT while
	 * .Lcopyout_guts runs; presumably the abort handler resumes here,
	 * with the registers as they were at the faulting access -- that
	 * mechanism is not visible in this file.  r10/r11 are still the PCB
	 * pointer and the saved PCB_ONFAULT value, and the sign of r3 says
	 * which registers .Lcopyout_guts still has pushed.
	 */
548.Lcopyout_fault:
549	ldr	r0, =EFAULT
550	str	r11, [r10, #PCB_ONFAULT]
551	cmp	r3, #0x00
552	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
553	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
554	ldmfd	sp!, {r10-r11, pc}
555
/*
 * .Lcopyout_guts: the copy proper, entered with `bl` from copyout.
 *
 * In:	r0 = kernel source, r1 = user destination, r2 = length (> 0),
 *	r3 = 0, lr = return address.
 * Every store to the user buffer uses the unprivileged forms
 * (strt/strbt); loads from the kernel buffer are plain ldrb/ldr/ldrd.
 * Scratch: ip, plus r4-r9 which are pushed while in use.  r3 records
 * what is currently pushed (-1: r4-r9, 0: nothing) for .Lcopyout_fault.
 * Returns with RET; r0-r2 and ip are clobbered.
 */
556.Lcopyout_guts:
557	pld	[r0]
558	/* Word-align the destination buffer */
559	ands	ip, r1, #0x03		/* Already word aligned? */
560	beq	.Lcopyout_wordaligned	/* Yup */
561	rsb	ip, ip, #0x04
562	cmp	r2, ip			/* Enough bytes left to align it? */
563	blt	.Lcopyout_l4_2		/* Nope. Just copy bytewise */
564	sub	r2, r2, ip
	/*
	 * Computed jump: ip becomes 3 - (bytes needed to align).  In the
	 * add, pc reads as that instruction + 8, i.e. the first ldrb, and
	 * each ldrb/strbt pair is 8 bytes, so ip pairs are skipped.  When
	 * ip == 0 the add is not executed and all three pairs run.
	 */
565	rsbs	ip, ip, #0x03
566	addne	pc, pc, ip, lsl #3
567	nop
568	ldrb	ip, [r0], #0x01
569	strbt	ip, [r1], #0x01
570	ldrb	ip, [r0], #0x01
571	strbt	ip, [r1], #0x01
572	ldrb	ip, [r0], #0x01
573	strbt	ip, [r1], #0x01
574	cmp	r2, #0x00		/* All done? */
575	RETeq
576
577	/* Destination buffer is now word aligned */
578.Lcopyout_wordaligned:
579	ands	ip, r0, #0x03		/* Is src also word-aligned? */
580	bne	.Lcopyout_bad_align	/* Nope. Things just got bad */
581	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
582	blt	.Lcopyout_w_less_than8
583
584	/* Quad-align the source buffer */
	/* (copy one word if needed so the ldrd loads below are 8-byte aligned) */
585	tst	r0, #0x07		/* Already quad aligned? */
586	ldrne	ip, [r0], #0x04
587	subne	r2, r2, #0x04
588	strtne	ip, [r1], #0x04
589
590	stmfd	sp!, {r4-r9}		/* Free up some registers */
591	mov	r3, #-1			/* Signal restore r4-r9 */
592
593	/* Destination buffer word aligned, source is quad aligned */
	/* r2 is kept biased by -0x80 while the 128-byte loop runs */
594	subs	r2, r2, #0x80
595	blt	.Lcopyout_w_lessthan128
596
597	/* Copy 128 bytes at a time */
598.Lcopyout_w_loop128:
599	ldrd	r4, [r0], #0x08		/* LD:00-07 */
600	pld	[r0, #0x18]		/* Prefetch 0x20 */
601	ldrd	r6, [r0], #0x08		/* LD:08-0f */
602	ldrd	r8, [r0], #0x08		/* LD:10-17 */
603	strt	r4, [r1], #0x04		/* ST:00-03 */
604	strt	r5, [r1], #0x04		/* ST:04-07 */
605	ldrd	r4, [r0], #0x08		/* LD:18-1f */
606	strt	r6, [r1], #0x04		/* ST:08-0b */
607	strt	r7, [r1], #0x04		/* ST:0c-0f */
608	ldrd	r6, [r0], #0x08		/* LD:20-27 */
609	pld	[r0, #0x18]		/* Prefetch 0x40 */
610	strt	r8, [r1], #0x04		/* ST:10-13 */
611	strt	r9, [r1], #0x04		/* ST:14-17 */
612	ldrd	r8, [r0], #0x08		/* LD:28-2f */
613	strt	r4, [r1], #0x04		/* ST:18-1b */
614	strt	r5, [r1], #0x04		/* ST:1c-1f */
615	ldrd	r4, [r0], #0x08		/* LD:30-37 */
616	strt	r6, [r1], #0x04		/* ST:20-23 */
617	strt	r7, [r1], #0x04		/* ST:24-27 */
618	ldrd	r6, [r0], #0x08		/* LD:38-3f */
619	strt	r8, [r1], #0x04		/* ST:28-2b */
620	strt	r9, [r1], #0x04		/* ST:2c-2f */
621	ldrd	r8, [r0], #0x08		/* LD:40-47 */
622	pld	[r0, #0x18]		/* Prefetch 0x60 */
623	strt	r4, [r1], #0x04		/* ST:30-33 */
624	strt	r5, [r1], #0x04		/* ST:34-37 */
625	ldrd	r4, [r0], #0x08		/* LD:48-4f */
626	strt	r6, [r1], #0x04		/* ST:38-3b */
627	strt	r7, [r1], #0x04		/* ST:3c-3f */
628	ldrd	r6, [r0], #0x08		/* LD:50-57 */
629	strt	r8, [r1], #0x04		/* ST:40-43 */
630	strt	r9, [r1], #0x04		/* ST:44-47 */
631	ldrd	r8, [r0], #0x08		/* LD:58-5f */
632	strt	r4, [r1], #0x04		/* ST:48-4b */
633	strt	r5, [r1], #0x04		/* ST:4c-4f */
634	ldrd	r4, [r0], #0x08		/* LD:60-67 */
635	pld	[r0, #0x18]		/* Prefetch 0x80 */
636	strt	r6, [r1], #0x04		/* ST:50-53 */
637	strt	r7, [r1], #0x04		/* ST:54-57 */
638	ldrd	r6, [r0], #0x08		/* LD:68-6f */
639	strt	r8, [r1], #0x04		/* ST:58-5b */
640	strt	r9, [r1], #0x04		/* ST:5c-5f */
641	ldrd	r8, [r0], #0x08		/* LD:70-77 */
642	strt	r4, [r1], #0x04		/* ST:60-63 */
643	strt	r5, [r1], #0x04		/* ST:64-67 */
644	ldrd	r4, [r0], #0x08		/* LD:78-7f */
645	strt	r6, [r1], #0x04		/* ST:68-6b */
646	strt	r7, [r1], #0x04		/* ST:6c-6f */
647	strt	r8, [r1], #0x04		/* ST:70-73 */
648	strt	r9, [r1], #0x04		/* ST:74-77 */
649	subs	r2, r2, #0x80
650	strt	r4, [r1], #0x04		/* ST:78-7b */
651	strt	r5, [r1], #0x04		/* ST:7c-7f */
652	bge	.Lcopyout_w_loop128
653
654.Lcopyout_w_lessthan128:
655	adds	r2, r2, #0x80		/* Adjust for extra sub */
656	ldmfdeq	sp!, {r4-r9}
657	RETeq				/* Return now if done */
	/* r2 is kept biased by -0x20 while the 32-byte loop runs */
658	subs	r2, r2, #0x20
659	blt	.Lcopyout_w_lessthan32
660
661	/* Copy 32 bytes at a time */
662.Lcopyout_w_loop32:
663	ldrd	r4, [r0], #0x08
664	pld	[r0, #0x18]
665	ldrd	r6, [r0], #0x08
666	ldrd	r8, [r0], #0x08
667	strt	r4, [r1], #0x04
668	strt	r5, [r1], #0x04
669	ldrd	r4, [r0], #0x08
670	strt	r6, [r1], #0x04
671	strt	r7, [r1], #0x04
672	strt	r8, [r1], #0x04
673	strt	r9, [r1], #0x04
674	subs	r2, r2, #0x20
675	strt	r4, [r1], #0x04
676	strt	r5, [r1], #0x04
677	bge	.Lcopyout_w_loop32
678
679.Lcopyout_w_lessthan32:
680	adds	r2, r2, #0x20		/* Adjust for extra sub */
681	ldmfdeq	sp!, {r4-r9}
682	RETeq				/* Return now if done */
683
	/*
	 * Computed jump into the three 8-byte copy blocks below.
	 * r4 = remaining length rounded down to a multiple of 8 (0, 8, 16
	 * or 24); r2 becomes the 0-7 bytes left after that, and the subs
	 * leaves Z set when nothing will remain (tested by the RETeq after
	 * the blocks; nothing in between changes the flags).
	 * Each block is four instructions (16 bytes) thanks to its nop, so
	 * adding r5 << 1 = (0x18 - r4) * 2 to pc skips the unneeded blocks.
	 */
684	and	r4, r2, #0x18
685	rsb	r5, r4, #0x18
686	subs	r2, r2, r4
687	add	pc, pc, r5, lsl #1
688	nop
689
690	/* At least 24 bytes remaining */
691	ldrd	r4, [r0], #0x08
692	strt	r4, [r1], #0x04
693	strt	r5, [r1], #0x04
694	nop
695
696	/* At least 16 bytes remaining */
697	ldrd	r4, [r0], #0x08
698	strt	r4, [r1], #0x04
699	strt	r5, [r1], #0x04
700	nop
701
702	/* At least 8 bytes remaining */
703	ldrd	r4, [r0], #0x08
704	strt	r4, [r1], #0x04
705	strt	r5, [r1], #0x04
706	nop
707
708	/* Less than 8 bytes remaining */
709	ldmfd	sp!, {r4-r9}
710	RETeq				/* Return now if done */
711	mov	r3, #0x00		/* Nothing is pushed any more */
712
	/*
	 * Fewer than 8 bytes left, both pointers word aligned, nothing
	 * pushed (r3 == 0).  Copy one word if at least 4 bytes remain,
	 * then the last one to three bytes.
	 */
713.Lcopyout_w_less_than8:
714	subs	r2, r2, #0x04
715	ldrge	ip, [r0], #0x04
716	strtge	ip, [r1], #0x04
717	RETeq				/* Return now if done */
718	addlt	r2, r2, #0x04		/* No word copied: undo the sub */
719	ldrb	ip, [r0], #0x01
720	cmp	r2, #0x02
721	ldrbge	r2, [r0], #0x01		/* 2nd byte; r2 reused as data */
722	strbt	ip, [r1], #0x01
723	ldrbgt	ip, [r0]		/* 3rd byte */
724	strbtge	r2, [r1], #0x01
725	strbtgt	ip, [r1]
726	RET
727
728/*
729 * At this point, it has not been possible to word align both buffers.
730 * The destination buffer (r1) is word aligned, but the source buffer
731 * (r0) is not.
 *
 * On entry ip = r0 & 3 (1, 2 or 3).  r0 is rounded down to a word
 * boundary and whole words are fetched with ldr; each output word is
 * assembled from the unused bytes of the previously loaded word (kept
 * in ip) and the leading bytes of the next one, using shifts whose
 * direction depends on __ARMEB__, and is stored to the user buffer
 * with strt.  There is one loop pair per source offset:
 * .Lcopyout_bad1/2/3.  r4-r7 are pushed and r3 is set to 1 so that
 * .Lcopyout_fault knows to pop them.
732 */
733.Lcopyout_bad_align:
734	stmfd	sp!, {r4-r7}
735	mov	r3, #0x01		/* Signal restore r4-r7 */
736	bic	r0, r0, #0x03
737	cmp	ip, #2			/* Test offset before ip is reused */
738	ldr	ip, [r0], #0x04		/* ip = first (partial) word */
739	bgt	.Lcopyout_bad3
740	beq	.Lcopyout_bad2
741	b	.Lcopyout_bad1
742
	/* Source offset 1: 16 bytes per iteration */
743.Lcopyout_bad1_loop16:
744#ifdef	__ARMEB__
745	mov	r4, ip, lsl #8
746#else
747	mov	r4, ip, lsr #8
748#endif
749	ldr	r5, [r0], #0x04
750	pld	[r0, #0x018]
751	ldr	r6, [r0], #0x04
752	ldr	r7, [r0], #0x04
753	ldr	ip, [r0], #0x04
754#ifdef	__ARMEB__
755	orr	r4, r4, r5, lsr #24
756	mov	r5, r5, lsl #8
757	orr	r5, r5, r6, lsr #24
758	mov	r6, r6, lsl #8
759	orr	r6, r6, r7, lsr #24
760	mov	r7, r7, lsl #8
761	orr	r7, r7, ip, lsr #24
762#else
763	orr	r4, r4, r5, lsl #24
764	mov	r5, r5, lsr #8
765	orr	r5, r5, r6, lsl #24
766	mov	r6, r6, lsr #8
767	orr	r6, r6, r7, lsl #24
768	mov	r7, r7, lsr #8
769	orr	r7, r7, ip, lsl #24
770#endif
771	strt	r4, [r1], #0x04
772	strt	r5, [r1], #0x04
773	strt	r6, [r1], #0x04
774	strt	r7, [r1], #0x04
775.Lcopyout_bad1:
776	subs	r2, r2, #0x10
777	bge	.Lcopyout_bad1_loop16
778
779	adds	r2, r2, #0x10
780	ldmfdeq	sp!, {r4-r7}
781	RETeq				/* Return now if done */
782	subs	r2, r2, #0x04
	/* Under 4 bytes left: point r0 back at the 3 unused bytes of ip */
783	sublt	r0, r0, #0x03
784	blt	.Lcopyout_l4
785
	/* Source offset 1: one word per iteration */
786.Lcopyout_bad1_loop4:
787#ifdef __ARMEB__
788	mov	r4, ip, lsl #8
789#else
790	mov	r4, ip, lsr #8
791#endif
792	ldr	ip, [r0], #0x04
793	subs	r2, r2, #0x04
794#ifdef __ARMEB__
795	orr	r4, r4, ip, lsr #24
796#else
797	orr	r4, r4, ip, lsl #24
798#endif
799	strt	r4, [r1], #0x04
800	bge	.Lcopyout_bad1_loop4
801	sub	r0, r0, #0x03
802	b	.Lcopyout_l4
803
	/* Source offset 2: 16 bytes per iteration */
804.Lcopyout_bad2_loop16:
805#ifdef __ARMEB__
806	mov	r4, ip, lsl #16
807#else
808	mov	r4, ip, lsr #16
809#endif
810	ldr	r5, [r0], #0x04
811	pld	[r0, #0x018]
812	ldr	r6, [r0], #0x04
813	ldr	r7, [r0], #0x04
814	ldr	ip, [r0], #0x04
815#ifdef __ARMEB__
816	orr	r4, r4, r5, lsr #16
817	mov	r5, r5, lsl #16
818	orr	r5, r5, r6, lsr #16
819	mov	r6, r6, lsl #16
820	orr	r6, r6, r7, lsr #16
821	mov	r7, r7, lsl #16
822	orr	r7, r7, ip, lsr #16
823#else
824	orr	r4, r4, r5, lsl #16
825	mov	r5, r5, lsr #16
826	orr	r5, r5, r6, lsl #16
827	mov	r6, r6, lsr #16
828	orr	r6, r6, r7, lsl #16
829	mov	r7, r7, lsr #16
830	orr	r7, r7, ip, lsl #16
831#endif
832	strt	r4, [r1], #0x04
833	strt	r5, [r1], #0x04
834	strt	r6, [r1], #0x04
835	strt	r7, [r1], #0x04
836.Lcopyout_bad2:
837	subs	r2, r2, #0x10
838	bge	.Lcopyout_bad2_loop16
839
840	adds	r2, r2, #0x10
841	ldmfdeq	sp!, {r4-r7}
842	RETeq				/* Return now if done */
843	subs	r2, r2, #0x04
	/* Under 4 bytes left: point r0 back at the 2 unused bytes of ip */
844	sublt	r0, r0, #0x02
845	blt	.Lcopyout_l4
846
	/* Source offset 2: one word per iteration */
847.Lcopyout_bad2_loop4:
848#ifdef __ARMEB__
849	mov	r4, ip, lsl #16
850#else
851	mov	r4, ip, lsr #16
852#endif
853	ldr	ip, [r0], #0x04
854	subs	r2, r2, #0x04
855#ifdef __ARMEB__
856	orr	r4, r4, ip, lsr #16
857#else
858	orr	r4, r4, ip, lsl #16
859#endif
860	strt	r4, [r1], #0x04
861	bge	.Lcopyout_bad2_loop4
862	sub	r0, r0, #0x02
863	b	.Lcopyout_l4
864
	/* Source offset 3: 16 bytes per iteration */
865.Lcopyout_bad3_loop16:
866#ifdef __ARMEB__
867	mov	r4, ip, lsl #24
868#else
869	mov	r4, ip, lsr #24
870#endif
871	ldr	r5, [r0], #0x04
872	pld	[r0, #0x018]
873	ldr	r6, [r0], #0x04
874	ldr	r7, [r0], #0x04
875	ldr	ip, [r0], #0x04
876#ifdef __ARMEB__
877	orr	r4, r4, r5, lsr #8
878	mov	r5, r5, lsl #24
879	orr	r5, r5, r6, lsr #8
880	mov	r6, r6, lsl #24
881	orr	r6, r6, r7, lsr #8
882	mov	r7, r7, lsl #24
883	orr	r7, r7, ip, lsr #8
884#else
885	orr	r4, r4, r5, lsl #8
886	mov	r5, r5, lsr #24
887	orr	r5, r5, r6, lsl #8
888	mov	r6, r6, lsr #24
889	orr	r6, r6, r7, lsl #8
890	mov	r7, r7, lsr #24
891	orr	r7, r7, ip, lsl #8
892#endif
893	strt	r4, [r1], #0x04
894	strt	r5, [r1], #0x04
895	strt	r6, [r1], #0x04
896	strt	r7, [r1], #0x04
897.Lcopyout_bad3:
898	subs	r2, r2, #0x10
899	bge	.Lcopyout_bad3_loop16
900
901	adds	r2, r2, #0x10
902	ldmfdeq	sp!, {r4-r7}
903	RETeq				/* Return now if done */
904	subs	r2, r2, #0x04
	/* Under 4 bytes left: point r0 back at the 1 unused byte of ip */
905	sublt	r0, r0, #0x01
906	blt	.Lcopyout_l4
907
	/* Source offset 3: one word per iteration */
908.Lcopyout_bad3_loop4:
909#ifdef __ARMEB__
910	mov	r4, ip, lsl #24
911#else
912	mov	r4, ip, lsr #24
913#endif
914	ldr	ip, [r0], #0x04
915	subs	r2, r2, #0x04
916#ifdef __ARMEB__
917	orr	r4, r4, ip, lsr #8
918#else
919	orr	r4, r4, ip, lsl #8
920#endif
921	strt	r4, [r1], #0x04
922	bge	.Lcopyout_bad3_loop4
923	sub	r0, r0, #0x01
924
	/*
	 * Common tail of the misaligned paths: r0 again addresses the next
	 * uncopied source byte and r2 is (bytes left - 4).  Pop r4-r7,
	 * clear r3 to match, and finish the last 0-3 bytes bytewise.
	 */
925.Lcopyout_l4:
926	ldmfd	sp!, {r4-r7}
927	mov	r3, #0x00
928	adds	r2, r2, #0x04
929	RETeq
	/*
	 * Copy the final r2 (1-3) bytes.  r2 becomes 3 - r2 and, as pc
	 * reads as the add + 8 (the first ldrb) and each ldrb/strbt pair
	 * is 8 bytes, that many pairs are skipped; when it is 0 the add is
	 * not executed and all three pairs run.
	 */
930.Lcopyout_l4_2:
931	rsbs	r2, r2, #0x03
932	addne	pc, pc, r2, lsl #3
933	nop
934	ldrb	ip, [r0], #0x01
935	strbt	ip, [r1], #0x01
936	ldrb	ip, [r0], #0x01
937	strbt	ip, [r1], #0x01
938	ldrb	ip, [r0]
939	strbt	ip, [r1]
940	RET
941END(copyout)
942
943