/*
 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*****************************************************************************
 * ARMv5 and ARMv6 implementation, also used in dyld on later archs          *
 *****************************************************************************/

#include <arm/arch.h>

.text
.align 2

	.globl _memcpy
	.globl _bcopy
	.globl _memmove

_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); */
	mov		r3, r0
	mov		r0, r1
	mov		r1, r3
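	/* fall through into memcpy with the arguments swapped into (dest, src, len) order */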

_memcpy:		/* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:	/* void *memmove(void *dest, const void *src, size_t len); */
	/* check for zero len or if the pointers are the same */
	cmp		r2, #0
	cmpne	r0, r1
	bxeq	lr

	/* save r0 (return value), r4 and r5 (scratch), r7 (frame pointer), and lr */
	stmfd	sp!, { r0, r4, r5, r7, lr }
	add	r7, sp, #12			/* set up the frame pointer */

	/* check for overlap. r3 <- distance between src & dest */
	subhs	r3, r0, r1
	sublo	r3, r1, r0
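	/* HS/LO above use the flags from the cmpne, so r3 = |dest - src| */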
	cmp		r3, r2			/* if distance(src, dest) < len, we have overlap */
	blo		Loverlap

Lnormalforwardcopy:
	/* are src and dest dissimilarly word aligned? */
	mov		r12, r0, lsl #30
	cmp		r12, r1, lsl #30
	bne		Lnonwordaligned_forward

	/* if len < 64, do a quick forward copy */
	cmp		r2, #64
	blt		Lsmallforwardcopy

	/* check for 16 byte src/dest unalignment */
	tst		r0, #0xf
	bne		Lsimilarlyunaligned

	/* check for 32 byte dest unalignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32

Lmorethan64_aligned:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop:
	/* copy 64 bytes at a time */
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Llessthan64_aligned:
	/* copy 16 bytes at a time until we have < 16 bytes */
	cmp		r2, #16
	ldmiage	r1!, { r3, r4, r5, r12 }
	stmiage	r0!, { r3, r4, r5, r12 }
	subsge	r2, r2, #16
	bgt		Llessthan64_aligned
	beq		Lexit

Llessthan16_aligned:
	mov		r2, r2, lsl #28
	msr		cpsr_f, r2
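	/*
	 * len < 16 here, so its low four bits now sit in the flags
	 * (bit 3 -> N, bit 2 -> Z, bit 1 -> C, bit 0 -> V); each conditional
	 * transfer below fires on one bit, moving 8, 4, 2 or 1 bytes.
	 */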

	ldmiami	r1!, { r2, r3 }
	ldreq	r4, [r1], #4
	ldrhcs	r5, [r1], #2
	ldrbvs	r12, [r1], #1

	stmiami	r0!, { r2, r3 }
	streq	r4, [r0], #4
	strhcs	r5, [r0], #2
	strbvs	r12, [r0], #1
	b		Lexit

Lsimilarlyunaligned:
	/* both src and dest are unaligned in similar ways; align dest to a 16 byte boundary (the 32 byte step follows) */
	mov		r12, r0, lsl #28
	rsb		r12, r12, #0
	msr		cpsr_f, r12
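	/*
	 * r12 = (16 - (dest & 0xf)) << 28: the top nibble is the byte count
	 * needed to 16 byte align dest, and the msr above spreads its four
	 * bits across N/Z/C/V so the conditional copies below move exactly
	 * that many bytes; r12 >> 28 is subtracted from len afterwards.
	 */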

	ldrbvs	r3, [r1], #1
	ldrhcs	r4, [r1], #2
	ldreq	r5, [r1], #4

	strbvs	r3, [r0], #1
	strhcs	r4, [r0], #2
	streq	r5, [r0], #4

	ldmiami	r1!, { r3, r4 }
	stmiami	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmiane	r1!, { r3, r4, r5, r12 }
	stmiane	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned
	b		Llessthan64_aligned

Lbytewise2:
	/* copy 2 bytes at a time */
	subs	r2, r2, #2
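	/* after the subtract: PL means a second byte exists, HI means more bytes remain beyond these two */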

	ldrb	r3, [r1], #1
	ldrbpl	r4, [r1], #1

	strb	r3, [r0], #1
	strbpl	r4, [r0], #1

	bhi		Lbytewise2
	b		Lexit

Lbytewise:
	/* simple bytewise forward copy */
	ldrb	r3, [r1], #1
	subs	r2, r2, #1
	strb	r3, [r0], #1
	bne		Lbytewise
	b		Lexit

Lsmallforwardcopy:
	/* src and dest are word aligned similarly, less than 64 bytes to copy */
	cmp		r2, #4
	blt		Lbytewise2

	/* bytewise copy until word aligned */
	tst		r1, #3
Lwordalignloop:
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop

	cmp		r2, #16
	bge		Llessthan64_aligned
	blt		Llessthan16_aligned

Loverlap:
	/* src and dest overlap in some way, len > 0 */
	cmp		r0, r1				/* if dest > src */
	bhi		Loverlap_srclower

Loverlap_destlower:
	/* dest < src, see if we can still do a fast forward copy or fall back to the slow forward copy */
	cmp		r3, #64
	bge		Lnormalforwardcopy 	/* overlap is greater than one stride of the copy, use normal copy */

	cmp		r3, #2
	bge		Lbytewise2
	b		Lbytewise

	/* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
	/* src < dest, with overlap */

	/* src += len; dest += len; */
	add		r0, r0, r2
	add		r1, r1, r2

	/* we have to copy in reverse no matter what; test if we can use a large block reverse copy */
	cmp		r2, #64				/* less than 64 bytes to copy? */
	cmpgt	r3, #64				/* if len > 64, less than 64 bytes of nonoverlap? */
	blt		Lbytewise_reverse

	/* test if src and dest are word aligned dissimilarly */
	mov		r3, r0, lsl #30
	cmp		r3, r1, lsl #30
	bne		Lbytewise_reverse

	/* test if dest is not 16 byte aligned (this also catches sub word misalignment) */
	tst		r0, #0xf
	bne		Lunaligned_reverse_similarly

	/* test for dest 32 byte alignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32_reverse_similarly

	/* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop_reverse:
	/* copy 64 bytes at a time */
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop_reverse

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Lbytewise_reverse:
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	subs	r2, r2, #1
	bne		Lbytewise_reverse
	b		Lexit

Lunaligned_reverse_similarly:
	/* both src and dest are unaligned in similar ways; step dest down to a 16 byte boundary (the 32 byte step follows) */
	mov		r12, r0, lsl #28
	msr		cpsr_f, r12
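	/*
	 * no rsb here, unlike the forward case: copying downwards, reaching a
	 * 16 byte boundary takes exactly dest & 0xf bytes, so that nibble is
	 * dropped straight into N/Z/C/V to drive the pre-decremented copies.
	 */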

	ldrbvs	r3, [r1, #-1]!
	ldrhcs	r4, [r1, #-2]!
	ldreq	r5, [r1, #-4]!

	strbvs	r3, [r0, #-1]!
	strhcs	r4, [r0, #-2]!
	streq	r5, [r0, #-4]!

	ldmdbmi	r1!, { r3, r4 }
	stmdbmi	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32_reverse_similarly:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmdbne	r1!, { r3, r4, r5, r12 }
	stmdbne	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned_reverse
	b		Lbytewise_reverse

	/* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
	cmp		r2, #8
	blt		Lbytewise2			/* not worth the effort with less than 8 bytes total */

	/* bytewise copy until src word aligned */
	tst		r1, #3
Lwordalignloop2:
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop2

	/* figure out how the src and dest are unaligned */
	and		r3, r0, #3
	cmp		r3, #2
	blt		Lalign1_forward
	beq		Lalign2_forward
	bgt		Lalign3_forward
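	/*
	 * The three loops below all use the same trick: src is word aligned
	 * by now, so dest is backed up to the previous word boundary, the
	 * bytes in front of it are reloaded (and later re-stored unchanged),
	 * and every store in the loop is a full aligned word. The high bytes
	 * of each source word are carried in r4 into the next iteration.
	 * Roughly, with off = dest & 3, each iteration does:
	 *
	 *     s = *src++;
	 *     *dst++ = carry | (s << 8*off);
	 *     carry = s >> (32 - 8*off);
	 */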

Lalign1_forward:
	/* the dest pointer is 1 byte off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #1

	/* prime the copy */
	ldrb	r4, [r0]			/* load D[7:0] */

Lalign1_forward_loop:
	ldr		r3, [r1], #4		/* load S */
	orr		r4, r4, r3, lsl #8	/* D[31:8] = S[23:0] */
	str		r4, [r0], #4		/* save D */
	mov		r4, r3, lsr #24		/* D[7:0] = S[31:24] */
	subs	r12, r12, #1
	bne		Lalign1_forward_loop

	/* finish the copy off */
	strb	r4, [r0], #1		/* save D[7:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign2_forward:
	/* the dest pointer is 2 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #2

	/* prime the copy */
	ldrh	r4, [r0]			/* load D[15:0] */

Lalign2_forward_loop:
	ldr		r3, [r1], #4		/* load S */
	orr		r4, r4, r3, lsl #16	/* D[31:16] = S[15:0] */
	str		r4, [r0], #4		/* save D */
	mov		r4, r3, lsr #16		/* D[15:0] = S[31:16] */
	subs	r12, r12, #1
	bne		Lalign2_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2		/* save D[15:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign3_forward:
	/* the dest pointer is 3 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #3

	/* prime the copy */
	ldr		r4, [r0]
	and		r4, r4, #0x00ffffff	/* load D[23:0] */

Lalign3_forward_loop:
	ldr		r3, [r1], #4		/* load S */
	orr		r4, r4, r3, lsl #24	/* D[31:24] = S[7:0] */
	str		r4, [r0], #4		/* save D */
	mov		r4, r3, lsr #8		/* D[23:0] = S[31:8] */
	subs	r12, r12, #1
	bne		Lalign3_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2		/* save D[15:0] */
	mov		r4, r4, lsr #16
	strb	r4, [r0], #1		/* save D[23:16] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lexit:
	ldmfd	sp!, {r0, r4, r5, r7, pc}