1/*	$OpenBSD: memcpy.S,v 1.6 2015/06/08 14:22:05 jsg Exp $	*/
2/*	$NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/
3
4/*-
5 * Copyright (c) 1997 The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Neil A. Carson and Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#include <machine/asm.h>
34
35/*
36 * This is one fun bit of code ...
37 * Some easy listening music is suggested while trying to understand this
38 * code e.g. Iron Maiden
39 *
40 * For anyone attempting to understand it :
41 *
42 * The core code is implemented here with simple stubs for memcpy()
43 * memmove() and bcopy().
44 *
45 * All local labels are prefixed with Lmemcpy_
46 * Following the prefix a label starting f is used in the forward copy code
47 * while a label using b is used in the backwards copy code
48 * The source and destination addresses determine whether a forward or
49 * backward copy is performed.
50 * Separate bits of code are used to deal with the following situations
51 * for both the forward and backwards copy.
52 * unaligned source address
53 * unaligned destination address
54 * Separate copy routines are used to produce an optimised result for each
55 * of these cases.
56 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
57 * a time where possible.
58 *
59 * Note: r12 (aka ip) can be trashed during the function along with
60 * r0-r3, although r0-r2 have defined uses throughout, i.e. dest, src, len.
61 * Additional registers are preserved prior to use, i.e. r4, r5 & lr.
62 *
63 * Apologies for the state of the comments ;-)
64 */
65
66.syntax unified
67
/*
 * memcpy / memmove -- shared entry.
 *
 * The two names are declared back to back (through the ENTRY/ENTRY_NP
 * macros from <machine/asm.h>) and run the same code, so the copy
 * direction is always chosen from the relative order of the pointers.
 *
 * In:  r0 = destination, r1 = source, r2 = byte count
 * Out: r0 = the destination value passed in (pushed below and popped
 *      again by every return site)
 * Scratch: r3, r12; lr is pushed here, r4/r5 are pushed where borrowed.
 */
68ENTRY(memcpy)
69ENTRY_NP(memmove)
70	/* Determine copy direction */
71	cmp	r1, r0
72
	/* src == dest: nothing to move, r0 already holds the return value */
73	moveq	pc, lr
74
75	/* save leaf functions having to store this away */
76	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
77
	/* flags still come from the cmp above: src < dest (unsigned) */
78	bcc	Lmemcpy_backwards
79
80	/* start of forwards copy */
	/* from here on r2 is kept biased: r2 = bytes remaining - 4 */
81	subs	r2, r2, #4
82	blt	Lmemcpy_fl4		/* less than 4 bytes */
	/* r12 = misalignment (1..3) handed to the unaligned handlers */
83	ands	r12, r0, #3
84	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
85	ands	r12, r1, #3
86	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */
87
/*
 * Forward copy with both pointers word aligned.
 * r2 arrives as (bytes remaining - 4) and is re-biased at each stage so
 * that a single signed test tells whether the next chunk size still fits.
 * Falls through into Lmemcpy_fl4 with r2 = (bytes remaining - 4) again.
 */
88Lmemcpy_ft8:
89	/* We have aligned source and destination */
90	subs	r2, r2, #8
91	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
92	subs	r2, r2, #0x14
93	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
94	stmdb	sp!, {r4}		/* borrow r4 */
95
96	/* blat 32 bytes at a time */
97	/* XXX for really big copies perhaps we should use more registers */
	/* r2 = bytes remaining - 32; lr is usable as data, it was pushed on entry */
98Lmemcpy_floop32:
99	ldmia	r1!, {r3, r4, r12, lr}
100	stmia	r0!, {r3, r4, r12, lr}
101	ldmia	r1!, {r3, r4, r12, lr}
102	stmia	r0!, {r3, r4, r12, lr}
103	subs	r2, r2, #0x20
104	bge	Lmemcpy_floop32
105
	/* r2 + 0x10 >= 0 means at least 16 bytes are still left */
106	cmn	r2, #0x10
107	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
108	stmiage	r0!, {r3, r4, r12, lr}
109	subge	r2, r2, #0x10
110	ldmia	sp!, {r4}		/* return r4 */
111
112Lmemcpy_fl32:
	/* re-bias to r2 = bytes remaining - 12; the flags drive the ge ops below */
113	adds	r2, r2, #0x14
114
115	/* blat 12 bytes at a time */
116Lmemcpy_floop12:
117	ldmiage	r1!, {r3, r12, lr}
118	stmiage	r0!, {r3, r12, lr}
119	subsge	r2, r2, #0x0c
	/* when nothing was copied the flags are untouched (lt) and we fall out */
120	bge	Lmemcpy_floop12
121
122Lmemcpy_fl12:
	/* re-bias to r2 = bytes remaining - 4 */
123	adds	r2, r2, #8
124	blt	Lmemcpy_fl4
125
	/* 4..11 bytes left: move one word (lt) or two words (ge) */
126	subs	r2, r2, #4
127	ldrlt	r3, [r1], #4
128	strlt	r3, [r0], #4
129	ldmiage	r1!, {r3, r12}
130	stmiage	r0!, {r3, r12}
131	subge	r2, r2, #4
132
/*
 * Forward tail and return.
 * r2 arrives as (bytes remaining - 4); after the add it is the real count,
 * expected to be 0..3.  Both exits pop the {r0, lr} pair pushed on entry
 * into {r0, pc}, i.e. return with r0 = original destination.
 */
133Lmemcpy_fl4:
134	/* less than 4 bytes to go */
135	adds	r2, r2, #4
	/*
	 * NOTE(review): the guard below is spelled __APCS_26_ (a single
	 * trailing underscore) and none of the other three return sites in
	 * this file has a "^" variant -- confirm whether this branch is ever
	 * compiled.
	 */
136#ifdef __APCS_26_
137	ldmiaeq	sp!, {r0, pc}^		/* done */
138#else
139	ldmiaeq	sp!, {r0, pc}		/* done */
140#endif
141	/* copy the crud byte at a time */
	/* first byte always, second when r2 >= 2, third when r2 > 2 */
142	cmp	r2, #2
143	ldrb	r3, [r1], #1
144	strb	r3, [r0], #1
145	ldrbge	r3, [r1], #1
146	strbge	r3, [r0], #1
147	ldrbgt	r3, [r1], #1
148	strbgt	r3, [r0], #1
149	ldmia	sp!, {r0, pc}
150
151	/* erg - unaligned destination */
/*
 * Forward copy, destination not word aligned.
 * In: r12 = dest & 3 (1..3), r2 = bytes remaining - 4 (>= 0 here).
 * Copies 4 - r12 single bytes so that r0 becomes word aligned, then
 * continues with the aligned or the unaligned-source code.
 */
152Lmemcpy_fdestul:
	/* r12 = number of bytes needed to reach the next word boundary */
153	rsb	r12, r12, #4
154	cmp	r12, #2
155
156	/* align destination with byte copies */
	/* first byte always, second when r12 >= 2, third when r12 == 3 */
157	ldrb	r3, [r1], #1
158	strb	r3, [r0], #1
159	ldrbge	r3, [r1], #1
160	strbge	r3, [r0], #1
161	ldrbgt	r3, [r1], #1
162	strbgt	r3, [r0], #1
163	subs	r2, r2, r12
164	blt	Lmemcpy_fl4		/* less than 4 bytes */
165
	/* r12 = src & 3; when non-zero it is consumed by the code that follows */
166	ands	r12, r1, #3
167	beq	Lmemcpy_ft8		/* we have an aligned source */
168
169	/* erg - unaligned source */
170	/* This is where it gets nasty ... */
/*
 * Forward copy, destination word aligned, source not.
 * In: r12 = src & 3 (1..3), r2 = bytes remaining - 4 (>= 0).
 * The source pointer is rounded down to a word boundary and only whole
 * words are loaded; every output word is stitched together from two
 * neighbouring source words with a shift pair chosen by the offset.
 * lr always holds the most recently loaded, partly consumed source word.
 * NOTE(review): the shift directions look little-endian only -- confirm
 * that no big-endian configuration builds this file.
 */
171Lmemcpy_fsrcul:
172	bic	r1, r1, #3
173	ldr	lr, [r1], #4
	/* dispatch on the source offset: 3, 2, or fall through for 1 */
174	cmp	r12, #2
175	bgt	Lmemcpy_fsrcul3
176	beq	Lmemcpy_fsrcul2
	/* offset 1: fewer than 16 bytes left goes straight to the word loop */
177	cmp	r2, #0x0c
178	blt	Lmemcpy_fsrcul1loop4
	/* re-bias to r2 = bytes remaining - 16 and borrow r4/r5 */
179	sub	r2, r2, #0x0c
180	stmdb	sp!, {r4, r5}
181
	/* 16 bytes per pass: out = (previous word >> 8) | (next word << 24) */
182Lmemcpy_fsrcul1loop16:
183	mov	r3, lr, lsr #8
184	ldmia	r1!, {r4, r5, r12, lr}
185	orr	r3, r3, r4, lsl #24
186	mov	r4, r4, lsr #8
187	orr	r4, r4, r5, lsl #24
188	mov	r5, r5, lsr #8
189	orr	r5, r5, r12, lsl #24
190	mov	r12, r12, lsr #8
191	orr	r12, r12, lr, lsl #24
192	stmia	r0!, {r3-r5, r12}
193	subs	r2, r2, #0x10
194	bge	Lmemcpy_fsrcul1loop16
195	ldmia	sp!, {r4, r5}
	/* back to r2 = bytes remaining - 4 */
196	adds	r2, r2, #0x0c
197	blt	Lmemcpy_fsrcul1l4
198
	/* one word per pass, same stitching as above */
199Lmemcpy_fsrcul1loop4:
200	mov	r12, lr, lsr #8
201	ldr	lr, [r1], #4
202	orr	r12, r12, lr, lsl #24
203	str	r12, [r0], #4
204	subs	r2, r2, #4
205	bge	Lmemcpy_fsrcul1loop4
206
207Lmemcpy_fsrcul1l4:
	/* r1 is one word past the last load, of which 1 byte has been used */
208	sub	r1, r1, #3
209	b	Lmemcpy_fl4
210
/*
 * Forward copy, aligned destination, source offset 2 within its word.
 * In: lr = source word already loaded, r1 = address of the next source
 * word, r2 = bytes remaining - 4 (>= 0).
 * Each output word is (previous word >> 16) | (next word << 16).
 */
211Lmemcpy_fsrcul2:
	/* fewer than 16 bytes left goes straight to the word loop */
212	cmp	r2, #0x0c
213	blt	Lmemcpy_fsrcul2loop4
	/* re-bias to r2 = bytes remaining - 16 and borrow r4/r5 */
214	sub	r2, r2, #0x0c
215	stmdb	sp!, {r4, r5}
216
	/* 16 bytes per pass; lr carries the leftover half word between passes */
217Lmemcpy_fsrcul2loop16:
218	mov	r3, lr, lsr #16
219	ldmia	r1!, {r4, r5, r12, lr}
220	orr	r3, r3, r4, lsl #16
221	mov	r4, r4, lsr #16
222	orr	r4, r4, r5, lsl #16
223	mov	r5, r5, lsr #16
224	orr	r5, r5, r12, lsl #16
225	mov	r12, r12, lsr #16
226	orr	r12, r12, lr, lsl #16
227	stmia	r0!, {r3-r5, r12}
228	subs	r2, r2, #0x10
229	bge	Lmemcpy_fsrcul2loop16
230	ldmia	sp!, {r4, r5}
	/* back to r2 = bytes remaining - 4 */
231	adds	r2, r2, #0x0c
232	blt	Lmemcpy_fsrcul2l4
233
	/* one word per pass */
234Lmemcpy_fsrcul2loop4:
235	mov	r12, lr, lsr #16
236	ldr	lr, [r1], #4
237	orr	r12, r12, lr, lsl #16
238	str	r12, [r0], #4
239	subs	r2, r2, #4
240	bge	Lmemcpy_fsrcul2loop4
241
242Lmemcpy_fsrcul2l4:
	/* r1 is one word past the last load, of which 2 bytes have been used */
243	sub	r1, r1, #2
244	b	Lmemcpy_fl4
245
/*
 * Forward copy, aligned destination, source offset 3 within its word.
 * In: lr = source word already loaded, r1 = address of the next source
 * word, r2 = bytes remaining - 4 (>= 0).
 * Each output word is (previous word >> 24) | (next word << 8).
 */
246Lmemcpy_fsrcul3:
	/* fewer than 16 bytes left goes straight to the word loop */
247	cmp	r2, #0x0c
248	blt	Lmemcpy_fsrcul3loop4
	/* re-bias to r2 = bytes remaining - 16 and borrow r4/r5 */
249	sub	r2, r2, #0x0c
250	stmdb	sp!, {r4, r5}
251
	/* 16 bytes per pass; lr carries the leftover byte between passes */
252Lmemcpy_fsrcul3loop16:
253	mov	r3, lr, lsr #24
254	ldmia	r1!, {r4, r5, r12, lr}
255	orr	r3, r3, r4, lsl #8
256	mov	r4, r4, lsr #24
257	orr	r4, r4, r5, lsl #8
258	mov	r5, r5, lsr #24
259	orr	r5, r5, r12, lsl #8
260	mov	r12, r12, lsr #24
261	orr	r12, r12, lr, lsl #8
262	stmia	r0!, {r3-r5, r12}
263	subs	r2, r2, #0x10
264	bge	Lmemcpy_fsrcul3loop16
265	ldmia	sp!, {r4, r5}
	/* back to r2 = bytes remaining - 4 */
266	adds	r2, r2, #0x0c
267	blt	Lmemcpy_fsrcul3l4
268
	/* one word per pass */
269Lmemcpy_fsrcul3loop4:
270	mov	r12, lr, lsr #24
271	ldr	lr, [r1], #4
272	orr	r12, r12, lr, lsl #8
273	str	r12, [r0], #4
274	subs	r2, r2, #4
275	bge	Lmemcpy_fsrcul3loop4
276
277Lmemcpy_fsrcul3l4:
	/* r1 is one word past the last load, of which 3 bytes have been used */
278	sub	r1, r1, #1
279	b	Lmemcpy_fl4
280
/*
 * Backward copy, taken from the entry code when src < dest (unsigned).
 * Both pointers are advanced to one past the end of their buffers and the
 * data is then moved from the highest address downwards.
 * As in the forward path, r2 is kept as (bytes remaining - 4).
 */
281Lmemcpy_backwards:
282	add	r1, r1, r2
283	add	r0, r0, r2
284	subs	r2, r2, #4
285	blt	Lmemcpy_bl4		/* less than 4 bytes */
	/* r12 = misalignment (1..3) of the end pointer, used by the handlers */
286	ands	r12, r0, #3
287	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
288	ands	r12, r1, #3
289	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */
290
/*
 * Backward copy with both end pointers word aligned.
 * r2 arrives as (bytes remaining - 4) and is re-biased per stage exactly
 * like the forward version; all transfers use decrement-before addressing.
 * Falls through into Lmemcpy_bl4 with r2 = (bytes remaining - 4) again.
 */
291Lmemcpy_bt8:
292	/* We have aligned source and destination */
293	subs	r2, r2, #8
294	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	/* r4 is borrowed for both the 32-byte loop and the 16/12-byte mop-up */
295	stmdb	sp!, {r4}
296	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
297	blt	Lmemcpy_bl32
298
299	/* blat 32 bytes at a time */
300	/* XXX for really big copies perhaps we should use more registers */
	/* r2 = bytes remaining - 32 */
301Lmemcpy_bloop32:
302	ldmdb	r1!, {r3, r4, r12, lr}
303	stmdb	r0!, {r3, r4, r12, lr}
304	ldmdb	r1!, {r3, r4, r12, lr}
305	stmdb	r0!, {r3, r4, r12, lr}
306	subs	r2, r2, #0x20
307	bge	Lmemcpy_bloop32
308
309Lmemcpy_bl32:
	/* r2 + 0x10 >= 0 means at least 16 bytes are still left */
310	cmn	r2, #0x10
311	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
312	stmdbge	r0!, {r3, r4, r12, lr}
313	subge	r2, r2, #0x10
	/* re-bias to r2 = bytes remaining - 12 */
314	adds	r2, r2, #0x14
315	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
316	stmdbge	r0!, {r3, r12, lr}
317	subge	r2, r2, #0x0c
318	ldmia	sp!, {r4}
319
320Lmemcpy_bl12:
	/* re-bias to r2 = bytes remaining - 4 */
321	adds	r2, r2, #8
322	blt	Lmemcpy_bl4
	/* 4..11 bytes left: move one word (lt) or two words (ge) */
323	subs	r2, r2, #4
324	ldrlt	r3, [r1, #-4]!
325	strlt	r3, [r0, #-4]!
326	ldmdbge	r1!, {r3, r12}
327	stmdbge	r0!, {r3, r12}
328	subge	r2, r2, #4
329
/*
 * Backward tail and return.
 * r2 arrives as (bytes remaining - 4); after the add it is the real count,
 * expected to be 0..3.  Both exits pop the {r0, lr} pair pushed on entry
 * into {r0, pc}, i.e. return with r0 = original destination.
 */
330Lmemcpy_bl4:
331	/* less than 4 bytes to go */
332	adds	r2, r2, #4
333	ldmiaeq	sp!, {r0, pc}
334
335	/* copy the crud byte at a time */
	/* first byte always, second when r2 >= 2, third when r2 > 2 */
336	cmp	r2, #2
337	ldrb	r3, [r1, #-1]!
338	strb	r3, [r0, #-1]!
339	ldrbge	r3, [r1, #-1]!
340	strbge	r3, [r0, #-1]!
341	ldrbgt	r3, [r1, #-1]!
342	strbgt	r3, [r0, #-1]!
343	ldmia	sp!, {r0, pc}
344
345	/* erg - unaligned destination */
/*
 * Backward copy, destination end pointer not word aligned.
 * In: r12 = dest end & 3 (1..3), r2 = bytes remaining - 4 (>= 0 here).
 * Moving downwards, exactly r12 single bytes bring r0 to a word boundary,
 * so unlike the forward case no "4 - r12" conversion is needed.
 */
346Lmemcpy_bdestul:
347	cmp	r12, #2
348
349	/* align destination with byte copies */
	/* first byte always, second when r12 >= 2, third when r12 == 3 */
350	ldrb	r3, [r1, #-1]!
351	strb	r3, [r0, #-1]!
352	ldrbge	r3, [r1, #-1]!
353	strbge	r3, [r0, #-1]!
354	ldrbgt	r3, [r1, #-1]!
355	strbgt	r3, [r0, #-1]!
356	subs	r2, r2, r12
357	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	/* r12 = src end & 3; when non-zero it is consumed by the code that follows */
358	ands	r12, r1, #3
359	beq	Lmemcpy_bt8		/* we have an aligned source */
360
361	/* erg - unaligned source */
362	/* This is where it gets nasty ... */
/*
 * Backward copy, destination word aligned, source end pointer not.
 * In: r12 = src end & 3 (1..3), r2 = bytes remaining - 4 (>= 0).
 * The source pointer is rounded down to a word boundary and only whole
 * words are loaded; every output word is stitched together from two
 * neighbouring source words with a shift pair chosen by the offset.
 * r3 always holds the most recently loaded, partly consumed source word.
 * NOTE(review): the shift directions look little-endian only -- confirm
 * that no big-endian configuration builds this file.
 */
363Lmemcpy_bsrcul:
364	bic	r1, r1, #3
365	ldr	r3, [r1, #0]
	/* dispatch on the source offset: 1, 2, or fall through for 3 */
366	cmp	r12, #2
367	blt	Lmemcpy_bsrcul1
368	beq	Lmemcpy_bsrcul2
	/* offset 3: fewer than 16 bytes left goes straight to the word loop */
369	cmp	r2, #0x0c
370	blt	Lmemcpy_bsrcul3loop4
	/* re-bias to r2 = bytes remaining - 16 and borrow r4/r5 */
371	sub	r2, r2, #0x0c
372	stmdb	sp!, {r4, r5}
373
	/* 16 bytes per pass: out = (higher word << 8) | (lower word >> 24) */
374Lmemcpy_bsrcul3loop16:
375	mov	lr, r3, lsl #8
376	ldmdb	r1!, {r3-r5, r12}
377	orr	lr, lr, r12, lsr #24
378	mov	r12, r12, lsl #8
379	orr	r12, r12, r5, lsr #24
380	mov	r5, r5, lsl #8
381	orr	r5, r5, r4, lsr #24
382	mov	r4, r4, lsl #8
383	orr	r4, r4, r3, lsr #24
384	stmdb	r0!, {r4, r5, r12, lr}
385	subs	r2, r2, #0x10
386	bge	Lmemcpy_bsrcul3loop16
387	ldmia	sp!, {r4, r5}
	/* back to r2 = bytes remaining - 4 */
388	adds	r2, r2, #0x0c
389	blt	Lmemcpy_bsrcul3l4
390
	/* one word per pass, same stitching as above */
391Lmemcpy_bsrcul3loop4:
392	mov	r12, r3, lsl #8
393	ldr	r3, [r1, #-4]!
394	orr	r12, r12, r3, lsr #24
395	str	r12, [r0, #-4]!
396	subs	r2, r2, #4
397	bge	Lmemcpy_bsrcul3loop4
398
399Lmemcpy_bsrcul3l4:
	/* r1 addresses the last word loaded, whose low 3 bytes are still unused */
400	add	r1, r1, #3
401	b	Lmemcpy_bl4
402
/*
 * Backward copy, aligned destination, source end offset 2 within its word.
 * In: r3 = source word already loaded from the rounded-down address in r1,
 * r2 = bytes remaining - 4 (>= 0).
 * Each output word is (higher word << 16) | (lower word >> 16).
 */
403Lmemcpy_bsrcul2:
	/* fewer than 16 bytes left goes straight to the word loop */
404	cmp	r2, #0x0c
405	blt	Lmemcpy_bsrcul2loop4
	/* re-bias to r2 = bytes remaining - 16 and borrow r4/r5 */
406	sub	r2, r2, #0x0c
407	stmdb	sp!, {r4, r5}
408
	/* 16 bytes per pass; r3 carries the leftover half word between passes */
409Lmemcpy_bsrcul2loop16:
410	mov	lr, r3, lsl #16
411	ldmdb	r1!, {r3-r5, r12}
412	orr	lr, lr, r12, lsr #16
413	mov	r12, r12, lsl #16
414	orr	r12, r12, r5, lsr #16
415	mov	r5, r5, lsl #16
416	orr	r5, r5, r4, lsr #16
417	mov	r4, r4, lsl #16
418	orr	r4, r4, r3, lsr #16
419	stmdb	r0!, {r4, r5, r12, lr}
420	subs	r2, r2, #0x10
421	bge	Lmemcpy_bsrcul2loop16
422	ldmia	sp!, {r4, r5}
	/* back to r2 = bytes remaining - 4 */
423	adds	r2, r2, #0x0c
424	blt	Lmemcpy_bsrcul2l4
425
	/* one word per pass */
426Lmemcpy_bsrcul2loop4:
427	mov	r12, r3, lsl #16
428	ldr	r3, [r1, #-4]!
429	orr	r12, r12, r3, lsr #16
430	str	r12, [r0, #-4]!
431	subs	r2, r2, #4
432	bge	Lmemcpy_bsrcul2loop4
433
434Lmemcpy_bsrcul2l4:
	/* r1 addresses the last word loaded, whose low 2 bytes are still unused */
435	add	r1, r1, #2
436	b	Lmemcpy_bl4
437
/*
 * Backward copy, aligned destination, source end offset 1 within its word.
 * In: r3 = source word already loaded from the rounded-down address in r1,
 * r2 = bytes remaining - 4 (>= 0).
 * Each output word is (higher word << 24) | (lower word >> 8).
 * The bulk loop moves 16 bytes per pass and is named ...loop16 to match
 * the other five unaligned-source bulk loops in this file.
 */
438Lmemcpy_bsrcul1:
	/* fewer than 16 bytes left goes straight to the word loop */
439	cmp	r2, #0x0c
440	blt	Lmemcpy_bsrcul1loop4
	/* re-bias to r2 = bytes remaining - 16 and borrow r4/r5 */
441	sub	r2, r2, #0x0c
442	stmdb	sp!, {r4, r5}
443
	/* 16 bytes per pass; r3 carries the leftover byte between passes */
444Lmemcpy_bsrcul1loop16:
445	mov	lr, r3, lsl #24
446	ldmdb	r1!, {r3-r5, r12}
447	orr	lr, lr, r12, lsr #8
448	mov	r12, r12, lsl #24
449	orr	r12, r12, r5, lsr #8
450	mov	r5, r5, lsl #24
451	orr	r5, r5, r4, lsr #8
452	mov	r4, r4, lsl #24
453	orr	r4, r4, r3, lsr #8
454	stmdb	r0!, {r4, r5, r12, lr}
455	subs	r2, r2, #0x10
456	bge	Lmemcpy_bsrcul1loop16
457	ldmia	sp!, {r4, r5}
	/* back to r2 = bytes remaining - 4 */
458	adds	r2, r2, #0x0c
459	blt	Lmemcpy_bsrcul1l4
460
	/* one word per pass */
461Lmemcpy_bsrcul1loop4:
462	mov	r12, r3, lsl #24
463	ldr	r3, [r1, #-4]!
464	orr	r12, r12, r3, lsr #8
465	str	r12, [r0, #-4]!
466	subs	r2, r2, #4
467	bge	Lmemcpy_bsrcul1loop4
468
469Lmemcpy_bsrcul1l4:
	/* r1 addresses the last word loaded, whose low byte is still unused */
470	add	r1, r1, #1
471	b	Lmemcpy_bl4
472
473