/*	$NetBSD: memcpy_arm.S,v 1.1 2003/10/14 07:51:45 scw Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy().
42 * 43 * All local labels are prefixed with Lmemcpy_ 44 * Following the prefix a label starting f is used in the forward copy code 45 * while a label using b is used in the backwards copy code 46 * The source and destination addresses determine whether a forward or 47 * backward copy is performed. 48 * Separate bits of code are used to deal with the following situations 49 * for both the forward and backwards copy. 50 * unaligned source address 51 * unaligned destination address 52 * Separate copy routines are used to produce an optimised result for each 53 * of these cases. 54 * The copy code will use LDM/STM instructions to copy up to 32 bytes at 55 * a time where possible. 56 * 57 * Note: r12 (aka ip) can be trashed during the function along with 58 * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out. 59 * Additional registers are preserved prior to use i.e. r4, r5 & lr 60 * 61 * Apologies for the state of the comments ;-) 62 */ 63/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */ 64ENTRY(memcpy) 65 /* save leaf functions having to store this away */ 66 stmdb sp!, {r0, lr} /* memcpy() returns dest addr */ 67 68 subs r2, r2, #4 69 blt .Lmemcpy_l4 /* less than 4 bytes */ 70 ands r12, r0, #3 71 bne .Lmemcpy_destul /* oh unaligned destination addr */ 72 ands r12, r1, #3 73 bne .Lmemcpy_srcul /* oh unaligned source addr */ 74 75.Lmemcpy_t8: 76 /* We have aligned source and destination */ 77 subs r2, r2, #8 78 blt .Lmemcpy_l12 /* less than 12 bytes (4 from above) */ 79 subs r2, r2, #0x14 80 blt .Lmemcpy_l32 /* less than 32 bytes (12 from above) */ 81 stmdb sp!, {r4} /* borrow r4 */ 82 83 /* blat 32 bytes at a time */ 84 /* XXX for really big copies perhaps we should use more registers */ 85.Lmemcpy_loop32: 86 ldmia r1!, {r3, r4, r12, lr} 87 stmia r0!, {r3, r4, r12, lr} 88 ldmia r1!, {r3, r4, r12, lr} 89 stmia r0!, {r3, r4, r12, lr} 90 subs r2, r2, #0x20 91 bge .Lmemcpy_loop32 92 93 cmn r2, #0x10 94 ldmgeia r1!, {r3, r4, 
r12, lr} /* blat a remaining 16 bytes */ 95 stmgeia r0!, {r3, r4, r12, lr} 96 subge r2, r2, #0x10 97 ldmia sp!, {r4} /* return r4 */ 98 99.Lmemcpy_l32: 100 adds r2, r2, #0x14 101 102 /* blat 12 bytes at a time */ 103.Lmemcpy_loop12: 104 ldmgeia r1!, {r3, r12, lr} 105 stmgeia r0!, {r3, r12, lr} 106 subges r2, r2, #0x0c 107 bge .Lmemcpy_loop12 108 109.Lmemcpy_l12: 110 adds r2, r2, #8 111 blt .Lmemcpy_l4 112 113 subs r2, r2, #4 114 ldrlt r3, [r1], #4 115 strlt r3, [r0], #4 116 ldmgeia r1!, {r3, r12} 117 stmgeia r0!, {r3, r12} 118 subge r2, r2, #4 119 120.Lmemcpy_l4: 121 /* less than 4 bytes to go */ 122 adds r2, r2, #4 123#ifdef __APCS_26_ 124 ldmeqia sp!, {r0, pc}^ /* done */ 125#else 126 ldmeqia sp!, {r0, pc} /* done */ 127#endif 128 /* copy the crud byte at a time */ 129 cmp r2, #2 130 ldrb r3, [r1], #1 131 strb r3, [r0], #1 132 ldrgeb r3, [r1], #1 133 strgeb r3, [r0], #1 134 ldrgtb r3, [r1], #1 135 strgtb r3, [r0], #1 136 ldmia sp!, {r0, pc} 137 138 /* erg - unaligned destination */ 139.Lmemcpy_destul: 140 rsb r12, r12, #4 141 cmp r12, #2 142 143 /* align destination with byte copies */ 144 ldrb r3, [r1], #1 145 strb r3, [r0], #1 146 ldrgeb r3, [r1], #1 147 strgeb r3, [r0], #1 148 ldrgtb r3, [r1], #1 149 strgtb r3, [r0], #1 150 subs r2, r2, r12 151 blt .Lmemcpy_l4 /* less the 4 bytes */ 152 153 ands r12, r1, #3 154 beq .Lmemcpy_t8 /* we have an aligned source */ 155 156 /* erg - unaligned source */ 157 /* This is where it gets nasty ... 
*/ 158.Lmemcpy_srcul: 159 bic r1, r1, #3 160 ldr lr, [r1], #4 161 cmp r12, #2 162 bgt .Lmemcpy_srcul3 163 beq .Lmemcpy_srcul2 164 cmp r2, #0x0c 165 blt .Lmemcpy_srcul1loop4 166 sub r2, r2, #0x0c 167 stmdb sp!, {r4, r5} 168 169.Lmemcpy_srcul1loop16: 170#ifdef __ARMEB__ 171 mov r3, lr, lsl #8 172#else 173 mov r3, lr, lsr #8 174#endif 175 ldmia r1!, {r4, r5, r12, lr} 176#ifdef __ARMEB__ 177 orr r3, r3, r4, lsr #24 178 mov r4, r4, lsl #8 179 orr r4, r4, r5, lsr #24 180 mov r5, r5, lsl #8 181 orr r5, r5, r12, lsr #24 182 mov r12, r12, lsl #8 183 orr r12, r12, lr, lsr #24 184#else 185 orr r3, r3, r4, lsl #24 186 mov r4, r4, lsr #8 187 orr r4, r4, r5, lsl #24 188 mov r5, r5, lsr #8 189 orr r5, r5, r12, lsl #24 190 mov r12, r12, lsr #8 191 orr r12, r12, lr, lsl #24 192#endif 193 stmia r0!, {r3-r5, r12} 194 subs r2, r2, #0x10 195 bge .Lmemcpy_srcul1loop16 196 ldmia sp!, {r4, r5} 197 adds r2, r2, #0x0c 198 blt .Lmemcpy_srcul1l4 199 200.Lmemcpy_srcul1loop4: 201#ifdef __ARMEB__ 202 mov r12, lr, lsl #8 203#else 204 mov r12, lr, lsr #8 205#endif 206 ldr lr, [r1], #4 207#ifdef __ARMEB__ 208 orr r12, r12, lr, lsr #24 209#else 210 orr r12, r12, lr, lsl #24 211#endif 212 str r12, [r0], #4 213 subs r2, r2, #4 214 bge .Lmemcpy_srcul1loop4 215 216.Lmemcpy_srcul1l4: 217 sub r1, r1, #3 218 b .Lmemcpy_l4 219 220.Lmemcpy_srcul2: 221 cmp r2, #0x0c 222 blt .Lmemcpy_srcul2loop4 223 sub r2, r2, #0x0c 224 stmdb sp!, {r4, r5} 225 226.Lmemcpy_srcul2loop16: 227#ifdef __ARMEB__ 228 mov r3, lr, lsl #16 229#else 230 mov r3, lr, lsr #16 231#endif 232 ldmia r1!, {r4, r5, r12, lr} 233#ifdef __ARMEB__ 234 orr r3, r3, r4, lsr #16 235 mov r4, r4, lsl #16 236 orr r4, r4, r5, lsr #16 237 mov r5, r5, lsl #16 238 orr r5, r5, r12, lsr #16 239 mov r12, r12, lsl #16 240 orr r12, r12, lr, lsr #16 241#else 242 orr r3, r3, r4, lsl #16 243 mov r4, r4, lsr #16 244 orr r4, r4, r5, lsl #16 245 mov r5, r5, lsr #16 246 orr r5, r5, r12, lsl #16 247 mov r12, r12, lsr #16 248 orr r12, r12, lr, lsl #16 249#endif 250 stmia 
r0!, {r3-r5, r12} 251 subs r2, r2, #0x10 252 bge .Lmemcpy_srcul2loop16 253 ldmia sp!, {r4, r5} 254 adds r2, r2, #0x0c 255 blt .Lmemcpy_srcul2l4 256 257.Lmemcpy_srcul2loop4: 258#ifdef __ARMEB__ 259 mov r12, lr, lsl #16 260#else 261 mov r12, lr, lsr #16 262#endif 263 ldr lr, [r1], #4 264#ifdef __ARMEB__ 265 orr r12, r12, lr, lsr #16 266#else 267 orr r12, r12, lr, lsl #16 268#endif 269 str r12, [r0], #4 270 subs r2, r2, #4 271 bge .Lmemcpy_srcul2loop4 272 273.Lmemcpy_srcul2l4: 274 sub r1, r1, #2 275 b .Lmemcpy_l4 276 277.Lmemcpy_srcul3: 278 cmp r2, #0x0c 279 blt .Lmemcpy_srcul3loop4 280 sub r2, r2, #0x0c 281 stmdb sp!, {r4, r5} 282 283.Lmemcpy_srcul3loop16: 284#ifdef __ARMEB__ 285 mov r3, lr, lsl #24 286#else 287 mov r3, lr, lsr #24 288#endif 289 ldmia r1!, {r4, r5, r12, lr} 290#ifdef __ARMEB__ 291 orr r3, r3, r4, lsr #8 292 mov r4, r4, lsl #24 293 orr r4, r4, r5, lsr #8 294 mov r5, r5, lsl #24 295 orr r5, r5, r12, lsr #8 296 mov r12, r12, lsl #24 297 orr r12, r12, lr, lsr #8 298#else 299 orr r3, r3, r4, lsl #8 300 mov r4, r4, lsr #24 301 orr r4, r4, r5, lsl #8 302 mov r5, r5, lsr #24 303 orr r5, r5, r12, lsl #8 304 mov r12, r12, lsr #24 305 orr r12, r12, lr, lsl #8 306#endif 307 stmia r0!, {r3-r5, r12} 308 subs r2, r2, #0x10 309 bge .Lmemcpy_srcul3loop16 310 ldmia sp!, {r4, r5} 311 adds r2, r2, #0x0c 312 blt .Lmemcpy_srcul3l4 313 314.Lmemcpy_srcul3loop4: 315#ifdef __ARMEB__ 316 mov r12, lr, lsl #24 317#else 318 mov r12, lr, lsr #24 319#endif 320 ldr lr, [r1], #4 321#ifdef __ARMEB__ 322 orr r12, r12, lr, lsr #8 323#else 324 orr r12, r12, lr, lsl #8 325#endif 326 str r12, [r0], #4 327 subs r2, r2, #4 328 bge .Lmemcpy_srcul3loop4 329 330.Lmemcpy_srcul3l4: 331 sub r1, r1, #1 332 b .Lmemcpy_l4 333