/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */

/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

.align 6
.globl _bcopy
_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); */
	mov	x3, x0
	mov	x0, x1
	mov	x1, x3

.globl _memcpy
_memcpy:
	mov	dst, dstin
	cmp	count, #64
	b.ge	.Lcpy_not_short
	cmp	count, #15
	b.le	.Ltail15tiny

	/* Deal with small copies quickly by dropping straight into the
	 * exit block.  */
.Ltail63:
	/* Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

.Ltail15:
	ands	count, count, #15
	b.eq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	ret
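	/* A rough C sketch of the tail dispatch below (illustrative
	 * only, with hypothetical byte pointers s and d): each set bit
	 * in the low four bits of count selects exactly one right-sized
	 * transfer, so any 0-15 byte tail needs at most four copies and
	 * no loop:
	 *
	 *	if (count & 8) { *(uint64_t *)d = *(uint64_t *)s; s += 8; d += 8; }
	 *	if (count & 4) { *(uint32_t *)d = *(uint32_t *)s; s += 4; d += 4; }
	 *	if (count & 2) { *(uint16_t *)d = *(uint16_t *)s; s += 2; d += 2; }
	 *	if (count & 1) { *(uint8_t  *)d = *(uint8_t  *)s; }
	 */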
.Ltail15tiny:
	/* Copy up to 15 bytes of data.  Does not assume additional data
	 * being copied.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lcpy_not_short:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	 * around copying sub-Quadword quantities.  We know that
	 * it can't overrun.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63
	ret
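	/* The loop below is software-pipelined: four LDP/STP register
	 * pairs keep 64 bytes in flight, so each iteration stores the
	 * data loaded on the previous iteration while fetching the next
	 * 64 bytes, hiding load latency.  The -16 pre-bias on dst (and
	 * the writeback on the last load) lets the writeback forms like
	 * [dst, #64]! step both pointers for free.  A rough C-style
	 * sketch of the rotation (illustrative only; A..D stand for the
	 * four 16-byte register pairs):
	 *
	 *	load A, B, C, D from src[0..63];	// prologue
	 *	while ((count -= 64) >= 0) {
	 *		store A..D to dst; load A..D from the next 64 bytes;
	 *		src += 64; dst += 64;
	 *	}
	 *	store A..D to dst;	// epilogue drains the pipeline
	 */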
	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lcpy_body_large:
	/* There are at least 128 bytes to copy.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63
	ret
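	/* Overlap handling in memmove, as a C-style sketch (illustrative
	 * only): a tail-first (backwards) move is needed only when the
	 * destination starts inside the source buffer; a destination
	 * below the source can always be walked front-first.
	 *
	 *	if (dst < src)
	 *		move downwards (front-first; plain memcpy once
	 *		the gap is at least 16 bytes);
	 *	else if (dst >= src + count)
	 *		return memcpy(dst, src, count);	// no overlap
	 *	else
	 *		move upwards (tail-first, from src + count);
	 */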
.align 6
.globl _memmove
_memmove:
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	_memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	_memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
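	/* C-level view of the entry points above (a sketch; the leading
	 * underscores on the asm symbols suggest Mach-O/BSD-style C name
	 * mangling, which is an assumption about the target here):
	 *
	 *	void *memcpy(void *dst, const void *src, size_t n);	// returns dst
	 *	void *memmove(void *dst, const void *src, size_t n);	// returns dst
	 *	void  bcopy(const void *src, void *dst, size_t n);	// note swapped args
	 */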