/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort		80			// too short to bother with SSE (must be >=80)
#define kVeryLong	(500*1024)		// large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode	((16*1024)-15)		// cutoff for microcode fastpath for "rep/movsl"


// void bcopy(const void *src, void *dst, size_t len);

	.text
	.code64
	.align	5, 0x90
LZero:
Lbcopy_sse3x_64:			// void bcopy(const void *src, void *dst, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rsi,%rax		// copy dest ptr
	movq	%rdi,%rsi		// xchange source and dest ptrs
	movq	%rax,%rdi
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	jbe	LShort			// no
	jmp	LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

	.align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rdi,%r11		// save return value here
	movq	%rdi,%rax
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LNotShort		// yes
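
// For reference, the overlap test above gives bcopy/memcpy/memmove semantics: a
// forward copy is safe unless the destination starts inside the source region,
// i.e. unless (dest - source), compared as an unsigned value, is less than the
// length.  A rough C sketch of the decision (my_memmove is a placeholder name,
// and the byte loops stand in for the tuned paths below):
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	void *my_memmove(void *dst, const void *src, size_t len) {
//	    char *d = (char *)dst;
//	    const char *s = (const char *)src;
//	    if ((uintptr_t)d - (uintptr_t)s < len) {	// destructive overlap
//	        for (size_t i = len; i-- > 0; )		// copy backwards (LReverse)
//	            d[i] = s[i];
//	    } else {
//	        for (size_t i = 0; i < len; i++)	// copy forwards (LShort/LNotShort)
//	            d[i] = s[i];
//	    }
//	    return dst;
//	}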

// Handle short forward copies.  As the most common case, this is the fall-through path.
//	rdx = length (<= kShort)
//	rsi = source ptr
//	rdi = dest ptr

LShort:
	movl	%edx,%ecx		// copy length using 32-bit operation
	shrl	$2,%ecx			// get #doublewords
	jz	LLeftovers
2:					// loop copying doublewords
	movl	(%rsi),%eax
	addq	$4,%rsi
	movl	%eax,(%rdi)
	addq	$4,%rdi
	decl	%ecx
	jnz	2b
LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	5f
4:					// loop copying bytes
	movb	(%rsi),%al
	incq	%rsi
	movb	%al,(%rdi)
	incq	%rdi
	decl	%edx
	jnz	4b
5:
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//	rdx = length (> kShort)
//	rsi = source ptr
//	rdi = dest ptr

LNotShort:
	cmpq	$(kVeryLong),%rdx	// long enough to justify heavyweight loops?
	jae	LVeryLong		// use very-long-operand path
	movl	%edi,%ecx		// copy low half of destination ptr
	negl	%ecx
	andl	$15,%ecx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%ecx,%edx		// decrement length
	rep				// align destination
	movsb


// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//	rdx = residual length (0..63)
//	rcx = -(length to move), a multiple of 64 less than 2GB
//	rsi = ptr to 1st source byte not to move (unaligned)
//	rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
	movq	%rdx,%rcx		// copy length
	movl	%esi,%eax		// copy low half of source address
	andl	$63,%edx		// get remaining bytes for LShort
	andl	$15,%eax		// mask to low 4 bits of source address
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//	lea	LTable(%rip),%r8	// point to dispatch table
	movq	$(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8	// work around 4586528
	addq	$(LTable-LZero),%r8	// work around 4586528
	addq	%rcx,%rsi		// point to 1st byte not copied
	addq	%rcx,%rdi
	movl	(%r8,%rax,4),%eax	// get offset of routine
	negq	%rcx			// now generate offset to 1st byte to be copied
	addq	%r8,%rax		// generate address of copy loop
	jmp	*%rax			// enter copy loop, selected by source alignment

	.align	2
LTable:					// table of copy loop addresses
	.long	(LMod0 - LTable)
	.long	(LMod1 - LTable)
	.long	(LMod2 - LTable)
	.long	(LMod3 - LTable)
	.long	(LMod4 - LTable)
	.long	(LMod5 - LTable)
	.long	(LMod6 - LTable)
	.long	(LMod7 - LTable)
	.long	(LMod8 - LTable)
	.long	(LMod9 - LTable)
	.long	(LMod10 - LTable)
	.long	(LMod11 - LTable)
	.long	(LMod12 - LTable)
	.long	(LMod13 - LTable)
	.long	(LMod14 - LTable)
	.long	(LMod15 - LTable)
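
// For reference, LDestAligned above splits the remaining length into 64-byte
// chunks plus a 0..63 byte residual, advances both pointers past the chunked
// region, and runs the selected loop with a negative index that counts up to
// zero, so the loop branch falls out of the add with no separate end check.
// A rough C sketch (copy_64_aligned and copy_forward_short are placeholder
// names for one LModN iteration and the LShort tail):
//
//	#include <stddef.h>
//
//	static void copy_64_aligned(char *dst, const char *src);		// one 64-byte LModN step
//	static void copy_forward_short(char *dst, const char *src, size_t n);	// LShort tail
//
//	static void copy_chunks(char *dst, const char *src, size_t len) {
//	    size_t chunk    = len & ~(size_t)63;	// bytes moved by the inner loop (-%rcx)
//	    size_t residual = len & 63;			// left for the short-copy tail (%rdx)
//	    src += chunk;				// 1st byte NOT moved by the inner loop
//	    dst += chunk;
//	    for (ptrdiff_t i = -(ptrdiff_t)chunk; i != 0; i += 64)
//	        copy_64_aligned(dst + i, src + i);
//	    copy_forward_short(dst, src, residual);
//	}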

// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI:
//	rdi = dest
//	rsi = source
//	rdx = length (>= kVeryLong bytes)

LVeryLong:
	pushq	%r11			// save return value
	movq	$_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
	call	*%rax			// call very long operand routine
	popq	%rax			// pop return value
	popq	%rbp
	ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

Lfastpath:
	addq	%rcx,%rsi		// restore ptrs to 1st byte of source and dest
	addq	%rcx,%rdi
	negl	%ecx			// make length positive (known to be < 2GB)
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	movsl
	jmp	LLeftovers		// handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	jmp	1f
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
	movdqa	(%rsi,%rcx),%xmm0
	movdqa	16(%rsi,%rcx),%xmm1
	movdqa	32(%rsi,%rcx),%xmm2
	movdqa	48(%rsi,%rcx),%xmm3

	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm1,16(%rdi,%rcx)
	movdqa	%xmm2,32(%rdi,%rcx)
	movdqa	%xmm3,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001

LMod1:
	movdqa	-1(%rsi,%rcx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
	movdqa	15(%rsi,%rcx),%xmm1
	movdqa	31(%rsi,%rcx),%xmm2
	movdqa	47(%rsi,%rcx),%xmm3
	movdqa	63(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done
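
// For reference, each LModN loop loads only 16-byte-aligned blocks and stitches
// the misaligned source bytes together with palignr.  Because palignr takes its
// shift count as an immediate, each of the sixteen source alignments needs its
// own unrolled loop.  A rough intrinsics sketch of the N == 1 case (copy_mod1 is
// a placeholder name; like the assembly, it may read past src+len, but only
// within the final aligned 16-byte block):
//
//	#include <stddef.h>
//	#include <tmmintrin.h>		// SSSE3: _mm_alignr_epi8
//
//	// dst is 16-byte aligned, src % 16 == 1, len is a multiple of 64.
//	static void copy_mod1(char *dst, const char *src, size_t len) {
//	    const __m128i *s = (const __m128i *)(src - 1);	// aligned, 1 byte early
//	    __m128i *d = (__m128i *)dst;
//	    __m128i prev = _mm_load_si128(s++);			// prime the loop
//	    for (size_t i = 0; i < len / 16; i++) {
//	        __m128i next = _mm_load_si128(s++);
//	        // (next:prev) >> 1 byte == the next 16 source bytes
//	        _mm_store_si128(d++, _mm_alignr_epi8(next, prev, 1));
//	        prev = next;
//	    }
//	}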

// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
	movdqa	-2(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	14(%rsi,%rcx),%xmm1
	movdqa	30(%rsi,%rcx),%xmm2
	movdqa	46(%rsi,%rcx),%xmm3
	movdqa	62(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
	movdqa	-3(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	13(%rsi,%rcx),%xmm1
	movdqa	29(%rsi,%rcx),%xmm2
	movdqa	45(%rsi,%rcx),%xmm3
	movdqa	61(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

LMod4:
	movaps	-4(%rsi,%rcx),%xmm0	// 4-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
	movaps	12(%rsi,%rcx),%xmm1
	movaps	28(%rsi,%rcx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
	movaps	44(%rsi,%rcx),%xmm3
	movss	%xmm2,%xmm1
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%rsi,%rcx),%xmm4
	movss	%xmm3,%xmm2
	pshufd	$(0x39),%xmm2,%xmm2

	movaps	%xmm0,(%rdi,%rcx)
	movss	%xmm4,%xmm3
	pshufd	$(0x39),%xmm3,%xmm3
	movaps	%xmm1,16(%rdi,%rcx)
	movaps	%xmm2,32(%rdi,%rcx)
	movaps	%xmm4,%xmm0
	movaps	%xmm3,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done
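
// For reference, LMod4 above replaces palignr with a movss merge plus a pshufd
// rotate: each aligned load supplies the low 4 bytes of one store and the high
// 12 bytes of the next.  A rough SSE intrinsics sketch (copy_mod4 is a
// placeholder name; the same over-read caveat as the assembly applies):
//
//	#include <stddef.h>
//	#include <xmmintrin.h>		// SSE: _mm_move_ss, _mm_shuffle_ps
//
//	// dst is 16-byte aligned, src % 16 == 4, len is a multiple of 64.
//	static void copy_mod4(char *dst, const char *src, size_t len) {
//	    const float *s = (const float *)(src - 4);	// 16-byte aligned, 4 bytes early
//	    float *d = (float *)dst;
//	    __m128 prev = _mm_load_ps(s);		// prime the loop
//	    s += 4;
//	    for (size_t i = 0; i < len / 16; i++) {
//	        __m128 next = _mm_load_ps(s);
//	        __m128 v = _mm_move_ss(prev, next);	// low 4 bytes <- next block
//	        v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));	// rotate right 4 bytes
//	        _mm_store_ps(d, v);			// 16 contiguous source bytes
//	        prev = next;
//	        s += 4;  d += 4;
//	    }
//	}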

// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
	movdqa	-5(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	11(%rsi,%rcx),%xmm1
	movdqa	27(%rsi,%rcx),%xmm2
	movdqa	43(%rsi,%rcx),%xmm3
	movdqa	59(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
	movdqa	-6(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	10(%rsi,%rcx),%xmm1
	movdqa	26(%rsi,%rcx),%xmm2
	movdqa	42(%rsi,%rcx),%xmm3
	movdqa	58(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
	movdqa	-7(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	9(%rsi,%rcx),%xmm1
	movdqa	25(%rsi,%rcx),%xmm2
	movdqa	41(%rsi,%rcx),%xmm3
	movdqa	57(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
	cmpl	$(-kFastUCode),%ecx	// %rcx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%rsi,%rcx),%xmm0	// 8-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
	movapd	8(%rsi,%rcx),%xmm1
	movapd	24(%rsi,%rcx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
	movapd	40(%rsi,%rcx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%rsi,%rcx),%xmm4
	shufpd	$01,%xmm3,%xmm2

	movapd	%xmm0,(%rdi,%rcx)
	shufpd	$01,%xmm4,%xmm3
	movapd	%xmm1,16(%rdi,%rcx)
	movapd	%xmm2,32(%rdi,%rcx)
	movapd	%xmm4,%xmm0
	movapd	%xmm3,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done
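
// For reference, LMod8 above shifts by exactly half a vector, so one shufpd per
// store does the stitching.  A rough SSE2 intrinsics sketch (copy_mod8 is a
// placeholder name; the same over-read caveat as the assembly applies):
//
//	#include <stddef.h>
//	#include <emmintrin.h>		// SSE2: _mm_shuffle_pd
//
//	// dst is 16-byte aligned, src % 16 == 8, len is a multiple of 64.
//	static void copy_mod8(char *dst, const char *src, size_t len) {
//	    const double *s = (const double *)(src - 8);	// 16-byte aligned, 8 bytes early
//	    double *d = (double *)dst;
//	    __m128d prev = _mm_load_pd(s);			// prime the loop
//	    s += 2;
//	    for (size_t i = 0; i < len / 16; i++) {
//	        __m128d next = _mm_load_pd(s);
//	        // { high half of prev, low half of next } == 16 contiguous source bytes
//	        _mm_store_pd(d, _mm_shuffle_pd(prev, next, 1));
//	        prev = next;
//	        s += 2;  d += 2;
//	    }
//	}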

// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
	movdqa	-9(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	7(%rsi,%rcx),%xmm1
	movdqa	23(%rsi,%rcx),%xmm2
	movdqa	39(%rsi,%rcx),%xmm3
	movdqa	55(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
	movdqa	-10(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	6(%rsi,%rcx),%xmm1
	movdqa	22(%rsi,%rcx),%xmm2
	movdqa	38(%rsi,%rcx),%xmm3
	movdqa	54(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
	movdqa	-11(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	5(%rsi,%rcx),%xmm1
	movdqa	21(%rsi,%rcx),%xmm2
	movdqa	37(%rsi,%rcx),%xmm3
	movdqa	53(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
	movss	(%rsi,%rcx),%xmm0	// prefetch 1st four bytes of source, right justified
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%rsi,%rcx),%xmm1	// load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%rsi,%rcx),%xmm2
	pshufd	$(0x93),36(%rsi,%rcx),%xmm3
	pshufd	$(0x93),52(%rsi,%rcx),%xmm4

	movaps	%xmm4,%xmm5
	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination
	movss	%xmm2,%xmm3
	movss	%xmm1,%xmm2
	movss	%xmm0,%xmm1

	movaps	%xmm1,(%rdi,%rcx)
	movaps	%xmm2,16(%rdi,%rcx)
	movaps	%xmm5,%xmm0
	movaps	%xmm3,32(%rdi,%rcx)
	movaps	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
	movdqa	-13(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	3(%rsi,%rcx),%xmm1
	movdqa	19(%rsi,%rcx),%xmm2
	movdqa	35(%rsi,%rcx),%xmm3
	movdqa	51(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
	movdqa	-14(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	2(%rsi,%rcx),%xmm1
	movdqa	18(%rsi,%rcx),%xmm2
	movdqa	34(%rsi,%rcx),%xmm3
	movdqa	50(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
	movdqa	-15(%rsi,%rcx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
	movdqa	1(%rsi,%rcx),%xmm1
	movdqa	17(%rsi,%rcx),%xmm2
	movdqa	33(%rsi,%rcx),%xmm3
	movdqa	49(%rsi,%rcx),%xmm4

	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1

	movdqa	%xmm1,(%rdi,%rcx)
	movdqa	%xmm2,16(%rdi,%rcx)
	movdqa	%xmm3,32(%rdi,%rcx)
	movdqa	%xmm4,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	1b

	jmp	LShort			// copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//	rdx = length
//	rsi = source ptr
//	rdi = dest ptr

LReverse:
	addq	%rdx,%rsi		// point to end of strings
	addq	%rdx,%rdi
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	edx = length (<= kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

LReverseShort:
	movl	%edx,%ecx		// copy length
	shrl	$3,%ecx			// #quadwords
	jz	3f
1:
	subq	$8,%rsi
	movq	(%rsi),%rax
	subq	$8,%rdi
	movq	%rax,(%rdi)
	decl	%ecx
	jnz	1b
3:
	andl	$7,%edx			// bytes?
	jz	5f
4:
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%edx
	jnz	4b
5:
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret
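
// For reference, the reverse short path above copies quadwords and then bytes,
// walking both pointers down from the end so later source bytes are read before
// they can be overwritten.  A rough C sketch (copy_reverse_short is a
// placeholder name):
//
//	#include <stddef.h>
//	#include <stdint.h>
//	#include <string.h>
//
//	// dst_end and src_end point one byte past the end of each buffer.
//	static void copy_reverse_short(char *dst_end, const char *src_end, size_t len) {
//	    for (size_t q = len >> 3; q != 0; q--) {	// whole quadwords first
//	        uint64_t tmp;
//	        src_end -= 8;  dst_end -= 8;
//	        memcpy(&tmp, src_end, 8);		// read all 8 bytes before storing
//	        memcpy(dst_end, &tmp, 8);
//	    }
//	    for (size_t b = len & 7; b != 0; b--)	// then 0..7 leftover bytes
//	        *--dst_end = *--src_end;
//	}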

// Handle a reverse move long enough to justify using SSE.
//	rdx = length (> kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

LReverseNotShort:
	movl	%edi,%ecx		// copy destination
	andl	$15,%ecx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subq	%rcx,%rdx		// adjust length
1:					// loop copying 1..15 bytes
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%ecx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
	movq	%rdx,%rcx		// copy length
	andl	$63,%edx		// get remaining bytes for LReverseShort
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	subq	%rcx,%rsi		// point to endpoint of copy
	subq	%rcx,%rdi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%rsi,%rcx),%xmm0
	movdqa	-32(%rsi,%rcx),%xmm1
	movdqa	-48(%rsi,%rcx),%xmm2
	movdqa	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx
	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%rsi,%rcx),%xmm0
	movdqu	-32(%rsi,%rcx),%xmm1
	movdqu	-48(%rsi,%rcx),%xmm2
	movdqu	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx
	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done


	COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)