/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .align  5, 0x90
LZero:
Lbcopy_sse3x:                           // void bcopy(const void *src, void *dst, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes
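
// The "jb LReverseIsland" test above exploits unsigned wraparound: if the
// destination starts below the source, (dest - source) wraps to a value that
// is >= length, so we fall through and copy ascending, which is safe even for
// overlapping buffers.  A rough C sketch of the same decision (illustrative
// only; the helper names are hypothetical):
//
//      if ((uintptr_t)dst - (uintptr_t)src < len)
//              copy_descending(dst, src, len);    // dst overlaps the end of src
//      else
//              copy_ascending(dst, src, len);     // forward copy is safe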

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        movl    (_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0 + _COMM_PAGE_BCOPY - LZero
        .long   LMod1 + _COMM_PAGE_BCOPY - LZero
        .long   LMod2 + _COMM_PAGE_BCOPY - LZero
        .long   LMod3 + _COMM_PAGE_BCOPY - LZero
        .long   LMod4 + _COMM_PAGE_BCOPY - LZero
        .long   LMod5 + _COMM_PAGE_BCOPY - LZero
        .long   LMod6 + _COMM_PAGE_BCOPY - LZero
        .long   LMod7 + _COMM_PAGE_BCOPY - LZero
        .long   LMod8 + _COMM_PAGE_BCOPY - LZero
        .long   LMod9 + _COMM_PAGE_BCOPY - LZero
        .long   LMod10 + _COMM_PAGE_BCOPY - LZero
        .long   LMod11 + _COMM_PAGE_BCOPY - LZero
        .long   LMod12 + _COMM_PAGE_BCOPY - LZero
        .long   LMod13 + _COMM_PAGE_BCOPY - LZero
        .long   LMod14 + _COMM_PAGE_BCOPY - LZero
        .long   LMod15 + _COMM_PAGE_BCOPY - LZero
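
// Schematically, the dispatch above selects a loop by the source's offset
// within its 16-byte line, and every loop then advances a negative offset
// toward zero so no separate counter is needed.  A rough C sketch
// (illustrative only; "copy_chunk_loop" is a hypothetical name):
//
//      index = (uintptr_t)src & 15;            // low 4 bits of source address
//      off   = -(len & ~63);                   // %edx: negative multiple of 64
//      copy_chunk_loop[index](src_end, dst_end, off);
//      // each iteration copies the 64 bytes at (end + off), then off += 64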


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.

LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001

LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done
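
// To make the shift-and-merge idiom concrete (illustrative note): in LMod1,
// the chunk's bytes 0..15 straddle the two aligned vectors holding source
// bytes -1..14 (%xmm5, the copy of the vector carried across iterations in
// %xmm0) and 15..30 (%xmm1).  "palignr $1,%xmm5,%xmm1" concatenates
// %xmm1:%xmm5, shifts the 32-byte value right by one byte, and keeps the low
// 16 bytes -- exactly source bytes 0..15, ready for an aligned store.  The
// other palignr's, and the remaining palignr-based loops, work the same way
// with their own byte offsets and shift counts.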


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done
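
// Note on LMod4 above (illustrative): when the source is 4 bytes past a
// 16-byte boundary, each aligned load is only one doubleword away from the
// data we want, so instead of palignr the loop uses "movss" (replace the low
// doubleword of the destination vector with the low doubleword of the source
// vector) followed by "pshufd $0x39" (rotate the four doublewords right by
// one), which together amount to a 4-byte shift across a pair of vectors.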


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done
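
// Note on LMod8 above (illustrative): with the source 8-byte aligned, each
// 16-byte store needs the high quadword of one aligned load and the low
// quadword of the next, so a single "shufpd $01,%xmm1,%xmm0" suffices: it
// moves %xmm0's high quadword into the low half and %xmm1's low quadword
// into the high half, i.e. the low 16 bytes of (%xmm1:%xmm0) shifted right
// by 8 bytes, with no extra register copy or rotate required.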


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1   // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done
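
// The reverse moves below mirror the forward code: both pointers are first
// advanced past the ends of the buffers and the copy proceeds downward, so a
// destructively overlapping memmove (destination above the source) never
// reads a byte it has already overwritten.  A rough C sketch (illustrative
// only):
//
//      while (len--)
//              ((char *)dst)[len] = ((const char *)src)[len];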


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)