/*
 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines, such as Core and Core 2.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80          // too short to bother with SSE (must be >= 80)
#define kVeryLong   (500*1024)  // large enough for non-temporal stores (must be >= 8192)
#define kBigChunk   (256*1024)  // outer loop chunk size for kVeryLong sized operands
#define kFastUCode  (16*1024)   // cutoff for microcode fastpath for "rep/movsl"


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .align  5, 0x90
Lbcopy_sse2:                            // void bcopy(const void *src, void *dst, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        jmp     Ljoin

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr

Ljoin:                                  // here from bcopy() with esi and edi loaded
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
Lrejoin:                                // here from very-long-operand copies
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
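//
// Roughly, this path copies a doubleword at a time and then the 0..3 byte tail,
// as in the C sketch below. The sketch is for reference only; the function name
// and types are illustrative and not part of this file.
//
//      #include <stdint.h>
//      static void short_forward_copy(unsigned char *dst, const unsigned char *src, size_t len)
//      {
//              size_t dwords = len >> 2;               // whole 32-bit words
//              while (dwords--) {
//                      *(uint32_t *)dst = *(const uint32_t *)src;
//                      src += 4;
//                      dst += 4;
//              }
//              len &= 3;                               // leftover bytes (0..3)
//              while (len--)
//                      *dst++ = *src++;
//      }
//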
// ecx = length (<= kShort)
// esi = source ptr
// edi = dest ptr

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE2.
// First, 16-byte align the destination.
// ecx = length (> kShort)
// esi = source ptr
// edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned. Prepare for forward loops over 64-byte chunks.
// Since kShort >= 80 and we've moved at most 15 bytes already, there is at least one chunk.

LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %ecx,%eax               // twice
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // is source aligned too?
        jnz     LUnalignedLoop          // no


        cmpl    $(kFastUCode),%eax      // long enough for the fastpath in microcode?
        jb      LAlignedLoop            // no, use SSE
        cld                             // we'll move forward
        movl    %eax,%ecx               // copy length again
        shrl    $2,%ecx                 // compute #words to move
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        rep                             // the u-code will optimize this
        movsl
        movl    %eax,%edx               // original length
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward aligned loop for medium length operands (kShort < n < kVeryLong).

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     LAlignedLoop

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward unaligned loop for medium length operands (kShort < n < kVeryLong).
// Note that LDDQU==MOVDQU on these machines, i.e. we don't care when we cross
// source cache lines.
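//
// In intrinsic form (<emmintrin.h>), the loop below is roughly the sketch that
// follows. This is for reference only: the variable names are illustrative, src/dst
// stand for %esi/%edi (which point one past the chunked region), and off is the
// negative offset held in %edx, counting up to zero.
//
//      do {
//              __m128i a = _mm_loadu_si128((const __m128i *)(src + off));
//              __m128i b = _mm_loadu_si128((const __m128i *)(src + off + 16));
//              __m128i c = _mm_loadu_si128((const __m128i *)(src + off + 32));
//              __m128i d = _mm_loadu_si128((const __m128i *)(src + off + 48));
//              _mm_store_si128((__m128i *)(dst + off),      a);       // dest is 16-byte aligned
//              _mm_store_si128((__m128i *)(dst + off + 16), b);
//              _mm_store_si128((__m128i *)(dst + off + 32), c);
//              _mm_store_si128((__m128i *)(dst + off + 48), d);
//              off += 64;
//      } while (off != 0);
//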

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%esi,%edx),%xmm0       // the loads are unaligned
        movdqu  16(%esi,%edx),%xmm1
        movdqu  32(%esi,%edx),%xmm2
        movdqu  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)       // we can use aligned stores
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     LUnalignedLoop

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Very long forward moves. These are at least several pages, so we loop over big
// chunks of memory (kBigChunk in size.) We first prefetch the chunk, and then copy
// it using non-temporal stores. Hopefully all the reads occur in the prefetch loop,
// so the copy loop reads from L2 and writes directly to memory (with write combining.)
// This minimizes bus turnaround and maintains good DRAM page locality.
// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache
// size. Otherwise, it is counter-productive to bypass L2 on the stores.
// ecx = length (>= kVeryLong bytes)
// edi = dest (aligned)
// esi = source

LVeryLong:
        pushl   %ebx                    // we'll need to use this
        movl    %edi,%ebx               // copy dest ptr
        negl    %ebx
        andl    $63,%ebx                // get #bytes to cache line align destination
        jz      LBigChunkLoop           // already aligned

// Cache line align destination, so temporal stores in copy loops work right.

        pushl   %ecx                    // save total length remaining
        pushl   %ebx                    // arg3 - #bytes to align destination (1..63)
        pushl   %esi                    // arg2 - source
        pushl   %edi                    // arg1 - dest
        call    Lmemcpy                 // align the destination
        movl    12(%esp),%ecx           // recover total length
        addl    $16,%esp
        addl    %ebx,%esi               // adjust ptrs and lengths past copy
        addl    %ebx,%edi
        subl    %ebx,%ecx

// Loop over big chunks.
// ecx = length remaining (>= 4096)
// edi = dest (64-byte aligned)
// esi = source (may be unaligned)

LBigChunkLoop:
        movl    $(kBigChunk),%edx       // assume we can do a full chunk
        cmpl    %edx,%ecx               // do we have a full chunk left to do?
        cmovbl  %ecx,%edx               // if not, only move what we have left
        andl    $-4096,%edx             // we work in page multiples
        xor     %eax,%eax               // initialize chunk offset
        jmp     LTouchLoop

// Because the source may be unaligned, we use byte loads to touch.
// ecx = length remaining (including this chunk)
// edi = ptr to start of dest chunk
// esi = ptr to start of source chunk
// edx = chunk length (multiples of pages)
// ebx = scratch reg used to read a byte of each cache line
// eax = chunk offset

        .align  4,0x90                  // 16-byte align inner loops
LTouchLoop:
        movzb   (%esi,%eax),%ebx        // touch line 0, 2, 4, or 6 of page
        movzb   1*64(%esi,%eax),%ebx    // touch line 1, 3, 5, or 7
        movzb   8*64(%esi,%eax),%ebx    // touch line 8, 10, 12, or 14
        movzb   9*64(%esi,%eax),%ebx    // etc

        movzb   16*64(%esi,%eax),%ebx
        movzb   17*64(%esi,%eax),%ebx
        movzb   24*64(%esi,%eax),%ebx
        movzb   25*64(%esi,%eax),%ebx

        movzb   32*64(%esi,%eax),%ebx
        movzb   33*64(%esi,%eax),%ebx
        movzb   40*64(%esi,%eax),%ebx
        movzb   41*64(%esi,%eax),%ebx

        movzb   48*64(%esi,%eax),%ebx
        movzb   49*64(%esi,%eax),%ebx
        movzb   56*64(%esi,%eax),%ebx
        movzb   57*64(%esi,%eax),%ebx

        subl    $-128,%eax              // next slice of page (adding 128 with an 8-bit immediate)
        testl   $512,%eax               // done with this page?
        jz      LTouchLoop              // no, next of four slices
        addl    $(4096-512),%eax        // move on to next page
        cmpl    %eax,%edx               // done with this chunk?
        jnz     LTouchLoop              // no, do next page

// The chunk has been pre-fetched, now copy it using non-temporal stores.
// There are two copy loops, depending on whether the source is 16-byte aligned
// or not.

        addl    %edx,%esi               // increment ptrs by chunk length
        addl    %edx,%edi
        subl    %edx,%ecx               // adjust remaining length
        negl    %edx                    // prepare loop index (counts up to 0)
        testl   $15,%esi                // is source 16-byte aligned?
        jnz     LVeryLongUnaligned      // source is not aligned
        jmp     LVeryLongAligned

        .align  4,0x90                  // 16-byte align inner loops
LVeryLongAligned:                       // aligned loop over 128-bytes
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3
        movdqa  64(%esi,%edx),%xmm4
        movdqa  80(%esi,%edx),%xmm5
        movdqa  96(%esi,%edx),%xmm6
        movdqa  112(%esi,%edx),%xmm7

        movntdq %xmm0,(%edi,%edx)
        movntdq %xmm1,16(%edi,%edx)
        movntdq %xmm2,32(%edi,%edx)
        movntdq %xmm3,48(%edi,%edx)
        movntdq %xmm4,64(%edi,%edx)
        movntdq %xmm5,80(%edi,%edx)
        movntdq %xmm6,96(%edi,%edx)
        movntdq %xmm7,112(%edi,%edx)

        subl    $-128,%edx              // add 128 with an 8-bit immediate
        jnz     LVeryLongAligned
        jmp     LVeryLongChunkEnd

        .align  4,0x90                  // 16-byte align inner loops
LVeryLongUnaligned:                     // unaligned loop over 128-bytes
        movdqu  (%esi,%edx),%xmm0
        movdqu  16(%esi,%edx),%xmm1
        movdqu  32(%esi,%edx),%xmm2
        movdqu  48(%esi,%edx),%xmm3
        movdqu  64(%esi,%edx),%xmm4
        movdqu  80(%esi,%edx),%xmm5
        movdqu  96(%esi,%edx),%xmm6
        movdqu  112(%esi,%edx),%xmm7

        movntdq %xmm0,(%edi,%edx)
        movntdq %xmm1,16(%edi,%edx)
        movntdq %xmm2,32(%edi,%edx)
        movntdq %xmm3,48(%edi,%edx)
        movntdq %xmm4,64(%edi,%edx)
        movntdq %xmm5,80(%edi,%edx)
        movntdq %xmm6,96(%edi,%edx)
        movntdq %xmm7,112(%edi,%edx)

        subl    $-128,%edx              // add 128 with an 8-bit immediate
        jnz     LVeryLongUnaligned

LVeryLongChunkEnd:
        cmpl    $4096,%ecx              // at least another page to go?
        jae     LBigChunkLoop           // yes

        sfence                          // required by non-temporal stores
        popl    %ebx
        jmp     Lrejoin                 // handle remaining (0..4095) bytes


// Reverse moves.
// ecx = length
// esi = source ptr
// edi = dest ptr

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
// ecx = length
// esi = one byte past end of source
// edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
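//
// This mirrors the forward LNotShort path, but runs from high addresses to low:
// first copy 1..15 single bytes backward until the destination is 16-byte aligned,
// then move 64 bytes per iteration with the aligned or unaligned loop below
// (edx counts down to zero), and finally fall back to LReverseShort for the
// remaining 0..63 bytes. There is no rep/movsl or non-temporal variant in reverse.
//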
// ecx = length
// esi = one byte past end of source
// edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no
        jmp     LReverseAlignedLoop     // use aligned loop

        .align  4,0x90                  // 16-byte align inner loops
LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

        .align  4,0x90                  // 16-byte align inner loops
LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse2,_COMM_PAGE_BCOPY,kHasSSE2+kCache64,kHasSupplementalSSE3)