/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs  r4
#define rd  r12
#define rc  r5
#define rv  r10

#define w1  r6
#define w2  r7
#define w3  r8
#define w4  r9

#define c16   r6
#define cm17  r6
#define c32   r7
#define cm33  r7
#define c48   r8
#define cm49  r8
#define cm1   r9

#define vp  v0
#define vw  v9
#define vx  v10
#define vy  v11
#define vz  v12

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
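/*
 * A minimal illustration of that port (an editorial aside, not part of the
 * commpage itself): assuming the kernel applies the two rewrites above, a
 * 32-bit sequence such as
 *
 *      cmplw   cr1,w1,rc       // word compare
 *      srwi    r0,rc,7         // word shift right
 *
 * would be installed in the 64-bit commpage as
 *
 *      cmpld   cr1,w1,rc       // doubleword compare
 *      srdi    r0,rc,7         // doubleword shift right
 *
 * which is why the code below avoids carry/overflow, "slwi"/"slw"/"srw",
 * and any record-mode result that differs between modes.
 */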

#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr

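
// Note on the short paths above (an explanatory aside): the two single-field
// mtcrf's copy the low six bits of the length into cr6-cr7, so CR bits 26-31
// select the 32-, 16-, 8-, 4-, 2-, and 1-byte moves in turn.  Each "bf n,label"
// then skips the move whose length bit is clear, decoding a 0-63 byte copy
// without any loop.  Two mtcrf's are used because, as the "one at a time"
// comment hints, a single-field mtcrf is cheaper than the multi-field form
// on the 970.
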

// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls all the punches.  This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//  r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//     cr5 = beq if source is also 16-byte aligned

LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32

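
// Note on the unaligned vector technique used above and below (an explanatory
// aside): "lvx" ignores the low four bits of the effective address and always
// loads an aligned quadword.  "lvsl vp,0,rs" builds a permute control from the
// low bits of rs, so "vperm vx,v1,v2,vp" extracts the 16 unaligned source
// bytes spanning the two adjacent aligned quadwords v1:v2.  Carrying the
// trailing quadword forward ("vor v1,v3,v3") means each additional 16 bytes
// costs just one lvx, one vperm, and one stvx.
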

// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
// c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

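
// Note on the 128-byte loops above (an explanatory aside): in the unaligned
// loop, v1 holds one quadword of lookahead, loaded once before the loop and
// refilled once per pass, so each iteration issues exactly eight lvx, eight
// vperm, and eight stvx.  The loads, permutes, and stores are interleaved
// rather than batched so that they can overlap in the 970's pipelines.
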

// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//     cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//     cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//     cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

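
// Note on the reverse loops above (an explanatory aside): they mirror the
// forward case, but since the copy moves downward the lookahead quadword v1
// is now the higher-addressed of each pair, so the vperm operand order
// becomes (v2,v1) to keep the lower-addressed quadword first.  The negative
// offsets cm1, cm17, cm33, and cm49 replace c16/c32/c48 in the lvx/stvx
// addressing from the decrementing end pointers.
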

// Very Big Copy Path.  Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//     r12 = dest ptr

LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)
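
// Note on LBigCopy above (an explanatory aside): a commpage routine cannot
// know at assembly time which mode it will run in, so it detects the mode at
// run time: 0x40000000 doubled is 0x80000000, which sets cr0_lt only when
// CR0 is computed from a 32-bit result.  The return address is then saved in
// the linkage-area slot matching the mode, 8(r1) in the 32-bit ABI (the CR
// save word in the 64-bit ABI, hence the "crsave" remark) or 16(r1) in the
// 64-bit ABI, before branching absolute to the bigcopy code.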