/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4.  The inner loops use DCBA to avoid
 * reading destination cache lines.  Only the 7450 actually benefits from
 * this, and then only in the cold-cache case.  On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.  Note also the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *
 *   r0    = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2    = "w8" or vrsave ("rv")
 *   r3    = not used, as memcpy and memmove return 1st parameter as a value
 *   r4    = source ptr ("rs")
 *   r5    = count of bytes to move ("rc")
 *   r6    = "w1", "c16", or "cm17"
 *   r7    = "w2", "c32", or "cm33"
 *   r8    = "w3", "c48", or "cm49"
 *   r9    = "w4", or "cm1"
 *   r10   = "w5", "c96", or "cm97"
 *   r11   = "w6", "c128", or "cm129"
 *   r12   = destination ptr ("rd")
 *   v0    = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kMedium 32              // too long for inline loopless code
#define kLong   96              // long enough to justify use of Altivec
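/*
 * Both entry points below pick the copy direction with a single unsigned
 * compare: a forward copy would overwrite source bytes before reading them
 * exactly when the destination starts inside the source operand, ie when
 * the unsigned difference (rd-rs) is less than the length.  A minimal C
 * sketch of the same test (illustrative only; "memmove_sketch" is a
 * hypothetical name, not part of this file):
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static void *memmove_sketch(void *dst, const void *src, size_t len) {
 *          unsigned char *d = (unsigned char *)dst;
 *          const unsigned char *s = (const unsigned char *)src;
 *          if ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)len) {
 *              while (len--)           // dst lies in [src, src+len):
 *                  d[len] = s[len];    // copy high-to-low (reverse)
 *          } else {
 *              for (size_t i = 0; i < len; i++)
 *                  d[i] = s[i];        // no harmful overlap: copy forward
 *          }
 *          return dst;
 *      }
 *
 * When dst is below src the unsigned difference wraps to a huge value, so
 * the forward path is taken, which is safe for that direction of overlap.
 */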
// Main entry points.

        .align  5
bcopy_g4:                       // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium      // short or long?
        sub     w1,r4,r3        // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r4           // start to move registers to canonical spot
        mr      rs,r3
        blt+    LShort          // handle short operands
        dcbt    0,r3            // touch in first line of source
        b       LMedium         // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                     // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                    // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium      // short or long?
        sub     w1,r3,r4        // must move in reverse if (rd-rs)<rc
        dcbt    0,r4            // touch in the first line of source
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r3           // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium         // handle medium or long operands

// Handle short operands.

LShort:
        andi.   r0,rc,0x10      // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

        beq     LShort16        // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                       // join here to xfer 0-15 bytes
        bf      28,2f           // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f           // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f           // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31              // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc        // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16 // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                // join here to xfer 0-15 bytes and return
        bf      28,2f           // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f           // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f           // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31              // done if no odd byte
        lbz     w1,-1(rs)       // no update
        stb     w1,-1(rd)
        blr
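/*
 * The short paths above move the residual 0-15 bytes without a loop:
 * "mtcrf 0x01,rc" copies the low four bits of the length into cr7, and each
 * "bf" then tests one power-of-two chunk (bit 28 = 8 bytes, 29 = 4, 30 = 2,
 * 31 = 1); bit 27 (16 bytes) is tested separately via "andi." because that
 * is faster on the G4.  A hedged C sketch of the same dispatch, forward
 * direction ("copy_tail15" is a hypothetical name for illustration):
 *
 *      #include <string.h>
 *
 *      static void copy_tail15(unsigned char *d, const unsigned char *s,
 *                              size_t len) {
 *          if (len & 8) { memcpy(d, s, 8); d += 8; s += 8; }   // cr7 bit 28
 *          if (len & 4) { memcpy(d, s, 4); d += 4; s += 4; }   // cr7 bit 29
 *          if (len & 2) { memcpy(d, s, 2); d += 2; s += 2; }   // cr7 bit 30
 *          if (len & 1) { *d = *s; }                           // cr7 bit 31
 *      }
 *
 * Each chunk size is tested exactly once, so the worst case is four
 * non-looping tests rather than up to 15 iterations of a byte loop.
 */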
// Medium and long operands.  Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        .align  4
LMedium:
        dcbtst  0,rd            // touch in destination
        cmplwi  cr7,rc,kLong    // long enough for vectors?
        neg     w3,rd           // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7     // check relative 8-byte alignment
        andi.   w6,w3,7         // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse // handle reverse moves
        rlwinm  w4,w3,0,0x1F    // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0        // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong    // long enough for vectors

// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6        // decrement length remaining
        beq     1f              // skip if dest already doubleword aligned
        mtxer   w6              // set up count for move
        lswx    w1,0,rs         // move w6 bytes to align destination
        stswx   w1,0,rd
        add     rs,rs,w6        // bump ptrs past
        add     rd,rd,w6
1:
        srwi    r0,rc,4         // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc         // save remaining byte count here for LShort16
        mtctr   r0              // set up 16-byte loop
        bne     cr6,3f          // source not relatively doubleword aligned
        b       2f

        .align  4
2:                              // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                              // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16


// Vector loops.  First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0            // dest already aligned?
        sub     rc,rc,w4        // adjust length
        mtcrf   0x01,w4         // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF     // relatively 16-byte aligned?
        mtcrf   0x02,w4         // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6         // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0        // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned     // dest is already aligned

// 32-byte align destination.

        bf      31,1f           // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f           // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f           // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f           // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned  // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      c16 etc = loaded

LFwdAligned:
        mfspr   rv,vrsave       // get bitmap of live vector registers
        mtcrf   0x01,rc         // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31   // move last 0-15 byte count to w3
        mtctr   r0              // set up loop count
        cmpwi   cr6,w3,0        // set cr6 on leftover byte count
        oris    w1,rv,0xFF00    // we use v0-v7
        rlwinm. r0,rc,28,30,31  // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1       // update mask
        li      c16,16          // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal // handle unaligned operands
        b       1f

        .align  4
1:                              // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd            // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd          // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f              // no leftover quadwords
        mtctr   r0
3:                              // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShort16    // handle last 0-15 bytes if any
        blr


// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl    vp,0,rs         // get permute vector to shift left
        lvx     v1,0,rs         // prefetch 1st source quadword
        b       1f

        .align  4               // align inner loops
1:                              // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd            // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd          // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f              // no leftover quadwords
        mtctr   r0
3:                              // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2        // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShort16    // handle last 0-15 bytes if any
        blr
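/*
 * LForwardVecUnal above is the classic Altivec misalignment idiom: lvx
 * silently truncates its effective address to a 16-byte boundary, lvsl
 * turns the low four address bits into a permute control, and vperm then
 * splices each output quadword from two adjacent source quadwords.  A
 * hedged intrinsics sketch of the same idiom ("copy_quads" is a
 * hypothetical name; dst must be 16-byte aligned and n >= 1, which the
 * asm guarantees before entering its loop):
 *
 *      #include <altivec.h>
 *
 *      static void copy_quads(unsigned char *dst, const unsigned char *src,
 *                             int n) {
 *          vector unsigned char perm = vec_lvsl(0, src); // shift-left control
 *          vector unsigned char prev = vec_ld(0, src);   // truncated load
 *          for (int i = 0; i < n; i++) {
 *              vector unsigned char next = vec_ld(16 * (i + 1), src);
 *              vec_st(vec_perm(prev, next, perm), 16 * i, dst);
 *              prev = next;        // like "vor v1,v2,v2" in the asm
 *          }
 *      }
 *
 * The asm unrolls this four quadwords per iteration and overlaps the
 * permutes with dcbt prefetches and dcba line allocations; the relatively
 * aligned case (cr5 beq) skips the permutes entirely.
 */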
// Medium and long, reverse moves.  We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment
//      cr7 = bge if long

LMediumReverse:
        add     rd,rd,rc        // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F      // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3     // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse // long enough for vectors

// Scalar loop.
//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6        // decrement length remaining
        mtxer   w6              // set up count for move
        sub     rs,rs,w6        // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4         // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc         // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs         // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0              // set up 16-byte loop
        b       1f

        .align  4
1:                              // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops.  First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4        // adjust length
        mtcrf   0x01,w4         // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF     // relatively 16-byte aligned?
        mtcrf   0x02,w4         // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6         // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0        // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned // dest is already aligned

// 32-byte align destination.

        bf      31,1f           // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f           // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f           // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f           // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 etc = loaded

LReverseAligned:
        mfspr   rv,vrsave       // get bitmap of live vector registers
        mtcrf   0x01,rc         // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31   // move last 0-15 byte count to w3
        mtctr   r0              // set up loop count
        cmpwi   cr6,w3,0        // set cr6 on leftover byte count
        oris    w1,rv,0xFF00    // we use v0-v7
        rlwinm. r0,rc,28,30,31  // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1       // update mask
        li      cm1,-1          // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal // handle unaligned operands
        b       1f

        .align  4               // align inner loops
1:                              // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f              // no leftover quadwords
        mtctr   r0
3:                              // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
        blr
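/*
 * The negative constants above (cm1 = -1, cm17 = -17, ...) exploit the same
 * truncation: lvx/stvx ignore the low four bits of the effective address,
 * so "lvx v1,cm1,rs" fetches the 16-byte block that ends just below rs,
 * "lvx v2,cm17,rs" the block before it, and so on down the buffer.  A
 * hedged C sketch of the address arithmetic ("lvx_block" is a hypothetical
 * helper showing which address lvx really uses):
 *
 *      #include <stdint.h>
 *
 *      static uintptr_t lvx_block(uintptr_t base, intptr_t offset) {
 *          return (base + (uintptr_t)offset) & ~(uintptr_t)15;
 *      }
 *
 *      // With rs = 0x1000:
 *      //   lvx_block(0x1000, -1)  == 0x0FF0  (quadword just below rs)
 *      //   lvx_block(0x1000, -17) == 0x0FE0  (the quadword before that)
 *
 * This lets the reverse loops step rs and rd down by 64 while addressing
 * all four quadwords with fixed offsets, mirroring the forward loops.
 */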
// Long, reverse, unaligned vector loop.

LReverseVecUnal:
        lvsl    vp,0,rs         // get permute vector to shift left
        lvx     v1,cm1,rs       // v1 always looks ahead
        b       1f

        .align  4               // align the inner loops
1:                              // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f              // no leftover quadwords
        mtctr   r0
2:                              // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2        // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv       // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)
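/*
 * COMMPAGE_DESCRIPTOR registers this routine for the comm page: at boot the
 * kernel picks the variant matching the CPU (this one requires Altivec and
 * is excluded on 64-bit hardware), applies the kCommPageDCBA patch that
 * turns the dcba instructions into nops where they do not help, and copies
 * the code to a fixed address mapped into every 32-bit task.  A hedged C
 * sketch of how user code could reach it (assumes the _COMM_PAGE_BCOPY
 * address constant from <machine/cpu_capabilities.h>; real libc stubs
 * branch there directly in assembly):
 *
 *      #include <machine/cpu_capabilities.h>
 *
 *      typedef void (*bcopy_fn)(const void *src, void *dst,
 *                               unsigned long len);
 *
 *      static void bcopy_via_commpage(const void *src, void *dst,
 *                                     unsigned long len) {
 *          ((bcopy_fn)_COMM_PAGE_BCOPY)(src, dst, len);
 *      }
 */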