/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache        20


; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32      // total size
#define BCOPY_SF_MSR    16      // we save caller's MSR here (possibly minus VEC and FP)


#define kShort          32      // short operands are special cased


; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - interrupts must be disabled when this routine is called.
;            - translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink. If both are the
; same, only one is loaded. We do not touch the IBATs, so there is no issue if either
; physical page address is the same as the virtual address of the instructions we are
; executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored,
; which is fine since addresses wider than 32 bits cannot occur there anyhow.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
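;
; For illustration only (this sketch is not part of the build), the page-crossing
; rule above amounts to the following C test; the helper name is hypothetical:
;
;     /* true if [addr, addr+len) spills out of addr's 4KB page */
;     static int crosses_page(uint32_t addr, uint32_t len) {
;         return ((addr ^ (addr + len - 1)) & 0xFFFFF000u) != 0;
;     }
;
; The code below computes exactly this for both operands (the xor of the first and
; last byte addresses, masked down to the page number bits) and falls back to
; bcopy_phys if either operand crosses.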

        .align  5
        .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mfsprg  r8,2                        ; get processor feature flags
        stw     r0,8(r1)                    ; save return address
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        subi    r0,r7,1                     ; get length - 1
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        add     r11,r3,r0                   ; Point to last byte of sink
        mr      r5,r7                       ; Get the length into the right register
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits

; This test for page overflow may not work if the length is negative. Negative lengths are invalid
; input to bcopy_physvir_32() on 32-bit machines, and will result in a panic.

        add     r12,r4,r0                   ; Point to last byte of source
        xor     r7,r11,r3                   ; See if we went to next page
        xor     r8,r12,r4                   ; See if we went to next page
        or      r0,r7,r8                    ; Combine wrap

//      li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
        li      r9,((2<<3)|2)               ; Set default attributes
        rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
        li      r7,2                        ; Set validity flags
        li      r8,2                        ; Set validity flags
        bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...

        rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
        rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
        rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
        rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
        cmplw   cr1,r11,r12                 ; See if sink and source are same block

        sync

        mtdbatl 0,r11                       ; Set sink lower DBAT
        mtdbatu 0,r7                        ; Set sink upper DBAT

        beq-    cr1,bcpvsame                ; Source and sink are in same block

        mtdbatl 1,r12                       ; Set source lower DBAT
        mtdbatu 1,r8                        ; Set source upper DBAT

bcpvsame:
        sync                                ; wait for the BATs to stabilize
        isync

        bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

        li      r0,0                        ; Get set to invalidate upper half of BATs
        sync                                ; Make sure all is well
        mtdbatu 0,r0                        ; Clear sink upper DBAT
        mtdbatu 1,r0                        ; Clear source upper DBAT
        sync
        isync

        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        mtlr    r0
        blr


; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored,
; which is fine since addresses wider than 32 bits cannot occur there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans across
; the boundary between RAM and I/O space. There is a good chance of hanging the machine,
; and this code will not check, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.
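;
; As a rough C-level sketch (illustrative only; msr_read/msr_write are hypothetical
; stand-ins for the mfmsr/mtmsr sequences below), the 32-bit path does:
;
;     void bcopy_phys(addr64_t from, addr64_t to, int nbytes) {
;         uint32_t msr = msr_read();
;         msr_write(msr & ~(MSR_DR | MSR_VEC | MSR_FP));  /* translation off */
;         bcopy(from, to, nbytes);
;         msr_write(msr_read() | MSR_DR);                 /* translation back on */
;     }
;
; The 64-bit path is the same idea, except that SF (64-bit addressing) is turned on
; for the copy and the caller's EE state is restored from the stack frame afterward.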

        .align  5
        .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        stw     r0,8(r1)                    ; save return address
        mfsprg  r8,2                        ; get processor feature flags
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
        mr      r5,r7                       ; Get the length into the right register

bcopy_phys1:                                ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
        mfmsr   r9                          ; Get the MSR
        lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
        ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
        andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
        bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

; 32-bit CPUs

        mtmsr   r9                          ; turn DR, FP, and VEC off
        isync                               ; Wait for it

        bl      EXT(bcopy)                  ; do the copy with translation off and caching on

        mfmsr   r9                          ; Get the MSR
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
        mtmsr   r9                          ; restore msr
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        mtlr    r0
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        blr


; 64-bit: turn DR off and SF on.

bcopy_phys64:                               ; r9 = MSR with DR, VEC, and FP off
        ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
        srdi    r2,r3,31                    ; Get a 1 if source is in I/O memory
        li      r0,1                        ; Note - we use this in a couple places below
        srdi    r10,r4,31                   ; Get a 1 if sink is in I/O memory
        std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
        rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on in MSR we will copy with
        cmpldi  cr0,r2,1                    ; Is source in I/O memory?
        cmpldi  cr7,r10,1                   ; Is sink in I/O memory?
        mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
        isync                               ; wait for it to happen
        cror    cr7_eq,cr0_eq,cr7_eq        ; See if either source or sink is in I/O area
        beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space

        bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
        mfmsr   r9                          ; Get the MSR we used to copy
        rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
        mtmsrd  r9                          ; turn 64-bit mode off, translation back on
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
        mtlr    r0
        mtmsrd  r8,1                        ; turn EE back on if necessary
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        blr

; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.
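;
; For reference, the I/O-space test performed above on each physical address is
; equivalent to this C sketch (illustrative only):
;
;     /* on these machines, physical addresses in [2GB, 4GB) map I/O space */
;     static int in_io_space(addr64_t pa) {
;         return (pa >> 31) == 1;     /* matches the srdi-by-31/cmpldi pair */
;     }
;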

io_space_real_mode_copy:                    ; r0=1, r9=MSR we want to copy with
        sldi    r11,r0,31-MSR_EE_BIT        ; Get a mask for the EE bit
        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        andc    r9,r9,r11                   ; Turn off EE bit
        mfspr   r2,hid4                     ; Get HID4
        mtmsrd  r9,1                        ; Force off EE
        or      r2,r2,r0                    ; Set bit to make real accesses cache-inhibited
        sync                                ; Sync up
        mtspr   hid4,r2                     ; Make real accesses cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared

        sync
        isync

        bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

        li      r0,1                        ; Get a 1
        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        mfspr   r2,hid4                     ; Get HID4
        andc    r2,r2,r0                    ; Clear the bit that makes real accesses cache-inhibited
        sync                                ; Sync up
        mtspr   hid4,r2                     ; Make real accesses not cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared
        b       bcopy_phys64x


;
; shortcopy
;
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses, this code cannot be called from bcopy_nc().
;   r4 = destination
;   r5 = length (<32)
;   r6 = source
;   r12 = (dest - source)

        .align  5
shortcopy:
        cmplw   r12,r5                      ; must move reverse if (dest-source)<length
        mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
        mtcrf   1,r5                        ; ...which is faster on G4 and G5
        bge++   backend                     ; handle forward moves (most common case)
        add     r6,r6,r5                    ; point one past end of operands in reverse moves
        add     r4,r4,r5
        b       bbackend                    ; handle reverse moves

;
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc. with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)

        .align  5
        .globl  EXT(bcopy)
        .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(bcopy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; to==from so nothing to do

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

        .align  5
        .globl  EXT(bcopy_nc)
        .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
        cmpwi   cr1,r5,0                    ; Check if we have a 0 length
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        crset   noncache                    ; Set non-cached
        cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; either zero length or to==from

;
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.

        .align  5
        .globl  EXT(memcpy)
        .globl  EXT(memmove)
        .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r3,r4                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r4                       ; Set source
        mr      r4,r3                       ; Set the "to" (must preserve r3 for return value)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(memcpy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        beqlr-                              ; exit if to==from


; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlays must be handled. We also
; cannot use the cache instructions if we came from bcopy_nc.
; We need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
; When entered:
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r12 = (dest - source)
;   cr5 = noncache flag

copyit32:                                   ; WARNING! can drop down to this label
        cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
        cntlzw  r11,r5                      ; get magnitude of length
        dcbt    0,r6                        ; start to touch in source
        lis     r10,hi16(0x80000000)        ; get 0x80000000
        neg     r9,r4                       ; start to get alignment for destination
        dcbtst  0,r4                        ; start to touch in destination
        sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
        blt-    cr1,reverse32bit            ; reverse move required
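
; Two idioms above deserve a C-level gloss (illustrative only, not build source):
;
;     /* direction test: if the destination starts inside the source operand, a
;        forward copy would clobber bytes not yet read, so move in reverse. The
;        unsigned compare also handles dest < source, since (dest - source)
;        then wraps to a huge value and the test fails, selecting forward. */
;     int reverse = (uint32_t)(dest - source) < (uint32_t)length;
;
;     /* alignment limiter: sraw of 0x80000000 by cntlzw(length) builds a mask
;        whose complement (applied via andc) keeps the alignment "front end"
;        from moving more bytes than the operand actually contains. */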

; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r8 = inverse of largest mask smaller than operand length
;   r9 = neg(dest), used to compute alignment
;   cr5 = noncache flag

forward32bit:                               ; enter from 64-bit CPUs with word aligned uncached operands
        rlwinm  r7,r9,0,0x1F                ; get bytes to 32-byte-align destination
        andc.   r0,r7,r8                    ; limit to the maximum front end move
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        beq     alline                      ; Already on a line...

        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        sub     r5,r5,r0                    ; Set the length left to move

        bf      31,alhalf                   ; No single byte to do...
        lbz     r7,0(r6)                    ; Get the byte
        addi    r6,r6,1                     ; Point to the next
        stb     r7,0(r4)                    ; Save the single
        addi    r4,r4,1                     ; Bump sink

; Sink is halfword aligned here

alhalf: bf      30,alword                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

alword: bf      29,aldouble                 ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

aldouble: bf    28,alquad                   ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

alquad: bf      27,alline                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        stw     r7,0(r4)                    ; Save the first word
        lwz     r11,12(r6)                  ; Get the fourth word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

alline: rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    backend                     ; No full lines to move

        mtctr   r0                          ; set up loop count
        li      r0,96                       ; Stride for touch ahead
        b       nxtline

        .align  4
nxtline:
        lwz     r2,0(r6)                    ; Get the first word
        lwz     r5,4(r6)                    ; Get the second word
        lwz     r7,8(r6)                    ; Get the third word
        lwz     r8,12(r6)                   ; Get the fourth word
        lwz     r9,16(r6)                   ; Get the fifth word
        lwz     r10,20(r6)                  ; Get the sixth word
        lwz     r11,24(r6)                  ; Get the seventh word
        lwz     r12,28(r6)                  ; Get the eighth word
        bt-     noncache,skipz              ; Skip if we are not cached...
        dcbz    0,r4                        ; Blow away the whole line because we are replacing it
        dcbt    r6,r0                       ; Touch ahead a bit
skipz:
        addi    r6,r6,32                    ; Point to the next
        stw     r2,0(r4)                    ; Save the first word
        stw     r5,4(r4)                    ; Save the second word
        stw     r7,8(r4)                    ; Save the third word
        stw     r8,12(r4)                   ; Save the fourth word
        stw     r9,16(r4)                   ; Save the fifth word
        stw     r10,20(r4)                  ; Save the sixth word
        stw     r11,24(r4)                  ; Save the seventh word
        stw     r12,28(r4)                  ; Save the eighth word
        addi    r4,r4,32                    ; Bump sink
        bdnz+   nxtline                     ; Do the next line, if any...
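
; The "backend" below mops up the residual bytes by testing individual bits of
; the length, which were loaded into cr6/cr7 by the mtcrf instructions above.
; In C terms, roughly (illustrative only):
;
;     if (len & 16) copy_16_bytes();   /* cr bit 27 */
;     if (len & 8)  copy_8_bytes();    /* cr bit 28 */
;     if (len & 4)  copy_4_bytes();    /* cr bit 29 */
;     if (len & 2)  copy_2_bytes();    /* cr bit 30 */
;     if (len & 1)  copy_1_byte();     /* cr bit 31 */
;
; so at most one move of each size finishes any count below 32.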

; Move backend quadword

backend:                                    ; Join here from "shortcopy" for forward moves <32 bytes
        bf      27,noquad                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        lwz     r11,12(r6)                  ; Get the fourth word
        stw     r7,0(r4)                    ; Save the first word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Move backend double

noquad: bf      28,nodouble                 ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

; Move backend word

nodouble: bf    29,noword                   ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Move backend halfword

noword: bf      30,nohalf                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

; Move backend byte

nohalf: bflr    31                          ; Leave cuz we are all done...
        lbz     r7,0(r6)                    ; Get the byte
        stb     r7,0(r4)                    ; Save the single
        blr
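
; Reverse moves walk both operands downward from the end, roughly (an
; illustrative C sketch, not build source):
;
;     src += len;                      /* point one past the last byte */
;     dst += len;
;     while (len--)
;         *--dst = *--src;             /* copy descending */
;
; The alignment front end and line loop below follow the same plan as the
; forward case, just with negative offsets and decrementing pointers.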

; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
;   r4 = destination
;   r5 = length (>0)
;   r6 = source
;   r8 = inverse of largest mask smaller than operand length
;   cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:                               ; here from 64-bit code with word aligned uncached operands
        add     r4,r5,r4                    ; Point past the last sink byte
        add     r6,r5,r6                    ; Point past the last source byte
        rlwinm  r7,r4,0,0x1F                ; Calculate the length to align dest on cache boundary
        li      r12,-1                      ; Make sure we touch in the actual line
        andc.   r0,r7,r8                    ; Apply movement limit
        dcbt    r12,r6                      ; Touch in the last line of source
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        dcbtst  r12,r4                      ; Touch in the last line of the sink
        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        beq-    balline                     ; Already on cache line boundary (or too short to bother)

        sub     r5,r5,r0                    ; Precalculate move length left after alignment

        bf      31,balhalf                  ; No single byte to do...
        lbz     r7,-1(r6)                   ; Get the byte
        subi    r6,r6,1                     ; Point to the next
        stb     r7,-1(r4)                   ; Save the single
        subi    r4,r4,1                     ; Bump sink

; Sink is halfword aligned here

balhalf: bf     30,balword                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword: bf     29,baldouble                ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble: bf   28,balquad                  ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad: bf     27,balline                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31              ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    bbackend                    ; No full lines to move
        mtctr   r0                          ; set up loop count
        b       bnxtline

        .align  4
bnxtline:
        lwz     r7,-32(r6)                  ; Get the first word
        lwz     r5,-28(r6)                  ; Get the second word
        lwz     r2,-24(r6)                  ; Get the third word
        lwz     r12,-20(r6)                 ; Get the fourth word
        lwz     r11,-16(r6)                 ; Get the fifth word
        lwz     r10,-12(r6)                 ; Get the sixth word
        lwz     r9,-8(r6)                   ; Get the seventh word
        lwz     r8,-4(r6)                   ; Get the eighth word
        subi    r6,r6,32                    ; Point to the next

        stw     r7,-32(r4)                  ; Save the first word
        stw     r5,-28(r4)                  ; Save the second word
        stw     r2,-24(r4)                  ; Save the third word
        stw     r12,-20(r4)                 ; Save the fourth word
        stw     r11,-16(r4)                 ; Save the fifth word
        stw     r10,-12(r4)                 ; Save the sixth word
        stw     r9,-8(r4)                   ; Save the seventh word
        stw     r8,-4(r4)                   ; Save the eighth word
        subi    r4,r4,32                    ; Bump sink

        bdnz+   bnxtline                    ; Do the next line, if any...

;
; Note: we touched these lines in at the beginning.
;

; Move backend quadword

bbackend:                                   ; Join here from "shortcopy" for reverse moves of <32 bytes
        bf      27,bnoquad                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad: bf     28,bnodouble                ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Move backend word

bnodouble: bf   29,bnoword                  ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword: bf     30,bnohalf                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Move backend byte

bnohalf: bflr   31                          ; Leave cuz we are all done...
        lbz     r7,-1(r6)                   ; Get the byte
        stb     r7,-1(r4)                   ; Save the single
        blr


// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32- or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
//   r4 = destination (32 or 64-bit ptr)
//   r5 = length (always 32 bits)
//   r6 = source (32 or 64-bit ptr)
//   r12 = (dest - source), reverse move required if (dest-source)<length
//   cr5 = noncache flag
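//
// The mode-independent direction test below deserves a gloss (illustrative C):
//
//     // subc computes (dest-source) - length and sets Carry iff there is no
//     // borrow, i.e. iff (dest-source) >= length as unsigned values; addze
//     // then materializes Carry in a register so cr0 can test it:
//     int reverse = (uint64_t)(dest - source) < (uint64_t)length;
//
// Because the subtraction uses the full register width, this gives the same
// answer whether we were entered in 32-bit or 64-bit mode.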

        .align  5
copyit64:
        rlwinm  r7,r5,0,0,31                // truncate length to 32 bits, in case we're running in 64-bit mode
        cntlzw  r11,r5                      // get magnitude of length
        dcbt    0,r6                        // touch in 1st block of source
        dcbtst  0,r4                        // touch in 1st destination cache block
        subc    r7,r12,r7                   // set Carry if (dest-source)>=length, in mode-independent way
        li      r0,0                        // get a 0
        lis     r10,hi16(0x80000000)        // get 0x80000000
        addze.  r0,r0                       // set cr0 on carry bit (beq if reverse move required)
        neg     r9,r4                       // start to get alignment for destination
        sraw    r8,r10,r11                  // get mask based on operand length, to limit alignment
        bt--    noncache,c64uncached        // skip if uncached
        beq--   c64rdouble                  // handle cached reverse moves


// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask smaller than operand length
//   r9 = neg(dest), used to compute alignment
//   cr5 = noncache flag

c64double:
        rlwinm  r7,r9,0,0x7F                // get #bytes to 128-byte align destination
        andc    r7,r7,r8                    // limit by operand length
        andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
        srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
        sub     r5,r5,r7                    // adjust length remaining
        cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
        srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi   cr7,r10,0                   // set cr7 on chunk count
        beq     c64double2                  // dest already doubleword aligned
        mtctr   r8
        b       c64double1

        .align  5                           // align inner loops
c64double1:                                 // copy bytes until dest is doubleword aligned
        lbz     r0,0(r6)
        addi    r6,r6,1
        stb     r0,0(r4)
        addi    r4,r4,1
        bdnz    c64double1

c64double2:                                 // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq     cr1,c64double4              // no doublewords to xfer in order to cache align
        mtctr   r9
        b       c64double3

        .align  5                           // align inner loops
c64double3:                                 // copy doublewords until dest is 128-byte aligned
        ld      r7,0(r6)
        addi    r6,r6,8
        std     r7,0(r4)
        addi    r4,r4,8
        bdnz    c64double3

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                 // r10/cr7=128-byte chunks
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
        cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
        beq     cr7,c64double7              // no 128-byte chunks

        ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
        ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.

        sub     r8,r6,r4                    // r8 <- (source - dest)
        rldicr. r0,r8,0,63-7                // zero low 7 bits and check for 0, mode independent
        cror    noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128
        mtctr   r10
        b       c64InnerLoop

        .align  5                           // align inner loop
c64InnerLoop:                               // loop copying 128-byte cache lines to 128-aligned destination
        ld      r0,0(r6)                    // start pipe: load 1st half-line
        ld      r2,8(r6)
        ld      r7,16(r6)
        ld      r8,24(r6)
        ld      r9,32(r6)
        ld      r10,40(r6)
        ld      r11,48(r6)
        ld      r12,56(r6)
        bt      noncache,c64InnerLoop1      // skip if uncached or overlap
        dcbz128 0,r4                        // avoid prefetch of next cache line
c64InnerLoop1:

        std     r0,0(r4)
        std     r2,8(r4)
        std     r7,16(r4)
        std     r8,24(r4)
        std     r9,32(r4)
        std     r10,40(r4)
        std     r11,48(r4)
        std     r12,56(r4)

        ld      r0,64(r6)                   // load 2nd half of chunk
        ld      r2,72(r6)
        ld      r7,80(r6)
        ld      r8,88(r6)
        ld      r9,96(r6)
        ld      r10,104(r6)
        ld      r11,112(r6)
        ld      r12,120(r6)
        addi    r6,r6,128

        std     r0,64(r4)
        std     r2,72(r4)
        std     r7,80(r4)
        std     r8,88(r4)
        std     r9,96(r4)
        std     r10,104(r4)
        std     r11,112(r4)
        std     r12,120(r4)
        addi    r4,r4,128                   // advance to next dest chunk

        bdnz    c64InnerLoop                // loop if more chunks


c64double7:                                 // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
        beq     cr1,c64byte                 // no leftover doublewords
        mtctr   r0
        b       c64double8

        .align  5                           // align inner loop
c64double8:                                 // loop copying leftover doublewords
        ld      r0,0(r6)
        addi    r6,r6,8
        std     r0,0(r4)
        addi    r4,r4,8
        bdnz    c64double8


// Forward byte loop.

c64byte:                                    // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes
        mtctr   r5
        b       c64byte1

        .align  5                           // align inner loop
c64byte1:
        lbz     r0,0(r6)
        addi    r6,r6,1
        stb     r0,0(r4)
        addi    r4,r4,1
        bdnz    c64byte1

        blr


// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask smaller than operand length
//   r9 = neg(dest), used to compute alignment
//   r12 = (dest-source), used to test relative alignment
//   cr0 = beq if reverse move required
//   cr5 = noncache flag

c64uncached:
        rlwinm  r10,r12,0,29,31             // relatively doubleword aligned?
        rlwinm  r11,r12,0,30,31             // relatively word aligned?
        cmpwi   cr7,r10,0                   // set cr7 beq if doubleword aligned
        cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
        beq--   c64reverseUncached

        beq     cr7,c64double               // doubleword aligned
        beq     cr1,forward32bit            // word aligned, use G3/G4 code
        cmpwi   r5,0                        // set cr0 on byte count
        b       c64byte                     // unaligned operands

c64reverseUncached:
        beq     cr7,c64rdouble              // doubleword aligned so can use LD/STD
        beq     cr1,reverse32bit            // word aligned, use G3/G4 code
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        cmpwi   r5,0                        // set cr0 on length
        b       c64rbyte                    // copy a byte at a time
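//
// The uncached dispatch above selects a copy width from the *relative*
// alignment of the operands, roughly (illustrative C):
//
//     ptrdiff_t d = dest - source;
//     if ((d & 7) == 0)      { /* ld/std path (c64double or c64rdouble)      */ }
//     else if ((d & 3) == 0) { /* lwz/stw G3/G4 path (forward32bit/reverse32bit) */ }
//     else                   { /* no safe width: byte-at-a-time path         */ }
//
// Only relative alignment matters: once the destination is aligned to the
// chosen width, the source is too, so no access straddles its natural width.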


// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
//   r4 = destination
//   r5 = length (>0)
//   r6 = source
//   r8 = inverse of largest mask of low-order 1s smaller than operand length
//   cr5 = noncache flag

c64rdouble:
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        rlwinm  r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
        andc.   r7,r7,r8                    // limit by operand length
        sub     r5,r5,r7                    // adjust length
        srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
        cmpwi   cr1,r8,0                    // any chunks?
        beq     c64rd2                      // dest already doubleword aligned
        mtctr   r7

c64rd1:                                     // copy bytes until dest is doubleword aligned
        lbzu    r0,-1(r6)
        stbu    r0,-1(r4)
        bdnz    c64rd1

c64rd2:                                     // r8/cr1 <- count of 64-byte chunks
        rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
        cmpwi   cr7,r0,0                    // leftover doublewords?
        beq     cr1,c64rd4                  // no chunks to xfer
        mtctr   r8
        b       c64rd3

        .align  5                           // align inner loop
c64rd3:                                     // loop copying 64-byte chunks
        ld      r7,-8(r6)
        ld      r8,-16(r6)
        ld      r9,-24(r6)
        ld      r10,-32(r6)
        ld      r11,-40(r6)
        ld      r12,-48(r6)
        std     r7,-8(r4)
        std     r8,-16(r4)
        ld      r7,-56(r6)
        ldu     r8,-64(r6)
        std     r9,-24(r4)
        std     r10,-32(r4)
        std     r11,-40(r4)
        std     r12,-48(r4)
        std     r7,-56(r4)
        stdu    r8,-64(r4)
        bdnz    c64rd3

c64rd4:                                     // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq     cr7,c64rbyte                // no leftover doublewords
        mtctr   r0

c64rd5:                                     // loop copying leftover doublewords
        ldu     r0,-8(r6)
        stdu    r0,-8(r4)
        bdnz    c64rd5


// Reverse byte loop.

c64rbyte:                                   // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes
        mtctr   r5

c64rbyte1:
        lbzu    r0,-1(r6)
        stbu    r0,-1(r4)
        bdnz    c64rbyte1

        blr
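
// For reference, the reverse doubleword path above behaves like this C sketch
// (illustrative only, ignoring the front-end length limit):
//
//     src += len;  dst += len;                    // point one past the end
//     while (((uintptr_t)dst & 7) && len)         // byte-copy to align dest
//         { *--dst = *--src; len--; }
//     while (len >= 8) {                          // doublewords, descending
//         src -= 8; dst -= 8;
//         *(uint64_t *)dst = *(uint64_t *)src;
//         len -= 8;
//     }
//     while (len--)                               // leftover bytes
//         *--dst = *--src;
//
// The 64-byte chunk loop is just the doubleword loop unrolled eight deep, with
// loads hoisted ahead of stores to cover load latency.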