/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys
        .globl  _bzero_phys_nc


// *****************************
// * B Z E R O _ P H Y S _ N C *
// *****************************
//
//	void bzero_phys_nc(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4) and length in r5.  NO CACHING.

        .align  5
LEXT(bzero_phys_nc)
        mflr    r12                 // save return address
        rlwinm  r3,r3,0,1,0         // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5               // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero_nc)       // use normal bzero() routine
        mtlr    r12                 // restore return
        b       EXT(ml_restore)     // restore MSR, turning DR on and SF off


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
//	void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4) and length in r5.  We leave cache on.

        .align  5
LEXT(bzero_phys)
        mflr    r12                 // save return address
        rlwinm  r3,r3,0,1,0         // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5               // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)          // use normal bzero() routine
        mtlr    r12                 // restore return
        b       EXT(ml_restore)     // restore MSR, turning DR on and SF off
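
// A note on the (r3,r4) coalescing idiom above, with an illustrative value
// (not from the source): suppose phys_addr = 0x00000001_23456000, so on
// entry r3 = 0x00000001 and r4 = 0x23456000.  In 64-bit mode, rlwinm
// replicates the rotated low word into both halves of the 64-bit target,
// and the wrapping mask 1,0 selects all 64 bits, so "rlwinm r3,r3,0,1,0"
// yields r3 = 0x00000001_00000001.  "rlwimi r3,r4,0,0,31" then inserts r4
// under a low-word mask, leaving r3 = 0x00000001_23456000.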
// *******************
// * B Z E R O _ N C *
// *******************
//
//	void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory.  Doesn't seem to be used at all, so probably not
// performance critical.  NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory.  Of course, we must also avoid dcbz.

LEXT(bzero_nc)
        cmplwi  cr1,r4,20           // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0            // check for (len==0)
        li      r6,0                // get a 0
        bge     cr1,bznc1           // skip if length >=20
        mtctr   r4                  // set up byte loop
        beqlr-- cr7                 // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.

bznc1:
        neg     r7,r3               // start to compute #bytes to align
        mfsprg  r10,2               // get feature flags
        andi.   r0,r7,7             // get #bytes to doubleword align
        mr      r5,r3               // make copy of operand ptr as bzero_tail expects
        mtcrf   0x02,r10            // put pf64Bitb etc in cr6
        beq     bzero_tail          // already doubleword aligned
        sub     r4,r4,r0            // adjust count
        mtctr   r0                  // set up loop
bznc2:                              // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail          // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
//	void * memset(void *b, int c, size_t len);
//	void   bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode.  Lengths (size_t) are always 32 bits.
//
// Register use:
//	r0  = temp
//	r2  = temp
//	r3  = original ptr, not changed since memset returns it
//	r4  = count of bytes to set
//	r5  = working operand ptr ("rp")
//	r6  = value to store (usually 0)
//	r7-r9 = temps
//	r10 = feature flags
//	r11 = old MSR (if bzero_phys)
//	r12 = return address (if bzero_phys)
//	cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)

        .align  5
LEXT(memset)                        // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF          // copy value to working register, test for 0
        mr      r4,r5               // move length to working register
        bne--   memset1             // branch if value is nonzero
LEXT(bzero)                         // void bzero(void *b, size_t len);
        dcbtst  0,r3                // touch in 1st cache block
        mfsprg  r10,2               // get features
        li      r6,0                // get a 0
        neg     r7,r3               // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf   0x02,r10            // put pf128Byte etc in cr6
        cmplw   r4,r0               // operand length >= cache line size?
        mr      r5,r3               // make copy of operand ptr (can't change r3)
        blt     bzero_tail          // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F        // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F        // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128 // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//	r0 = #bytes to 32-byte align
//	r4 = length
//	r5 = ptr to operand
//	r6 = 0

        sub     r2,r4,r0            // adjust length
        cmpwi   cr1,r0,0            // already 32-byte aligned?
        srwi.   r8,r2,5             // get #32-byte chunks
        beq     bzero_tail          // not long enough to dcbz
        mtctr   r8                  // set up loop count
        rlwinm  r4,r2,0,27,31       // mask down to leftover byte count
        beq     cr1,bz_dcbz32       // skip if already 32-byte aligned

// 32-byte align.  We just store 32 0s, rather than test and use conditional
// branches.  This is usually faster, because there are no mispredicts.

        stw     r6,0(r5)            // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0            // now r5 is 32-byte aligned
        b       bz_dcbz32
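
// A note on the unconditional 32-byte store above, with an illustrative
// address (not from the source): if r5 ends in 0x0C, then r0 = (-r5) & 0x1F
// = 20, the eight stw's zero offsets 0x0C..0x2B, and r5 advances to the
// 32-byte boundary at 0x20.  Bytes 0x20..0x2B are zeroed a second time by
// the first dcbz, which is harmless since everything is being zeroed, and
// the chunk-count check above guarantees at least one full cache line
// remains, so the 32-byte store cannot overrun the buffer.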
// Loop doing 32-byte version of DCBZ instruction.

        .align  4                   // align the inner loop
bz_dcbz32:
        dcbz    0,r5                // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

// Store trailing bytes.  This routine is used both by bzero and memset.
//	r4 = #bytes to store (may be large if memset)
//	r5 = address
//	r6 = value to store (in all 8 bytes)
//	cr6 = pf64Bit etc flags

bzero_tail:
        srwi.   r0,r4,4             // get #(16-byte-chunks)
        mtcrf   0x01,r4             // remaining byte count to cr7
        beq     bzt3                // no 16-byte chunks
        mtctr   r0                  // set up loop count
        bt++    pf64Bitb,bzt2       // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                               // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                               // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4             // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4             // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5             // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6             // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                  // byte?
        stb     r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128.  We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//	r4 = length
//	r5 = ptr to operand
//	r6 = 0
//	r7 = neg(r5)
//	r9 = #bytes to 128-byte align

        .align  5
bzero_128:
        sub     r2,r4,r9            // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF         // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7             // r8 <- number of cache lines to 0
        std     r6,0(r5)            // always store 16 bytes to 16-byte align...
        std     r6,8(r5)            // ...even if too short for dcbz128
        add     r5,r5,r0            // 16-byte align ptr
        sub     r4,r4,r0            // adjust count
        beq     bzero_tail          // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0            // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F        // r4 <- length remaining after dcbz128'ing
        mtctr   r8                  // set up dcbz128 loop
        beq     bz_dcbz128          // already 128-byte aligned
        b       bz_align            // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                           // loop over 16-byte chunks
        subic.  r7,r7,16            // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128          // enter dcbz128 loop

// Loop over 128-byte cache lines.
//	r4 = length remaining after cache lines (0..127)
//	r5 = ptr (128-byte aligned)
//	r6 = 0
//	ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail          // handle leftovers
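
// A note on how bzero_tail (above) dispatches on the final 0..15 bytes:
// "mtcrf 0x01,r4" copies the low four bits of the byte count into cr7, so
// CR bits 28, 29, 30, and 31 correspond to the 8-, 4-, 2-, and 1-byte
// amounts, and each "bf" falls through to its store exactly when that
// amount is owed.  For example (illustrative count), a leftover count of
// 13 (0b1101) stores 8 + 4 + 1 bytes.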
// Handle memset() for nonzero values.  This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//	r3 = ptr
//	r4 = count
//	r6 = value in lower byte (nonzero)

memset1:
        cmplwi  r4,16               // too short to bother aligning?
        rlwimi  r6,r6,8,16,23       // replicate value to low 2 bytes
        mr      r5,r3               // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15       // value now in all 4 bytes
        blt     bzero_tail          // length<16, we won't be using "std"
        mfsprg  r10,2               // get feature flags
        neg     r7,r5               // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0         // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7             // r0 <- #bytes to doubleword align
        stw     r6,0(r5)            // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10            // get pf64Bit flag etc in cr6
        sub     r4,r4,r0            // adjust count
        add     r5,r5,r0            // doubleword align ptr
        b       bzero_tail
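
// A worked example of the value replication in memset1, with an illustrative
// value (not from the source): starting from c = 0xAB, so r6 = 0x000000AB,
// "rlwimi r6,r6,8,16,23" inserts the byte one position up, giving
// r6 = 0x0000ABAB; "rlwimi r6,r6,16,0,15" copies that halfword into the
// upper half, giving r6 = 0xABABABAB; and on a 64-bit processor
// "rlwinm r6,r6,0,1,0" replicates the word into both halves of the 64-bit
// register, giving r6 = 0xABABABAB_ABABABAB, ready for the "std" stores in
// bzero_tail.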