/*
 * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>


// commpage_time_dcba() uses a stack frame as follows:

#define kBufSiz         1024                // Size of the buffer we use to do DCBA timing on G4
#define kSFSize         (kBufSiz+128+16)    // Stack frame size, which contains the 128-byte-aligned buffer
#define kLoopCnt        5                   // Iterations of the timing loop
#define kDCBA           22                  // Bit in cr5 used as a flag in timing loop


// commpage_set_timestamp() uses the red zone for temporary storage.
// All offsets are relative to r1 (the stack pointer):

#define rzSaveF1        -8                  // caller's FPR1
#define rzSaveF2        -16                 // caller's FPR2
#define rzSaveF3        -24                 // caller's FPR3
#define rzSaveF4        -32                 // caller's FPR4
#define rzSaveF5        -40                 // caller's FPR5
#define rzNewTimeBase   -48                 // used to load 64-bit TBR into a FPR


// commpage_set_timestamp() uses the following data.  kkTicksPerSec remembers
// the number used to compute _COMM_PAGE_SEC_PER_TICK.  Since this constant
// rarely changes, we use it to avoid needless recomputation.  It is a double
// value, pre-initialized with an exponent of 2**52.
#define kkBinary0       0                   // offset in data to long long 0 (a constant)
#define kkDouble1       8                   // offset in data to double 1.0 (a constant)
#define kkTicksPerSec   16                  // offset in data to double(ticks_per_sec)

        .data
        .align  3                           // three doubleword fields
Ldata:
        .long   0                           // kkBinary0
        .long   0
        .double 1.0e0                       // kkDouble1
        .long   0x43300000                  // kkTicksPerSec (plus 2**52)
        .long   0                           // this is where we store ticks_per_sec, to float

        .text
        .align  2
        .globl  EXT(commpage_time_dcba)
        .globl  EXT(commpage_set_timestamp)


/*  ***********************************************
 *  * C O M M P A G E _ S E T _ T I M E S T A M P *
 *  ***********************************************
 *
 * Update the gettimeofday() shared data on the commpages, as follows:
 *      _COMM_PAGE_TIMESTAMP = the clock offset at timebase (seconds)
 *      _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
 *      _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
 * The convention is that if the timebase is 0, the data is invalid.  Because other
 * CPUs are reading the three values asynchronously and must get a consistent set,
 * it is critical that we update them with the following protocol:
 *      1. set timebase to 0 (atomically), to invalidate all three values
 *      2. eieio (to create a barrier in stores to cacheable memory)
 *      3. change timestamp and "secs per tick"
 *      4. eieio
 *      5. set timebase nonzero (atomically)
 * This works because readers read the timebase, then the timestamp and divisor, sync
 * if MP, then read the timebase a second time and check to be sure it is equal to the first.
 *
 * We could save a few cycles on 64-bit machines by special casing them, but it probably
 * isn't necessary because this routine shouldn't be called very often.
 *
 * When called:
 *      r3 = upper half of timebase (timebase is disabled if 0)
 *      r4 = lower half of timebase
 *      r5 = upper half of timestamp
 *      r6 = lower half of timestamp
 *      r7 = divisor (ie, timebase ticks per sec)
 * We set up:
 *      r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
 *      r9 = ptr to 32-bit commpage in kernel map, minus _COMM_PAGE_BASE_ADDRESS,
 *           so that the _COMM_PAGE_xxx symbols can be used directly as displacements
 *      r10 = ptr to 64-bit commpage in kernel map, biased the same way
 *            (aliases r9 if there is no 64-bit commpage)
 *
 * Registers modified: r0, r2, r8-r11, cr0, cr1, cr6.
 * f1-f5 are saved in the red zone and restored before return; the FPSCR is
 * saved and restored around the divide; the MSR is restored from r11.
 * If the 32-bit commpage pointer is still 0 we return without storing anything.
 *
 * --> Interrupts must be disabled and rtclock locked when called.  <--
 */

        .align  5
LEXT(commpage_set_timestamp)                // void commpage_set_timestamp(tbr,secs,divisor)
        mfmsr   r11                         // get MSR (kept in r11 until exit)
        ori     r2,r11,MASK(MSR_FP)         // turn FP on
        mtmsr   r2
        isync                               // wait until MSR changes take effect

        or.     r0,r3,r4                    // is timebase 0? (thus disabled) -- cr0 is tested after label 1
        lis     r8,hi16(Ldata)              // point to our data
        lis     r9,ha16(EXT(commPagePtr32)) // get ptrs to address of commpages in kernel map
        lis     r10,ha16(EXT(commPagePtr64))
        stfd    f1,rzSaveF1(r1)             // save a FPR in the red zone
        ori     r8,r8,lo16(Ldata)
        lwz     r9,lo16(EXT(commPagePtr32))(r9)   // r9 <- 32-bit commpage ptr
        lwz     r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
        lfd     f1,kkBinary0(r8)            // get fixed 0s
        li      r0,_COMM_PAGE_BASE_ADDRESS  // get va in user space of commpage
        cmpwi   cr1,r9,0                    // is 32-bit commpage allocated yet?
        cmpwi   cr6,r10,0                   // is 64-bit commpage allocated yet?
        sub     r9,r9,r0                    // r9 <- 32-bit commpage address, biased by user va
        sub     r10,r10,r0                  // r10<- 64-bit commpage address
        beq--   cr1,3f                      // skip if 32-bit commpage not allocated (64-bit won't be either)
        bne++   cr6,1f                      // skip if 64-bit commpage is allocated
        mr      r10,r9                      // if no 64-bit commpage, point to 32-bit version with r10 too
1:
        stfd    f1,_COMM_PAGE_TIMEBASE(r9)  // turn off the 32-bit-commpage timestamp (atomically)
        stfd    f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
        eieio                               // make sure all CPUs see it is off
        beq     3f                          // all we had to do is turn off timestamp (cr0 from "or." above)

        lwz     r0,kkTicksPerSec+4(r8)      // get last ticks_per_sec (or 0 if first)
        stw     r3,rzNewTimeBase(r1)        // store new timebase so we can lfd
        stw     r4,rzNewTimeBase+4(r1)
        cmpw    r0,r7                       // do we need to recompute _COMM_PAGE_SEC_PER_TICK?
        stw     r5,_COMM_PAGE_TIMESTAMP(r9) // store the new timestamp in the 32-bit page
        stw     r6,_COMM_PAGE_TIMESTAMP+4(r9)
        stw     r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
        stw     r6,_COMM_PAGE_TIMESTAMP+4(r10)
        lfd     f1,rzNewTimeBase(r1)        // get timebase in a FPR so we can store atomically
        beq++   2f                          // same ticks_per_sec, no need to recompute

        stw     r7,kkTicksPerSec+4(r8)      // must recompute SEC_PER_TICK
        stfd    f2,rzSaveF2(r1)             // we'll need a few more temp FPRs
        stfd    f3,rzSaveF3(r1)
        stfd    f4,rzSaveF4(r1)
        stfd    f5,rzSaveF5(r1)
        lfd     f2,_COMM_PAGE_2_TO_52(r9)   // f2 <- double(2**52)
        lfd     f3,kkTicksPerSec(r8)        // float new ticks_per_sec + 2**52
        lfd     f4,kkDouble1(r8)            // f4 <- double(1.0)
        mffs    f5                          // save caller's FPSCR
        mtfsfi  7,1                         // clear Inexact Exception bit, set round-to-zero
        fsub    f3,f3,f2                    // get ticks_per_sec
        fdiv    f3,f4,f3                    // divide 1 by ticks_per_sec to get SEC_PER_TICK
        stfd    f3,_COMM_PAGE_SEC_PER_TICK(r9)
        stfd    f3,_COMM_PAGE_SEC_PER_TICK(r10)
        mtfsf   0xFF,f5                     // restore FPSCR
        lfd     f2,rzSaveF2(r1)             // restore FPRs
        lfd     f3,rzSaveF3(r1)
        lfd     f4,rzSaveF4(r1)
        lfd     f5,rzSaveF5(r1)
2:                                          // f1 == new timebase
        eieio                               // wait until the stores take
        stfd    f1,_COMM_PAGE_TIMEBASE(r9)  // then turn the timestamp back on (atomically)
        stfd    f1,_COMM_PAGE_TIMEBASE(r10) // both
3:                                          // here once all fields updated
        lfd     f1,rzSaveF1(r1)             // restore last FPR
        mtmsr   r11                         // turn FP back off
        isync
        blr


/*  ***************************************
 *  * C O M M P A G E _ T I M E _ D C B A *
 *  ***************************************
 *
 * Not all processors that support the DCBA opcode actually benefit from it.
 * Some store-gather and read-cancel well enough that there is no need to use
 * DCBA to avoid fetching cache lines that will be completely overwritten, while
 * others have this feature disabled (to work around errata etc), and so benefit
 * from DCBA.  Since it is hard to tell the one group from the other, we just
 * time loops with and without DCBA, and pick the fastest.  Thus we avoid
 * delicate dependence on processor and/or platform revisions.
 *
 * We return either kDcbaRecommended or zero.  DCBA is recommended only if
 * (DCBA time + DCBA time/8) is still less than the time without DCBA.
 *
 * Registers modified: r0, r3-r12, ctr, cr0, and the kDCBA bit of cr5.
 * The return address is kept in r12 across the calls to LTest (it is also
 * stored at 8(r1) of the caller's frame, but restored from r12).
 *
 *      int commpage_time_dcba( void );
 */

LEXT(commpage_time_dcba)
        mflr    r12                         // get return
        stw     r12,8(r1)                   // save
        stwu    r1,-kSFSize(r1)             // carve our temp buffer from the stack
        addi    r11,r1,127+16               // get base address...
        rlwinm  r11,r11,0,0,24              // ...of our buffer, 128-byte aligned
        crset   kDCBA                       // first, use DCBA
        bl      LTest                       // time it with DCBA
        srwi    r0,r3,3                     // bias 12 pct in favor of not using DCBA...
        add     r10,r3,r0                   // ...because DCBA is always slower with warm cache
        crclr   kDCBA
        bl      LTest                       // time without DCBA
        cmplw   r10,r3                      // which is better? (biased DCBA time vs. no-DCBA time)
        mtlr    r12                         // restore return
        lwz     r1,0(r1)                    // pop off our stack frame
        li      r3,kDcbaRecommended         // assume using DCBA is faster
        bltlr                               // return if biased DCBA time is lower
        li      r3,0                        // no DCBA is faster
        blr


// Subroutine to time a loop with or without DCBA.
//      kDCBA = set if we should use DCBA
//      r11 = base of buffer to use for test (kBufSiz bytes)
//
// We return TBR ticks in r3: the smallest elapsed time (lower half of the
// timebase) seen over kLoopCnt passes.  A pass in which the upper half of
// the timebase changed is discarded and repeated without being counted.
// Each pass first flushes the buffer with dcbf, then stores to every word
// of it, stepping 32 bytes per block.
// We use r0,r3-r9 (plus ctr and cr0).

LTest:
        li      r4,kLoopCnt                 // number of times to loop
        li      r3,-1                       // initialize fastest time (largest unsigned value)
1:
        mr      r6,r11                      // initialize buffer ptr
        li      r0,kBufSiz/32               // r0 <- cache blocks to test
        mtctr   r0
2:
        dcbf    0,r6                        // first, force the blocks out of the cache
        addi    r6,r6,32
        bdnz    2b
        sync                                // make sure all the flushes take
        mr      r6,r11                      // re-initialize buffer ptr
        mtctr   r0                          // reset cache-block count
        mftbu   r7                          // remember upper half so we can check for carry
        mftb    r8                          // start the timer
3:                                          // loop over cache blocks
        bf      kDCBA,4f                    // should we DCBA?
        dcba    0,r6
4:
        stw     r0,0(r6)                    // store the entire cache block (value stored is irrelevant)
        stw     r0,4(r6)
        stw     r0,8(r6)
        stw     r0,12(r6)
        stw     r0,16(r6)
        stw     r0,20(r6)
        stw     r0,24(r6)
        stw     r0,28(r6)
        addi    r6,r6,32
        bdnz    3b
        mftb    r9                          // stop the timer
        mftbu   r0
        cmpw    r0,r7                       // did timebase carry?
        bne     1b                          // yes, retest rather than fuss
        sub     r9,r9,r8                    // r9 <- time for this loop
        cmplw   r9,r3                       // faster than current best?
        bge     5f                          // no
        mr      r3,r9                       // remember fastest time through loop
5:
        subi    r4,r4,1                     // decrement outer loop count
        cmpwi   r4,0                        // more to go?
        bne     1b                          // loop if so
        blr                                 // return fastest time in r3