1/*
2 * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/appleapiopts.h>
30#include <ppc/asm.h>
31#include <ppc/proc_reg.h>
32#include <machine/cpu_capabilities.h>
33#include <machine/commpage.h>
34
35
36// commpage_time_dcba() uses a stack frame as follows:
37
// Frame-layout and loop constants for commpage_time_dcba() and LTest.
// kSFSize is the buffer itself (kBufSiz), plus 128 bytes of slack so the
// buffer base can be rounded to a 128-byte boundary, plus the 16 bytes that
// commpage_time_dcba() skips at the bottom of the frame before aligning.
38#define	kBufSiz		1024				// Size of the buffer we use to do DCBA timing on G4
39#define	kSFSize		(kBufSiz+128+16)	// Stack frame size, which contains the 128-byte-aligned buffer
40#define	kLoopCnt	5					// Iterations of the timing loop
41#define	kDCBA		22					// Bit in cr5 used as a flag in timing loop
42
43
44// commpage_set_timestamp() uses the red zone for temporary storage:
45
// Each slot is an 8-byte doubleword at a negative displacement from r1 (the
// stack pointer), accessed with lfd/stfd (or a pair of stw for rzNewTimeBase).
46#define	rzSaveF1			-8		// caller's FPR1
47#define	rzSaveF2			-16		// caller's FPR2
48#define	rzSaveF3			-24		// caller's FPR3
49#define	rzSaveF4			-32		// caller's FPR4
50#define	rzSaveF5			-40		// caller's FPR5
51#define	rzNewTimeBase		-48		// used to load 64-bit TBR into a FPR
52
53
54// commpage_set_timestamp() uses the following data.  kkTicksPerSec remembers
55// the number used to compute _COMM_PAGE_SEC_PER_TICK.  Since this constant
56// rarely changes, we use it to avoid needless recomputation.  It is a double
57// value, pre-initialized with an exponent of 2**52.
58
// Byte offsets of the three doubleword fields at Ldata; commpage_set_timestamp()
// addresses them as displacements from r8.
59#define	kkBinary0		0					// offset in data to long long 0 (a constant)
60#define	kkDouble1		8					// offset in data to double 1.0 (a constant)
61#define	kkTicksPerSec	16					// offset in data to double(ticks_per_sec)
62
// Static data for commpage_set_timestamp().  The order and size of the fields
// must match the kkBinary0/kkDouble1/kkTicksPerSec offsets.
//
// The kkTicksPerSec doubleword has the high word of double(2**52) in its first
// word.  commpage_set_timestamp() stores the 32-bit ticks_per_sec integer into
// the second word (offset +4), loads the doubleword as a double, and subtracts
// 2**52 to obtain double(ticks_per_sec).  The second word also serves as the
// cache of the last ticks_per_sec seen (0 until the first call).
63        .data
64        .align	3							// three doubleword fields
65Ldata:
66        .long	0							// kkBinary0
67        .long	0
68        .double	1.0e0						// kkDouble1
69        .long	0x43300000					// kkTicksPerSec (plus 2**52)
70        .long	0							// this is where we store ticks_per_sec, to float
71
// Switch to the code section and export the two entry points defined in this
// file.  LTest is a local helper and is not exported.
72        .text
73        .align	2
74        .globl	EXT(commpage_time_dcba)
75        .globl	EXT(commpage_set_timestamp)
76
77
78/*	***********************************************
79 *	* C O M M P A G E _ S E T _ T I M E S T A M P *
80 *	***********************************************
81 *
82 *	Update the gettimeofday() shared data on the commpages, as follows:
83 *		_COMM_PAGE_TIMESTAMP = the clock offset at timebase (seconds)
84 *		_COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
85 *		_COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
86 *	The convention is that if the timebase is 0, the data is invalid.  Because other
87 *	CPUs are reading the three values asynchronously and must get a consistent set,
88 *	it is critical that we update them with the following protocol:
89 *		1. set timebase to 0 (atomically), to invalidate all three values
90 *		2. eieio (to create a barrier in stores to cacheable memory)
91 *		3. change timestamp and "secs per tick"
92 *		4. eieio
93 *		5. set timebase nonzero (atomically)
94 *	This works because readers read the timebase, then the timestamp and divisor, sync
95 *	if MP, then read the timebase a second time and check to be sure it is equal to the first.
96 *
97 *	We could save a few cycles on 64-bit machines by special casing them, but it probably
98 *	isn't necessary because this routine shouldn't be called very often.
99 *
100 *	When called:
101 *		r3 = upper half of timebase (timebase is disabled if 0)
102 *		r4 = lower half of timebase
103 *		r5 = upper half of timestamp
104 *		r6 = lower half of timestamp
105 *		r7 = divisor (ie, timebase ticks per sec)
106 *	We set up:
107 *		r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
108 *		r9 = ptr to 32-bit commpage in kernel map
109 *     r10 = ptr to 64-bit commpage in kernel map
110 *
111 *	--> Interrupts must be disabled and rtclock locked when called.  <--
112 */
113
// commpage_set_timestamp: publish a new (timebase, timestamp, sec-per-tick)
// triple to both commpages using the invalidate / update / revalidate protocol.
//
//	In:		r3:r4 = timebase (0 means "disable"), r5:r6 = timestamp,
//			r7 = timebase ticks per second
//	Out:	nothing
//	Uses:	r0, r2, r8-r11, cr0, cr1, cr6.  f1-f5 and the FPSCR are used but
//			saved and restored; the MSR is restored from r11 on exit.
//	Stack:	no frame; 48 bytes below r1 are used as scratch (rzSaveF1..rzNewTimeBase).
//
// The FPU is enabled only so that 64-bit values can be moved with a single
// lfd/stfd, which is what makes the timebase stores atomic.
114        .align	5
115LEXT(commpage_set_timestamp)				// void commpage_set_timestamp(tbr,secs,divisor)
116        mfmsr	r11							// get MSR
117        ori		r2,r11,MASK(MSR_FP)			// turn FP on
118        mtmsr	r2
119        isync								// wait until MSR changes take effect
120
        // cr0 is set here and not consumed until the "beq 3f" after the first
        // eieio; the intervening compares deliberately target cr1 and cr6.
121        or.		r0,r3,r4					// is timebase 0? (thus disabled)
122        lis		r8,hi16(Ldata)				// point to our data
123        lis		r9,ha16(EXT(commPagePtr32))	// get ptrs to address of commpages in kernel map
124		lis		r10,ha16(EXT(commPagePtr64))
125        stfd	f1,rzSaveF1(r1)				// save a FPR in the red zone
126        ori		r8,r8,lo16(Ldata)
127        lwz		r9,lo16(EXT(commPagePtr32))(r9)	// r9 <- 32-bit commpage ptr
128		lwz		r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
129        lfd		f1,kkBinary0(r8)			// get fixed 0s
        // The base address is subtracted from both pointers so the _COMM_PAGE_xxx
        // constants can be used directly as displacements below.  The null checks
        // are made on the unbiased pointers, before the subtraction.
130        li		r0,_COMM_PAGE_BASE_ADDRESS	// get va in user space of commpage
131        cmpwi	cr1,r9,0					// is 32-bit commpage allocated yet?
132		cmpwi   cr6,r10,0					// is 64-bit commpage allocated yet?
133        sub		r9,r9,r0					// r9 <- 32-bit commpage address, biased by user va
134		sub		r10,r10,r0					// r10<- 64-bit commpage address
135        beq--	cr1,3f						// skip if 32-bit commpage not allocated (64-bit won't be either)
136		bne++   cr6,1f						// skip if 64-bit commpage is allocated
        // With r10 == r9 every "64-bit" store below simply repeats the 32-bit one,
        // so the rest of the routine needs no further special-casing.
137		mr		r10,r9						// if no 64-bit commpage, point to 32-bit version with r10 too
1381:
        // Protocol steps 1-2: invalidate by zeroing the timebase, then barrier.
139        stfd	f1,_COMM_PAGE_TIMEBASE(r9)	// turn off the 32-bit-commpage timestamp (atomically)
140		stfd	f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
141        eieio								// make sure all CPUs see it is off
142        beq		3f							// all we had to do is turn off timestamp
143
        // Protocol step 3: write the new timestamp.  The new timebase is staged
        // through the red zone so it can be loaded into f1 as one 64-bit value.
144        lwz		r0,kkTicksPerSec+4(r8)		// get last ticks_per_sec (or 0 if first)
145        stw		r3,rzNewTimeBase(r1)		// store new timebase so we can lfd
146        stw		r4,rzNewTimeBase+4(r1)
147        cmpw	r0,r7						// do we need to recompute _COMM_PAGE_SEC_PER_TICK?
148        stw		r5,_COMM_PAGE_TIMESTAMP(r9)	// store the new timestamp in the 32-bit page
149        stw		r6,_COMM_PAGE_TIMESTAMP+4(r9)
150        stw		r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
151        stw		r6,_COMM_PAGE_TIMESTAMP+4(r10)
152        lfd		f1,rzNewTimeBase(r1)		// get timebase in a FPR so we can store atomically
153        beq++	2f							// same ticks_per_sec, no need to recompute
154
        // Slow path: ticks_per_sec changed.  Convert the integer in r7 to a double
        // by storing it under the 2**52 exponent and subtracting 2**52, then take
        // the reciprocal.  The caller's FPSCR is preserved around the arithmetic.
155        stw		r7,kkTicksPerSec+4(r8)		// must recompute SEC_PER_TICK
156        stfd	f2,rzSaveF2(r1)				// we'll need a few more temp FPRs
157        stfd	f3,rzSaveF3(r1)
158        stfd	f4,rzSaveF4(r1)
159        stfd	f5,rzSaveF5(r1)
160        lfd		f2,_COMM_PAGE_2_TO_52(r9)	// f2 <- double(2**52)
161        lfd		f3,kkTicksPerSec(r8)		// float new ticks_per_sec + 2**52
162        lfd		f4,kkDouble1(r8)			// f4 <- double(1.0)
163        mffs	f5							// save caller's FPSCR
164        mtfsfi	7,1							// clear Inexact Exception enable bit, set round-to-zero
165        fsub	f3,f3,f2					// get ticks_per_sec
166        fdiv	f3,f4,f3					// divide 1 by ticks_per_sec to get SEC_PER_TICK
167        stfd	f3,_COMM_PAGE_SEC_PER_TICK(r9)
168        stfd	f3,_COMM_PAGE_SEC_PER_TICK(r10)
169        mtfsf	0xFF,f5						// restore FPSCR
170        lfd		f2,rzSaveF2(r1)				// restore FPRs
171        lfd		f3,rzSaveF3(r1)
172        lfd		f4,rzSaveF4(r1)
173        lfd		f5,rzSaveF5(r1)
        // Protocol steps 4-5: barrier, then revalidate with the nonzero timebase.
1742:											// f1 == new timebase (loaded from rzNewTimeBase above)
175        eieio								// wait until the stores take
176        stfd	f1,_COMM_PAGE_TIMEBASE(r9)	// then turn the timestamp back on (atomically)
177        stfd	f1,_COMM_PAGE_TIMEBASE(r10)	// both
        // Common exit.  Also reached with the timebase left at zero when the
        // caller passed a zero timebase, and directly from entry when no commpage
        // has been allocated yet.
1783:											// here once all fields updated
179        lfd		f1,rzSaveF1(r1)				// restore last FPR
180        mtmsr	r11							// turn FP back off
181        isync
182        blr
183
184
185/*	***************************************
186 *	* C O M M P A G E _ T I M E _ D C B A *
187 *	***************************************
188 *
189 *	Not all processors that support the DCBA opcode actually benefit from it.
190 *	Some store-gather and read-cancel well enough that there is no need to use
191 *	DCBA to avoid fetching cache lines that will be completely overwritten, while
192 *	others have this feature disabled (to work around errata etc), and so benefit
193 *	from DCBA.  Since it is hard to tell the one group from the other, we just
194 *	time loops with and without DCBA, and pick the faster.  Thus we avoid
195 *	delicate dependence on processor and/or platform revisions.
196 *
197 *	We return either kDcbaRecommended or zero.
198 *
199 *		int commpage_time_dcba( void );
200 */
201
// commpage_time_dcba: time the store loop in LTest with and without DCBA and
// report which is faster.
//
//	In:		nothing
//	Out:	r3 = kDcbaRecommended if the DCBA loop won (after the bias), else 0
//	Uses:	r0, r3-r12, CTR, cr0, CR bit kDCBA; a temporary kSFSize-byte frame.
//
// Register contract with LTest: r11 (buffer base), r10 (biased first result)
// and r12 (return address) must survive the calls; LTest does not touch them.
202LEXT(commpage_time_dcba)
203        mflr	r12					// get return
204        stw		r12,8(r1)			// save
205        stwu	r1,-kSFSize(r1)		// carve our temp buffer from the stack
        // Skip 16 bytes at the bottom of the new frame, then round up to the next
        // 128-byte boundary (add 127, clear the low 7 bits).
206        addi	r11,r1,127+16		// get base address...
207        rlwinm	r11,r11,0,0,24		// ...of our buffer, 128-byte aligned
208        crset	kDCBA				// first, use DCBA
209        bl		LTest				// time it with DCBA
        // r10 = t_dcba + t_dcba/8, i.e. the DCBA time inflated by 12.5%.
210        srwi	r0,r3,3				// bias 12 pct in favor of not using DCBA...
211        add		r10,r3,r0			// ...because DCBA is always slower with warm cache
212        crclr	kDCBA
213        bl		LTest				// time without DCBA
        // The unsigned compare result in cr0 is consumed by bltlr below; none of
        // the intervening instructions (mtlr, lwz, li) alter the CR.
214        cmplw	r10,r3				// which is better?
        // LR is restored from r12 rather than from the copy saved at 8(r1).
215        mtlr	r12					// restore return
        // 0(r1) holds the caller's r1, written there by the stwu above.
216        lwz		r1,0(r1)			// pop off our stack frame
217        li		r3,kDcbaRecommended		// assume using DCBA is faster
        // Return now if the biased DCBA time is strictly less than the no-DCBA time.
218        bltlr
219        li		r3,0			// no DCBA is faster
220        blr
221
222
223// Subroutine to time a loop with or without DCBA.
224//		kDCBA = set if we should use DCBA
225//		r11 = base of buffer to use for test (kBufSiz bytes)
226//
227//		We return TBR ticks in r3.
228//		We use r0,r3-r9, plus CTR and cr0.
229
// LTest: return in r3 the smallest elapsed time (low-word timebase ticks) over
// kLoopCnt passes of storing kBufSiz bytes to the buffer at r11, one 32-byte
// block at a time.  Each pass first flushes the buffer from the cache.  A pass
// during which the upper timebase word changes is discarded and repeated
// without being counted.
230LTest:
231        li		r4,kLoopCnt			// number of times to loop
        // -1 is the largest unsigned value, so the first valid pass always wins
        // the unsigned compare at "faster than current best?".
232        li		r3,-1				// initialize fastest time
2331:
234        mr		r6,r11				// initialize buffer ptr
235        li		r0,kBufSiz/32		// r0 <- cache blocks to test
236        mtctr	r0
        // Untimed phase: flush every 32-byte block of the buffer.
2372:
238        dcbf	0,r6				// first, force the blocks out of the cache
239        addi	r6,r6,32
240        bdnz	2b
241        sync						// make sure all the flushes take
242        mr		r6,r11				// re-initialize buffer ptr
243        mtctr	r0					// reset cache-block count
        // Timed phase starts here.
244        mftbu	r7					// remember upper half so we can check for carry
245        mftb	r8					// start the timer
2463:									// loop over cache blocks
247        bf		kDCBA,4f			// should we DCBA?
248        dcba	0,r6
2494:
        // Eight word stores cover the whole 32-byte block.  The value stored
        // (r0, still the block count) is irrelevant; only the stores matter.
250        stw		r0,0(r6)			// store the entire cache block
251        stw		r0,4(r6)
252        stw		r0,8(r6)
253        stw		r0,12(r6)
254        stw		r0,16(r6)
255        stw		r0,20(r6)
256        stw		r0,24(r6)
257        stw		r0,28(r6)
258        addi	r6,r6,32
259        bdnz	3b
        // Timed phase ends.  r0 is reused for the upper timebase word; it is
        // reloaded with the block count at the top of the next pass.
260        mftb	r9
261        mftbu	r0
262        cmpw	r0,r7				// did timebase carry?
        // On a carry, branch back without decrementing r4 or updating r3.
263        bne		1b					// yes, retest rather than fuss
264        sub		r9,r9,r8			// r9 <- time for this loop
265        cmplw	r9,r3				// faster than current best?
266        bge		5f					// no
267        mr		r3,r9				// remember fastest time through loop
2685:
269        subi	r4,r4,1				// decrement outer loop count
270        cmpwi	r4,0				// more to go?
271        bne		1b					// loop if so
272        blr							// return fastest time in r3
273