/*-
 * Copyright (c) 2014, 2015 Antti Kantee.  All Rights Reserved.
 * Copyright (c) 2015 Martin Lucina.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <hw/kernel.h>
#include <hw/clock_subr.h>

#include <arch/x86/hypervisor.h>
#include <arch/x86/var.h>
#include <arch/x86/tsc.h>

#include <bmk-core/core.h>
#include <bmk-core/platform.h>
#include <bmk-core/printf.h>

#define NSEC_PER_SEC	1000000000ULL
/*
 * Minimum delta to sleep using PIT. Programming seems to have an overhead of
 * 3-4us, but play it safe here.
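 * With the PC-standard TIMER_HZ of 1193182, 16 PIT ticks is ~13.4us.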
 */
#define PIT_MIN_DELTA	16

/* clock isr trampoline (in locore.S) */
void cpu_isr_clock(void);

/*
 * Multiplier for converting nsecs to PIT ticks. (1.32) fixed point.
 *
 * Calculated as:
 *
 *     f = NSEC_PER_SEC / TIMER_HZ   (0.31) fixed point.
 *     pit_mult = 1 / f              (1.32) fixed point.
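 *
 * e.g. for the PC-standard TIMER_HZ of 1193182, pit_mult is ~5.12e6,
 * i.e. ~0.00119 PIT ticks per nanosecond (one tick every ~838ns).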
 */
static const uint32_t pit_mult
    = (1ULL << 63) / ((NSEC_PER_SEC << 31) / TIMER_HZ);

/* RTC wall time offset at monotonic time base. */
static bmk_time_t rtc_epochoffset;

/* True if using pvclock for timekeeping, false if using TSC-based clock. */
static int have_pvclock;

/*
 * TSC clock specific.
 */

/* Base time values at the last call to tscclock_monotonic(). */
static bmk_time_t time_base;
static uint64_t tsc_base;

/* Multiplier for converting TSC ticks to nsecs. (0.32) fixed point. */
static uint32_t tsc_mult;

/*
 * pvclock specific.
 */

/* Xen/KVM per-vcpu time ABI. */
struct pvclock_vcpu_time_info {
	uint32_t version;
	uint32_t pad0;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
	uint8_t flags;
	uint8_t pad[2];
} __attribute__((__packed__));

/* Xen/KVM wall clock ABI. */
struct pvclock_wall_clock {
	uint32_t version;
	uint32_t sec;
	uint32_t nsec;
} __attribute__((__packed__));

/*
 * pvclock structures shared with hypervisor.
 * TODO: These should be pointers (for Xen HVM support), but we can't use
 * bmk_pgalloc() here.
 */
static volatile struct pvclock_vcpu_time_info pvclock_ti;
static volatile struct pvclock_wall_clock pvclock_wc;

/*
 * Calculate prod = (a * b) where a is (64.0) fixed point and b is (0.32) fixed
 * point.  The intermediate product is (64.32) fixed point, discarding the
 * fractional bits leaves us with a (64.0) fixed point result.
 *
 * XXX Document what range of (a, b) is safe from overflow in this calculation.
 */
static inline uint64_t
mul64_32(uint64_t a, uint32_t b)
{
	uint64_t prod;
#if defined(__x86_64__)
	/* For x86_64 the computation can be done using 64-bit multiply and
	 * shift. */
	__asm__ (
		"mul %%rdx ; "
		"shrd $32, %%rdx, %%rax"
		: "=a" (prod)
		: "0" (a), "d" ((uint64_t)b)
	);
#elif defined(__i386__)
	/* For i386 we compute the partial products and add them up, discarding
	 * the lower 32 bits of the product in the process. */
	uint32_t h = (uint32_t)(a >> 32);
	uint32_t l = (uint32_t)a;
	uint32_t t1, t2;
	__asm__ (
		"mul  %5       ; "  /* %edx:%eax = (l * b)                    */
		"mov  %4,%%eax ; "  /* %eax = h                               */
		"mov  %%edx,%4 ; "  /* t1 = ((l * b) >> 32)                   */
		"mul  %5       ; "  /* %edx:%eax = (h * b)                    */
		"xor  %5,%5    ; "  /* t2 = 0                                 */
		"add  %4,%%eax ; "  /* %eax = (h * b) + t1 (LSW)              */
		"adc  %5,%%edx ; "  /* %edx = (h * b) + t1 (MSW)              */
		: "=A" (prod), "=r" (t1), "=r" (t2)
		: "a" (l), "1" (h), "2" (b)
	);
#else
#error mul64_32 not supported for target architecture
#endif

	return prod;
}
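
/*
 * Note: a portable C equivalent, where the compiler provides a 128-bit
 * integer type, would be:
 *
 *	return (uint64_t)(((unsigned __int128)a * b) >> 32);
 *
 * e.g. mul64_32(6, 0x80000000) == 3, since b then represents 0.5 in
 * (0.32) fixed point.
 */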

/*
 * Read the current i8254 channel 0 tick count.
 */
static unsigned int
i8254_gettick(void)
{
	uint16_t rdval;

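	/*
	 * Latch the channel 0 count so that the two 8-bit reads below
	 * return a consistent snapshot of the 16-bit counter.
	 */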
	outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);
	rdval = inb(TIMER_CNTR);
	rdval |= (inb(TIMER_CNTR) << 8);
	return rdval;
}

/*
 * Delay for approximately n microseconds using the i8254 channel 0 counter.
 * The timer must already be programmed as a rate generator (mode 2) with a
 * reload value of TIMER_HZ / 100, as done in tscclock_init().
 */
static void
i8254_delay(unsigned int n)
{
	unsigned int cur_tick, initial_tick;
	int remaining;
	const unsigned long timer_rval = TIMER_HZ / 100;

	initial_tick = i8254_gettick();

	remaining = (unsigned long long) n * TIMER_HZ / 1000000;

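	/*
	 * Channel 0 counts down from timer_rval and reloads; if the current
	 * reading is above the previous one the counter has wrapped, and
	 * timer_rval - (cur_tick - initial_tick) ticks have elapsed.
	 */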
	while (remaining > 1) {
		cur_tick = i8254_gettick();
		if (cur_tick > initial_tick)
			remaining -= timer_rval - (cur_tick - initial_tick);
		else
			remaining -= initial_tick - cur_tick;
		initial_tick = cur_tick;
	}
}

/*
 * Read an RTC register. Due to PC platform braindead-ness also disables NMI.
 */
static inline uint8_t
rtc_read(uint8_t reg)
{

	outb(RTC_COMMAND, reg | RTC_NMI_DISABLE);
	return inb(RTC_DATA);
}

/*
 * Return current RTC time. Note that due to waiting for the update cycle to
 * complete, this call may take some time.
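 * Assumes the RTC is configured in the PC-standard BCD, 24-hour mode, and
 * interprets the two-digit year as falling within the 2000s.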
 */
static bmk_time_t
rtc_gettimeofday(void)
{
	struct bmk_clock_ymdhms dt;

	splhigh();

	/*
	 * If RTC_UIP is down, we have at least 244us to obtain a
	 * consistent reading before an update can occur.
	 */
	while (rtc_read(RTC_STATUS_A) & RTC_UIP)
		continue;

	dt.dt_sec = bcdtobin(rtc_read(RTC_SEC));
	dt.dt_min = bcdtobin(rtc_read(RTC_MIN));
	dt.dt_hour = bcdtobin(rtc_read(RTC_HOUR));
	dt.dt_day = bcdtobin(rtc_read(RTC_DAY));
	dt.dt_mon = bcdtobin(rtc_read(RTC_MONTH));
	dt.dt_year = bcdtobin(rtc_read(RTC_YEAR)) + 2000;

	spl0();

	return clock_ymdhms_to_secs(&dt) * NSEC_PER_SEC;
}

/*
 * Return monotonic time using TSC clock.
 */
static bmk_time_t
tscclock_monotonic(void)
{
	uint64_t tsc_now, tsc_delta;

	/*
	 * Update time_base (monotonic time) and tsc_base (TSC time).
	 */
	tsc_now = rdtsc();
	tsc_delta = tsc_now - tsc_base;
	time_base += mul64_32(tsc_delta, tsc_mult);
	tsc_base = tsc_now;

	return time_base;
}

/*
 * Calibrate TSC and initialise TSC clock.
 */
static int
tscclock_init(void)
{
	uint64_t tsc_freq;

	/* Initialise i8254 timer channel 0 to mode 2 at 100 Hz */
	outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT);
	outb(TIMER_CNTR, (TIMER_HZ / 100) & 0xff);
	outb(TIMER_CNTR, (TIMER_HZ / 100) >> 8);

	/*
	 * Read RTC time to use as epoch offset. This must be done just
	 * before tsc_base is initialised in order to get a correct
	 * offset.
	 */
	rtc_epochoffset = rtc_gettimeofday();

	/*
	 * Calculate TSC frequency by calibrating against a 0.1s delay
	 * using the i8254 timer.
	 */
	spl0();
	tsc_base = rdtsc();
	i8254_delay(100000);
	tsc_freq = (rdtsc() - tsc_base) * 10;
	splhigh();
	bmk_printf("x86_initclocks(): TSC frequency estimate is %llu Hz\n",
		(unsigned long long)tsc_freq);

	/*
	 * Calculate TSC scaling multiplier.
	 *
	 * (0.32) tsc_mult = NSEC_PER_SEC (32.32) / tsc_freq (32.0)
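	 *
	 * e.g. assuming a 2GHz TSC: tsc_mult = (10^9 << 32) / (2 * 10^9)
	 * = 2^31, i.e. 0.5ns per TSC tick in (0.32) fixed point.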
	 */
	tsc_mult = (NSEC_PER_SEC << 32) / tsc_freq;

	/*
	 * Monotonic time begins at tsc_base (first read of TSC before
	 * calibration).
	 */
	time_base = mul64_32(tsc_base, tsc_mult);

	return 0;
}

/*
 * Return monotonic time using PV clock.
 */
static bmk_time_t
pvclock_monotonic(void)
{
	uint32_t version;
	uint64_t delta, time_now;

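	/*
	 * version acts as a seqlock: the hypervisor makes it odd while
	 * updating the structure and even again once the update is done,
	 * so retry until the same even version is read before and after
	 * reading the time fields.
	 */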
	do {
		version = pvclock_ti.version;
		__asm__ ("mfence" ::: "memory");
		delta = rdtsc() - pvclock_ti.tsc_timestamp;
		if (pvclock_ti.tsc_shift < 0)
			delta >>= -pvclock_ti.tsc_shift;
		else
			delta <<= pvclock_ti.tsc_shift;
		time_now = mul64_32(delta, pvclock_ti.tsc_to_system_mul) +
			pvclock_ti.system_time;
		__asm__ ("mfence" ::: "memory");
	} while ((pvclock_ti.version & 1) || (pvclock_ti.version != version));

	return (bmk_time_t)time_now;
}

/*
 * Read wall time offset since system boot using PV clock.
 */
static bmk_time_t
pvclock_read_wall_clock(void)
{
	uint32_t version;
	bmk_time_t wc_boot;

	do {
		version = pvclock_wc.version;
		__asm__ ("mfence" ::: "memory");
		wc_boot = pvclock_wc.sec * NSEC_PER_SEC;
		wc_boot += pvclock_wc.nsec;
		__asm__ ("mfence" ::: "memory");
	} while ((pvclock_wc.version & 1) || (pvclock_wc.version != version));

	return wc_boot;
}

/*
 * Initialise PV clock. Returns zero if successful (PV clock is available).
 *
 * Source: Linux kernel, Documentation/virtual/kvm/{msr,cpuid}.txt
 */
static int
pvclock_init(void)
{
	uint32_t eax, ebx, ecx, edx;
	uint32_t msr_kvm_system_time, msr_kvm_wall_clock;

	if (hypervisor_detect() != HYPERVISOR_KVM)
		return 1;
	/*
	 * Prefer new-style MSRs, and bail entirely if neither is indicated as
	 * available by CPUID.
	 */
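	/*
	 * CPUID.0x40000001:EAX bit 3 is KVM_FEATURE_CLOCKSOURCE2, i.e.
	 * MSR_KVM_SYSTEM_TIME_NEW (0x4b564d01) / MSR_KVM_WALL_CLOCK_NEW
	 * (0x4b564d00); bit 0 is the legacy KVM_FEATURE_CLOCKSOURCE with
	 * MSR_KVM_SYSTEM_TIME (0x12) / MSR_KVM_WALL_CLOCK (0x11).
	 */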
	x86_cpuid(0x40000001, &eax, &ebx, &ecx, &edx);
	if (eax & (1 << 3)) {
		msr_kvm_system_time = 0x4b564d01;
		msr_kvm_wall_clock = 0x4b564d00;
	} else if (eax & (1 << 0)) {
		msr_kvm_system_time = 0x12;
		msr_kvm_wall_clock = 0x11;
	} else
		return 1;

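	/*
	 * Enable the per-vcpu time structure: write its address (with an
	 * identity-mapped kernel this is also the guest-physical address)
	 * to the system time MSR, with the enable bit (bit 0) set.
	 */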
	__asm__ __volatile__("wrmsr" ::
		"c" (msr_kvm_system_time),
		"a" ((uint32_t)((uintptr_t)&pvclock_ti | 0x1)),
#if defined(__x86_64__)
		"d" ((uint32_t)((uintptr_t)&pvclock_ti >> 32))
#else
		"d" (0)
#endif
	);
	__asm__ __volatile__("wrmsr" ::
		"c" (msr_kvm_wall_clock),
		"a" ((uint32_t)((uintptr_t)&pvclock_wc)),
#if defined(__x86_64__)
		"d" ((uint32_t)((uintptr_t)&pvclock_wc >> 32))
#else
		"d" (0)
#endif
	);
	/* Initialise epoch offset using wall clock time */
	rtc_epochoffset = pvclock_read_wall_clock();

	return 0;
}

void
x86_initclocks(void)
{
	uint32_t eax, ebx, ecx, edx;
	uint32_t have_tsc = 0, invariant_tsc = 0;

	/* Verify that TSC is supported. */
	x86_cpuid(0x0, &eax, &ebx, &ecx, &edx);
	if (eax >= 0x1) {
		x86_cpuid(0x1, &eax, &ebx, &ecx, &edx);
		have_tsc = edx & (1 << 4);
	}
	if (!have_tsc)
		bmk_platform_halt("Processor does not support RDTSC");
	/* And that it is invariant. TODO: Potentially halt here if not? */
	x86_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
	if (eax >= 0x80000007) {
		x86_cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
		invariant_tsc = edx & (1 << 8);
	}
	if (!invariant_tsc)
		bmk_printf("WARNING: Processor claims not to support "
		    "invariant TSC.\n");

	/*
	 * Use PV clock if available, otherwise use TSC for timekeeping.
	 */
	if (pvclock_init() == 0)
		have_pvclock = 1;
	else
		tscclock_init();
	bmk_printf("x86_initclocks(): Using %s for timekeeping\n",
		have_pvclock ? "PV clock" : "TSC");

	/*
	 * Initialise i8254 timer channel 0 to mode 4 (one shot).
	 */
	outb(TIMER_MODE, TIMER_SEL0 | TIMER_ONESHOT | TIMER_16BIT);

	/*
	 * Map i8254 interrupt vector and enable it in the PIC.
	 */
	x86_fillgate(32, cpu_isr_clock, 0);
	pic1mask &= ~(1<<0);
	outb(PIC1_DATA, pic1mask);
}

/*
 * Return monotonic time since system boot in nanoseconds.
 */
bmk_time_t
bmk_platform_cpu_clock_monotonic(void)
{
	if (have_pvclock)
		return pvclock_monotonic();
	else
		return tscclock_monotonic();
}

/*
 * Return epoch offset (wall time offset to monotonic clock start).
 */
bmk_time_t
bmk_platform_cpu_clock_epochoffset(void)
{

	return rtc_epochoffset;
}

/*
 * Block the CPU until monotonic time is *at least* the specified time.
 * Returns early if any interrupts are serviced, or if the requested delay is
 * too short.
 */

/* TSC cycles accumulated while blocked; global so it can be inspected. */
uint64_t ccount;

void
bmk_platform_cpu_block(bmk_time_t until)
{
	bmk_time_t now, delta_ns;
	uint64_t delta_ticks;
	uint64_t prev, ts;
	unsigned int ticks;
	int s;

	bmk_assert(spldepth > 0);

	/*
	 * Return if called too late.  Doing so ensures that the time
	 * delta is positive.
	 */
	now = bmk_platform_cpu_clock_monotonic();
	if (until <= now)
		return;

	/*
	 * Compute delta in PIT ticks. Return if it is less than the minimum
	 * safe number of ticks.  Essentially this will cause us to spin until
	 * the timeout.
	 */
	delta_ns = until - now;
	delta_ticks = mul64_32(delta_ns, pit_mult);
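	/*
	 * e.g. with the PC-standard TIMER_HZ, a 1ms delta_ns yields
	 * delta_ticks of ~1193.
	 */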
	if (delta_ticks < PIT_MIN_DELTA) {
		/*
		 * Since we are "spinning", quickly enable interrupts in
		 * the hopes that we might get new work and can do something
		 * other than spin.
		 */
		__asm__ __volatile__(
			"sti;\n"
			"nop;\n"	/* ints are enabled 1 instr after sti */
			"cli;\n");
		return;
	}

	/*
	 * Program the timer to interrupt the CPU after the delay has expired.
	 * Maximum timer delay is 65535 ticks.
	 */
	if (delta_ticks > 65535)
		ticks = 65535;
	else
		ticks = delta_ticks;

	/*
	 * Note that according to the Intel 82C54 datasheet (p. 12), the
	 * interrupt is actually delivered in N + 1 ticks.
	 */
	outb(TIMER_CNTR, (ticks - 1) & 0xff);
	outb(TIMER_CNTR, (ticks - 1) >> 8);

	/*
	 * Wait for any interrupt. If we got an interrupt then
	 * just return into the scheduler which will check if there is
	 * work to do and send us back here if not.
	 *
	 * TODO: It would be more efficient for longer sleeps to be
	 * able to distinguish if the interrupt was the PIT interrupt
	 * and no other, but this will do for now.
	 */
	s = spldepth;
	spldepth = 0;

	/*
	 * Poll the TSC with interrupts enabled.  Consecutive reads are
	 * normally only a few cycles apart; a gap of 150 or more cycles
	 * means an interrupt handler ran, in which case we return to the
	 * scheduler.  Short gaps are accounted to ccount as idle cycles.
	 */
	__asm__ __volatile__("sti" ::: "memory");
	prev = rdtsc_pure();
	for (;;) {
		ts = rdtsc_pure();
		if (ts - prev >= 150)
			break;
		/* Keep the 64-bit update atomic w.r.t. interrupt handlers. */
		__asm__ __volatile__("cli" ::: "memory");
		ccount += ts - prev;
		__asm__ __volatile__("sti" ::: "memory");
		prev = ts;
	}

	/* Interrupts must be disabled again when we return to the caller. */
	__asm__ __volatile__("cli" ::: "memory");
	spldepth = s;
}