1/*- 2 * Copyright (c) 2014, 2015 Antti Kantee. All Rights Reserved. 3 * Copyright (c) 2015 Martin Lucina. All Rights Reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 15 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <hw/kernel.h> 28#include <hw/clock_subr.h> 29 30#include <arch/x86/hypervisor.h> 31#include <arch/x86/var.h> 32#include <arch/x86/tsc.h> 33 34#include <bmk-core/core.h> 35#include <bmk-core/platform.h> 36#include <bmk-core/printf.h> 37 38#define NSEC_PER_SEC 1000000000ULL 39/* 40 * Minimum delta to sleep using PIT. Programming seems to have an overhead of 41 * 3-4us, but play it safe here. 
42 */ 43#define PIT_MIN_DELTA 16 44 45/* clock isr trampoline (in locore.S) */ 46void cpu_isr_clock(void); 47 48/* 49 * Multiplier for converting nsecs to PIT ticks. (1.32) fixed point. 50 * 51 * Calculated as: 52 * 53 * f = NSEC_PER_SEC / TIMER_HZ (0.31) fixed point. 54 * pit_mult = 1 / f (1.32) fixed point. 55 */ 56static const uint32_t pit_mult 57 = (1ULL << 63) / ((NSEC_PER_SEC << 31) / TIMER_HZ); 58 59/* RTC wall time offset at monotonic time base. */ 60static bmk_time_t rtc_epochoffset; 61 62/* True if using pvclock for timekeeping, false if using TSC-based clock. */ 63static int have_pvclock; 64 65/* 66 * TSC clock specific. 67 */ 68 69/* Base time values at the last call to tscclock_monotonic(). */ 70static bmk_time_t time_base; 71static uint64_t tsc_base; 72 73/* Multiplier for converting TSC ticks to nsecs. (0.32) fixed point. */ 74static uint32_t tsc_mult; 75 76/* 77 * pvclock specific. 78 */ 79 80/* Xen/KVM per-vcpu time ABI. */ 81struct pvclock_vcpu_time_info { 82 uint32_t version; 83 uint32_t pad0; 84 uint64_t tsc_timestamp; 85 uint64_t system_time; 86 uint32_t tsc_to_system_mul; 87 int8_t tsc_shift; 88 uint8_t flags; 89 uint8_t pad[2]; 90} __attribute__((__packed__)); 91 92/* Xen/KVM wall clock ABI. */ 93struct pvclock_wall_clock { 94 uint32_t version; 95 uint32_t sec; 96 uint32_t nsec; 97} __attribute__((__packed__)); 98 99/* 100 * pvclock structures shared with hypervisor. 101 * TODO: These should be pointers (for Xen HVM support), but we can't use 102 * bmk_pgalloc() here. 103 */ 104volatile static struct pvclock_vcpu_time_info pvclock_ti; 105volatile static struct pvclock_wall_clock pvclock_wc; 106 107/* 108 * Calculate prod = (a * b) where a is (64.0) fixed point and b is (0.32) fixed 109 * point. The intermediate product is (64.32) fixed point, discarding the 110 * fractional bits leaves us with a (64.0) fixed point result. 111 * 112 * XXX Document what range of (a, b) is safe from overflow in this calculation. 
113 */ 114static inline uint64_t 115mul64_32(uint64_t a, uint32_t b) 116{ 117 uint64_t prod; 118#if defined(__x86_64__) 119 /* For x86_64 the computation can be done using 64-bit multiply and 120 * shift. */ 121 __asm__ ( 122 "mul %%rdx ; " 123 "shrd $32, %%rdx, %%rax" 124 : "=a" (prod) 125 : "0" (a), "d" ((uint64_t)b) 126 ); 127#elif defined(__i386__) 128 /* For i386 we compute the partial products and add them up, discarding 129 * the lower 32 bits of the product in the process. */ 130 uint32_t h = (uint32_t)(a >> 32); 131 uint32_t l = (uint32_t)a; 132 uint32_t t1, t2; 133 __asm__ ( 134 "mul %5 ; " /* %edx:%eax = (l * b) */ 135 "mov %4,%%eax ; " /* %eax = h */ 136 "mov %%edx,%4 ; " /* t1 = ((l * b) >> 32) */ 137 "mul %5 ; " /* %edx:%eax = (h * b) */ 138 "xor %5,%5 ; " /* t2 = 0 */ 139 "add %4,%%eax ; " /* %eax = (h * b) + t1 (LSW) */ 140 "adc %5,%%edx ; " /* %edx = (h * b) + t1 (MSW) */ 141 : "=A" (prod), "=r" (t1), "=r" (t2) 142 : "a" (l), "1" (h), "2" (b) 143 ); 144#else 145#error mul64_32 not supported for target architecture 146#endif 147 148 return prod; 149} 150 151/* 152 * Read the current i8254 channel 0 tick count. 153 */ 154static unsigned int 155i8254_gettick(void) 156{ 157 uint16_t rdval; 158 159 outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); 160 rdval = inb(TIMER_CNTR); 161 rdval |= (inb(TIMER_CNTR) << 8); 162 return rdval; 163} 164 165/* 166 * Delay for approximately n microseconds using the i8254 channel 0 counter. 167 * Timer must be programmed appropriately before calling this function. 
 */
static void
i8254_delay(unsigned int n)
{
	unsigned int cur_tick, initial_tick;
	int remaining;
	/* Same value that tscclock_init() loads into channel 0. */
	const unsigned long timer_rval = TIMER_HZ / 100;

	initial_tick = i8254_gettick();

	/* Convert the requested delay from microseconds to timer ticks. */
	remaining = (unsigned long long) n * TIMER_HZ / 1000000;

	while (remaining > 1) {
		cur_tick = i8254_gettick();
		/*
		 * A reading larger than the previous one is treated as the
		 * counter having reloaded from timer_rval since the last
		 * sample; otherwise the elapsed ticks are the plain
		 * difference.
		 */
		if (cur_tick > initial_tick)
			remaining -= timer_rval - (cur_tick - initial_tick);
		else
			remaining -= initial_tick - cur_tick;
		initial_tick = cur_tick;
	}
}

/*
 * Read a RTC register. Due to PC platform braindead-ness also disables NMI.
 *
 * The register index is written to the command port with RTC_NMI_DISABLE
 * OR-ed in, then the value is read back from the data port.
 */
static inline uint8_t
rtc_read(uint8_t reg)
{

	outb(RTC_COMMAND, reg | RTC_NMI_DISABLE);
	return inb(RTC_DATA);
}

/*
 * Return current RTC time. Note that due to waiting for the update cycle to
 * complete, this call may take some time.
 *
 * The result is clock_ymdhms_to_secs() of the RTC date scaled by
 * NSEC_PER_SEC.  The register reads are bracketed by splhigh()/spl0().
 */
static bmk_time_t
rtc_gettimeofday(void)
{
	struct bmk_clock_ymdhms dt;

	splhigh();

	/*
	 * If RTC_UIP is down, we have at least 244us to obtain a
	 * consistent reading before an update can occur.
	 */
	while (rtc_read(RTC_STATUS_A) & RTC_UIP)
		continue;

	/* All fields are converted from BCD. */
	dt.dt_sec = bcdtobin(rtc_read(RTC_SEC));
	dt.dt_min = bcdtobin(rtc_read(RTC_MIN));
	dt.dt_hour = bcdtobin(rtc_read(RTC_HOUR));
	dt.dt_day = bcdtobin(rtc_read(RTC_DAY));
	dt.dt_mon = bcdtobin(rtc_read(RTC_MONTH));
	/* The year register is taken as an offset from the year 2000. */
	dt.dt_year = bcdtobin(rtc_read(RTC_YEAR)) + 2000;

	spl0();

	return clock_ymdhms_to_secs(&dt) * NSEC_PER_SEC;
}

/*
 * Return monotonic time using TSC clock.
 *
 * Each call advances time_base by the TSC ticks elapsed since the previous
 * call, scaled by tsc_mult, and records the new TSC reading in tsc_base.
 */
static bmk_time_t
tscclock_monotonic(void)
{
	uint64_t tsc_now, tsc_delta;

	/*
	 * Update time_base (monotonic time) and tsc_base (TSC time).
	 */
	tsc_now = rdtsc();
	tsc_delta = tsc_now - tsc_base;
	time_base += mul64_32(tsc_delta, tsc_mult);
	tsc_base = tsc_now;

	return time_base;
}

/*
 * Calibrate TSC and initialise TSC clock.
 *
 * Sets rtc_epochoffset, tsc_base, tsc_mult and time_base.  Always returns 0.
 */
static int
tscclock_init(void)
{
	uint64_t tsc_freq;

	/* Initialise i8254 timer channel 0 to mode 2 at 100 Hz */
	outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT);
	outb(TIMER_CNTR, (TIMER_HZ / 100) & 0xff);
	outb(TIMER_CNTR, (TIMER_HZ / 100) >> 8);

	/*
	 * Read RTC time to use as epoch offset. This must be done just
	 * before tsc_base is initialised in order to get a correct
	 * offset.
	 */
	rtc_epochoffset = rtc_gettimeofday();

	/*
	 * Calculate TSC frequency by calibrating against an 0.1s delay
	 * using the i8254 timer.
	 */
	spl0();
	tsc_base = rdtsc();
	i8254_delay(100000);
	/* Ticks counted over 0.1s, scaled up to ticks per second. */
	tsc_freq = (rdtsc() - tsc_base) * 10;
	splhigh();
	bmk_printf("x86_initclocks(): TSC frequency estimate is %llu Hz\n",
		(unsigned long long)tsc_freq);

	/*
	 * Calculate TSC scaling multiplier.
	 *
	 * (0.32) tsc_mult = NSEC_PER_SEC (32.32) / tsc_freq (32.0)
	 *
	 * NOTE(review): the quotient is stored in a 32-bit variable, so it
	 * only fits when tsc_freq exceeds NSEC_PER_SEC (i.e. TSC > 1 GHz),
	 * and tsc_freq == 0 would divide by zero -- confirm whether slower
	 * TSCs need to be supported.
	 */
	tsc_mult = (NSEC_PER_SEC << 32) / tsc_freq;

	/*
	 * Monotonic time begins at tsc_base (first read of TSC before
	 * calibration).
	 */
	time_base = mul64_32(tsc_base, tsc_mult);

	return 0;
}

/*
 * Return monotonic time using PV clock.
300 */ 301static bmk_time_t 302pvclock_monotonic(void) 303{ 304 uint32_t version; 305 uint64_t delta, time_now; 306 307 do { 308 version = pvclock_ti.version; 309 __asm__ ("mfence" ::: "memory"); 310 delta = rdtsc() - pvclock_ti.tsc_timestamp; 311 if (pvclock_ti.tsc_shift < 0) 312 delta >>= -pvclock_ti.tsc_shift; 313 else 314 delta <<= pvclock_ti.tsc_shift; 315 time_now = mul64_32(delta, pvclock_ti.tsc_to_system_mul) + 316 pvclock_ti.system_time; 317 __asm__ ("mfence" ::: "memory"); 318 } while ((pvclock_ti.version & 1) || (pvclock_ti.version != version)); 319 320 return (bmk_time_t)time_now; 321} 322 323/* 324 * Read wall time offset since system boot using PV clock. 325 */ 326static bmk_time_t 327pvclock_read_wall_clock(void) 328{ 329 uint32_t version; 330 bmk_time_t wc_boot; 331 332 do { 333 version = pvclock_wc.version; 334 __asm__ ("mfence" ::: "memory"); 335 wc_boot = pvclock_wc.sec * NSEC_PER_SEC; 336 wc_boot += pvclock_wc.nsec; 337 __asm__ ("mfence" ::: "memory"); 338 } while ((pvclock_wc.version & 1) || (pvclock_wc.version != version)); 339 340 return wc_boot; 341} 342 343/* 344 * Initialise PV clock. Returns zero if successful (PV clock is available). 345 * 346 * Source: Linux kernel, Documentation/virtual/kvm/{msr,cpuid}.txt 347 */ 348static int 349pvclock_init(void) 350{ 351 uint32_t eax, ebx, ecx, edx; 352 uint32_t msr_kvm_system_time, msr_kvm_wall_clock; 353 354 if (hypervisor_detect() != HYPERVISOR_KVM) 355 return 1; 356 /* 357 * Prefer new-style MSRs, and bail entirely if neither is indicated as 358 * available by CPUID. 
359 */ 360 x86_cpuid(0x40000001, &eax, &ebx, &ecx, &edx); 361 if (eax & (1 << 3)) { 362 msr_kvm_system_time = 0x4b564d01; 363 msr_kvm_wall_clock = 0x4b564d00; 364 } 365 else if (eax & (1 << 0)) { 366 msr_kvm_system_time = 0x12; 367 msr_kvm_wall_clock = 0x11; 368 } 369 else 370 return 1; 371 372 __asm__ __volatile("wrmsr" :: 373 "c" (msr_kvm_system_time), 374 "a" ((uint32_t)((uintptr_t)&pvclock_ti | 0x1)), 375#if defined(__x86_64__) 376 "d" ((uint32_t)((uintptr_t)&pvclock_ti >> 32)) 377#else 378 "d" (0) 379#endif 380 ); 381 __asm__ __volatile("wrmsr" :: 382 "c" (msr_kvm_wall_clock), 383 "a" ((uint32_t)((uintptr_t)&pvclock_wc)), 384#if defined(__x86_64__) 385 "d" ((uint32_t)((uintptr_t)&pvclock_wc >> 32)) 386#else 387 "d" (0) 388#endif 389 ); 390 /* Initialise epoch offset using wall clock time */ 391 rtc_epochoffset = pvclock_read_wall_clock(); 392 393 return 0; 394} 395 396void 397x86_initclocks(void) 398{ 399 uint32_t eax, ebx, ecx, edx; 400 uint32_t have_tsc = 0, invariant_tsc = 0; 401 402 /* Verify that TSC is supported. */ 403 x86_cpuid(0x0, &eax, &ebx, &ecx, &edx); 404 if (eax >= 0x1) { 405 x86_cpuid(0x1, &eax, &ebx, &ecx, &edx); 406 have_tsc = edx & (1 << 4); 407 } 408 if (!have_tsc) 409 bmk_platform_halt("Processor does not support RDTSC"); 410 /* And that it is invariant. TODO: Potentially halt here if not? */ 411 x86_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); 412 if (eax >= 0x80000007) { 413 x86_cpuid(0x80000007, &eax, &ebx, &ecx, &edx); 414 invariant_tsc = edx & (1 << 8); 415 } 416 if (!invariant_tsc) 417 bmk_printf("WARNING: Processor claims to not support " 418 "invariant TSC.\n"); 419 420 /* 421 * Use PV clock if available, otherwise use TSC for timekeeping. 422 */ 423 if (pvclock_init() == 0) 424 have_pvclock = 1; 425 else 426 tscclock_init(); 427 bmk_printf("x86_initclocks(): Using %s for timekeeping\n", 428 have_pvclock ? "PV clock" : "TSC"); 429 430 /* 431 * Initialise i8254 timer channel 0 to mode 4 (one shot). 
432 */ 433 outb(TIMER_MODE, TIMER_SEL0 | TIMER_ONESHOT | TIMER_16BIT); 434 435 /* 436 * Map i8254 interrupt vector and enable it in the PIC. 437 */ 438 x86_fillgate(32, cpu_isr_clock, 0); 439 pic1mask &= ~(1<<0); 440 outb(PIC1_DATA, pic1mask); 441} 442 443/* 444 * Return monotonic time since system boot in nanoseconds. 445 */ 446bmk_time_t 447bmk_platform_cpu_clock_monotonic(void) 448{ 449 if (have_pvclock) 450 return pvclock_monotonic(); 451 else 452 return tscclock_monotonic(); 453} 454 455/* 456 * Return epoch offset (wall time offset to monotonic clock start). 457 */ 458bmk_time_t 459bmk_platform_cpu_clock_epochoffset(void) 460{ 461 462 return rtc_epochoffset; 463} 464 465/* 466 * Block the CPU until monotonic time is *no later than* the specified time. 467 * Returns early if any interrupts are serviced, or if the requested delay is 468 * too short. 469 */ 470 uint64_t ccount = 0; 471 uint64_t prev; 472 uint64_t ts; 473 474void 475bmk_platform_cpu_block(bmk_time_t until) 476{ 477 bmk_time_t now, delta_ns; 478 uint64_t delta_ticks; 479 unsigned int ticks; 480 int s; 481 482 bmk_assert(spldepth > 0); 483 484 /* 485 * Return if called too late. Doing do ensures that the time 486 * delta is positive. 487 */ 488 now = bmk_platform_cpu_clock_monotonic(); 489 if (until <= now) 490 return; 491 492 /* 493 * Compute delta in PIT ticks. Return if it is less than minimum safe 494 * amount of ticks. Essentially this will cause us to spin until 495 * the timeout. 496 */ 497 delta_ns = until - now; 498 delta_ticks = mul64_32(delta_ns, pit_mult); 499 if (delta_ticks < PIT_MIN_DELTA) { 500 /* 501 * Since we are "spinning", quickly enable interrupts in 502 * the hopes that we might get new work and can do something 503 * else than spin. 504 */ 505 __asm__ __volatile__( 506 "sti;\n" 507 "nop;\n" /* ints are enabled 1 instr after sti */ 508 "cli;\n"); 509 return; 510 } 511 512 /* 513 * Program the timer to interrupt the CPU after the delay has expired. 
514 * Maximum timer delay is 65535 ticks. 515 */ 516 if (delta_ticks > 65535) 517 ticks = 65535; 518 else 519 ticks = delta_ticks; 520 521 /* 522 * Note that according to the Intel 82C54 datasheet, p12 the 523 * interrupt is actually delivered in N + 1 ticks. 524 */ 525 outb(TIMER_CNTR, (ticks - 1) & 0xff); 526 outb(TIMER_CNTR, (ticks - 1) >> 8); 527 528 /* 529 * Wait for any interrupt. If we got an interrupt then 530 * just return into the scheduler which will check if there is 531 * work to do and send us back here if not. 532 * 533 * TODO: It would be more efficient for longer sleeps to be 534 * able to distinguish if the interrupt was the PIT interrupt 535 * and no other, but this will do for now. 536 */ 537 s = spldepth; 538 spldepth = 0; 539 prev = rdtsc_pure(); 540 541 while(1) { 542 ts = rdtsc_pure();; 543 // 544 if ((ts - prev) < 150) { 545 asm volatile("cli" ::: "memory"); 546 ccount += (unsigned long long)(ts - prev); 547 asm volatile("sti" ::: "memory"); 548 prev = ts; 549 } else { 550 spldepth = s; 551 prev = ts; 552 return; 553 } 554 } 555} 556