1/*- 2 * Copyright (c) 2009 Adrian Chadd 3 * Copyright (c) 2012 Spectra Logic Corporation 4 * Copyright (c) 2014 Bryan Venteicher 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29#include <sys/param.h> 30#include <sys/systm.h> 31#include <sys/bus.h> 32#include <sys/clock.h> 33#include <sys/conf.h> 34#include <sys/fcntl.h> 35#include <sys/limits.h> 36#include <sys/mman.h> 37#include <sys/proc.h> 38#include <sys/smp.h> 39#include <sys/sysctl.h> 40#include <sys/vdso.h> 41 42#include <vm/vm.h> 43#include <vm/pmap.h> 44 45#include <machine/atomic.h> 46#include <machine/cpufunc.h> 47#include <machine/md_var.h> 48#include <machine/pvclock.h> 49 50/* 51 * Last system time. This is used to guarantee a monotonically non-decreasing 52 * clock for the kernel codepath and approximate the same for the vDSO codepath. 53 * In theory, this should be unnecessary absent hypervisor bug(s) and/or what 54 * should be rare cases where TSC jitter may still be visible despite the 55 * hypervisor's best efforts. 56 */ 57static volatile uint64_t pvclock_last_systime; 58 59static uint64_t pvclock_getsystime(struct pvclock *pvc); 60static void pvclock_read_time_info( 61 struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags); 62static void pvclock_read_wall_clock(struct pvclock_wall_clock *wc, 63 struct timespec *ts); 64static u_int pvclock_tc_get_timecount(struct timecounter *tc); 65static uint32_t pvclock_tc_vdso_timehands( 66 struct vdso_timehands *vdso_th, struct timecounter *tc); 67#ifdef COMPAT_FREEBSD32 68static uint32_t pvclock_tc_vdso_timehands32( 69 struct vdso_timehands32 *vdso_th, struct timecounter *tc); 70#endif 71 72static d_open_t pvclock_cdev_open; 73static d_mmap_t pvclock_cdev_mmap; 74 75static struct cdevsw pvclock_cdev_cdevsw = { 76 .d_version = D_VERSION, 77 .d_name = PVCLOCK_CDEVNAME, 78 .d_open = pvclock_cdev_open, 79 .d_mmap = pvclock_cdev_mmap, 80}; 81 82void 83pvclock_resume(void) 84{ 85 atomic_store_rel_64(&pvclock_last_systime, 0); 86} 87 88uint64_t 89pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti) 90{ 91 uint64_t freq; 92 93 freq = (1000000000ULL << 32) / ti->tsc_to_system_mul; 94 if (ti->tsc_shift < 0) 95 freq <<= -ti->tsc_shift; 96 else 97 freq >>= ti->tsc_shift; 98 return (freq); 99} 100 101static void 102pvclock_read_time_info(struct pvclock_vcpu_time_info *ti, 103 uint64_t *ns, uint8_t *flags) 104{ 105 uint64_t delta; 106 uint32_t version; 107 108 do { 109 version = atomic_load_acq_32(&ti->version); 110 delta = rdtsc_ordered() - ti->tsc_timestamp; 111 *ns = ti->system_time + pvclock_scale_delta(delta, 112 ti->tsc_to_system_mul, ti->tsc_shift); 113 *flags = ti->flags; 114 atomic_thread_fence_acq(); 115 } while ((ti->version & 1) != 0 || ti->version != version); 116} 117 118static void 119pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts) 120{ 121 uint32_t version; 122 123 do { 124 version = atomic_load_acq_32(&wc->version); 125 ts->tv_sec = wc->sec; 126 ts->tv_nsec = wc->nsec; 127 atomic_thread_fence_acq(); 128 } while ((wc->version & 1) != 0 || wc->version != version); 129} 130 131static uint64_t 132pvclock_getsystime(struct pvclock *pvc) 133{ 134 uint64_t now, last, ret; 135 uint8_t flags; 136 137 critical_enter(); 138 pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags); 139 ret = now; 140 if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) { 141 last = atomic_load_acq_64(&pvclock_last_systime); 142 do { 143 if (last > now) { 144 ret = last; 145 break; 146 } 147 } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last, 148 now)); 149 } 150 critical_exit(); 151 return (ret); 152} 153 154/* 155 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' 156 * has been migrated to the 'struct pvclock' API. 157 */ 158uint64_t 159pvclock_get_timecount(struct pvclock_vcpu_time_info *ti) 160{ 161 uint64_t now, last, ret; 162 uint8_t flags; 163 164 pvclock_read_time_info(ti, &now, &flags); 165 ret = now; 166 if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) { 167 last = atomic_load_acq_64(&pvclock_last_systime); 168 do { 169 if (last > now) { 170 ret = last; 171 break; 172 } 173 } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last, 174 now)); 175 } 176 return (ret); 177} 178 179/* 180 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' 181 * has been migrated to the 'struct pvclock' API. 182 */ 183void 184pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts) 185{ 186 pvclock_read_wall_clock(wc, ts); 187} 188 189static int 190pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 191{ 192 if (oflags & FWRITE) 193 return (EPERM); 194 return (0); 195} 196 197static int 198pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, 199 int nprot, vm_memattr_t *memattr) 200{ 201 if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info)) 202 return (EINVAL); 203 if (PROT_EXTRACT(nprot) != PROT_READ) 204 return (EACCES); 205 *paddr = vtophys((uintptr_t)dev->si_drv1 + offset); 206 *memattr = VM_MEMATTR_DEFAULT; 207 return (0); 208} 209 210static u_int 211pvclock_tc_get_timecount(struct timecounter *tc) 212{ 213 struct pvclock *pvc = tc->tc_priv; 214 215 return (pvclock_getsystime(pvc) & UINT_MAX); 216} 217 218static uint32_t 219pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th, 220 struct timecounter *tc) 221{ 222 struct pvclock *pvc = tc->tc_priv; 223 224 if (pvc->cdev == NULL) 225 return (0); 226 227 vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; 228 vdso_th->th_x86_shift = 0; 229 vdso_th->th_x86_hpet_idx = 0; 230 vdso_th->th_x86_pvc_last_systime = 231 atomic_load_acq_64(&pvclock_last_systime); 232 vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && 233 pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; 234 bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); 235 return ((amd_feature & AMDID_RDTSCP) != 0 || 236 ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 && 237 pvc->vdso_enable_without_rdtscp)); 238} 239 240#ifdef COMPAT_FREEBSD32 241static uint32_t 242pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th, 243 struct timecounter *tc) 244{ 245 struct pvclock *pvc = tc->tc_priv; 246 247 if (pvc->cdev == NULL) 248 return (0); 249 250 vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; 251 vdso_th->th_x86_shift = 0; 252 vdso_th->th_x86_hpet_idx = 0; 253 *(uint64_t *)&vdso_th->th_x86_pvc_last_systime[0] = 254 atomic_load_acq_64(&pvclock_last_systime); 255 vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && 256 pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; 257 bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); 258 return ((amd_feature & AMDID_RDTSCP) != 0 || 259 ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 && 260 pvc->vdso_enable_without_rdtscp)); 261} 262#endif 263 264void 265pvclock_gettime(struct pvclock *pvc, struct timespec *ts) 266{ 267 struct timespec system_ts; 268 uint64_t system_ns; 269 270 pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts); 271 system_ns = pvclock_getsystime(pvc); 272 system_ts.tv_sec = system_ns / 1000000000ULL; 273 system_ts.tv_nsec = system_ns % 1000000000ULL; 274 timespecadd(ts, &system_ts, ts); 275} 276 277void 278pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name, 279 int tc_quality, u_int tc_flags) 280{ 281 struct make_dev_args mda; 282 int err; 283 284 KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0, 285 ("Specified time info page(s) address is not page-aligned.")); 286 287 /* Set up vDSO stable-flag suppression test facility: */ 288 pvc->vdso_force_unstable = false; 289 SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), 290 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, 291 "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0, 292 "Forcibly deassert stable flag in vDSO codepath"); 293 294 /* 295 * Make it possible to use the vDSO page even when the hypervisor does 296 * not support the rdtscp instruction. This is disabled by default for 297 * compatibility with old libc. 298 */ 299 pvc->vdso_enable_without_rdtscp = false; 300 SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), 301 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, 302 "vdso_enable_without_rdtscp", CTLFLAG_RWTUN, 303 &pvc->vdso_enable_without_rdtscp, 0, 304 "Allow the use of a vDSO when rdtscp is not available"); 305 306 /* Set up timecounter and timecounter-supporting members: */ 307 pvc->tc.tc_get_timecount = pvclock_tc_get_timecount; 308 pvc->tc.tc_poll_pps = NULL; 309 pvc->tc.tc_counter_mask = ~0U; 310 pvc->tc.tc_frequency = 1000000000ULL; 311 pvc->tc.tc_name = tc_name; 312 pvc->tc.tc_quality = tc_quality; 313 pvc->tc.tc_flags = tc_flags; 314 pvc->tc.tc_priv = pvc; 315 pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands; 316#ifdef COMPAT_FREEBSD32 317 pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32; 318#endif 319 320 /* Set up cdev for userspace mmapping of vCPU 0 time info page: */ 321 make_dev_args_init(&mda); 322 mda.mda_devsw = &pvclock_cdev_cdevsw; 323 mda.mda_uid = UID_ROOT; 324 mda.mda_gid = GID_WHEEL; 325 mda.mda_mode = 0444; 326 mda.mda_si_drv1 = pvc->timeinfos; 327 err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME); 328 if (err != 0) { 329 device_printf(dev, "Could not create /dev/%s, error %d. Fast " 330 "time of day will be unavailable for this timecounter.\n", 331 PVCLOCK_CDEVNAME, err); 332 KASSERT(pvc->cdev == NULL, 333 ("Failed make_dev_s() unexpectedly inited cdev.")); 334 } 335 336 /* Register timecounter: */ 337 tc_init(&pvc->tc); 338 339 /* 340 * Register wallclock: 341 * The RTC registration API expects a resolution in microseconds; 342 * pvclock's 1ns resolution is rounded up to 1us. 343 */ 344 clock_register(dev, 1); 345} 346 347int 348pvclock_destroy(struct pvclock *pvc) 349{ 350 /* 351 * Not currently possible since there is no teardown counterpart of 352 * 'tc_init()'. 353 */ 354 return (EBUSY); 355} 356