1/*-
2 * Copyright (c) 2009 Adrian Chadd
3 * Copyright (c) 2012 Spectra Logic Corporation
4 * Copyright (c) 2014 Bryan Venteicher
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/bus.h>
32#include <sys/clock.h>
33#include <sys/conf.h>
34#include <sys/fcntl.h>
35#include <sys/limits.h>
36#include <sys/mman.h>
37#include <sys/proc.h>
38#include <sys/smp.h>
39#include <sys/sysctl.h>
40#include <sys/vdso.h>
41
42#include <vm/vm.h>
43#include <vm/pmap.h>
44
45#include <machine/atomic.h>
46#include <machine/cpufunc.h>
47#include <machine/md_var.h>
48#include <machine/pvclock.h>
49
/*
 * Last system time. This is used to guarantee a monotonically non-decreasing
 * clock for the kernel codepath and approximate the same for the vDSO codepath.
 * In theory, this should be unnecessary absent hypervisor bug(s) and/or what
 * should be rare cases where TSC jitter may still be visible despite the
 * hypervisor's best efforts.
 *
 * The value is advanced (via compare-and-set) only by readers that observe a
 * time info entry without PVCLOCK_FLAG_TSC_STABLE set, it is reset to zero by
 * pvclock_resume(), and a snapshot of it is copied into the vDSO timehands by
 * the timecounter's vDSO callbacks.
 */
static volatile uint64_t pvclock_last_systime;
58
/* Internal readers of the pvclock time info and wall clock structures. */
static uint64_t		 pvclock_getsystime(struct pvclock *pvc);
static void		 pvclock_read_time_info(
    struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags);
static void		 pvclock_read_wall_clock(struct pvclock_wall_clock *wc,
    struct timespec *ts);
/* Timecounter callbacks installed by pvclock_init(). */
static u_int		 pvclock_tc_get_timecount(struct timecounter *tc);
static uint32_t		 pvclock_tc_vdso_timehands(
    struct vdso_timehands *vdso_th, struct timecounter *tc);
#ifdef COMPAT_FREEBSD32
/* Variant of the vDSO callback filling a struct vdso_timehands32. */
static uint32_t		 pvclock_tc_vdso_timehands32(
    struct vdso_timehands32 *vdso_th, struct timecounter *tc);
#endif

/* Character device entry points referenced by pvclock_cdev_cdevsw. */
static d_open_t		 pvclock_cdev_open;
static d_mmap_t		 pvclock_cdev_mmap;
74
/*
 * Character device switch used for the device node created in pvclock_init().
 * Only open and mmap handlers are provided.
 */
static struct cdevsw	 pvclock_cdev_cdevsw = {
	.d_version =	D_VERSION,
	.d_name =	PVCLOCK_CDEVNAME,
	.d_open =	pvclock_cdev_open,	/* rejects opens with FWRITE */
	.d_mmap =	pvclock_cdev_mmap,	/* read-only mappings only */
};
81
/*
 * Reset the recorded last system time to zero, using a store with release
 * semantics.
 *
 * NOTE(review): presumably this is invoked when the guest resumes and the
 * previously recorded value may no longer be comparable with the system time
 * now reported -- confirm against the callers.
 */
void
pvclock_resume(void)
{
	atomic_store_rel_64(&pvclock_last_systime, 0);
}
87
88uint64_t
89pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
90{
91	uint64_t freq;
92
93	freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
94	if (ti->tsc_shift < 0)
95		freq <<= -ti->tsc_shift;
96	else
97		freq >>= ti->tsc_shift;
98	return (freq);
99}
100
101static void
102pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
103    uint64_t *ns, uint8_t *flags)
104{
105	uint64_t delta;
106	uint32_t version;
107
108	do {
109		version = atomic_load_acq_32(&ti->version);
110		delta = rdtsc_ordered() - ti->tsc_timestamp;
111		*ns = ti->system_time + pvclock_scale_delta(delta,
112		    ti->tsc_to_system_mul, ti->tsc_shift);
113		*flags = ti->flags;
114		atomic_thread_fence_acq();
115	} while ((ti->version & 1) != 0 || ti->version != version);
116}
117
118static void
119pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts)
120{
121	uint32_t version;
122
123	do {
124		version = atomic_load_acq_32(&wc->version);
125		ts->tv_sec = wc->sec;
126		ts->tv_nsec = wc->nsec;
127		atomic_thread_fence_acq();
128	} while ((wc->version & 1) != 0 || wc->version != version);
129}
130
131static uint64_t
132pvclock_getsystime(struct pvclock *pvc)
133{
134	uint64_t now, last, ret;
135	uint8_t flags;
136
137	critical_enter();
138	pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags);
139	ret = now;
140	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
141		last = atomic_load_acq_64(&pvclock_last_systime);
142		do {
143			if (last > now) {
144				ret = last;
145				break;
146			}
147		} while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
148		    now));
149	}
150	critical_exit();
151	return (ret);
152}
153
154/*
155 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c'
156 * has been migrated to the 'struct pvclock' API.
157 */
158uint64_t
159pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
160{
161	uint64_t now, last, ret;
162	uint8_t flags;
163
164	pvclock_read_time_info(ti, &now, &flags);
165	ret = now;
166	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
167		last = atomic_load_acq_64(&pvclock_last_systime);
168		do {
169			if (last > now) {
170				ret = last;
171				break;
172			}
173		} while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
174		    now));
175	}
176	return (ret);
177}
178
179/*
180 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c'
181 * has been migrated to the 'struct pvclock' API.
182 */
/*
 * Public wrapper that copies a consistent snapshot of the wall clock
 * structure 'wc' into 'ts' by delegating to pvclock_read_wall_clock().
 */
void
pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
{
	pvclock_read_wall_clock(wc, ts);
}
188
189static int
190pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
191{
192	if (oflags & FWRITE)
193		return (EPERM);
194	return (0);
195}
196
197static int
198pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
199    int nprot, vm_memattr_t *memattr)
200{
201	if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info))
202		return (EINVAL);
203	if (PROT_EXTRACT(nprot) != PROT_READ)
204		return (EACCES);
205	*paddr = vtophys((uintptr_t)dev->si_drv1 + offset);
206	*memattr = VM_MEMATTR_DEFAULT;
207	return (0);
208}
209
210static u_int
211pvclock_tc_get_timecount(struct timecounter *tc)
212{
213	struct pvclock *pvc = tc->tc_priv;
214
215	return (pvclock_getsystime(pvc) & UINT_MAX);
216}
217
218static uint32_t
219pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th,
220    struct timecounter *tc)
221{
222	struct pvclock *pvc = tc->tc_priv;
223
224	if (pvc->cdev == NULL)
225		return (0);
226
227	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
228	vdso_th->th_x86_shift = 0;
229	vdso_th->th_x86_hpet_idx = 0;
230	vdso_th->th_x86_pvc_last_systime =
231	    atomic_load_acq_64(&pvclock_last_systime);
232	vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable &&
233	    pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0;
234	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
235	return ((amd_feature & AMDID_RDTSCP) != 0 ||
236	    ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 &&
237	    pvc->vdso_enable_without_rdtscp));
238}
239
#ifdef COMPAT_FREEBSD32
/*
 * Fill the struct vdso_timehands32 variant of the vDSO timehands for the
 * pvclock timecounter.
 *
 * Returns 0 without touching 'vdso_th' when the instance has no cdev.
 * Otherwise the algorithm is set to VDSO_TH_ALGO_X86_PVCLK, a 64-bit snapshot
 * of pvclock_last_systime is written starting at th_x86_pvc_last_systime[0],
 * and the stable mask is set to PVCLOCK_FLAG_TSC_STABLE only when the stable
 * flag is supported and not forced off through vdso_force_unstable.
 *
 * The return value is nonzero when amd_feature advertises AMDID_RDTSCP, or
 * when the stable mask was set and vdso_enable_without_rdtscp is true.
 */
static uint32_t
pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th,
    struct timecounter *tc)
{
	struct pvclock *pvc;

	pvc = tc->tc_priv;
	if (pvc->cdev == NULL)
		return (0);

	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
	vdso_th->th_x86_shift = 0;
	vdso_th->th_x86_hpet_idx = 0;
	*(uint64_t *)&vdso_th->th_x86_pvc_last_systime[0] =
	    atomic_load_acq_64(&pvclock_last_systime);
	if (!pvc->vdso_force_unstable && pvc->stable_flag_supported)
		vdso_th->th_x86_pvc_stable_mask = PVCLOCK_FLAG_TSC_STABLE;
	else
		vdso_th->th_x86_pvc_stable_mask = 0;
	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));

	if ((amd_feature & AMDID_RDTSCP) != 0)
		return (1);
	return ((vdso_th->th_x86_pvc_stable_mask &
	    PVCLOCK_FLAG_TSC_STABLE) != 0 &&
	    pvc->vdso_enable_without_rdtscp);
}
#endif
263
264void
265pvclock_gettime(struct pvclock *pvc, struct timespec *ts)
266{
267	struct timespec system_ts;
268	uint64_t system_ns;
269
270	pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts);
271	system_ns = pvclock_getsystime(pvc);
272	system_ts.tv_sec = system_ns / 1000000000ULL;
273	system_ts.tv_nsec = system_ns % 1000000000ULL;
274	timespecadd(ts, &system_ts, ts);
275}
276
277void
278pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
279    int tc_quality, u_int tc_flags)
280{
281	struct make_dev_args mda;
282	int err;
283
284	KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0,
285	    ("Specified time info page(s) address is not page-aligned."));
286
287	/* Set up vDSO stable-flag suppression test facility: */
288	pvc->vdso_force_unstable = false;
289	SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
290	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
291	    "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0,
292	    "Forcibly deassert stable flag in vDSO codepath");
293
294	/*
295	 * Make it possible to use the vDSO page even when the hypervisor does
296	 * not support the rdtscp instruction.  This is disabled by default for
297	 * compatibility with old libc.
298	 */
299	pvc->vdso_enable_without_rdtscp = false;
300	SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
301	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
302	    "vdso_enable_without_rdtscp", CTLFLAG_RWTUN,
303	    &pvc->vdso_enable_without_rdtscp, 0,
304	    "Allow the use of a vDSO when rdtscp is not available");
305
306	/* Set up timecounter and timecounter-supporting members: */
307	pvc->tc.tc_get_timecount = pvclock_tc_get_timecount;
308	pvc->tc.tc_poll_pps = NULL;
309	pvc->tc.tc_counter_mask = ~0U;
310	pvc->tc.tc_frequency = 1000000000ULL;
311	pvc->tc.tc_name = tc_name;
312	pvc->tc.tc_quality = tc_quality;
313	pvc->tc.tc_flags = tc_flags;
314	pvc->tc.tc_priv = pvc;
315	pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands;
316#ifdef COMPAT_FREEBSD32
317	pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32;
318#endif
319
320	/* Set up cdev for userspace mmapping of vCPU 0 time info page: */
321	make_dev_args_init(&mda);
322	mda.mda_devsw = &pvclock_cdev_cdevsw;
323	mda.mda_uid = UID_ROOT;
324	mda.mda_gid = GID_WHEEL;
325	mda.mda_mode = 0444;
326	mda.mda_si_drv1 = pvc->timeinfos;
327	err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME);
328	if (err != 0) {
329		device_printf(dev, "Could not create /dev/%s, error %d. Fast "
330		    "time of day will be unavailable for this timecounter.\n",
331		    PVCLOCK_CDEVNAME, err);
332		KASSERT(pvc->cdev == NULL,
333		    ("Failed make_dev_s() unexpectedly inited cdev."));
334	}
335
336	/* Register timecounter: */
337	tc_init(&pvc->tc);
338
339	/*
340	 * Register wallclock:
341	 *     The RTC registration API expects a resolution in microseconds;
342	 *     pvclock's 1ns resolution is rounded up to 1us.
343	 */
344	clock_register(dev, 1);
345}
346
/*
 * Tear down a pvclock instance.
 *
 * Always returns EBUSY: teardown is not currently possible because there is
 * no counterpart of 'tc_init()' for unregistering a timecounter.  The
 * argument is not examined.
 */
int
pvclock_destroy(struct pvclock *pvc)
{
	(void)pvc;
	return (EBUSY);
}
356