/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2011 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Julien Ridoux at the University
 * of Melbourne under sponsorship from the FreeBSD Foundation.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_tc.c 277406 2015-01-20 03:54:30Z neel $");

#include "opt_compat.h"
#include "opt_ntp.h"
#include "opt_ffclock.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#ifdef FFCLOCK
#include <sys/lock.h>
#include <sys/mutex.h>
#endif
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timeffc.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/vdso.h>

/*
 * A large step happens on boot.  This constant detects such steps.
 * It is relatively small so that ntp_update_second gets called enough
 * in the typical 'missed a couple of seconds' case, but doesn't loop
 * forever when the time step is large.
 */
#define LARGE_STEP	200

/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return (++now);
}

static struct timecounter dummy_timecounter = {
	dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000
};
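
/*
 * The positional initializer above supplies tc_get_timecount, tc_poll_pps
 * (none), tc_counter_mask, tc_frequency (a nominal 1 MHz) and tc_name,
 * with a hugely negative tc_quality so the dummy is never preferred over
 * any real counter.
 */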

struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter	*th_counter;
	int64_t			th_adjustment;
	uint64_t		th_scale;
	u_int	 		th_offset_count;
	struct bintime		th_offset;
	struct timeval		th_microtime;
	struct timespec		th_nanotime;
	/* Fields not to be copied in tc_windup start with th_generation. */
	volatile u_int		th_generation;
	struct timehands	*th_next;
};

static struct timehands th0;
static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7};
static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6};
static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5};
static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4};
static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3};
static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2};
static struct timehands th0 = {
	&dummy_timecounter,
	0,
	(uint64_t)-1 / 1000000,
	0,
	{1, 0},
	{0, 0},
	{0, 0},
	1,
	&th1
};
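
/*
 * th0 above is pre-initialized to run on the dummy timecounter:
 * th_scale = (2^64 - 1) / 10^6 matches the dummy's nominal 1 MHz rate,
 * th_offset starts at {1, 0} (consistent with time_uptime starting at 1
 * below) and th_generation starts non-zero so early readers do not spin.
 */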

static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;

int tc_min_ticktock_freq = 1;

volatile time_t time_second = 1;
volatile time_t time_uptime = 1;

struct bintime boottimebin;
struct timeval boottime;
static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD,
    NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");

SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");

static int timestepwarnings;
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
    &timestepwarnings, 0, "Log time steps");

struct bintime bt_timethreshold;
struct bintime bt_tickthreshold;
sbintime_t sbt_timethreshold;
sbintime_t sbt_tickthreshold;
struct bintime tc_tick_bt;
sbintime_t tc_tick_sbt;
int tc_precexp;
int tc_timepercentage = TC_DEFAULTPERC;
static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    sysctl_kern_timecounter_adjprecision, "I",
    "Allowed time interval deviation in percents");

static void tc_windup(void);
static void cpu_tick_calibrate(int);

void dtrace_getnanotime(struct timespec *tsp);

static int
sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
{
#ifndef __mips__
#ifdef SCTL_MASK32
	int tv[2];

	if (req->flags & SCTL_MASK32) {
		tv[0] = boottime.tv_sec;
		tv[1] = boottime.tv_usec;
		return SYSCTL_OUT(req, tv, sizeof(tv));
	} else
#endif
#endif
		return SYSCTL_OUT(req, &boottime, sizeof(boottime));
}

static int
sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
{
	u_int ncount;
	struct timecounter *tc = arg1;

	ncount = tc->tc_get_timecount(tc);
	return sysctl_handle_int(oidp, &ncount, 0, req);
}

static int
sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
{
	uint64_t freq;
	struct timecounter *tc = arg1;

	freq = tc->tc_frequency;
	return sysctl_handle_64(oidp, &freq, 0, req);
}

/*
 * Return the difference between the timehands' counter value now and what
 * was when we copied it to the timehands' offset_count.
 */
static __inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
	    tc->tc_counter_mask);
}
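
/*
 * The unsigned subtraction and mask make tc_delta() immune to a single
 * counter wrap: e.g. with a 24-bit counter (mask 0xffffff), a reading of
 * 0x000010 after an offset_count of 0xfffff0 still yields 0x20.
 */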

/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/time.h> for a description of these 12 functions.
 */
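
/*
 * In outline, every reader below follows the same lockless protocol
 * against tc_windup():
 *
 *	do {
 *		th = timehands;
 *		gen = th->th_generation;
 *		(copy the needed fields out of *th)
 *	} while (gen == 0 || gen != th->th_generation);
 *
 * tc_windup() zeroes th_generation before it rewrites a slot and stores a
 * new non-zero value when done, so a reader that raced an update observes
 * either gen == 0 or a changed generation and simply retries.
 */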

#ifdef FFCLOCK
void
fbclock_binuptime(struct bintime *bt)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	fbclock_binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
fbclock_microuptime(struct timeval *tvp)
{
	struct bintime bt;

	fbclock_binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
fbclock_bintime(struct bintime *bt)
{

	fbclock_binuptime(bt);
	bintime_add(bt, &boottimebin);
}

void
fbclock_nanotime(struct timespec *tsp)
{
	struct bintime bt;

	fbclock_bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
fbclock_microtime(struct timeval *tvp)
{
	struct bintime bt;

	fbclock_bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
fbclock_getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getbintime(struct bintime *bt)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &boottimebin);
}

void
fbclock_getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}
#else /* !FFCLOCK */
void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);
}

void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{

	binuptime(bt);
	bintime_add(bt, &boottimebin);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &boottimebin);
}

void
getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}
#endif /* FFCLOCK */

#ifdef FFCLOCK
/*
 * Support for feed-forward synchronization algorithms. This is heavily
 * inspired by the timehands mechanism but kept independent from it. The
 * *_windup() functions are coupled just enough that the timecounter
 * hardware is not read more often than necessary.
 */

/* Feed-forward clock estimates kept updated by the synchronization daemon. */
struct ffclock_estimate ffclock_estimate;
struct bintime ffclock_boottime;	/* Feed-forward boot time estimate. */
uint32_t ffclock_status;		/* Feed-forward clock status. */
int8_t ffclock_updated;			/* New estimates are available. */
struct mtx ffclock_mtx;			/* Mutex on ffclock_estimate. */

struct fftimehands {
	struct ffclock_estimate	cest;
	struct bintime		tick_time;
	struct bintime		tick_time_lerp;
	ffcounter		tick_ffcount;
	uint64_t		period_lerp;
	volatile uint8_t	gen;
	struct fftimehands	*next;
};

#define	NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x))

static struct fftimehands ffth[10];
static struct fftimehands *volatile fftimehands = ffth;

static void
ffclock_init(void)
{
	struct fftimehands *cur;
	struct fftimehands *last;

	memset(ffth, 0, sizeof(ffth));

	last = ffth + NUM_ELEMENTS(ffth) - 1;
	for (cur = ffth; cur < last; cur++)
		cur->next = cur + 1;
	last->next = ffth;

	ffclock_updated = 0;
	ffclock_status = FFCLOCK_STA_UNSYNC;
	mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF);
}

/*
 * Reset the feed-forward clock estimates. Called from inittodr() to get things
 * kick-started and uses the timecounter nominal frequency as a first period
 * estimate. Note: this function may be called several times just after boot.
 * Note: this is the only function that sets the value of boot time for the
 * monotonic (i.e. uptime) version of the feed-forward clock.
 */
void
ffclock_reset_clock(struct timespec *ts)
{
	struct timecounter *tc;
	struct ffclock_estimate cest;

	tc = timehands->th_counter;
	memset(&cest, 0, sizeof(struct ffclock_estimate));

	timespec2bintime(ts, &ffclock_boottime);
	timespec2bintime(ts, &(cest.update_time));
	ffclock_read_counter(&cest.update_ffcount);
	cest.leapsec_next = 0;
	cest.period = ((1ULL << 63) / tc->tc_frequency) << 1;
	cest.errb_abs = 0;
	cest.errb_rate = 0;
	cest.status = FFCLOCK_STA_UNSYNC;
	cest.leapsec_total = 0;
	cest.leapsec = 0;

	mtx_lock(&ffclock_mtx);
	bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
	ffclock_updated = INT8_MAX;
	mtx_unlock(&ffclock_mtx);

	printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name,
	    (unsigned long long)tc->tc_frequency, (long)ts->tv_sec,
	    (unsigned long)ts->tv_nsec);
}

/*
 * Sub-routine to convert a time interval measured in RAW counter units to time
 * in seconds stored in bintime format.
 * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be
 * larger than the max value of u_int (on 32 bit architecture). Loop to consume
 * extra cycles.
 */
static void
ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt)
{
	struct bintime bt2;
	ffcounter delta, delta_max;

	delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1;
	bintime_clear(bt);
	do {
		if (ffdelta > delta_max)
			delta = delta_max;
		else
			delta = ffdelta;
		bt2.sec = 0;
		bt2.frac = period;
		bintime_mul(&bt2, (unsigned int)delta);
		bintime_add(bt, &bt2);
		ffdelta -= delta;
	} while (ffdelta > 0);
}
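
/*
 * Worked example: for a 1 GHz counter, period = 2^64 / 10^9.  Converting
 * ffdelta = 3 * 10^9 counter units takes a single loop iteration (it is
 * below delta_max = 2^32 - 1 with a 4-byte u_int) and bintime_mul()
 * carries the product into bt->sec, yielding almost exactly 3 s.
 */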

/*
 * Update the fftimehands.
 * Push the tick ffcount and time(s) forward based on current clock estimate.
 * The conversion from ffcounter to bintime relies on the difference clock
 * principle, whose accuracy relies on computing small time intervals. If a new
 * clock estimate has been passed by the synchronisation daemon, make it
 * current, and compute the linear interpolation for monotonic time if needed.
 */
static void
ffclock_windup(unsigned int delta)
{
	struct ffclock_estimate *cest;
	struct fftimehands *ffth;
	struct bintime bt, gap_lerp;
	ffcounter ffdelta;
	uint64_t frac;
	unsigned int polling;
	uint8_t forward_jump, ogen;

	/*
	 * Pick the next timehand, copy current ffclock estimates and move tick
	 * times and counter forward.
	 */
	forward_jump = 0;
	ffth = fftimehands->next;
	ogen = ffth->gen;
	ffth->gen = 0;
	cest = &ffth->cest;
	bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate));
	ffdelta = (ffcounter)delta;
	ffth->period_lerp = fftimehands->period_lerp;

	ffth->tick_time = fftimehands->tick_time;
	ffclock_convert_delta(ffdelta, cest->period, &bt);
	bintime_add(&ffth->tick_time, &bt);

	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
	ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt);
	bintime_add(&ffth->tick_time_lerp, &bt);

	ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta;

	/*
	 * Assess the status of the clock; if the last update is too old, the
	 * synchronisation daemon is likely dead and the clock is free
	 * running.
	 */
	if (ffclock_updated == 0) {
		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
		ffclock_convert_delta(ffdelta, cest->period, &bt);
		if (bt.sec > 2 * FFCLOCK_SKM_SCALE)
			ffclock_status |= FFCLOCK_STA_UNSYNC;
	}

	/*
	 * If available, grab updated clock estimates and make them current.
	 * Recompute time at this tick using the updated estimates. The clock
	 * estimates passed in by the feed-forward synchronisation daemon may
	 * result in time conversion that is not monotonically increasing (just
	 * after the update). time_lerp is a particular linear interpolation
	 * over the synchronisation algo polling period that ensures
	 * monotonicity for the clock ids requesting it.
	 */
	if (ffclock_updated > 0) {
		bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate));
		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
		ffth->tick_time = cest->update_time;
		ffclock_convert_delta(ffdelta, cest->period, &bt);
		bintime_add(&ffth->tick_time, &bt);

		/* ffclock_reset sets ffclock_updated to INT8_MAX */
		if (ffclock_updated == INT8_MAX)
			ffth->tick_time_lerp = ffth->tick_time;

		if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >))
			forward_jump = 1;
		else
			forward_jump = 0;

		bintime_clear(&gap_lerp);
		if (forward_jump) {
			gap_lerp = ffth->tick_time;
			bintime_sub(&gap_lerp, &ffth->tick_time_lerp);
		} else {
			gap_lerp = ffth->tick_time_lerp;
			bintime_sub(&gap_lerp, &ffth->tick_time);
		}

		/*
		 * The reset from the RTC clock may be far from accurate, and
		 * reducing the gap between real time and interpolated time
		 * could take a very long time if the interpolated clock insists
		 * on strict monotonicity. The clock is reset under very strict
		 * conditions (kernel time is known to be wrong and the
		 * synchronization daemon has been restarted recently).
		 * ffclock_boottime absorbs the jump to ensure boot time is
		 * correct and uptime functions stay consistent.
		 */
		if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) &&
		    ((cest->status & FFCLOCK_STA_UNSYNC) == 0) &&
		    ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) {
			if (forward_jump)
				bintime_add(&ffclock_boottime, &gap_lerp);
			else
				bintime_sub(&ffclock_boottime, &gap_lerp);
			ffth->tick_time_lerp = ffth->tick_time;
			bintime_clear(&gap_lerp);
		}

		ffclock_status = cest->status;
		ffth->period_lerp = cest->period;

		/*
		 * Compute corrected period used for the linear interpolation of
		 * time. The rate of linear interpolation is capped to 5000PPM
		 * (5ms/s).
		 */
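		/*
		 * Below, 18446744073 = int(2^64 / 1e9), i.e. 1 ns as a binary
		 * fraction of a second, so bt starts out as 5e6 ns = 5 ms and
		 * is then scaled by the polling interval in seconds.
		 */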
		if (bintime_isset(&gap_lerp)) {
			ffdelta = cest->update_ffcount;
			ffdelta -= fftimehands->cest.update_ffcount;
			ffclock_convert_delta(ffdelta, cest->period, &bt);
			polling = bt.sec;
			bt.sec = 0;
			bt.frac = 5000000 * (uint64_t)18446744073LL;
			bintime_mul(&bt, polling);
			if (bintime_cmp(&gap_lerp, &bt, >))
				gap_lerp = bt;

			/* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */
			frac = 0;
			if (gap_lerp.sec > 0) {
				frac -= 1;
				frac /= ffdelta / gap_lerp.sec;
			}
			frac += gap_lerp.frac / ffdelta;

			if (forward_jump)
				ffth->period_lerp += frac;
			else
				ffth->period_lerp -= frac;
		}

		ffclock_updated = 0;
	}
	if (++ogen == 0)
		ogen = 1;
	ffth->gen = ogen;
	fftimehands = ffth;
}
737
738/*
739 * Adjust the fftimehands when the timecounter is changed. Stating the obvious,
740 * the old and new hardware counter cannot be read simultaneously. tc_windup()
741 * does read the two counters 'back to back', but a few cycles are effectively
742 * lost, and not accumulated in tick_ffcount. This is a fairly radical
743 * operation for a feed-forward synchronization daemon, and it is its job to not
744 * pushing irrelevant data to the kernel. Because there is no locking here,
745 * simply force to ignore pending or next update to give daemon a chance to
746 * realize the counter has changed.
747 */
static void
ffclock_change_tc(struct timehands *th)
{
	struct fftimehands *ffth;
	struct ffclock_estimate *cest;
	struct timecounter *tc;
	uint8_t ogen;

	tc = th->th_counter;
	ffth = fftimehands->next;
	ogen = ffth->gen;
	ffth->gen = 0;

	cest = &ffth->cest;
	bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate));
	cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1;
	cest->errb_abs = 0;
	cest->errb_rate = 0;
	cest->status |= FFCLOCK_STA_UNSYNC;

	ffth->tick_ffcount = fftimehands->tick_ffcount;
	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
	ffth->tick_time = fftimehands->tick_time;
	ffth->period_lerp = cest->period;

	/* Do not lock but ignore next update from synchronization daemon. */
	ffclock_updated--;

	if (++ogen == 0)
		ogen = 1;
	ffth->gen = ogen;
	fftimehands = ffth;
}

/*
 * Retrieve feed-forward counter and time of last kernel tick.
 */
void
ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags)
{
	struct fftimehands *ffth;
	uint8_t gen;

	/* No locking but check generation has not changed. */
	do {
		ffth = fftimehands;
		gen = ffth->gen;
		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP)
			*bt = ffth->tick_time_lerp;
		else
			*bt = ffth->tick_time;
		*ffcount = ffth->tick_ffcount;
	} while (gen == 0 || gen != ffth->gen);
}

/*
 * Absolute clock conversion. Low level function to convert ffcounter to
 * bintime. The ffcounter is converted using the current ffclock period estimate
 * or the "interpolated period" to ensure monotonicity.
 * NOTE: this conversion may have been deferred, and the clock updated since the
 * hardware counter has been read.
 */
void
ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags)
{
	struct fftimehands *ffth;
	struct bintime bt2;
	ffcounter ffdelta;
	uint8_t gen;

	/*
	 * No locking but check generation has not changed. Also need to make
	 * sure ffdelta is positive, i.e. ffcount > tick_ffcount.
	 */
	do {
		ffth = fftimehands;
		gen = ffth->gen;
		if (ffcount > ffth->tick_ffcount)
			ffdelta = ffcount - ffth->tick_ffcount;
		else
			ffdelta = ffth->tick_ffcount - ffcount;

		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) {
			*bt = ffth->tick_time_lerp;
			ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2);
		} else {
			*bt = ffth->tick_time;
			ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2);
		}

		if (ffcount > ffth->tick_ffcount)
			bintime_add(bt, &bt2);
		else
			bintime_sub(bt, &bt2);
	} while (gen == 0 || gen != ffth->gen);
}

/*
 * Difference clock conversion.
 * Low level function to convert a time interval measured in RAW counter units
 * into bintime. The difference clock allows measuring small intervals much more
 * reliably than the absolute clock.
 */
void
ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt)
{
	struct fftimehands *ffth;
	uint8_t gen;

	/* No locking but check generation has not changed. */
	do {
		ffth = fftimehands;
		gen = ffth->gen;
		ffclock_convert_delta(ffdelta, ffth->cest.period, bt);
	} while (gen == 0 || gen != ffth->gen);
}

/*
 * Access to current ffcounter value.
 */
void
ffclock_read_counter(ffcounter *ffcount)
{
	struct timehands *th;
	struct fftimehands *ffth;
	unsigned int gen, delta;

	/*
	 * ffclock_windup() is called from tc_windup(), so it is safe to rely
	 * on th->th_generation alone to obtain a consistent delta and
	 * ffcounter pair.
	 */
	do {
		th = timehands;
		gen = th->th_generation;
		ffth = fftimehands;
		delta = tc_delta(th);
		*ffcount = ffth->tick_ffcount;
	} while (gen == 0 || gen != th->th_generation);

	*ffcount += delta;
}

void
binuptime(struct bintime *bt)
{

	binuptime_fromclock(bt, sysclock_active);
}

void
nanouptime(struct timespec *tsp)
{

	nanouptime_fromclock(tsp, sysclock_active);
}

void
microuptime(struct timeval *tvp)
{

	microuptime_fromclock(tvp, sysclock_active);
}

void
bintime(struct bintime *bt)
{

	bintime_fromclock(bt, sysclock_active);
}

void
nanotime(struct timespec *tsp)
{

	nanotime_fromclock(tsp, sysclock_active);
}

void
microtime(struct timeval *tvp)
{

	microtime_fromclock(tvp, sysclock_active);
}

void
getbinuptime(struct bintime *bt)
{

	getbinuptime_fromclock(bt, sysclock_active);
}

void
getnanouptime(struct timespec *tsp)
{

	getnanouptime_fromclock(tsp, sysclock_active);
}

void
getmicrouptime(struct timeval *tvp)
{

	getmicrouptime_fromclock(tvp, sysclock_active);
}

void
getbintime(struct bintime *bt)
{

	getbintime_fromclock(bt, sysclock_active);
}

void
getnanotime(struct timespec *tsp)
{

	getnanotime_fromclock(tsp, sysclock_active);
}

void
getmicrotime(struct timeval *tvp)
{

	getmicrotime_fromclock(tvp, sysclock_active);
}

#endif /* FFCLOCK */

/*
 * This is a clone of getnanotime and used for walltimestamps.
 * The dtrace_ prefix prevents fbt from creating probes for
 * it so walltimestamp can be safely used in all fbt probes.
 */
void
dtrace_getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

/*
 * System clock currently providing time to the system. Modifiable via sysctl
 * when the FFCLOCK option is defined.
 */
int sysclock_active = SYSCLOCK_FBCK;

/* Internal NTP status and error estimates. */
extern int time_status;
extern long time_esterror;

/*
 * Take a snapshot of sysclock data which can be used to compare system clocks
 * and generate timestamps after the fact.
 */
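/*
 * With fast == 0 the hardware counter is read so the snapshot carries a
 * per-call delta; with fast != 0 the read is skipped (delta stays 0) and
 * the snapshot has only tick-time precision, like the get*() functions.
 */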
void
sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast)
{
	struct fbclock_info *fbi;
	struct timehands *th;
	struct bintime bt;
	unsigned int delta, gen;
#ifdef FFCLOCK
	ffcounter ffcount;
	struct fftimehands *ffth;
	struct ffclock_info *ffi;
	struct ffclock_estimate cest;

	ffi = &clock_snap->ff_info;
#endif

	fbi = &clock_snap->fb_info;
	delta = 0;

	do {
		th = timehands;
		gen = th->th_generation;
		fbi->th_scale = th->th_scale;
		fbi->tick_time = th->th_offset;
#ifdef FFCLOCK
		ffth = fftimehands;
		ffi->tick_time = ffth->tick_time;
		ffi->tick_time_lerp = ffth->tick_time_lerp;
		ffi->period = ffth->cest.period;
		ffi->period_lerp = ffth->period_lerp;
		clock_snap->ffcount = ffth->tick_ffcount;
		cest = ffth->cest;
#endif
		if (!fast)
			delta = tc_delta(th);
	} while (gen == 0 || gen != th->th_generation);

	clock_snap->delta = delta;
	clock_snap->sysclock_active = sysclock_active;

	/* Record feedback clock status and error. */
	clock_snap->fb_info.status = time_status;
	/* XXX: Very crude estimate of feedback clock error. */
	bt.sec = time_esterror / 1000000;
	bt.frac = (time_esterror - bt.sec * 1000000) *
	    (uint64_t)18446744073709ULL;
	clock_snap->fb_info.error = bt;

#ifdef FFCLOCK
	if (!fast)
		clock_snap->ffcount += delta;

	/* Record feed-forward clock leap second adjustment. */
	ffi->leapsec_adjustment = cest.leapsec_total;
	if (clock_snap->ffcount > cest.leapsec_next)
		ffi->leapsec_adjustment -= cest.leapsec;

	/* Record feed-forward clock status and error. */
	clock_snap->ff_info.status = cest.status;
	ffcount = clock_snap->ffcount - cest.update_ffcount;
	ffclock_convert_delta(ffcount, cest.period, &bt);
	/* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s]. */
	bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL);
	/* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */
	bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL);
	clock_snap->ff_info.error = bt;
#endif
}

/*
 * Convert a sysclock snapshot into a struct bintime based on the specified
 * clock source and flags.
 */
int
sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt,
    int whichclock, uint32_t flags)
{
#ifdef FFCLOCK
	struct bintime bt2;
	uint64_t period;
#endif

	switch (whichclock) {
	case SYSCLOCK_FBCK:
		*bt = cs->fb_info.tick_time;

		/* If snapshot was created with !fast, delta will be >0. */
		if (cs->delta > 0)
			bintime_addx(bt, cs->fb_info.th_scale * cs->delta);

		if ((flags & FBCLOCK_UPTIME) == 0)
			bintime_add(bt, &boottimebin);
		break;
#ifdef FFCLOCK
	case SYSCLOCK_FFWD:
		if (flags & FFCLOCK_LERP) {
			*bt = cs->ff_info.tick_time_lerp;
			period = cs->ff_info.period_lerp;
		} else {
			*bt = cs->ff_info.tick_time;
			period = cs->ff_info.period;
		}

		/* If snapshot was created with !fast, delta will be >0. */
		if (cs->delta > 0) {
			ffclock_convert_delta(cs->delta, period, &bt2);
			bintime_add(bt, &bt2);
		}

		/* Leap second adjustment. */
		if (flags & FFCLOCK_LEAPSEC)
			bt->sec -= cs->ff_info.leapsec_adjustment;

		/* Boot time adjustment, for uptime/monotonic clocks. */
		if (flags & FFCLOCK_UPTIME)
			bintime_sub(bt, &ffclock_boottime);
		break;
#endif
	default:
		return (EINVAL);
		break;
	}

	return (0);
}

/*
 * Initialize a new timecounter and possibly use it.
 */
void
tc_init(struct timecounter *tc)
{
	u_int u;
	struct sysctl_oid *tc_root;

	u = tc->tc_frequency / tc->tc_counter_mask;
	/* XXX: We need some margin here, 10% is a guess */
	u *= 11;
	u /= 10;
	if (u > hz && tc->tc_quality >= 0) {
		tc->tc_quality = -2000;
		if (bootverbose) {
			printf("Timecounter \"%s\" frequency %ju Hz",
			    tc->tc_name, (uintmax_t)tc->tc_frequency);
			printf(" -- Insufficient hz, needs at least %u\n", u);
		}
	} else if (tc->tc_quality >= 0 || bootverbose) {
		printf("Timecounter \"%s\" frequency %ju Hz quality %d\n",
		    tc->tc_name, (uintmax_t)tc->tc_frequency,
		    tc->tc_quality);
	}

	tc->tc_next = timecounters;
	timecounters = tc;
	/*
	 * Set up sysctl tree for this counter.
	 */
	tc_root = SYSCTL_ADD_NODE(NULL,
	    SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
	    CTLFLAG_RW, 0, "timecounter description");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
	    "mask for implemented bits");
	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
	    sysctl_kern_timecounter_get, "IU", "current timecounter value");
	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc),
	     sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
	    "goodness of time counter");
	/*
	 * Never automatically use a timecounter with negative quality.
	 * Even though we run on the dummy counter, switching here may be
	 * worse since this timecounter may not be monotonic.
	 */
	if (tc->tc_quality < 0)
		return;
	if (tc->tc_quality < timecounter->tc_quality)
		return;
	if (tc->tc_quality == timecounter->tc_quality &&
	    tc->tc_frequency < timecounter->tc_frequency)
		return;
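	/* Warm up new timecounter. */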
	(void)tc->tc_get_timecount(tc);
	(void)tc->tc_get_timecount(tc);
	timecounter = tc;
}

/* Report the frequency of the current timecounter. */
uint64_t
tc_getfrequency(void)
{

	return (timehands->th_counter->tc_frequency);
}

/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.
 * XXX: not locked.
 */
void
tc_setclock(struct timespec *ts)
{
	struct timespec tbef, taft;
	struct bintime bt, bt2;

	cpu_tick_calibrate(1);
	nanotime(&tbef);
	timespec2bintime(ts, &bt);
	binuptime(&bt2);
	bintime_sub(&bt, &bt2);
	bintime_add(&bt2, &boottimebin);
	boottimebin = bt;
	bintime2timeval(&bt, &boottime);

	/* XXX fiddle all the little crinkly bits around the fiords... */
	tc_windup();
	nanotime(&taft);
	if (timestepwarnings) {
		log(LOG_INFO,
		    "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
		    (intmax_t)tbef.tv_sec, tbef.tv_nsec,
		    (intmax_t)taft.tv_sec, taft.tv_nsec,
		    (intmax_t)ts->tv_sec, ts->tv_nsec);
	}
	cpu_tick_calibrate(1);
}

/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(void)
{
	struct bintime bt;
	struct timehands *th, *tho;
	uint64_t scale;
	u_int delta, ncount, ogen;
	int i;
	time_t t;

	/*
	 * Make the next timehands a copy of the current one, but do not
	 * overwrite the generation or next pointer.  While we update
	 * the contents, the generation must be zero.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	bcopy(tho, th, offsetof(struct timehands, th_generation));

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
#ifdef FFCLOCK
	ffclock_windup(delta);
#endif
	th->th_offset_count += delta;
	th->th_offset_count &= th->th_counter->tc_counter_mask;
	while (delta > th->th_counter->tc_frequency) {
		/* Eat complete unadjusted seconds. */
		delta -= th->th_counter->tc_frequency;
		th->th_offset.sec++;
	}
	if ((delta > th->th_counter->tc_frequency / 2) &&
	    (th->th_scale * delta < ((uint64_t)1 << 63))) {
		/* The product th_scale * delta just barely overflows. */
		th->th_offset.sec++;
	}
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Deal with NTP second processing.  The for loop normally
	 * iterates at most once, but in extreme situations it might
	 * keep NTP sane if timeouts are not run for several seconds.
	 * At boot, the time step can be large when the TOD hardware
	 * has been read, so on really large steps, we call
	 * ntp_update_second only twice.  We need to call it twice in
	 * case we missed a leap second.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &boottimebin);
	i = bt.sec - tho->th_microtime.tv_sec;
	if (i > LARGE_STEP)
		i = 2;
	for (; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		if (bt.sec != t)
			boottimebin.sec += bt.sec - t;
	}
	/* Update the UTC timestamps used by the get*() functions. */
	/* XXX shouldn't do this here.  Should force non-`get' versions. */
	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);

	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
#ifndef __arm__
		if ((timecounter->tc_flags & TC_FLAGS_C2STOP) != 0)
			cpu_disable_c2_sleep++;
		if ((th->th_counter->tc_flags & TC_FLAGS_C2STOP) != 0)
			cpu_disable_c2_sleep--;
#endif
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
		tc_min_ticktock_freq = max(1, timecounter->tc_frequency /
		    (((uint64_t)timecounter->tc_counter_mask + 1) / 3));
#ifdef FFCLOCK
		ffclock_change_tc(th);
#endif
	}

	/*-
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, that
	 * leaves no suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 *
	 */
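	/*
	 * Worked example: th_adjustment / 1024 * 2199, doubled by the final
	 * "scale * 2" below, applies a net factor of 2199/512 = 4.2949...,
	 * i.e. the 2^32 / 10^9 conversion mentioned above to within ~10PPM.
	 */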
	scale = (uint64_t)1 << 63;
	scale += (th->th_adjustment / 1024) * 2199;
	scale /= th->th_counter->tc_frequency;
	th->th_scale = scale * 2;

	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.
	 */
	if (++ogen == 0)
		ogen = 1;
	th->th_generation = ogen;

	/* Go live with the new struct timehands. */
#ifdef FFCLOCK
	switch (sysclock_active) {
	case SYSCLOCK_FBCK:
#endif
		time_second = th->th_microtime.tv_sec;
		time_uptime = th->th_offset.sec;
#ifdef FFCLOCK
		break;
	case SYSCLOCK_FFWD:
		time_second = fftimehands->tick_time_lerp.sec;
		time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec;
		break;
	}
#endif

	timehands = th;
	timekeep_push_vdso();
}

/* Report or change the active timecounter hardware. */
static int
sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
{
	char newname[32];
	struct timecounter *newtc, *tc;
	int error;

	tc = timecounter;
	strlcpy(newname, tc->tc_name, sizeof(newname));

	error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
	if (error != 0 || req->newptr == NULL ||
	    strcmp(newname, tc->tc_name) == 0)
		return (error);
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;

		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);

		timecounter = newtc;

		/*
		 * The vdso timehands update is deferred until the next
		 * 'tc_windup()'.
		 *
		 * This is prudent given that 'timekeep_push_vdso()' does not
		 * use any locking and that it can be called in hard interrupt
		 * context via 'tc_windup()'.
		 */
		return (0);
	}
	return (EINVAL);
}

SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
    0, 0, sysctl_kern_timecounter_hardware, "A",
    "Timecounter hardware selected");


/* Report the available timecounter hardware. */
static int
sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
{
	char buf[32], *spc;
	struct timecounter *tc;
	int error;

	spc = "";
	error = 0;
	for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
		sprintf(buf, "%s%s(%d)",
		    spc, tc->tc_name, tc->tc_quality);
		error = SYSCTL_OUT(req, buf, strlen(buf));
		spc = " ";
	}
	return (error);
}

SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD,
    0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected");

/*
 * RFC 2783 PPS-API implementation.
 */

static int
pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps)
{
	int err, timo;
	pps_seq_t aseq, cseq;
	struct timeval tv;

	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
		return (EINVAL);

	/*
	 * If no timeout is requested, immediately return whatever values were
	 * most recently captured.  If timeout seconds is -1, that's a request
	 * to block without a timeout.  WITNESS won't let us sleep forever
	 * without a lock (we really don't need a lock), so just repeatedly
	 * sleep a long time.
	 */
	if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) {
		if (fapi->timeout.tv_sec == -1)
			timo = 0x7fffffff;
		else {
			tv.tv_sec = fapi->timeout.tv_sec;
			tv.tv_usec = fapi->timeout.tv_nsec / 1000;
			timo = tvtohz(&tv);
		}
		aseq = pps->ppsinfo.assert_sequence;
		cseq = pps->ppsinfo.clear_sequence;
		while (aseq == pps->ppsinfo.assert_sequence &&
		    cseq == pps->ppsinfo.clear_sequence) {
			err = tsleep(pps, PCATCH, "ppsfch", timo);
			if (err == EWOULDBLOCK && fapi->timeout.tv_sec == -1) {
				continue;
			} else if (err != 0) {
				return (err);
			}
		}
	}

	pps->ppsinfo.current_mode = pps->ppsparam.mode;
	fapi->pps_info_buf = pps->ppsinfo;

	return (0);
}

int
pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
{
	pps_params_t *app;
	struct pps_fetch_args *fapi;
#ifdef FFCLOCK
	struct pps_fetch_ffc_args *fapi_ffc;
#endif
#ifdef PPS_SYNC
	struct pps_kcbind_args *kapi;
#endif

	KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl"));
	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
#ifdef FFCLOCK
		/* Ensure only a single clock is selected for ffc timestamp. */
		if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK)
			return (EINVAL);
#endif
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		fapi = (struct pps_fetch_args *)data;
		return (pps_fetch(fapi, pps));
#ifdef FFCLOCK
	case PPS_IOC_FETCH_FFCOUNTER:
		fapi_ffc = (struct pps_fetch_ffc_args *)data;
		if (fapi_ffc->tsformat && fapi_ffc->tsformat !=
		    PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec)
			return (EOPNOTSUPP);
		pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode;
		fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc;
		/* Overwrite timestamps if feedback clock selected. */
		switch (pps->ppsparam.mode & PPS_TSCLK_MASK) {
		case PPS_TSCLK_FBCK:
			fapi_ffc->pps_info_buf_ffc.assert_timestamp =
			    pps->ppsinfo.assert_timestamp;
			fapi_ffc->pps_info_buf_ffc.clear_timestamp =
			    pps->ppsinfo.clear_timestamp;
			break;
		case PPS_TSCLK_FFWD:
			break;
		default:
			break;
		}
		return (0);
#endif /* FFCLOCK */
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		kapi = (struct pps_kcbind_args *)data;
		/* XXX Only root should be able to do this */
		if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (kapi->kernel_consumer != PPS_KC_HARDPPS)
			return (EINVAL);
		if (kapi->edge & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = kapi->edge;
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (ENOIOCTL);
	}
}

void
pps_init(struct pps_state *pps)
{
	pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
#ifdef FFCLOCK
	pps->ppscap |= PPS_TSCLK_MASK;
#endif
}

void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	KASSERT(pps != NULL, ("NULL pps pointer in pps_capture"));
	th = timehands;
	pps->capgen = th->th_generation;
	pps->capth = th;
#ifdef FFCLOCK
	pps->capffth = fftimehands;
#endif
	pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}

void
pps_event(struct pps_state *pps, int event)
{
	struct bintime bt;
	struct timespec ts, *tsp, *osp;
	u_int tcount, *pcount;
	int foff, fhard;
	pps_seq_t *pseq;
#ifdef FFCLOCK
	struct timespec *tsp_ffc;
	pps_seq_t *pseq_ffc;
	ffcounter *ffcount;
#endif

	KASSERT(pps != NULL, ("NULL pps pointer in pps_event"));
	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation)
		return;

	/* Things would be easier with arrays. */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
#ifdef FFCLOCK
		ffcount = &pps->ppsinfo_ffc.assert_ffcount;
		tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp;
		pseq_ffc = &pps->ppsinfo_ffc.assert_sequence;
#endif
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
#ifdef FFCLOCK
		ffcount = &pps->ppsinfo_ffc.clear_ffcount;
		tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp;
		pseq_ffc = &pps->ppsinfo_ffc.clear_sequence;
#endif
	}

	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		*pcount = pps->capcount;
		pps->ppscount[2] = pps->capcount;
		return;
	}

	/* Convert the count to a timespec. */
	tcount = pps->capcount - pps->capth->th_offset_count;
	tcount &= pps->capth->th_counter->tc_counter_mask;
	bt = pps->capth->th_offset;
	bintime_addx(&bt, pps->capth->th_scale * tcount);
	bintime_add(&bt, &boottimebin);
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen != pps->capth->th_generation)
		return;

	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	if (foff) {
		timespecadd(tsp, osp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}

#ifdef FFCLOCK
	*ffcount = pps->capffth->tick_ffcount + tcount;
	bt = pps->capffth->tick_time;
	ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt);
	bintime_add(&bt, &pps->capffth->tick_time);
	bintime2timespec(&bt, &ts);
	(*pseq_ffc)++;
	*tsp_ffc = ts;
#endif

#ifdef PPS_SYNC
	if (fhard) {
		uint64_t scale;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event.
		 */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;
		scale = (uint64_t)1 << 63;
		scale /= pps->capth->th_counter->tc_frequency;
		scale *= 2;
		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);
		hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
	}
#endif

	/* Wakeup anyone sleeping in pps_fetch().  */
	wakeup(pps);
}

/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;
SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0,
    "Approximate number of hardclock ticks in a millisecond");

void
tc_ticktock(int cnt)
{
	static int count;

	count += cnt;
	if (count < tc_tick)
		return;
	count = 0;
	tc_windup();
}

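/*
 * Derive tc_precexp and the bintime thresholds from the allowed deviation
 * percentage.  For example, tc_timepercentage = 5 gives t = 104 / 5 = 20,
 * t + (t >> 1) = 30 and tc_precexp = fls(30) - 1 = 4; the thresholds are
 * then the corresponding tick periods scaled up by 2^4.
 */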
static void __inline
tc_adjprecision(void)
{
	int t;

	if (tc_timepercentage > 0) {
		t = (99 + tc_timepercentage) / tc_timepercentage;
		tc_precexp = fls(t + (t >> 1)) - 1;
		FREQ2BT(hz / tc_tick, &bt_timethreshold);
		FREQ2BT(hz, &bt_tickthreshold);
		bintime_shift(&bt_timethreshold, tc_precexp);
		bintime_shift(&bt_tickthreshold, tc_precexp);
	} else {
		tc_precexp = 31;
		bt_timethreshold.sec = INT_MAX;
		bt_timethreshold.frac = ~(uint64_t)0;
		bt_tickthreshold = bt_timethreshold;
	}
	sbt_timethreshold = bttosbt(bt_timethreshold);
	sbt_tickthreshold = bttosbt(bt_tickthreshold);
}

static int
sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = tc_timepercentage;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	tc_timepercentage = val;
	if (cold)
		goto done;
	tc_adjprecision();
done:
	return (0);
}

static void
inittimecounter(void *dummy)
{
	u_int p;
	int tick_rate;

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	tc_adjprecision();
	FREQ2BT(hz, &tick_bt);
	tick_sbt = bttosbt(tick_bt);
	tick_rate = hz / tc_tick;
	FREQ2BT(tick_rate, &tc_tick_bt);
	tc_tick_sbt = bttosbt(tc_tick_bt);
	p = (tc_tick * 1000000) / hz;
	printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);

#ifdef FFCLOCK
	ffclock_init();
#endif
	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
	tc_windup();
}

SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL);

/* Cpu tick handling -------------------------------------------------*/

static int cpu_tick_variable;
static uint64_t	cpu_tick_frequency;

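/*
 * Fallback cpu_ticks implementation: extend the current timecounter to a
 * monotonically increasing 64-bit value, accumulating a full counter wrap
 * into 'base' whenever the masked reading goes backwards.
 */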
static uint64_t
tc_cpu_ticks(void)
{
	static uint64_t base;
	static unsigned last;
	unsigned u;
	struct timecounter *tc;

	tc = timehands->th_counter;
	u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
	if (u < last)
		base += (uint64_t)tc->tc_counter_mask + 1;
	last = u;
	return (u + base);
}

void
cpu_tick_calibration(void)
{
	static time_t last_calib;

	if (time_uptime != last_calib && !(time_uptime & 0xf)) {
		cpu_tick_calibrate(0);
		last_calib = time_uptime;
	}
}

/*
 * This function gets called every 16 seconds on only one designated
 * CPU in the system from hardclock() via cpu_tick_calibration().
 *
 * Whenever the real time clock is stepped we get called with reset=1
 * to make sure we handle suspend/resume and similar events correctly.
 */

static void
cpu_tick_calibrate(int reset)
{
	static uint64_t c_last;
	uint64_t c_this, c_delta;
	static struct bintime  t_last;
	struct bintime t_this, t_delta;
	uint32_t divi;

	if (reset) {
		/* The clock was stepped, abort & reset */
		t_last.sec = 0;
		return;
	}

	/* we don't calibrate fixed rate cputicks */
	if (!cpu_tick_variable)
		return;

	getbinuptime(&t_this);
	c_this = cpu_ticks();
	if (t_last.sec != 0) {
		c_delta = c_this - c_last;
		t_delta = t_this;
		bintime_sub(&t_delta, &t_last);
		/*
		 * Headroom:
		 * 	2^(64-20) / 16[s] =
		 * 	2^(44) / 16[s] =
		 * 	17.592.186.044.416 / 16 =
		 * 	1.099.511.627.776 [Hz]
		 */
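		/*
		 * divi holds the elapsed time in 2^-20 second units, so the
		 * shifted quotient below comes out directly in Hz.
		 */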
		divi = t_delta.sec << 20;
		divi |= t_delta.frac >> (64 - 20);
		c_delta <<= 20;
		c_delta /= divi;
		if (c_delta > cpu_tick_frequency) {
			if (0 && bootverbose)
				printf("cpu_tick increased to %ju Hz\n",
				    (uintmax_t)c_delta);
			cpu_tick_frequency = c_delta;
		}
	}
	c_last = c_this;
	t_last = t_this;
}

void
set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
{

	if (func == NULL) {
		cpu_ticks = tc_cpu_ticks;
	} else {
		cpu_tick_frequency = freq;
		cpu_tick_variable = var;
		cpu_ticks = func;
	}
}

uint64_t
cpu_tickrate(void)
{

	if (cpu_ticks == tc_cpu_ticks)
		return (tc_getfrequency());
	return (cpu_tick_frequency);
}

/*
 * We need to be slightly careful converting cputicks to microseconds.
 * There is plenty of margin in 64 bits of microseconds (half a million
 * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
 * before divide conversion (to retain precision) we find that the
 * margin shrinks to 1.5 hours (one millionth of 146y).
 * With a three prong approach we never lose significant bits, no
 * matter what the cputick rate and length of timeinterval is.
 */

uint64_t
cputick2usec(uint64_t tick)
{

	if (tick > 18446744073709551LL)		/* floor(2^64 / 1000) */
		return (tick / (cpu_tickrate() / 1000000LL));
	else if (tick > 18446744073709LL)	/* floor(2^64 / 1000000) */
		return ((tick * 1000LL) / (cpu_tickrate() / 1000LL));
	else
		return ((tick * 1000000LL) / cpu_tickrate());
}
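
/*
 * For instance, at cpu_tickrate() = 3 GHz a tick count of 10^9 falls in the
 * last branch above: (10^9 * 10^6) / (3 * 10^9) = 333333 us.
 */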

cpu_tick_f	*cpu_ticks = tc_cpu_ticks;

static int vdso_th_enable = 1;
static int
sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
{
	int old_vdso_th_enable, error;

	old_vdso_th_enable = vdso_th_enable;
	error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req);
	if (error != 0)
		return (error);
	vdso_th_enable = old_vdso_th_enable;
	return (0);
}
SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");

uint32_t
tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
{
	struct timehands *th;
	uint32_t enabled;

	th = timehands;
	vdso_th->th_algo = VDSO_TH_ALGO_1;
	vdso_th->th_scale = th->th_scale;
	vdso_th->th_offset_count = th->th_offset_count;
	vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
	vdso_th->th_offset = th->th_offset;
	vdso_th->th_boottime = boottimebin;
	enabled = cpu_fill_vdso_timehands(vdso_th, th->th_counter);
	if (!vdso_th_enable)
		enabled = 0;
	return (enabled);
}

#ifdef COMPAT_FREEBSD32
uint32_t
tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
{
	struct timehands *th;
	uint32_t enabled;

	th = timehands;
	vdso_th32->th_algo = VDSO_TH_ALGO_1;
	*(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
	vdso_th32->th_offset_count = th->th_offset_count;
	vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask;
	vdso_th32->th_offset.sec = th->th_offset.sec;
	*(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
	vdso_th32->th_boottime.sec = boottimebin.sec;
	*(uint64_t *)&vdso_th32->th_boottime.frac[0] = boottimebin.frac;
	enabled = cpu_fill_vdso_timehands32(vdso_th32, th->th_counter);
	if (!vdso_th_enable)
		enabled = 0;
	return (enabled);
}
#endif