kern_tc.c revision 118842
/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_tc.c 118842 2003-08-12 20:34:31Z mux $");

#include "opt_ntp.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>

/*
 * A large step happens on boot.  This constant detects such steps.  It is
 * relatively small so that ntp_update_second gets called enough in the
 * typical 'missed a couple of seconds' case, but doesn't loop forever when
 * the time step is large.
 */
#define LARGE_STEP	200

/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return (++now);
}

static struct timecounter dummy_timecounter = {
	dummy_get_timecount, 0, ~0u, 1000000, "dummy",
};
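
/*
 * The positional initializer above supplies, in the order this file uses
 * the fields: the get_timecount method, no poll_pps method, an all-ones
 * counter mask and a nominal 1 MHz frequency, under the name "dummy".
 */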

struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter	*th_counter;
	int64_t			th_adjustment;
	u_int64_t		th_scale;
	u_int	 		th_offset_count;
	struct bintime		th_offset;
	struct timeval		th_microtime;
	struct timespec		th_nanotime;
	/* Fields not to be copied in tc_windup start with th_generation. */
	volatile u_int		th_generation;
	struct timehands	*th_next;
};

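/*
 * The ten timehands below form a statically allocated ring,
 * th0 -> th1 -> ... -> th9 -> th0.  tc_windup() advances the global
 * "timehands" pointer around this ring, so a lock-free reader that is
 * preempted while holding a pointer to one element has several updates'
 * worth of time before that element is overwritten again.
 */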
static struct timehands th0;
static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7};
static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6};
static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5};
static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4};
static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3};
static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2};
static struct timehands th0 = {
	&dummy_timecounter,
	0,
	(uint64_t)-1 / 1000000,
	0,
	{1, 0},
	{0, 0},
	{0, 0},
	1,
	&th1
};
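
/*
 * th0 starts out on the dummy timecounter: its scale, (2^64 - 1) / 10^6,
 * is roughly one microsecond expressed in 2^-64 fractions of a second,
 * matching the dummy counter's nominal 1 MHz rate, and its offset starts
 * at one second, consistent with the initial value of time_second below.
 */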

static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;

time_t time_second = 1;
time_t time_uptime = 0;

static struct bintime boottimebin;
struct timeval boottime;
SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD,
    &boottime, timeval, "System boottime");

SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");

#define TC_STATS(foo) \
	static u_int foo; \
	SYSCTL_UINT(_kern_timecounter, OID_AUTO, foo, CTLFLAG_RD, &foo, 0, "");\
	struct __hack

TC_STATS(nbinuptime);    TC_STATS(nnanouptime);    TC_STATS(nmicrouptime);
TC_STATS(nbintime);      TC_STATS(nnanotime);      TC_STATS(nmicrotime);
TC_STATS(ngetbinuptime); TC_STATS(ngetnanouptime); TC_STATS(ngetmicrouptime);
TC_STATS(ngetbintime);   TC_STATS(ngetnanotime);   TC_STATS(ngetmicrotime);
TC_STATS(nsetclock);

#undef TC_STATS

static void tc_windup(void);

/*
 * Return the difference between the timehands' counter value now and what
 * was when we copied it to the timehands' offset_count.
 */
static __inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
	    tc->tc_counter_mask);
}

/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/time.h> for a description of these 12 functions.
 */

void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	nbinuptime++;
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);
}

void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	nnanouptime++;
	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	nmicrouptime++;
	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{

	nbintime++;
	binuptime(bt);
	bintime_add(bt, &boottimebin);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	nnanotime++;
	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	nmicrotime++;
	bintime(&bt);
	bintime2timeval(&bt, tvp);
}


void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	ngetbinuptime++;
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	ngetnanouptime++;
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	ngetmicrouptime++;
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	ngetbintime++;
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &boottimebin);
}

void
getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	ngetnanotime++;
	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	ngetmicrotime++;
	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}


/*
 * Initialize a new timecounter.
 * We should really try to rank the timecounters and intelligently determine
 * if the new timecounter is better than the current one.  This is subject
 * to further study.  For now always use the new timecounter.
 */
void
tc_init(struct timecounter *tc)
{
	unsigned u;

	printf("Timecounter \"%s\" frequency %ju Hz",
	    tc->tc_name, (uintmax_t)tc->tc_frequency);

	u = tc->tc_frequency / tc->tc_counter_mask;
	if (u > hz) {
		printf(" -- Insufficient hz, needs at least %u\n", u);
		return;
	}
	tc->tc_next = timecounters;
	timecounters = tc;
	printf("\n");
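	/* Warm up the new timecounter. */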
	(void)tc->tc_get_timecount(tc);
	(void)tc->tc_get_timecount(tc);
	timecounter = tc;
}

/* Report the frequency of the current timecounter. */
u_int64_t
tc_getfrequency(void)
{

	return (timehands->th_counter->tc_frequency);
}

/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.  XXX: needs further work.
 */
void
tc_setclock(struct timespec *ts)
{
	struct timespec ts2;

	nsetclock++;
	nanouptime(&ts2);
	boottime.tv_sec = ts->tv_sec - ts2.tv_sec;
	/* XXX boottime should probably be a timespec. */
	boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000;
	if (boottime.tv_usec < 0) {
		boottime.tv_usec += 1000000;
		boottime.tv_sec--;
	}
	timeval2bintime(&boottime, &boottimebin);

	/* XXX fiddle all the little crinkly bits around the fiords... */
	tc_windup();
}

/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(void)
{
	struct bintime bt;
	struct timehands *th, *tho;
	u_int64_t scale;
	u_int delta, ncount, ogen;
	int i;
	time_t t;

	/*
	 * Make the next timehands a copy of the current one, but do not
	 * overwrite the generation or next pointer.  While we update
	 * the contents, the generation must be zero.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	bcopy(tho, th, offsetof(struct timehands, th_generation));

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
	th->th_offset_count += delta;
	th->th_offset_count &= th->th_counter->tc_counter_mask;
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Compute the UTC time, before any leap second adjustments are made.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &boottimebin);

	/*
	 * Deal with NTP second processing.  The for loop normally only
	 * iterates once, but in extreme situations it might keep NTP sane
	 * if timeouts are not run for several seconds.  At boot, the
	 * time step can be large when the TOD hardware has been read, so
	 * on really large steps, we call ntp_update_second only once.
	 */
	for (i = bt.sec - tho->th_microtime.tv_sec; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		/* Fold any step (e.g. a leap second) into the boot time. */
		if (bt.sec != t)
			boottimebin.sec += bt.sec - t;
		if (i > LARGE_STEP)
			break;
	}

	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
	}

	/*-
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, but that
	 * leaves suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 *
	 */
	scale = (u_int64_t)1 << 63;
	scale += (th->th_adjustment / 1024) * 2199;
	scale /= th->th_counter->tc_frequency;
	th->th_scale = scale * 2;
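
	/*
	 * As a quick check of the figures above: 2199/512 = 4.294921875
	 * versus 2^32/10^9 = 4.294967296, so the approximation is low by
	 * about 1.06e-5, i.e. the ~10PPM undercompensation noted above.
	 * Starting from 2^63 and doubling the result afterwards is what
	 * sacrifices the lowest bit of th_scale.
	 */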

	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);

	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.
	 */
	if (++ogen == 0)
		ogen = 1;
	th->th_generation = ogen;

	/* Go live with the new struct timehands. */
	time_second = th->th_microtime.tv_sec;
	time_uptime = th->th_offset.sec;
	timehands = th;
}

/* Report or change the active timecounter hardware. */
static int
sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
{
	char newname[32];
	struct timecounter *newtc, *tc;
	int error;

	tc = timecounter;
	strlcpy(newname, tc->tc_name, sizeof(newname));

	error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
	if (error != 0 || req->newptr == NULL ||
	    strcmp(newname, tc->tc_name) == 0)
		return (error);
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;

		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);

		timecounter = newtc;
		return (0);
	}
	return (EINVAL);
}

SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
    0, 0, sysctl_kern_timecounter_hardware, "A", "");

/*
 * RFC 2783 PPS-API implementation.
 */

int
pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
{
	pps_params_t *app;
	struct pps_fetch_args *fapi;
#ifdef PPS_SYNC
	struct pps_kcbind_args *kapi;
#endif

	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		fapi = (struct pps_fetch_args *)data;
		if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec)
			return (EOPNOTSUPP);
		pps->ppsinfo.current_mode = pps->ppsparam.mode;
		fapi->pps_info_buf = pps->ppsinfo;
		return (0);
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		kapi = (struct pps_kcbind_args *)data;
		/* XXX Only root should be able to do this */
		if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (kapi->kernel_consumer != PPS_KC_HARDPPS)
			return (EINVAL);
		if (kapi->edge & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = kapi->edge;
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (ENOTTY);
	}
}

void
pps_init(struct pps_state *pps)
{
	pps->ppscap |= PPS_TSFMT_TSPEC;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
}

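/*
 * pps_capture() does only the minimum work needed at the instant a PPS
 * pulse is detected: it snapshots the current timehands, its generation
 * and the raw counter value.  pps_event() later turns that snapshot into
 * timestamps.
 */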
void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	th = timehands;
	pps->capgen = th->th_generation;
	pps->capth = th;
	pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}

void
pps_event(struct pps_state *pps, int event)
{
	struct bintime bt;
	struct timespec ts, *tsp, *osp;
	u_int tcount, *pcount;
	int foff, fhard;
	pps_seq_t *pseq;

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation)
		return;

	/* Things would be easier with arrays. */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
	}

	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		*pcount = pps->capcount;
		pps->ppscount[2] = pps->capcount;
		return;
	}

	/* Return if nothing really happened. */
	if (*pcount == pps->capcount)
		return;

	/* Convert the count to a timespec. */
	tcount = pps->capcount - pps->capth->th_offset_count;
	tcount &= pps->capth->th_counter->tc_counter_mask;
	bt = pps->capth->th_offset;
	bintime_addx(&bt, pps->capth->th_scale * tcount);
	bintime_add(&bt, &boottimebin);
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen != pps->capth->th_generation)
		return;

	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	if (foff) {
		timespecadd(tsp, osp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}
#ifdef PPS_SYNC
	if (fhard) {
		u_int64_t scale;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event.
		 */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;
		scale = (u_int64_t)1 << 63;
		scale /= pps->capth->th_counter->tc_frequency;
		scale *= 2;
		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);
		hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
	}
#endif
}

/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;
SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0, "");

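/*
 * Called from hardclock() on every tick; every tc_tick-th call lets
 * tc_windup() refresh the timehands.
 */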
void
tc_ticktock(void)
{
	static int count;

	if (++count < tc_tick)
		return;
	count = 0;
	tc_windup();
}

static void
inittimecounter(void *dummy)
{
	u_int p;

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	p = (tc_tick * 1000000) / hz;
	printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);

	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
}

SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL)