14Srgrimes/*-
2110379Sphk * Copyright (c) 1998-2003 Poul-Henning Kamp
34Srgrimes * All rights reserved.
44Srgrimes *
54Srgrimes * Redistribution and use in source and binary forms, with or without
64Srgrimes * modification, are permitted provided that the following conditions
74Srgrimes * are met:
84Srgrimes * 1. Redistributions of source code must retain the above copyright
94Srgrimes *    notice, this list of conditions and the following disclaimer.
104Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
114Srgrimes *    notice, this list of conditions and the following disclaimer in the
124Srgrimes *    documentation and/or other materials provided with the distribution.
134Srgrimes *
14110379Sphk * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
154Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
164Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17110379Sphk * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
184Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
194Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
204Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
214Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
224Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
234Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
244Srgrimes * SUCH DAMAGE.
254Srgrimes */
264Srgrimes
27115683Sobrien#include <sys/cdefs.h>
28115683Sobrien__FBSDID("$FreeBSD: stable/11/sys/x86/x86/tsc.c 363433 2020-07-22 21:17:02Z jkim $");
29115683Sobrien
30237433Skib#include "opt_compat.h"
3116299Spst#include "opt_clock.h"
3213228Swollman
332056Swollman#include <sys/param.h>
34167905Snjl#include <sys/bus.h>
35167905Snjl#include <sys/cpu.h>
36221214Sjkim#include <sys/limits.h>
37167905Snjl#include <sys/malloc.h>
382056Swollman#include <sys/systm.h>
39113348Sdes#include <sys/sysctl.h>
402056Swollman#include <sys/time.h>
4158377Sphk#include <sys/timetc.h>
422056Swollman#include <sys/kernel.h>
4385835Siwasaki#include <sys/power.h>
44113348Sdes#include <sys/smp.h>
45237433Skib#include <sys/vdso.h>
464180Sbde#include <machine/clock.h>
47216272Sjkim#include <machine/cputypes.h>
4832054Sphk#include <machine/md_var.h>
4932054Sphk#include <machine/specialreg.h>
50273800Sjhb#include <x86/vmware.h>
51305866Skib#include <dev/acpica/acpi_hpet.h>
5215508Sbde
53167905Snjl#include "cpufreq_if.h"
54167905Snjl
55216163Sjkimuint64_t	tsc_freq;
56184102Sjkimint		tsc_is_invariant;
57220579Sjkimint		tsc_perf_stat;
58220579Sjkim
59167905Snjlstatic eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag;
601390Ssos
61184102SjkimSYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
62184108Sjkim    &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
63184102Sjkim
64113348Sdes#ifdef SMP
65249324Sneelint	smp_tsc;
66121307SsilbySYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0,
67113348Sdes    "Indicates whether the TSC is safe to use in SMP mode");
68249625Smav
69249625Smavint	smp_tsc_adjust = 0;
70249625SmavSYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN,
71249625Smav    &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP");
72113348Sdes#endif
73113348Sdes
74246212Skibstatic int	tsc_shift = 1;
75246212SkibSYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN,
76246212Skib    &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency");
77246212Skib
78219473Sjkimstatic int	tsc_disabled;
79219473SjkimSYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
80219473Sjkim    "Disable x86 Time Stamp Counter");
81219473Sjkim
82220577Sjkimstatic int	tsc_skip_calibration;
83220577SjkimSYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
84220577Sjkim    &tsc_skip_calibration, 0, "Disable TSC frequency calibration");
85220577Sjkim
86167905Snjlstatic void tsc_freq_changed(void *arg, const struct cf_level *level,
87167905Snjl    int status);
88167905Snjlstatic void tsc_freq_changing(void *arg, const struct cf_level *level,
89167905Snjl    int *status);
90222866Sjkimstatic unsigned tsc_get_timecount(struct timecounter *tc);
91238973Skibstatic inline unsigned tsc_get_timecount_low(struct timecounter *tc);
92238973Skibstatic unsigned tsc_get_timecount_lfence(struct timecounter *tc);
93238973Skibstatic unsigned tsc_get_timecount_low_lfence(struct timecounter *tc);
94238973Skibstatic unsigned tsc_get_timecount_mfence(struct timecounter *tc);
95238973Skibstatic unsigned tsc_get_timecount_low_mfence(struct timecounter *tc);
96167905Snjlstatic void tsc_levels_changed(void *arg, int unit);
97305866Skibstatic uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
98305866Skib    struct timecounter *tc);
99305866Skib#ifdef COMPAT_FREEBSD32
100305866Skibstatic uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
101305866Skib    struct timecounter *tc);
102305866Skib#endif
10317353Sbde
10440610Sphkstatic struct timecounter tsc_timecounter = {
105305866Skib	.tc_get_timecount =		tsc_get_timecount,
106305866Skib	.tc_counter_mask =		~0u,
107305866Skib	.tc_name =			"TSC",
108305866Skib	.tc_quality =			800,	/* adjusted in code */
109305866Skib	.tc_fill_vdso_timehands = 	x86_tsc_vdso_timehands,
110305866Skib#ifdef COMPAT_FREEBSD32
111305866Skib	.tc_fill_vdso_timehands32 = 	x86_tsc_vdso_timehands32,
112305866Skib#endif
11333690Sphk};
11433690Sphk
115273800Sjhbstatic void
116221214Sjkimtsc_freq_vmware(void)
117221214Sjkim{
118221214Sjkim	u_int regs[4];
119221214Sjkim
120221214Sjkim	if (hv_high >= 0x40000010) {
121221214Sjkim		do_cpuid(0x40000010, regs);
122221214Sjkim		tsc_freq = regs[0] * 1000;
123221214Sjkim	} else {
124221214Sjkim		vmware_hvcall(VMW_HVCMD_GETHZ, regs);
125221214Sjkim		if (regs[1] != UINT_MAX)
126221214Sjkim			tsc_freq = regs[0] | ((uint64_t)regs[1] << 32);
127221214Sjkim	}
128221214Sjkim	tsc_is_invariant = 1;
129221214Sjkim}
130221214Sjkim
131333161Skib/*
132333161Skib * Calculate TSC frequency using information from the CPUID leaf 0x15
133353007Skib * 'Time Stamp Counter and Nominal Core Crystal Clock'.  If leaf 0x15
134353007Skib * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor
135353007Skib * Frequency Information'.  Leaf 0x16 is described in the SDM as
136353007Skib * informational only, but if 0x15 did not work, and TSC calibration
137353007Skib * is disabled, it is the best we can get at all.  It should still be
138333161Skib * an improvement over the parsing of the CPU model name in
139333161Skib * tsc_freq_intel(), when available.
140333161Skib */
141333161Skibstatic bool
142333161Skibtsc_freq_cpuid(void)
143333161Skib{
144333161Skib	u_int regs[4];
145333161Skib
146333161Skib	if (cpu_high < 0x15)
147333161Skib		return (false);
148333161Skib	do_cpuid(0x15, regs);
149353007Skib	if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) {
150353007Skib		tsc_freq = (uint64_t)regs[2] * regs[1] / regs[0];
151353007Skib		return (true);
152353007Skib	}
153353007Skib
154353007Skib	if (cpu_high < 0x16)
155333161Skib		return (false);
156353007Skib	do_cpuid(0x16, regs);
157353007Skib	if (regs[0] != 0) {
158353007Skib		tsc_freq = (uint64_t)regs[0] * 1000000;
159353007Skib		return (true);
160353007Skib	}
161353007Skib
162353007Skib	return (false);
163333161Skib}
164333161Skib
165220577Sjkimstatic void
166220577Sjkimtsc_freq_intel(void)
1671390Ssos{
168220577Sjkim	char brand[48];
169220577Sjkim	u_int regs[4];
170220577Sjkim	uint64_t freq;
171220577Sjkim	char *p;
172220577Sjkim	u_int i;
1731390Ssos
174220577Sjkim	/*
175220577Sjkim	 * Intel Processor Identification and the CPUID Instruction
176220577Sjkim	 * Application Note 485.
177220577Sjkim	 * http://www.intel.com/assets/pdf/appnote/241618.pdf
178220577Sjkim	 */
179220577Sjkim	if (cpu_exthigh >= 0x80000004) {
180220577Sjkim		p = brand;
181220577Sjkim		for (i = 0x80000002; i < 0x80000005; i++) {
182220577Sjkim			do_cpuid(i, regs);
183220577Sjkim			memcpy(p, regs, sizeof(regs));
184220577Sjkim			p += sizeof(regs);
185220577Sjkim		}
186220577Sjkim		p = NULL;
187220577Sjkim		for (i = 0; i < sizeof(brand) - 1; i++)
188220577Sjkim			if (brand[i] == 'H' && brand[i + 1] == 'z')
189220577Sjkim				p = brand + i;
190220577Sjkim		if (p != NULL) {
191220577Sjkim			p -= 5;
192220577Sjkim			switch (p[4]) {
193220577Sjkim			case 'M':
194220577Sjkim				i = 1;
195220577Sjkim				break;
196220577Sjkim			case 'G':
197220577Sjkim				i = 1000;
198220577Sjkim				break;
199220577Sjkim			case 'T':
200220577Sjkim				i = 1000000;
201220577Sjkim				break;
202220577Sjkim			default:
203220577Sjkim				return;
204220577Sjkim			}
205220577Sjkim#define	C2D(c)	((c) - '0')
206220577Sjkim			if (p[1] == '.') {
207220577Sjkim				freq = C2D(p[0]) * 1000;
208220577Sjkim				freq += C2D(p[2]) * 100;
209220577Sjkim				freq += C2D(p[3]) * 10;
210220577Sjkim				freq *= i * 1000;
211220577Sjkim			} else {
212220577Sjkim				freq = C2D(p[0]) * 1000;
213220577Sjkim				freq += C2D(p[1]) * 100;
214220577Sjkim				freq += C2D(p[2]) * 10;
215220577Sjkim				freq += C2D(p[3]);
216220577Sjkim				freq *= i * 1000000;
217220577Sjkim			}
218220577Sjkim#undef C2D
219220577Sjkim			tsc_freq = freq;
220220577Sjkim		}
221220577Sjkim	}
222220577Sjkim}
22332054Sphk
224220577Sjkimstatic void
225220577Sjkimprobe_tsc_freq(void)
226220577Sjkim{
227220579Sjkim	u_int regs[4];
228220577Sjkim	uint64_t tsc1, tsc2;
22915508Sbde
230221214Sjkim	if (cpu_high >= 6) {
231221214Sjkim		do_cpuid(6, regs);
232221214Sjkim		if ((regs[2] & CPUID_PERF_STAT) != 0) {
233221214Sjkim			/*
234221214Sjkim			 * XXX Some emulators expose host CPUID without actual
235221214Sjkim			 * support for these MSRs.  We must test whether they
236221214Sjkim			 * really work.
237221214Sjkim			 */
238221214Sjkim			wrmsr(MSR_MPERF, 0);
239221214Sjkim			wrmsr(MSR_APERF, 0);
240221214Sjkim			DELAY(10);
241221214Sjkim			if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0)
242221214Sjkim				tsc_perf_stat = 1;
243221214Sjkim		}
244221214Sjkim	}
245221214Sjkim
246273800Sjhb	if (vm_guest == VM_GUEST_VMWARE) {
247273800Sjhb		tsc_freq_vmware();
248221214Sjkim		return;
249273800Sjhb	}
250221214Sjkim
251216272Sjkim	switch (cpu_vendor_id) {
252216272Sjkim	case CPU_VENDOR_AMD:
253219469Sjkim		if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
254219469Sjkim		    (vm_guest == VM_GUEST_NO &&
255219469Sjkim		    CPUID_TO_FAMILY(cpu_id) >= 0x10))
256216272Sjkim			tsc_is_invariant = 1;
257238973Skib		if (cpu_feature & CPUID_SSE2) {
258238973Skib			tsc_timecounter.tc_get_timecount =
259238973Skib			    tsc_get_timecount_mfence;
260238973Skib		}
261216272Sjkim		break;
262216272Sjkim	case CPU_VENDOR_INTEL:
263219469Sjkim		if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
264219469Sjkim		    (vm_guest == VM_GUEST_NO &&
265219469Sjkim		    ((CPUID_TO_FAMILY(cpu_id) == 0x6 &&
266216272Sjkim		    CPUID_TO_MODEL(cpu_id) >= 0xe) ||
267216272Sjkim		    (CPUID_TO_FAMILY(cpu_id) == 0xf &&
268219469Sjkim		    CPUID_TO_MODEL(cpu_id) >= 0x3))))
269216272Sjkim			tsc_is_invariant = 1;
270238973Skib		if (cpu_feature & CPUID_SSE2) {
271238973Skib			tsc_timecounter.tc_get_timecount =
272238973Skib			    tsc_get_timecount_lfence;
273238973Skib		}
274216272Sjkim		break;
275216272Sjkim	case CPU_VENDOR_CENTAUR:
276219469Sjkim		if (vm_guest == VM_GUEST_NO &&
277219469Sjkim		    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
278216272Sjkim		    CPUID_TO_MODEL(cpu_id) >= 0xf &&
279216272Sjkim		    (rdmsr(0x1203) & 0x100000000ULL) == 0)
280216272Sjkim			tsc_is_invariant = 1;
281238973Skib		if (cpu_feature & CPUID_SSE2) {
282238973Skib			tsc_timecounter.tc_get_timecount =
283238973Skib			    tsc_get_timecount_lfence;
284238973Skib		}
285216272Sjkim		break;
286216272Sjkim	}
287216272Sjkim
288220577Sjkim	if (tsc_skip_calibration) {
289333161Skib		if (tsc_freq_cpuid())
290333161Skib			;
291333161Skib		else if (cpu_vendor_id == CPU_VENDOR_INTEL)
292220577Sjkim			tsc_freq_intel();
293333161Skib	} else {
294333161Skib		if (bootverbose)
295333161Skib			printf("Calibrating TSC clock ... ");
296333161Skib		tsc1 = rdtsc();
297333161Skib		DELAY(1000000);
298333161Skib		tsc2 = rdtsc();
299333161Skib		tsc_freq = tsc2 - tsc1;
300220577Sjkim	}
301220577Sjkim	if (bootverbose)
302220577Sjkim		printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq);
303220577Sjkim}
304220577Sjkim
305220577Sjkimvoid
306220577Sjkiminit_TSC(void)
307220577Sjkim{
308220577Sjkim
309220577Sjkim	if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
310220577Sjkim		return;
311220577Sjkim
312271082Sjhb#ifdef __i386__
313271082Sjhb	/* The TSC is known to be broken on certain CPUs. */
314271082Sjhb	switch (cpu_vendor_id) {
315271082Sjhb	case CPU_VENDOR_AMD:
316271082Sjhb		switch (cpu_id & 0xFF0) {
317271082Sjhb		case 0x500:
318271082Sjhb			/* K5 Model 0 */
319271082Sjhb			return;
320271082Sjhb		}
321271082Sjhb		break;
322271082Sjhb	case CPU_VENDOR_CENTAUR:
323271082Sjhb		switch (cpu_id & 0xff0) {
324271082Sjhb		case 0x540:
325271082Sjhb			/*
326271082Sjhb			 * http://www.centtech.com/c6_data_sheet.pdf
327271082Sjhb			 *
328271082Sjhb			 * I-12 RDTSC may return incoherent values in EDX:EAX
329271082Sjhb			 * I-13 RDTSC hangs when certain event counters are used
330271082Sjhb			 */
331271082Sjhb			return;
332271082Sjhb		}
333271082Sjhb		break;
334271082Sjhb	case CPU_VENDOR_NSC:
335271082Sjhb		switch (cpu_id & 0xff0) {
336271082Sjhb		case 0x540:
337271082Sjhb			if ((cpu_id & CPUID_STEPPING) == 0)
338271082Sjhb				return;
339271082Sjhb			break;
340271082Sjhb		}
341271082Sjhb		break;
342271082Sjhb	}
343271082Sjhb#endif
344271082Sjhb
345220577Sjkim	probe_tsc_freq();
346220577Sjkim
347167905Snjl	/*
348216274Sjkim	 * Inform CPU accounting about our boot-time clock rate.  This will
349216274Sjkim	 * be updated if someone loads a cpufreq driver after boot that
350216274Sjkim	 * discovers a new max frequency.
351167905Snjl	 */
352220577Sjkim	if (tsc_freq != 0)
353221178Sjkim		set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
354167905Snjl
355216274Sjkim	if (tsc_is_invariant)
356216274Sjkim		return;
357216274Sjkim
358167905Snjl	/* Register to find out about changes in CPU frequency. */
359184108Sjkim	tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change,
360184108Sjkim	    tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST);
361167905Snjl	tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change,
362167905Snjl	    tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST);
363167905Snjl	tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed,
364167905Snjl	    tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY);
365118550Sphk}
36634617Sphk
367221703Sjkim#ifdef SMP
368221703Sjkim
369238973Skib/*
370238973Skib * RDTSC is not a serializing instruction, and does not drain
371238973Skib * instruction stream, so we need to drain the stream before executing
372238973Skib * it.  It could be fixed by use of RDTSCP, except the instruction is
373238973Skib * not available everywhere.
374238973Skib *
375238973Skib * Use CPUID for draining in the boot-time SMP constistency test.  The
376238973Skib * timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel
377238973Skib * and VIA) when SSE2 is present, and nothing on older machines which
378238973Skib * also do not issue RDTSC prematurely.  There, testing for SSE2 and
379238975Skib * vendor is too cumbersome, and we learn about TSC presence from CPUID.
380238973Skib *
381238973Skib * Do not use do_cpuid(), since we do not need CPUID results, which
382238973Skib * have to be written into memory with do_cpuid().
383238973Skib */
384238973Skib#define	TSC_READ(x)							\
385238973Skibstatic void								\
386238973Skibtsc_read_##x(void *arg)							\
387238973Skib{									\
388239133Sjimharris	uint64_t *tsc = arg;						\
389238973Skib	u_int cpu = PCPU_GET(cpuid);					\
390238973Skib									\
391238973Skib	__asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx");	\
392239133Sjimharris	tsc[cpu * 3 + x] = rdtsc();					\
393221703Sjkim}
394221703SjkimTSC_READ(0)
395221703SjkimTSC_READ(1)
396221703SjkimTSC_READ(2)
397221703Sjkim#undef TSC_READ
398221703Sjkim
399221703Sjkim#define	N	1000
400221703Sjkim
401221703Sjkimstatic void
402221703Sjkimcomp_smp_tsc(void *arg)
403221703Sjkim{
404239133Sjimharris	uint64_t *tsc;
405239133Sjimharris	int64_t d1, d2;
406221703Sjkim	u_int cpu = PCPU_GET(cpuid);
407221703Sjkim	u_int i, j, size;
408221703Sjkim
409221703Sjkim	size = (mp_maxid + 1) * 3;
410221703Sjkim	for (i = 0, tsc = arg; i < N; i++, tsc += size)
411221703Sjkim		CPU_FOREACH(j) {
412221703Sjkim			if (j == cpu)
413221703Sjkim				continue;
414221703Sjkim			d1 = tsc[cpu * 3 + 1] - tsc[j * 3];
415221703Sjkim			d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1];
416221703Sjkim			if (d1 <= 0 || d2 <= 0) {
417221703Sjkim				smp_tsc = 0;
418221703Sjkim				return;
419221703Sjkim			}
420221703Sjkim		}
421221703Sjkim}
422221703Sjkim
423249625Smavstatic void
424249625Smavadj_smp_tsc(void *arg)
425249625Smav{
426249625Smav	uint64_t *tsc;
427249625Smav	int64_t d, min, max;
428249625Smav	u_int cpu = PCPU_GET(cpuid);
429249625Smav	u_int first, i, size;
430249625Smav
431249625Smav	first = CPU_FIRST();
432249625Smav	if (cpu == first)
433249625Smav		return;
434249625Smav	min = INT64_MIN;
435249625Smav	max = INT64_MAX;
436249625Smav	size = (mp_maxid + 1) * 3;
437249625Smav	for (i = 0, tsc = arg; i < N; i++, tsc += size) {
438249625Smav		d = tsc[first * 3] - tsc[cpu * 3 + 1];
439249625Smav		if (d > min)
440249625Smav			min = d;
441249625Smav		d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2];
442249625Smav		if (d > min)
443249625Smav			min = d;
444249625Smav		d = tsc[first * 3 + 1] - tsc[cpu * 3];
445249625Smav		if (d < max)
446249625Smav			max = d;
447249625Smav		d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1];
448249625Smav		if (d < max)
449249625Smav			max = d;
450249625Smav	}
451249625Smav	if (min > max)
452249625Smav		return;
453249625Smav	d = min / 2 + max / 2;
454249625Smav	__asm __volatile (
455249625Smav		"movl $0x10, %%ecx\n\t"
456249625Smav		"rdmsr\n\t"
457249625Smav		"addl %%edi, %%eax\n\t"
458249625Smav		"adcl %%esi, %%edx\n\t"
459249625Smav		"wrmsr\n"
460249625Smav		: /* No output */
461249625Smav		: "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32))
462249625Smav		: "ax", "cx", "dx", "cc"
463249625Smav	);
464249625Smav}
465249625Smav
466221703Sjkimstatic int
467335657Savgtest_tsc(int adj_max_count)
468221703Sjkim{
469239133Sjimharris	uint64_t *data, *tsc;
470249625Smav	u_int i, size, adj;
471221703Sjkim
472246212Skib	if ((!smp_tsc && !tsc_is_invariant) || vm_guest)
473221703Sjkim		return (-100);
474221703Sjkim	size = (mp_maxid + 1) * 3;
475221703Sjkim	data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK);
476249625Smav	adj = 0;
477249625Smavretry:
478221703Sjkim	for (i = 0, tsc = data; i < N; i++, tsc += size)
479221703Sjkim		smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc);
480221703Sjkim	smp_tsc = 1;	/* XXX */
481328386Spkelsey	smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc,
482328386Spkelsey	    smp_no_rendezvous_barrier, data);
483335657Savg	if (!smp_tsc && adj < adj_max_count) {
484249625Smav		adj++;
485328386Spkelsey		smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc,
486328386Spkelsey		    smp_no_rendezvous_barrier, data);
487249625Smav		goto retry;
488249625Smav	}
489221703Sjkim	free(data, M_TEMP);
490221703Sjkim	if (bootverbose)
491249625Smav		printf("SMP: %sed TSC synchronization test%s\n",
492249625Smav		    smp_tsc ? "pass" : "fail",
493249625Smav		    adj > 0 ? " after adjustment" : "");
494222869Sjkim	if (smp_tsc && tsc_is_invariant) {
495222869Sjkim		switch (cpu_vendor_id) {
496222869Sjkim		case CPU_VENDOR_AMD:
497222869Sjkim			/*
498363433Sjkim			 * Processor Programming Reference (PPR) for AMD
499363433Sjkim			 * Family 17h states that the TSC uses a common
500363433Sjkim			 * reference for all sockets, cores and threads.
501363433Sjkim			 */
502363433Sjkim			if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
503363433Sjkim				return (1000);
504363433Sjkim			/*
505222869Sjkim			 * Starting with Family 15h processors, TSC clock
506222869Sjkim			 * source is in the north bridge.  Check whether
507222869Sjkim			 * we have a single-socket/multi-core platform.
508222869Sjkim			 * XXX Need more work for complex cases.
509222869Sjkim			 */
510222869Sjkim			if (CPUID_TO_FAMILY(cpu_id) < 0x15 ||
511222869Sjkim			    (amd_feature2 & AMDID2_CMP) == 0 ||
512222869Sjkim			    smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1)
513222869Sjkim				break;
514222869Sjkim			return (1000);
515222869Sjkim		case CPU_VENDOR_INTEL:
516222869Sjkim			/*
517222869Sjkim			 * XXX Assume Intel platforms have synchronized TSCs.
518222869Sjkim			 */
519222869Sjkim			return (1000);
520222869Sjkim		}
521222869Sjkim		return (800);
522222869Sjkim	}
523222869Sjkim	return (-100);
524221703Sjkim}
525221703Sjkim
526221703Sjkim#undef N
527221703Sjkim
528221703Sjkim#endif /* SMP */
529221703Sjkim
530221703Sjkimstatic void
531118550Sphkinit_TSC_tc(void)
532118550Sphk{
533222866Sjkim	uint64_t max_freq;
534222866Sjkim	int shift;
535209103Smav
536219673Sjkim	if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
537209103Smav		return;
538209103Smav
53934617Sphk	/*
540222866Sjkim	 * Limit timecounter frequency to fit in an int and prevent it from
541222866Sjkim	 * overflowing too fast.
542222866Sjkim	 */
543222866Sjkim	max_freq = UINT_MAX;
544222866Sjkim
545222866Sjkim	/*
546160964Syar	 * We can not use the TSC if we support APM.  Precise timekeeping
54749186Smsmith	 * on an APM'ed machine is at best a fools pursuit, since
54834617Sphk	 * any and all of the time spent in various SMM code can't
54934617Sphk	 * be reliably accounted for.  Reading the RTC is your only
550160964Syar	 * source of reliable time info.  The i8254 loses too, of course,
55134617Sphk	 * but we need to have some kind of time...
55249186Smsmith	 * We don't know at this point whether APM is going to be used
55349186Smsmith	 * or not, nor when it might be activated.  Play it safe.
55434617Sphk	 */
55585835Siwasaki	if (power_pm_get_type() == POWER_PM_TYPE_APM) {
556118987Sphk		tsc_timecounter.tc_quality = -1000;
55785835Siwasaki		if (bootverbose)
558110370Sphk			printf("TSC timecounter disabled: APM enabled.\n");
559221703Sjkim		goto init;
56064031Sphk	}
56134617Sphk
562223426Sjkim	/*
563276724Sjhb	 * Intel CPUs without a C-state invariant TSC can stop the TSC
564277900Sjhb	 * in either C2 or C3.  Disable use of C2 and C3 while using
565277900Sjhb	 * the TSC as the timecounter.  The timecounter can be changed
566277900Sjhb	 * to enable C2 and C3.
567277900Sjhb	 *
568277900Sjhb	 * Note that the TSC is used as the cputicker for computing
569277900Sjhb	 * thread runtime regardless of the timecounter setting, so
570277900Sjhb	 * using an alternate timecounter and enabling C2 or C3 can
571277900Sjhb	 * result incorrect runtimes for kernel idle threads (but not
572277900Sjhb	 * for any non-idle threads).
573223426Sjkim	 */
574314999Skib	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
575223426Sjkim	    (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) {
576276724Sjhb		tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP;
577223426Sjkim		if (bootverbose)
578277900Sjhb			printf("TSC timecounter disables C2 and C3.\n");
579223426Sjkim	}
580223426Sjkim
581118987Sphk	/*
582246212Skib	 * We can not use the TSC in SMP mode unless the TSCs on all CPUs
583246212Skib	 * are synchronized.  If the user is sure that the system has
584246212Skib	 * synchronized TSCs, set kern.timecounter.smp_tsc tunable to a
585246212Skib	 * non-zero value.  The TSC seems unreliable in virtualized SMP
586225069Ssilby	 * environments, so it is set to a negative quality in those cases.
587118987Sphk	 */
588335657Savg#ifdef SMP
589246212Skib	if (mp_ncpus > 1)
590335657Savg		tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust);
591335657Savg	else
592335657Savg#endif /* SMP */
593335657Savg	if (tsc_is_invariant)
594222869Sjkim		tsc_timecounter.tc_quality = 1000;
595246212Skib	max_freq >>= tsc_shift;
596222869Sjkim
597221703Sjkiminit:
598246212Skib	for (shift = 0; shift <= 31 && (tsc_freq >> shift) > max_freq; shift++)
599222866Sjkim		;
600246212Skib	if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) {
601246212Skib		if (cpu_vendor_id == CPU_VENDOR_AMD) {
602246212Skib			tsc_timecounter.tc_get_timecount = shift > 0 ?
603246212Skib			    tsc_get_timecount_low_mfence :
604246212Skib			    tsc_get_timecount_mfence;
605246212Skib		} else {
606246212Skib			tsc_timecounter.tc_get_timecount = shift > 0 ?
607246212Skib			    tsc_get_timecount_low_lfence :
608246212Skib			    tsc_get_timecount_lfence;
609246212Skib		}
610246212Skib	} else {
611246212Skib		tsc_timecounter.tc_get_timecount = shift > 0 ?
612246212Skib		    tsc_get_timecount_low : tsc_get_timecount;
613246212Skib	}
614222866Sjkim	if (shift > 0) {
615222866Sjkim		tsc_timecounter.tc_name = "TSC-low";
616222866Sjkim		if (bootverbose)
617222884Sjkim			printf("TSC timecounter discards lower %d bit(s)\n",
618222866Sjkim			    shift);
619222866Sjkim	}
620219461Sjkim	if (tsc_freq != 0) {
621222866Sjkim		tsc_timecounter.tc_frequency = tsc_freq >> shift;
622222866Sjkim		tsc_timecounter.tc_priv = (void *)(intptr_t)shift;
62358377Sphk		tc_init(&tsc_timecounter);
62433690Sphk	}
6254Srgrimes}
626221703SjkimSYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
6274Srgrimes
/*
 * Re-run the SMP TSC synchronization test and record the resulting
 * timecounter quality, reporting any change.  A no-op on non-SMP kernels,
 * on a single CPU, and when the stored quality is already negative.
 */
void
resume_TSC(void)
{
#ifdef SMP
	int new_quality;

	/*
	 * A TSC that was not good at boot is unlikely to have become good,
	 * and there is nothing to synchronize on a uniprocessor.
	 */
	if (tsc_timecounter.tc_quality < 0 || mp_ncpus < 2)
		return;

	/*
	 * If the TSC was good, one synchronization pass should be enough,
	 * but honour smp_tsc_adjust if it asks for more.
	 */
	new_quality = test_tsc(MAX(smp_tsc_adjust, 1));
	if (new_quality == tsc_timecounter.tc_quality)
		return;
	printf("TSC timecounter quality changed: %d -> %d\n",
	    tsc_timecounter.tc_quality, new_quality);
	tsc_timecounter.tc_quality = new_quality;
#endif /* SMP */
}
653335657Savg
654167905Snjl/*
655167905Snjl * When cpufreq levels change, find out about the (new) max frequency.  We
656167905Snjl * use this to update CPU accounting in case it got a lower estimate at boot.
657167905Snjl */
658167905Snjlstatic void
659167905Snjltsc_levels_changed(void *arg, int unit)
660167905Snjl{
661167905Snjl	device_t cf_dev;
662167905Snjl	struct cf_level *levels;
663167905Snjl	int count, error;
664167905Snjl	uint64_t max_freq;
665167905Snjl
666167905Snjl	/* Only use values from the first CPU, assuming all are equal. */
667167905Snjl	if (unit != 0)
668167905Snjl		return;
669167905Snjl
670167905Snjl	/* Find the appropriate cpufreq device instance. */
671167905Snjl	cf_dev = devclass_get_device(devclass_find("cpufreq"), unit);
672167905Snjl	if (cf_dev == NULL) {
673167905Snjl		printf("tsc_levels_changed() called but no cpufreq device?\n");
674167905Snjl		return;
675167905Snjl	}
676167905Snjl
677167905Snjl	/* Get settings from the device and find the max frequency. */
678167905Snjl	count = 64;
679167905Snjl	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
680167905Snjl	if (levels == NULL)
681167905Snjl		return;
682167905Snjl	error = CPUFREQ_LEVELS(cf_dev, levels, &count);
683167905Snjl	if (error == 0 && count != 0) {
684167905Snjl		max_freq = (uint64_t)levels[0].total_set.freq * 1000000;
685167905Snjl		set_cputicker(rdtsc, max_freq, 1);
686167905Snjl	} else
687167905Snjl		printf("tsc_levels_changed: no max freq found\n");
688167905Snjl	free(levels, M_TEMP);
689167905Snjl}
690167905Snjl
691167905Snjl/*
692167905Snjl * If the TSC timecounter is in use, veto the pending change.  It may be
693167905Snjl * possible in the future to handle a dynamically-changing timecounter rate.
694167905Snjl */
695167905Snjlstatic void
696167905Snjltsc_freq_changing(void *arg, const struct cf_level *level, int *status)
697167905Snjl{
698167905Snjl
699216274Sjkim	if (*status != 0 || timecounter != &tsc_timecounter)
700167905Snjl		return;
701167905Snjl
702167905Snjl	printf("timecounter TSC must not be in use when "
703184102Sjkim	    "changing frequencies; change denied\n");
704167905Snjl	*status = EBUSY;
705167905Snjl}
706167905Snjl
707167905Snjl/* Update TSC freq with the value indicated by the caller. */
708167905Snjlstatic void
709167905Snjltsc_freq_changed(void *arg, const struct cf_level *level, int status)
710167905Snjl{
711220433Sjkim	uint64_t freq;
712216276Sjkim
713216276Sjkim	/* If there was an error during the transition, don't do anything. */
714219473Sjkim	if (tsc_disabled || status != 0)
715167905Snjl		return;
716167905Snjl
717167905Snjl	/* Total setting for this level gives the new frequency in MHz. */
718220433Sjkim	freq = (uint64_t)level->total_set.freq * 1000000;
719220433Sjkim	atomic_store_rel_64(&tsc_freq, freq);
720222866Sjkim	tsc_timecounter.tc_frequency =
721222866Sjkim	    freq >> (int)(intptr_t)tsc_timecounter.tc_priv;
722167905Snjl}
723167905Snjl
72415508Sbdestatic int
72562573Sphksysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS)
72615508Sbde{
72715508Sbde	int error;
728110039Sphk	uint64_t freq;
72915508Sbde
730220433Sjkim	freq = atomic_load_acq_64(&tsc_freq);
731220433Sjkim	if (freq == 0)
73215508Sbde		return (EOPNOTSUPP);
733217616Smdf	error = sysctl_handle_64(oidp, &freq, 0, req);
734219700Sjkim	if (error == 0 && req->newptr != NULL) {
735220433Sjkim		atomic_store_rel_64(&tsc_freq, freq);
736222884Sjkim		atomic_store_rel_64(&tsc_timecounter.tc_frequency,
737222884Sjkim		    freq >> (int)(intptr_t)tsc_timecounter.tc_priv);
738219700Sjkim	}
73915508Sbde	return (error);
74015508Sbde}
74115508Sbde
742217616SmdfSYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW,
743220613Sjkim    0, 0, sysctl_machdep_tsc_freq, "QU", "Time Stamp Counter frequency");
74433690Sphk
745220632Sjkimstatic u_int
746222866Sjkimtsc_get_timecount(struct timecounter *tc __unused)
74733690Sphk{
748220632Sjkim
749220632Sjkim	return (rdtsc32());
75033690Sphk}
751222866Sjkim
752238973Skibstatic inline u_int
753222884Sjkimtsc_get_timecount_low(struct timecounter *tc)
754222866Sjkim{
755223211Sjkim	uint32_t rv;
756222866Sjkim
757223211Sjkim	__asm __volatile("rdtsc; shrd %%cl, %%edx, %0"
758238973Skib	    : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
759223211Sjkim	return (rv);
760222866Sjkim}
761237433Skib
762238973Skibstatic u_int
763238973Skibtsc_get_timecount_lfence(struct timecounter *tc __unused)
764238973Skib{
765238973Skib
766238973Skib	lfence();
767238973Skib	return (rdtsc32());
768238973Skib}
769238973Skib
770238973Skibstatic u_int
771238973Skibtsc_get_timecount_low_lfence(struct timecounter *tc)
772238973Skib{
773238973Skib
774238973Skib	lfence();
775238973Skib	return (tsc_get_timecount_low(tc));
776238973Skib}
777238973Skib
778238973Skibstatic u_int
779238973Skibtsc_get_timecount_mfence(struct timecounter *tc __unused)
780238973Skib{
781238973Skib
782238973Skib	mfence();
783238973Skib	return (rdtsc32());
784238973Skib}
785238973Skib
786238973Skibstatic u_int
787238973Skibtsc_get_timecount_low_mfence(struct timecounter *tc)
788238973Skib{
789238973Skib
790238973Skib	mfence();
791238973Skib	return (tsc_get_timecount_low(tc));
792238973Skib}
793238973Skib
794305866Skibstatic uint32_t
795305866Skibx86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
796237433Skib{
797237433Skib
798305866Skib	vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
799277406Sneel	vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv;
800305866Skib	vdso_th->th_x86_hpet_idx = 0xffffffff;
801237433Skib	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
802305866Skib	return (1);
803237433Skib}
804237433Skib
#ifdef COMPAT_FREEBSD32
/*
 * COMPAT_FREEBSD32 counterpart of x86_tsc_vdso_timehands(): fills a
 * struct vdso_timehands32 with the same values.  Always returns 1.
 */
static uint32_t
x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
    struct timecounter *tc)
{

	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
	vdso_th32->th_x86_hpet_idx = 0xffffffff;
	vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv;
	vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
	return (1);
}
#endif
818