hwpmc_piv.c revision 147867
1/*-
2 * Copyright (c) 2003-2005 Joseph Koshy
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/dev/hwpmc/hwpmc_piv.c 147867 2005-07-09 17:29:36Z jkoshy $");
29
30#include <sys/param.h>
31#include <sys/lock.h>
32#include <sys/mutex.h>
33#include <sys/pmc.h>
34#include <sys/pmckern.h>
35#include <sys/smp.h>
36#include <sys/systm.h>
37
38#include <machine/cpufunc.h>
39#include <machine/md_var.h>
40#include <machine/specialreg.h>
41
42/*
43 * PENTIUM 4 SUPPORT
44 *
45 * The P4 has 18 PMCs, divided into 4 groups of 4, 4, 4 and 6 PMCs
46 * respectively.  Each PMC comprises two model-specific registers:
47 * a counter configuration control register (CCCR) and a counter
48 * register that holds the actual event counts.
49 *
50 * Configuring an event requires the use of one of 45 event selection
51 * control registers (ESCR).  Events are associated with specific
52 * ESCRs.  Each PMC group has a set of ESCRs it can use.
53 *
54 * - The BPU counter group (4 PMCs) can use the 16 ESCRs:
55 *   BPU_ESCR{0,1}, IS_ESCR{0,1}, MOB_ESCR{0,1}, ITLB_ESCR{0,1},
56 *   PMH_ESCR{0,1}, IX_ESCR{0,1}, FSB_ESCR{0,1}, BSU_ESCR{0,1}.
57 *
58 * - The MS counter group (4 PMCs) can use the 6 ESCRs: MS_ESCR{0,1},
59 *   TC_ESCR{0,1}, TBPU_ESCR{0,1}.
60 *
61 * - The FLAME counter group (4 PMCs) can use the 10 ESCRs:
62 *   FLAME_ESCR{0,1}, FIRM_ESCR{0,1}, SAAT_ESCR{0,1}, U2L_ESCR{0,1},
63 *   DAC_ESCR{0,1}.
64 *
65 * - The IQ counter group (6 PMCs) can use the 13 ESCRs: IQ_ESCR{0,1},
66 *   ALF_ESCR{0,1}, RAT_ESCR{0,1}, SSU_ESCR0, CRU_ESCR{0,1,2,3,4,5}.
67 *
68 * Even-numbered ESCRs can be used with counters 0, 1 and 4 (if
69 * present) of a counter group.  Odd-numbered ESCRs can be used with
70 * counters 2, 3 and 5 (if present) of a counter group.  The
71 * 'p4_escrs[]' table describes these restrictions in a form that
72 * function 'p4_allocate()' uses for making allocation decisions.
73 *
74 * SYSTEM-MODE AND THREAD-MODE ALLOCATION
75 *
76 * In addition to remembering the state of PMC rows
77 * ('FREE','STANDALONE', or 'THREAD'), we similarly need to track the
78 * state of ESCR rows.  If an ESCR is allocated to a system-mode PMC
79 * on a CPU, we cannot allocate it to a thread-mode PMC.  On a
80 * multi-cpu (multiple physical CPUs) system, ESCR allocation on each
81 * CPU is tracked by the pc_escrs[] array.
82 *
83 * Each system-mode PMC that is using an ESCR records its row-index in
84 * the appropriate entry and system-mode allocation attempts check
85 * that an ESCR is available using this array.  Process-mode PMCs do
86 * not use the pc_escrs[] array, since the ESCR row itself would
87 * have been marked as being in 'THREAD' mode.
88 *
89 * HYPERTHREADING SUPPORT
90 *
91 * When HTT is enabled, the FreeBSD kernel treats the two 'logical'
92 * cpus as independent CPUs and can schedule kernel threads on them
93 * independently.  However, the two logical CPUs share the same set of
94 * PMC resources.  We need to ensure that:
95 * - PMCs that use the PMC_F_DESCENDANTS semantics are handled correctly,
96 *   and,
97 * - Threads of multi-threaded processes that get scheduled on the same
98 *   physical CPU are handled correctly.
99 *
100 * HTT Detection
101 *
102 * Not all HTT capable systems will have HTT enabled since users may
103 * have turned HTT support off using the appropriate sysctls
104 * (machdep.hlt_logical_cpus or machdep.logical_cpus_mask).  We detect
105 * the presence of HTT by remembering if 'p4_init()' was called for a
106 * logical CPU.  Note that hwpmc(4) cannot deal with a change in HTT
107 * status once it is loaded.
108 *
109 * Handling HTT READ / WRITE / START / STOP
110 *
111 * PMC resources are shared across multiple logical CPUs.  In each
112 * physical CPU's state we keep track of a 'runcount' which reflects
113 * the number of PMC-using processes that have been scheduled on the
114 * logical CPUs of this physical CPU.  Process-mode PMC operations
115 * will actually 'start' or 'stop' hardware only if these are the
116 * first or last processes respectively to use the hardware.  PMC
117 * values written by a 'write' operation are saved and are transferred
118 * to hardware at PMC 'start' time if the runcount is 0.  If the
119 * runcount is greater than 0 at the time of a 'start' operation, we
120 * keep track of the actual hardware value at the time of the 'start'
121 * operation and use this to adjust the final readings at PMC 'stop'
122 * or 'read' time.
123 *
124 * Execution sequences:
125 *
126 * Case 1:   CPUx   +...-		(no overlap)
127 *	     CPUy         +...-
128 *           RC   0 1   0 1   0
129 *
130 * Case 2:   CPUx   +........-		(partial overlap)
131 * 	     CPUy       +........-
132 *           RC   0 1   2    1   0
133 *
134 * Case 3:   CPUx   +..............-	(fully overlapped)
135 *	     CPUy       +.....-
136 *	     RC   0 1   2     1    0
137 *
138 *     Key:
139 *     'CPU[xy]' : one of the two logical processors on a HTT CPU.
140 *     'RC'      : run count (#threads per physical core).
141 *     '+'       : point in time when a thread is put on a CPU.
142 *     '-'       : point in time when a thread is taken off a CPU.
143 *
144 * Handling HTT CONFIG
145 *
146 * Different processes attached to the same PMC may get scheduled on
147 * the two logical processors in the package.  We keep track of config
148 * and de-config operations using the CFGFLAGS fields of the per-physical
149 * cpu state.
150 */
151
152#define	P4_PMCS()				\
153	P4_PMC(BPU_COUNTER0)			\
154	P4_PMC(BPU_COUNTER1)			\
155	P4_PMC(BPU_COUNTER2)			\
156	P4_PMC(BPU_COUNTER3)			\
157	P4_PMC(MS_COUNTER0)			\
158	P4_PMC(MS_COUNTER1)			\
159	P4_PMC(MS_COUNTER2)			\
160	P4_PMC(MS_COUNTER3)			\
161	P4_PMC(FLAME_COUNTER0)			\
162	P4_PMC(FLAME_COUNTER1)			\
163	P4_PMC(FLAME_COUNTER2)			\
164	P4_PMC(FLAME_COUNTER3)			\
165	P4_PMC(IQ_COUNTER0)			\
166	P4_PMC(IQ_COUNTER1)			\
167	P4_PMC(IQ_COUNTER2)			\
168	P4_PMC(IQ_COUNTER3)			\
169	P4_PMC(IQ_COUNTER4)			\
170	P4_PMC(IQ_COUNTER5)			\
171	P4_PMC(NONE)
172
173enum pmc_p4pmc {
174#undef	P4_PMC
175#define	P4_PMC(N)	P4_PMC_##N ,
176	P4_PMCS()
177};
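
/*
 * The P4_PMCS()/P4_PMC() pair above is an X-macro: the same list
 * expands to different text depending on the definition of P4_PMC()
 * in effect when P4_PMCS() is invoked.  As a sketch (no such table
 * exists in this file), a matching name table could be generated
 * with:
 *
 *	#undef	P4_PMC
 *	#define	P4_PMC(N)	#N ,
 *	static const char *p4_pmc_names[] = { P4_PMCS() };
 *
 * keeping the enum and any derived tables in lockstep.
 */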
178
179/*
180 * P4 ESCR descriptors
181 */
182
183#define	P4_ESCRS()							\
184    P4_ESCR(BSU_ESCR0,	0x3A0, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
185    P4_ESCR(BSU_ESCR1,	0x3A1, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
186    P4_ESCR(FSB_ESCR0,	0x3A2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
187    P4_ESCR(FSB_ESCR1,	0x3A3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
188    P4_ESCR(FIRM_ESCR0,	0x3A4, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
189    P4_ESCR(FIRM_ESCR1,	0x3A5, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
190    P4_ESCR(FLAME_ESCR0, 0x3A6, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
191    P4_ESCR(FLAME_ESCR1, 0x3A7, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
192    P4_ESCR(DAC_ESCR0,	0x3A8, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
193    P4_ESCR(DAC_ESCR1,	0x3A9, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
194    P4_ESCR(MOB_ESCR0,	0x3AA, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
195    P4_ESCR(MOB_ESCR1,	0x3AB, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
196    P4_ESCR(PMH_ESCR0,	0x3AC, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
197    P4_ESCR(PMH_ESCR1,	0x3AD, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
198    P4_ESCR(SAAT_ESCR0,	0x3AE, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
199    P4_ESCR(SAAT_ESCR1,	0x3AF, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
200    P4_ESCR(U2L_ESCR0,	0x3B0, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
201    P4_ESCR(U2L_ESCR1,	0x3B1, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
202    P4_ESCR(BPU_ESCR0,	0x3B2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
203    P4_ESCR(BPU_ESCR1,	0x3B3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
204    P4_ESCR(IS_ESCR0,	0x3B4, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
205    P4_ESCR(IS_ESCR1,	0x3B5, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
206    P4_ESCR(ITLB_ESCR0,	0x3B6, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
207    P4_ESCR(ITLB_ESCR1,	0x3B7, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
208    P4_ESCR(CRU_ESCR0,	0x3B8, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
209    P4_ESCR(CRU_ESCR1,	0x3B9, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
210    P4_ESCR(IQ_ESCR0,	0x3BA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
211    P4_ESCR(IQ_ESCR1,	0x3BB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
212    P4_ESCR(RAT_ESCR0,	0x3BC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
213    P4_ESCR(RAT_ESCR1,	0x3BD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
214    P4_ESCR(SSU_ESCR0,	0x3BE, IQ_COUNTER0, IQ_COUNTER2, IQ_COUNTER4)	\
215    P4_ESCR(MS_ESCR0,	0x3C0, MS_COUNTER0, MS_COUNTER1, NONE)		\
216    P4_ESCR(MS_ESCR1,	0x3C1, MS_COUNTER2, MS_COUNTER3, NONE)		\
217    P4_ESCR(TBPU_ESCR0,	0x3C2, MS_COUNTER0, MS_COUNTER1, NONE)		\
218    P4_ESCR(TBPU_ESCR1,	0x3C3, MS_COUNTER2, MS_COUNTER3, NONE)		\
219    P4_ESCR(TC_ESCR0,	0x3C4, MS_COUNTER0, MS_COUNTER1, NONE)		\
220    P4_ESCR(TC_ESCR1,	0x3C5, MS_COUNTER2, MS_COUNTER3, NONE)		\
221    P4_ESCR(IX_ESCR0,	0x3C8, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
222    P4_ESCR(IX_ESCR1,	0x3C9, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
223    P4_ESCR(ALF_ESCR0,	0x3CA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
224    P4_ESCR(ALF_ESCR1,	0x3CB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
225    P4_ESCR(CRU_ESCR2,	0x3CC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
226    P4_ESCR(CRU_ESCR3,	0x3CD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
227    P4_ESCR(CRU_ESCR4,	0x3E0, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
228    P4_ESCR(CRU_ESCR5,	0x3E1, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
229    P4_ESCR(NONE,		~0,    NONE, NONE, NONE)
230
231enum pmc_p4escr {
232#define	P4_ESCR(N, MSR, P1, P2, P3)	P4_ESCR_##N ,
233	P4_ESCRS()
234#undef	P4_ESCR
235};
236
237struct pmc_p4escr_descr {
238	const char	pm_escrname[PMC_NAME_MAX];
239	u_short		pm_escr_msr;
240	const enum pmc_p4pmc pm_pmcs[P4_MAX_PMC_PER_ESCR];
241};
242
243static struct pmc_p4escr_descr p4_escrs[] =
244{
245#define	P4_ESCR(N, MSR, P1, P2, P3)		\
246	{					\
247		.pm_escrname = #N,		\
248		.pm_escr_msr = (MSR),		\
249		.pm_pmcs =			\
250		{				\
251			P4_PMC_##P1,		\
252			P4_PMC_##P2,		\
253			P4_PMC_##P3		\
254		}				\
255	} ,
256
257	P4_ESCRS()
258
259#undef	P4_ESCR
260};
261
262/*
263 * P4 Event descriptor
264 */
265
266struct p4_event_descr {
267	const enum pmc_event pm_event;
268	const uint32_t	pm_escr_eventselect;
269	const uint32_t	pm_cccr_select;
270	const char	pm_is_ti_event;
271	enum pmc_p4escr	pm_escrs[P4_MAX_ESCR_PER_EVENT];
272};
273
274static struct p4_event_descr p4_events[] = {
275
276#define	P4_EVDESCR(NAME, ESCREVENTSEL, CCCRSEL, TI_EVENT, ESCR0, ESCR1)	\
277	{								\
278		.pm_event            = PMC_EV_P4_##NAME,		\
279		.pm_escr_eventselect = (ESCREVENTSEL),			\
280		.pm_cccr_select      = (CCCRSEL),			\
281		.pm_is_ti_event	     = (TI_EVENT),			\
282		.pm_escrs            =					\
283		{							\
284			P4_ESCR_##ESCR0,				\
285			P4_ESCR_##ESCR1					\
286		}							\
287	}
288
289P4_EVDESCR(TC_DELIVER_MODE,	0x01, 0x01, TRUE,  TC_ESCR0,	TC_ESCR1),
290P4_EVDESCR(BPU_FETCH_REQUEST,	0x03, 0x00, FALSE, BPU_ESCR0,	BPU_ESCR1),
291P4_EVDESCR(ITLB_REFERENCE,	0x18, 0x03, FALSE, ITLB_ESCR0,	ITLB_ESCR1),
292P4_EVDESCR(MEMORY_CANCEL,	0x02, 0x05, FALSE, DAC_ESCR0,	DAC_ESCR1),
293P4_EVDESCR(MEMORY_COMPLETE,	0x08, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
294P4_EVDESCR(LOAD_PORT_REPLAY,	0x04, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
295P4_EVDESCR(STORE_PORT_REPLAY,	0x05, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
296P4_EVDESCR(MOB_LOAD_REPLAY,	0x03, 0x02, FALSE, MOB_ESCR0,	MOB_ESCR1),
297P4_EVDESCR(PAGE_WALK_TYPE,	0x01, 0x04, TRUE,  PMH_ESCR0,	PMH_ESCR1),
298P4_EVDESCR(BSQ_CACHE_REFERENCE,	0x0C, 0x07, FALSE, BSU_ESCR0,	BSU_ESCR1),
299P4_EVDESCR(IOQ_ALLOCATION,	0x03, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
300P4_EVDESCR(IOQ_ACTIVE_ENTRIES,	0x1A, 0x06, FALSE, FSB_ESCR1,	NONE),
301P4_EVDESCR(FSB_DATA_ACTIVITY,	0x17, 0x06, TRUE,  FSB_ESCR0,	FSB_ESCR1),
302P4_EVDESCR(BSQ_ALLOCATION,	0x05, 0x07, FALSE, BSU_ESCR0,	NONE),
303P4_EVDESCR(BSQ_ACTIVE_ENTRIES,	0x06, 0x07, FALSE, BSU_ESCR1,	NONE),
304	/* BSQ_ACTIVE_ENTRIES inherits CPU specificity from BSQ_ALLOCATION */
305P4_EVDESCR(SSE_INPUT_ASSIST,	0x34, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
306P4_EVDESCR(PACKED_SP_UOP,	0x08, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
307P4_EVDESCR(PACKED_DP_UOP,	0x0C, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
308P4_EVDESCR(SCALAR_SP_UOP,	0x0A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
309P4_EVDESCR(SCALAR_DP_UOP,	0x0E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
310P4_EVDESCR(64BIT_MMX_UOP,	0x02, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
311P4_EVDESCR(128BIT_MMX_UOP,	0x1A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
312P4_EVDESCR(X87_FP_UOP,		0x04, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
313P4_EVDESCR(X87_SIMD_MOVES_UOP,	0x2E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
314P4_EVDESCR(GLOBAL_POWER_EVENTS,	0x13, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
315P4_EVDESCR(TC_MS_XFER,		0x05, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
316P4_EVDESCR(UOP_QUEUE_WRITES,	0x09, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
317P4_EVDESCR(RETIRED_MISPRED_BRANCH_TYPE,
318    				0x05, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
319P4_EVDESCR(RETIRED_BRANCH_TYPE,	0x04, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
320P4_EVDESCR(RESOURCE_STALL,	0x01, 0x01, FALSE, ALF_ESCR0,	ALF_ESCR1),
321P4_EVDESCR(WC_BUFFER,		0x05, 0x05, TRUE,  DAC_ESCR0,	DAC_ESCR1),
322P4_EVDESCR(B2B_CYCLES,		0x16, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
323P4_EVDESCR(BNR,			0x08, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
324P4_EVDESCR(SNOOP,		0x06, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
325P4_EVDESCR(RESPONSE,		0x04, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
326P4_EVDESCR(FRONT_END_EVENT,	0x08, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
327P4_EVDESCR(EXECUTION_EVENT,	0x0C, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
328P4_EVDESCR(REPLAY_EVENT, 	0x09, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
329P4_EVDESCR(INSTR_RETIRED,	0x02, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
330P4_EVDESCR(UOPS_RETIRED,	0x01, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
331P4_EVDESCR(UOP_TYPE,		0x02, 0x02, FALSE, RAT_ESCR0,	RAT_ESCR1),
332P4_EVDESCR(BRANCH_RETIRED,	0x06, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
333P4_EVDESCR(MISPRED_BRANCH_RETIRED, 0x03, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1),
334P4_EVDESCR(X87_ASSIST,		0x03, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
335P4_EVDESCR(MACHINE_CLEAR,	0x02, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3)
336
337#undef	P4_EVDESCR
338};
339
340#define	P4_EVENT_IS_TI(E) ((E)->pm_is_ti_event == TRUE)
341
342#define	P4_NEVENTS	(PMC_EV_P4_LAST - PMC_EV_P4_FIRST + 1)
343
344/*
345 * P4 PMC descriptors
346 */
347
348struct p4pmc_descr {
349	struct pmc_descr pm_descr; 	/* common information */
350	enum pmc_p4pmc	pm_pmcnum;	/* PMC number */
351	uint32_t	pm_pmc_msr; 	/* PERFCTR MSR address */
352	uint32_t	pm_cccr_msr;  	/* CCCR MSR address */
353};
354
355static struct p4pmc_descr p4_pmcdesc[P4_NPMCS] = {
356
357	/*
358	 * TSC descriptor
359	 */
360
361	{
362		.pm_descr =
363		{
364			.pd_name  = "TSC",
365			.pd_class = PMC_CLASS_TSC,
366			.pd_caps  = PMC_CAP_READ | PMC_CAP_WRITE,
367			.pd_width = 64
368		},
369		.pm_pmcnum   = ~0,
370		.pm_cccr_msr = ~0,
371		.pm_pmc_msr  = 0x10,
372	},
373
374	/*
375	 * P4 PMCs
376	 */
377
378#define	P4_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM |  \
379	PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \
380	PMC_CAP_INVERT | PMC_CAP_QUALIFIER | PMC_CAP_PRECISE |            \
381	PMC_CAP_TAGGING | PMC_CAP_CASCADE)
382
383#define	P4_PMCDESCR(N, PMC, CCCR)			\
384	{						\
385		.pm_descr =				\
386		{					\
387			.pd_name = #N,			\
388			.pd_class = PMC_CLASS_P4,	\
389			.pd_caps = P4_PMC_CAPS,		\
390			.pd_width = 40			\
391		},					\
392		.pm_pmcnum      = P4_PMC_##N,		\
393		.pm_cccr_msr 	= (CCCR),		\
394		.pm_pmc_msr	= (PMC)			\
395	}
396
397	P4_PMCDESCR(BPU_COUNTER0,	0x300,	0x360),
398	P4_PMCDESCR(BPU_COUNTER1,	0x301,	0x361),
399	P4_PMCDESCR(BPU_COUNTER2,	0x302,	0x362),
400	P4_PMCDESCR(BPU_COUNTER3,	0x303,	0x363),
401	P4_PMCDESCR(MS_COUNTER0,	0x304,	0x364),
402	P4_PMCDESCR(MS_COUNTER1,	0x305,	0x365),
403	P4_PMCDESCR(MS_COUNTER2,	0x306,	0x366),
404	P4_PMCDESCR(MS_COUNTER3,	0x307,	0x367),
405	P4_PMCDESCR(FLAME_COUNTER0,	0x308,	0x368),
406	P4_PMCDESCR(FLAME_COUNTER1,	0x309,	0x369),
407	P4_PMCDESCR(FLAME_COUNTER2,	0x30A,	0x36A),
408	P4_PMCDESCR(FLAME_COUNTER3,	0x30B,	0x36B),
409	P4_PMCDESCR(IQ_COUNTER0,	0x30C,	0x36C),
410	P4_PMCDESCR(IQ_COUNTER1,	0x30D,	0x36D),
411	P4_PMCDESCR(IQ_COUNTER2,	0x30E,	0x36E),
412	P4_PMCDESCR(IQ_COUNTER3,	0x30F,	0x36F),
413	P4_PMCDESCR(IQ_COUNTER4,	0x310,	0x370),
414	P4_PMCDESCR(IQ_COUNTER5,	0x311,	0x371),
415
416#undef	P4_PMCDESCR
417};
418
419/* HTT support */
420#define	P4_NHTT					2 /* logical processors/chip */
421#define	P4_HTT_CPU_INDEX_0			0
422#define	P4_HTT_CPU_INDEX_1			1
423
424static int p4_system_has_htt;
425
426/*
427 * Per-CPU data structure for P4 class CPUs
428 *
429 * [common stuff]
430 * [19 struct pmc_hw pointers]
431 * [19 struct pmc_hw structures]
432 * [45 ESCRs status bytes]
433 * [per-cpu spin mutex]
434 * [19 flag fields for holding config flags and a runcount]
435 * [19*2 hw value fields]	(Thread mode PMC support)
436 *    or
437 * [19*2 EIP values]		(Sampling mode PMCs)
438 * [19*2 pmc value fields]	(Thread mode PMC support))
439 */
440
441struct p4_cpu {
442	struct pmc_cpu	pc_common;
443	struct pmc_hw	*pc_hwpmcs[P4_NPMCS];
444	struct pmc_hw	pc_p4pmcs[P4_NPMCS];
445	char		pc_escrs[P4_NESCR];
446	struct mtx	pc_mtx;		/* spin lock */
447	uint32_t	pc_intrflag;	/* NMI handler flags */
448	unsigned int	pc_intrlock;	/* NMI handler spin lock */
449	unsigned char	pc_flags[P4_NPMCS]; /* 4 bits each: {cfg,run}count */
450	union {
451		pmc_value_t pc_hw[P4_NPMCS * P4_NHTT];
452		uintptr_t   pc_ip[P4_NPMCS * P4_NHTT];
453	}		pc_si;
454	pmc_value_t	pc_pmc_values[P4_NPMCS * P4_NHTT];
455};
456
457/*
458 * A 'logical' CPU shares PMC resources with its partner 'physical'
459 * CPU, except the TSC, which is architectural and hence separate.
460 * The 'logical' CPU descriptor thus has pointers to the physical
461 * CPU's descriptor state, except for the TSC (rowindex 0) which is
462 * not shared.
463 */
464
465struct p4_logicalcpu {
466	struct pmc_cpu	pc_common;
467	struct pmc_hw	*pc_hwpmcs[P4_NPMCS];
468	struct pmc_hw	pc_tsc;
469};
470
471#define	P4_PCPU_PMC_VALUE(PC,RI,CPU) 	(PC)->pc_pmc_values[(RI) + ((CPU) & 1) * P4_NPMCS]
472#define	P4_PCPU_HW_VALUE(PC,RI,CPU)	(PC)->pc_si.pc_hw[(RI) + ((CPU) & 1) * P4_NPMCS]
473#define	P4_PCPU_SAVED_IP(PC,RI,CPU)	(PC)->pc_si.pc_ip[(RI) + ((CPU) & 1) * P4_NPMCS]
474
475#define	P4_PCPU_GET_FLAGS(PC,RI,MASK)	((PC)->pc_flags[(RI)] & (MASK))
476#define	P4_PCPU_SET_FLAGS(PC,RI,MASK,VAL)	do {	\
477	char _tmp;					\
478	_tmp = (PC)->pc_flags[(RI)];			\
479	_tmp &= ~(MASK);				\
480	_tmp |= (VAL) & (MASK);				\
481	(PC)->pc_flags[(RI)] = _tmp;			\
482} while (0)
483
484#define	P4_PCPU_GET_RUNCOUNT(PC,RI)	P4_PCPU_GET_FLAGS(PC,RI,0x0F)
485#define	P4_PCPU_SET_RUNCOUNT(PC,RI,V)	P4_PCPU_SET_FLAGS(PC,RI,0x0F,V)
486
487#define	P4_PCPU_GET_CFGFLAGS(PC,RI)	(P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4)
488#define	P4_PCPU_SET_CFGFLAGS(PC,RI,C)	P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) << 4))
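
/*
 * Example of the flag packing: if pc_flags[ri] == 0x21, then
 * P4_PCPU_GET_RUNCOUNT() extracts 0x1 from the low nibble (one
 * logical CPU currently has the PMC running) and
 * P4_PCPU_GET_CFGFLAGS() extracts 0x2 from the high nibble (the PMC
 * is configured only on the odd-numbered logical CPU of the pair).
 */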
489
490#define	P4_CPU_TO_FLAG(C)		(pmc_cpu_is_logical((C)) ? 0x2 : 0x1)
491
492#define	P4_PCPU_GET_INTRFLAG(PC,I)	((PC)->pc_intrflag & (1 << (I)))
493#define	P4_PCPU_SET_INTRFLAG(PC,I,V)	do {		\
494		uint32_t __mask;			\
495		__mask = 1 << (I);			\
496		if ((V))				\
497			(PC)->pc_intrflag |= __mask;	\
498		else					\
499			(PC)->pc_intrflag &= ~__mask;	\
500	} while (0)
501
502/*
503 * A minimal spin lock implementation for use inside the NMI handler.
504 *
505 * We don't want to use a regular spin lock here, because curthread
506 * may not be consistent at the time the handler is invoked.
507 */
508#define	P4_PCPU_ACQ_INTR_SPINLOCK(PC) do {				\
509		while (!atomic_cmpset_acq_int(&(PC)->pc_intrlock, 0, 1)) \
510			ia32_pause();					\
511	} while (0)
512#define	P4_PCPU_REL_INTR_SPINLOCK(PC) 					\
513	atomic_store_rel_int(&(PC)->pc_intrlock, 0)
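
/*
 * Sketch of the intended use: the NMI handler brackets its
 * inspection of 'pc_intrflag' with this pair, e.g.
 *
 *	P4_PCPU_ACQ_INTR_SPINLOCK(pc);
 *	... read or update pc->pc_intrflag ...
 *	P4_PCPU_REL_INTR_SPINLOCK(pc);
 *
 * The acquire/release atomics supply the memory barriers that a
 * regular mutex would otherwise provide.
 */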
514
515/* ESCR row disposition */
516static int p4_escrdisp[P4_NESCR];
517
518#define	P4_ESCR_ROW_DISP_IS_THREAD(E)		(p4_escrdisp[(E)] > 0)
519#define	P4_ESCR_ROW_DISP_IS_STANDALONE(E)	(p4_escrdisp[(E)] < 0)
520#define	P4_ESCR_ROW_DISP_IS_FREE(E)		(p4_escrdisp[(E)] == 0)
521
522#define	P4_ESCR_MARK_ROW_STANDALONE(E) do {				\
523	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
524		    __LINE__));						\
525	atomic_add_int(&p4_escrdisp[(E)], -1);				\
526	KASSERT(p4_escrdisp[(E)] >= (-mp_ncpus), ("[p4,%d] row "	\
527		"disposition error", __LINE__));			\
528} while (0)
529
530#define	P4_ESCR_UNMARK_ROW_STANDALONE(E) do {				\
531	atomic_add_int(&p4_escrdisp[(E)], 1);				\
532	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
533		    __LINE__));						\
534} while (0)
535
536#define	P4_ESCR_MARK_ROW_THREAD(E) do {					 \
537	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \
538		    __LINE__));						 \
539	atomic_add_int(&p4_escrdisp[(E)], 1);				 \
540} while (0)
541
542#define	P4_ESCR_UNMARK_ROW_THREAD(E) do {				 \
543	atomic_add_int(&p4_escrdisp[(E)], -1);				 \
544	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error",\
545		    __LINE__));						 \
546} while (0)
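
/*
 * The disposition values behave like signed reference counts: each
 * thread-mode use of an ESCR row increments the count and each
 * standalone (system-mode) use decrements it.  For example, a value
 * of +3 means three thread-mode PMCs currently share the row, while
 * -2 means the row is held in system mode on two CPUs; the KASSERTs
 * above ensure a count never crosses zero in the wrong direction.
 */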
547
548#define	P4_PMC_IS_STOPPED(cccr)	((rdmsr(cccr) & P4_CCCR_ENABLE) == 0)
549
550#define	P4_TO_PHYSICAL_CPU(cpu) (pmc_cpu_is_logical(cpu) ?		\
551    ((cpu) & ~1) : (cpu))
552
553#define	P4_CCCR_Tx_MASK	(~(P4_CCCR_OVF_PMI_T0|P4_CCCR_OVF_PMI_T1|	\
554			     P4_CCCR_ENABLE|P4_CCCR_OVF))
555#define	P4_ESCR_Tx_MASK	(~(P4_ESCR_T0_OS|P4_ESCR_T0_USR|P4_ESCR_T1_OS|	\
556			     P4_ESCR_T1_USR))
557
558/*
559 * support routines
560 */
561
562static struct p4_event_descr *
563p4_find_event(enum pmc_event ev)
564{
565	int n;
566
567	for (n = 0; n < P4_NEVENTS; n++)
568		if (p4_events[n].pm_event == ev)
569			break;
570	if (n == P4_NEVENTS)
571		return NULL;
572	return &p4_events[n];
573}
574
575/*
576 * Initialize per-cpu state
577 */
578
579static int
580p4_init(int cpu)
581{
582	int n, phycpu;
583	char *pescr;
584	struct p4_cpu *pcs;
585	struct p4_logicalcpu *plcs;
586	struct pmc_hw *phw;
587
588	KASSERT(cpu >= 0 && cpu < mp_ncpus,
589	    ("[p4,%d] insane cpu number %d", __LINE__, cpu));
590
591	PMCDBG(MDP,INI,0, "p4-init cpu=%d logical=%d", cpu,
592	    pmc_cpu_is_logical(cpu) != 0);
593
594	/*
595	 * A 'logical' CPU shares its per-cpu state with its physical
596	 * CPU.  The physical CPU would have been initialized prior to
597	 * the initialization for this cpu.
598	 */
599
600	if (pmc_cpu_is_logical(cpu)) {
601		phycpu = P4_TO_PHYSICAL_CPU(cpu);
602		pcs = (struct p4_cpu *) pmc_pcpu[phycpu];
603		PMCDBG(MDP,INI,1, "p4-init cpu=%d phycpu=%d pcs=%p",
604		    cpu, phycpu, pcs);
605		KASSERT(pcs,
606		    ("[p4,%d] Null Per-Cpu state cpu=%d phycpu=%d", __LINE__,
607			cpu, phycpu));
608		if (pcs == NULL) /* decline to init */
609			return ENXIO;
610
611		p4_system_has_htt = 1;
612
613		MALLOC(plcs, struct p4_logicalcpu *,
614		    sizeof(struct p4_logicalcpu), M_PMC, M_WAITOK|M_ZERO);
615
616		/* The TSC is architectural state and is not shared */
617		plcs->pc_hwpmcs[0] = &plcs->pc_tsc;
618		plcs->pc_tsc.phw_state = PMC_PHW_FLAG_IS_ENABLED |
619		    PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(0) |
620		    PMC_PHW_FLAG_IS_SHAREABLE;
621
622		/* Other PMCs are shared with the physical CPU */
623		for (n = 1; n < P4_NPMCS; n++)
624			plcs->pc_hwpmcs[n] = pcs->pc_hwpmcs[n];
625
626		pmc_pcpu[cpu] = (struct pmc_cpu *) plcs;
627		return 0;
628	}
629
630	MALLOC(pcs, struct p4_cpu *, sizeof(struct p4_cpu), M_PMC,
631	    M_WAITOK|M_ZERO);
632
633	if (pcs == NULL)
634		return ENOMEM;
635	phw = pcs->pc_p4pmcs;
636
637	for (n = 0; n < P4_NPMCS; n++, phw++) {
638		phw->phw_state   = PMC_PHW_FLAG_IS_ENABLED |
639		    PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n);
640		phw->phw_pmc     = NULL;
641		pcs->pc_hwpmcs[n] = phw;
642	}
643
644	/* Mark the TSC as shareable */
645	pcs->pc_hwpmcs[0]->phw_state |= PMC_PHW_FLAG_IS_SHAREABLE;
646
647	pescr = pcs->pc_escrs;
648	for (n = 0; n < P4_NESCR; n++)
649		*pescr++ = P4_INVALID_PMC_INDEX;
650	pmc_pcpu[cpu] = (struct pmc_cpu *) pcs;
651
652	mtx_init(&pcs->pc_mtx, "p4-pcpu", "pmc", MTX_SPIN);
653
654	return 0;
655}
656
657/*
658 * Destroy per-cpu state.
659 */
660
661static int
662p4_cleanup(int cpu)
663{
664	struct p4_cpu *pcs;
665
666	PMCDBG(MDP,INI,0, "p4-cleanup cpu=%d", cpu);
667
668	if ((pcs = (struct p4_cpu *) pmc_pcpu[cpu]) == NULL)
669		return 0;
670
671	/*
672	 * If the CPU is physical we need to teardown the
673	 * full MD state.
674	 */
675	if (!pmc_cpu_is_logical(cpu))
676		mtx_destroy(&pcs->pc_mtx);
677
678	FREE(pcs, M_PMC);
679
680	pmc_pcpu[cpu] = NULL;
681
682	return 0;
683}
684
685/*
686 * Context switch in.
687 */
688
689static int
690p4_switch_in(struct pmc_cpu *pc, struct pmc_process *pp)
691{
692	(void) pc;
693
694	PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp,
695	    (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0);
696
697	/* enable the RDPMC instruction */
698	if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS)
699		load_cr4(rcr4() | CR4_PCE);
700
701	PMCDBG(MDP,SWI,2, "cr4=0x%x", (uint32_t) rcr4());
702
703	return 0;
704}
705
706/*
707 * Context switch out.
708 */
709
710static int
711p4_switch_out(struct pmc_cpu *pc, struct pmc_process *pp)
712{
713	(void) pc;
714	(void) pp;		/* can be null */
715
716	PMCDBG(MDP,SWO,1, "pc=%p pp=%p", pc, pp);
717
718	/* always disallow the RDPMC instruction */
719	load_cr4(rcr4() & ~CR4_PCE);
720
721	PMCDBG(MDP,SWO,2, "cr4=0x%x", (uint32_t) rcr4());
722
723	return 0;
724}
725
726/*
727 * Read a PMC
728 */
729
730static int
731p4_read_pmc(int cpu, int ri, pmc_value_t *v)
732{
733	enum pmc_mode mode;
734	struct p4pmc_descr *pd;
735	struct pmc *pm;
736	struct p4_cpu *pc;
737	struct pmc_hw *phw;
738	pmc_value_t tmp;
739
740	KASSERT(cpu >= 0 && cpu < mp_ncpus,
741	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
742	KASSERT(ri >= 0 && ri < P4_NPMCS,
743	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
744
745
746	if (ri == 0) {	/* TSC */
747#if	DEBUG
748		pc  = (struct p4_cpu *) pmc_pcpu[cpu];
749		phw = pc->pc_hwpmcs[ri];
750		pm  = phw->phw_pmc;
751
752		KASSERT(pm, ("[p4,%d] cpu=%d ri=%d not configured", __LINE__,
753			    cpu, ri));
754		KASSERT(PMC_TO_CLASS(pm) == PMC_CLASS_TSC,
755		    ("[p4,%d] cpu=%d ri=%d not a TSC (%d)", __LINE__, cpu, ri,
756			PMC_TO_CLASS(pm)));
757		KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)),
758		    ("[p4,%d] TSC counter in non-counting mode", __LINE__));
759#endif
760		*v = rdtsc();
761		PMCDBG(MDP,REA,2, "p4-read -> %jx", *v);
762		return 0;
763	}
764
765	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
766	phw = pc->pc_hwpmcs[ri];
767	pd  = &p4_pmcdesc[ri];
768	pm  = phw->phw_pmc;
769
770	KASSERT(pm != NULL,
771	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
772		cpu, ri));
773
774	KASSERT(pd->pm_descr.pd_class == PMC_TO_CLASS(pm),
775	    ("[p4,%d] class mismatch pd %d != id class %d", __LINE__,
776		pd->pm_descr.pd_class, PMC_TO_CLASS(pm)));
777
778	mode = PMC_TO_MODE(pm);
779
780	PMCDBG(MDP,REA,1, "p4-read cpu=%d ri=%d mode=%d", cpu, ri, mode);
781
782	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
783	    ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class));
784
785	tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr);
786
787	if (PMC_IS_VIRTUAL_MODE(mode)) {
788		if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit overflow */
789			tmp += (P4_PERFCTR_MASK + 1) -
790			    P4_PCPU_HW_VALUE(pc,ri,cpu);
791		else
792			tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
793		tmp += P4_PCPU_PMC_VALUE(pc,ri,cpu);
794	}
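
	/*
	 * Worked example of the wrap adjustment above, for the 40-bit
	 * counter: a saved start value of 0xFFFFFFFFF0 and a current
	 * reading of 0x10 yield
	 * (P4_PERFCTR_MASK + 1) - 0xFFFFFFFFF0 + 0x10 = 0x20 events
	 * counted across the wrap.
	 */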
795
796	if (PMC_IS_SAMPLING_MODE(mode)) /* undo transformation */
797		*v = P4_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp);
798	else
799		*v = tmp;
800
801	PMCDBG(MDP,REA,2, "p4-read -> %jx", *v);
802	return 0;
803}
804
805/*
806 * Write a PMC
807 */
808
809static int
810p4_write_pmc(int cpu, int ri, pmc_value_t v)
811{
812	enum pmc_mode mode;
813	struct pmc *pm;
814	struct p4_cpu *pc;
815	const struct pmc_hw *phw;
816	const struct p4pmc_descr *pd;
817
818	KASSERT(cpu >= 0 && cpu < mp_ncpus,
819	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
820	KASSERT(ri >= 0 && ri < P4_NPMCS,
821	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
822
823
824	/*
825	 * The P4's TSC register is writeable, but we don't allow a
826	 * write as changing the TSC's value could interfere with
827	 * timekeeping and other system functions.
828	 */
829	if (ri == 0) {
830#if	DEBUG
831		pc  = (struct p4_cpu *) pmc_pcpu[cpu];
832		phw = pc->pc_hwpmcs[ri];
833		pm  = phw->phw_pmc;
834		KASSERT(pm, ("[p4,%d] cpu=%d ri=%d not configured", __LINE__,
835			    cpu, ri));
836		KASSERT(PMC_TO_CLASS(pm) == PMC_CLASS_TSC,
837		    ("[p4,%d] cpu=%d ri=%d not a TSC (%d)", __LINE__,
838			cpu, ri, PMC_TO_CLASS(pm)));
839#endif
840		return 0;
841	}
842
843	/* Shared PMCs */
844	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
845	phw = pc->pc_hwpmcs[ri];
846	pm  = phw->phw_pmc;
847	pd  = &p4_pmcdesc[ri];
848
849	KASSERT(pm != NULL,
850	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
851		cpu, ri));
852
853	mode = PMC_TO_MODE(pm);
854
855	PMCDBG(MDP,WRI,1, "p4-write cpu=%d ri=%d mode=%d v=%jx", cpu, ri,
856	    mode, v);
857
858	/*
859	 * write the PMC value to the register/saved value: for
860	 * sampling mode PMCs, the value to be programmed into the PMC
861	 * counter is -(C+1) where 'C' is the requested sample rate.
862	 */
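	/*
	 * For example, with a requested sample rate C = 1000 the
	 * counter is loaded with the 40-bit representation of
	 * -(C+1), i.e. 2^40 - 1001 = 0xFFFFFFFC17, so it overflows
	 * and raises an interrupt after roughly 1000 further events.
	 */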
863	if (PMC_IS_SAMPLING_MODE(mode))
864		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(v);
865
866	if (PMC_IS_SYSTEM_MODE(mode))
867		wrmsr(pd->pm_pmc_msr, v);
868	else
869		P4_PCPU_PMC_VALUE(pc,ri,cpu) = v;
870
871	return 0;
872}
873
874/*
875 * Configure a PMC 'pm' on the given CPU and row-index.
876 *
877 * 'pm' may be NULL to indicate de-configuration.
878 *
879 * On HTT systems, a PMC may get configured twice, once for each
880 * "logical" CPU.  We track this using the CFGFLAGS field of the
881 * per-cpu state; this field is a bit mask with one bit each for
882 * logical CPUs 0 & 1.
883 */
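
/*
 * For example, on an HTT system, configuring a PMC from CPU 2 (an
 * even-numbered, 'physical' CPU) sets bit 0x1 of CFGFLAGS while its
 * partner CPU 3 sets bit 0x2; the hardware pointer is dropped only
 * after both bits have been cleared again.
 */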
884
885static int
886p4_config_pmc(int cpu, int ri, struct pmc *pm)
887{
888	struct pmc_hw *phw;
889	struct p4_cpu *pc;
890	int cfgflags, cpuflag;
891
892	KASSERT(cpu >= 0 && cpu < mp_ncpus,
893	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
894	KASSERT(ri >= 0 && ri < P4_NPMCS,
895	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
896
897	PMCDBG(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm);
898
899	if (ri == 0) {		/* TSC */
900		pc = (struct p4_cpu *) pmc_pcpu[cpu];
901		phw = pc->pc_hwpmcs[ri];
902
903		KASSERT(pm == NULL || phw->phw_pmc == NULL,
904		    ("[p4,%d] hwpmc doubly config'ed", __LINE__));
905		phw->phw_pmc = pm;
906		return 0;
907	}
908
909	/* Shared PMCs */
910
911	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
912	phw = pc->pc_hwpmcs[ri];
913
914	KASSERT(pm == NULL || phw->phw_pmc == NULL ||
915	    (p4_system_has_htt && phw->phw_pmc == pm),
916	    ("[p4,%d] hwpmc not unconfigured before re-config", __LINE__));
917
918	mtx_lock_spin(&pc->pc_mtx);
919	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
920
921	KASSERT(cfgflags >= 0 && cfgflags <= 3,
922	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
923		cfgflags, cpu, ri));
924
925	KASSERT(cfgflags == 0 || phw->phw_pmc,
926	    ("[p4,%d] cpu=%d ri=%d pmc configured with zero cfg count",
927		__LINE__, cpu, ri));
928
929	cpuflag = P4_CPU_TO_FLAG(cpu);
930
931	if (pm) {		/* config */
932		if (cfgflags == 0)
933			phw->phw_pmc = pm;
934
935		KASSERT(phw->phw_pmc == pm,
936		    ("[p4,%d] cpu=%d ri=%d config %p != hw %p",
937			__LINE__, cpu, ri, pm, phw->phw_pmc));
938
939		cfgflags |= cpuflag;
940	} else {		/* unconfig */
941		cfgflags &= ~cpuflag;
942
943		if (cfgflags == 0)
944			phw->phw_pmc = NULL;
945	}
946
947	KASSERT(cfgflags >= 0 && cfgflags <= 3,
948	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
949		cfgflags, cpu, ri));
950
951	P4_PCPU_SET_CFGFLAGS(pc,ri,cfgflags);
952
953	mtx_unlock_spin(&pc->pc_mtx);
954
955	return 0;
956}
957
958/*
959 * Retrieve a configured PMC pointer from hardware state.
960 */
961
962static int
963p4_get_config(int cpu, int ri, struct pmc **ppm)
964{
965	struct p4_cpu *pc;
966	struct pmc_hw *phw;
967	int cfgflags;
968
969	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
970	phw = pc->pc_hwpmcs[ri];
971
972	mtx_lock_spin(&pc->pc_mtx);
973	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
974	mtx_unlock_spin(&pc->pc_mtx);
975
976	if (cfgflags & P4_CPU_TO_FLAG(cpu))
977		*ppm = phw->phw_pmc; /* PMC config'ed on this CPU */
978	else
979		*ppm = NULL;
980
981	return 0;
982}
983
984/*
985 * Allocate a PMC.
986 *
987 * The allocation strategy differs between HTT and non-HTT systems.
988 *
989 * The non-HTT case:
990 *   - Given the desired event and the PMC row-index, lookup the
991 *   list of valid ESCRs for the event.
992 *   - For each valid ESCR:
993 *     - Check if the ESCR is free and the ESCR row is in a compatible
994 *       mode (i.e., system or process))
995 *     - Check if the ESCR is usable with a P4 PMC at the desired row-index.
996 *   If everything matches, we determine the appropriate bit values for the
997 *   ESCR and CCCR registers.
998 *
999 * The HTT case:
1000 *
1001 * - Process mode PMCs require special care.  The FreeBSD scheduler could
1002 *   schedule any two processes on the same physical CPU.  We need to ensure
1003 *   that a given PMC row-index is never allocated to two different
1004 *   PMCs owned by different user-processes.
1005 *   This is ensured by always allocating a PMC from a 'FREE' PMC row
1006 *   if the system has HTT active.
1007 * - A similar check needs to be done for ESCRs; we do not want two PMCs
1008 *   using the same ESCR to be scheduled at the same time.  Thus ESCR
1009 *   allocation is also restricted to FREE rows if the system has HTT
1010 *   enabled.
1011 * - Thirdly, some events are 'thread independent' (TI) in Intel's terminology, i.e.,
1012 *   the PMC hardware cannot distinguish between events caused by
1013 *   different logical CPUs.  This makes it impossible to assign events
1014 *   to a given thread of execution.  If the system has HTT enabled,
1015 *   these events are not allowed for process-mode PMCs.
1016 */
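
/*
 * Example walk-through: allocating PMC_EV_P4_INSTR_RETIRED on the
 * PMC named IQ_COUNTER0 offers the candidate ESCRs CRU_ESCR0 and
 * CRU_ESCR1 (see p4_events[]).  CRU_ESCR0 lists IQ_COUNTER0 in
 * p4_escrs[], so the pair matches; CRU_ESCR1 would match only
 * IQ_COUNTER{2,3,5}.
 */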
1017
1018static int
1019p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
1020    const struct pmc_op_pmcallocate *a)
1021{
1022	int found, n, m;
1023	uint32_t caps, cccrvalue, escrvalue, tflags;
1024	enum pmc_p4escr escr;
1025	struct p4_cpu *pc;
1026	struct p4_event_descr *pevent;
1027	const struct p4pmc_descr *pd;
1028
1029	KASSERT(cpu >= 0 && cpu < mp_ncpus,
1030	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
1031	KASSERT(ri >= 0 && ri < P4_NPMCS,
1032	    ("[p4,%d] illegal row-index value %d", __LINE__, ri));
1033
1034	pd = &p4_pmcdesc[ri];
1035
1036	PMCDBG(MDP,ALL,1, "p4-allocate ri=%d class=%d pmccaps=0x%x "
1037	    "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps,
1038	    pm->pm_caps);
1039
1040	/* check class */
1041	if (pd->pm_descr.pd_class != a->pm_class)
1042		return EINVAL;
1043
1044	/* check requested capabilities */
1045	caps = a->pm_caps;
1046	if ((pd->pm_descr.pd_caps & caps) != caps)
1047		return EPERM;
1048
1049	if (pd->pm_descr.pd_class == PMC_CLASS_TSC) {
1050		/* TSC's are always allocated in system-wide counting mode */
1051		if (a->pm_ev != PMC_EV_TSC_TSC ||
1052		    a->pm_mode != PMC_MODE_SC)
1053			return EINVAL;
1054		return 0;
1055	}
1056
1057	/*
1058	 * If the system has HTT enabled, and the desired allocation
1059	 * mode is process-private, and the PMC row disposition is not
1060	 * FREE (0), decline the allocation.
1061	 */
1062
1063	if (p4_system_has_htt &&
1064	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
1065	    pmc_getrowdisp(ri) != 0)
1066		return EBUSY;
1067
1068	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
1069	    ("[p4,%d] unknown PMC class %d", __LINE__,
1070		pd->pm_descr.pd_class));
1071
1072	if (pm->pm_event < PMC_EV_P4_FIRST ||
1073	    pm->pm_event > PMC_EV_P4_LAST)
1074		return EINVAL;
1075
1076	if ((pevent = p4_find_event(pm->pm_event)) == NULL)
1077		return ESRCH;
1078
1079	PMCDBG(MDP,ALL,2, "pevent={ev=%d,escrsel=0x%x,cccrsel=0x%x,isti=%d}",
1080	    pevent->pm_event, pevent->pm_escr_eventselect,
1081	    pevent->pm_cccr_select, pevent->pm_is_ti_event);
1082
1083	/*
1084	 * Some PMC events are 'thread independent' and therefore
1085	 * cannot be used for process-private modes if HTT is being
1086	 * used.
1087	 */
1088
1089	if (P4_EVENT_IS_TI(pevent) &&
1090	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
1091	    p4_system_has_htt)
1092		return EINVAL;
1093
1094	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
1095
1096	found   = 0;
1097
1098	/* look for a suitable ESCR for this event */
1099	for (n = 0; n < P4_MAX_ESCR_PER_EVENT && !found; n++) {
1100		if ((escr = pevent->pm_escrs[n]) == P4_ESCR_NONE)
1101			break;	/* out of ESCRs */
1102		/*
1103		 * Check ESCR row disposition.
1104		 *
1105		 * If the request is for a system-mode PMC, then the
1106		 * ESCR row should not be in process-virtual mode, and
1107		 * should also be free on the current CPU.
1108		 */
1109
1110		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1111		    if (P4_ESCR_ROW_DISP_IS_THREAD(escr) ||
1112			pc->pc_escrs[escr] != P4_INVALID_PMC_INDEX)
1113			    continue;
1114		}
1115
1116		/*
1117		 * If the request is for a process-virtual PMC, and if
1118		 * HTT is not enabled, we can use an ESCR row that is
1119		 * either FREE or already in process mode.
1120		 *
1121		 * If HTT is enabled, then we need to ensure that a
1122		 * given ESCR is never allocated to two PMCs that
1123		 * could run simultaneously on the two logical CPUs of
1124		 * a CPU package.  We ensure this by only allocating
1125		 * ESCRs from rows marked as 'FREE'.
1126		 */
1127
1128		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
1129			if (p4_system_has_htt) {
1130				if (!P4_ESCR_ROW_DISP_IS_FREE(escr))
1131					continue;
1132			} else
1133				if (P4_ESCR_ROW_DISP_IS_STANDALONE(escr))
1134					continue;
1135		}
1136
1137		/*
1138		 * We found a suitable ESCR for this event.  Now check if
1139		 * this ESCR can work with the PMC at row-index 'ri'.
1140		 */
1141
1142		for (m = 0; m < P4_MAX_PMC_PER_ESCR; m++)
1143			if (p4_escrs[escr].pm_pmcs[m] == pd->pm_pmcnum) {
1144				found = 1;
1145				break;
1146			}
1147	}
1148
1149	if (found == 0)
1150		return ESRCH;
1151
1152	KASSERT((int) escr >= 0 && escr < P4_NESCR,
1153	    ("[p4,%d] illegal ESCR value %d", __LINE__, escr));
1154
1155	/* mark ESCR row mode */
1156	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1157		pc->pc_escrs[escr] = ri; /* mark ESCR as in use on this cpu */
1158		P4_ESCR_MARK_ROW_STANDALONE(escr);
1159	} else {
1160		KASSERT(pc->pc_escrs[escr] == P4_INVALID_PMC_INDEX,
1161		    ("[p4,%d] escr[%d] already in use", __LINE__, escr));
1162		P4_ESCR_MARK_ROW_THREAD(escr);
1163	}
1164
1165	pm->pm_md.pm_p4.pm_p4_escrmsr   = p4_escrs[escr].pm_escr_msr;
1166	pm->pm_md.pm_p4.pm_p4_escr      = escr;
1167
1168	cccrvalue = P4_CCCR_TO_ESCR_SELECT(pevent->pm_cccr_select);
1169	escrvalue = P4_ESCR_TO_EVENT_SELECT(pevent->pm_escr_eventselect);
1170
1171	/* CCCR fields */
1172	if (caps & PMC_CAP_THRESHOLD)
1173		cccrvalue |= (a->pm_md.pm_p4.pm_p4_cccrconfig &
1174		    P4_CCCR_THRESHOLD_MASK) | P4_CCCR_COMPARE;
1175
1176	if (caps & PMC_CAP_EDGE)
1177		cccrvalue |= P4_CCCR_EDGE;
1178
1179	if (caps & PMC_CAP_INVERT)
1180		cccrvalue |= P4_CCCR_COMPLEMENT;
1181
1182	if (p4_system_has_htt)
1183		cccrvalue |= a->pm_md.pm_p4.pm_p4_cccrconfig &
1184		    P4_CCCR_ACTIVE_THREAD_MASK;
1185	else			/* no HTT; thread field should be '11b' */
1186		cccrvalue |= P4_CCCR_TO_ACTIVE_THREAD(0x3);
1187
1188	if (caps & PMC_CAP_CASCADE)
1189		cccrvalue |= P4_CCCR_CASCADE;
1190
1191	/* On HTT systems the PMI T0 field may get moved to T1 at pmc start */
1192	if (caps & PMC_CAP_INTERRUPT)
1193		cccrvalue |= P4_CCCR_OVF_PMI_T0;
1194
1195	/* ESCR fields */
1196	if (caps & PMC_CAP_QUALIFIER)
1197		escrvalue |= a->pm_md.pm_p4.pm_p4_escrconfig &
1198		    P4_ESCR_EVENT_MASK_MASK;
1199	if (caps & PMC_CAP_TAGGING)
1200		escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig &
1201		    P4_ESCR_TAG_VALUE_MASK) | P4_ESCR_TAG_ENABLE;
1205
1206	/* HTT: T0_{OS,USR} bits may get moved to T1 at pmc start */
1207	tflags = 0;
1208	if (caps & PMC_CAP_SYSTEM)
1209		tflags |= P4_ESCR_T0_OS;
1210	if (caps & PMC_CAP_USER)
1211		tflags |= P4_ESCR_T0_USR;
1212	if (tflags == 0)
1213		tflags = (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
1214	escrvalue |= tflags;
1215
1216	pm->pm_md.pm_p4.pm_p4_cccrvalue = cccrvalue;
1217	pm->pm_md.pm_p4.pm_p4_escrvalue = escrvalue;
1218
1219	PMCDBG(MDP,ALL,2, "p4-allocate cccrsel=0x%x cccrval=0x%x "
1220	    "escr=%d escrmsr=0x%x escrval=0x%x", pevent->pm_cccr_select,
1221	    cccrvalue, escr, pm->pm_md.pm_p4.pm_p4_escrmsr, escrvalue);
1222
1223	return 0;
1224}
1225
1226/*
1227 * Release a PMC.
1228 */
1229
1230static int
1231p4_release_pmc(int cpu, int ri, struct pmc *pm)
1232{
1233	enum pmc_p4escr escr;
1234	struct pmc_hw *phw;
1235	struct p4_cpu *pc;
1236
1237	if (p4_pmcdesc[ri].pm_descr.pd_class == PMC_CLASS_TSC)
1238		return 0;
1239
1240	escr = pm->pm_md.pm_p4.pm_p4_escr;
1241
1242	PMCDBG(MDP,REL,1, "p4-release cpu=%d ri=%d escr=%d", cpu, ri, escr);
1243
1244	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1245		pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
1246		phw = pc->pc_hwpmcs[ri];
1247
1248		KASSERT(phw->phw_pmc == NULL,
1249		    ("[p4,%d] releasing configured PMC ri=%d", __LINE__, ri));
1250
1251		P4_ESCR_UNMARK_ROW_STANDALONE(escr);
1252		KASSERT(pc->pc_escrs[escr] == ri,
1253		    ("[p4,%d] escr[%d] not allocated to ri %d", __LINE__,
1254			escr, ri));
1255	        pc->pc_escrs[escr] = P4_INVALID_PMC_INDEX; /* mark as free */
1256	} else
1257		P4_ESCR_UNMARK_ROW_THREAD(escr);
1258
1259	return 0;
1260}
1261
1262/*
1263 * Start a PMC
1264 */
1265
1266static int
1267p4_start_pmc(int cpu, int ri)
1268{
1269	int rc;
1270	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
1271	struct pmc *pm;
1272	struct p4_cpu *pc;
1273	struct pmc_hw *phw;
1274	struct p4pmc_descr *pd;
1275
1276	KASSERT(cpu >= 0 && cpu < mp_ncpus,
1277	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
1278	KASSERT(ri >= 0 && ri < P4_NPMCS,
1279	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
1280
1281	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
1282	phw = pc->pc_hwpmcs[ri];
1283	pm  = phw->phw_pmc;
1284	pd  = &p4_pmcdesc[ri];
1285
1286	KASSERT(pm != NULL,
1287	    ("[p4,%d] starting cpu%d,pmc%d with null pmc", __LINE__,
1288		cpu, ri));
1289
1290	PMCDBG(MDP,STA,1, "p4-start cpu=%d ri=%d", cpu, ri);
1291
1292	if (pd->pm_descr.pd_class == PMC_CLASS_TSC) /* TSC are always on */
1293		return 0;
1294
1295	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
1296	    ("[p4,%d] wrong PMC class %d", __LINE__,
1297		pd->pm_descr.pd_class));
1298
1299	/* retrieve the desired CCCR/ESCR values from the PMC */
1300	cccrvalue = pm->pm_md.pm_p4.pm_p4_cccrvalue;
1301	escrvalue = pm->pm_md.pm_p4.pm_p4_escrvalue;
1302	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;
1303
1304	/* extract and zero the logical processor selection bits */
1305	cccrtbits = cccrvalue & P4_CCCR_OVF_PMI_T0;
1306	escrtbits = escrvalue & (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
1307	cccrvalue &= ~P4_CCCR_OVF_PMI_T0;
1308	escrvalue &= ~(P4_ESCR_T0_OS|P4_ESCR_T0_USR);
1309
1310	if (pmc_cpu_is_logical(cpu)) { /* shift T0 bits to T1 position */
1311		cccrtbits <<= 1;
1312		escrtbits >>= 2;
1313	}
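
	/*
	 * E.g. shifting left once turns P4_CCCR_OVF_PMI_T0 into
	 * P4_CCCR_OVF_PMI_T1, and shifting right twice moves the
	 * T0_OS/T0_USR pair onto T1_OS/T1_USR, the bit positions
	 * reserved for the second logical CPU.
	 */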
1314
1315	/* start system mode PMCs directly */
1316	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1317		wrmsr(escrmsr, escrvalue | escrtbits);
1318		wrmsr(pd->pm_cccr_msr, cccrvalue | cccrtbits | P4_CCCR_ENABLE);
1319		return 0;
1320	}
1321
1322	/*
1323	 * Thread mode PMCs
1324	 *
1325	 * On HTT machines, the same PMC could be scheduled on the
1326	 * same physical CPU twice (once for each logical CPU), for
1327	 * example, if two threads of a multi-threaded process get
1328	 * scheduled on the same CPU.
1329	 *
1330	 */
1331
1332	mtx_lock_spin(&pc->pc_mtx);
1333
1334	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);
1335	KASSERT(rc == 0 || rc == 1,
1336	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
1337		rc));
1338
1339	if (rc == 0) {		/* 1st CPU and the non-HTT case */
1340
1341		KASSERT(P4_PMC_IS_STOPPED(pd->pm_cccr_msr),
1342		    ("[p4,%d] cpu=%d ri=%d cccr=0x%x not stopped", __LINE__,
1343			cpu, ri, pd->pm_cccr_msr));
1344
1345		/* write out the low 40 bits of the saved value to hardware */
1346		wrmsr(pd->pm_pmc_msr,
1347		    P4_PCPU_PMC_VALUE(pc,ri,cpu) & P4_PERFCTR_MASK);
1348
1349	} else if (rc == 1) {		/* 2nd CPU */
1350
1351		/*
1352		 * Stop the PMC and retrieve the CCCR and ESCR values
1353		 * from their MSRs, and turn on the additional T[0/1]
1354		 * bits for the 2nd CPU.
1355		 */
1356
1357		cccrvalue = rdmsr(pd->pm_cccr_msr);
1358		wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);
1359
1360		/* check that the configuration bits read back match the PMC */
1361		KASSERT((cccrvalue & P4_CCCR_Tx_MASK) ==
1362		    (pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK),
1363		    ("[p4,%d] Extra CCCR bits cpu=%d rc=%d ri=%d "
1364			"cccr=0x%x PMC=0x%x", __LINE__, cpu, rc, ri,
1365			cccrvalue & P4_CCCR_Tx_MASK,
1366			pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK));
1367		KASSERT(cccrvalue & P4_CCCR_ENABLE,
1368		    ("[p4,%d] 2nd cpu rc=%d cpu=%d ri=%d not running",
1369			__LINE__, rc, cpu, ri));
1370		KASSERT((cccrvalue & cccrtbits) == 0,
1371		    ("[p4,%d] CCCR T0/T1 mismatch rc=%d cpu=%d ri=%d"
1372		     "cccrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri,
1373			cccrvalue, cccrtbits));
1374
1375		escrvalue = rdmsr(escrmsr);
1376
1377		KASSERT((escrvalue & P4_ESCR_Tx_MASK) ==
1378		    (pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK),
1379		    ("[p4,%d] Extra ESCR bits cpu=%d rc=%d ri=%d "
1380			"escr=0x%x pm=0x%x", __LINE__, cpu, rc, ri,
1381			escrvalue & P4_ESCR_Tx_MASK,
1382			pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK));
1383		KASSERT((escrvalue & escrtbits) == 0,
1384		    ("[p4,%d] ESCR T0/T1 mismatch rc=%d cpu=%d ri=%d "
1385		     "escrmsr=0x%x escrvalue=0x%x tbits=0x%x", __LINE__,
1386			rc, cpu, ri, escrmsr, escrvalue, escrtbits));
1387	}
1388
1389	/* Enable the correct bits for this CPU. */
1390	escrvalue |= escrtbits;
1391	cccrvalue |= cccrtbits | P4_CCCR_ENABLE;
1392
1393	/* Save HW value at the time of starting hardware */
1394	P4_PCPU_HW_VALUE(pc,ri,cpu) = rdmsr(pd->pm_pmc_msr);
1395
1396	/* Program the ESCR and CCCR and start the PMC */
1397	wrmsr(escrmsr, escrvalue);
1398	wrmsr(pd->pm_cccr_msr, cccrvalue);
1399
1400	++rc;
1401	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);
1402
1403	mtx_unlock_spin(&pc->pc_mtx);
1404
1405	PMCDBG(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d "
1406	    "escrmsr=0x%x escrvalue=0x%x cccr_config=0x%x v=%jx", cpu, rc,
1407	    ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue,
1408	    cccrvalue, P4_PCPU_HW_VALUE(pc,ri,cpu));
1409
1410	return 0;
1411}
1412
1413/*
1414 * Stop a PMC.
1415 */
1416
1417static int
1418p4_stop_pmc(int cpu, int ri)
1419{
1420	int rc;
1421	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
1422	struct pmc *pm;
1423	struct p4_cpu *pc;
1424	struct pmc_hw *phw;
1425	struct p4pmc_descr *pd;
1426	pmc_value_t tmp;
1427
1428	KASSERT(cpu >= 0 && cpu < mp_ncpus,
1429	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
1430	KASSERT(ri >= 0 && ri < P4_NPMCS,
1431	    ("[p4,%d] illegal row index %d", __LINE__, ri));
1432
1433	pd  = &p4_pmcdesc[ri];
1434
1435	if (pd->pm_descr.pd_class == PMC_CLASS_TSC)
1436		return 0;
1437
1438	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
1439	phw = pc->pc_hwpmcs[ri];
1440
1441	KASSERT(phw != NULL,
1442	    ("[p4,%d] null phw for cpu%d, ri%d", __LINE__, cpu, ri));
1443
1444	pm  = phw->phw_pmc;
1445
1446	KASSERT(pm != NULL,
1447	    ("[p4,%d] null pmc for cpu%d, ri%d", __LINE__, cpu, ri));
1448
1449	PMCDBG(MDP,STO,1, "p4-stop cpu=%d ri=%d", cpu, ri);
1450
1451	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1452		wrmsr(pd->pm_cccr_msr,
1453		    pm->pm_md.pm_p4.pm_p4_cccrvalue & ~P4_CCCR_ENABLE);
1454		return 0;
1455	}
1456
1457	/*
1458	 * Thread mode PMCs.
1459	 *
1460	 * On HTT machines, this PMC may be in use by two threads
1461	 * running on two logical CPUs.  Thus we look at the per-CPU
1462	 * 'runcount' field and only turn off the appropriate T0/T1
1463	 * bits (and keep the PMC running) if two logical CPUs were
1464	 * using the PMC.
1466	 */
1467
1468	/* bits to mask */
1469	cccrtbits = P4_CCCR_OVF_PMI_T0;
1470	escrtbits = P4_ESCR_T0_OS | P4_ESCR_T0_USR;
1471	if (pmc_cpu_is_logical(cpu)) {
1472		cccrtbits <<= 1;
1473		escrtbits >>= 2;
1474	}
1475
1476	mtx_lock_spin(&pc->pc_mtx);
1477
1478	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);
1479
1480	KASSERT(rc == 2 || rc == 1,
1481	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
1482		rc));
1483
1484	--rc;
1485
1486	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);
1487
1488	/* Stop this PMC */
1489	cccrvalue = rdmsr(pd->pm_cccr_msr);
1490	wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);
1491
1492	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;
1493	escrvalue = rdmsr(escrmsr);
1494
1495	/* The current CPU should be running on this PMC */
1496	KASSERT(escrvalue & escrtbits,
1497	    ("[p4,%d] ESCR T0/T1 mismatch cpu=%d rc=%d ri=%d escrmsr=0x%x "
1498		"escrvalue=0x%x tbits=0x%x", __LINE__, cpu, rc, ri, escrmsr,
1499		escrvalue, escrtbits));
1500	KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)) ||
1501	    (cccrvalue & cccrtbits),
1502	    ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x "
1503		"tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits));
1504
1505	/* get the current hardware reading */
1506	tmp = rdmsr(pd->pm_pmc_msr);
1507
1508	if (rc == 1) {		/* need to keep the PMC running */
1509		escrvalue &= ~escrtbits;
1510		cccrvalue &= ~cccrtbits;
1511		wrmsr(escrmsr, escrvalue);
1512		wrmsr(pd->pm_cccr_msr, cccrvalue);
1513	}
1514
1515	mtx_unlock_spin(&pc->pc_mtx);
1516
1517	PMCDBG(MDP,STO,2, "p4-stop cpu=%d rc=%d ri=%d escrmsr=0x%x "
1518	    "escrval=0x%x cccrval=0x%x v=%jx", cpu, rc, ri, escrmsr,
1519	    escrvalue, cccrvalue, tmp);
1520
1521	if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit counter overflow */
1522		tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu);
1523	else
1524		tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
1525
1526	P4_PCPU_PMC_VALUE(pc,ri,cpu) += tmp;
1527
1528	return 0;
1529}
1530
1531/*
1532 * Handle an interrupt.
1533 *
1534 * The hardware sets CCCR_OVF whenever a counter overflows, so the
1535 * handler examines all 18 CCCR registers for counters that overflowed.
1536 *
1537 * On HTT machines, the CCCR register is shared and will interrupt
1538 * both logical processors if so configured.  Thus multiple logical
1539 * CPUs could enter the NMI service routine at the same time.  These
1540 * will get serialized using a per-cpu spinlock dedicated for use in
1541 * the NMI handler.
1542 */
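
/*
 * For example, on the odd-numbered (logical) CPU of an HTT pair,
 * 'ovf_mask' below is P4_CCCR_OVF | P4_CCCR_OVF_PMI_T1, and a CCCR
 * is treated as having interrupted this CPU only when both bits are
 * set; P4_CCCR_OVF_PMI_T0 then indicates that the partner CPU may
 * also have been interrupted by the same (shared) counter.
 */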
1543
1544static int
1545p4_intr(int cpu, uintptr_t eip, int usermode)
1546{
1547	int i, did_interrupt, error, ri;
1548	uint32_t cccrval, ovf_mask, ovf_partner;
1549	struct p4_cpu *pc;
1550	struct pmc_hw *phw;
1551	struct pmc *pm;
1552	pmc_value_t v;
1553
1554	PMCDBG(MDP,INT, 1, "cpu=%d eip=%p um=%d", cpu, (void *) eip, usermode);
1555
1556	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)];
1557
1558	ovf_mask = pmc_cpu_is_logical(cpu) ?
1559	    P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0;
1560	ovf_mask |= P4_CCCR_OVF;
1561	if (p4_system_has_htt)
1562		ovf_partner = pmc_cpu_is_logical(cpu) ? P4_CCCR_OVF_PMI_T0 :
1563		    P4_CCCR_OVF_PMI_T1;
1564	else
1565		ovf_partner = 0;
1566	did_interrupt = 0;
1567
1568	if (p4_system_has_htt)
1569		P4_PCPU_ACQ_INTR_SPINLOCK(pc);
1570
1571	/*
1572	 * Loop through all CCCRs, looking for ones that have
1573	 * interrupted this CPU.
1574	 */
1575	for (i = 0; i < P4_NPMCS-1; i++) {
1576
1577		ri = i + 1;	/* row index */
1578
1579		/*
1580		 * Check if our partner logical CPU has already marked
1581		 * this PMC as having interrupted it.  If so, reset
1582		 * the flag and process the interrupt, but leave the
1583		 * hardware alone.
1584		 */
1585		if (p4_system_has_htt && P4_PCPU_GET_INTRFLAG(pc,ri)) {
1586			P4_PCPU_SET_INTRFLAG(pc,ri,0);
1587			did_interrupt = 1;
1588
1589			/*
1590			 * Ignore de-configured or stopped PMCs.
1591			 * Ignore PMCs not in sampling mode.
1592			 */
1593			phw = pc->pc_hwpmcs[ri];
1594			pm  = phw->phw_pmc;
1595			if (pm == NULL ||
1596			    pm->pm_state != PMC_STATE_RUNNING ||
1597			    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
1598				continue;
1599			}
1600			(void) pmc_process_interrupt(cpu, pm, eip, usermode);
1601			continue;
1602		}
1603
1604		/*
1605		 * Fresh interrupt.  Look for the CCCR_OVF bit
1606		 * and the OVF_Tx bit for this logical
1607		 * processor being set.
1608		 */
1609		cccrval = rdmsr(P4_CCCR_MSR_FIRST + i);
1610
1611		if ((cccrval & ovf_mask) != ovf_mask)
1612			continue;
1613
1614		/*
1615		 * If the other logical CPU would also have been
1616		 * interrupted due to the PMC being shared, record
1617		 * this fact in the per-cpu saved interrupt flag
1618		 * bitmask.
1619		 */
1620		if (p4_system_has_htt && (cccrval & ovf_partner))
1621			P4_PCPU_SET_INTRFLAG(pc, ri, 1);
1622
1623		v = rdmsr(P4_PERFCTR_MSR_FIRST + i);
1624
1625		PMCDBG(MDP,INT, 2, "ri=%d v=%jx", ri, v);
1626
1627		/* Stop the counter and reset the overflow bit */
1628		cccrval &= ~(P4_CCCR_OVF | P4_CCCR_ENABLE);
1629		wrmsr(P4_CCCR_MSR_FIRST + i, cccrval);
1630
1631		did_interrupt = 1;
1632
1633		/*
1634		 * Ignore de-configured or stopped PMCs.  Ignore PMCs
1635		 * not in sampling mode.
1636		 */
1637		phw = pc->pc_hwpmcs[ri];
1638		pm  = phw->phw_pmc;
1639
1640		if (pm == NULL ||
1641		    pm->pm_state != PMC_STATE_RUNNING ||
1642		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
1643			continue;
1644		}
1645
1646		/*
1647		 * Process the interrupt.  Re-enable the PMC if
1648		 * processing was successful.
1649		 */
1650		error = pmc_process_interrupt(cpu, pm, eip, usermode);
1651
1652		/*
1653		 * Only the first processor executing the NMI handler
1654		 * in a HTT pair will restart a PMC, and that too
1655		 * only if there were no errors.
1656		 */
1657		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(
1658			pm->pm_sc.pm_reloadcount);
1659		wrmsr(P4_PERFCTR_MSR_FIRST + i, v);
1660		if (error == 0)
1661			wrmsr(P4_CCCR_MSR_FIRST + i,
1662			    cccrval | P4_CCCR_ENABLE);
1663	}
1664
1665	/* allow the other CPU to proceed */
1666	if (p4_system_has_htt)
1667		P4_PCPU_REL_INTR_SPINLOCK(pc);
1668
1669	/*
1670	 * On Intel P4 CPUs, the PMC 'pcint' entry in the LAPIC gets
1671	 * masked when a PMC interrupts the CPU.  We need to unmask
1672	 * the interrupt source explicitly.
1673	 */
1674
1675	if (did_interrupt)
1676		pmc_x86_lapic_enable_pmc_interrupt();
1677
1678	atomic_add_int(did_interrupt ? &pmc_stats.pm_intr_processed :
1679	    &pmc_stats.pm_intr_ignored, 1);
1680
1681	return did_interrupt;
1682}
1683
1684/*
1685 * Describe a CPU's PMC state.
1686 */
1687
1688static int
1689p4_describe(int cpu, int ri, struct pmc_info *pi,
1690    struct pmc **ppmc)
1691{
1692	int error;
1693	size_t copied;
1694	struct pmc_hw *phw;
1695	const struct p4pmc_descr *pd;
1696
1697	KASSERT(cpu >= 0 && cpu < mp_ncpus,
1698	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
1699	KASSERT(ri >= 0 && ri < P4_NPMCS,
1700	    ("[p4,%d] row-index %d out of range", __LINE__, ri));
1701
1702	PMCDBG(MDP,OPS,1,"p4-describe cpu=%d ri=%d", cpu, ri);
1703
1704	if (pmc_cpu_is_logical(cpu))
1705		return EINVAL;
1706
1707	phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
1708	pd  = &p4_pmcdesc[ri];
1709
1710	if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name,
1711		 PMC_NAME_MAX, &copied)) != 0)
1712		return error;
1713
1714	pi->pm_class = pd->pm_descr.pd_class;
1715
1716	if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) {
1717		pi->pm_enabled = TRUE;
1718		*ppmc          = phw->phw_pmc;
1719	} else {
1720		pi->pm_enabled = FALSE;
1721		*ppmc          = NULL;
1722	}
1723
1724	return 0;
1725}
1726
1727/*
1728 * Get MSR# for use with RDPMC.
1729 */
1730
1731static int
1732p4_get_msr(int ri, uint32_t *msr)
1733{
1734	KASSERT(ri >= 0 && ri < P4_NPMCS,
1735	    ("[p4,%d] ri %d out of range", __LINE__, ri));
1736
1737	*msr = p4_pmcdesc[ri].pm_pmc_msr - P4_PERFCTR_MSR_FIRST;
1738
1739	PMCDBG(MDP,OPS, 1, "ri=%d getmsr=0x%x", ri, *msr);
1740
1741	return 0;
1742}
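
/*
 * Example: for BPU_COUNTER0 the PERFCTR MSR is 0x300, so the value
 * reported above is 0 -- the counter index that userland would pass
 * to RDPMC (assuming P4_PERFCTR_MSR_FIRST is 0x300, the base of the
 * PERFCTR MSR range in p4_pmcdesc[]).
 */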
1743
1744
1745int
1746pmc_initialize_p4(struct pmc_mdep *pmc_mdep)
1747{
1748	struct p4_event_descr *pe;
1749
1750	KASSERT(strcmp(cpu_vendor, "GenuineIntel") == 0,
1751	    ("[p4,%d] Initializing non-intel processor", __LINE__));
1752
1753	PMCDBG(MDP,INI,1, "%s", "p4-initialize");
1754
1755	switch (pmc_mdep->pmd_cputype) {
1756	case PMC_CPU_INTEL_PIV:
1757
1758		pmc_mdep->pmd_npmc	    = P4_NPMCS;
1759		pmc_mdep->pmd_classes[1].pm_class = PMC_CLASS_P4;
1760		pmc_mdep->pmd_classes[1].pm_caps  = P4_PMC_CAPS;
1761		pmc_mdep->pmd_classes[1].pm_width = 40;
1762		pmc_mdep->pmd_nclasspmcs[1] = 18;
1763
1764		pmc_mdep->pmd_init    	    = p4_init;
1765		pmc_mdep->pmd_cleanup 	    = p4_cleanup;
1766		pmc_mdep->pmd_switch_in     = p4_switch_in;
1767		pmc_mdep->pmd_switch_out    = p4_switch_out;
1768		pmc_mdep->pmd_read_pmc 	    = p4_read_pmc;
1769		pmc_mdep->pmd_write_pmc     = p4_write_pmc;
1770		pmc_mdep->pmd_config_pmc    = p4_config_pmc;
1771		pmc_mdep->pmd_get_config    = p4_get_config;
1772		pmc_mdep->pmd_allocate_pmc  = p4_allocate_pmc;
1773		pmc_mdep->pmd_release_pmc   = p4_release_pmc;
1774		pmc_mdep->pmd_start_pmc     = p4_start_pmc;
1775		pmc_mdep->pmd_stop_pmc      = p4_stop_pmc;
1776		pmc_mdep->pmd_intr	    = p4_intr;
1777		pmc_mdep->pmd_describe      = p4_describe;
1778		pmc_mdep->pmd_get_msr  	    = p4_get_msr; /* i386 */
1779
1780		/* model specific munging */
1781		if ((cpu_id & 0xFFF) < 0xF27) {
1782
1783			/*
1784			 * On P4 and Xeon with CPUID < (Family 15,
1785			 * Model 2, Stepping 7), only one ESCR is
1786			 * available for the IOQ_ALLOCATION event.
1787			 */
1788
1789			pe = p4_find_event(PMC_EV_P4_IOQ_ALLOCATION);
1790			pe->pm_escrs[1] = P4_ESCR_NONE;
1791		}
1792
1793		break;
1794
1795	default:
1796		KASSERT(0,("[p4,%d] Unknown CPU type", __LINE__));
1797		return ENOSYS;
1798	}
1799
1800	return 0;
1801}
1802