/*-
 * Copyright (c) 2003-2005 Joseph Koshy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/hwpmc/hwpmc_piv.c 153728 2005-12-26 09:27:24Z jkoshy $");

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pmc.h>
#include <sys/pmckern.h>
#include <sys/smp.h>
#include <sys/systm.h>

#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>

/*
 * PENTIUM 4 SUPPORT
 *
 * The P4 has 18 PMCs, divided into 4 groups with 4,4,4 and 6 PMCs
 * respectively.  Each PMC comprises two model specific registers:
 * a counter configuration control register (CCCR) and a counter
 * register that holds the actual event counts.
 *
 * Configuring an event requires the use of one of 45 event selection
 * control registers (ESCR).  Events are associated with specific
 * ESCRs.  Each PMC group has a set of ESCRs it can use.
 *
 * - The BPU counter group (4 PMCs) can use the 16 ESCRs:
 *   BPU_ESCR{0,1}, IS_ESCR{0,1}, MOB_ESCR{0,1}, ITLB_ESCR{0,1},
 *   PMH_ESCR{0,1}, IX_ESCR{0,1}, FSB_ESCR{0,1}, BSU_ESCR{0,1}.
 *
 * - The MS counter group (4 PMCs) can use the 6 ESCRs: MS_ESCR{0,1},
 *   TC_ESCR{0,1}, TBPU_ESCR{0,1}.
 *
 * - The FLAME counter group (4 PMCs) can use the 10 ESCRs:
 *   FLAME_ESCR{0,1}, FIRM_ESCR{0,1}, SAAT_ESCR{0,1}, U2L_ESCR{0,1},
 *   DAC_ESCR{0,1}.
 *
 * - The IQ counter group (6 PMCs) can use the 13 ESCRs: IQ_ESCR{0,1},
 *   ALF_ESCR{0,1}, RAT_ESCR{0,1}, SSU_ESCR0, CRU_ESCR{0,1,2,3,4,5}.
 *
 * Even-numbered ESCRs can be used with counters 0, 1 and 4 (if
 * present) of a counter group.  Odd-numbered ESCRs can be used with
 * counters 2, 3 and 5 (if present) of a counter group.  The
 * 'p4_escrs[]' table describes these restrictions in a form that
 * function 'p4_allocate()' uses for making allocation decisions.
 *
 * SYSTEM-MODE AND THREAD-MODE ALLOCATION
 *
 * In addition to remembering the state of PMC rows
 * ('FREE','STANDALONE', or 'THREAD'), we similarly need to track the
 * state of ESCR rows.  If an ESCR is allocated to a system-mode PMC
 * on a CPU we cannot allocate it to a thread-mode PMC.  On a
 * multi-cpu (multiple physical CPUs) system, ESCR allocation on each
 * CPU is tracked by the pc_escrs[] array.
 *
 * Each system-mode PMC that is using an ESCR records its row-index in
 * the appropriate entry and system-mode allocation attempts check
 * that an ESCR is available using this array.  Process-mode PMCs do
 * not use the pc_escrs[] array, since the ESCR row itself would have
 * been marked as in 'THREAD' mode.
 *
 * HYPERTHREADING SUPPORT
 *
 * When HTT is enabled, the FreeBSD kernel treats the two 'logical'
 * cpus as independent CPUs and can schedule kernel threads on them
 * independently.  However, the two logical CPUs share the same set of
 * PMC resources.  We need to ensure that:
 * - PMCs that use the PMC_F_DESCENDANTS semantics are handled correctly,
 *   and,
 * - Threads of multi-threaded processes that get scheduled on the same
 *   physical CPU are handled correctly.
 *
 * HTT Detection
 *
 * Not all HTT capable systems will have HTT enabled.  We detect the
 * presence of HTT by detecting if 'p4_init()' was called for a secondary
 * CPU in a HTT pair.
 *
 * Note that hwpmc(4) cannot currently deal with a change in HTT status once
 * loaded.
 *
 * Handling HTT READ / WRITE / START / STOP
 *
 * PMC resources are shared across the CPUs in an HTT pair.  We
 * designate the lower numbered CPU in a HTT pair as the 'primary'
 * CPU.  In each primary CPU's state we keep track of a 'runcount'
 * which reflects the number of PMC-using processes that have been
 * scheduled on its secondary CPU.  Process-mode PMC operations will
 * actually 'start' or 'stop' hardware only if these are the first or
 * last processes respectively to use the hardware.  PMC values
 * written by a 'write' operation are saved and are transferred to
 * hardware at PMC 'start' time if the runcount is 0.  If the runcount
 * is greater than 0 at the time of a 'start' operation, we keep track
 * of the actual hardware value at the time of the 'start' operation
 * and use this to adjust the final readings at PMC 'stop' or 'read'
 * time.
 *
 * Execution sequences:
 *
 * Case 1:   CPUx   +...-		(no overlap)
 *	     CPUy         +...-
 *           RC   0 1   0 1   0
 *
 * Case 2:   CPUx   +........-		(partial overlap)
 * 	     CPUy       +........-
 *           RC   0 1   2    1   0
 *
 * Case 3:   CPUx   +..............-	(fully overlapped)
 *	     CPUy       +.....-
 *	     RC   0 1   2     1    0
 *
 *     Key:
 *     'CPU[xy]' : one of the two logical processors on a HTT CPU.
 *     'RC'      : run count (#threads per physical core).
 *     '+'       : point in time when a thread is put on a CPU.
 *     '-'       : point in time where a thread is taken off a CPU.
 *
 * Handling HTT CONFIG
 *
 * Different processes attached to the same PMC may get scheduled on
 * the two logical processors in the package.  We keep track of config
 * and de-config operations using the CFGFLAGS fields of the per-physical
 * cpu state.
 *
 * Handling TSCs
 *
 * TSCs are architectural state and each CPU in a HTT pair has its own
 * TSC register.
 */
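
/*
 * Illustrative sketch (not part of the driver) of the 40 bit delta
 * computation described above: 'hw0' is the hardware value saved at
 * 'start' time, 'hw1' the reading at 'read'/'stop' time, and the
 * result is what gets added to the software-maintained PMC value.
 */
#ifdef notdef
static pmc_value_t
p4_delta_sketch(pmc_value_t hw0, pmc_value_t hw1)
{
	if (hw1 < hw0)		/* the 40 bit counter wrapped around */
		return (hw1 + (P4_PERFCTR_MASK + 1) - hw0);
	return (hw1 - hw0);
}
#endif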

#define	P4_PMCS()				\
	P4_PMC(BPU_COUNTER0)			\
	P4_PMC(BPU_COUNTER1)			\
	P4_PMC(BPU_COUNTER2)			\
	P4_PMC(BPU_COUNTER3)			\
	P4_PMC(MS_COUNTER0)			\
	P4_PMC(MS_COUNTER1)			\
	P4_PMC(MS_COUNTER2)			\
	P4_PMC(MS_COUNTER3)			\
	P4_PMC(FLAME_COUNTER0)			\
	P4_PMC(FLAME_COUNTER1)			\
	P4_PMC(FLAME_COUNTER2)			\
	P4_PMC(FLAME_COUNTER3)			\
	P4_PMC(IQ_COUNTER0)			\
	P4_PMC(IQ_COUNTER1)			\
	P4_PMC(IQ_COUNTER2)			\
	P4_PMC(IQ_COUNTER3)			\
	P4_PMC(IQ_COUNTER4)			\
	P4_PMC(IQ_COUNTER5)			\
	P4_PMC(NONE)

enum pmc_p4pmc {
#undef	P4_PMC
#define	P4_PMC(N)	P4_PMC_##N ,
	P4_PMCS()
};

/*
 * P4 ESCR descriptors
 */

#define	P4_ESCRS()							\
    P4_ESCR(BSU_ESCR0,	0x3A0, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(BSU_ESCR1,	0x3A1, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(FSB_ESCR0,	0x3A2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(FSB_ESCR1,	0x3A3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(FIRM_ESCR0,	0x3A4, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(FIRM_ESCR1,	0x3A5, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(FLAME_ESCR0, 0x3A6, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(FLAME_ESCR1, 0x3A7, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(DAC_ESCR0,	0x3A8, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(DAC_ESCR1,	0x3A9, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(MOB_ESCR0,	0x3AA, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(MOB_ESCR1,	0x3AB, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(PMH_ESCR0,	0x3AC, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(PMH_ESCR1,	0x3AD, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(SAAT_ESCR0,	0x3AE, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(SAAT_ESCR1,	0x3AF, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(U2L_ESCR0,	0x3B0, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(U2L_ESCR1,	0x3B1, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(BPU_ESCR0,	0x3B2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(BPU_ESCR1,	0x3B3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(IS_ESCR0,	0x3B4, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(IS_ESCR1,	0x3B5, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(ITLB_ESCR0,	0x3B6, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(ITLB_ESCR1,	0x3B7, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(CRU_ESCR0,	0x3B8, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR1,	0x3B9, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(IQ_ESCR0,	0x3BA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(IQ_ESCR1,	0x3BB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(RAT_ESCR0,	0x3BC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(RAT_ESCR1,	0x3BD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(SSU_ESCR0,	0x3BE, IQ_COUNTER0, IQ_COUNTER2, IQ_COUNTER4)	\
    P4_ESCR(MS_ESCR0,	0x3C0, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(MS_ESCR1,	0x3C1, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(TBPU_ESCR0,	0x3C2, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(TBPU_ESCR1,	0x3C3, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(TC_ESCR0,	0x3C4, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(TC_ESCR1,	0x3C5, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(IX_ESCR0,	0x3C8, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(IX_ESCR1,	0x3C9, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(ALF_ESCR0,	0x3CA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(ALF_ESCR1,	0x3CB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(CRU_ESCR2,	0x3CC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR3,	0x3CD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(CRU_ESCR4,	0x3E0, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR5,	0x3E1, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(NONE,		~0,    NONE, NONE, NONE)

enum pmc_p4escr {
#define	P4_ESCR(N, MSR, P1, P2, P3)	P4_ESCR_##N ,
	P4_ESCRS()
#undef	P4_ESCR
};

struct pmc_p4escr_descr {
	const char	pm_escrname[PMC_NAME_MAX];
	u_short		pm_escr_msr;
	const enum pmc_p4pmc pm_pmcs[P4_MAX_PMC_PER_ESCR];
};

static struct pmc_p4escr_descr p4_escrs[] =
{
#define	P4_ESCR(N, MSR, P1, P2, P3)		\
	{					\
		.pm_escrname = #N,		\
		.pm_escr_msr = (MSR),		\
		.pm_pmcs =			\
		{				\
			P4_PMC_##P1,		\
			P4_PMC_##P2,		\
			P4_PMC_##P3		\
		}				\
	} ,

	P4_ESCRS()

#undef	P4_ESCR
};

/*
 * P4 Event descriptor
 */

struct p4_event_descr {
	const enum pmc_event pm_event;
	const uint32_t	pm_escr_eventselect;
	const uint32_t	pm_cccr_select;
	const char	pm_is_ti_event;
	enum pmc_p4escr	pm_escrs[P4_MAX_ESCR_PER_EVENT];
};

static struct p4_event_descr p4_events[] = {

#define	P4_EVDESCR(NAME, ESCREVENTSEL, CCCRSEL, TI_EVENT, ESCR0, ESCR1)	\
	{								\
		.pm_event            = PMC_EV_P4_##NAME,		\
		.pm_escr_eventselect = (ESCREVENTSEL),			\
		.pm_cccr_select      = (CCCRSEL),			\
		.pm_is_ti_event	     = (TI_EVENT),			\
		.pm_escrs            =					\
		{							\
			P4_ESCR_##ESCR0,				\
			P4_ESCR_##ESCR1					\
		}							\
	}

P4_EVDESCR(TC_DELIVER_MODE,	0x01, 0x01, TRUE,  TC_ESCR0,	TC_ESCR1),
P4_EVDESCR(BPU_FETCH_REQUEST,	0x03, 0x00, FALSE, BPU_ESCR0,	BPU_ESCR1),
P4_EVDESCR(ITLB_REFERENCE,	0x18, 0x03, FALSE, ITLB_ESCR0,	ITLB_ESCR1),
P4_EVDESCR(MEMORY_CANCEL,	0x02, 0x05, FALSE, DAC_ESCR0,	DAC_ESCR1),
P4_EVDESCR(MEMORY_COMPLETE,	0x08, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
P4_EVDESCR(LOAD_PORT_REPLAY,	0x04, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
P4_EVDESCR(STORE_PORT_REPLAY,	0x05, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
P4_EVDESCR(MOB_LOAD_REPLAY,	0x03, 0x02, FALSE, MOB_ESCR0,	MOB_ESCR1),
P4_EVDESCR(PAGE_WALK_TYPE,	0x01, 0x04, TRUE,  PMH_ESCR0,	PMH_ESCR1),
P4_EVDESCR(BSQ_CACHE_REFERENCE,	0x0C, 0x07, FALSE, BSU_ESCR0,	BSU_ESCR1),
P4_EVDESCR(IOQ_ALLOCATION,	0x03, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(IOQ_ACTIVE_ENTRIES,	0x1A, 0x06, FALSE, FSB_ESCR1,	NONE),
P4_EVDESCR(FSB_DATA_ACTIVITY,	0x17, 0x06, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(BSQ_ALLOCATION,	0x05, 0x07, FALSE, BSU_ESCR0,	NONE),
P4_EVDESCR(BSQ_ACTIVE_ENTRIES,	0x06, 0x07, FALSE, BSU_ESCR1,	NONE),
	/* BSQ_ACTIVE_ENTRIES inherits CPU specificity from BSQ_ALLOCATION */
P4_EVDESCR(SSE_INPUT_ASSIST,	0x34, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(PACKED_SP_UOP,	0x08, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(PACKED_DP_UOP,	0x0C, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(SCALAR_SP_UOP,	0x0A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(SCALAR_DP_UOP,	0x0E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(64BIT_MMX_UOP,	0x02, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(128BIT_MMX_UOP,	0x1A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(X87_FP_UOP,		0x04, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(X87_SIMD_MOVES_UOP,	0x2E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(GLOBAL_POWER_EVENTS,	0x13, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(TC_MS_XFER,		0x05, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
P4_EVDESCR(UOP_QUEUE_WRITES,	0x09, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
P4_EVDESCR(RETIRED_MISPRED_BRANCH_TYPE,
    				0x05, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
P4_EVDESCR(RETIRED_BRANCH_TYPE,	0x04, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
P4_EVDESCR(RESOURCE_STALL,	0x01, 0x01, FALSE, ALF_ESCR0,	ALF_ESCR1),
P4_EVDESCR(WC_BUFFER,		0x05, 0x05, TRUE,  DAC_ESCR0,	DAC_ESCR1),
P4_EVDESCR(B2B_CYCLES,		0x16, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(BNR,			0x08, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(SNOOP,		0x06, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(RESPONSE,		0x04, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(FRONT_END_EVENT,	0x08, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(EXECUTION_EVENT,	0x0C, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(REPLAY_EVENT,	0x09, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(INSTR_RETIRED,	0x02, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
P4_EVDESCR(UOPS_RETIRED,	0x01, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
P4_EVDESCR(UOP_TYPE,		0x02, 0x02, FALSE, RAT_ESCR0,	RAT_ESCR1),
P4_EVDESCR(BRANCH_RETIRED,	0x06, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(MISPRED_BRANCH_RETIRED, 0x03, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1),
P4_EVDESCR(X87_ASSIST,		0x03, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(MACHINE_CLEAR,	0x02, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3)

#undef	P4_EVDESCR
};

#define	P4_EVENT_IS_TI(E) ((E)->pm_is_ti_event == TRUE)

#define	P4_NEVENTS	(PMC_EV_P4_LAST - PMC_EV_P4_FIRST + 1)

/*
 * P4 PMC descriptors
 */

struct p4pmc_descr {
	struct pmc_descr pm_descr; 	/* common information */
	enum pmc_p4pmc	pm_pmcnum;	/* PMC number */
	uint32_t	pm_pmc_msr; 	/* PERFCTR MSR address */
	uint32_t	pm_cccr_msr;  	/* CCCR MSR address */
};

static struct p4pmc_descr p4_pmcdesc[P4_NPMCS] = {

	/*
	 * TSC descriptor
	 */

	{
		.pm_descr =
		{
			.pd_name  = "TSC",
			.pd_class = PMC_CLASS_TSC,
			.pd_caps  = PMC_CAP_READ | PMC_CAP_WRITE,
			.pd_width = 64
		},
		.pm_pmcnum   = ~0,
		.pm_cccr_msr = ~0,
		.pm_pmc_msr  = 0x10,
	},

	/*
	 * P4 PMCS
	 */

#define	P4_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM |  \
	PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \
	PMC_CAP_INVERT | PMC_CAP_QUALIFIER | PMC_CAP_PRECISE |            \
	PMC_CAP_TAGGING | PMC_CAP_CASCADE)

#define	P4_PMCDESCR(N, PMC, CCCR)			\
	{						\
		.pm_descr =				\
		{					\
			.pd_name = #N,			\
			.pd_class = PMC_CLASS_P4,	\
			.pd_caps = P4_PMC_CAPS,		\
			.pd_width = 40			\
		},					\
		.pm_pmcnum      = P4_PMC_##N,		\
		.pm_cccr_msr 	= (CCCR),		\
		.pm_pmc_msr	= (PMC)			\
	}

	P4_PMCDESCR(BPU_COUNTER0,	0x300,	0x360),
	P4_PMCDESCR(BPU_COUNTER1,	0x301,	0x361),
	P4_PMCDESCR(BPU_COUNTER2,	0x302,	0x362),
	P4_PMCDESCR(BPU_COUNTER3,	0x303,	0x363),
	P4_PMCDESCR(MS_COUNTER0,	0x304,	0x364),
	P4_PMCDESCR(MS_COUNTER1,	0x305,	0x365),
	P4_PMCDESCR(MS_COUNTER2,	0x306,	0x366),
	P4_PMCDESCR(MS_COUNTER3,	0x307,	0x367),
	P4_PMCDESCR(FLAME_COUNTER0,	0x308,	0x368),
	P4_PMCDESCR(FLAME_COUNTER1,	0x309,	0x369),
	P4_PMCDESCR(FLAME_COUNTER2,	0x30A,	0x36A),
	P4_PMCDESCR(FLAME_COUNTER3,	0x30B,	0x36B),
	P4_PMCDESCR(IQ_COUNTER0,	0x30C,	0x36C),
	P4_PMCDESCR(IQ_COUNTER1,	0x30D,	0x36D),
	P4_PMCDESCR(IQ_COUNTER2,	0x30E,	0x36E),
	P4_PMCDESCR(IQ_COUNTER3,	0x30F,	0x36F),
	P4_PMCDESCR(IQ_COUNTER4,	0x310,	0x370),
	P4_PMCDESCR(IQ_COUNTER5,	0x311,	0x371),

#undef	P4_PMCDESCR
};

/* HTT support */
#define	P4_NHTT					2 /* logical processors/chip */

static int p4_system_has_htt;

/*
 * Per-CPU data structure for P4 class CPUs
 *
 * [common stuff]
 * [19 struct pmc_hw pointers]
 * [19 struct pmc_hw structures]
 * [45 ESCRs status bytes]
 * [per-cpu spin mutex]
 * [19 flag fields for holding config flags and a runcount]
 * [19*2 hw value fields]	(Thread mode PMC support)
 *    or
 * [19*2 EIP values]		(Sampling mode PMCs)
 * [19*2 pmc value fields]	(Thread mode PMC support)
 */

struct p4_cpu {
	struct pmc_cpu	pc_common;
	struct pmc_hw	*pc_hwpmcs[P4_NPMCS];
	struct pmc_hw	pc_p4pmcs[P4_NPMCS];
	char		pc_escrs[P4_NESCR];
	struct mtx	pc_mtx;		/* spin lock */
	uint32_t	pc_intrflag;	/* NMI handler flags */
	unsigned int	pc_intrlock;	/* NMI handler spin lock */
	unsigned char	pc_flags[P4_NPMCS]; /* 4 bits each: {cfg,run}count */
	union {
		pmc_value_t pc_hw[P4_NPMCS * P4_NHTT];
		uintptr_t   pc_ip[P4_NPMCS * P4_NHTT];
	}		pc_si;
	pmc_value_t	pc_pmc_values[P4_NPMCS * P4_NHTT];
};

/*
 * A 'logical' CPU shares PMC resources with its partner 'physical'
 * CPU, except the TSC, which is architectural and hence separate.
 * The 'logical' CPU's descriptor thus has pointers to the physical
 * CPU's descriptor state except for the TSC (rowindex 0) which is
 * not shared.
 */

struct p4_logicalcpu {
	struct pmc_cpu	pc_common;
	struct pmc_hw	*pc_hwpmcs[P4_NPMCS];
	struct pmc_hw	pc_tsc;
};

#define	P4_PCPU_PMC_VALUE(PC,RI,CPU)	(PC)->pc_pmc_values[(RI)*((CPU) & 1)]
#define	P4_PCPU_HW_VALUE(PC,RI,CPU)	(PC)->pc_si.pc_hw[(RI)*((CPU) & 1)]
#define	P4_PCPU_SAVED_IP(PC,RI,CPU)	(PC)->pc_si.pc_ip[(RI)*((CPU) & 1)]

#define	P4_PCPU_GET_FLAGS(PC,RI,MASK)	((PC)->pc_flags[(RI)] & (MASK))
#define	P4_PCPU_SET_FLAGS(PC,RI,MASK,VAL)	do {	\
	char _tmp;					\
	_tmp = (PC)->pc_flags[(RI)];			\
	_tmp &= ~(MASK);				\
	_tmp |= (VAL) & (MASK);				\
	(PC)->pc_flags[(RI)] = _tmp;			\
} while (0)

#define	P4_PCPU_GET_RUNCOUNT(PC,RI)	P4_PCPU_GET_FLAGS(PC,RI,0x0F)
#define	P4_PCPU_SET_RUNCOUNT(PC,RI,V)	P4_PCPU_SET_FLAGS(PC,RI,0x0F,V)

#define	P4_PCPU_GET_CFGFLAGS(PC,RI)	(P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4)
#define	P4_PCPU_SET_CFGFLAGS(PC,RI,C)	P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) << 4))

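/*
 * Illustrative sketch (not part of the driver): the config flags and
 * the runcount share one byte of pc_flags[].  Starting from a zeroed
 * state, configuring both logical CPUs and then recording one running
 * process leaves the byte at 0x31:
 */
#ifdef notdef
static void
p4_flags_sketch(struct p4_cpu *pc, int ri)
{
	P4_PCPU_SET_CFGFLAGS(pc, ri, 0x3);	/* pc_flags[ri] == 0x30 */
	P4_PCPU_SET_RUNCOUNT(pc, ri, 1);	/* pc_flags[ri] == 0x31 */
	KASSERT(P4_PCPU_GET_CFGFLAGS(pc, ri) == 0x3, ("cfgflags"));
	KASSERT(P4_PCPU_GET_RUNCOUNT(pc, ri) == 0x1, ("runcount"));
}
#endif
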
#define	P4_CPU_TO_FLAG(C)		(P4_CPU_IS_HTT_SECONDARY(C) ? 0x2 : 0x1)

#define	P4_PCPU_GET_INTRFLAG(PC,I)	((PC)->pc_intrflag & (1 << (I)))
#define	P4_PCPU_SET_INTRFLAG(PC,I,V)	do {		\
		uint32_t __mask;			\
		__mask = 1 << (I);			\
		if ((V))				\
			(PC)->pc_intrflag |= __mask;	\
		else					\
			(PC)->pc_intrflag &= ~__mask;	\
	} while (0)

/*
 * A minimal spin lock implementation for use inside the NMI handler.
 *
 * We don't want to use a regular spin lock here, because curthread
 * may not be consistent at the time the handler is invoked.
 */
#define	P4_PCPU_ACQ_INTR_SPINLOCK(PC) do {				\
		while (!atomic_cmpset_acq_int(&(PC)->pc_intrlock, 0, 1)) \
			ia32_pause();					\
	} while (0)
#define	P4_PCPU_REL_INTR_SPINLOCK(PC) 					\
	atomic_store_rel_int(&(PC)->pc_intrlock, 0)

/* ESCR row disposition */
static int p4_escrdisp[P4_NESCR];

#define	P4_ESCR_ROW_DISP_IS_THREAD(E)		(p4_escrdisp[(E)] > 0)
#define	P4_ESCR_ROW_DISP_IS_STANDALONE(E)	(p4_escrdisp[(E)] < 0)
#define	P4_ESCR_ROW_DISP_IS_FREE(E)		(p4_escrdisp[(E)] == 0)

#define	P4_ESCR_MARK_ROW_STANDALONE(E) do {				\
	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
		    __LINE__));						\
	atomic_add_int(&p4_escrdisp[(E)], -1);				\
	KASSERT(p4_escrdisp[(E)] >= (-mp_ncpus), ("[p4,%d] row "	\
		"disposition error", __LINE__));			\
} while (0)

#define	P4_ESCR_UNMARK_ROW_STANDALONE(E) do {				\
	atomic_add_int(&p4_escrdisp[(E)], 1);				\
	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
		    __LINE__));						\
} while (0)

#define	P4_ESCR_MARK_ROW_THREAD(E) do {					 \
	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \
		    __LINE__));						 \
	atomic_add_int(&p4_escrdisp[(E)], 1);				 \
} while (0)

#define	P4_ESCR_UNMARK_ROW_THREAD(E) do {				 \
	atomic_add_int(&p4_escrdisp[(E)], -1);				 \
	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \
		    __LINE__));						 \
} while (0)

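/*
 * Example of the disposition bookkeeping above: on a two-CPU system,
 * marking an ESCR row STANDALONE on both CPUs drives p4_escrdisp[]
 * to -2, while a single thread-mode allocation instead takes it to
 * +1.  A row can switch to the other disposition only after its
 * count returns to zero.
 */
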
#define	P4_PMC_IS_STOPPED(cccr)	((rdmsr(cccr) & P4_CCCR_ENABLE) == 0)

#define	P4_CPU_IS_HTT_SECONDARY(cpu)					\
	(p4_system_has_htt ? ((cpu) & 1) : 0)
#define	P4_TO_HTT_PRIMARY(cpu) 						\
	(p4_system_has_htt ? ((cpu) & ~1) : (cpu))

#define	P4_CCCR_Tx_MASK	(~(P4_CCCR_OVF_PMI_T0|P4_CCCR_OVF_PMI_T1|	\
			     P4_CCCR_ENABLE|P4_CCCR_OVF))
#define	P4_ESCR_Tx_MASK	(~(P4_ESCR_T0_OS|P4_ESCR_T0_USR|P4_ESCR_T1_OS|	\
			     P4_ESCR_T1_USR))
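
/*
 * Example of the T0 -> T1 bit motion performed on the secondary
 * logical CPU at 'start'/'stop' time: the single CCCR PMI bit shifts
 * left by one (OVF_PMI_T0 -> OVF_PMI_T1), while the pair of ESCR
 * T0_{OS,USR} bits shifts right by two into the T1_{OS,USR}
 * positions; see p4_start_pmc() and p4_stop_pmc() below.
 */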

/*
 * support routines
 */

static struct p4_event_descr *
p4_find_event(enum pmc_event ev)
{
	int n;

	for (n = 0; n < P4_NEVENTS; n++)
		if (p4_events[n].pm_event == ev)
			break;
	if (n == P4_NEVENTS)
		return NULL;
	return &p4_events[n];
}

/*
 * Initialize per-cpu state
 */

static int
p4_init(int cpu)
{
	int n, phycpu;
	char *pescr;
	struct p4_cpu *pcs;
	struct p4_logicalcpu *plcs;
	struct pmc_hw *phw;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] insane cpu number %d", __LINE__, cpu));

	PMCDBG(MDP,INI,0, "p4-init cpu=%d logical=%d", cpu,
	    pmc_cpu_is_logical(cpu) != 0);

	/*
	 * The two CPUs in an HT pair share their per-cpu state.
	 *
	 * For HT capable CPUs, we assume that the two logical
	 * processors in the HT pair get two consecutive CPU ids
	 * starting with an even id #.
	 *
	 * The primary CPU (the even numbered CPU of the pair) would
	 * have been initialized prior to the initialization for the
	 * secondary.
	 */

	if (pmc_cpu_is_logical(cpu) && (cpu & 1)) {

		p4_system_has_htt = 1;

		phycpu = P4_TO_HTT_PRIMARY(cpu);
		pcs = (struct p4_cpu *) pmc_pcpu[phycpu];
		PMCDBG(MDP,INI,1, "p4-init cpu=%d phycpu=%d pcs=%p",
		    cpu, phycpu, pcs);
		KASSERT(pcs,
		    ("[p4,%d] Null Per-Cpu state cpu=%d phycpu=%d", __LINE__,
			cpu, phycpu));
		if (pcs == NULL) /* decline to init */
			return ENXIO;

		MALLOC(plcs, struct p4_logicalcpu *,
		    sizeof(struct p4_logicalcpu), M_PMC, M_WAITOK|M_ZERO);

		/* The TSC is architectural state and is not shared */
		plcs->pc_hwpmcs[0] = &plcs->pc_tsc;
		plcs->pc_tsc.phw_state = PMC_PHW_FLAG_IS_ENABLED |
		    PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(0) |
		    PMC_PHW_FLAG_IS_SHAREABLE;

		/* Other PMCs are shared with the physical CPU */
		for (n = 1; n < P4_NPMCS; n++)
			plcs->pc_hwpmcs[n] = pcs->pc_hwpmcs[n];

		pmc_pcpu[cpu] = (struct pmc_cpu *) plcs;
		return 0;
	}

	MALLOC(pcs, struct p4_cpu *, sizeof(struct p4_cpu), M_PMC,
	    M_WAITOK|M_ZERO);

	if (pcs == NULL)
		return ENOMEM;
	phw = pcs->pc_p4pmcs;

	for (n = 0; n < P4_NPMCS; n++, phw++) {
		phw->phw_state   = PMC_PHW_FLAG_IS_ENABLED |
		    PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n);
		phw->phw_pmc     = NULL;
		pcs->pc_hwpmcs[n] = phw;
	}

	/* Mark the TSC as shareable */
	pcs->pc_hwpmcs[0]->phw_state |= PMC_PHW_FLAG_IS_SHAREABLE;

	pescr = pcs->pc_escrs;
	for (n = 0; n < P4_NESCR; n++)
		*pescr++ = P4_INVALID_PMC_INDEX;
	pmc_pcpu[cpu] = (struct pmc_cpu *) pcs;

	mtx_init(&pcs->pc_mtx, "p4-pcpu", "pmc", MTX_SPIN);

	return 0;
}

/*
 * Destroy per-cpu state.
 */

static int
p4_cleanup(int cpu)
{
	struct p4_cpu *pcs;

	PMCDBG(MDP,INI,0, "p4-cleanup cpu=%d", cpu);

	if ((pcs = (struct p4_cpu *) pmc_pcpu[cpu]) == NULL)
		return 0;

	/*
	 * If the CPU is physical we need to teardown the
	 * full MD state.
	 */
	if (!P4_CPU_IS_HTT_SECONDARY(cpu))
		mtx_destroy(&pcs->pc_mtx);

	FREE(pcs, M_PMC);

	pmc_pcpu[cpu] = NULL;

	return 0;
}

/*
 * Context switch in.
 */

static int
p4_switch_in(struct pmc_cpu *pc, struct pmc_process *pp)
{
	(void) pc;

	PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp,
	    (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0);

	/* enable the RDPMC instruction */
	if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS)
		load_cr4(rcr4() | CR4_PCE);

	PMCDBG(MDP,SWI,2, "cr4=0x%x", (uint32_t) rcr4());

	return 0;
}

/*
 * Context switch out.
 */

static int
p4_switch_out(struct pmc_cpu *pc, struct pmc_process *pp)
{
	(void) pc;
	(void) pp;		/* can be null */

	PMCDBG(MDP,SWO,1, "pc=%p pp=%p", pc, pp);

	/* always disallow the RDPMC instruction */
	load_cr4(rcr4() & ~CR4_PCE);

	PMCDBG(MDP,SWO,2, "cr4=0x%x", (uint32_t) rcr4());

	return 0;
}

/*
 * Read a PMC
 */

static int
p4_read_pmc(int cpu, int ri, pmc_value_t *v)
{
	enum pmc_mode mode;
	struct p4pmc_descr *pd;
	struct pmc *pm;
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	pmc_value_t tmp;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index %d", __LINE__, ri));

	if (ri == 0) {	/* TSC */
#ifdef	DEBUG
		pc  = (struct p4_cpu *) pmc_pcpu[cpu];
		phw = pc->pc_hwpmcs[ri];
		pm  = phw->phw_pmc;

		KASSERT(pm, ("[p4,%d] cpu=%d ri=%d not configured", __LINE__,
			    cpu, ri));
		KASSERT(PMC_TO_CLASS(pm) == PMC_CLASS_TSC,
		    ("[p4,%d] cpu=%d ri=%d not a TSC (%d)", __LINE__, cpu, ri,
			PMC_TO_CLASS(pm)));
		KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)),
		    ("[p4,%d] TSC counter in non-counting mode", __LINE__));
#endif
		*v = rdtsc();
		PMCDBG(MDP,REA,2, "p4-read -> %jx", *v);
		return 0;
	}

	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];
	pd  = &p4_pmcdesc[ri];
	pm  = phw->phw_pmc;

	KASSERT(pm != NULL,
	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
		cpu, ri));

	KASSERT(pd->pm_descr.pd_class == PMC_TO_CLASS(pm),
	    ("[p4,%d] class mismatch pd %d != id class %d", __LINE__,
		pd->pm_descr.pd_class, PMC_TO_CLASS(pm)));

	mode = PMC_TO_MODE(pm);

	PMCDBG(MDP,REA,1, "p4-read cpu=%d ri=%d mode=%d", cpu, ri, mode);

	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
	    ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class));

	tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr);

	if (PMC_IS_VIRTUAL_MODE(mode)) {
		if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit overflow */
			tmp += (P4_PERFCTR_MASK + 1) -
			    P4_PCPU_HW_VALUE(pc,ri,cpu);
		else
			tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
		tmp += P4_PCPU_PMC_VALUE(pc,ri,cpu);
	}

	if (PMC_IS_SAMPLING_MODE(mode)) /* undo transformation */
		*v = P4_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp);
	else
		*v = tmp;

	PMCDBG(MDP,REA,2, "p4-read -> %jx", *v);
	return 0;
}

/*
 * Write a PMC
 */

static int
p4_write_pmc(int cpu, int ri, pmc_value_t v)
{
	enum pmc_mode mode;
	struct pmc *pm;
	struct p4_cpu *pc;
	const struct pmc_hw *phw;
	const struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index %d", __LINE__, ri));

	/*
	 * The P4's TSC register is writeable, but we don't allow a
	 * write as changing the TSC's value could interfere with
	 * timekeeping and other system functions.
	 */
	if (ri == 0) {
#ifdef	DEBUG
		pc  = (struct p4_cpu *) pmc_pcpu[cpu];
		phw = pc->pc_hwpmcs[ri];
		pm  = phw->phw_pmc;
		KASSERT(pm, ("[p4,%d] cpu=%d ri=%d not configured", __LINE__,
			    cpu, ri));
		KASSERT(PMC_TO_CLASS(pm) == PMC_CLASS_TSC,
		    ("[p4,%d] cpu=%d ri=%d not a TSC (%d)", __LINE__,
			cpu, ri, PMC_TO_CLASS(pm)));
#endif
		return 0;
	}

	/* Shared PMCs */
	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];
	pm  = phw->phw_pmc;
	pd  = &p4_pmcdesc[ri];

	KASSERT(pm != NULL,
	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
		cpu, ri));

	mode = PMC_TO_MODE(pm);

	PMCDBG(MDP,WRI,1, "p4-write cpu=%d ri=%d mode=%d v=%jx", cpu, ri,
	    mode, v);

	/*
	 * write the PMC value to the register/saved value: for
	 * sampling mode PMCs, the value to be programmed into the PMC
	 * counter is -(C+1) where 'C' is the requested sample rate.
	 */
	if (PMC_IS_SAMPLING_MODE(mode))
		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(v);

	if (PMC_IS_SYSTEM_MODE(mode))
		wrmsr(pd->pm_pmc_msr, v);
	else
		P4_PCPU_PMC_VALUE(pc,ri,cpu) = v;

	return 0;
}

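/*
 * Example of the sampling-mode round trip above (a sketch; the
 * transform macros live in the MD header): a 'write' of reload count
 * C programs the 40 bit counter with -(C+1), so the counter
 * overflows and interrupts after approximately C events, and a
 * subsequent 'read' applies the inverse transform to report a
 * reload count again.
 */
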
/*
 * Configure a PMC 'pm' on the given CPU and row-index.
 *
 * 'pm' may be NULL to indicate de-configuration.
 *
 * On HTT systems, a PMC may get configured twice, once for each
 * "logical" CPU.  We track this using the CFGFLAGS field of the
 * per-cpu state; this field is a bit mask with one bit each for
 * logical CPUs 0 & 1.
 */

static int
p4_config_pmc(int cpu, int ri, struct pmc *pm)
{
	struct pmc_hw *phw;
	struct p4_cpu *pc;
	int cfgflags, cpuflag;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index %d", __LINE__, ri));

	PMCDBG(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm);

	if (ri == 0) {		/* TSC */
		pc = (struct p4_cpu *) pmc_pcpu[cpu];
		phw = pc->pc_hwpmcs[ri];

		KASSERT(pm == NULL || phw->phw_pmc == NULL,
		    ("[p4,%d] hwpmc doubly config'ed", __LINE__));
		phw->phw_pmc = pm;
		return 0;
	}

	/* Shared PMCs */

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];

	KASSERT(pm == NULL || phw->phw_pmc == NULL ||
	    (p4_system_has_htt && phw->phw_pmc == pm),
	    ("[p4,%d] hwpmc not unconfigured before re-config", __LINE__));

	mtx_lock_spin(&pc->pc_mtx);
	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
	KASSERT(cfgflags >= 0 && cfgflags <= 3,
	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
		cfgflags, cpu, ri));

	KASSERT(cfgflags == 0 || phw->phw_pmc,
	    ("[p4,%d] cpu=%d ri=%d pmc configured with zero cfg count",
		__LINE__, cpu, ri));

	cpuflag = P4_CPU_TO_FLAG(cpu);

	if (pm) {		/* config */
		if (cfgflags == 0)
			phw->phw_pmc = pm;

		KASSERT(phw->phw_pmc == pm,
		    ("[p4,%d] cpu=%d ri=%d config %p != hw %p",
			__LINE__, cpu, ri, pm, phw->phw_pmc));

		cfgflags |= cpuflag;
	} else {		/* unconfig */
		cfgflags &= ~cpuflag;

		if (cfgflags == 0)
			phw->phw_pmc = NULL;
	}

	KASSERT(cfgflags >= 0 && cfgflags <= 3,
	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
		cfgflags, cpu, ri));

	P4_PCPU_SET_CFGFLAGS(pc,ri,cfgflags);

	mtx_unlock_spin(&pc->pc_mtx);

	return 0;
}

/*
 * Retrieve a configured PMC pointer from hardware state.
 */

static int
p4_get_config(int cpu, int ri, struct pmc **ppm)
{
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	int cfgflags;

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];

	mtx_lock_spin(&pc->pc_mtx);
	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
	mtx_unlock_spin(&pc->pc_mtx);

	if (cfgflags & P4_CPU_TO_FLAG(cpu))
		*ppm = phw->phw_pmc; /* PMC config'ed on this CPU */
	else
		*ppm = NULL;

	return 0;
}

/*
 * Allocate a PMC.
 *
 * The allocation strategy differs between HTT and non-HTT systems.
 *
 * The non-HTT case:
 *   - Given the desired event and the PMC row-index, lookup the
 *   list of valid ESCRs for the event.
 *   - For each valid ESCR:
 *     - Check if the ESCR is free and the ESCR row is in a compatible
 *       mode (i.e., system or process)
 *     - Check if the ESCR is usable with a P4 PMC at the desired row-index.
 *   If everything matches, we determine the appropriate bit values for the
 *   ESCR and CCCR registers.
 *
 * The HTT case:
 *
 * - Process mode PMCs require special care.  The FreeBSD scheduler could
 *   schedule any two processes on the same physical CPU.  We need to ensure
 *   that a given PMC row-index is never allocated to two different
 *   PMCs owned by different user-processes.
 *   This is ensured by always allocating a PMC from a 'FREE' PMC row
 *   if the system has HTT active.
 * - A similar check needs to be done for ESCRs; we do not want two PMCs
 *   using the same ESCR to be scheduled at the same time.  Thus ESCR
 *   allocation is also restricted to FREE rows if the system has HTT
 *   enabled.
 * - Thirdly, some events are 'thread-independent', i.e., the PMC
 *   hardware cannot distinguish between events caused by different
 *   logical CPUs.  This makes it impossible to assign events
 *   to a given thread of execution.  If the system has HTT enabled,
 *   these events are not allowed for process-mode PMCs.
 */

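/*
 * Worked example (assuming the pmc(3) userland view): a system-mode
 * allocation of the GLOBAL_POWER_EVENTS event binds one of
 * FSB_ESCR{0,1} on the target CPU together with a BPU-group counter;
 * a second system-mode allocation of the same event on that CPU can
 * still succeed using the remaining ESCR/counter pairing, after
 * which further requests fail with ESRCH.
 */
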
static int
p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
    const struct pmc_op_pmcallocate *a)
{
	int found, n, m;
	uint32_t caps, cccrvalue, escrvalue, tflags;
	enum pmc_p4escr escr;
	struct p4_cpu *pc;
	struct p4_event_descr *pevent;
	const struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index value %d", __LINE__, ri));

	pd = &p4_pmcdesc[ri];

	PMCDBG(MDP,ALL,1, "p4-allocate ri=%d class=%d pmccaps=0x%x "
	    "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps,
	    pm->pm_caps);

	/* check class */
	if (pd->pm_descr.pd_class != a->pm_class)
		return EINVAL;

	/* check requested capabilities */
	caps = a->pm_caps;
	if ((pd->pm_descr.pd_caps & caps) != caps)
		return EPERM;

	if (pd->pm_descr.pd_class == PMC_CLASS_TSC) {
		/* TSC's are always allocated in system-wide counting mode */
		if (a->pm_ev != PMC_EV_TSC_TSC ||
		    a->pm_mode != PMC_MODE_SC)
			return EINVAL;
		return 0;
	}

	/*
	 * If the system has HTT enabled, and the desired allocation
	 * mode is process-private, and the PMC row disposition is not
	 * FREE (0), decline the allocation.
	 */

	if (p4_system_has_htt &&
	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
	    pmc_getrowdisp(ri) != 0)
		return EBUSY;

	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
	    ("[p4,%d] unknown PMC class %d", __LINE__,
		pd->pm_descr.pd_class));

	if (pm->pm_event < PMC_EV_P4_FIRST ||
	    pm->pm_event > PMC_EV_P4_LAST)
		return EINVAL;

	if ((pevent = p4_find_event(pm->pm_event)) == NULL)
		return ESRCH;

	PMCDBG(MDP,ALL,2, "pevent={ev=%d,escrsel=0x%x,cccrsel=0x%x,isti=%d}",
	    pevent->pm_event, pevent->pm_escr_eventselect,
	    pevent->pm_cccr_select, pevent->pm_is_ti_event);

	/*
	 * Some PMC events are 'thread independent' and therefore
	 * cannot be used for process-private modes if HTT is being
	 * used.
	 */

	if (P4_EVENT_IS_TI(pevent) &&
	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
	    p4_system_has_htt)
		return EINVAL;

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];

	found   = 0;

	/* look for a suitable ESCR for this event */
	for (n = 0; n < P4_MAX_ESCR_PER_EVENT && !found; n++) {
		if ((escr = pevent->pm_escrs[n]) == P4_ESCR_NONE)
			break;	/* out of ESCRs */
		/*
		 * Check ESCR row disposition.
		 *
		 * If the request is for a system-mode PMC, then the
		 * ESCR row should not be in process-virtual mode, and
		 * should also be free on the current CPU.
		 */

		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		    if (P4_ESCR_ROW_DISP_IS_THREAD(escr) ||
			pc->pc_escrs[escr] != P4_INVALID_PMC_INDEX)
			    continue;
		}

		/*
		 * If the request is for a process-virtual PMC, and if
		 * HTT is not enabled, we can use an ESCR row that is
		 * either FREE or already in process mode.
		 *
		 * If HTT is enabled, then we need to ensure that a
		 * given ESCR is never allocated to two PMCs that
		 * could run simultaneously on the two logical CPUs of
		 * a CPU package.  We ensure this by only allocating
		 * ESCRs from rows marked as 'FREE'.
		 */

		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
			if (p4_system_has_htt) {
				if (!P4_ESCR_ROW_DISP_IS_FREE(escr))
					continue;
			} else
				if (P4_ESCR_ROW_DISP_IS_STANDALONE(escr))
					continue;
		}

		/*
		 * We found a suitable ESCR for this event.  Now check if
		 * this escr can work with the PMC at row-index 'ri'.
		 */

		for (m = 0; m < P4_MAX_PMC_PER_ESCR; m++)
			if (p4_escrs[escr].pm_pmcs[m] == pd->pm_pmcnum) {
				found = 1;
				break;
			}
	}

	if (found == 0)
		return ESRCH;

	KASSERT((int) escr >= 0 && escr < P4_NESCR,
	    ("[p4,%d] illegal ESCR value %d", __LINE__, escr));

	/* mark ESCR row mode */
	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		pc->pc_escrs[escr] = ri; /* mark ESCR as in use on this cpu */
		P4_ESCR_MARK_ROW_STANDALONE(escr);
	} else {
		KASSERT(pc->pc_escrs[escr] == P4_INVALID_PMC_INDEX,
		    ("[p4,%d] escr[%d] already in use", __LINE__, escr));
		P4_ESCR_MARK_ROW_THREAD(escr);
	}

	pm->pm_md.pm_p4.pm_p4_escrmsr   = p4_escrs[escr].pm_escr_msr;
	pm->pm_md.pm_p4.pm_p4_escr      = escr;

	cccrvalue = P4_CCCR_TO_ESCR_SELECT(pevent->pm_cccr_select);
	escrvalue = P4_ESCR_TO_EVENT_SELECT(pevent->pm_escr_eventselect);

	/* CCCR fields */
	if (caps & PMC_CAP_THRESHOLD)
		cccrvalue |= (a->pm_md.pm_p4.pm_p4_cccrconfig &
		    P4_CCCR_THRESHOLD_MASK) | P4_CCCR_COMPARE;

	if (caps & PMC_CAP_EDGE)
		cccrvalue |= P4_CCCR_EDGE;

	if (caps & PMC_CAP_INVERT)
		cccrvalue |= P4_CCCR_COMPLEMENT;

	if (p4_system_has_htt)
		cccrvalue |= a->pm_md.pm_p4.pm_p4_cccrconfig &
		    P4_CCCR_ACTIVE_THREAD_MASK;
	else			/* no HTT; thread field should be '11b' */
		cccrvalue |= P4_CCCR_TO_ACTIVE_THREAD(0x3);

	if (caps & PMC_CAP_CASCADE)
		cccrvalue |= P4_CCCR_CASCADE;

	/* On HTT systems the PMI T0 field may get moved to T1 at pmc start */
	if (caps & PMC_CAP_INTERRUPT)
		cccrvalue |= P4_CCCR_OVF_PMI_T0;

	/* ESCR fields */
	if (caps & PMC_CAP_QUALIFIER)
		escrvalue |= a->pm_md.pm_p4.pm_p4_escrconfig &
		    P4_ESCR_EVENT_MASK_MASK;
	if (caps & PMC_CAP_TAGGING)
		escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig &
		    P4_ESCR_TAG_VALUE_MASK) | P4_ESCR_TAG_ENABLE;

	/* HTT: T0_{OS,USR} bits may get moved to T1 at pmc start */
	tflags = 0;
	if (caps & PMC_CAP_SYSTEM)
		tflags |= P4_ESCR_T0_OS;
	if (caps & PMC_CAP_USER)
		tflags |= P4_ESCR_T0_USR;
	if (tflags == 0)
		tflags = (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
	escrvalue |= tflags;

	pm->pm_md.pm_p4.pm_p4_cccrvalue = cccrvalue;
	pm->pm_md.pm_p4.pm_p4_escrvalue = escrvalue;

	PMCDBG(MDP,ALL,2, "p4-allocate cccrsel=0x%x cccrval=0x%x "
	    "escr=%d escrmsr=0x%x escrval=0x%x", pevent->pm_cccr_select,
	    cccrvalue, escr, pm->pm_md.pm_p4.pm_p4_escrmsr, escrvalue);

	return 0;
}

/*
 * release a PMC.
 */

static int
p4_release_pmc(int cpu, int ri, struct pmc *pm)
{
	enum pmc_p4escr escr;
	struct pmc_hw *phw;
	struct p4_cpu *pc;

	if (p4_pmcdesc[ri].pm_descr.pd_class == PMC_CLASS_TSC)
		return 0;

	escr = pm->pm_md.pm_p4.pm_p4_escr;

	PMCDBG(MDP,REL,1, "p4-release cpu=%d ri=%d escr=%d", cpu, ri, escr);

	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
		phw = pc->pc_hwpmcs[ri];

		KASSERT(phw->phw_pmc == NULL,
		    ("[p4,%d] releasing configured PMC ri=%d", __LINE__, ri));

		P4_ESCR_UNMARK_ROW_STANDALONE(escr);
		KASSERT(pc->pc_escrs[escr] == ri,
		    ("[p4,%d] escr[%d] not allocated to ri %d", __LINE__,
			escr, ri));
		pc->pc_escrs[escr] = P4_INVALID_PMC_INDEX; /* mark as free */
	} else
		P4_ESCR_UNMARK_ROW_THREAD(escr);

	return 0;
}

/*
 * Start a PMC
 */

static int
p4_start_pmc(int cpu, int ri)
{
	int rc;
	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
	struct pmc *pm;
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index %d", __LINE__, ri));

	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];
	pm  = phw->phw_pmc;
	pd  = &p4_pmcdesc[ri];

	KASSERT(pm != NULL,
	    ("[p4,%d] starting cpu%d,pmc%d with null pmc", __LINE__,
		cpu, ri));

	PMCDBG(MDP,STA,1, "p4-start cpu=%d ri=%d", cpu, ri);

	if (pd->pm_descr.pd_class == PMC_CLASS_TSC) /* TSC are always on */
		return 0;

	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
	    ("[p4,%d] wrong PMC class %d", __LINE__,
		pd->pm_descr.pd_class));

	/* retrieve the desired CCCR/ESCR values from the PMC */
	cccrvalue = pm->pm_md.pm_p4.pm_p4_cccrvalue;
	escrvalue = pm->pm_md.pm_p4.pm_p4_escrvalue;
	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;

	/* extract and zero the logical processor selection bits */
	cccrtbits = cccrvalue & P4_CCCR_OVF_PMI_T0;
	escrtbits = escrvalue & (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
	cccrvalue &= ~P4_CCCR_OVF_PMI_T0;
	escrvalue &= ~(P4_ESCR_T0_OS|P4_ESCR_T0_USR);

	if (P4_CPU_IS_HTT_SECONDARY(cpu)) { /* shift T0 bits to T1 position */
		cccrtbits <<= 1;
		escrtbits >>= 2;
	}

	/* start system mode PMCs directly */
	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		wrmsr(escrmsr, escrvalue | escrtbits);
		wrmsr(pd->pm_cccr_msr, cccrvalue | cccrtbits | P4_CCCR_ENABLE);
		return 0;
	}

	/*
	 * Thread mode PMCs
	 *
	 * On HTT machines, the same PMC could be scheduled on the
	 * same physical CPU twice (once for each logical CPU), for
	 * example, if two threads of a multi-threaded process get
	 * scheduled on the same CPU.
	 */

	mtx_lock_spin(&pc->pc_mtx);

	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);
	KASSERT(rc == 0 || rc == 1,
	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
		rc));

	if (rc == 0) {		/* 1st CPU and the non-HTT case */

		KASSERT(P4_PMC_IS_STOPPED(pd->pm_cccr_msr),
		    ("[p4,%d] cpu=%d ri=%d cccr=0x%x not stopped", __LINE__,
			cpu, ri, pd->pm_cccr_msr));

		/* write out the low 40 bits of the saved value to hardware */
		wrmsr(pd->pm_pmc_msr,
		    P4_PCPU_PMC_VALUE(pc,ri,cpu) & P4_PERFCTR_MASK);

	} else if (rc == 1) {		/* 2nd CPU */

		/*
		 * Stop the PMC and retrieve the CCCR and ESCR values
		 * from their MSRs, and turn on the additional T[0/1]
		 * bits for the 2nd CPU.
		 */

		cccrvalue = rdmsr(pd->pm_cccr_msr);
		wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);

		/* check that the configuration bits read back match the PMC */
		KASSERT((cccrvalue & P4_CCCR_Tx_MASK) ==
		    (pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK),
		    ("[p4,%d] Extra CCCR bits cpu=%d rc=%d ri=%d "
			"cccr=0x%x PMC=0x%x", __LINE__, cpu, rc, ri,
			cccrvalue & P4_CCCR_Tx_MASK,
			pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK));
		KASSERT(cccrvalue & P4_CCCR_ENABLE,
		    ("[p4,%d] 2nd cpu rc=%d cpu=%d ri=%d not running",
			__LINE__, rc, cpu, ri));
		KASSERT((cccrvalue & cccrtbits) == 0,
		    ("[p4,%d] CCCR T0/T1 mismatch rc=%d cpu=%d ri=%d"
		     "cccrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri,
			cccrvalue, cccrtbits));

		escrvalue = rdmsr(escrmsr);

		KASSERT((escrvalue & P4_ESCR_Tx_MASK) ==
		    (pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK),
		    ("[p4,%d] Extra ESCR bits cpu=%d rc=%d ri=%d "
			"escr=0x%x pm=0x%x", __LINE__, cpu, rc, ri,
			escrvalue & P4_ESCR_Tx_MASK,
			pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK));
		KASSERT((escrvalue & escrtbits) == 0,
		    ("[p4,%d] ESCR T0/T1 mismatch rc=%d cpu=%d ri=%d "
		     "escrmsr=0x%x escrvalue=0x%x tbits=0x%x", __LINE__,
			rc, cpu, ri, escrmsr, escrvalue, escrtbits));
	}

	/* Enable the correct bits for this CPU. */
	escrvalue |= escrtbits;
	cccrvalue |= cccrtbits | P4_CCCR_ENABLE;

	/* Save HW value at the time of starting hardware */
	P4_PCPU_HW_VALUE(pc,ri,cpu) = rdmsr(pd->pm_pmc_msr);

	/* Program the ESCR and CCCR and start the PMC */
	wrmsr(escrmsr, escrvalue);
	wrmsr(pd->pm_cccr_msr, cccrvalue);

	++rc;
	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);

	mtx_unlock_spin(&pc->pc_mtx);

	PMCDBG(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d "
	    "escrmsr=0x%x escrvalue=0x%x cccr_config=0x%x v=%jx", cpu, rc,
	    ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue,
	    cccrvalue, P4_PCPU_HW_VALUE(pc,ri,cpu));

	return 0;
}

/*
 * Stop a PMC.
 */

static int
p4_stop_pmc(int cpu, int ri)
{
	int rc;
	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
	struct pmc *pm;
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	struct p4pmc_descr *pd;
	pmc_value_t tmp;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row index %d", __LINE__, ri));

	pd  = &p4_pmcdesc[ri];

	if (pd->pm_descr.pd_class == PMC_CLASS_TSC)
		return 0;

	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];

	KASSERT(phw != NULL,
	    ("[p4,%d] null phw for cpu%d, ri%d", __LINE__, cpu, ri));

	pm  = phw->phw_pmc;

	KASSERT(pm != NULL,
	    ("[p4,%d] null pmc for cpu%d, ri%d", __LINE__, cpu, ri));

	PMCDBG(MDP,STO,1, "p4-stop cpu=%d ri=%d", cpu, ri);

	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		wrmsr(pd->pm_cccr_msr,
		    pm->pm_md.pm_p4.pm_p4_cccrvalue & ~P4_CCCR_ENABLE);
		return 0;
	}

	/*
	 * Thread mode PMCs.
	 *
	 * On HTT machines, this PMC may be in use by two threads
	 * running on two logical CPUs.  Thus we look at the
	 * 'pm_runcount' field and only turn off the appropriate T0/T1
	 * bits (and keep the PMC running) if two logical CPUs were
	 * using the PMC.
	 */

	/* bits to mask */
	cccrtbits = P4_CCCR_OVF_PMI_T0;
	escrtbits = P4_ESCR_T0_OS | P4_ESCR_T0_USR;
	if (P4_CPU_IS_HTT_SECONDARY(cpu)) {
		cccrtbits <<= 1;
		escrtbits >>= 2;
	}

	mtx_lock_spin(&pc->pc_mtx);

	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);

	KASSERT(rc == 2 || rc == 1,
	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
		rc));

	--rc;

	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);

	/* Stop this PMC */
	cccrvalue = rdmsr(pd->pm_cccr_msr);
	wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);

	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;
	escrvalue = rdmsr(escrmsr);

	/* The current CPU should be running on this PMC */
	KASSERT(escrvalue & escrtbits,
	    ("[p4,%d] ESCR T0/T1 mismatch cpu=%d rc=%d ri=%d escrmsr=0x%x "
		"escrvalue=0x%x tbits=0x%x", __LINE__, cpu, rc, ri, escrmsr,
		escrvalue, escrtbits));
	KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)) ||
	    (cccrvalue & cccrtbits),
	    ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x "
		"tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits));

	/* get the current hardware reading */
	tmp = rdmsr(pd->pm_pmc_msr);

	if (rc == 1) {		/* need to keep the PMC running */
		escrvalue &= ~escrtbits;
		cccrvalue &= ~cccrtbits;
		wrmsr(escrmsr, escrvalue);
		wrmsr(pd->pm_cccr_msr, cccrvalue);
	}

	mtx_unlock_spin(&pc->pc_mtx);

	PMCDBG(MDP,STO,2, "p4-stop cpu=%d rc=%d ri=%d escrmsr=0x%x "
	    "escrval=0x%x cccrval=0x%x v=%jx", cpu, rc, ri, escrmsr,
	    escrvalue, cccrvalue, tmp);

	if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit counter overflow */
		tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu);
	else
		tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);

	P4_PCPU_PMC_VALUE(pc,ri,cpu) += tmp;

	return 0;
}

/*
 * Handle an interrupt.
 *
 * The hardware sets the CCCR_OVF bit whenever a counter overflow
 * occurs, so the handler examines all 18 CCCR registers, processing
 * the counters that have overflowed.
 *
 * On HTT machines, the CCCR register is shared and will interrupt
 * both logical processors if so configured.  Thus multiple logical
 * CPUs could enter the NMI service routine at the same time.  These
 * will get serialized using a per-cpu spinlock dedicated for use in
 * the NMI handler.
 */

static int
p4_intr(int cpu, uintptr_t eip, int usermode)
{
	int i, did_interrupt, error, ri;
	uint32_t cccrval, ovf_mask, ovf_partner;
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	struct pmc *pm;
	pmc_value_t v;

	PMCDBG(MDP,INT, 1, "cpu=%d eip=%p um=%d", cpu, (void *) eip, usermode);

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];

	ovf_mask = P4_CPU_IS_HTT_SECONDARY(cpu) ?
	    P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0;
	ovf_mask |= P4_CCCR_OVF;
	if (p4_system_has_htt)
		ovf_partner = P4_CPU_IS_HTT_SECONDARY(cpu) ? P4_CCCR_OVF_PMI_T0 :
		    P4_CCCR_OVF_PMI_T1;
	else
		ovf_partner = 0;
	did_interrupt = 0;

	if (p4_system_has_htt)
		P4_PCPU_ACQ_INTR_SPINLOCK(pc);

	/*
	 * Loop through all CCCRs, looking for ones that have
	 * interrupted this CPU.
	 */
	for (i = 0; i < P4_NPMCS-1; i++) {

		ri = i + 1;	/* row index */

		/*
		 * Check if our partner logical CPU has already marked
		 * this PMC as having interrupted it.  If so, reset
		 * the flag and process the interrupt, but leave the
		 * hardware alone.
		 */
		if (p4_system_has_htt && P4_PCPU_GET_INTRFLAG(pc,ri)) {
			P4_PCPU_SET_INTRFLAG(pc,ri,0);
			did_interrupt = 1;

			/*
			 * Ignore de-configured or stopped PMCs.
			 * Ignore PMCs not in sampling mode.
			 */
			phw = pc->pc_hwpmcs[ri];
			pm  = phw->phw_pmc;
			if (pm == NULL ||
			    pm->pm_state != PMC_STATE_RUNNING ||
			    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
				continue;
			}
			(void) pmc_process_interrupt(cpu, pm, eip, usermode);
			continue;
		}

		/*
		 * Fresh interrupt.  Look for the CCCR_OVF bit
		 * and the OVF_Tx bit for this logical
		 * processor being set.
		 */
		cccrval = rdmsr(P4_CCCR_MSR_FIRST + i);

		if ((cccrval & ovf_mask) != ovf_mask)
			continue;

		/*
		 * If the other logical CPU would also have been
		 * interrupted due to the PMC being shared, record
		 * this fact in the per-cpu saved interrupt flag
		 * bitmask.
		 */
		if (p4_system_has_htt && (cccrval & ovf_partner))
			P4_PCPU_SET_INTRFLAG(pc, ri, 1);

		v = rdmsr(P4_PERFCTR_MSR_FIRST + i);

		PMCDBG(MDP,INT, 2, "ri=%d v=%jx", ri, v);

		/* Stop the counter, and reset the overflow bit */
		cccrval &= ~(P4_CCCR_OVF | P4_CCCR_ENABLE);
		wrmsr(P4_CCCR_MSR_FIRST + i, cccrval);

		did_interrupt = 1;

		/*
		 * Ignore de-configured or stopped PMCs.  Ignore PMCs
		 * not in sampling mode.
		 */
		phw = pc->pc_hwpmcs[ri];
		pm  = phw->phw_pmc;

		if (pm == NULL ||
		    pm->pm_state != PMC_STATE_RUNNING ||
		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
			continue;
		}

		/*
		 * Process the interrupt.  Re-enable the PMC if
		 * processing was successful.
		 */
		error = pmc_process_interrupt(cpu, pm, eip, usermode);

		/*
		 * Only the first processor executing the NMI handler
		 * in a HTT pair will restart a PMC, and that too
		 * only if there were no errors.
		 */
		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(
			pm->pm_sc.pm_reloadcount);
		wrmsr(P4_PERFCTR_MSR_FIRST + i, v);
		if (error == 0)
			wrmsr(P4_CCCR_MSR_FIRST + i,
			    cccrval | P4_CCCR_ENABLE);
	}

	/* allow the other CPU to proceed */
	if (p4_system_has_htt)
		P4_PCPU_REL_INTR_SPINLOCK(pc);

	/*
	 * On Intel P4 CPUs, the PMC 'pcint' entry in the LAPIC gets
	 * masked when a PMC interrupts the CPU.  We need to unmask
	 * the interrupt source explicitly.
	 */

	if (did_interrupt)
		pmc_x86_lapic_enable_pmc_interrupt();

	atomic_add_int(did_interrupt ? &pmc_stats.pm_intr_processed :
	    &pmc_stats.pm_intr_ignored, 1);

	return did_interrupt;
}

/*
 * Describe a CPU's PMC state.
 */

static int
p4_describe(int cpu, int ri, struct pmc_info *pi,
    struct pmc **ppmc)
{
	int error;
	size_t copied;
	struct pmc_hw *phw;
	const struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] row-index %d out of range", __LINE__, ri));

	PMCDBG(MDP,OPS,1,"p4-describe cpu=%d ri=%d", cpu, ri);

	if (P4_CPU_IS_HTT_SECONDARY(cpu))
		return EINVAL;

	phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
	pd  = &p4_pmcdesc[ri];

	if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name,
		 PMC_NAME_MAX, &copied)) != 0)
		return error;

	pi->pm_class = pd->pm_descr.pd_class;

	if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) {
		pi->pm_enabled = TRUE;
		*ppmc          = phw->phw_pmc;
	} else {
		pi->pm_enabled = FALSE;
		*ppmc          = NULL;
	}

	return 0;
}

/*
 * Get MSR# for use with RDPMC.
 */

static int
p4_get_msr(int ri, uint32_t *msr)
{
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] ri %d out of range", __LINE__, ri));

	*msr = p4_pmcdesc[ri].pm_pmc_msr - P4_PERFCTR_MSR_FIRST;

	PMCDBG(MDP,OPS, 1, "ri=%d getmsr=0x%x", ri, *msr);

	return 0;
}
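
/*
 * Example: assuming P4_PERFCTR_MSR_FIRST is 0x300 (the MSR of
 * BPU_COUNTER0), the value returned above for BPU_COUNTER0 is 0,
 * i.e., the counter index a user-mode RDPMC instruction expects in
 * %ecx once CR4.PCE has been enabled by p4_switch_in().
 */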

int
pmc_initialize_p4(struct pmc_mdep *pmc_mdep)
{
	struct p4_event_descr *pe;

	KASSERT(strcmp(cpu_vendor, "GenuineIntel") == 0,
	    ("[p4,%d] Initializing non-intel processor", __LINE__));

	PMCDBG(MDP,INI,1, "%s", "p4-initialize");

	switch (pmc_mdep->pmd_cputype) {
	case PMC_CPU_INTEL_PIV:

		pmc_mdep->pmd_npmc	    = P4_NPMCS;
		pmc_mdep->pmd_classes[1].pm_class = PMC_CLASS_P4;
		pmc_mdep->pmd_classes[1].pm_caps  = P4_PMC_CAPS;
		pmc_mdep->pmd_classes[1].pm_width = 40;
		pmc_mdep->pmd_nclasspmcs[1] = 18;

		pmc_mdep->pmd_init    	    = p4_init;
		pmc_mdep->pmd_cleanup 	    = p4_cleanup;
		pmc_mdep->pmd_switch_in     = p4_switch_in;
		pmc_mdep->pmd_switch_out    = p4_switch_out;
		pmc_mdep->pmd_read_pmc 	    = p4_read_pmc;
		pmc_mdep->pmd_write_pmc     = p4_write_pmc;
		pmc_mdep->pmd_config_pmc    = p4_config_pmc;
		pmc_mdep->pmd_get_config    = p4_get_config;
		pmc_mdep->pmd_allocate_pmc  = p4_allocate_pmc;
		pmc_mdep->pmd_release_pmc   = p4_release_pmc;
		pmc_mdep->pmd_start_pmc     = p4_start_pmc;
		pmc_mdep->pmd_stop_pmc      = p4_stop_pmc;
		pmc_mdep->pmd_intr	    = p4_intr;
		pmc_mdep->pmd_describe      = p4_describe;
		pmc_mdep->pmd_get_msr  	    = p4_get_msr; /* i386 */

		/* model specific munging */
		if ((cpu_id & 0xFFF) < 0xF27) {

			/*
			 * On P4 and Xeon with CPUID < (Family 15,
			 * Model 2, Stepping 7), only one ESCR is
			 * available for the IOQ_ALLOCATION event.
			 */

			pe = p4_find_event(PMC_EV_P4_IOQ_ALLOCATION);
			pe->pm_escrs[1] = P4_ESCR_NONE;
		}

		break;

	default:
		KASSERT(0,("[p4,%d] Unknown CPU type", __LINE__));
		return ENOSYS;
	}

	return 0;
}