hwpmc_piv.c revision 157210
/*-
 * Copyright (c) 2003-2005 Joseph Koshy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/hwpmc/hwpmc_piv.c 157210 2006-03-28 14:09:21Z jkoshy $");

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pmc.h>
#include <sys/pmckern.h>
#include <sys/smp.h>
#include <sys/systm.h>

#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>

/*
 * PENTIUM 4 SUPPORT
 *
 * The P4 has 18 PMCs, divided into 4 groups of 4, 4, 4 and 6 PMCs
 * respectively.  Each PMC comprises two model specific registers:
 * a counter configuration control register (CCCR) and a counter
 * register that holds the actual event counts.
 *
 * Configuring an event requires the use of one of 45 event selection
 * control registers (ESCRs).  Events are associated with specific
 * ESCRs.  Each PMC group has a set of ESCRs it can use.
 *
 * - The BPU counter group (4 PMCs) can use the 16 ESCRs:
 *   BPU_ESCR{0,1}, IS_ESCR{0,1}, MOB_ESCR{0,1}, ITLB_ESCR{0,1},
 *   PMH_ESCR{0,1}, IX_ESCR{0,1}, FSB_ESCR{0,1}, BSU_ESCR{0,1}.
 *
 * - The MS counter group (4 PMCs) can use the 6 ESCRs: MS_ESCR{0,1},
 *   TC_ESCR{0,1}, TBPU_ESCR{0,1}.
 *
 * - The FLAME counter group (4 PMCs) can use the 10 ESCRs:
 *   FLAME_ESCR{0,1}, FIRM_ESCR{0,1}, SAAT_ESCR{0,1}, U2L_ESCR{0,1},
 *   DAC_ESCR{0,1}.
 *
 * - The IQ counter group (6 PMCs) can use the 13 ESCRs: IQ_ESCR{0,1},
 *   ALF_ESCR{0,1}, RAT_ESCR{0,1}, SSU_ESCR0, CRU_ESCR{0,1,2,3,4,5}.
 *
 * Even-numbered ESCRs can be used with counters 0, 1 and 4 (if
 * present) of a counter group.  Odd-numbered ESCRs can be used with
 * counters 2, 3 and 5 (if present) of a counter group.  The
 * 'p4_escrs[]' table describes these restrictions in a form that
 * function 'p4_allocate()' uses for making allocation decisions.
 *
 * SYSTEM-MODE AND THREAD-MODE ALLOCATION
 *
 * In addition to remembering the state of PMC rows
 * ('FREE', 'STANDALONE', or 'THREAD'), we similarly need to track the
 * state of ESCR rows.  If an ESCR is allocated to a system-mode PMC
 * on a CPU, it cannot also be allocated to a thread-mode PMC.  On a
 * multi-cpu (multiple physical CPUs) system, ESCR allocation on each
 * CPU is tracked by the pc_escrs[] array.
 *
 * Each system-mode PMC that is using an ESCR records its row-index in
 * the appropriate entry and system-mode allocation attempts check
 * that an ESCR is available using this array.  Process-mode PMCs do
 * not use the pc_escrs[] array, since the ESCR row itself would have
 * been marked as being in 'THREAD' mode.
 *
 * HYPERTHREADING SUPPORT
 *
 * When HTT is enabled, the FreeBSD kernel treats the two 'logical'
 * cpus as independent CPUs and can schedule kernel threads on them
 * independently.  However, the two logical CPUs share the same set of
 * PMC resources.  We need to ensure that:
 * - PMCs that use the PMC_F_DESCENDANTS semantics are handled correctly,
 *   and,
 * - Threads of multi-threaded processes that get scheduled on the same
 *   physical CPU are handled correctly.
 *
 * HTT Detection
 *
 * Not all HTT capable systems will have HTT enabled.  We detect the
 * presence of HTT by detecting if 'p4_init()' was called for a secondary
 * CPU in an HTT pair.
 *
 * Note that hwpmc(4) cannot currently deal with a change in HTT status once
 * loaded.
 *
 * Handling HTT READ / WRITE / START / STOP
 *
 * PMC resources are shared across the CPUs in an HTT pair.  We
 * designate the lower numbered CPU in an HTT pair as the 'primary'
 * CPU.  In each primary CPU's state we keep track of a 'runcount'
 * which reflects the number of PMC-using processes that have been
 * scheduled on its secondary CPU.  Process-mode PMC operations will
 * actually 'start' or 'stop' hardware only if these are the first or
 * last processes respectively to use the hardware.  PMC values
 * written by a 'write' operation are saved and are transferred to
 * hardware at PMC 'start' time if the runcount is 0.  If the runcount
 * is greater than 0 at the time of a 'start' operation, we keep track
 * of the actual hardware value at the time of the 'start' operation
 * and use this to adjust the final readings at PMC 'stop' or 'read'
 * time.
 *
 * Execution sequences:
 *
 * Case 1:   CPUx   +...-		(no overlap)
 *	     CPUy         +...-
 *           RC   0 1   0 1   0
 *
 * Case 2:   CPUx   +........-		(partial overlap)
 * 	     CPUy       +........-
 *           RC   0 1   2    1   0
 *
 * Case 3:   CPUx   +..............-	(fully overlapped)
 *	     CPUy       +.....-
 *	     RC   0 1   2     1    0
 *
 *     Key:
 *     'CPU[xy]' : one of the two logical processors on an HTT CPU.
 *     'RC'      : run count (#threads per physical core).
 *     '+'       : point in time when a thread is put on a CPU.
 *     '-'       : point in time where a thread is taken off a CPU.
 *
 * Handling HTT CONFIG
 *
 * Different processes attached to the same PMC may get scheduled on
 * the two logical processors in the package.  We keep track of config
 * and de-config operations using the CFGFLAGS fields of the per-physical
 * cpu state.
 *
 * Handling TSCs
 *
 * TSCs are architectural state and each CPU in an HTT pair has its own
 * TSC register.
 */
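
/*
 * Illustrative sketch (not part of the driver): the wraparound
 * arithmetic used at 'read'/'stop' time to fold a raw 40-bit counter
 * reading into the saved per-PMC value, per the scheme described
 * above.  The helper name and arguments are hypothetical.
 */
#if 0
static pmc_value_t
p4_delta_since_start(pmc_value_t hw_now, pmc_value_t hw_start)
{
	/* The P4 counters are 40 bits wide; allow for one wraparound. */
	if (hw_now < hw_start)
		return ((P4_PERFCTR_MASK + 1) - hw_start + hw_now);
	return (hw_now - hw_start);
}
#endif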

#define	P4_PMCS()				\
	P4_PMC(BPU_COUNTER0)			\
	P4_PMC(BPU_COUNTER1)			\
	P4_PMC(BPU_COUNTER2)			\
	P4_PMC(BPU_COUNTER3)			\
	P4_PMC(MS_COUNTER0)			\
	P4_PMC(MS_COUNTER1)			\
	P4_PMC(MS_COUNTER2)			\
	P4_PMC(MS_COUNTER3)			\
	P4_PMC(FLAME_COUNTER0)			\
	P4_PMC(FLAME_COUNTER1)			\
	P4_PMC(FLAME_COUNTER2)			\
	P4_PMC(FLAME_COUNTER3)			\
	P4_PMC(IQ_COUNTER0)			\
	P4_PMC(IQ_COUNTER1)			\
	P4_PMC(IQ_COUNTER2)			\
	P4_PMC(IQ_COUNTER3)			\
	P4_PMC(IQ_COUNTER4)			\
	P4_PMC(IQ_COUNTER5)			\
	P4_PMC(NONE)

enum pmc_p4pmc {
#undef	P4_PMC
#define	P4_PMC(N)	P4_PMC_##N ,
	P4_PMCS()
};
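
/*
 * The P4_PMCS()/P4_PMC() pair above is a conventional X-macro: each
 * expansion of the list redefines P4_PMC() to emit a different form.
 * For instance, the enum declaration above expands to:
 *
 *	enum pmc_p4pmc {
 *		P4_PMC_BPU_COUNTER0 ,
 *		P4_PMC_BPU_COUNTER1 ,
 *		...
 *		P4_PMC_NONE ,
 *	};
 */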

/*
 * P4 ESCR descriptors
 */

#define	P4_ESCRS()							\
    P4_ESCR(BSU_ESCR0,	0x3A0, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(BSU_ESCR1,	0x3A1, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(FSB_ESCR0,	0x3A2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(FSB_ESCR1,	0x3A3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(FIRM_ESCR0,	0x3A4, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(FIRM_ESCR1,	0x3A5, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(FLAME_ESCR0, 0x3A6, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(FLAME_ESCR1, 0x3A7, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(DAC_ESCR0,	0x3A8, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(DAC_ESCR1,	0x3A9, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(MOB_ESCR0,	0x3AA, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(MOB_ESCR1,	0x3AB, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(PMH_ESCR0,	0x3AC, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(PMH_ESCR1,	0x3AD, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(SAAT_ESCR0,	0x3AE, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(SAAT_ESCR1,	0x3AF, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(U2L_ESCR0,	0x3B0, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(U2L_ESCR1,	0x3B1, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(BPU_ESCR0,	0x3B2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(BPU_ESCR1,	0x3B3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(IS_ESCR0,	0x3B4, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(IS_ESCR1,	0x3B5, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(ITLB_ESCR0,	0x3B6, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(ITLB_ESCR1,	0x3B7, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(CRU_ESCR0,	0x3B8, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR1,	0x3B9, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(IQ_ESCR0,	0x3BA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(IQ_ESCR1,	0x3BB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(RAT_ESCR0,	0x3BC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(RAT_ESCR1,	0x3BD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(SSU_ESCR0,	0x3BE, IQ_COUNTER0, IQ_COUNTER2, IQ_COUNTER4)	\
    P4_ESCR(MS_ESCR0,	0x3C0, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(MS_ESCR1,	0x3C1, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(TBPU_ESCR0,	0x3C2, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(TBPU_ESCR1,	0x3C3, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(TC_ESCR0,	0x3C4, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(TC_ESCR1,	0x3C5, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(IX_ESCR0,	0x3C8, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(IX_ESCR1,	0x3C9, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(ALF_ESCR0,	0x3CA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(ALF_ESCR1,	0x3CB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(CRU_ESCR2,	0x3CC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR3,	0x3CD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(CRU_ESCR4,	0x3E0, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR5,	0x3E1, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(NONE,		~0,    NONE, NONE, NONE)

enum pmc_p4escr {
#define	P4_ESCR(N, MSR, P1, P2, P3)	P4_ESCR_##N ,
	P4_ESCRS()
#undef	P4_ESCR
};

struct pmc_p4escr_descr {
	const char	pm_escrname[PMC_NAME_MAX];
	u_short		pm_escr_msr;
	const enum pmc_p4pmc pm_pmcs[P4_MAX_PMC_PER_ESCR];
};

static struct pmc_p4escr_descr p4_escrs[] =
{
#define	P4_ESCR(N, MSR, P1, P2, P3)		\
	{					\
		.pm_escrname = #N,		\
		.pm_escr_msr = (MSR),		\
		.pm_pmcs =			\
		{				\
			P4_PMC_##P1,		\
			P4_PMC_##P2,		\
			P4_PMC_##P3		\
		}				\
	} ,

	P4_ESCRS()

#undef	P4_ESCR
};
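
/*
 * As an example of the expansion above, the BSU_ESCR0 entry of
 * P4_ESCRS() becomes the following initializer in p4_escrs[]:
 *
 *	{
 *		.pm_escrname = "BSU_ESCR0",
 *		.pm_escr_msr = 0x3A0,
 *		.pm_pmcs = { P4_PMC_BPU_COUNTER0, P4_PMC_BPU_COUNTER1,
 *		    P4_PMC_NONE }
 *	} ,
 */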

/*
 * P4 Event descriptor
 */

struct p4_event_descr {
	const enum pmc_event pm_event;
	const uint32_t	pm_escr_eventselect;
	const uint32_t	pm_cccr_select;
	const char	pm_is_ti_event;
	enum pmc_p4escr	pm_escrs[P4_MAX_ESCR_PER_EVENT];
};

static struct p4_event_descr p4_events[] = {

#define	P4_EVDESCR(NAME, ESCREVENTSEL, CCCRSEL, TI_EVENT, ESCR0, ESCR1)	\
	{								\
		.pm_event            = PMC_EV_P4_##NAME,		\
		.pm_escr_eventselect = (ESCREVENTSEL),			\
		.pm_cccr_select      = (CCCRSEL),			\
		.pm_is_ti_event	     = (TI_EVENT),			\
		.pm_escrs            =					\
		{							\
			P4_ESCR_##ESCR0,				\
			P4_ESCR_##ESCR1					\
		}							\
	}

P4_EVDESCR(TC_DELIVER_MODE,	0x01, 0x01, TRUE,  TC_ESCR0,	TC_ESCR1),
P4_EVDESCR(BPU_FETCH_REQUEST,	0x03, 0x00, FALSE, BPU_ESCR0,	BPU_ESCR1),
P4_EVDESCR(ITLB_REFERENCE,	0x18, 0x03, FALSE, ITLB_ESCR0,	ITLB_ESCR1),
P4_EVDESCR(MEMORY_CANCEL,	0x02, 0x05, FALSE, DAC_ESCR0,	DAC_ESCR1),
P4_EVDESCR(MEMORY_COMPLETE,	0x08, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
P4_EVDESCR(LOAD_PORT_REPLAY,	0x04, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
P4_EVDESCR(STORE_PORT_REPLAY,	0x05, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
P4_EVDESCR(MOB_LOAD_REPLAY,	0x03, 0x02, FALSE, MOB_ESCR0,	MOB_ESCR1),
P4_EVDESCR(PAGE_WALK_TYPE,	0x01, 0x04, TRUE,  PMH_ESCR0,	PMH_ESCR1),
P4_EVDESCR(BSQ_CACHE_REFERENCE,	0x0C, 0x07, FALSE, BSU_ESCR0,	BSU_ESCR1),
P4_EVDESCR(IOQ_ALLOCATION,	0x03, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(IOQ_ACTIVE_ENTRIES,	0x1A, 0x06, FALSE, FSB_ESCR1,	NONE),
P4_EVDESCR(FSB_DATA_ACTIVITY,	0x17, 0x06, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(BSQ_ALLOCATION,	0x05, 0x07, FALSE, BSU_ESCR0,	NONE),
P4_EVDESCR(BSQ_ACTIVE_ENTRIES,	0x06, 0x07, FALSE, BSU_ESCR1,	NONE),
	/* BSQ_ACTIVE_ENTRIES inherits CPU specificity from BSQ_ALLOCATION */
P4_EVDESCR(SSE_INPUT_ASSIST,	0x34, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(PACKED_SP_UOP,	0x08, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(PACKED_DP_UOP,	0x0C, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(SCALAR_SP_UOP,	0x0A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(SCALAR_DP_UOP,	0x0E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(64BIT_MMX_UOP,	0x02, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(128BIT_MMX_UOP,	0x1A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(X87_FP_UOP,		0x04, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(X87_SIMD_MOVES_UOP,	0x2E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
P4_EVDESCR(GLOBAL_POWER_EVENTS,	0x13, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(TC_MS_XFER,		0x05, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
P4_EVDESCR(UOP_QUEUE_WRITES,	0x09, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
P4_EVDESCR(RETIRED_MISPRED_BRANCH_TYPE,
    				0x05, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
P4_EVDESCR(RETIRED_BRANCH_TYPE,	0x04, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
P4_EVDESCR(RESOURCE_STALL,	0x01, 0x01, FALSE, ALF_ESCR0,	ALF_ESCR1),
P4_EVDESCR(WC_BUFFER,		0x05, 0x05, TRUE,  DAC_ESCR0,	DAC_ESCR1),
P4_EVDESCR(B2B_CYCLES,		0x16, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(BNR,			0x08, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(SNOOP,		0x06, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(RESPONSE,		0x04, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
P4_EVDESCR(FRONT_END_EVENT,	0x08, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(EXECUTION_EVENT,	0x0C, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(REPLAY_EVENT, 	0x09, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(INSTR_RETIRED,	0x02, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
P4_EVDESCR(UOPS_RETIRED,	0x01, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
P4_EVDESCR(UOP_TYPE,		0x02, 0x02, FALSE, RAT_ESCR0,	RAT_ESCR1),
P4_EVDESCR(BRANCH_RETIRED,	0x06, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(MISPRED_BRANCH_RETIRED, 0x03, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1),
P4_EVDESCR(X87_ASSIST,		0x03, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
P4_EVDESCR(MACHINE_CLEAR,	0x02, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3)

#undef	P4_EVDESCR
};

#define	P4_EVENT_IS_TI(E) ((E)->pm_is_ti_event == TRUE)

#define	P4_NEVENTS	(PMC_EV_P4_LAST - PMC_EV_P4_FIRST + 1)

/*
 * P4 PMC descriptors
 */

struct p4pmc_descr {
	struct pmc_descr pm_descr; 	/* common information */
	enum pmc_p4pmc	pm_pmcnum;	/* PMC number */
	uint32_t	pm_pmc_msr; 	/* PERFCTR MSR address */
	uint32_t	pm_cccr_msr;  	/* CCCR MSR address */
};

static struct p4pmc_descr p4_pmcdesc[P4_NPMCS] = {

	/*
	 * TSC descriptor
	 */

	{
		.pm_descr =
		{
			.pd_name  = "TSC",
			.pd_class = PMC_CLASS_TSC,
			.pd_caps  = PMC_CAP_READ | PMC_CAP_WRITE,
			.pd_width = 64
		},
		.pm_pmcnum   = ~0,
		.pm_cccr_msr = ~0,
		.pm_pmc_msr  = 0x10,
	},

	/*
	 * P4 PMCS
	 */

#define	P4_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM |  \
	PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \
	PMC_CAP_INVERT | PMC_CAP_QUALIFIER | PMC_CAP_PRECISE |            \
	PMC_CAP_TAGGING | PMC_CAP_CASCADE)

#define	P4_PMCDESCR(N, PMC, CCCR)			\
	{						\
		.pm_descr =				\
		{					\
			.pd_name = #N,			\
			.pd_class = PMC_CLASS_P4,	\
			.pd_caps = P4_PMC_CAPS,		\
			.pd_width = 40			\
		},					\
		.pm_pmcnum      = P4_PMC_##N,		\
		.pm_cccr_msr 	= (CCCR),		\
		.pm_pmc_msr	= (PMC)			\
	}

	P4_PMCDESCR(BPU_COUNTER0,	0x300,	0x360),
	P4_PMCDESCR(BPU_COUNTER1,	0x301,	0x361),
	P4_PMCDESCR(BPU_COUNTER2,	0x302,	0x362),
	P4_PMCDESCR(BPU_COUNTER3,	0x303,	0x363),
	P4_PMCDESCR(MS_COUNTER0,	0x304,	0x364),
	P4_PMCDESCR(MS_COUNTER1,	0x305,	0x365),
	P4_PMCDESCR(MS_COUNTER2,	0x306,	0x366),
	P4_PMCDESCR(MS_COUNTER3,	0x307,	0x367),
	P4_PMCDESCR(FLAME_COUNTER0,	0x308,	0x368),
	P4_PMCDESCR(FLAME_COUNTER1,	0x309,	0x369),
	P4_PMCDESCR(FLAME_COUNTER2,	0x30A,	0x36A),
	P4_PMCDESCR(FLAME_COUNTER3,	0x30B,	0x36B),
	P4_PMCDESCR(IQ_COUNTER0,	0x30C,	0x36C),
	P4_PMCDESCR(IQ_COUNTER1,	0x30D,	0x36D),
	P4_PMCDESCR(IQ_COUNTER2,	0x30E,	0x36E),
	P4_PMCDESCR(IQ_COUNTER3,	0x30F,	0x36F),
	P4_PMCDESCR(IQ_COUNTER4,	0x310,	0x370),
	P4_PMCDESCR(IQ_COUNTER5,	0x311,	0x371),

#undef	P4_PMCDESCR
};
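
/*
 * The PERFCTR and CCCR MSRs above are numbered in lockstep: counter
 * 'i' lives at MSR 0x300 + i and its CCCR at 0x360 + i.  A minimal
 * sketch of the mapping, assuming P4_PERFCTR_MSR_FIRST == 0x300 and
 * P4_CCCR_MSR_FIRST == 0x360 (the loops elsewhere in this file rely
 * on the same lockstep numbering):
 */
#if 0
static __inline uint32_t
p4_perfctr_to_cccr_msr(uint32_t pmc_msr)
{
	/* e.g. 0x30C (IQ_COUNTER0) maps to 0x36C (its CCCR) */
	return (pmc_msr - P4_PERFCTR_MSR_FIRST + P4_CCCR_MSR_FIRST);
}
#endif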

/* HTT support */
#define	P4_NHTT					2 /* logical processors/chip */

static int p4_system_has_htt;

/*
 * Per-CPU data structure for P4 class CPUs
 *
 * [common stuff]
 * [19 struct pmc_hw pointers]
 * [19 struct pmc_hw structures]
 * [45 ESCRs status bytes]
 * [per-cpu spin mutex]
 * [19 flag fields for holding config flags and a runcount]
 * [19*2 hw value fields]	(Thread mode PMC support)
 *    or
 * [19*2 EIP values]		(Sampling mode PMCs)
 * [19*2 pmc value fields]	(Thread mode PMC support)
 */

struct p4_cpu {
	struct pmc_cpu	pc_common;
	struct pmc_hw	*pc_hwpmcs[P4_NPMCS];
	struct pmc_hw	pc_p4pmcs[P4_NPMCS];
	char		pc_escrs[P4_NESCR];
	struct mtx	pc_mtx;		/* spin lock */
	uint32_t	pc_intrflag;	/* NMI handler flags */
	unsigned int	pc_intrlock;	/* NMI handler spin lock */
	unsigned char	pc_flags[P4_NPMCS]; /* 4 bits each: {cfg,run}count */
	union {
		pmc_value_t pc_hw[P4_NPMCS * P4_NHTT];
		uintptr_t   pc_ip[P4_NPMCS * P4_NHTT];
	}		pc_si;
	pmc_value_t	pc_pmc_values[P4_NPMCS * P4_NHTT];
};

/*
 * A 'logical' CPU shares PMC resources with its partner 'physical'
 * CPU, except for the TSC, which is architectural and hence separate.
 * The 'logical' CPU descriptor thus has pointers to the physical
 * CPU's descriptor state except for the TSC (rowindex 0) which is
 * not shared.
 */

struct p4_logicalcpu {
	struct pmc_cpu	pc_common;
	struct pmc_hw	*pc_hwpmcs[P4_NPMCS];
	struct pmc_hw	pc_tsc;
};

#define	P4_PCPU_PMC_VALUE(PC,RI,CPU) 	(PC)->pc_pmc_values[(RI)*((CPU) & 1)]
#define	P4_PCPU_HW_VALUE(PC,RI,CPU)	(PC)->pc_si.pc_hw[(RI)*((CPU) & 1)]
#define	P4_PCPU_SAVED_IP(PC,RI,CPU)	(PC)->pc_si.pc_ip[(RI)*((CPU) & 1)]

#define	P4_PCPU_GET_FLAGS(PC,RI,MASK)	((PC)->pc_flags[(RI)] & (MASK))
#define	P4_PCPU_SET_FLAGS(PC,RI,MASK,VAL)	do {	\
	char _tmp;					\
	_tmp = (PC)->pc_flags[(RI)];			\
	_tmp &= ~(MASK);				\
	_tmp |= (VAL) & (MASK);				\
	(PC)->pc_flags[(RI)] = _tmp;			\
} while (0)

#define	P4_PCPU_GET_RUNCOUNT(PC,RI)	P4_PCPU_GET_FLAGS(PC,RI,0x0F)
#define	P4_PCPU_SET_RUNCOUNT(PC,RI,V)	P4_PCPU_SET_FLAGS(PC,RI,0x0F,V)

#define	P4_PCPU_GET_CFGFLAGS(PC,RI)	(P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4)
#define	P4_PCPU_SET_CFGFLAGS(PC,RI,C)	P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) << 4))
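
/*
 * Worked example of the packing above: each pc_flags[] byte keeps
 * the per-logical-CPU config flags in its high nibble and the
 * runcount in its low nibble.  A value of 0x31 thus decodes as
 * CFGFLAGS == 0x3 (configured on both logical CPUs) and
 * RUNCOUNT == 1 (currently started by one of them).
 */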

#define	P4_CPU_TO_FLAG(C)		(P4_CPU_IS_HTT_SECONDARY(C) ? 0x2 : 0x1)

#define	P4_PCPU_GET_INTRFLAG(PC,I)	((PC)->pc_intrflag & (1 << (I)))
#define	P4_PCPU_SET_INTRFLAG(PC,I,V)	do {		\
		uint32_t __mask;			\
		__mask = 1 << (I);			\
		if ((V))				\
			(PC)->pc_intrflag |= __mask;	\
		else					\
			(PC)->pc_intrflag &= ~__mask;	\
	} while (0)

/*
 * A minimal spin lock implementation for use inside the NMI handler.
 *
 * We don't want to use a regular spin lock here, because curthread
 * may not be consistent at the time the handler is invoked.
 */
#define	P4_PCPU_ACQ_INTR_SPINLOCK(PC) do {				\
		while (!atomic_cmpset_acq_int(&(PC)->pc_intrlock, 0, 1)) \
			ia32_pause();					\
	} while (0)
#define	P4_PCPU_REL_INTR_SPINLOCK(PC) 					\
	atomic_store_rel_int(&(PC)->pc_intrlock, 0)
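
/*
 * A minimal usage sketch for the NMI spin lock above: the interrupt
 * handler brackets its scan of the shared CCCRs with
 *
 *	P4_PCPU_ACQ_INTR_SPINLOCK(pc);
 *	... examine and reprogram the shared counters ...
 *	P4_PCPU_REL_INTR_SPINLOCK(pc);
 *
 * atomic_cmpset_acq_int() fails while the partner logical CPU holds
 * the lock (pc_intrlock != 0), so the loser spins in ia32_pause().
 */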

/* ESCR row disposition */
static int p4_escrdisp[P4_NESCR];

#define	P4_ESCR_ROW_DISP_IS_THREAD(E)		(p4_escrdisp[(E)] > 0)
#define	P4_ESCR_ROW_DISP_IS_STANDALONE(E)	(p4_escrdisp[(E)] < 0)
#define	P4_ESCR_ROW_DISP_IS_FREE(E)		(p4_escrdisp[(E)] == 0)

#define	P4_ESCR_MARK_ROW_STANDALONE(E) do {				\
	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
		    __LINE__));						\
	atomic_add_int(&p4_escrdisp[(E)], -1);				\
	KASSERT(p4_escrdisp[(E)] >= (-mp_ncpus), ("[p4,%d] row "	\
		"disposition error", __LINE__));			\
} while (0)

#define	P4_ESCR_UNMARK_ROW_STANDALONE(E) do {				\
	atomic_add_int(&p4_escrdisp[(E)], 1);				\
	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
		    __LINE__));						\
} while (0)

#define	P4_ESCR_MARK_ROW_THREAD(E) do {					 \
	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \
		    __LINE__));						 \
	atomic_add_int(&p4_escrdisp[(E)], 1);				 \
} while (0)

#define	P4_ESCR_UNMARK_ROW_THREAD(E) do {				 \
	atomic_add_int(&p4_escrdisp[(E)], -1);				 \
	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \
		    __LINE__));						 \
} while (0)
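
/*
 * Example of the reference counting above: if system-mode PMCs on
 * two different CPUs allocate the same ESCR row, p4_escrdisp[] for
 * that row goes 0 -> -1 -> -2 (STANDALONE); releasing both returns
 * it to 0 (FREE).  Thread-mode allocations count upwards instead,
 * so the sign of an entry encodes the row's disposition.
 */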

#define	P4_PMC_IS_STOPPED(cccr)	((rdmsr(cccr) & P4_CCCR_ENABLE) == 0)

#define	P4_CPU_IS_HTT_SECONDARY(cpu)					\
	(p4_system_has_htt ? ((cpu) & 1) : 0)
#define	P4_TO_HTT_PRIMARY(cpu) 						\
	(p4_system_has_htt ? ((cpu) & ~1) : (cpu))

#define	P4_CCCR_Tx_MASK	(~(P4_CCCR_OVF_PMI_T0|P4_CCCR_OVF_PMI_T1|	\
			     P4_CCCR_ENABLE|P4_CCCR_OVF))
#define	P4_ESCR_Tx_MASK	(~(P4_ESCR_T0_OS|P4_ESCR_T0_USR|P4_ESCR_T1_OS|	\
			     P4_ESCR_T1_USR))

/*
 * support routines
 */

static struct p4_event_descr *
p4_find_event(enum pmc_event ev)
{
	int n;

	for (n = 0; n < P4_NEVENTS; n++)
		if (p4_events[n].pm_event == ev)
			break;
	if (n == P4_NEVENTS)
		return NULL;
	return &p4_events[n];
}

/*
 * Initialize per-cpu state
 */

static int
p4_init(int cpu)
{
	int n, phycpu;
	char *pescr;
	struct p4_cpu *pcs;
	struct p4_logicalcpu *plcs;
	struct pmc_hw *phw;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] insane cpu number %d", __LINE__, cpu));

	PMCDBG(MDP,INI,0, "p4-init cpu=%d logical=%d", cpu,
	    pmc_cpu_is_logical(cpu) != 0);

	/*
	 * The two CPUs in an HT pair share their per-cpu state.
	 *
	 * For HT capable CPUs, we assume that the two logical
	 * processors in the HT pair get two consecutive CPU ids
	 * starting with an even id #.
	 *
	 * The primary CPU (the even numbered CPU of the pair) would
	 * have been initialized prior to the initialization for the
	 * secondary.
	 */
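	/*
	 * For example, with this numbering cpus {0,1} and {2,3} form
	 * the HTT pairs, and P4_TO_HTT_PRIMARY(3) == 2.
	 */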

	if (pmc_cpu_is_logical(cpu) && (cpu & 1)) {

		p4_system_has_htt = 1;

		phycpu = P4_TO_HTT_PRIMARY(cpu);
		pcs = (struct p4_cpu *) pmc_pcpu[phycpu];
		PMCDBG(MDP,INI,1, "p4-init cpu=%d phycpu=%d pcs=%p",
		    cpu, phycpu, pcs);
		KASSERT(pcs,
		    ("[p4,%d] Null Per-Cpu state cpu=%d phycpu=%d", __LINE__,
			cpu, phycpu));
		if (pcs == NULL) /* decline to init */
			return ENXIO;

		MALLOC(plcs, struct p4_logicalcpu *,
		    sizeof(struct p4_logicalcpu), M_PMC, M_WAITOK|M_ZERO);

		/* The TSC is architectural state and is not shared */
		plcs->pc_hwpmcs[0] = &plcs->pc_tsc;
		plcs->pc_tsc.phw_state = PMC_PHW_FLAG_IS_ENABLED |
		    PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(0) |
		    PMC_PHW_FLAG_IS_SHAREABLE;

		/* Other PMCs are shared with the physical CPU */
		for (n = 1; n < P4_NPMCS; n++)
			plcs->pc_hwpmcs[n] = pcs->pc_hwpmcs[n];

		pmc_pcpu[cpu] = (struct pmc_cpu *) plcs;
		return 0;
	}

	MALLOC(pcs, struct p4_cpu *, sizeof(struct p4_cpu), M_PMC,
	    M_WAITOK|M_ZERO);

	if (pcs == NULL)
		return ENOMEM;
	phw = pcs->pc_p4pmcs;

	for (n = 0; n < P4_NPMCS; n++, phw++) {
		phw->phw_state   = PMC_PHW_FLAG_IS_ENABLED |
		    PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n);
		phw->phw_pmc     = NULL;
		pcs->pc_hwpmcs[n] = phw;
	}

	/* Mark the TSC as shareable */
	pcs->pc_hwpmcs[0]->phw_state |= PMC_PHW_FLAG_IS_SHAREABLE;

	pescr = pcs->pc_escrs;
	for (n = 0; n < P4_NESCR; n++)
		*pescr++ = P4_INVALID_PMC_INDEX;
	pmc_pcpu[cpu] = (struct pmc_cpu *) pcs;

	mtx_init(&pcs->pc_mtx, "p4-pcpu", "pmc", MTX_SPIN);

	return 0;
}

/*
 * Destroy per-cpu state.
 */

static int
p4_cleanup(int cpu)
{
	int i;
	struct p4_cpu *pcs;

	PMCDBG(MDP,INI,0, "p4-cleanup cpu=%d", cpu);

	if ((pcs = (struct p4_cpu *) pmc_pcpu[cpu]) == NULL)
		return 0;

	/* Turn off all PMCs on this CPU */
	for (i = 0; i < P4_NPMCS - 1; i++)
		wrmsr(P4_CCCR_MSR_FIRST + i,
		    rdmsr(P4_CCCR_MSR_FIRST + i) & ~P4_CCCR_ENABLE);

	/*
	 * If the CPU is physical, we need to tear down the full MD
	 * state.
	 */
	if (!P4_CPU_IS_HTT_SECONDARY(cpu))
		mtx_destroy(&pcs->pc_mtx);

	FREE(pcs, M_PMC);

	pmc_pcpu[cpu] = NULL;

	return 0;
}

/*
 * Context switch in.
 */

static int
p4_switch_in(struct pmc_cpu *pc, struct pmc_process *pp)
{
	(void) pc;

	PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp,
	    (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0);

	/* enable the RDPMC instruction */
	if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS)
		load_cr4(rcr4() | CR4_PCE);

	PMCDBG(MDP,SWI,2, "cr4=0x%x", (uint32_t) rcr4());

	return 0;
}

/*
 * Context switch out.
 */

static int
p4_switch_out(struct pmc_cpu *pc, struct pmc_process *pp)
{
	(void) pc;
	(void) pp;		/* can be null */

	PMCDBG(MDP,SWO,1, "pc=%p pp=%p", pc, pp);

	/* always disallow the RDPMC instruction */
	load_cr4(rcr4() & ~CR4_PCE);

	PMCDBG(MDP,SWO,2, "cr4=0x%x", (uint32_t) rcr4());

	return 0;
}

/*
 * Read a PMC
 */
748
749static int
750p4_read_pmc(int cpu, int ri, pmc_value_t *v)
751{
752	enum pmc_mode mode;
753	struct p4pmc_descr *pd;
754	struct pmc *pm;
755	struct p4_cpu *pc;
756	struct pmc_hw *phw;
757	pmc_value_t tmp;
758
759	KASSERT(cpu >= 0 && cpu < mp_ncpus,
760	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
761	KASSERT(ri >= 0 && ri < P4_NPMCS,
762	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
763
764
765	if (ri == 0) {	/* TSC */
766#ifdef	DEBUG
767		pc  = (struct p4_cpu *) pmc_pcpu[cpu];
768		phw = pc->pc_hwpmcs[ri];
769		pm  = phw->phw_pmc;
770
771		KASSERT(pm, ("[p4,%d] cpu=%d ri=%d not configured", __LINE__,
772			    cpu, ri));
773		KASSERT(PMC_TO_CLASS(pm) == PMC_CLASS_TSC,
774		    ("[p4,%d] cpu=%d ri=%d not a TSC (%d)", __LINE__, cpu, ri,
775			PMC_TO_CLASS(pm)));
776		KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)),
777		    ("[p4,%d] TSC counter in non-counting mode", __LINE__));
778#endif
779		*v = rdtsc();
780		PMCDBG(MDP,REA,2, "p4-read -> %jx", *v);
781		return 0;
782	}
783
784	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
785	phw = pc->pc_hwpmcs[ri];
786	pd  = &p4_pmcdesc[ri];
787	pm  = phw->phw_pmc;
788
789	KASSERT(pm != NULL,
790	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
791		cpu, ri));
792
793	KASSERT(pd->pm_descr.pd_class == PMC_TO_CLASS(pm),
794	    ("[p4,%d] class mismatch pd %d != id class %d", __LINE__,
795		pd->pm_descr.pd_class, PMC_TO_CLASS(pm)));
796
797	mode = PMC_TO_MODE(pm);
798
799	PMCDBG(MDP,REA,1, "p4-read cpu=%d ri=%d mode=%d", cpu, ri, mode);
800
801	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
802	    ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class));
803
804	tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr);
805
806	if (PMC_IS_VIRTUAL_MODE(mode)) {
807		if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit overflow */
808			tmp += (P4_PERFCTR_MASK + 1) -
809			    P4_PCPU_HW_VALUE(pc,ri,cpu);
810		else
811			tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
812		tmp += P4_PCPU_PMC_VALUE(pc,ri,cpu);
813	}
814
815	if (PMC_IS_SAMPLING_MODE(mode)) /* undo transformation */
816		*v = P4_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp);
817	else
818		*v = tmp;
819
820	PMCDBG(MDP,REA,2, "p4-read -> %jx", *v);
821	return 0;
822}

/*
 * Write a PMC
 */

static int
p4_write_pmc(int cpu, int ri, pmc_value_t v)
{
	enum pmc_mode mode;
	struct pmc *pm;
	struct p4_cpu *pc;
	const struct pmc_hw *phw;
	const struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index %d", __LINE__, ri));

	/*
	 * The P4's TSC register is writeable, but we don't allow a
	 * write as changing the TSC's value could interfere with
	 * timekeeping and other system functions.
	 */
	if (ri == 0) {
#ifdef	DEBUG
		pc  = (struct p4_cpu *) pmc_pcpu[cpu];
		phw = pc->pc_hwpmcs[ri];
		pm  = phw->phw_pmc;
		KASSERT(pm, ("[p4,%d] cpu=%d ri=%d not configured", __LINE__,
			    cpu, ri));
		KASSERT(PMC_TO_CLASS(pm) == PMC_CLASS_TSC,
		    ("[p4,%d] cpu=%d ri=%d not a TSC (%d)", __LINE__,
			cpu, ri, PMC_TO_CLASS(pm)));
#endif
		return 0;
	}

	/* Shared PMCs */
	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];
	pm  = phw->phw_pmc;
	pd  = &p4_pmcdesc[ri];

	KASSERT(pm != NULL,
	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
		cpu, ri));

	mode = PMC_TO_MODE(pm);

	PMCDBG(MDP,WRI,1, "p4-write cpu=%d ri=%d mode=%d v=%jx", cpu, ri,
	    mode, v);

	/*
	 * write the PMC value to the register/saved value: for
	 * sampling mode PMCs, the value to be programmed into the PMC
	 * counter is -(C+1) where 'C' is the requested sample rate.
	 */
	if (PMC_IS_SAMPLING_MODE(mode))
		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(v);

	if (PMC_IS_SYSTEM_MODE(mode))
		wrmsr(pd->pm_pmc_msr, v);
	else
		P4_PCPU_PMC_VALUE(pc,ri,cpu) = v;

	return 0;
}

/*
 * Configure a PMC 'pm' on the given CPU and row-index.
 *
 * 'pm' may be NULL to indicate de-configuration.
 *
 * On HTT systems, a PMC may get configured twice, once for each
 * "logical" CPU.  We track this using the CFGFLAGS field of the
 * per-cpu state; this field is a bit mask with one bit each for
 * logical CPUs 0 & 1.
 */

static int
p4_config_pmc(int cpu, int ri, struct pmc *pm)
{
	struct pmc_hw *phw;
	struct p4_cpu *pc;
	int cfgflags, cpuflag;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index %d", __LINE__, ri));

	PMCDBG(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm);

	if (ri == 0) {		/* TSC */
		pc = (struct p4_cpu *) pmc_pcpu[cpu];
		phw = pc->pc_hwpmcs[ri];

		KASSERT(pm == NULL || phw->phw_pmc == NULL,
		    ("[p4,%d] hwpmc doubly config'ed", __LINE__));
		phw->phw_pmc = pm;
		return 0;
	}

	/* Shared PMCs */

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];

	KASSERT(pm == NULL || phw->phw_pmc == NULL ||
	    (p4_system_has_htt && phw->phw_pmc == pm),
	    ("[p4,%d] hwpmc not unconfigured before re-config", __LINE__));

	mtx_lock_spin(&pc->pc_mtx);
	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);

	KASSERT(cfgflags >= 0 && cfgflags <= 3,
	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
		cfgflags, cpu, ri));

	KASSERT(cfgflags == 0 || phw->phw_pmc,
	    ("[p4,%d] cpu=%d ri=%d pmc configured with zero cfg count",
		__LINE__, cpu, ri));

	cpuflag = P4_CPU_TO_FLAG(cpu);

	if (pm) {		/* config */
		if (cfgflags == 0)
			phw->phw_pmc = pm;

		KASSERT(phw->phw_pmc == pm,
		    ("[p4,%d] cpu=%d ri=%d config %p != hw %p",
			__LINE__, cpu, ri, pm, phw->phw_pmc));

		cfgflags |= cpuflag;
	} else {		/* unconfig */
		cfgflags &= ~cpuflag;

		if (cfgflags == 0)
			phw->phw_pmc = NULL;
	}

	KASSERT(cfgflags >= 0 && cfgflags <= 3,
	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
		cfgflags, cpu, ri));

	P4_PCPU_SET_CFGFLAGS(pc,ri,cfgflags);

	mtx_unlock_spin(&pc->pc_mtx);

	return 0;
}

/*
 * Retrieve a configured PMC pointer from hardware state.
 */

static int
p4_get_config(int cpu, int ri, struct pmc **ppm)
{
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	int cfgflags;

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];

	mtx_lock_spin(&pc->pc_mtx);
	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
	mtx_unlock_spin(&pc->pc_mtx);

	if (cfgflags & P4_CPU_TO_FLAG(cpu))
		*ppm = phw->phw_pmc; /* PMC config'ed on this CPU */
	else
		*ppm = NULL;

	return 0;
}

/*
 * Allocate a PMC.
 *
 * The allocation strategy differs between HTT and non-HTT systems.
 *
 * The non-HTT case:
 *   - Given the desired event and the PMC row-index, look up the
 *   list of valid ESCRs for the event.
 *   - For each valid ESCR:
 *     - Check if the ESCR is free and the ESCR row is in a compatible
 *       mode (i.e., system or process)
 *     - Check if the ESCR is usable with a P4 PMC at the desired row-index.
 *   If everything matches, we determine the appropriate bit values for the
 *   ESCR and CCCR registers.
 *
 * The HTT case:
 *
 * - Process mode PMCs require special care.  The FreeBSD scheduler could
 *   schedule any two processes on the same physical CPU.  We need to ensure
 *   that a given PMC row-index is never allocated to two different
 *   PMCs owned by different user-processes.
 *   This is ensured by always allocating a PMC from a 'FREE' PMC row
 *   if the system has HTT active.
 * - A similar check needs to be done for ESCRs; we do not want two PMCs
 *   using the same ESCR to be scheduled at the same time.  Thus ESCR
 *   allocation is also restricted to FREE rows if the system has HTT
 *   enabled.
 * - Thirdly, some events are 'thread-independent', i.e., the PMC
 *   hardware cannot distinguish between events caused by different
 *   logical CPUs.  This makes it impossible to assign such events
 *   to a given thread of execution.  If the system has HTT enabled,
 *   these events are not allowed for process-mode PMCs.
 */
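
/*
 * For example, on an HTT system a process-mode allocation of the
 * thread-independent PMC_EV_P4_FSB_DATA_ACTIVITY event would be
 * declined outright, while PMC_EV_P4_INSTR_RETIRED succeeds only if
 * the target PMC row and one of CRU_ESCR{0,1} are still FREE.
 */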

static int
p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
    const struct pmc_op_pmcallocate *a)
{
	int found, n, m;
	uint32_t caps, cccrvalue, escrvalue, tflags;
	enum pmc_p4escr escr;
	struct p4_cpu *pc;
	struct p4_event_descr *pevent;
	const struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index value %d", __LINE__, ri));

	pd = &p4_pmcdesc[ri];

	PMCDBG(MDP,ALL,1, "p4-allocate ri=%d class=%d pmccaps=0x%x "
	    "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps,
	    pm->pm_caps);

	/* check class */
	if (pd->pm_descr.pd_class != a->pm_class)
		return EINVAL;

	/* check requested capabilities */
	caps = a->pm_caps;
	if ((pd->pm_descr.pd_caps & caps) != caps)
		return EPERM;

	if (pd->pm_descr.pd_class == PMC_CLASS_TSC) {
		/* TSCs are always allocated in system-wide counting mode */
		if (a->pm_ev != PMC_EV_TSC_TSC ||
		    a->pm_mode != PMC_MODE_SC)
			return EINVAL;
		return 0;
	}

	/*
	 * If the system has HTT enabled, and the desired allocation
	 * mode is process-private, and the PMC row disposition is not
	 * FREE (0), decline the allocation.
	 */

	if (p4_system_has_htt &&
	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
	    pmc_getrowdisp(ri) != 0)
		return EBUSY;

	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
	    ("[p4,%d] unknown PMC class %d", __LINE__,
		pd->pm_descr.pd_class));

	if (pm->pm_event < PMC_EV_P4_FIRST ||
	    pm->pm_event > PMC_EV_P4_LAST)
		return EINVAL;

	if ((pevent = p4_find_event(pm->pm_event)) == NULL)
		return ESRCH;

	PMCDBG(MDP,ALL,2, "pevent={ev=%d,escrsel=0x%x,cccrsel=0x%x,isti=%d}",
	    pevent->pm_event, pevent->pm_escr_eventselect,
	    pevent->pm_cccr_select, pevent->pm_is_ti_event);

	/*
	 * Some PMC events are 'thread independent' and therefore
	 * cannot be used for process-private modes if HTT is being
	 * used.
	 */

	if (P4_EVENT_IS_TI(pevent) &&
	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
	    p4_system_has_htt)
		return EINVAL;

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];

	found   = 0;

	/* look for a suitable ESCR for this event */
	for (n = 0; n < P4_MAX_ESCR_PER_EVENT && !found; n++) {
		if ((escr = pevent->pm_escrs[n]) == P4_ESCR_NONE)
			break;	/* out of ESCRs */
		/*
		 * Check ESCR row disposition.
		 *
		 * If the request is for a system-mode PMC, then the
		 * ESCR row should not be in process-virtual mode, and
		 * should also be free on the current CPU.
		 */

		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		    if (P4_ESCR_ROW_DISP_IS_THREAD(escr) ||
			pc->pc_escrs[escr] != P4_INVALID_PMC_INDEX)
			    continue;
		}

		/*
		 * If the request is for a process-virtual PMC, and if
		 * HTT is not enabled, we can use an ESCR row that is
		 * either FREE or already in process mode.
		 *
		 * If HTT is enabled, then we need to ensure that a
		 * given ESCR is never allocated to two PMCs that
		 * could run simultaneously on the two logical CPUs of
		 * a CPU package.  We ensure this by only allocating
		 * ESCRs from rows marked as 'FREE'.
		 */

		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
			if (p4_system_has_htt) {
				if (!P4_ESCR_ROW_DISP_IS_FREE(escr))
					continue;
			} else
				if (P4_ESCR_ROW_DISP_IS_STANDALONE(escr))
					continue;
		}

		/*
		 * We found a suitable ESCR for this event.  Now check if
		 * this ESCR can work with the PMC at row-index 'ri'.
		 */

		for (m = 0; m < P4_MAX_PMC_PER_ESCR; m++)
			if (p4_escrs[escr].pm_pmcs[m] == pd->pm_pmcnum) {
				found = 1;
				break;
			}
	}

	if (found == 0)
		return ESRCH;

	KASSERT((int) escr >= 0 && escr < P4_NESCR,
	    ("[p4,%d] illegal ESCR value %d", __LINE__, escr));

	/* mark ESCR row mode */
	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		pc->pc_escrs[escr] = ri; /* mark ESCR as in use on this cpu */
		P4_ESCR_MARK_ROW_STANDALONE(escr);
	} else {
		KASSERT(pc->pc_escrs[escr] == P4_INVALID_PMC_INDEX,
		    ("[p4,%d] escr[%d] already in use", __LINE__, escr));
		P4_ESCR_MARK_ROW_THREAD(escr);
	}

	pm->pm_md.pm_p4.pm_p4_escrmsr   = p4_escrs[escr].pm_escr_msr;
	pm->pm_md.pm_p4.pm_p4_escr      = escr;

	cccrvalue = P4_CCCR_TO_ESCR_SELECT(pevent->pm_cccr_select);
	escrvalue = P4_ESCR_TO_EVENT_SELECT(pevent->pm_escr_eventselect);

	/* CCCR fields */
	if (caps & PMC_CAP_THRESHOLD)
		cccrvalue |= (a->pm_md.pm_p4.pm_p4_cccrconfig &
		    P4_CCCR_THRESHOLD_MASK) | P4_CCCR_COMPARE;

	if (caps & PMC_CAP_EDGE)
		cccrvalue |= P4_CCCR_EDGE;

	if (caps & PMC_CAP_INVERT)
		cccrvalue |= P4_CCCR_COMPLEMENT;

	if (p4_system_has_htt)
		cccrvalue |= a->pm_md.pm_p4.pm_p4_cccrconfig &
		    P4_CCCR_ACTIVE_THREAD_MASK;
	else			/* no HTT; thread field should be '11b' */
		cccrvalue |= P4_CCCR_TO_ACTIVE_THREAD(0x3);

	if (caps & PMC_CAP_CASCADE)
		cccrvalue |= P4_CCCR_CASCADE;

	/* On HTT systems the PMI T0 field may get moved to T1 at pmc start */
	if (caps & PMC_CAP_INTERRUPT)
		cccrvalue |= P4_CCCR_OVF_PMI_T0;

	/* ESCR fields */
	if (caps & PMC_CAP_QUALIFIER)
		escrvalue |= a->pm_md.pm_p4.pm_p4_escrconfig &
		    P4_ESCR_EVENT_MASK_MASK;
	if (caps & PMC_CAP_TAGGING)
		escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig &
		    P4_ESCR_TAG_VALUE_MASK) | P4_ESCR_TAG_ENABLE;
	if (caps & PMC_CAP_QUALIFIER)
		escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig &
		    P4_ESCR_EVENT_MASK_MASK);

	/* HTT: T0_{OS,USR} bits may get moved to T1 at pmc start */
	tflags = 0;
	if (caps & PMC_CAP_SYSTEM)
		tflags |= P4_ESCR_T0_OS;
	if (caps & PMC_CAP_USER)
		tflags |= P4_ESCR_T0_USR;
	if (tflags == 0)
		tflags = (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
	escrvalue |= tflags;

	pm->pm_md.pm_p4.pm_p4_cccrvalue = cccrvalue;
	pm->pm_md.pm_p4.pm_p4_escrvalue = escrvalue;

	PMCDBG(MDP,ALL,2, "p4-allocate cccrsel=0x%x cccrval=0x%x "
	    "escr=%d escrmsr=0x%x escrval=0x%x", pevent->pm_cccr_select,
	    cccrvalue, escr, pm->pm_md.pm_p4.pm_p4_escrmsr, escrvalue);

	return 0;
}
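
/*
 * Sketch of the values assembled above for a plain system-wide
 * PMC_EV_P4_INSTR_RETIRED allocation on a non-HTT machine, with no
 * threshold/edge/tag/qualifier/interrupt capabilities requested (so
 * the T0 fields default to both OS and USR):
 *
 *	cccrvalue = P4_CCCR_TO_ESCR_SELECT(0x04) |
 *	    P4_CCCR_TO_ACTIVE_THREAD(0x3);
 *	escrvalue = P4_ESCR_TO_EVENT_SELECT(0x02) |
 *	    P4_ESCR_T0_OS | P4_ESCR_T0_USR;
 *
 * with pm_p4_escr set to whichever of CRU_ESCR{0,1} matched the
 * requested row-index and was free.
 */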

/*
 * release a PMC.
 */

static int
p4_release_pmc(int cpu, int ri, struct pmc *pm)
{
	enum pmc_p4escr escr;
	struct pmc_hw *phw;
	struct p4_cpu *pc;

	if (p4_pmcdesc[ri].pm_descr.pd_class == PMC_CLASS_TSC)
		return 0;

	escr = pm->pm_md.pm_p4.pm_p4_escr;

	PMCDBG(MDP,REL,1, "p4-release cpu=%d ri=%d escr=%d", cpu, ri, escr);

	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
		phw = pc->pc_hwpmcs[ri];

		KASSERT(phw->phw_pmc == NULL,
		    ("[p4,%d] releasing configured PMC ri=%d", __LINE__, ri));

		P4_ESCR_UNMARK_ROW_STANDALONE(escr);
		KASSERT(pc->pc_escrs[escr] == ri,
		    ("[p4,%d] escr[%d] not allocated to ri %d", __LINE__,
			escr, ri));
		pc->pc_escrs[escr] = P4_INVALID_PMC_INDEX; /* mark as free */
	} else
		P4_ESCR_UNMARK_ROW_THREAD(escr);

	return 0;
}

/*
 * Start a PMC
 */

static int
p4_start_pmc(int cpu, int ri)
{
	int rc;
	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
	struct pmc *pm;
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row-index %d", __LINE__, ri));

	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];
	pm  = phw->phw_pmc;
	pd  = &p4_pmcdesc[ri];

	KASSERT(pm != NULL,
	    ("[p4,%d] starting cpu%d,pmc%d with null pmc", __LINE__,
		cpu, ri));

	PMCDBG(MDP,STA,1, "p4-start cpu=%d ri=%d", cpu, ri);

	if (pd->pm_descr.pd_class == PMC_CLASS_TSC) /* TSCs are always on */
		return 0;

	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
	    ("[p4,%d] wrong PMC class %d", __LINE__,
		pd->pm_descr.pd_class));

	/* retrieve the desired CCCR/ESCR values from the PMC */
	cccrvalue = pm->pm_md.pm_p4.pm_p4_cccrvalue;
	escrvalue = pm->pm_md.pm_p4.pm_p4_escrvalue;
	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;

	/* extract and zero the logical processor selection bits */
	cccrtbits = cccrvalue & P4_CCCR_OVF_PMI_T0;
	escrtbits = escrvalue & (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
	cccrvalue &= ~P4_CCCR_OVF_PMI_T0;
	escrvalue &= ~(P4_ESCR_T0_OS|P4_ESCR_T0_USR);

	if (P4_CPU_IS_HTT_SECONDARY(cpu)) { /* shift T0 bits to T1 position */
		cccrtbits <<= 1;
		escrtbits >>= 2;
	}

	/* start system mode PMCs directly */
	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		wrmsr(escrmsr, escrvalue | escrtbits);
		wrmsr(pd->pm_cccr_msr, cccrvalue | cccrtbits | P4_CCCR_ENABLE);
		return 0;
	}

	/*
	 * Thread mode PMCs
	 *
	 * On HTT machines, the same PMC could be scheduled on the
	 * same physical CPU twice (once for each logical CPU), for
	 * example, if two threads of a multi-threaded process get
	 * scheduled on the same CPU.
	 */

	mtx_lock_spin(&pc->pc_mtx);

	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);
	KASSERT(rc == 0 || rc == 1,
	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
		rc));

	if (rc == 0) {		/* 1st CPU and the non-HTT case */

		KASSERT(P4_PMC_IS_STOPPED(pd->pm_cccr_msr),
		    ("[p4,%d] cpu=%d ri=%d cccr=0x%x not stopped", __LINE__,
			cpu, ri, pd->pm_cccr_msr));

		/* write out the low 40 bits of the saved value to hardware */
		wrmsr(pd->pm_pmc_msr,
		    P4_PCPU_PMC_VALUE(pc,ri,cpu) & P4_PERFCTR_MASK);

	} else if (rc == 1) {		/* 2nd CPU */

		/*
		 * Stop the PMC and retrieve the CCCR and ESCR values
		 * from their MSRs, and turn on the additional T[0/1]
		 * bits for the 2nd CPU.
		 */

		cccrvalue = rdmsr(pd->pm_cccr_msr);
		wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);

		/* check that the configuration bits read back match the PMC */
		KASSERT((cccrvalue & P4_CCCR_Tx_MASK) ==
		    (pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK),
		    ("[p4,%d] Extra CCCR bits cpu=%d rc=%d ri=%d "
			"cccr=0x%x PMC=0x%x", __LINE__, cpu, rc, ri,
			cccrvalue & P4_CCCR_Tx_MASK,
			pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK));
		KASSERT(cccrvalue & P4_CCCR_ENABLE,
		    ("[p4,%d] 2nd cpu rc=%d cpu=%d ri=%d not running",
			__LINE__, rc, cpu, ri));
		KASSERT((cccrvalue & cccrtbits) == 0,
		    ("[p4,%d] CCCR T0/T1 mismatch rc=%d cpu=%d ri=%d "
		     "cccrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri,
			cccrvalue, cccrtbits));

		escrvalue = rdmsr(escrmsr);

		KASSERT((escrvalue & P4_ESCR_Tx_MASK) ==
		    (pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK),
		    ("[p4,%d] Extra ESCR bits cpu=%d rc=%d ri=%d "
			"escr=0x%x pm=0x%x", __LINE__, cpu, rc, ri,
			escrvalue & P4_ESCR_Tx_MASK,
			pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK));
		KASSERT((escrvalue & escrtbits) == 0,
		    ("[p4,%d] ESCR T0/T1 mismatch rc=%d cpu=%d ri=%d "
		     "escrmsr=0x%x escrvalue=0x%x tbits=0x%x", __LINE__,
			rc, cpu, ri, escrmsr, escrvalue, escrtbits));
	}

	/* Enable the correct bits for this CPU. */
	escrvalue |= escrtbits;
	cccrvalue |= cccrtbits | P4_CCCR_ENABLE;

	/* Save HW value at the time of starting hardware */
	P4_PCPU_HW_VALUE(pc,ri,cpu) = rdmsr(pd->pm_pmc_msr);

	/* Program the ESCR and CCCR and start the PMC */
	wrmsr(escrmsr, escrvalue);
	wrmsr(pd->pm_cccr_msr, cccrvalue);

	++rc;
	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);

	mtx_unlock_spin(&pc->pc_mtx);

	PMCDBG(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d "
	    "escrmsr=0x%x escrvalue=0x%x cccr_config=0x%x v=%jx", cpu, rc,
	    ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue,
	    cccrvalue, P4_PCPU_HW_VALUE(pc,ri,cpu));

	return 0;
}

/*
 * Stop a PMC.
 */

static int
p4_stop_pmc(int cpu, int ri)
{
	int rc;
	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
	struct pmc *pm;
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	struct p4pmc_descr *pd;
	pmc_value_t tmp;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] illegal row index %d", __LINE__, ri));

	pd  = &p4_pmcdesc[ri];

	if (pd->pm_descr.pd_class == PMC_CLASS_TSC)
		return 0;

	pc  = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];
	phw = pc->pc_hwpmcs[ri];

	KASSERT(phw != NULL,
	    ("[p4,%d] null phw for cpu%d, ri%d", __LINE__, cpu, ri));

	pm  = phw->phw_pmc;

	KASSERT(pm != NULL,
	    ("[p4,%d] null pmc for cpu%d, ri%d", __LINE__, cpu, ri));

	PMCDBG(MDP,STO,1, "p4-stop cpu=%d ri=%d", cpu, ri);

	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
		wrmsr(pd->pm_cccr_msr,
		    pm->pm_md.pm_p4.pm_p4_cccrvalue & ~P4_CCCR_ENABLE);
		return 0;
	}

	/*
	 * Thread mode PMCs.
	 *
	 * On HTT machines, this PMC may be in use by two threads
	 * running on two logical CPUs.  Thus we look at the saved
	 * run count and only turn off the appropriate T0/T1 bits
	 * (keeping the PMC running) if two logical CPUs were using
	 * the PMC.
	 */

	/* bits to mask */
	cccrtbits = P4_CCCR_OVF_PMI_T0;
	escrtbits = P4_ESCR_T0_OS | P4_ESCR_T0_USR;
	if (P4_CPU_IS_HTT_SECONDARY(cpu)) {
		cccrtbits <<= 1;
		escrtbits >>= 2;
	}

	mtx_lock_spin(&pc->pc_mtx);

	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);

	KASSERT(rc == 2 || rc == 1,
	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
		rc));

	--rc;

	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);

	/* Stop this PMC */
	cccrvalue = rdmsr(pd->pm_cccr_msr);
	wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);

	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;
	escrvalue = rdmsr(escrmsr);

	/* The current CPU should be running on this PMC */
	KASSERT(escrvalue & escrtbits,
	    ("[p4,%d] ESCR T0/T1 mismatch cpu=%d rc=%d ri=%d escrmsr=0x%x "
		"escrvalue=0x%x tbits=0x%x", __LINE__, cpu, rc, ri, escrmsr,
		escrvalue, escrtbits));
	KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)) ||
	    (cccrvalue & cccrtbits),
	    ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x "
		"tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits));

	/* get the current hardware reading */
	tmp = rdmsr(pd->pm_pmc_msr);

	if (rc == 1) {		/* need to keep the PMC running */
		escrvalue &= ~escrtbits;
		cccrvalue &= ~cccrtbits;
		wrmsr(escrmsr, escrvalue);
		wrmsr(pd->pm_cccr_msr, cccrvalue);
	}

	mtx_unlock_spin(&pc->pc_mtx);

	PMCDBG(MDP,STO,2, "p4-stop cpu=%d rc=%d ri=%d escrmsr=0x%x "
	    "escrval=0x%x cccrval=0x%x v=%jx", cpu, rc, ri, escrmsr,
	    escrvalue, cccrvalue, tmp);

	if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit counter overflow */
		tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu);
	else
		tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);

	P4_PCPU_PMC_VALUE(pc,ri,cpu) += tmp;

	return 0;
}

/*
 * Handle an interrupt.
 *
 * The hardware sets the CCCR_OVF flag whenever a counter overflow
 * occurs, so the handler examines all 18 CCCR registers, processing
 * the counters that have overflowed.
 *
 * On HTT machines, the CCCR register is shared and will interrupt
 * both logical processors if so configured.  Thus multiple logical
 * CPUs could enter the NMI service routine at the same time.  These
 * will get serialized using a per-cpu spinlock dedicated for use in
 * the NMI handler.
 */

static int
p4_intr(int cpu, uintptr_t eip, int usermode)
{
	int i, did_interrupt, error, ri;
	uint32_t cccrval, ovf_mask, ovf_partner;
	struct p4_cpu *pc;
	struct pmc_hw *phw;
	struct pmc *pm;
	pmc_value_t v;

	PMCDBG(MDP,INT, 1, "cpu=%d eip=%p um=%d", cpu, (void *) eip, usermode);

	pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)];

	ovf_mask = P4_CPU_IS_HTT_SECONDARY(cpu) ?
	    P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0;
	ovf_mask |= P4_CCCR_OVF;
	if (p4_system_has_htt)
		ovf_partner = P4_CPU_IS_HTT_SECONDARY(cpu) ? P4_CCCR_OVF_PMI_T0 :
		    P4_CCCR_OVF_PMI_T1;
	else
		ovf_partner = 0;
	did_interrupt = 0;

	if (p4_system_has_htt)
		P4_PCPU_ACQ_INTR_SPINLOCK(pc);

	/*
	 * Loop through all CCCRs, looking for ones that have
	 * interrupted this CPU.
	 */
	for (i = 0; i < P4_NPMCS-1; i++) {

		ri = i + 1;	/* row index */

		/*
		 * Check if our partner logical CPU has already marked
		 * this PMC as having interrupted it.  If so, reset
		 * the flag and process the interrupt, but leave the
		 * hardware alone.
		 */
		if (p4_system_has_htt && P4_PCPU_GET_INTRFLAG(pc,ri)) {
			P4_PCPU_SET_INTRFLAG(pc,ri,0);
			did_interrupt = 1;

			/*
			 * Ignore de-configured or stopped PMCs.
			 * Ignore PMCs not in sampling mode.
			 */
			phw = pc->pc_hwpmcs[ri];
			pm  = phw->phw_pmc;
			if (pm == NULL ||
			    pm->pm_state != PMC_STATE_RUNNING ||
			    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
				continue;
			}
			(void) pmc_process_interrupt(cpu, pm, eip, usermode);
			continue;
		}

		/*
		 * Fresh interrupt.  Look for the CCCR_OVF bit
		 * and the OVF_Tx bit for this logical
		 * processor being set.
		 */
		cccrval = rdmsr(P4_CCCR_MSR_FIRST + i);

		if ((cccrval & ovf_mask) != ovf_mask)
			continue;

		/*
		 * If the other logical CPU would also have been
		 * interrupted due to the PMC being shared, record
		 * this fact in the per-cpu saved interrupt flag
		 * bitmask.
		 */
		if (p4_system_has_htt && (cccrval & ovf_partner))
			P4_PCPU_SET_INTRFLAG(pc, ri, 1);

		v = rdmsr(P4_PERFCTR_MSR_FIRST + i);

		PMCDBG(MDP,INT, 2, "ri=%d v=%jx", ri, v);

		/* Stop the counter, and reset the overflow bit */
		cccrval &= ~(P4_CCCR_OVF | P4_CCCR_ENABLE);
		wrmsr(P4_CCCR_MSR_FIRST + i, cccrval);

		did_interrupt = 1;

		/*
		 * Ignore de-configured or stopped PMCs.  Ignore PMCs
		 * not in sampling mode.
		 */
		phw = pc->pc_hwpmcs[ri];
		pm  = phw->phw_pmc;

		if (pm == NULL ||
		    pm->pm_state != PMC_STATE_RUNNING ||
		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
			continue;
		}

		/*
		 * Process the interrupt.  Re-enable the PMC if
		 * processing was successful.
		 */
		error = pmc_process_interrupt(cpu, pm, eip, usermode);

		/*
		 * Only the first processor executing the NMI handler
		 * in an HTT pair will restart a PMC, and that too
		 * only if there were no errors.
		 */
		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(
			pm->pm_sc.pm_reloadcount);
		wrmsr(P4_PERFCTR_MSR_FIRST + i, v);
		if (error == 0)
			wrmsr(P4_CCCR_MSR_FIRST + i,
			    cccrval | P4_CCCR_ENABLE);
	}

	/* allow the other CPU to proceed */
	if (p4_system_has_htt)
		P4_PCPU_REL_INTR_SPINLOCK(pc);

	/*
	 * On Intel P4 CPUs, the PMC 'pcint' entry in the LAPIC gets
	 * masked when a PMC interrupts the CPU.  We need to unmask
	 * the interrupt source explicitly.
	 */

	if (did_interrupt)
		pmc_x86_lapic_enable_pmc_interrupt();

	atomic_add_int(did_interrupt ? &pmc_stats.pm_intr_processed :
	    &pmc_stats.pm_intr_ignored, 1);

	return did_interrupt;
}

/*
 * Describe a CPU's PMC state.
 */

static int
p4_describe(int cpu, int ri, struct pmc_info *pi,
    struct pmc **ppmc)
{
	int error;
	size_t copied;
	struct pmc_hw *phw;
	const struct p4pmc_descr *pd;

	KASSERT(cpu >= 0 && cpu < mp_ncpus,
	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] row-index %d out of range", __LINE__, ri));

	PMCDBG(MDP,OPS,1,"p4-describe cpu=%d ri=%d", cpu, ri);

	if (P4_CPU_IS_HTT_SECONDARY(cpu))
		return EINVAL;

	phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
	pd  = &p4_pmcdesc[ri];

	if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name,
		 PMC_NAME_MAX, &copied)) != 0)
		return error;

	pi->pm_class = pd->pm_descr.pd_class;

	if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) {
		pi->pm_enabled = TRUE;
		*ppmc          = phw->phw_pmc;
	} else {
		pi->pm_enabled = FALSE;
		*ppmc          = NULL;
	}

	return 0;
}

/*
 * Get MSR# for use with RDPMC.
 */

static int
p4_get_msr(int ri, uint32_t *msr)
{
	KASSERT(ri >= 0 && ri < P4_NPMCS,
	    ("[p4,%d] ri %d out of range", __LINE__, ri));

	*msr = p4_pmcdesc[ri].pm_pmc_msr - P4_PERFCTR_MSR_FIRST;

	PMCDBG(MDP,OPS, 1, "ri=%d getmsr=0x%x", ri, *msr);

	return 0;
}
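
/*
 * Illustrative sketch: once p4_switch_in() has set CR4.PCE, a thread
 * that owns a PMC can read it from user mode with the RDPMC
 * instruction, using the index computed above.  The helper below is
 * hypothetical and not part of this driver.
 */
#if 0
static __inline uint64_t
p4_rdpmc(uint32_t index)	/* index == pm_pmc_msr - P4_PERFCTR_MSR_FIRST */
{
	uint32_t hi, lo;

	__asm __volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (index));
	return (((uint64_t) hi << 32) | lo);
}
#endif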

int
pmc_initialize_p4(struct pmc_mdep *pmc_mdep)
{
	struct p4_event_descr *pe;

	KASSERT(strcmp(cpu_vendor, "GenuineIntel") == 0,
	    ("[p4,%d] Initializing non-intel processor", __LINE__));

	PMCDBG(MDP,INI,1, "%s", "p4-initialize");

	switch (pmc_mdep->pmd_cputype) {
	case PMC_CPU_INTEL_PIV:

		pmc_mdep->pmd_npmc	    = P4_NPMCS;
		pmc_mdep->pmd_classes[1].pm_class = PMC_CLASS_P4;
		pmc_mdep->pmd_classes[1].pm_caps  = P4_PMC_CAPS;
		pmc_mdep->pmd_classes[1].pm_width = 40;
		pmc_mdep->pmd_nclasspmcs[1] = 18;

		pmc_mdep->pmd_init    	    = p4_init;
		pmc_mdep->pmd_cleanup 	    = p4_cleanup;
		pmc_mdep->pmd_switch_in     = p4_switch_in;
		pmc_mdep->pmd_switch_out    = p4_switch_out;
		pmc_mdep->pmd_read_pmc 	    = p4_read_pmc;
		pmc_mdep->pmd_write_pmc     = p4_write_pmc;
		pmc_mdep->pmd_config_pmc    = p4_config_pmc;
		pmc_mdep->pmd_get_config    = p4_get_config;
		pmc_mdep->pmd_allocate_pmc  = p4_allocate_pmc;
		pmc_mdep->pmd_release_pmc   = p4_release_pmc;
		pmc_mdep->pmd_start_pmc     = p4_start_pmc;
		pmc_mdep->pmd_stop_pmc      = p4_stop_pmc;
		pmc_mdep->pmd_intr	    = p4_intr;
		pmc_mdep->pmd_describe      = p4_describe;
		pmc_mdep->pmd_get_msr  	    = p4_get_msr; /* i386 */

		/* model specific munging */
		if ((cpu_id & 0xFFF) < 0xF27) {

			/*
			 * On P4 and Xeon with CPUID < (Family 15,
			 * Model 2, Stepping 7), only one ESCR is
			 * available for the IOQ_ALLOCATION event.
			 */

			pe = p4_find_event(PMC_EV_P4_IOQ_ALLOCATION);
			pe->pm_escrs[1] = P4_ESCR_NONE;
		}

		break;

	default:
		KASSERT(0, ("[p4,%d] Unknown CPU type", __LINE__));
		return ENOSYS;
	}

	return 0;
}
