/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Xen event provider for DTrace
 *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution and
 * may disappear or be re-implemented at any time.
 *
 * This provider isn't suitable as a general-purpose solution for a number of
 * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
 * have any way to gather data other than that collected by the Xen trace
 * buffers. Further, it does not fit into the DTrace model (see "Interacting
 * with DTrace" below).
 *
 *
 * Tracing in Xen
 * --------------
 *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
 *
 *               +---------+
 * +------+      |         |
 * | CPUn |----> | BUFFERn |
 * +------+      |         |
 *               +---------+- tbuf.va + (tbuf.size * n)
 *               :         :
 *               +---------+
 * +------+      |         |
 * | CPU1 |----> | BUFFER1 |
 * +------+      |         |
 *               +---------+- tbuf.va + tbuf.size
 * +------+      |         |
 * | CPU0 |----> | BUFFER0 |
 * +------+      |         |
 *               +---------+- tbuf.va
 *
 * Each CPU buffer consists of a metadata header followed by the trace records.
 * The metadata consists of a producer/consumer pair of pointers into the
 * buffer that point to the next record to be written and the next record to
 * be read, respectively. The trace record format is as follows:
 *
 * +--------------------------------------------------------------------------+
 * | CPUID(uint_t) | TSC(uint64_t) | EVENTID(uint32_t) |     DATA FIELDS      |
 * +--------------------------------------------------------------------------+
 *
 * DATA FIELDS:
 * +--------------------------------------------------------------------------+
 * | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) | D4(uint32_t) | D5(uint32_t) |
 * +--------------------------------------------------------------------------+
 *
 *
 * Interacting with DTrace
 * -----------------------
 *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and feed
 * each entry into dtrace_probe() with the corresponding probe ID for the
 * event. Because records are collected periodically, probe firings are
 * asynchronous. This is the only sensible way to implement this form of
 * provider, but it means that questions like "which CPU am I on?" and, more
 * importantly, arbitrary questions about the context surrounding the probe
 * firing are not meaningful. Consumers should therefore not attempt to infer
 * anything beyond what is supplied via the probe arguments.
 */
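
/*
 * Example usage (a sketch only; this provider is PRIVATE): given the
 * argument layout produced by xdt_process_rec() below, where arg0 is
 * always the physical cpuid, a consumer could watch domain context
 * switches with a D script along these lines:
 *
 *	xdt:sched::off-cpu
 *	{
 *		printf("pcpu %d: dom %d vcpu %d", arg0, arg1, arg2);
 *	}
 */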

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/cmn_err.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/cyclic.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/hypervisor.h>
#include <xen/public/trace.h>
#include <xen/public/sched.h>

#define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
#define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
#define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 */
#define	IS_IDLE_DOM(domid)	((domid) == 0x7FFFU)

/* Macros to extract the domid and cpuid from a HVM trace data field */
#define	HVM_DOMID(d)		((d) >> 16)
#define	HVM_VCPUID(d)		((d) & 0xFFFF)
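/* For example, d == 0x00020001 decodes to domid 2, vcpuid 1. */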

#define	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, arg3) {		\
	dtrace_id_t id = xdt_probemap[event];				\
	if (id)								\
		dtrace_probe(id, cpuid, arg0, arg1, arg2, arg3);	\
}

#define	XDT_PROBE3(event, cpuid, arg0, arg1, arg2) \
	XDT_PROBE4(event, cpuid, arg0, arg1, arg2, 0)

#define	XDT_PROBE2(event, cpuid, arg0, arg1) \
	XDT_PROBE4(event, cpuid, arg0, arg1, 0, 0)

#define	XDT_PROBE1(event, cpuid, arg0) \
	XDT_PROBE4(event, cpuid, arg0, 0, 0, 0)

#define	XDT_PROBE0(event, cpuid) \
	XDT_PROBE4(event, cpuid, 0, 0, 0, 0)
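
/*
 * Note that the cpuid passed to the XDT_PROBE* macros becomes arg0 of
 * the firing probe; the trace record data fields follow as arg1..arg4.
 */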

/* Probe classes */
#define	XDT_SCHED			0
#define	XDT_MEM				1
#define	XDT_HVM				2
#define	XDT_NCLASSES			3

/* Probe events */
#define	XDT_EVT_INVALID			(-(int)1)
#define	XDT_SCHED_OFF_CPU		0
#define	XDT_SCHED_ON_CPU		1
#define	XDT_SCHED_IDLE_OFF_CPU		2
#define	XDT_SCHED_IDLE_ON_CPU		3
#define	XDT_SCHED_BLOCK			4
#define	XDT_SCHED_SLEEP			5
#define	XDT_SCHED_WAKE			6
#define	XDT_SCHED_YIELD			7
#define	XDT_SCHED_SHUTDOWN_POWEROFF	8
#define	XDT_SCHED_SHUTDOWN_REBOOT	9
#define	XDT_SCHED_SHUTDOWN_SUSPEND	10
#define	XDT_SCHED_SHUTDOWN_CRASH	11
#define	XDT_MEM_PAGE_GRANT_MAP		12
#define	XDT_MEM_PAGE_GRANT_UNMAP	13
#define	XDT_MEM_PAGE_GRANT_TRANSFER	14
#define	XDT_HVM_VMENTRY			15
#define	XDT_HVM_VMEXIT			16
#define	XDT_NEVENTS			17

typedef struct {
	const char	*pr_mod;	/* probe module */
	const char	*pr_name;	/* probe name */
	int		evt_id;		/* event id */
	uint_t		class;		/* probe class */
} xdt_probe_t;

typedef struct {
	uint32_t	trc_mask;	/* trace mask */
	uint32_t	cnt;		/* num enabled probes in class */
} xdt_classinfo_t;

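/*
 * Xen emits separate INFPREV and INFNEXT records in addition to each SWITCH
 * record; xdt_process_rec() accumulates their payloads here, per physical
 * CPU, and fires the on-cpu/off-cpu probes when the SWITCH record arrives.
 */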
typedef struct {
	ulong_t prev_domid;		/* previous dom executed */
	ulong_t prev_vcpuid;		/* previous vcpu executed */
	ulong_t prev_ctime;		/* time spent on cpu */
	ulong_t next_domid;		/* next dom to be scheduled */
	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
	ulong_t next_wtime;		/* time spent waiting to get on cpu */
	ulong_t next_ts;		/* allocated time slice */
} xdt_schedinfo_t;

static struct {
	uint_t cnt;			/* total num of trace buffers */
	size_t size;			/* size of each cpu buffer */
	mfn_t start_mfn;		/* starting mfn of buffers */
	caddr_t va;			/* va buffers are mapped into */

	/* per-cpu buffers */
	struct t_buf **meta;		/* buffer metadata */
	struct t_rec **data;		/* buffer data records */

	/* statistics */
	uint64_t stat_dropped_recs;	/* records dropped */
	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
	uint64_t stat_unknown_recs;	/* unknown records */
} tbuf;

static char *xdt_stats[] = {
	"dropped_recs",
};

/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *     set xdt:xdt_tbuf_pages = 40
 */
uint_t xdt_tbuf_pages = 20;			/* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *     xdt_poll_nsec = 200000000;
 */
static hrtime_t xdt_poll_nsec;			/* trace buffer poll interval */

/*
 * Internal variables
 */
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static uint_t xdt_ncpus;			/* total number of phys CPUs */
static uint32_t cur_trace_mask;			/* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo;	/* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];		/* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];		/* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
static kstat_t *xdt_kstats;
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];

static xdt_probe_t xdt_probe[] = {
	/* Sched probes */
	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
		XDT_SCHED },
	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },

	/* Memory probes */
	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

	/* HVM probes */
	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },

	{ NULL }
};

extern uint_t xen_get_nphyscpus(void);

static inline uint32_t
xdt_nr_active_probes(void)
{
	int i;
	uint32_t tot = 0;

	for (i = 0; i < XDT_NCLASSES; i++)
		tot += xdt_classinfo[i].cnt;

	return (tot);
}

static void
xdt_init_trace_masks(void)
{
	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
}

static int
xdt_kstat_update(kstat_t *ksp, int flag)
{
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * xdt_stats.
	 */
	(knp++)->value.ui64 = tbuf.stat_dropped_recs;

	return (0);
}

static void
xdt_kstat_init(void)
{
	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
	char **cp = xdt_stats;
	kstat_named_t *knp;

	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
		return;

	xdt_kstats->ks_update = xdt_kstat_update;

	knp = xdt_kstats->ks_data;
	while (nstats > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
		knp++;
		cp++;
		nstats--;
	}

	kstat_install(xdt_kstats);
}

static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
{
	xen_sysctl_t op;
	int xerr;

	op.cmd = XEN_SYSCTL_tbuf_op;
	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
	op.u.tbuf_op = *tbuf_op;

	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
		return (xen_xlate_errcode(xerr));

	*tbuf_op = op.u.tbuf_op;
	return (0);
}

static int
xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
{
	x86pte_t pte;
	caddr_t const sva = va;
	caddr_t const eva = va + len;
	int xerr;

	ASSERT(mfn != MFN_INVALID);
	ASSERT(va != NULL);
	ASSERT(IS_PAGEALIGNED(len));

	for (; va < eva; va += MMU_PAGESIZE) {
		/*
		 * Ask the HAT to load a throwaway mapping to page zero, then
		 * overwrite it with the hypervisor mapping. It gets removed
		 * later via hat_unload().
		 */
		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
		    PROT_READ | HAT_UNORDERED_OK,
		    HAT_LOAD_NOCONSIST | HAT_LOAD);

		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
		    | PT_FOREIGN | PT_WRITABLE;
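
		/*
		 * PT_FOREIGN marks this as a mapping of a frame owned by
		 * another domain (here DOMID_XEN, since the trace buffers
		 * belong to the hypervisor), which is why the PTE update
		 * goes through HYPERVISOR_update_va_mapping_otherdomain().
		 */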

		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);

		if (xerr != 0) {
			/* unmap pages loaded so far */
			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
			    (uintptr_t)sva;
			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
			return (xen_xlate_errcode(xerr));
		}

		mfn++;
	}

	return (0);
}

static int
xdt_attach_trace_buffers(void)
{
	xen_sysctl_tbuf_op_t tbuf_op;
	size_t len;
	int err;
	uint_t i;

	/*
	 * Xen does not support trace buffer resizing. If the buffers
	 * have already been allocated we just use them as is.
	 */
	tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
		return (err);

	if (tbuf_op.size == 0) {
		/* set trace buffer size */
		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_size;
		tbuf_op.size = xdt_tbuf_pages;
		(void) xdt_sysctl_tbuf(&tbuf_op);

		/* get trace buffer info */
		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
		if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
			return (err);

		if (tbuf_op.size == 0) {
			cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
			return (ENOBUFS);
		}
	}

	tbuf.size = tbuf_op.size;
	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
	tbuf.cnt = xdt_ncpus;

	ASSERT(tbuf.start_mfn != MFN_INVALID);
	ASSERT(tbuf.cnt > 0);

	len = tbuf.size * tbuf.cnt;
	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);

	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
		vmem_free(heap_arena, tbuf.va, len);
		tbuf.va = NULL;
		return (err);
	}

	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
	    KM_SLEEP);
	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
	    KM_SLEEP);

	for (i = 0; i < tbuf.cnt; i++) {
		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
		tbuf.meta[i] = cpu_buf;
		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
		    sizeof (struct t_buf));

		/* throw away stale trace records */
		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
	}

	return (0);
}

static void
xdt_detach_trace_buffers(void)
{
	size_t len = tbuf.size * tbuf.cnt;

	ASSERT(tbuf.va != NULL);

	hat_unload(kas.a_hat, tbuf.va, len,
	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
	vmem_free(heap_arena, tbuf.va, len);
	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
}

static inline void
xdt_process_rec(uint_t cpuid, struct t_rec *rec)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
	int eid;

	ASSERT(rec != NULL);
	ASSERT(xdt_ncpus == xen_get_nphyscpus());

	if (cpuid >= xdt_ncpus) {
		tbuf.stat_spurious_cpu++;
		return;
	}

	switch (rec->event) {

	/*
	 * Sched probes
	 */
	case TRC_SCHED_SWITCH_INFPREV:
		/*
		 * Info on vCPU being de-scheduled
		 *
		 * rec->data[0] = prev domid
		 * rec->data[1] = time spent on pcpu
		 */
		sp->prev_domid = rec->data[0];
		sp->prev_ctime = rec->data[1];
		break;

	case TRC_SCHED_SWITCH_INFNEXT:
		/*
		 * Info on next vCPU to be scheduled
		 *
		 * rec->data[0] = next domid
		 * rec->data[1] = time spent waiting to get on cpu
		 * rec->data[2] = time slice
		 */
		sp->next_domid = rec->data[0];
		sp->next_wtime = rec->data[1];
		sp->next_ts = rec->data[2];
		break;

	case TRC_SCHED_SWITCH:
		/*
		 * vCPU switch
		 *
		 * rec->data[0] = prev domid
		 * rec->data[1] = prev vcpuid
		 * rec->data[2] = next domid
		 * rec->data[3] = next vcpuid
		 */
		if (rec->data[0] != sp->prev_domid &&
		    rec->data[2] != sp->next_domid) {
			/* prev and next info don't match doms being sched'd */
			tbuf.stat_spurious_switch++;
			return;
		}

		sp->prev_vcpuid = rec->data[1];
		sp->next_vcpuid = rec->data[3];

		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid) ?
		    XDT_SCHED_IDLE_OFF_CPU : XDT_SCHED_OFF_CPU,
		    cpuid, sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);

		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid) ?
		    XDT_SCHED_IDLE_ON_CPU : XDT_SCHED_ON_CPU,
		    cpuid, sp->next_domid, sp->next_vcpuid, sp->next_wtime,
		    sp->next_ts);
		break;

	case TRC_SCHED_BLOCK:
		/*
		 * vCPU blocked
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_BLOCK, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_SLEEP:
		/*
		 * Put vCPU to sleep
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_SLEEP, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_WAKE:
		/*
		 * Wake up vCPU
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_WAKE, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_YIELD:
		/*
		 * vCPU yielded
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_YIELD, cpuid, rec->data[0], rec->data[1]);
		break;

	case TRC_SCHED_SHUTDOWN:
		/*
		 * Guest shutting down
		 *
		 * rec->data[0] = domid
		 * rec->data[1] = initiating vcpu
		 * rec->data[2] = shutdown code
		 */
		switch (rec->data[2]) {
		case SHUTDOWN_poweroff:
			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
			break;
		case SHUTDOWN_reboot:
			eid = XDT_SCHED_SHUTDOWN_REBOOT;
			break;
		case SHUTDOWN_suspend:
			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
			break;
		case SHUTDOWN_crash:
			eid = XDT_SCHED_SHUTDOWN_CRASH;
			break;
		default:
			tbuf.stat_unknown_shutdown++;
			return;
		}

		XDT_PROBE1(eid, cpuid, rec->data[0]);
		break;

	/*
	 * Mem probes
	 */
	case TRC_MEM_PAGE_GRANT_MAP:
		/*
		 * Guest mapped page grant
		 *
		 * rec->data[0] = domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, cpuid, rec->data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_UNMAP:
		/*
		 * Guest unmapped page grant
		 *
		 * rec->data[0] = domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, cpuid, rec->data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_TRANSFER:
		/*
		 * Page grant is being transferred
		 *
		 * rec->data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, cpuid, rec->data[0]);
		break;

	/*
	 * HVM probes
	 */
	case TRC_HVM_VMENTRY:
		/*
		 * Return to guest via vmx_launch/vmrun
		 *
		 * rec->data[0] = (domid << 16) | vcpuid
		 */
		XDT_PROBE2(XDT_HVM_VMENTRY, cpuid, HVM_DOMID(rec->data[0]),
		    HVM_VCPUID(rec->data[0]));
		break;

	case TRC_HVM_VMEXIT:
		/*
		 * Entry into VMEXIT handler
		 *
		 * rec->data[0] = (domid << 16) | vcpuid
		 * rec->data[1] = guest rip
		 * rec->data[2] = cpu vendor specific exit code
		 */
		XDT_PROBE4(XDT_HVM_VMEXIT, cpuid, HVM_DOMID(rec->data[0]),
		    HVM_VCPUID(rec->data[0]), rec->data[1], rec->data[2]);
		break;

	case TRC_LOST_RECORDS:
		tbuf.stat_dropped_recs++;
		break;

	default:
		tbuf.stat_unknown_recs++;
		break;
	}
}

/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
	uint_t cpuid;
	size_t nrecs;
	struct t_rec *rec;
	uint32_t prod;

	nrecs = (tbuf.size - sizeof (struct t_buf)) / sizeof (struct t_rec);

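	/*
	 * Note: cons and prod appear to be free-running record counters
	 * (hence the "% nrecs" indexing below) rather than wrapped offsets.
	 */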
	/* scan all cpu buffers for new records */
	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
		prod = tbuf.meta[cpuid]->prod;
		membar_consumer(); /* read prod /then/ data */
		while (tbuf.meta[cpuid]->cons != prod) {
			rec = tbuf.data[cpuid] + tbuf.meta[cpuid]->cons % nrecs;
			xdt_process_rec(cpuid, rec);
			membar_exit(); /* read data /then/ update cons */
			tbuf.meta[cpuid]->cons++;
		}
	}
}

static void
xdt_cyclic_enable(void)
{
	cyc_handler_t hdlr;
	cyc_time_t when;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hdlr.cyh_func = xdt_tbuf_scan;
	hdlr.cyh_arg = NULL;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_interval = xdt_poll_nsec;
	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;

	xdt_cyclic = cyclic_add(&hdlr, &when);
}

static void
xdt_probe_create(xdt_probe_t *p)
{
	ASSERT(p != NULL && p->pr_mod != NULL);

	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
		return;

	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
	    p->pr_name, dtrace_mach_aframes(), p);
}

/*ARGSUSED*/
static void
xdt_provide(void *arg, const dtrace_probedesc_t *desc)
{
	const char *mod, *name;
	int i;

	if (desc == NULL) {
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			xdt_probe_create(&xdt_probe[i]);
		}
	} else {
		mod = desc->dtpd_mod;
		name = desc->dtpd_name;
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			int l1 = strlen(xdt_probe[i].pr_name);
			int l2 = strlen(xdt_probe[i].pr_mod);
			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
				break;
		}

		if (xdt_probe[i].pr_mod == NULL)
			return;
		xdt_probe_create(&xdt_probe[i]);
	}
}

/*ARGSUSED*/
static void
xdt_destroy(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xdt_prid[p->evt_id] = 0;
}

static void
xdt_set_trace_mask(uint32_t mask)
{
	xen_sysctl_tbuf_op_t tbuf_op;

	tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_evt_mask;
	tbuf_op.evt_mask = mask;
	(void) xdt_sysctl_tbuf(&tbuf_op);
}

/*ARGSUSED*/
static int
xdt_enable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_prid[p->evt_id] != 0);

	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
	xdt_classinfo[p->class].cnt++;

	if (xdt_classinfo[p->class].cnt == 1) {
		/* set the trace mask for this class */
		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
		xdt_set_trace_mask(cur_trace_mask);
	}

	if (xdt_cyclic == CYCLIC_NONE) {
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
			return (-1);
		}

		xdt_cyclic_enable();
	}
	return (0);
}

/*ARGSUSED*/
static void
xdt_disable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;
	int i, err;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_probemap[p->evt_id] != 0);
	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
	ASSERT(xdt_classinfo[p->class].cnt > 0);

	/*
	 * We could be here in the slight window between the cyclic firing and
	 * a call to dtrace_probe() occurring. We need to be careful if we tear
	 * down any shared state.
	 */

	xdt_probemap[p->evt_id] = 0;
	xdt_classinfo[p->class].cnt--;

	if (xdt_nr_active_probes() == 0) {
		cur_trace_mask = 0;

		if (xdt_cyclic == CYCLIC_NONE)
			return;

		/*
		 * We will try to disable the trace buffers. If we fail for some
		 * reason we will try again, up to a count of XDT_TBUF_RETRY.
		 * If we still aren't successful we try to set the trace mask
		 * to 0 in order to prevent trace records from being written.
		 */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
		i = 0;
		do {
			err = xdt_sysctl_tbuf(&tbuf_op);
		} while ((err != 0) && (++i < XDT_TBUF_RETRY));

		if (err != 0) {
			cmn_err(CE_NOTE,
			    "Couldn't disable hypervisor tracing.");
			xdt_set_trace_mask(0);
		} else {
			cyclic_remove(xdt_cyclic);
			xdt_cyclic = CYCLIC_NONE;
			/*
			 * We don't bother making the hypercall to set
			 * the trace mask, since it will be reset when
			 * tracing is re-enabled.
			 */
		}
	} else if (xdt_classinfo[p->class].cnt == 0) {
		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
		/* other probes are enabled, so add the sub-class mask back */
		cur_trace_mask |= 0xF000;
		xdt_set_trace_mask(cur_trace_mask);
	}
}

static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};

static dtrace_pops_t xdt_pops = {
	xdt_provide,		/* dtps_provide() */
	NULL,			/* dtps_provide_module() */
	xdt_enable,		/* dtps_enable() */
	xdt_disable,		/* dtps_disable() */
	NULL,			/* dtps_suspend() */
	NULL,			/* dtps_resume() */
	NULL,			/* dtps_getargdesc() */
	NULL,			/* dtps_getargval() */
	NULL,			/* dtps_usermode() */
	xdt_destroy		/* dtps_destroy() */
};

static int
xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int val;

	if (!DOMAIN_IS_INITDOMAIN(xen_info))
		return (DDI_FAILURE);

	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	xdt_ncpus = xen_get_nphyscpus();
	ASSERT(xdt_ncpus > 0);

	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
	    &xdt_pops, NULL, &xdt_id) != 0) {
		if (tbuf.va != NULL)
			xdt_detach_trace_buffers();
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);

	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_alloc(xdt_ncpus *
	    sizeof (xdt_schedinfo_t), KM_SLEEP);
	xdt_init_trace_masks();
	xdt_kstat_init();

	xdt_devi = devi;
	ddi_report_dev(devi);
	return (DDI_SUCCESS);
}

static int
xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	if (dtrace_unregister(xdt_id) != 0)
		return (DDI_FAILURE);

	xdt_detach_trace_buffers();
	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
	if (xdt_cyclic != CYCLIC_NONE)
		cyclic_remove(xdt_cyclic);
	if (xdt_kstats != NULL)
		kstat_delete(xdt_kstats);
	xdt_devi = (void *)0;
	ddi_remove_minor_node(devi, NULL);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = xdt_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops xdt_cb_ops = {
	nulldev,		/* open(9E) */
	nodev,			/* close(9E) */
	nodev,			/* strategy(9E) */
	nodev,			/* print(9E) */
	nodev,			/* dump(9E) */
	nodev,			/* read(9E) */
	nodev,			/* write(9E) */
	nodev,			/* ioctl(9E) */
	nodev,			/* devmap(9E) */
	nodev,			/* mmap(9E) */
	nodev,			/* segmap(9E) */
	nochpoll,		/* chpoll(9E) */
	ddi_prop_op,		/* prop_op(9E) */
	NULL,			/* streamtab(9S) */
	D_MP | D_64BIT | D_NEW	/* cb_flag */
};

static struct dev_ops xdt_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdt_info,		/* getinfo(9E) */
	nulldev,		/* identify(9E) */
	nulldev,		/* probe(9E) */
	xdt_attach,		/* attach(9E) */
	xdt_detach,		/* detach(9E) */
	nulldev,		/* devo_reset */
	&xdt_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* power(9E) */
	ddi_quiesce_not_needed,	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"Hypervisor event tracing",
	&xdt_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}