1/*	$NetBSD: tprof.c,v 1.23 2023/04/11 10:07:12 msaitoh Exp $	*/
2
3/*-
4 * Copyright (c)2008,2009,2010 YAMAMOTO Takashi,
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.23 2023/04/11 10:07:12 msaitoh Exp $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35
36#include <sys/callout.h>
37#include <sys/conf.h>
38#include <sys/cpu.h>
39#include <sys/kmem.h>
40#include <sys/module.h>
41#include <sys/percpu.h>
42#include <sys/poll.h>
43#include <sys/proc.h>
44#include <sys/queue.h>
45#include <sys/select.h>
46#include <sys/workqueue.h>
47#include <sys/xcall.h>
48
49#include <dev/tprof/tprof.h>
50#include <dev/tprof/tprof_ioctl.h>
51
52#include "ioconf.h"
53
/*
 * TPROF_HZ: divisor used to turn a counter's estimated frequency into a
 * sampling period (see tprof_configure_event()); it also sizes the per-cpu
 * sample buffers through TPROF_MAX_SAMPLES_PER_BUF.  The #ifndef lets an
 * earlier definition take precedence.
 */
#ifndef TPROF_HZ
#define TPROF_HZ	10000
#endif
57
58/*
59 * locking order:
60 *	tprof_reader_lock -> tprof_lock
61 *	tprof_startstop_lock -> tprof_lock
62 */
63
64/*
65 * protected by:
66 *	L: tprof_lock
67 *	R: tprof_reader_lock
68 *	S: tprof_startstop_lock
69 *	s: writer should hold tprof_startstop_lock and tprof_lock
70 *	   reader should hold tprof_startstop_lock or tprof_lock
71 */
72
/*
 * A buffer of samples.  Each cpu fills one through tprof_sample(); the
 * worker moves non-empty buffers onto tprof_list for read(2).
 */
typedef struct tprof_buf {
	u_int b_used;		/* # of valid entries in b_data */
	u_int b_size;		/* capacity of b_data, in samples */
	u_int b_overflow;	/* # of samples dropped because b_data was full */
	u_int b_unused;
	STAILQ_ENTRY(tprof_buf) b_list;	/* linkage on tprof_list */
	tprof_sample_t b_data[];
} tprof_buf_t;
/* allocation size of a tprof_buf_t with room for sz samples */
#define	TPROF_BUF_BYTESIZE(sz) \
	(sizeof(tprof_buf_t) + (sz) * sizeof(tprof_sample_t))
#define	TPROF_MAX_SAMPLES_PER_BUF	TPROF_HZ
84
/* per-cpu profiler state, reached through the tprof_cpus percpu object */
typedef struct {
	tprof_buf_t *c_buf;	/* buffer tprof_sample() is currently filling */
	uint32_t c_cpuid;	/* cpu_index() of the owning cpu */
	struct work c_work;	/* work item handled by tprof_worker() */
	callout_t c_callout;	/* fires tprof_kick() to re-queue c_work */
} __aligned(CACHE_LINE_SIZE) tprof_cpu_t;
91
/* a registered backend: its softc, name, ops vector and list linkage */
typedef struct tprof_backend {
	/*
	 * tprof_backend_softc_t must be passed as an argument to the
	 * interrupt handler, but this is difficult to implement on
	 * armv7/v8, so tprof_backend is exposed instead.  Additionally,
	 * the softc must be placed at the beginning of
	 * struct tprof_backend.
	 */
	tprof_backend_softc_t tb_softc;

	const char *tb_name;	/* name passed to tprof_backend_register() */
	const tprof_backend_ops_t *tb_ops;
	LIST_ENTRY(tprof_backend) tb_list;	/* linkage on tprof_backends */
} tprof_backend_t;
105
static kmutex_t tprof_lock;
static u_int tprof_nworker;		/* L: # of running worker LWPs */
static lwp_t *tprof_owner;		/* L: lwp that opened the device */
static STAILQ_HEAD(, tprof_buf) tprof_list; /* L: global buffer list */
static u_int tprof_nbuf_on_list;	/* L: # of buffers on tprof_list */
static struct workqueue *tprof_wq;	/* created in tprof_start() */
static struct percpu *tprof_cpus __read_mostly;	/* tprof_cpu_t * */
static u_int tprof_samples_per_buf;	/* b_size given to new buffers */
static u_int tprof_max_buf;		/* limit for tprof_nbuf_on_list */

/* the backend in use; not static (see the comment in struct tprof_backend) */
tprof_backend_t *tprof_backend;	/* S: */
static LIST_HEAD(, tprof_backend) tprof_backends =
    LIST_HEAD_INITIALIZER(tprof_backend); /* S: */

static kmutex_t tprof_reader_lock;
static kcondvar_t tprof_reader_cv;	/* L: */
/* byte offset already consumed from the buffer at the head of tprof_list */
static off_t tprof_reader_offset;	/* R: */

static kmutex_t tprof_startstop_lock;
static kcondvar_t tprof_cv;		/* L: */
static struct selinfo tprof_selp;	/* L: */

static struct tprof_stat tprof_stat;	/* L: */
129
130static tprof_cpu_t *
131tprof_cpu_direct(struct cpu_info *ci)
132{
133	tprof_cpu_t **cp;
134
135	cp = percpu_getptr_remote(tprof_cpus, ci);
136	return *cp;
137}
138
139static tprof_cpu_t *
140tprof_cpu(struct cpu_info *ci)
141{
142	tprof_cpu_t *c;
143
144	/*
145	 * As long as xcalls are blocked -- e.g., by kpreempt_disable
146	 * -- the percpu object will not be swapped and destroyed.  We
147	 * can't write to it, because the data may have already been
148	 * moved to a new buffer, but we can safely read from it.
149	 */
150	kpreempt_disable();
151	c = tprof_cpu_direct(ci);
152	kpreempt_enable();
153
154	return c;
155}
156
157static tprof_cpu_t *
158tprof_curcpu(void)
159{
160
161	return tprof_cpu(curcpu());
162}
163
164static tprof_buf_t *
165tprof_buf_alloc(void)
166{
167	tprof_buf_t *new;
168	u_int size = tprof_samples_per_buf;
169
170	new = kmem_alloc(TPROF_BUF_BYTESIZE(size), KM_SLEEP);
171	new->b_used = 0;
172	new->b_size = size;
173	new->b_overflow = 0;
174	return new;
175}
176
177static void
178tprof_buf_free(tprof_buf_t *buf)
179{
180
181	kmem_free(buf, TPROF_BUF_BYTESIZE(buf->b_size));
182}
183
184static tprof_buf_t *
185tprof_buf_switch(tprof_cpu_t *c, tprof_buf_t *new)
186{
187	tprof_buf_t *old;
188
189	old = c->c_buf;
190	c->c_buf = new;
191	return old;
192}
193
194static tprof_buf_t *
195tprof_buf_refresh(void)
196{
197	tprof_cpu_t * const c = tprof_curcpu();
198	tprof_buf_t *new;
199
200	new = tprof_buf_alloc();
201	return tprof_buf_switch(c, new);
202}
203
/*
 * tprof_worker: workqueue handler (registered by tprof_start()).
 *
 * Swaps a fresh buffer into the current cpu and disposes of the old one:
 * a non-empty buffer is queued on tprof_list for read(2) as long as the
 * list holds fewer than tprof_max_buf buffers; otherwise it is counted
 * in tprof_stat and freed.  When no backend is set or no counter is
 * running, the worker drops itself from tprof_nworker and wakes the
 * waiters instead of re-arming its callout.
 */
static void
tprof_worker(struct work *wk, void *dummy)
{
	tprof_cpu_t * const c = tprof_curcpu();
	tprof_buf_t *buf;
	tprof_backend_t *tb;
	bool shouldstop;

	KASSERT(wk == &c->c_work);
	KASSERT(dummy == NULL);

	/*
	 * Get a per cpu buffer.
	 */
	buf = tprof_buf_refresh();

	/*
	 * and put it on the global list for read(2).
	 */
	mutex_enter(&tprof_lock);
	tb = tprof_backend;
	shouldstop = (tb == NULL || tb->tb_softc.sc_ctr_running_mask == 0);
	if (shouldstop) {
		KASSERT(tprof_nworker > 0);
		tprof_nworker--;
		/* tprof_stop() waits on tprof_cv, tprof_read() on the other */
		cv_broadcast(&tprof_cv);
		cv_broadcast(&tprof_reader_cv);
	}
	if (buf->b_used == 0) {
		tprof_stat.ts_emptybuf++;
	} else if (tprof_nbuf_on_list < tprof_max_buf) {
		tprof_stat.ts_sample += buf->b_used;
		tprof_stat.ts_overflow += buf->b_overflow;
		tprof_stat.ts_buf++;
		STAILQ_INSERT_TAIL(&tprof_list, buf, b_list);
		tprof_nbuf_on_list++;
		/* the list owns the buffer now; don't free it below */
		buf = NULL;
		selnotify(&tprof_selp, 0, NOTE_SUBMIT);
		cv_broadcast(&tprof_reader_cv);
	} else {
		/* list is full: the samples in this buffer are dropped */
		tprof_stat.ts_dropbuf_sample += buf->b_used;
		tprof_stat.ts_dropbuf++;
	}
	mutex_exit(&tprof_lock);
	if (buf)
		tprof_buf_free(buf);

	if (!shouldstop)
		callout_schedule(&c->c_callout, hz / 8);
}
254
255static void
256tprof_kick(void *vp)
257{
258	struct cpu_info * const ci = vp;
259	tprof_cpu_t * const c = tprof_cpu(ci);
260
261	workqueue_enqueue(tprof_wq, &c->c_work, ci);
262}
263
264static void
265tprof_stop1(void)
266{
267	CPU_INFO_ITERATOR cii;
268	struct cpu_info *ci;
269
270	KASSERT(mutex_owned(&tprof_startstop_lock));
271	KASSERT(tprof_nworker == 0);
272
273	for (CPU_INFO_FOREACH(cii, ci)) {
274		tprof_cpu_t * const c = tprof_cpu(ci);
275		tprof_buf_t *old;
276
277		old = tprof_buf_switch(c, NULL);
278		if (old != NULL)
279			tprof_buf_free(old);
280
281		callout_destroy(&c->c_callout);
282	}
283	workqueue_destroy(tprof_wq);
284}
285
286static void
287tprof_getinfo(struct tprof_info *info)
288{
289	tprof_backend_t *tb;
290
291	KASSERT(mutex_owned(&tprof_startstop_lock));
292
293	memset(info, 0, sizeof(*info));
294	info->ti_version = TPROF_VERSION;
295	if ((tb = tprof_backend) != NULL)
296		info->ti_ident = tb->tb_ops->tbo_ident();
297}
298
299static int
300tprof_getncounters(u_int *ncounters)
301{
302	tprof_backend_t *tb;
303
304	tb = tprof_backend;
305	if (tb == NULL)
306		return ENOENT;
307
308	*ncounters = tb->tb_ops->tbo_ncounters();
309	return 0;
310}
311
312static void
313tprof_start_cpu(void *arg1, void *arg2)
314{
315	tprof_backend_t *tb = arg1;
316	tprof_countermask_t runmask = (uintptr_t)arg2;
317
318	tb->tb_ops->tbo_start(runmask);
319}
320
321static void
322tprof_stop_cpu(void *arg1, void *arg2)
323{
324	tprof_backend_t *tb = arg1;
325	tprof_countermask_t stopmask = (uintptr_t)arg2;
326
327	tb->tb_ops->tbo_stop(stopmask);
328}
329
/*
 * tprof_start: start the counters in runmask that are configured and not
 * already running.
 *
 * When no counter was running before, this also establishes the backend
 * (if it has a tbo_establish hook), creates the workqueue, gives every
 * cpu a fresh buffer and callout, and finally queues one worker per cpu.
 *
 * Returns ENOENT when there is no backend, the error from tbo_establish
 * or workqueue_create() on failure, and 0 otherwise (including when
 * there was nothing to start).  Called with tprof_startstop_lock held.
 */
static int
tprof_start(tprof_countermask_t runmask)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	tprof_backend_t *tb;
	uint64_t xc;
	int error;
	bool firstrun;

	KASSERT(mutex_owned(&tprof_startstop_lock));

	tb = tprof_backend;
	if (tb == NULL) {
		error = ENOENT;
		goto done;
	}

	runmask &= ~tb->tb_softc.sc_ctr_running_mask;
	runmask &= tb->tb_softc.sc_ctr_configured_mask;
	if (runmask == 0) {
		/*
		 * Targets are already running.
		 * Unconfigured counters are ignored.
		 */
		error = 0;
		goto done;
	}

	firstrun = (tb->tb_softc.sc_ctr_running_mask == 0);
	if (firstrun) {
		if (tb->tb_ops->tbo_establish != NULL) {
			error = tb->tb_ops->tbo_establish(&tb->tb_softc);
			if (error != 0)
				goto done;
		}

		tprof_samples_per_buf = TPROF_MAX_SAMPLES_PER_BUF;
		tprof_max_buf = ncpu * 3;
		error = workqueue_create(&tprof_wq, "tprofmv", tprof_worker,
		    NULL, PRI_NONE, IPL_SOFTCLOCK, WQ_MPSAFE | WQ_PERCPU);
		if (error != 0) {
			/* undo tbo_establish before bailing out */
			if (tb->tb_ops->tbo_disestablish != NULL)
				tb->tb_ops->tbo_disestablish(&tb->tb_softc);
			goto done;
		}

		/* give every cpu an empty buffer and a callout */
		for (CPU_INFO_FOREACH(cii, ci)) {
			tprof_cpu_t * const c = tprof_cpu(ci);
			tprof_buf_t *new;
			tprof_buf_t *old;

			new = tprof_buf_alloc();
			old = tprof_buf_switch(c, new);
			if (old != NULL) {
				tprof_buf_free(old);
			}
			callout_init(&c->c_callout, CALLOUT_MPSAFE);
			callout_setfunc(&c->c_callout, tprof_kick, ci);
		}
	}

	/* runmask was already restricted to configured counters above */
	runmask &= tb->tb_softc.sc_ctr_configured_mask;
	xc = xc_broadcast(0, tprof_start_cpu, tb, (void *)(uintptr_t)runmask);
	xc_wait(xc);
	mutex_enter(&tprof_lock);
	tb->tb_softc.sc_ctr_running_mask |= runmask;
	mutex_exit(&tprof_lock);

	if (firstrun) {
		/* account for each worker before it is queued */
		for (CPU_INFO_FOREACH(cii, ci)) {
			tprof_cpu_t * const c = tprof_cpu(ci);

			mutex_enter(&tprof_lock);
			tprof_nworker++;
			mutex_exit(&tprof_lock);
			workqueue_enqueue(tprof_wq, &c->c_work, ci);
		}
	}
	error = 0;

done:
	return error;
}
414
415static void
416tprof_stop(tprof_countermask_t stopmask)
417{
418	tprof_backend_t *tb;
419	uint64_t xc;
420
421	tb = tprof_backend;
422	if (tb == NULL)
423		return;
424
425	KASSERT(mutex_owned(&tprof_startstop_lock));
426	stopmask &= tb->tb_softc.sc_ctr_running_mask;
427	if (stopmask == 0) {
428		/* Targets are not running */
429		goto done;
430	}
431
432	xc = xc_broadcast(0, tprof_stop_cpu, tb, (void *)(uintptr_t)stopmask);
433	xc_wait(xc);
434	mutex_enter(&tprof_lock);
435	tb->tb_softc.sc_ctr_running_mask &= ~stopmask;
436	mutex_exit(&tprof_lock);
437
438	/* All counters have stopped? */
439	if (tb->tb_softc.sc_ctr_running_mask == 0) {
440		mutex_enter(&tprof_lock);
441		cv_broadcast(&tprof_reader_cv);
442		while (tprof_nworker > 0)
443			cv_wait(&tprof_cv, &tprof_lock);
444
445		mutex_exit(&tprof_lock);
446
447		tprof_stop1();
448		if (tb->tb_ops->tbo_disestablish != NULL)
449			tb->tb_ops->tbo_disestablish(&tb->tb_softc);
450	}
451done:
452	;
453}
454
455static void
456tprof_init_percpu_counters_offset(void *vp, void *vp2, struct cpu_info *ci)
457{
458	uint64_t *counters_offset = vp;
459	u_int counter = (uintptr_t)vp2;
460
461	tprof_backend_t *tb = tprof_backend;
462	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
463	counters_offset[counter] = param->p_value;
464}
465
466static void
467tprof_configure_event_cpu(void *arg1, void *arg2)
468{
469	tprof_backend_t *tb = arg1;
470	u_int counter = (uintptr_t)arg2;
471	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
472
473	tb->tb_ops->tbo_configure_event(counter, param);
474}
475
476static int
477tprof_configure_event(const tprof_param_t *param)
478{
479	tprof_backend_t *tb;
480	tprof_backend_softc_t *sc;
481	tprof_param_t *sc_param;
482	uint64_t xc;
483	int c, error;
484
485	if ((param->p_flags & (TPROF_PARAM_USER | TPROF_PARAM_KERN)) == 0) {
486		error = EINVAL;
487		goto done;
488	}
489
490	tb = tprof_backend;
491	if (tb == NULL) {
492		error = ENOENT;
493		goto done;
494	}
495	sc = &tb->tb_softc;
496
497	c = param->p_counter;
498	if (c >= tb->tb_softc.sc_ncounters) {
499		error = EINVAL;
500		goto done;
501	}
502
503	if (tb->tb_ops->tbo_valid_event != NULL) {
504		error = tb->tb_ops->tbo_valid_event(param->p_counter, param);
505		if (error != 0)
506			goto done;
507	}
508
509	/* if already running, stop the counter */
510	if (ISSET(c, tb->tb_softc.sc_ctr_running_mask))
511		tprof_stop(__BIT(c));
512
513	sc->sc_count[c].ctr_bitwidth =
514	    tb->tb_ops->tbo_counter_bitwidth(param->p_counter);
515
516	sc_param = &sc->sc_count[c].ctr_param;
517	memcpy(sc_param, param, sizeof(*sc_param)); /* save copy of param */
518
519	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
520		uint64_t freq, inum, dnum;
521
522		freq = tb->tb_ops->tbo_counter_estimate_freq(c);
523		sc->sc_count[c].ctr_counter_val = freq / TPROF_HZ;
524		if (sc->sc_count[c].ctr_counter_val == 0) {
525			printf("%s: counter#%d frequency (%"PRIu64") is"
526			    " very low relative to TPROF_HZ (%u)\n", __func__,
527			    c, freq, TPROF_HZ);
528			sc->sc_count[c].ctr_counter_val =
529			    4000000000ULL / TPROF_HZ;
530		}
531
532		switch (param->p_flags & TPROF_PARAM_VALUE2_MASK) {
533		case TPROF_PARAM_VALUE2_SCALE:
534			if (sc_param->p_value2 == 0)
535				break;
536			/*
537			 * p_value2 is 64-bit fixed-point
538			 * upper 32 bits are the integer part
539			 * lower 32 bits are the decimal part
540			 */
541			inum = sc_param->p_value2 >> 32;
542			dnum = sc_param->p_value2 & __BITS(31, 0);
543			sc->sc_count[c].ctr_counter_val =
544			    sc->sc_count[c].ctr_counter_val * inum +
545			    (sc->sc_count[c].ctr_counter_val * dnum >> 32);
546			if (sc->sc_count[c].ctr_counter_val == 0)
547				sc->sc_count[c].ctr_counter_val = 1;
548			break;
549		case TPROF_PARAM_VALUE2_TRIGGERCOUNT:
550			if (sc_param->p_value2 == 0)
551				sc_param->p_value2 = 1;
552			if (sc_param->p_value2 >
553			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0)) {
554				sc_param->p_value2 =
555				    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
556			}
557			sc->sc_count[c].ctr_counter_val = sc_param->p_value2;
558			break;
559		default:
560			break;
561		}
562		sc->sc_count[c].ctr_counter_reset_val =
563		    -sc->sc_count[c].ctr_counter_val;
564		sc->sc_count[c].ctr_counter_reset_val &=
565		    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
566	} else {
567		sc->sc_count[c].ctr_counter_val = 0;
568		sc->sc_count[c].ctr_counter_reset_val = 0;
569	}
570
571	/* At this point, p_value is used as an initial value */
572	percpu_foreach(tb->tb_softc.sc_ctr_offset_percpu,
573	    tprof_init_percpu_counters_offset, (void *)(uintptr_t)c);
574	/* On the backend side, p_value is used as the reset value */
575	sc_param->p_value = tb->tb_softc.sc_count[c].ctr_counter_reset_val;
576
577	xc = xc_broadcast(0, tprof_configure_event_cpu,
578	    tb, (void *)(uintptr_t)c);
579	xc_wait(xc);
580
581	mutex_enter(&tprof_lock);
582	/* update counters bitmasks */
583	SET(tb->tb_softc.sc_ctr_configured_mask, __BIT(c));
584	CLR(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
585	CLR(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
586	/* profiled counter requires overflow handling */
587	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
588		SET(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
589		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
590	}
591	/* counters with less than 64bits also require overflow handling */
592	if (sc->sc_count[c].ctr_bitwidth != 64)
593		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
594	mutex_exit(&tprof_lock);
595
596	error = 0;
597
598 done:
599	return error;
600}
601
602static void
603tprof_getcounts_cpu(void *arg1, void *arg2)
604{
605	tprof_backend_t *tb = arg1;
606	tprof_backend_softc_t *sc = &tb->tb_softc;
607	uint64_t *counters = arg2;
608	uint64_t *counters_offset;
609	unsigned int c;
610
611	tprof_countermask_t configmask = sc->sc_ctr_configured_mask;
612	counters_offset = percpu_getref(sc->sc_ctr_offset_percpu);
613	for (c = 0; c < sc->sc_ncounters; c++) {
614		if (ISSET(configmask, __BIT(c))) {
615			uint64_t ctr = tb->tb_ops->tbo_counter_read(c);
616			counters[c] = counters_offset[c] +
617			    ((ctr - sc->sc_count[c].ctr_counter_reset_val) &
618			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0));
619		} else
620			counters[c] = 0;
621	}
622	percpu_putref(sc->sc_ctr_offset_percpu);
623}
624
625static int
626tprof_getcounts(tprof_counts_t *counts)
627{
628	struct cpu_info *ci;
629	tprof_backend_t *tb;
630	uint64_t xc;
631
632	tb = tprof_backend;
633	if (tb == NULL)
634		return ENOENT;
635
636	if (counts->c_cpu >= ncpu)
637		return ESRCH;
638	ci = cpu_lookup(counts->c_cpu);
639	if (ci == NULL)
640		return ESRCH;
641
642	xc = xc_unicast(0, tprof_getcounts_cpu, tb, counts->c_count, ci);
643	xc_wait(xc);
644
645	counts->c_ncounters = tb->tb_softc.sc_ncounters;
646	counts->c_runningmask = tb->tb_softc.sc_ctr_running_mask;
647	return 0;
648}
649
650/*
651 * tprof_clear: drain unread samples.
652 */
653
654static void
655tprof_clear(void)
656{
657	tprof_buf_t *buf;
658
659	mutex_enter(&tprof_reader_lock);
660	mutex_enter(&tprof_lock);
661	while ((buf = STAILQ_FIRST(&tprof_list)) != NULL) {
662		if (buf != NULL) {
663			STAILQ_REMOVE_HEAD(&tprof_list, b_list);
664			KASSERT(tprof_nbuf_on_list > 0);
665			tprof_nbuf_on_list--;
666			mutex_exit(&tprof_lock);
667			tprof_buf_free(buf);
668			mutex_enter(&tprof_lock);
669		}
670	}
671	KASSERT(tprof_nbuf_on_list == 0);
672	mutex_exit(&tprof_lock);
673	tprof_reader_offset = 0;
674	mutex_exit(&tprof_reader_lock);
675
676	memset(&tprof_stat, 0, sizeof(tprof_stat));
677}
678
679static tprof_backend_t *
680tprof_backend_lookup(const char *name)
681{
682	tprof_backend_t *tb;
683
684	KASSERT(mutex_owned(&tprof_startstop_lock));
685
686	LIST_FOREACH(tb, &tprof_backends, tb_list) {
687		if (!strcmp(tb->tb_name, name)) {
688			return tb;
689		}
690	}
691	return NULL;
692}
693
694/* -------------------- backend interfaces */
695
/*
 * tprof_sample: record a sample on the per-cpu buffer.
 *
 * The sample carries the current pid, lwpid and cpu id, the pc from
 * *tfi, and flags built from tfi_inkernel and tfi_counter.  When the
 * buffer is full the sample is dropped and b_overflow is bumped instead.
 * The first argument is not used.
 *
 * be careful; can be called in NMI context.
 * we are bluntly assuming the following are safe.
 *	curcpu()
 *	curlwp->l_lid
 *	curlwp->l_proc->p_pid
 */

void
tprof_sample(void *unused, const tprof_frame_info_t *tfi)
{
	/* tprof_cpu_direct(): no kpreempt_disable/enable pair in this path */
	tprof_cpu_t * const c = tprof_cpu_direct(curcpu());
	/* NOTE(review): c_buf is dereferenced without a NULL check */
	tprof_buf_t * const buf = c->c_buf;
	tprof_sample_t *sp;
	const uintptr_t pc = tfi->tfi_pc;
	const lwp_t * const l = curlwp;
	u_int idx;

	idx = buf->b_used;
	if (__predict_false(idx >= buf->b_size)) {
		buf->b_overflow++;
		return;
	}
	sp = &buf->b_data[idx];
	sp->s_pid = l->l_proc->p_pid;
	sp->s_lwpid = l->l_lid;
	sp->s_cpuid = c->c_cpuid;
	sp->s_flags = ((tfi->tfi_inkernel) ? TPROF_SAMPLE_INKERNEL : 0) |
	    __SHIFTIN(tfi->tfi_counter, TPROF_SAMPLE_COUNTER_MASK);
	sp->s_pc = pc;
	/* b_used is advanced only after the entry has been filled in */
	buf->b_used = idx + 1;
}
730
731/*
732 * tprof_backend_register:
733 */
734
735int
736tprof_backend_register(const char *name, const tprof_backend_ops_t *ops,
737    int vers)
738{
739	tprof_backend_t *tb;
740
741	if (vers != TPROF_BACKEND_VERSION)
742		return EINVAL;
743
744	mutex_enter(&tprof_startstop_lock);
745	tb = tprof_backend_lookup(name);
746	if (tb != NULL) {
747		mutex_exit(&tprof_startstop_lock);
748		return EEXIST;
749	}
750#if 1 /* XXX for now */
751	if (!LIST_EMPTY(&tprof_backends)) {
752		mutex_exit(&tprof_startstop_lock);
753		return ENOTSUP;
754	}
755#endif
756	tb = kmem_zalloc(sizeof(*tb), KM_SLEEP);
757	tb->tb_name = name;
758	tb->tb_ops = ops;
759	LIST_INSERT_HEAD(&tprof_backends, tb, tb_list);
760#if 1 /* XXX for now */
761	if (tprof_backend == NULL) {
762		tprof_backend = tb;
763	}
764#endif
765	mutex_exit(&tprof_startstop_lock);
766
767	/* Init backend softc */
768	tb->tb_softc.sc_ncounters = tb->tb_ops->tbo_ncounters();
769	tb->tb_softc.sc_ctr_offset_percpu_size =
770	    sizeof(uint64_t) * tb->tb_softc.sc_ncounters;
771	tb->tb_softc.sc_ctr_offset_percpu =
772	    percpu_alloc(tb->tb_softc.sc_ctr_offset_percpu_size);
773
774	return 0;
775}
776
777/*
778 * tprof_backend_unregister:
779 */
780
781int
782tprof_backend_unregister(const char *name)
783{
784	tprof_backend_t *tb;
785
786	mutex_enter(&tprof_startstop_lock);
787	tb = tprof_backend_lookup(name);
788#if defined(DIAGNOSTIC)
789	if (tb == NULL) {
790		mutex_exit(&tprof_startstop_lock);
791		panic("%s: not found '%s'", __func__, name);
792	}
793#endif /* defined(DIAGNOSTIC) */
794	if (tb->tb_softc.sc_ctr_running_mask != 0) {
795		mutex_exit(&tprof_startstop_lock);
796		return EBUSY;
797	}
798#if 1 /* XXX for now */
799	if (tprof_backend == tb)
800		tprof_backend = NULL;
801#endif
802	LIST_REMOVE(tb, tb_list);
803	mutex_exit(&tprof_startstop_lock);
804
805	/* fini backend softc */
806	percpu_free(tb->tb_softc.sc_ctr_offset_percpu,
807	    tb->tb_softc.sc_ctr_offset_percpu_size);
808
809	/* Free backend */
810	kmem_free(tb, sizeof(*tb));
811
812	return 0;
813}
814
815/* -------------------- cdevsw interfaces */
816
817static int
818tprof_open(dev_t dev, int flags, int type, struct lwp *l)
819{
820
821	if (minor(dev) != 0)
822		return EXDEV;
823
824	mutex_enter(&tprof_lock);
825	if (tprof_owner != NULL) {
826		mutex_exit(&tprof_lock);
827		return  EBUSY;
828	}
829	tprof_owner = curlwp;
830	mutex_exit(&tprof_lock);
831
832	return 0;
833}
834
835static int
836tprof_close(dev_t dev, int flags, int type, struct lwp *l)
837{
838
839	KASSERT(minor(dev) == 0);
840
841	mutex_enter(&tprof_startstop_lock);
842	mutex_enter(&tprof_lock);
843	tprof_owner = NULL;
844	mutex_exit(&tprof_lock);
845	tprof_stop(TPROF_COUNTERMASK_ALL);
846	tprof_clear();
847
848	tprof_backend_t *tb = tprof_backend;
849	if (tb != NULL) {
850		KASSERT(tb->tb_softc.sc_ctr_running_mask == 0);
851		tb->tb_softc.sc_ctr_configured_mask = 0;
852		tb->tb_softc.sc_ctr_prof_mask = 0;
853		tb->tb_softc.sc_ctr_ovf_mask = 0;
854	}
855
856	mutex_exit(&tprof_startstop_lock);
857
858	return 0;
859}
860
861static int
862tprof_poll(dev_t dev, int events, struct lwp *l)
863{
864	int revents;
865
866	revents = events & (POLLIN | POLLRDNORM);
867	if (revents == 0)
868		return 0;
869
870	mutex_enter(&tprof_lock);
871	if (STAILQ_EMPTY(&tprof_list)) {
872		revents = 0;
873		selrecord(l, &tprof_selp);
874	}
875	mutex_exit(&tprof_lock);
876
877	return revents;
878}
879
880static void
881filt_tprof_read_detach(struct knote *kn)
882{
883	mutex_enter(&tprof_lock);
884	selremove_knote(&tprof_selp, kn);
885	mutex_exit(&tprof_lock);
886}
887
888static int
889filt_tprof_read_event(struct knote *kn, long hint)
890{
891	int rv = 0;
892
893	if ((hint & NOTE_SUBMIT) == 0)
894		mutex_enter(&tprof_lock);
895
896	if (!STAILQ_EMPTY(&tprof_list)) {
897		tprof_buf_t *buf;
898		int64_t n = 0;
899
900		STAILQ_FOREACH(buf, &tprof_list, b_list) {
901			n += buf->b_used;
902		}
903		kn->kn_data = n * sizeof(tprof_sample_t);
904
905		rv = 1;
906	}
907
908	if ((hint & NOTE_SUBMIT) == 0)
909		mutex_exit(&tprof_lock);
910
911	return rv;
912}
913
/* filter operations installed by tprof_kqfilter() for EVFILT_READ */
static const struct filterops tprof_read_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_tprof_read_detach,
	.f_event = filt_tprof_read_event,
};
920
921static int
922tprof_kqfilter(dev_t dev, struct knote *kn)
923{
924	switch (kn->kn_filter) {
925	case EVFILT_READ:
926		kn->kn_fop = &tprof_read_filtops;
927		mutex_enter(&tprof_lock);
928		selrecord_knote(&tprof_selp, kn);
929		mutex_exit(&tprof_lock);
930		break;
931	default:
932		return EINVAL;
933	}
934
935	return 0;
936}
937
/*
 * tprof_read: copy queued samples out to the caller.
 *
 * Buffers are taken from the head of tprof_list one at a time.  A buffer
 * that does not fit entirely is put back at the head, with
 * tprof_reader_offset remembering how far it has been consumed.  When
 * the list is empty the call returns if no worker is running or the
 * previous copy moved some data, and otherwise sleeps (interruptibly)
 * until a worker signals tprof_reader_cv.
 */
static int
tprof_read(dev_t dev, struct uio *uio, int flags)
{
	tprof_buf_t *buf;
	size_t bytes;
	size_t resid;
	size_t done = 0;	/* bytes moved by the most recent uiomove() */
	int error = 0;

	KASSERT(minor(dev) == 0);
	mutex_enter(&tprof_reader_lock);
	while (uio->uio_resid > 0 && error == 0) {
		/*
		 * Take the first buffer from the list.
		 */
		mutex_enter(&tprof_lock);
		buf = STAILQ_FIRST(&tprof_list);
		if (buf == NULL) {
			if (tprof_nworker == 0 || done != 0) {
				mutex_exit(&tprof_lock);
				error = 0;
				break;
			}
			/*
			 * Drop the reader lock while sleeping, then retake
			 * both locks in the documented order
			 * (tprof_reader_lock -> tprof_lock).
			 */
			mutex_exit(&tprof_reader_lock);
			error = cv_wait_sig(&tprof_reader_cv, &tprof_lock);
			mutex_exit(&tprof_lock);
			mutex_enter(&tprof_reader_lock);
			continue;
		}
		STAILQ_REMOVE_HEAD(&tprof_list, b_list);
		KASSERT(tprof_nbuf_on_list > 0);
		tprof_nbuf_on_list--;
		mutex_exit(&tprof_lock);

		/*
		 * Copy it out.
		 */
		bytes = MIN(buf->b_used * sizeof(tprof_sample_t) -
		    tprof_reader_offset, uio->uio_resid);
		resid = uio->uio_resid;
		error = uiomove((char *)buf->b_data + tprof_reader_offset,
		    bytes, uio);
		/* account for what was actually moved, even on error */
		done = resid - uio->uio_resid;
		tprof_reader_offset += done;

		/*
		 * If we didn't consume the whole buffer,
		 * put it back to the list.
		 */
		if (tprof_reader_offset <
		    buf->b_used * sizeof(tprof_sample_t)) {
			mutex_enter(&tprof_lock);
			STAILQ_INSERT_HEAD(&tprof_list, buf, b_list);
			tprof_nbuf_on_list++;
			cv_broadcast(&tprof_reader_cv);
			mutex_exit(&tprof_lock);
		} else {
			tprof_buf_free(buf);
			tprof_reader_offset = 0;
		}
	}
	mutex_exit(&tprof_reader_lock);

	return error;
}
1003
1004static int
1005tprof_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
1006{
1007	const tprof_param_t *param;
1008	tprof_counts_t *counts;
1009	int error = 0;
1010
1011	KASSERT(minor(dev) == 0);
1012
1013	switch (cmd) {
1014	case TPROF_IOC_GETINFO:
1015		mutex_enter(&tprof_startstop_lock);
1016		tprof_getinfo(data);
1017		mutex_exit(&tprof_startstop_lock);
1018		break;
1019	case TPROF_IOC_GETNCOUNTERS:
1020		mutex_enter(&tprof_lock);
1021		error = tprof_getncounters((u_int *)data);
1022		mutex_exit(&tprof_lock);
1023		break;
1024	case TPROF_IOC_START:
1025		mutex_enter(&tprof_startstop_lock);
1026		error = tprof_start(*(tprof_countermask_t *)data);
1027		mutex_exit(&tprof_startstop_lock);
1028		break;
1029	case TPROF_IOC_STOP:
1030		mutex_enter(&tprof_startstop_lock);
1031		tprof_stop(*(tprof_countermask_t *)data);
1032		mutex_exit(&tprof_startstop_lock);
1033		break;
1034	case TPROF_IOC_GETSTAT:
1035		mutex_enter(&tprof_lock);
1036		memcpy(data, &tprof_stat, sizeof(tprof_stat));
1037		mutex_exit(&tprof_lock);
1038		break;
1039	case TPROF_IOC_CONFIGURE_EVENT:
1040		param = data;
1041		mutex_enter(&tprof_startstop_lock);
1042		error = tprof_configure_event(param);
1043		mutex_exit(&tprof_startstop_lock);
1044		break;
1045	case TPROF_IOC_GETCOUNTS:
1046		counts = data;
1047		mutex_enter(&tprof_startstop_lock);
1048		error = tprof_getcounts(counts);
1049		mutex_exit(&tprof_startstop_lock);
1050		break;
1051	default:
1052		error = EINVAL;
1053		break;
1054	}
1055
1056	return error;
1057}
1058
/*
 * Character device switch for the tprof device: open/close/read/ioctl/
 * poll/kqfilter are implemented above, the remaining entries use the
 * stock no* stubs.
 */
const struct cdevsw tprof_cdevsw = {
	.d_open = tprof_open,
	.d_close = tprof_close,
	.d_read = tprof_read,
	.d_write = nowrite,
	.d_ioctl = tprof_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = tprof_poll,
	.d_mmap = nommap,
	.d_kqfilter = tprof_kqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};
1073
/*
 * tprofattach: intentionally empty; nunits is ignored.
 */
void
tprofattach(int nunits)
{
	/* Nothing to do. */
}
1080
/* module declaration for "tprof" (class: driver); see module(9) */
MODULE(MODULE_CLASS_DRIVER, tprof, NULL);
1082
1083static void
1084tprof_cpu_init(void *vcp, void *vcookie, struct cpu_info *ci)
1085{
1086	tprof_cpu_t **cp = vcp, *c;
1087
1088	c = kmem_zalloc(sizeof(*c), KM_SLEEP);
1089	c->c_buf = NULL;
1090	c->c_cpuid = cpu_index(ci);
1091	*cp = c;
1092}
1093
1094static void
1095tprof_cpu_fini(void *vcp, void *vcookie, struct cpu_info *ci)
1096{
1097	tprof_cpu_t **cp = vcp, *c;
1098
1099	c = *cp;
1100	KASSERT(c->c_cpuid == cpu_index(ci));
1101	KASSERT(c->c_buf == NULL);
1102	kmem_free(c, sizeof(*c));
1103	*cp = NULL;
1104}
1105
1106static void
1107tprof_driver_init(void)
1108{
1109
1110	mutex_init(&tprof_lock, MUTEX_DEFAULT, IPL_NONE);
1111	mutex_init(&tprof_reader_lock, MUTEX_DEFAULT, IPL_NONE);
1112	mutex_init(&tprof_startstop_lock, MUTEX_DEFAULT, IPL_NONE);
1113	selinit(&tprof_selp);
1114	cv_init(&tprof_cv, "tprof");
1115	cv_init(&tprof_reader_cv, "tprof_rd");
1116	STAILQ_INIT(&tprof_list);
1117	tprof_cpus = percpu_create(sizeof(tprof_cpu_t *),
1118	    tprof_cpu_init, tprof_cpu_fini, NULL);
1119}
1120
1121static void
1122tprof_driver_fini(void)
1123{
1124
1125	percpu_free(tprof_cpus, sizeof(tprof_cpu_t *));
1126	mutex_destroy(&tprof_lock);
1127	mutex_destroy(&tprof_reader_lock);
1128	mutex_destroy(&tprof_startstop_lock);
1129	seldestroy(&tprof_selp);
1130	cv_destroy(&tprof_cv);
1131	cv_destroy(&tprof_reader_cv);
1132}
1133
1134static int
1135tprof_modcmd(modcmd_t cmd, void *arg)
1136{
1137
1138	switch (cmd) {
1139	case MODULE_CMD_INIT:
1140		tprof_driver_init();
1141#if defined(_MODULE)
1142		{
1143			devmajor_t bmajor = NODEVMAJOR;
1144			devmajor_t cmajor = NODEVMAJOR;
1145			int error;
1146
1147			error = devsw_attach("tprof", NULL, &bmajor,
1148			    &tprof_cdevsw, &cmajor);
1149			if (error) {
1150				tprof_driver_fini();
1151				return error;
1152			}
1153		}
1154#endif /* defined(_MODULE) */
1155		return 0;
1156
1157	case MODULE_CMD_FINI:
1158#if defined(_MODULE)
1159		devsw_detach(NULL, &tprof_cdevsw);
1160#endif /* defined(_MODULE) */
1161		tprof_driver_fini();
1162		return 0;
1163
1164	default:
1165		return ENOTTY;
1166	}
1167}
1168