/*	$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)subr_prof.c	8.4 (Berkeley) 2/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $");

#ifdef _KERNEL_OPT
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

#include <sys/cpu.h>

#ifdef GPROF
#include <sys/malloc.h>
#include <sys/gmon.h>
#include <sys/xcall.h>

MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");

static int sysctl_kern_profiling(SYSCTLFN_ARGS);
#ifdef MULTIPROCESSOR
void _gmonparam_merge(struct gmonparam *, struct gmonparam *);
#endif

/*
 * froms[] is actually an array of unsigned shorts indexing into tos[].
 */
struct gmonparam _gmonparam = { .state = GMON_PROF_OFF };

/* Actual start of the kernel text segment. */
extern char kernel_text[];

extern char etext[];

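/*
 * Set up kernel profiling: compute the PC-histogram parameters from the
 * kernel text bounds and allocate the profiling buffers (one per CPU
 * when MULTIPROCESSOR).
 */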
void
kmstartup(void)
{
	char *cp;
	struct gmonparam *p = &_gmonparam;
	unsigned long size;

	/*
	 * Round lowpc and highpc to multiples of the density we're using
	 * so the rest of the scaling (here and in gprof) stays in ints.
	 */
	p->lowpc = rounddown(((u_long)kernel_text),
		HISTFRACTION * sizeof(HISTCOUNTER));
	p->highpc = roundup((u_long)etext,
		HISTFRACTION * sizeof(HISTCOUNTER));
	p->textsize = p->highpc - p->lowpc;
	printf("Profiling kernel, textsize=%ld [%lx..%lx]\n",
	       p->textsize, p->lowpc, p->highpc);
	p->kcountsize = p->textsize / HISTFRACTION;
	p->hashfraction = HASHFRACTION;
	p->fromssize = p->textsize / HASHFRACTION;
	p->tolimit = p->textsize * ARCDENSITY / 100;
	if (p->tolimit < MINARCS)
		p->tolimit = MINARCS;
	else if (p->tolimit > MAXARCS)
		p->tolimit = MAXARCS;
	p->tossize = p->tolimit * sizeof(struct tostruct);

	size = p->kcountsize + p->fromssize + p->tossize;
#ifdef MULTIPROCESSOR
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	for (CPU_INFO_FOREACH(cii, ci)) {
		p = malloc(sizeof(struct gmonparam) + size, M_GPROF,
		    M_NOWAIT | M_ZERO);
		if (p == NULL) {
			printf("No memory for profiling on %s\n",
			    cpu_name(ci));
			/* cannot profile on this cpu */
			continue;
		}
		memcpy(p, &_gmonparam, sizeof(_gmonparam));
		ci->ci_gmon = p;

		/*
		 * To allow profiling to be controlled solely by the global
		 * _gmonparam.state, set the default state for each CPU to
		 * GMON_PROF_ON.  If _gmonparam.state is not ON, mcount will
		 * not run.  This is for compatibility with the kgmon(8)
		 * kmem interface.
		 */
		p->state = GMON_PROF_ON;

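		/*
		 * Carve the single allocation into the three arrays;
		 * each per-CPU buffer is laid out as
		 *
		 *	[struct gmonparam][tos][kcount][froms]
		 */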
		cp = (char *)(p + 1);
		p->tos = (struct tostruct *)cp;
		p->kcount = (u_short *)(cp + p->tossize);
		p->froms = (u_short *)(cp + p->tossize + p->kcountsize);
	}

	sysctl_createv(NULL, 0, NULL, NULL,
	    0, CTLTYPE_NODE, "percpu",
	    SYSCTL_DESCR("per cpu profiling information"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, KERN_PROF, GPROF_PERCPU, CTL_EOL);

	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_gmon == NULL)
			continue;

		sysctl_createv(NULL, 0, NULL, NULL,
		    0, CTLTYPE_NODE, cpu_name(ci),
		    NULL,
		    NULL, 0, NULL, 0,
		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), CTL_EOL);

		sysctl_createv(NULL, 0, NULL, NULL,
		    CTLFLAG_READWRITE, CTLTYPE_INT, "state",
		    SYSCTL_DESCR("Profiling state"),
		    sysctl_kern_profiling, 0, (void *)ci, 0,
		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
		    GPROF_STATE, CTL_EOL);
		sysctl_createv(NULL, 0, NULL, NULL,
		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "count",
		    SYSCTL_DESCR("Array of statistical program counters"),
		    sysctl_kern_profiling, 0, (void *)ci, 0,
		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
		    GPROF_COUNT, CTL_EOL);
		sysctl_createv(NULL, 0, NULL, NULL,
		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "froms",
		    SYSCTL_DESCR("Array indexed by program counter of "
		    "call-from points"),
		    sysctl_kern_profiling, 0, (void *)ci, 0,
		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
		    GPROF_FROMS, CTL_EOL);
		sysctl_createv(NULL, 0, NULL, NULL,
		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "tos",
		    SYSCTL_DESCR("Array of structures describing "
		    "destination of calls and their counts"),
		    sysctl_kern_profiling, 0, (void *)ci, 0,
		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
		    GPROF_TOS, CTL_EOL);
		sysctl_createv(NULL, 0, NULL, NULL,
		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "gmonparam",
		    SYSCTL_DESCR("Structure giving the sizes of the above "
		    "arrays"),
		    sysctl_kern_profiling, 0, (void *)ci, 0,
		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
		    GPROF_GMONPARAM, CTL_EOL);
	}

	/*
	 * For minimal compatibility with the kgmon(8) kmem interface,
	 * _gmonparam and the boot CPU's ci_gmon share buffers.
	 */
	p = curcpu()->ci_gmon;
	if (p != NULL) {
		_gmonparam.tos = p->tos;
		_gmonparam.kcount = p->kcount;
		_gmonparam.froms = p->froms;
	}
#else /* MULTIPROCESSOR */
	cp = malloc(size, M_GPROF, M_NOWAIT | M_ZERO);
	if (cp == NULL) {
		printf("No memory for profiling.\n");
		return;
	}
	p->tos = (struct tostruct *)cp;
	cp += p->tossize;
	p->kcount = (u_short *)cp;
	cp += p->kcountsize;
	p->froms = (u_short *)cp;
#endif /* MULTIPROCESSOR */
}

#ifdef MULTIPROCESSOR
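/*
 * Cross-call handler: set the gmon state of the CPU it runs on.
 * Dispatched via xc_unicast()/xc_broadcast() from the sysctl helper below.
 */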
static void
prof_set_state_xc(void *arg1, void *arg2 __unused)
{
	int state = PTRTOUINT64(arg1);
	struct gmonparam *gp = curcpu()->ci_gmon;

	if (gp != NULL)
		gp->state = state;
}
#endif /* MULTIPROCESSOR */

/*
 * sysctl helper routine for the kern.profiling subtree.  Enables/disables
 * kernel profiling and gives out copies of the profiling data.
 */
static int
sysctl_kern_profiling(SYSCTLFN_ARGS)
{
	struct sysctlnode node = *rnode;
	struct gmonparam *gp;
	int error;
#ifdef MULTIPROCESSOR
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci, *target_ci;
	uint64_t where;
	int state;
	bool prof_on, do_merge;

	target_ci = (struct cpu_info *)rnode->sysctl_data;
	do_merge = (oldp != NULL) && (target_ci == NULL) &&
	    ((node.sysctl_num == GPROF_COUNT) ||
	    (node.sysctl_num == GPROF_FROMS) ||
	    (node.sysctl_num == GPROF_TOS));

	if (do_merge) {
		/* kern.profiling.{count,froms,tos} */
		unsigned long size;
		char *cp;

		/*
		 * Allocate a temporary gmonparam and merge the results
		 * of all CPUs into it.
		 */
		size = _gmonparam.kcountsize + _gmonparam.fromssize +
		    _gmonparam.tossize;
		gp = malloc(sizeof(struct gmonparam) + size, M_GPROF,
		    M_NOWAIT | M_ZERO);
		if (gp == NULL)
			return ENOMEM;
		memcpy(gp, &_gmonparam, sizeof(_gmonparam));
		cp = (char *)(gp + 1);
		gp->tos = (struct tostruct *)cp;
		gp->kcount = (u_short *)(cp + gp->tossize);
		gp->froms = (u_short *)(cp + gp->tossize + gp->kcountsize);

		for (CPU_INFO_FOREACH(cii, ci)) {
			if (ci->ci_gmon == NULL)
				continue;
			_gmonparam_merge(gp, ci->ci_gmon);
		}
	} else if (target_ci != NULL) {
		/* kern.profiling.percpu.* */
		gp = target_ci->ci_gmon;
	} else {
		/* kern.profiling.{state,gmonparam} */
		gp = &_gmonparam;
	}
#else /* MULTIPROCESSOR */
	gp = &_gmonparam;
#endif

	switch (node.sysctl_num) {
	case GPROF_STATE:
#ifdef MULTIPROCESSOR
		/*
		 * If _gmonparam.state is OFF, the state of each CPU is
		 * considered to be OFF, even if it is actually ON.
		 */
		if (_gmonparam.state == GMON_PROF_OFF ||
		    gp->state == GMON_PROF_OFF)
			state = GMON_PROF_OFF;
		else
			state = GMON_PROF_ON;
		node.sysctl_data = &state;
#else
		node.sysctl_data = &gp->state;
#endif
		break;
	case GPROF_COUNT:
		node.sysctl_data = gp->kcount;
		node.sysctl_size = gp->kcountsize;
		break;
	case GPROF_FROMS:
		node.sysctl_data = gp->froms;
		node.sysctl_size = gp->fromssize;
		break;
	case GPROF_TOS:
		node.sysctl_data = gp->tos;
		node.sysctl_size = gp->tossize;
		break;
	case GPROF_GMONPARAM:
		node.sysctl_data = gp;
		node.sysctl_size = sizeof(*gp);
		break;
	default:
		return (EOPNOTSUPP);
	}

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		goto done;

#ifdef MULTIPROCESSOR
	switch (node.sysctl_num) {
	case GPROF_STATE:
		if (target_ci != NULL) {
			where = xc_unicast(0, prof_set_state_xc,
			    UINT64TOPTR(state), NULL, target_ci);
			xc_wait(where);

			/*
			 * If even one CPU is being profiled, enable the
			 * profiling clock.
			 */
			prof_on = false;
			for (CPU_INFO_FOREACH(cii, ci)) {
				if (ci->ci_gmon == NULL)
					continue;
				if (ci->ci_gmon->state != GMON_PROF_OFF) {
					prof_on = true;
					break;
				}
			}
			mutex_spin_enter(&proc0.p_stmutex);
			if (prof_on)
				startprofclock(&proc0);
			else
				stopprofclock(&proc0);
			mutex_spin_exit(&proc0.p_stmutex);

			if (prof_on) {
				_gmonparam.state = GMON_PROF_ON;
			} else {
				_gmonparam.state = GMON_PROF_OFF;
				/*
				 * When _gmonparam.state and every CPU's gmon
				 * state are OFF, reset all CPU states to ON
				 * so that profiling on all CPUs can again be
				 * controlled by _gmonparam.state alone.
				 */
				for (CPU_INFO_FOREACH(cii, ci)) {
					if (ci->ci_gmon == NULL)
						continue;
					ci->ci_gmon->state = GMON_PROF_ON;
				}
			}
		} else {
			_gmonparam.state = state;
			where = xc_broadcast(0, prof_set_state_xc,
			    UINT64TOPTR(state), NULL);
			xc_wait(where);

			mutex_spin_enter(&proc0.p_stmutex);
			if (state == GMON_PROF_OFF)
				stopprofclock(&proc0);
			else
				startprofclock(&proc0);
			mutex_spin_exit(&proc0.p_stmutex);
		}
		break;
	case GPROF_COUNT:
		/*
		 * If kern.profiling.{count,froms,tos} is written, the same
		 * data is copied to each kern.profiling.percpu.cpuN.* buffer.
		 */
		if (target_ci == NULL) {
			for (CPU_INFO_FOREACH(cii, ci)) {
				if (ci->ci_gmon == NULL)
					continue;
				memmove(ci->ci_gmon->kcount, gp->kcount,
				    newlen);
			}
		}
		break;
	case GPROF_FROMS:
		if (target_ci == NULL) {
			for (CPU_INFO_FOREACH(cii, ci)) {
				if (ci->ci_gmon == NULL)
					continue;
				memmove(ci->ci_gmon->froms, gp->froms, newlen);
			}
		}
		break;
	case GPROF_TOS:
		if (target_ci == NULL) {
			for (CPU_INFO_FOREACH(cii, ci)) {
				if (ci->ci_gmon == NULL)
					continue;
				memmove(ci->ci_gmon->tos, gp->tos, newlen);
			}
		}
		break;
	}
#else
	if (node.sysctl_num == GPROF_STATE) {
		mutex_spin_enter(&proc0.p_stmutex);
		if (gp->state == GMON_PROF_OFF)
			stopprofclock(&proc0);
		else
			startprofclock(&proc0);
		mutex_spin_exit(&proc0.p_stmutex);
	}
#endif

 done:
#ifdef MULTIPROCESSOR
	if (do_merge)
		free(gp, M_GPROF);
#endif
	return error;
}

SYSCTL_SETUP(sysctl_kern_gprof_setup, "sysctl kern.profiling subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "profiling",
		       SYSCTL_DESCR("Profiling information (available)"),
		       NULL, 0, NULL, 0,
		       CTL_KERN, KERN_PROF, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "state",
		       SYSCTL_DESCR("Profiling state"),
		       sysctl_kern_profiling, 0, NULL, 0,
		       CTL_KERN, KERN_PROF, GPROF_STATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_STRUCT, "count",
		       SYSCTL_DESCR("Array of statistical program counters"),
		       sysctl_kern_profiling, 0, NULL, 0,
		       CTL_KERN, KERN_PROF, GPROF_COUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_STRUCT, "froms",
		       SYSCTL_DESCR("Array indexed by program counter of "
				    "call-from points"),
		       sysctl_kern_profiling, 0, NULL, 0,
		       CTL_KERN, KERN_PROF, GPROF_FROMS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_STRUCT, "tos",
		       SYSCTL_DESCR("Array of structures describing "
				    "destination of calls and their counts"),
		       sysctl_kern_profiling, 0, NULL, 0,
		       CTL_KERN, KERN_PROF, GPROF_TOS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "gmonparam",
		       SYSCTL_DESCR("Structure giving the sizes of the above "
				    "arrays"),
		       sysctl_kern_profiling, 0, NULL, 0,
		       CTL_KERN, KERN_PROF, GPROF_GMONPARAM, CTL_EOL);
}
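
/*
 * Illustrative only (see kgmon(8) for the authoritative interface): with
 * the nodes above in place, a typical userland session on a GPROF kernel
 * looks like:
 *
 *	kgmon -b	# begin/resume profile collection
 *	kgmon -h	# halt collection
 *	kgmon -p	# dump the statistics into gmon.out for gprof(1)
 */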
#endif /* GPROF */

/*
 * Profiling system call.
 *
 * The scale factor is a fixed point number with 16 bits of fraction, so that
 * 1.0 is represented as 0x10000.  A scale factor of 0 turns off profiling.
 */
/* ARGSUSED */
int
sys_profil(struct lwp *l, const struct sys_profil_args *uap, register_t *retval)
{
	/* {
		syscallarg(char *) samples;
		syscallarg(size_t) size;
		syscallarg(u_long) offset;
		syscallarg(u_int) scale;
	} */
	struct proc *p = l->l_proc;
	struct uprof *upp;

	if (SCARG(uap, scale) > (1 << 16))
		return (EINVAL);
	if (SCARG(uap, scale) == 0) {
		mutex_spin_enter(&p->p_stmutex);
		stopprofclock(p);
		mutex_spin_exit(&p->p_stmutex);
		return (0);
	}
	upp = &p->p_stats->p_prof;

	/* Block profile interrupts while changing state. */
	mutex_spin_enter(&p->p_stmutex);
	upp->pr_off = SCARG(uap, offset);
	upp->pr_scale = SCARG(uap, scale);
	upp->pr_base = SCARG(uap, samples);
	upp->pr_size = SCARG(uap, size);
	startprofclock(p);
	mutex_spin_exit(&p->p_stmutex);

	return (0);
}
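
/*
 * Illustrative sketch (userland; `lowpc' and TEXTSIZE are hypothetical):
 * profile a text region starting at `lowpc' with one u_short counter per
 * 4 bytes of text, i.e. a scale of 0.5 (0x8000):
 *
 *	u_short counters[TEXTSIZE / 4];
 *
 *	memset(counters, 0, sizeof(counters));
 *	profil((char *)counters, sizeof(counters), lowpc, 0x8000);
 */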

/*
 * Scale is a fixed-point number with the binary point 16 bits
 * into the value, and is <= 1.0.  pc is at most 32 bits, so the
 * intermediate result is at most 48 bits.
 */
#define	PC_TO_INDEX(pc, prof) \
	((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
	    (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
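
/*
 * Example: with pr_off = 0x1000 and pr_scale = 0x8000 (0.5), a tick at
 * pc = 0x1093 yields ((0x93 * 0x8000) >> 16) & ~1 = 0x49 & ~1 = 0x48,
 * i.e. the u_short counter at byte offset 0x48 into pr_base.
 */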

/*
 * Collect user-level profiling statistics; called on a profiling tick
 * when a process is running in user mode.  This routine may be called
 * from an interrupt context.  We schedule an AST that will vector us
 * to trap() with a context in which copyin and copyout will work.
 * Trap will then call addupc_task().
 *
 * XXX We could use ufetch/ustore here if the profile buffers were
 * wired.
 *
 * Note that we may (rarely) not get around to the AST soon enough, and
 * lose profile ticks when the next tick overwrites this one, but in this
 * case the system is overloaded and the profile is probably already
 * inaccurate.
 */
void
addupc_intr(struct lwp *l, u_long pc)
{
	struct uprof *prof;
	struct proc *p;
	u_int i;

	p = l->l_proc;

	KASSERT(mutex_owned(&p->p_stmutex));

	prof = &p->p_stats->p_prof;
	if (pc < prof->pr_off ||
	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
		return;			/* out of range; ignore */

	mutex_spin_exit(&p->p_stmutex);

	/* XXXSMP */
	prof->pr_addr = pc;
	prof->pr_ticks++;
	cpu_need_proftick(l);

	mutex_spin_enter(&p->p_stmutex);
}

/*
 * Much like before, but we can afford to take faults here.  If the
 * update fails, we simply turn off profiling.
 */
void
addupc_task(struct lwp *l, u_long pc, u_int ticks)
{
	struct uprof *prof;
	struct proc *p;
	void *addr;
	int error;
	u_int i;
	u_short v;

	p = l->l_proc;

	if (ticks == 0)
		return;

	mutex_spin_enter(&p->p_stmutex);
	prof = &p->p_stats->p_prof;

	/* Testing PST_PROFIL may be unnecessary, but is certainly safe. */
	if ((p->p_stflag & PST_PROFIL) == 0 || pc < prof->pr_off ||
	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
		mutex_spin_exit(&p->p_stmutex);
		return;
	}

	addr = prof->pr_base + i;
	mutex_spin_exit(&p->p_stmutex);
	if ((error = copyin(addr, (void *)&v, sizeof(v))) == 0) {
		v += ticks;
		error = copyout((void *)&v, addr, sizeof(v));
	}
	if (error != 0) {
		mutex_spin_enter(&p->p_stmutex);
		stopprofclock(p);
		mutex_spin_exit(&p->p_stmutex);
	}
}