kern_ktrace.c revision 219312
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2005 Robert N. M. Watson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	@(#)kern_ktrace.c	8.2 (Berkeley) 9/23/93
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/kern/kern_ktrace.c 219312 2011-03-05 20:54:17Z dchagin $");
36
37#include "opt_ktrace.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/fcntl.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/mutex.h>
46#include <sys/malloc.h>
47#include <sys/mount.h>
48#include <sys/namei.h>
49#include <sys/priv.h>
50#include <sys/proc.h>
51#include <sys/unistd.h>
52#include <sys/vnode.h>
53#include <sys/socket.h>
54#include <sys/stat.h>
55#include <sys/ktrace.h>
56#include <sys/sx.h>
57#include <sys/sysctl.h>
58#include <sys/sysent.h>
59#include <sys/syslog.h>
60#include <sys/sysproto.h>
61
62#include <security/mac/mac_framework.h>
63
64/*
65 * The ktrace facility allows the tracing of certain key events in user space
66 * processes, such as system calls, signal delivery, context switches, and
67 * user-generated events using utrace(2).  It works by streaming event
68 * records and data to a vnode associated with the process using the
69 * ktrace(2) system call.  In general, records can be written directly from
70 * the context that generates the event.  One important exception to this is
71 * during a context switch, where sleeping is not permitted.  To handle this
72 * case, trace events are generated using in-kernel ktr_request records, and
73 * then delivered to disk at a convenient moment -- either immediately, at the
74 * next traceable event, at system call return, or at process exit.
75 *
76 * When dealing with multiple threads or processes writing to the same event
77 * log, ordering guarantees are weak: specifically, if an event has multiple
78 * records (i.e., system call enter and return), they may be interlaced with
79 * records from another event.  Process and thread ID information is provided
80 * in the record, and user applications can de-interlace events if required.
81 */
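
/*
 * Illustrative userspace sketch, assuming the ktrace(2) prototype and the
 * ktr_header field names declared in <sys/ktrace.h>: tracing is enabled with
 * ktrace(2), and the resulting stream is de-interlaced by keying each record
 * on (ktr_pid, ktr_tid) before skipping the ktr_len payload bytes.  Error
 * handling is omitted.
 *
 *	#include <sys/param.h>
 *	#include <sys/time.h>
 *	#include <sys/uio.h>
 *	#include <sys/ktrace.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	void
 *	trace_and_scan(pid_t pid)
 *	{
 *		struct ktr_header kth;
 *		int fd;
 *
 *		// Stream system call enter/return records for pid.
 *		(void)ktrace("ktrace.out", KTROP_SET,
 *		    KTRFAC_SYSCALL | KTRFAC_SYSRET, pid);
 *		fd = open("ktrace.out", O_RDONLY);
 *		while (read(fd, &kth, sizeof(kth)) == (ssize_t)sizeof(kth)) {
 *			// Records from different threads interlace freely;
 *			// group them by kth.ktr_pid and kth.ktr_tid.
 *			lseek(fd, kth.ktr_len, SEEK_CUR);	// skip payload
 *		}
 *		(void)ktrace("ktrace.out", KTROP_CLEAR,
 *		    KTRFAC_SYSCALL | KTRFAC_SYSRET, pid);
 *		close(fd);
 *	}
 */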
82
83static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
84
85#ifdef KTRACE
86
87FEATURE(ktrace, "Kernel support for system-call tracing");
88
89#ifndef KTRACE_REQUEST_POOL
90#define	KTRACE_REQUEST_POOL	100
91#endif
92
93struct ktr_request {
94	struct	ktr_header ktr_header;
95	void	*ktr_buffer;
96	union {
97		struct	ktr_proc_ctor ktr_proc_ctor;
98		struct	ktr_syscall ktr_syscall;
99		struct	ktr_sysret ktr_sysret;
100		struct	ktr_genio ktr_genio;
101		struct	ktr_psig ktr_psig;
102		struct	ktr_csw ktr_csw;
103	} ktr_data;
104	STAILQ_ENTRY(ktr_request) ktr_list;
105};
106
107static int data_lengths[] = {
108	0,					/* none */
109	offsetof(struct ktr_syscall, ktr_args),	/* KTR_SYSCALL */
110	sizeof(struct ktr_sysret),		/* KTR_SYSRET */
111	0,					/* KTR_NAMEI */
112	sizeof(struct ktr_genio),		/* KTR_GENIO */
113	sizeof(struct ktr_psig),		/* KTR_PSIG */
114	sizeof(struct ktr_csw),			/* KTR_CSW */
115	0,					/* KTR_USER */
116	0,					/* KTR_STRUCT */
117	0,					/* KTR_SYSCTL */
118	sizeof(struct ktr_proc_ctor),		/* KTR_PROCCTOR */
119	0,					/* KTR_PROCDTOR */
120};
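
/*
 * Note on the on-disk layout implied by this table: ktr_writerequest() emits
 * the ktr_header first, then data_lengths[type] bytes of ktr_data, then any
 * variable-length buffer hung off ktr_buffer.  The ktr_len value written in
 * the header covers both parts, so a reader can always skip ktr_len bytes to
 * reach the next record.
 */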
121
122static STAILQ_HEAD(, ktr_request) ktr_free;
123
124static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
125
126static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
127TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
128
129static u_int ktr_geniosize = PAGE_SIZE;
130TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
131SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
132    0, "Maximum size of genio event payload");
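
/*
 * Only the first kern.ktrace.genio_size bytes of each transfer are captured
 * in a KTR_GENIO record.  The limit can be raised at boot through the loader
 * tunable above or at runtime, for example with
 * "sysctl kern.ktrace.genio_size=65536".
 */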
133
134static int print_message = 1;
135static struct mtx ktrace_mtx;
136static struct sx ktrace_sx;
137
138static void ktrace_init(void *dummy);
139static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
140static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
141static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
142static struct ktr_request *ktr_getrequest(int type);
143static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
144static void ktr_freeproc(struct proc *p, struct ucred **uc,
145    struct vnode **vp);
146static void ktr_freerequest(struct ktr_request *req);
147static void ktr_freerequest_locked(struct ktr_request *req);
148static void ktr_writerequest(struct thread *td, struct ktr_request *req);
149static int ktrcanset(struct thread *,struct proc *);
150static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
151static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
152static void ktrprocctor_entered(struct thread *, struct proc *);
153
154/*
155 * ktrace itself generates events, such as context switches, which we do not
156 * wish to trace.  Maintain a flag, TDP_INKTRACE, on each thread to determine
157 * whether or not it is in a region where tracing of events should be
158 * suppressed.
159 */
160static void
161ktrace_enter(struct thread *td)
162{
163
164	KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
165	td->td_pflags |= TDP_INKTRACE;
166}
167
168static void
169ktrace_exit(struct thread *td)
170{
171
172	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
173	td->td_pflags &= ~TDP_INKTRACE;
174}
175
176static void
177ktrace_assert(struct thread *td)
178{
179
180	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
181}
182
183static void
184ktrace_init(void *dummy)
185{
186	struct ktr_request *req;
187	int i;
188
189	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
190	sx_init(&ktrace_sx, "ktrace_sx");
191	STAILQ_INIT(&ktr_free);
192	for (i = 0; i < ktr_requestpool; i++) {
193		req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
194		STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
195	}
196}
197SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
198
199static int
200sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
201{
202	struct thread *td;
203	u_int newsize, oldsize, wantsize;
204	int error;
205
206	/* Handle easy read-only case first to avoid warnings from GCC. */
207	if (!req->newptr) {
208		oldsize = ktr_requestpool;
209		return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
210	}
211
212	error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
213	if (error)
214		return (error);
215	td = curthread;
216	ktrace_enter(td);
217	oldsize = ktr_requestpool;
218	newsize = ktrace_resize_pool(oldsize, wantsize);
219	ktrace_exit(td);
220	error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
221	if (error)
222		return (error);
223	if (wantsize > oldsize && newsize < wantsize)
224		return (ENOSPC);
225	return (0);
226}
227SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
228    &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU",
229    "Pool buffer size for ktrace(1)");
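
/*
 * The pool provides the ktr_request objects backing every trace record.  When
 * it is exhausted the record is dropped and the next record successfully
 * allocated is flagged with KTR_DROP; if such drops are observed, the pool
 * can be enlarged, for example with "sysctl kern.ktrace.request_pool=200" or
 * the loader tunable of the same name, subject to ktrace_resize_pool() below.
 */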
230
231static u_int
232ktrace_resize_pool(u_int oldsize, u_int newsize)
233{
234	STAILQ_HEAD(, ktr_request) ktr_new;
235	struct ktr_request *req;
236	int bound;
237
238	print_message = 1;
239	bound = newsize - oldsize;
240	if (bound == 0)
241		return (ktr_requestpool);
242	if (bound < 0) {
243		mtx_lock(&ktrace_mtx);
244		/* Shrink pool down to newsize if possible. */
245		while (bound++ < 0) {
246			req = STAILQ_FIRST(&ktr_free);
247			if (req == NULL)
248				break;
249			STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
250			ktr_requestpool--;
251			free(req, M_KTRACE);
252		}
253	} else {
254		/* Grow pool up to newsize. */
255		STAILQ_INIT(&ktr_new);
256		while (bound-- > 0) {
257			req = malloc(sizeof(struct ktr_request), M_KTRACE,
258			    M_WAITOK);
259			STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
260		}
261		mtx_lock(&ktrace_mtx);
262		STAILQ_CONCAT(&ktr_free, &ktr_new);
263		ktr_requestpool += (newsize - oldsize);
264	}
265	mtx_unlock(&ktrace_mtx);
266	return (ktr_requestpool);
267}
268
269/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
270CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
271    (sizeof((struct thread *)NULL)->td_name));
272
273static struct ktr_request *
274ktr_getrequest_entered(struct thread *td, int type)
275{
276	struct ktr_request *req;
277	struct proc *p = td->td_proc;
278	int pm;
279
280	mtx_lock(&ktrace_mtx);
281	if (!KTRCHECK(td, type)) {
282		mtx_unlock(&ktrace_mtx);
283		return (NULL);
284	}
285	req = STAILQ_FIRST(&ktr_free);
286	if (req != NULL) {
287		STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
288		req->ktr_header.ktr_type = type;
289		if (p->p_traceflag & KTRFAC_DROP) {
290			req->ktr_header.ktr_type |= KTR_DROP;
291			p->p_traceflag &= ~KTRFAC_DROP;
292		}
293		mtx_unlock(&ktrace_mtx);
294		microtime(&req->ktr_header.ktr_time);
295		req->ktr_header.ktr_pid = p->p_pid;
296		req->ktr_header.ktr_tid = td->td_tid;
297		bcopy(td->td_name, req->ktr_header.ktr_comm,
298		    sizeof(req->ktr_header.ktr_comm));
299		req->ktr_buffer = NULL;
300		req->ktr_header.ktr_len = 0;
301	} else {
302		p->p_traceflag |= KTRFAC_DROP;
303		pm = print_message;
304		print_message = 0;
305		mtx_unlock(&ktrace_mtx);
306		if (pm)
307			printf("Out of ktrace request objects.\n");
308	}
309	return (req);
310}
311
312static struct ktr_request *
313ktr_getrequest(int type)
314{
315	struct thread *td = curthread;
316	struct ktr_request *req;
317
318	ktrace_enter(td);
319	req = ktr_getrequest_entered(td, type);
320	if (req == NULL)
321		ktrace_exit(td);
322
323	return (req);
324}
325
326/*
327 * Some trace generation environments don't permit direct access to VFS,
328 * such as during a context switch where sleeping is not allowed.  Under these
329 * circumstances, queue a request to the thread to be written asynchronously
330 * later.
331 */
332static void
333ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
334{
335
336	mtx_lock(&ktrace_mtx);
337	STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
338	mtx_unlock(&ktrace_mtx);
339}
340
341/*
342 * Drain any pending ktrace records from the per-thread queue to disk.  This
343 * is used both internally before committing other records, and also on
344 * system call return.  We drain whatever is queued at the time the drain is
345 * requested, but do not keep draining after that, since records queued later
346 * may fall approximately "after" the current event.
347 */
348static void
349ktr_drain(struct thread *td)
350{
351	struct ktr_request *queued_req;
352	STAILQ_HEAD(, ktr_request) local_queue;
353
354	ktrace_assert(td);
355	sx_assert(&ktrace_sx, SX_XLOCKED);
356
357	STAILQ_INIT(&local_queue);
358
359	if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
360		mtx_lock(&ktrace_mtx);
361		STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
362		mtx_unlock(&ktrace_mtx);
363
364		while ((queued_req = STAILQ_FIRST(&local_queue))) {
365			STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
366			ktr_writerequest(td, queued_req);
367			ktr_freerequest(queued_req);
368		}
369	}
370}
371
372/*
373 * Submit a trace record for immediate commit to disk -- to be used only
374 * where entering VFS is OK.  First drain any pending records that may have
375 * been cached in the thread.
376 */
377static void
378ktr_submitrequest(struct thread *td, struct ktr_request *req)
379{
380
381	ktrace_assert(td);
382
383	sx_xlock(&ktrace_sx);
384	ktr_drain(td);
385	ktr_writerequest(td, req);
386	ktr_freerequest(req);
387	sx_xunlock(&ktrace_sx);
388	ktrace_exit(td);
389}
390
391static void
392ktr_freerequest(struct ktr_request *req)
393{
394
395	mtx_lock(&ktrace_mtx);
396	ktr_freerequest_locked(req);
397	mtx_unlock(&ktrace_mtx);
398}
399
400static void
401ktr_freerequest_locked(struct ktr_request *req)
402{
403
404	mtx_assert(&ktrace_mtx, MA_OWNED);
405	if (req->ktr_buffer != NULL)
406		free(req->ktr_buffer, M_KTRACE);
407	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
408}
409
410/*
411 * Disable tracing for a process and release all associated resources.
412 * The caller is responsible for releasing a reference on the returned
413 * vnode and credentials.
414 */
415static void
416ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
417{
418	struct ktr_request *req;
419
420	PROC_LOCK_ASSERT(p, MA_OWNED);
421	mtx_assert(&ktrace_mtx, MA_OWNED);
422	*uc = p->p_tracecred;
423	p->p_tracecred = NULL;
424	if (vp != NULL)
425		*vp = p->p_tracevp;
426	p->p_tracevp = NULL;
427	p->p_traceflag = 0;
428	while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
429		STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
430		ktr_freerequest_locked(req);
431	}
432}
433
434void
435ktrsyscall(code, narg, args)
436	int code, narg;
437	register_t args[];
438{
439	struct ktr_request *req;
440	struct ktr_syscall *ktp;
441	size_t buflen;
442	char *buf = NULL;
443
444	buflen = sizeof(register_t) * narg;
445	if (buflen > 0) {
446		buf = malloc(buflen, M_KTRACE, M_WAITOK);
447		bcopy(args, buf, buflen);
448	}
449	req = ktr_getrequest(KTR_SYSCALL);
450	if (req == NULL) {
451		if (buf != NULL)
452			free(buf, M_KTRACE);
453		return;
454	}
455	ktp = &req->ktr_data.ktr_syscall;
456	ktp->ktr_code = code;
457	ktp->ktr_narg = narg;
458	if (buflen > 0) {
459		req->ktr_header.ktr_len = buflen;
460		req->ktr_buffer = buf;
461	}
462	ktr_submitrequest(curthread, req);
463}
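
/*
 * The KTR_SYSCALL payload written out is therefore the fixed ktr_code and
 * ktr_narg prefix of struct ktr_syscall (data_lengths[] above stops at the
 * ktr_args member) followed by the ktr_narg register_t argument values
 * carried in ktr_buffer.
 */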
464
465void
466ktrsysret(code, error, retval)
467	int code, error;
468	register_t retval;
469{
470	struct ktr_request *req;
471	struct ktr_sysret *ktp;
472
473	req = ktr_getrequest(KTR_SYSRET);
474	if (req == NULL)
475		return;
476	ktp = &req->ktr_data.ktr_sysret;
477	ktp->ktr_code = code;
478	ktp->ktr_error = error;
479	ktp->ktr_retval = retval;		/* what about val2 ? */
480	ktr_submitrequest(curthread, req);
481}
482
483/*
484 * When a setuid process execs, disable tracing.
485 *
486 * XXX: We toss any pending asynchronous records.
487 */
488void
489ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
490{
491
492	PROC_LOCK_ASSERT(p, MA_OWNED);
493	mtx_lock(&ktrace_mtx);
494	ktr_freeproc(p, uc, vp);
495	mtx_unlock(&ktrace_mtx);
496}
497
498/*
499 * When a process exits, drain per-process asynchronous trace records
500 * and disable tracing.
501 */
502void
503ktrprocexit(struct thread *td)
504{
505	struct ktr_request *req;
506	struct proc *p;
507	struct ucred *cred;
508	struct vnode *vp;
509	int vfslocked;
510
511	p = td->td_proc;
512	if (p->p_traceflag == 0)
513		return;
514
515	ktrace_enter(td);
516	req = ktr_getrequest_entered(td, KTR_PROCDTOR);
517	if (req != NULL)
518		ktr_enqueuerequest(td, req);
519	sx_xlock(&ktrace_sx);
520	ktr_drain(td);
521	sx_xunlock(&ktrace_sx);
522	PROC_LOCK(p);
523	mtx_lock(&ktrace_mtx);
524	ktr_freeproc(p, &cred, &vp);
525	mtx_unlock(&ktrace_mtx);
526	PROC_UNLOCK(p);
527	if (vp != NULL) {
528		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
529		vrele(vp);
530		VFS_UNLOCK_GIANT(vfslocked);
531	}
532	if (cred != NULL)
533		crfree(cred);
534	ktrace_exit(td);
535}
536
537static void
538ktrprocctor_entered(struct thread *td, struct proc *p)
539{
540	struct ktr_proc_ctor *ktp;
541	struct ktr_request *req;
542	struct thread *td2;
543
544	ktrace_assert(td);
545	td2 = FIRST_THREAD_IN_PROC(p);
546	req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
547	if (req == NULL)
548		return;
549	ktp = &req->ktr_data.ktr_proc_ctor;
550	ktp->sv_flags = p->p_sysent->sv_flags;
551	ktr_enqueuerequest(td2, req);
552}
553
554void
555ktrprocctor(struct proc *p)
556{
557	struct thread *td = curthread;
558
559	if ((p->p_traceflag & KTRFAC_MASK) == 0)
560		return;
561
562	ktrace_enter(td);
563	ktrprocctor_entered(td, p);
564	ktrace_exit(td);
565}
566
567/*
568 * When a process forks, enable tracing in the new process if needed.
569 */
570void
571ktrprocfork(struct proc *p1, struct proc *p2)
572{
573
574	PROC_LOCK(p1);
575	mtx_lock(&ktrace_mtx);
576	KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
577	if (p1->p_traceflag & KTRFAC_INHERIT) {
578		p2->p_traceflag = p1->p_traceflag;
579		if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
580			VREF(p2->p_tracevp);
581			KASSERT(p1->p_tracecred != NULL,
582			    ("ktrace vnode with no cred"));
583			p2->p_tracecred = crhold(p1->p_tracecred);
584		}
585	}
586	mtx_unlock(&ktrace_mtx);
587	PROC_UNLOCK(p1);
588
589	ktrprocctor(p2);
590}
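
/*
 * KTRFAC_INHERIT is one of the facility bits supplied by the tracing program
 * (ktrace(1) requests it for its -i flag), so a process traced with
 * inheritance enabled propagates tracing into every descendant it forks.
 */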
591
592/*
593 * When a thread returns to user mode, drain any asynchronous records
594 * generated by the system call.
595 */
596void
597ktruserret(struct thread *td)
598{
599
600	ktrace_enter(td);
601	sx_xlock(&ktrace_sx);
602	ktr_drain(td);
603	sx_xunlock(&ktrace_sx);
604	ktrace_exit(td);
605}
606
607void
608ktrnamei(path)
609	char *path;
610{
611	struct ktr_request *req;
612	int namelen;
613	char *buf = NULL;
614
615	namelen = strlen(path);
616	if (namelen > 0) {
617		buf = malloc(namelen, M_KTRACE, M_WAITOK);
618		bcopy(path, buf, namelen);
619	}
620	req = ktr_getrequest(KTR_NAMEI);
621	if (req == NULL) {
622		if (buf != NULL)
623			free(buf, M_KTRACE);
624		return;
625	}
626	if (namelen > 0) {
627		req->ktr_header.ktr_len = namelen;
628		req->ktr_buffer = buf;
629	}
630	ktr_submitrequest(curthread, req);
631}
632
633void
634ktrsysctl(name, namelen)
635	int *name;
636	u_int namelen;
637{
638	struct ktr_request *req;
639	u_int mib[CTL_MAXNAME + 2];
640	char *mibname;
641	size_t mibnamelen;
642	int error;
643
644	/* Lookup name of mib. */
645	KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
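	/*
	 * OIDs {0, 1} name the sysctl name-lookup node: given the numeric OID
	 * appended below, kernel_sysctl() returns its dotted-name string.
	 */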
646	mib[0] = 0;
647	mib[1] = 1;
648	bcopy(name, mib + 2, namelen * sizeof(*name));
649	mibnamelen = 128;
650	mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
651	error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
652	    NULL, 0, &mibnamelen, 0);
653	if (error) {
654		free(mibname, M_KTRACE);
655		return;
656	}
657	req = ktr_getrequest(KTR_SYSCTL);
658	if (req == NULL) {
659		free(mibname, M_KTRACE);
660		return;
661	}
662	req->ktr_header.ktr_len = mibnamelen;
663	req->ktr_buffer = mibname;
664	ktr_submitrequest(curthread, req);
665}
666
667void
668ktrgenio(fd, rw, uio, error)
669	int fd;
670	enum uio_rw rw;
671	struct uio *uio;
672	int error;
673{
674	struct ktr_request *req;
675	struct ktr_genio *ktg;
676	int datalen;
677	char *buf;
678
679	if (error) {
680		free(uio, M_IOV);
681		return;
682	}
683	uio->uio_offset = 0;
684	uio->uio_rw = UIO_WRITE;
685	datalen = imin(uio->uio_resid, ktr_geniosize);
686	buf = malloc(datalen, M_KTRACE, M_WAITOK);
687	error = uiomove(buf, datalen, uio);
688	free(uio, M_IOV);
689	if (error) {
690		free(buf, M_KTRACE);
691		return;
692	}
693	req = ktr_getrequest(KTR_GENIO);
694	if (req == NULL) {
695		free(buf, M_KTRACE);
696		return;
697	}
698	ktg = &req->ktr_data.ktr_genio;
699	ktg->ktr_fd = fd;
700	ktg->ktr_rw = rw;
701	req->ktr_header.ktr_len = datalen;
702	req->ktr_buffer = buf;
703	ktr_submitrequest(curthread, req);
704}
705
706void
707ktrpsig(sig, action, mask, code)
708	int sig;
709	sig_t action;
710	sigset_t *mask;
711	int code;
712{
713	struct thread *td = curthread;
714	struct ktr_request *req;
715	struct ktr_psig	*kp;
716
717	req = ktr_getrequest(KTR_PSIG);
718	if (req == NULL)
719		return;
720	kp = &req->ktr_data.ktr_psig;
721	kp->signo = (char)sig;
722	kp->action = action;
723	kp->mask = *mask;
724	kp->code = code;
725	ktr_enqueuerequest(td, req);
726	ktrace_exit(td);
727}
728
729void
730ktrcsw(out, user)
731	int out, user;
732{
733	struct thread *td = curthread;
734	struct ktr_request *req;
735	struct ktr_csw *kc;
736
737	req = ktr_getrequest(KTR_CSW);
738	if (req == NULL)
739		return;
740	kc = &req->ktr_data.ktr_csw;
741	kc->out = out;
742	kc->user = user;
743	ktr_enqueuerequest(td, req);
744	ktrace_exit(td);
745}
746
747void
748ktrstruct(name, data, datalen)
749	const char *name;
750	void *data;
751	size_t datalen;
752{
753	struct ktr_request *req;
754	char *buf = NULL;
755	size_t buflen;
756
757	if (!data)
758		datalen = 0;
759	buflen = strlen(name) + 1 + datalen;
760	buf = malloc(buflen, M_KTRACE, M_WAITOK);
761	strcpy(buf, name);
762	bcopy(data, buf + strlen(name) + 1, datalen);
763	if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) {
764		free(buf, M_KTRACE);
765		return;
766	}
767	req->ktr_buffer = buf;
768	req->ktr_header.ktr_len = buflen;
769	ktr_submitrequest(curthread, req);
770}
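
/*
 * A KTR_STRUCT payload is the NUL-terminated structure name followed
 * immediately by the raw structure bytes, so consumers split on the first
 * NUL to recover both.  Kernel callers typically reach this through thin
 * wrappers (for instance the ktrsockaddr() and ktrstat() macros in
 * <sys/ktrace.h>) rather than calling ktrstruct() directly.
 */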
771#endif /* KTRACE */
772
773/* Interface and common routines */
774
775#ifndef _SYS_SYSPROTO_H_
776struct ktrace_args {
777	char	*fname;
778	int	ops;
779	int	facs;
780	int	pid;
781};
782#endif
783/* ARGSUSED */
784int
785ktrace(td, uap)
786	struct thread *td;
787	register struct ktrace_args *uap;
788{
789#ifdef KTRACE
790	register struct vnode *vp = NULL;
791	register struct proc *p;
792	struct pgrp *pg;
793	int facs = uap->facs & ~KTRFAC_ROOT;
794	int ops = KTROP(uap->ops);
795	int descend = uap->ops & KTRFLAG_DESCEND;
796	int nfound, ret = 0;
797	int flags, error = 0, vfslocked;
798	struct nameidata nd;
799	struct ucred *cred;
800
801	/*
802	 * Need something to (un)trace.
803	 */
804	if (ops != KTROP_CLEARFILE && facs == 0)
805		return (EINVAL);
806
807	ktrace_enter(td);
808	if (ops != KTROP_CLEAR) {
809		/*
810		 * an operation which requires a file argument.
811		 */
812		NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
813		    uap->fname, td);
814		flags = FREAD | FWRITE | O_NOFOLLOW;
815		error = vn_open(&nd, &flags, 0, NULL);
816		if (error) {
817			ktrace_exit(td);
818			return (error);
819		}
820		vfslocked = NDHASGIANT(&nd);
821		NDFREE(&nd, NDF_ONLY_PNBUF);
822		vp = nd.ni_vp;
823		VOP_UNLOCK(vp, 0);
824		if (vp->v_type != VREG) {
825			(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
826			VFS_UNLOCK_GIANT(vfslocked);
827			ktrace_exit(td);
828			return (EACCES);
829		}
830		VFS_UNLOCK_GIANT(vfslocked);
831	}
832	/*
833	 * Clear all uses of the tracefile.
834	 */
835	if (ops == KTROP_CLEARFILE) {
836		int vrele_count;
837
838		vrele_count = 0;
839		sx_slock(&allproc_lock);
840		FOREACH_PROC_IN_SYSTEM(p) {
841			PROC_LOCK(p);
842			if (p->p_tracevp == vp) {
843				if (ktrcanset(td, p)) {
844					mtx_lock(&ktrace_mtx);
845					ktr_freeproc(p, &cred, NULL);
846					mtx_unlock(&ktrace_mtx);
847					vrele_count++;
848					crfree(cred);
849				} else
850					error = EPERM;
851			}
852			PROC_UNLOCK(p);
853		}
854		sx_sunlock(&allproc_lock);
855		if (vrele_count > 0) {
856			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
857			while (vrele_count-- > 0)
858				vrele(vp);
859			VFS_UNLOCK_GIANT(vfslocked);
860		}
861		goto done;
862	}
863	/*
864	 * do it
865	 */
866	sx_slock(&proctree_lock);
867	if (uap->pid < 0) {
868		/*
869		 * by process group
870		 */
871		pg = pgfind(-uap->pid);
872		if (pg == NULL) {
873			sx_sunlock(&proctree_lock);
874			error = ESRCH;
875			goto done;
876		}
877		/*
878		 * ktrops() may call vrele(). Lock pg_members
879		 * by the proctree_lock rather than pg_mtx.
880		 */
881		PGRP_UNLOCK(pg);
882		nfound = 0;
883		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
884			PROC_LOCK(p);
885			if (p_cansee(td, p) != 0) {
886				PROC_UNLOCK(p);
887				continue;
888			}
889			nfound++;
890			if (descend)
891				ret |= ktrsetchildren(td, p, ops, facs, vp);
892			else
893				ret |= ktrops(td, p, ops, facs, vp);
894		}
895		if (nfound == 0) {
896			sx_sunlock(&proctree_lock);
897			error = ESRCH;
898			goto done;
899		}
900	} else {
901		/*
902		 * by pid
903		 */
904		p = pfind(uap->pid);
905		if (p == NULL)
906			error = ESRCH;
907		else
908			error = p_cansee(td, p);
909		if (error) {
910			if (p != NULL)
911				PROC_UNLOCK(p);
912			sx_sunlock(&proctree_lock);
913			goto done;
914		}
915		if (descend)
916			ret |= ktrsetchildren(td, p, ops, facs, vp);
917		else
918			ret |= ktrops(td, p, ops, facs, vp);
919	}
920	sx_sunlock(&proctree_lock);
921	if (!ret)
922		error = EPERM;
923done:
924	if (vp != NULL) {
925		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
926		(void) vn_close(vp, FWRITE, td->td_ucred, td);
927		VFS_UNLOCK_GIANT(vfslocked);
928	}
929	ktrace_exit(td);
930	return (error);
931#else /* !KTRACE */
932	return (ENOSYS);
933#endif /* KTRACE */
934}
935
936/* ARGSUSED */
937int
938utrace(td, uap)
939	struct thread *td;
940	register struct utrace_args *uap;
941{
942
943#ifdef KTRACE
944	struct ktr_request *req;
945	void *cp;
946	int error;
947
948	if (!KTRPOINT(td, KTR_USER))
949		return (0);
950	if (uap->len > KTR_USER_MAXLEN)
951		return (EINVAL);
952	cp = malloc(uap->len, M_KTRACE, M_WAITOK);
953	error = copyin(uap->addr, cp, uap->len);
954	if (error) {
955		free(cp, M_KTRACE);
956		return (error);
957	}
958	req = ktr_getrequest(KTR_USER);
959	if (req == NULL) {
960		free(cp, M_KTRACE);
961		return (ENOMEM);
962	}
963	req->ktr_buffer = cp;
964	req->ktr_header.ktr_len = uap->len;
965	ktr_submitrequest(td, req);
966	return (0);
967#else /* !KTRACE */
968	return (ENOSYS);
969#endif /* KTRACE */
970}
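
/*
 * Illustrative sketch, assuming the userspace prototype
 * int utrace(const void *addr, size_t len) from <sys/ktrace.h>: a traced
 * program can drop an application-defined marker of at most KTR_USER_MAXLEN
 * bytes (enforced above) into its own trace stream.
 *
 *	#include <sys/param.h>
 *	#include <sys/time.h>
 *	#include <sys/uio.h>
 *	#include <sys/ktrace.h>
 *
 *	static const char marker[] = "phase-2-start";
 *
 *	void
 *	mark_phase(void)
 *	{
 *
 *		(void)utrace(marker, sizeof(marker));
 *	}
 */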
971
972#ifdef KTRACE
973static int
974ktrops(td, p, ops, facs, vp)
975	struct thread *td;
976	struct proc *p;
977	int ops, facs;
978	struct vnode *vp;
979{
980	struct vnode *tracevp = NULL;
981	struct ucred *tracecred = NULL;
982
983	PROC_LOCK_ASSERT(p, MA_OWNED);
984	if (!ktrcanset(td, p)) {
985		PROC_UNLOCK(p);
986		return (0);
987	}
988	if (p->p_flag & P_WEXIT) {
989		/* If the process is exiting, just ignore it. */
990		PROC_UNLOCK(p);
991		return (1);
992	}
993	mtx_lock(&ktrace_mtx);
994	if (ops == KTROP_SET) {
995		if (p->p_tracevp != vp) {
996			/*
997			 * if trace file already in use, relinquish below
998			 */
999			tracevp = p->p_tracevp;
1000			VREF(vp);
1001			p->p_tracevp = vp;
1002		}
1003		if (p->p_tracecred != td->td_ucred) {
1004			tracecred = p->p_tracecred;
1005			p->p_tracecred = crhold(td->td_ucred);
1006		}
1007		p->p_traceflag |= facs;
1008		if (priv_check(td, PRIV_KTRACE) == 0)
1009			p->p_traceflag |= KTRFAC_ROOT;
1010	} else {
1011		/* KTROP_CLEAR */
1012		if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
1013			/* no more tracing */
1014			ktr_freeproc(p, &tracecred, &tracevp);
1015	}
1016	mtx_unlock(&ktrace_mtx);
1017	if ((p->p_traceflag & KTRFAC_MASK) != 0)
1018		ktrprocctor_entered(td, p);
1019	PROC_UNLOCK(p);
1020	if (tracevp != NULL) {
1021		int vfslocked;
1022
1023		vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
1024		vrele(tracevp);
1025		VFS_UNLOCK_GIANT(vfslocked);
1026	}
1027	if (tracecred != NULL)
1028		crfree(tracecred);
1029
1030	return (1);
1031}
1032
1033static int
1034ktrsetchildren(td, top, ops, facs, vp)
1035	struct thread *td;
1036	struct proc *top;
1037	int ops, facs;
1038	struct vnode *vp;
1039{
1040	register struct proc *p;
1041	register int ret = 0;
1042
1043	p = top;
1044	PROC_LOCK_ASSERT(p, MA_OWNED);
1045	sx_assert(&proctree_lock, SX_LOCKED);
1046	for (;;) {
1047		ret |= ktrops(td, p, ops, facs, vp);
1048		/*
1049		 * If this process has children, descend to them next,
1050		 * otherwise do any siblings, and if done with this level,
1051		 * follow back up the tree (but not past top).
1052		 */
1053		if (!LIST_EMPTY(&p->p_children))
1054			p = LIST_FIRST(&p->p_children);
1055		else for (;;) {
1056			if (p == top)
1057				return (ret);
1058			if (LIST_NEXT(p, p_sibling)) {
1059				p = LIST_NEXT(p, p_sibling);
1060				break;
1061			}
1062			p = p->p_pptr;
1063		}
1064		PROC_LOCK(p);
1065	}
1066	/*NOTREACHED*/
1067}
1068
1069static void
1070ktr_writerequest(struct thread *td, struct ktr_request *req)
1071{
1072	struct ktr_header *kth;
1073	struct vnode *vp;
1074	struct proc *p;
1075	struct ucred *cred;
1076	struct uio auio;
1077	struct iovec aiov[3];
1078	struct mount *mp;
1079	int datalen, buflen, vrele_count;
1080	int error, vfslocked;
1081
1082	/*
1083	 * We hold the vnode and credential for use in I/O in case ktrace is
1084	 * disabled on the process as we write out the request.
1085	 *
1086	 * XXXRW: This is not ideal: we could end up performing a write after
1087	 * the vnode has been closed.
1088	 */
1089	mtx_lock(&ktrace_mtx);
1090	vp = td->td_proc->p_tracevp;
1091	cred = td->td_proc->p_tracecred;
1092
1093	/*
1094	 * If vp is NULL, the vp has been cleared out from under this
1095	 * request, so just drop it.  Make sure the credential and vnode are
1096	 * in sync: we should have both or neither.
1097	 */
1098	if (vp == NULL) {
1099		KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
1100		mtx_unlock(&ktrace_mtx);
1101		return;
1102	}
1103	VREF(vp);
1104	KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
1105	crhold(cred);
1106	mtx_unlock(&ktrace_mtx);
1107
1108	kth = &req->ktr_header;
1109	KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
1110	    sizeof(data_lengths) / sizeof(data_lengths[0]),
1111	    ("data_lengths array overflow"));
1112	datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
1113	buflen = kth->ktr_len;
1114	auio.uio_iov = &aiov[0];
1115	auio.uio_offset = 0;
1116	auio.uio_segflg = UIO_SYSSPACE;
1117	auio.uio_rw = UIO_WRITE;
1118	aiov[0].iov_base = (caddr_t)kth;
1119	aiov[0].iov_len = sizeof(struct ktr_header);
1120	auio.uio_resid = sizeof(struct ktr_header);
1121	auio.uio_iovcnt = 1;
1122	auio.uio_td = td;
1123	if (datalen != 0) {
1124		aiov[1].iov_base = (caddr_t)&req->ktr_data;
1125		aiov[1].iov_len = datalen;
1126		auio.uio_resid += datalen;
1127		auio.uio_iovcnt++;
1128		kth->ktr_len += datalen;
1129	}
1130	if (buflen != 0) {
1131		KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
1132		aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
1133		aiov[auio.uio_iovcnt].iov_len = buflen;
1134		auio.uio_resid += buflen;
1135		auio.uio_iovcnt++;
1136	}
1137
1138	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1139	vn_start_write(vp, &mp, V_WAIT);
1140	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1141#ifdef MAC
1142	error = mac_vnode_check_write(cred, NOCRED, vp);
1143	if (error == 0)
1144#endif
1145		error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
1146	VOP_UNLOCK(vp, 0);
1147	vn_finished_write(mp);
1148	crfree(cred);
1149	if (!error) {
1150		vrele(vp);
1151		VFS_UNLOCK_GIANT(vfslocked);
1152		return;
1153	}
1154	VFS_UNLOCK_GIANT(vfslocked);
1155
1156	/*
1157	 * If error encountered, give up tracing on this vnode.  We defer
1158	 * all the vrele()'s on the vnode until after we are finished walking
1159	 * the various lists to avoid needlessly holding locks.
1160	 * NB: at this point we still hold the vnode reference that must
1161	 * not go away as we need the valid vnode to compare with. Thus let
1162	 * vrele_count start at 1 and the reference will be freed
1163	 * by the loop at the end after our last use of vp.
1164	 */
1165	log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
1166	    error);
1167	vrele_count = 1;
1168	/*
1169	 * First, clear this vnode from being used by any processes in the
1170	 * system.
1171	 * XXX - If one process gets an EPERM writing to the vnode, should
1172	 * we really do this?  Other processes might have suitable
1173	 * credentials for the operation.
1174	 */
1175	cred = NULL;
1176	sx_slock(&allproc_lock);
1177	FOREACH_PROC_IN_SYSTEM(p) {
1178		PROC_LOCK(p);
1179		if (p->p_tracevp == vp) {
1180			mtx_lock(&ktrace_mtx);
1181			ktr_freeproc(p, &cred, NULL);
1182			mtx_unlock(&ktrace_mtx);
1183			vrele_count++;
1184		}
1185		PROC_UNLOCK(p);
1186		if (cred != NULL) {
1187			crfree(cred);
1188			cred = NULL;
1189		}
1190	}
1191	sx_sunlock(&allproc_lock);
1192
1193	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1194	while (vrele_count-- > 0)
1195		vrele(vp);
1196	VFS_UNLOCK_GIANT(vfslocked);
1197}
1198
1199/*
1200 * Return true if caller has permission to set the ktracing state
1201 * of target.  Essentially, the target can't possess any
1202 * more permissions than the caller.  KTRFAC_ROOT signifies that
1203 * root previously set the tracing status on the target process, and
1204 * so, only root may further change it.
1205 */
1206static int
1207ktrcanset(td, targetp)
1208	struct thread *td;
1209	struct proc *targetp;
1210{
1211
1212	PROC_LOCK_ASSERT(targetp, MA_OWNED);
1213	if (targetp->p_traceflag & KTRFAC_ROOT &&
1214	    priv_check(td, PRIV_KTRACE))
1215		return (0);
1216
1217	if (p_candebug(td, targetp) != 0)
1218		return (0);
1219
1220	return (1);
1221}
1222
1223#endif /* KTRACE */
1224