/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2005 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_ktrace.c	8.2 (Berkeley) 9/23/93
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_ktrace.c 190888 2009-04-10 10:52:19Z rwatson $");

#include "opt_ktrace.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/ktrace.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>

#include <security/mac/mac_framework.h>

/*
 * The ktrace facility allows the tracing of certain key events in user space
 * processes, such as system calls, signal delivery, context switches, and
 * user generated events using utrace(2).  It works by streaming event
 * records and data to a vnode associated with the process using the
 * ktrace(2) system call.  In general, records can be written directly from
 * the context that generates the event.  One important exception to this is
 * during a context switch, where sleeping is not permitted.  To handle this
 * case, trace events are generated using in-kernel ktr_request records, and
 * then delivered to disk at a convenient moment -- either immediately, the
 * next traceable event, at system call return, or at process exit.
 *
 * When dealing with multiple threads or processes writing to the same event
 * log, ordering guarantees are weak: specifically, if an event has multiple
 * records (i.e., system call enter and return), they may be interlaced with
 * records from another event.  Process and thread ID information is provided
 * in the record, and user applications can de-interlace events if required.
 */
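
/*
 * Editorial sketch (illustrative only, not compiled): from user space the
 * facility is normally driven via ktrace(2) and the resulting file decoded
 * with kdump(1).  A process asking to have its own system call activity
 * appended to the file "ktrace.out" (the file name is only an example)
 * might do roughly:
 *
 *	if (ktrace("ktrace.out", KTROP_SET,
 *	    KTRFAC_SYSCALL | KTRFAC_SYSRET | KTRFAC_NAMEI, getpid()) == -1)
 *		err(1, "ktrace");
 *	...
 *	(void)ktrace(NULL, KTROP_CLEAR,
 *	    KTRFAC_SYSCALL | KTRFAC_SYSRET | KTRFAC_NAMEI, getpid());
 */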

static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");

#ifdef KTRACE

#ifndef KTRACE_REQUEST_POOL
#define	KTRACE_REQUEST_POOL	100
#endif

struct ktr_request {
	struct	ktr_header ktr_header;
	void	*ktr_buffer;
	union {
		struct	ktr_syscall ktr_syscall;
		struct	ktr_sysret ktr_sysret;
		struct	ktr_genio ktr_genio;
		struct	ktr_psig ktr_psig;
		struct	ktr_csw ktr_csw;
	} ktr_data;
	STAILQ_ENTRY(ktr_request) ktr_list;
};

static int data_lengths[] = {
	0,					/* none */
	offsetof(struct ktr_syscall, ktr_args),	/* KTR_SYSCALL */
	sizeof(struct ktr_sysret),		/* KTR_SYSRET */
	0,					/* KTR_NAMEI */
	sizeof(struct ktr_genio),		/* KTR_GENIO */
	sizeof(struct ktr_psig),		/* KTR_PSIG */
	sizeof(struct ktr_csw),			/* KTR_CSW */
	0,					/* KTR_USER */
	0,					/* KTR_STRUCT */
	0,					/* KTR_SYSCTL */
};

static STAILQ_HEAD(, ktr_request) ktr_free;

static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");

static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);

static u_int ktr_geniosize = PAGE_SIZE;
TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
    0, "Maximum size of genio event payload");

static int print_message = 1;
struct mtx ktrace_mtx;
static struct sx ktrace_sx;

static void ktrace_init(void *dummy);
static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
static u_int ktrace_resize_pool(u_int newsize);
static struct ktr_request *ktr_getrequest(int type);
static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
static void ktr_freerequest(struct ktr_request *req);
static void ktr_writerequest(struct thread *td, struct ktr_request *req);
static int ktrcanset(struct thread *,struct proc *);
static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);

/*
 * ktrace itself generates events, such as context switches, which we do not
 * wish to trace.  Maintain a flag, TDP_INKTRACE, on each thread to determine
 * whether or not it is in a region where tracing of events should be
 * suppressed.
 */
static void
ktrace_enter(struct thread *td)
{

	KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
	td->td_pflags |= TDP_INKTRACE;
}

static void
ktrace_exit(struct thread *td)
{

	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
	td->td_pflags &= ~TDP_INKTRACE;
}

static void
ktrace_assert(struct thread *td)
{

	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
}

static void
ktrace_init(void *dummy)
{
	struct ktr_request *req;
	int i;

	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
	sx_init(&ktrace_sx, "ktrace_sx");
	STAILQ_INIT(&ktr_free);
	for (i = 0; i < ktr_requestpool; i++) {
		req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
		STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
	}
}
SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);

static int
sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
{
	struct thread *td;
	u_int newsize, oldsize, wantsize;
	int error;

	/* Handle easy read-only case first to avoid warnings from GCC. */
	if (!req->newptr) {
		mtx_lock(&ktrace_mtx);
		oldsize = ktr_requestpool;
		mtx_unlock(&ktrace_mtx);
		return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
	}

	error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
	if (error)
		return (error);
	td = curthread;
	ktrace_enter(td);
	mtx_lock(&ktrace_mtx);
	oldsize = ktr_requestpool;
	newsize = ktrace_resize_pool(wantsize);
	mtx_unlock(&ktrace_mtx);
	ktrace_exit(td);
	error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
	if (error)
		return (error);
	if (wantsize > oldsize && newsize < wantsize)
		return (ENOSPC);
	return (0);
}
SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
    &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "");
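
/*
 * Editorial note: the pool may be sized at boot via the loader tunable
 * kern.ktrace.request_pool or adjusted at run time through the read-write
 * sysctl of the same name; a shrink request only succeeds to the extent
 * that free entries are available (see ktrace_resize_pool()).
 */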

static u_int
ktrace_resize_pool(u_int newsize)
{
	struct ktr_request *req;
	int bound;

	mtx_assert(&ktrace_mtx, MA_OWNED);
	print_message = 1;
	bound = newsize - ktr_requestpool;
	if (bound == 0)
		return (ktr_requestpool);
	if (bound < 0)
		/* Shrink pool down to newsize if possible. */
		while (bound++ < 0) {
			req = STAILQ_FIRST(&ktr_free);
			if (req == NULL)
				return (ktr_requestpool);
			STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
			ktr_requestpool--;
			mtx_unlock(&ktrace_mtx);
			free(req, M_KTRACE);
			mtx_lock(&ktrace_mtx);
		}
	else
		/* Grow pool up to newsize. */
		while (bound-- > 0) {
			mtx_unlock(&ktrace_mtx);
			req = malloc(sizeof(struct ktr_request), M_KTRACE,
			    M_WAITOK);
			mtx_lock(&ktrace_mtx);
			STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
			ktr_requestpool++;
		}
	return (ktr_requestpool);
}

static struct ktr_request *
ktr_getrequest(int type)
{
	struct ktr_request *req;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int pm;

	ktrace_enter(td);	/* XXX: In caller instead? */
	mtx_lock(&ktrace_mtx);
	if (!KTRCHECK(td, type)) {
		mtx_unlock(&ktrace_mtx);
		ktrace_exit(td);
		return (NULL);
	}
	req = STAILQ_FIRST(&ktr_free);
	if (req != NULL) {
		STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
		req->ktr_header.ktr_type = type;
		if (p->p_traceflag & KTRFAC_DROP) {
			req->ktr_header.ktr_type |= KTR_DROP;
			p->p_traceflag &= ~KTRFAC_DROP;
		}
		mtx_unlock(&ktrace_mtx);
		microtime(&req->ktr_header.ktr_time);
		req->ktr_header.ktr_pid = p->p_pid;
		req->ktr_header.ktr_tid = td->td_tid;
		bcopy(td->td_name, req->ktr_header.ktr_comm, MAXCOMLEN + 1);
		req->ktr_buffer = NULL;
		req->ktr_header.ktr_len = 0;
	} else {
		p->p_traceflag |= KTRFAC_DROP;
		pm = print_message;
		print_message = 0;
		mtx_unlock(&ktrace_mtx);
		if (pm)
			printf("Out of ktrace request objects.\n");
		ktrace_exit(td);
	}
	return (req);
}

/*
 * Some trace generation environments don't permit direct access to VFS,
 * such as during a context switch where sleeping is not allowed.  Under these
 * circumstances, queue a request to the thread to be written asynchronously
 * later.
 */
static void
ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
{

	mtx_lock(&ktrace_mtx);
	STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
	mtx_unlock(&ktrace_mtx);
	ktrace_exit(td);
}

/*
 * Drain any pending ktrace records from the per-thread queue to disk.  This
 * is used both internally before committing other records, and also on
 * system call return.  We drain all the ones we can find at the time when
 * drain is requested, but don't keep draining after that as those events
 * may be approximately "after" the current event.
 */
static void
ktr_drain(struct thread *td)
{
	struct ktr_request *queued_req;
	STAILQ_HEAD(, ktr_request) local_queue;

	ktrace_assert(td);
	sx_assert(&ktrace_sx, SX_XLOCKED);

	STAILQ_INIT(&local_queue);	/* XXXRW: needed? */

	if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
		mtx_lock(&ktrace_mtx);
		STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
		mtx_unlock(&ktrace_mtx);

		while ((queued_req = STAILQ_FIRST(&local_queue))) {
			STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
			ktr_writerequest(td, queued_req);
			ktr_freerequest(queued_req);
		}
	}
}

/*
 * Submit a trace record for immediate commit to disk -- to be used only
 * where entering VFS is OK.  First drain any pending records that may have
 * been cached in the thread.
 */
static void
ktr_submitrequest(struct thread *td, struct ktr_request *req)
{

	ktrace_assert(td);

	sx_xlock(&ktrace_sx);
	ktr_drain(td);
	ktr_writerequest(td, req);
	ktr_freerequest(req);
	sx_xunlock(&ktrace_sx);

	ktrace_exit(td);
}

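/*
 * Release a request's payload buffer, if any, and return the request
 * structure to the free pool.
 */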
static void
ktr_freerequest(struct ktr_request *req)
{

	if (req->ktr_buffer != NULL)
		free(req->ktr_buffer, M_KTRACE);
	mtx_lock(&ktrace_mtx);
	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
	mtx_unlock(&ktrace_mtx);
}

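/*
 * Record a system call entry (KTR_SYSCALL), including a copy of the
 * argument vector, and commit it synchronously.
 */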
void
ktrsyscall(code, narg, args)
	int code, narg;
	register_t args[];
{
	struct ktr_request *req;
	struct ktr_syscall *ktp;
	size_t buflen;
	char *buf = NULL;

	buflen = sizeof(register_t) * narg;
	if (buflen > 0) {
		buf = malloc(buflen, M_KTRACE, M_WAITOK);
		bcopy(args, buf, buflen);
	}
	req = ktr_getrequest(KTR_SYSCALL);
	if (req == NULL) {
		if (buf != NULL)
			free(buf, M_KTRACE);
		return;
	}
	ktp = &req->ktr_data.ktr_syscall;
	ktp->ktr_code = code;
	ktp->ktr_narg = narg;
	if (buflen > 0) {
		req->ktr_header.ktr_len = buflen;
		req->ktr_buffer = buf;
	}
	ktr_submitrequest(curthread, req);
}

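/*
 * Record a system call return (KTR_SYSRET) with its error and return
 * values.
 */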
void
ktrsysret(code, error, retval)
	int code, error;
	register_t retval;
{
	struct ktr_request *req;
	struct ktr_sysret *ktp;

	req = ktr_getrequest(KTR_SYSRET);
	if (req == NULL)
		return;
	ktp = &req->ktr_data.ktr_sysret;
	ktp->ktr_code = code;
	ktp->ktr_error = error;
	ktp->ktr_retval = retval;		/* what about val2 ? */
	ktr_submitrequest(curthread, req);
}

/*
 * When a process exits, drain per-process asynchronous trace records.
 */
void
ktrprocexit(struct thread *td)
{

	ktrace_enter(td);
	sx_xlock(&ktrace_sx);
	ktr_drain(td);
	sx_xunlock(&ktrace_sx);
	ktrace_exit(td);
}

/*
 * When a thread returns, drain any asynchronous records generated by the
 * system call.
 */
void
ktruserret(struct thread *td)
{

	ktrace_enter(td);
	sx_xlock(&ktrace_sx);
	ktr_drain(td);
	sx_xunlock(&ktrace_sx);
	ktrace_exit(td);
}

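/*
 * Record a pathname translated by namei (KTR_NAMEI).
 */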
void
ktrnamei(path)
	char *path;
{
	struct ktr_request *req;
	int namelen;
	char *buf = NULL;

	namelen = strlen(path);
	if (namelen > 0) {
		buf = malloc(namelen, M_KTRACE, M_WAITOK);
		bcopy(path, buf, namelen);
	}
	req = ktr_getrequest(KTR_NAMEI);
	if (req == NULL) {
		if (buf != NULL)
			free(buf, M_KTRACE);
		return;
	}
	if (namelen > 0) {
		req->ktr_header.ktr_len = namelen;
		req->ktr_buffer = buf;
	}
	ktr_submitrequest(curthread, req);
}

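/*
 * Record a sysctl access (KTR_SYSCTL): the numeric MIB is resolved to its
 * string name via the sysctl name meta-node before being logged.
 */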
void
ktrsysctl(name, namelen)
	int *name;
	u_int namelen;
{
	struct ktr_request *req;
	u_int mib[CTL_MAXNAME + 2];
	char *mibname;
	size_t mibnamelen;
	int error;

	/* Lookup name of mib. */
	KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
	mib[0] = 0;
	mib[1] = 1;
	bcopy(name, mib + 2, namelen * sizeof(*name));
	mibnamelen = 128;
	mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
	error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
	    NULL, 0, &mibnamelen, 0);
	if (error) {
		free(mibname, M_KTRACE);
		return;
	}
	req = ktr_getrequest(KTR_SYSCTL);
	if (req == NULL) {
		free(mibname, M_KTRACE);
		return;
	}
	req->ktr_header.ktr_len = mibnamelen;
	req->ktr_buffer = mibname;
	ktr_submitrequest(curthread, req);
}

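/*
 * Record I/O data for a file descriptor (KTR_GENIO).  Up to ktr_geniosize
 * bytes are copied from the supplied uio, which is consumed and freed here.
 */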
void
ktrgenio(fd, rw, uio, error)
	int fd;
	enum uio_rw rw;
	struct uio *uio;
	int error;
{
	struct ktr_request *req;
	struct ktr_genio *ktg;
	int datalen;
	char *buf;

	if (error) {
		free(uio, M_IOV);
		return;
	}
	uio->uio_offset = 0;
	uio->uio_rw = UIO_WRITE;
	datalen = imin(uio->uio_resid, ktr_geniosize);
	buf = malloc(datalen, M_KTRACE, M_WAITOK);
	error = uiomove(buf, datalen, uio);
	free(uio, M_IOV);
	if (error) {
		free(buf, M_KTRACE);
		return;
	}
	req = ktr_getrequest(KTR_GENIO);
	if (req == NULL) {
		free(buf, M_KTRACE);
		return;
	}
	ktg = &req->ktr_data.ktr_genio;
	ktg->ktr_fd = fd;
	ktg->ktr_rw = rw;
	req->ktr_header.ktr_len = datalen;
	req->ktr_buffer = buf;
	ktr_submitrequest(curthread, req);
}

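/*
 * Record signal delivery (KTR_PSIG).  The record is queued rather than
 * written directly, as the caller may not be able to sleep.
 */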
void
ktrpsig(sig, action, mask, code)
	int sig;
	sig_t action;
	sigset_t *mask;
	int code;
{
	struct ktr_request *req;
	struct ktr_psig	*kp;

	req = ktr_getrequest(KTR_PSIG);
	if (req == NULL)
		return;
	kp = &req->ktr_data.ktr_psig;
	kp->signo = (char)sig;
	kp->action = action;
	kp->mask = *mask;
	kp->code = code;
	ktr_enqueuerequest(curthread, req);
}

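/*
 * Record a context switch event (KTR_CSW).  The record is queued, since
 * sleeping is not permitted during a switch.
 */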
void
ktrcsw(out, user)
	int out, user;
{
	struct ktr_request *req;
	struct ktr_csw *kc;

	req = ktr_getrequest(KTR_CSW);
	if (req == NULL)
		return;
	kc = &req->ktr_data.ktr_csw;
	kc->out = out;
	kc->user = user;
	ktr_enqueuerequest(curthread, req);
}

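/*
 * Record a named kernel structure (KTR_STRUCT).  The payload is the
 * NUL-terminated name followed by the raw structure bytes.
 */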
void
ktrstruct(name, namelen, data, datalen)
	const char *name;
	size_t namelen;
	void *data;
	size_t datalen;
{
	struct ktr_request *req;
	char *buf = NULL;
	size_t buflen;

	if (!data)
		datalen = 0;
	buflen = namelen + 1 + datalen;
	buf = malloc(buflen, M_KTRACE, M_WAITOK);
	bcopy(name, buf, namelen);
	buf[namelen] = '\0';
	bcopy(data, buf + namelen + 1, datalen);
	if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) {
		free(buf, M_KTRACE);
		return;
	}
	req->ktr_buffer = buf;
	req->ktr_header.ktr_len = buflen;
	ktr_submitrequest(curthread, req);
}
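
/*
 * Editorial sketch (illustrative only): a caller recording, say, a stat
 * structure would invoke this as
 *
 *	ktrstruct("stat", &sb, sizeof(sb));
 *
 * where "sb" is assumed to be a local struct stat; kdump(1) decodes such
 * records by the embedded name.
 */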
#endif /* KTRACE */

/* Interface and common routines */

#ifndef _SYS_SYSPROTO_H_
struct ktrace_args {
	char	*fname;
	int	ops;
	int	facs;
	int	pid;
};
#endif
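/*
 * ktrace(2): set or clear trace points on a process or process group
 * (negative pid), optionally descending to children, or detach every
 * process from a tracefile (KTROP_CLEARFILE).
 */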
/* ARGSUSED */
int
ktrace(td, uap)
	struct thread *td;
	register struct ktrace_args *uap;
{
#ifdef KTRACE
	register struct vnode *vp = NULL;
	register struct proc *p;
	struct pgrp *pg;
	int facs = uap->facs & ~KTRFAC_ROOT;
	int ops = KTROP(uap->ops);
	int descend = uap->ops & KTRFLAG_DESCEND;
	int nfound, ret = 0;
	int flags, error = 0, vfslocked;
	struct nameidata nd;
	struct ucred *cred;

	/*
	 * Need something to (un)trace.
	 */
	if (ops != KTROP_CLEARFILE && facs == 0)
		return (EINVAL);

	ktrace_enter(td);
	if (ops != KTROP_CLEAR) {
		/*
		 * an operation which requires a file argument.
		 */
		NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
		    uap->fname, td);
		flags = FREAD | FWRITE | O_NOFOLLOW;
		error = vn_open(&nd, &flags, 0, NULL);
		if (error) {
			ktrace_exit(td);
			return (error);
		}
		vfslocked = NDHASGIANT(&nd);
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vp = nd.ni_vp;
		VOP_UNLOCK(vp, 0);
		if (vp->v_type != VREG) {
			(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
			VFS_UNLOCK_GIANT(vfslocked);
			ktrace_exit(td);
			return (EACCES);
		}
		VFS_UNLOCK_GIANT(vfslocked);
	}
	/*
	 * Clear all uses of the tracefile.
	 */
	if (ops == KTROP_CLEARFILE) {
		int vrele_count;

		vrele_count = 0;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_tracevp == vp) {
				if (ktrcanset(td, p)) {
					mtx_lock(&ktrace_mtx);
					cred = p->p_tracecred;
					p->p_tracecred = NULL;
					p->p_tracevp = NULL;
					p->p_traceflag = 0;
					mtx_unlock(&ktrace_mtx);
					vrele_count++;
					crfree(cred);
				} else
					error = EPERM;
			}
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (vrele_count > 0) {
			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
			while (vrele_count-- > 0)
				vrele(vp);
			VFS_UNLOCK_GIANT(vfslocked);
		}
		goto done;
	}
	/*
	 * do it
	 */
	sx_slock(&proctree_lock);
	if (uap->pid < 0) {
		/*
		 * by process group
		 */
		pg = pgfind(-uap->pid);
		if (pg == NULL) {
			sx_sunlock(&proctree_lock);
			error = ESRCH;
			goto done;
		}
		/*
		 * ktrops() may call vrele(). Lock pg_members
		 * by the proctree_lock rather than pg_mtx.
		 */
		PGRP_UNLOCK(pg);
		nfound = 0;
		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
			PROC_LOCK(p);
			if (p_cansee(td, p) != 0) {
				PROC_UNLOCK(p);
				continue;
			}
			PROC_UNLOCK(p);
			nfound++;
			if (descend)
				ret |= ktrsetchildren(td, p, ops, facs, vp);
			else
				ret |= ktrops(td, p, ops, facs, vp);
		}
		if (nfound == 0) {
			sx_sunlock(&proctree_lock);
			error = ESRCH;
			goto done;
		}
	} else {
		/*
		 * by pid
		 */
		p = pfind(uap->pid);
		if (p == NULL) {
			sx_sunlock(&proctree_lock);
			error = ESRCH;
			goto done;
		}
		error = p_cansee(td, p);
		/*
		 * The slock of the proctree lock will keep this process
		 * from going away, so unlocking the proc here is ok.
		 */
		PROC_UNLOCK(p);
		if (error) {
			sx_sunlock(&proctree_lock);
			goto done;
		}
		if (descend)
			ret |= ktrsetchildren(td, p, ops, facs, vp);
		else
			ret |= ktrops(td, p, ops, facs, vp);
	}
	sx_sunlock(&proctree_lock);
	if (!ret)
		error = EPERM;
done:
	if (vp != NULL) {
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		(void) vn_close(vp, FWRITE, td->td_ucred, td);
		VFS_UNLOCK_GIANT(vfslocked);
	}
	ktrace_exit(td);
	return (error);
#else /* !KTRACE */
	return (ENOSYS);
#endif /* KTRACE */
}

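/*
 * utrace(2): allow a user process to append its own record (KTR_USER) of
 * up to KTR_USER_MAXLEN bytes to the trace file.
 */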
/* ARGSUSED */
int
utrace(td, uap)
	struct thread *td;
	register struct utrace_args *uap;
{

#ifdef KTRACE
	struct ktr_request *req;
	void *cp;
	int error;

	if (!KTRPOINT(td, KTR_USER))
		return (0);
	if (uap->len > KTR_USER_MAXLEN)
		return (EINVAL);
	cp = malloc(uap->len, M_KTRACE, M_WAITOK);
	error = copyin(uap->addr, cp, uap->len);
	if (error) {
		free(cp, M_KTRACE);
		return (error);
	}
	req = ktr_getrequest(KTR_USER);
	if (req == NULL) {
		free(cp, M_KTRACE);
		return (ENOMEM);
	}
	req->ktr_buffer = cp;
	req->ktr_header.ktr_len = uap->len;
	ktr_submitrequest(td, req);
	return (0);
#else /* !KTRACE */
	return (ENOSYS);
#endif /* KTRACE */
}
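
/*
 * Editorial sketch (illustrative only, not compiled): a user application
 * emits such a record with utrace(2), e.g.
 *
 *	struct mymark m;	(a hypothetical application-defined type)
 *	...
 *	(void)utrace(&m, sizeof(m));
 *
 * subject to the KTR_USER_MAXLEN limit enforced above.
 */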

#ifdef KTRACE
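/*
 * Apply a single set/clear operation to one process, installing or
 * releasing the trace vnode and credential as needed.
 */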
static int
ktrops(td, p, ops, facs, vp)
	struct thread *td;
	struct proc *p;
	int ops, facs;
	struct vnode *vp;
{
	struct vnode *tracevp = NULL;
	struct ucred *tracecred = NULL;

	PROC_LOCK(p);
	if (!ktrcanset(td, p)) {
		PROC_UNLOCK(p);
		return (0);
	}
	mtx_lock(&ktrace_mtx);
	if (ops == KTROP_SET) {
		if (p->p_tracevp != vp) {
			/*
			 * if trace file already in use, relinquish below
			 */
			tracevp = p->p_tracevp;
			VREF(vp);
			p->p_tracevp = vp;
		}
		if (p->p_tracecred != td->td_ucred) {
			tracecred = p->p_tracecred;
			p->p_tracecred = crhold(td->td_ucred);
		}
		p->p_traceflag |= facs;
		if (priv_check(td, PRIV_KTRACE) == 0)
			p->p_traceflag |= KTRFAC_ROOT;
	} else {
		/* KTROP_CLEAR */
		if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
			/* no more tracing */
			p->p_traceflag = 0;
			tracevp = p->p_tracevp;
			p->p_tracevp = NULL;
			tracecred = p->p_tracecred;
			p->p_tracecred = NULL;
		}
	}
	mtx_unlock(&ktrace_mtx);
	PROC_UNLOCK(p);
	if (tracevp != NULL) {
		int vfslocked;

		vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
		vrele(tracevp);
		VFS_UNLOCK_GIANT(vfslocked);
	}
	if (tracecred != NULL)
		crfree(tracecred);

	return (1);
}

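/*
 * Apply ktrops() to a process and all of its current descendants.
 */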
static int
ktrsetchildren(td, top, ops, facs, vp)
	struct thread *td;
	struct proc *top;
	int ops, facs;
	struct vnode *vp;
{
	register struct proc *p;
	register int ret = 0;

	p = top;
	sx_assert(&proctree_lock, SX_LOCKED);
	for (;;) {
		ret |= ktrops(td, p, ops, facs, vp);
		/*
		 * If this process has children, descend to them next,
		 * otherwise do any siblings, and if done with this level,
		 * follow back up the tree (but not past top).
		 */
		if (!LIST_EMPTY(&p->p_children))
			p = LIST_FIRST(&p->p_children);
		else for (;;) {
			if (p == top)
				return (ret);
			if (LIST_NEXT(p, p_sibling)) {
				p = LIST_NEXT(p, p_sibling);
				break;
			}
			p = p->p_pptr;
		}
	}
	/*NOTREACHED*/
}

static void
ktr_writerequest(struct thread *td, struct ktr_request *req)
{
	struct ktr_header *kth;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct uio auio;
	struct iovec aiov[3];
	struct mount *mp;
	int datalen, buflen, vrele_count;
	int error, vfslocked;

	/*
	 * We hold the vnode and credential for use in I/O in case ktrace is
	 * disabled on the process as we write out the request.
	 *
	 * XXXRW: This is not ideal: we could end up performing a write after
	 * the vnode has been closed.
	 */
	mtx_lock(&ktrace_mtx);
	vp = td->td_proc->p_tracevp;
	cred = td->td_proc->p_tracecred;

	/*
	 * If vp is NULL, the vp has been cleared out from under this
	 * request, so just drop it.  Make sure the credential and vnode are
	 * in sync: we should have both or neither.
	 */
	if (vp == NULL) {
		KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
		mtx_unlock(&ktrace_mtx);
		return;
	}
	VREF(vp);
	KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
	crhold(cred);
	mtx_unlock(&ktrace_mtx);

	kth = &req->ktr_header;
	KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
	    sizeof(data_lengths) / sizeof(data_lengths[0]),
	    ("data_lengths array overflow"));
	datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
	buflen = kth->ktr_len;
	auio.uio_iov = &aiov[0];
	auio.uio_offset = 0;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	aiov[0].iov_base = (caddr_t)kth;
	aiov[0].iov_len = sizeof(struct ktr_header);
	auio.uio_resid = sizeof(struct ktr_header);
	auio.uio_iovcnt = 1;
	auio.uio_td = td;
	if (datalen != 0) {
		aiov[1].iov_base = (caddr_t)&req->ktr_data;
		aiov[1].iov_len = datalen;
		auio.uio_resid += datalen;
		auio.uio_iovcnt++;
		kth->ktr_len += datalen;
	}
	if (buflen != 0) {
		KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
		aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
		aiov[auio.uio_iovcnt].iov_len = buflen;
		auio.uio_resid += buflen;
		auio.uio_iovcnt++;
	}

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
	error = mac_vnode_check_write(cred, NOCRED, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	crfree(cred);
	if (!error) {
		vrele(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return;
	}
	VFS_UNLOCK_GIANT(vfslocked);

	/*
	 * If error encountered, give up tracing on this vnode.  We defer
	 * all the vrele()'s on the vnode until after we are finished walking
	 * the various lists to avoid needlessly holding locks.
	 * NB: at this point we still hold the vnode reference that must
	 * not go away as we need the valid vnode to compare with. Thus let
	 * vrele_count start at 1 and the reference will be freed
	 * by the loop at the end after our last use of vp.
	 */
	log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
	    error);
	vrele_count = 1;
	/*
	 * First, clear this vnode from being used by any processes in the
	 * system.
	 * XXX - If one process gets an EPERM writing to the vnode, should
	 * we really do this?  Other processes might have suitable
	 * credentials for the operation.
	 */
	cred = NULL;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_tracevp == vp) {
			mtx_lock(&ktrace_mtx);
			p->p_tracevp = NULL;
			p->p_traceflag = 0;
			cred = p->p_tracecred;
			p->p_tracecred = NULL;
			mtx_unlock(&ktrace_mtx);
			vrele_count++;
		}
		PROC_UNLOCK(p);
		if (cred != NULL) {
			crfree(cred);
			cred = NULL;
		}
	}
	sx_sunlock(&allproc_lock);

	/*
	 * We can't clear any pending requests in threads that have cached
	 * them but not yet committed them, as those are per-thread.  The
	 * thread will have to clear it itself on system call return.
	 */
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	while (vrele_count-- > 0)
		vrele(vp);
	VFS_UNLOCK_GIANT(vfslocked);
}

/*
 * Return true if caller has permission to set the ktracing state
 * of target.  Essentially, the target can't possess any
 * more permissions than the caller.  KTRFAC_ROOT signifies that
 * root previously set the tracing status on the target process, and
 * so, only root may further change it.
 */
static int
ktrcanset(td, targetp)
	struct thread *td;
	struct proc *targetp;
{

	PROC_LOCK_ASSERT(targetp, MA_OWNED);
	if (targetp->p_traceflag & KTRFAC_ROOT &&
	    priv_check(td, PRIV_KTRACE))
		return (0);

	if (p_candebug(td, targetp) != 0)
		return (0);

	return (1);
}

#endif /* KTRACE */