kern_descrip.c revision 271183
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 271183 2014-09-05 23:56:25Z mjg $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_ddb.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47
48#include <sys/capsicum.h>
49#include <sys/conf.h>
50#include <sys/domain.h>
51#include <sys/fcntl.h>
52#include <sys/file.h>
53#include <sys/filedesc.h>
54#include <sys/filio.h>
55#include <sys/jail.h>
56#include <sys/kernel.h>
57#include <sys/ksem.h>
58#include <sys/limits.h>
59#include <sys/lock.h>
60#include <sys/malloc.h>
61#include <sys/mman.h>
62#include <sys/mount.h>
63#include <sys/mqueue.h>
64#include <sys/mutex.h>
65#include <sys/namei.h>
66#include <sys/selinfo.h>
67#include <sys/pipe.h>
68#include <sys/priv.h>
69#include <sys/proc.h>
70#include <sys/procdesc.h>
71#include <sys/protosw.h>
72#include <sys/racct.h>
73#include <sys/resourcevar.h>
74#include <sys/sbuf.h>
75#include <sys/signalvar.h>
76#include <sys/socketvar.h>
77#include <sys/stat.h>
78#include <sys/sx.h>
79#include <sys/syscallsubr.h>
80#include <sys/sysctl.h>
81#include <sys/sysproto.h>
82#include <sys/tty.h>
83#include <sys/unistd.h>
84#include <sys/un.h>
85#include <sys/unpcb.h>
86#include <sys/user.h>
87#include <sys/vnode.h>
88#ifdef KTRACE
89#include <sys/ktrace.h>
90#endif
91
92#include <net/vnet.h>
93
94#include <netinet/in.h>
95#include <netinet/in_pcb.h>
96
97#include <security/audit/audit.h>
98
99#include <vm/uma.h>
100#include <vm/vm.h>
101
102#include <ddb/ddb.h>
103
104static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
105static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
106    "file desc to leader structures");
107static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
108MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
109
110MALLOC_DECLARE(M_FADVISE);
111
112static uma_zone_t file_zone;
113
114void	(*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
115
116static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
117		    struct thread *td, int holdleaders);
118static int	fd_first_free(struct filedesc *fdp, int low, int size);
119static int	fd_last_used(struct filedesc *fdp, int size);
120static void	fdgrowtable(struct filedesc *fdp, int nfd);
121static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
122static void	fdunused(struct filedesc *fdp, int fd);
123static void	fdused(struct filedesc *fdp, int fd);
124static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
125static int	fill_procdesc_info(struct procdesc *pdp,
126		    struct kinfo_file *kif);
127static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
128static int	fill_sem_info(struct file *fp, struct kinfo_file *kif);
129static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
130static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
131static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
132static int	getmaxfd(struct proc *p);
133
134/*
135 * Each process has:
136 *
137 * - An array of open file descriptors (fd_ofiles)
138 * - An array of file flags (fd_ofileflags)
139 * - A bitmap recording which descriptors are in use (fd_map)
140 *
141 * A process starts out with NDFILE descriptors.  The value of NDFILE has
142 * been selected based on the historical limit of 20 open files, and an
143 * assumption that the majority of processes, especially short-lived
144 * processes like shells, will never need more.
145 *
146 * If this initial allocation is exhausted, a larger descriptor table and
147 * map are allocated dynamically, and the pointers in the process's struct
148 * filedesc are updated to point to those.  This is repeated every time
149 * the process runs out of file descriptors (provided it hasn't hit its
150 * resource limit).
151 *
152 * Since threads may hold references to individual descriptor table
153 * entries, the tables are never freed.  Instead, they are placed on a
154 * linked list and freed only when the struct filedesc is released.
155 */
156#define NDFILE		20
157#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
158#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
159#define NDSLOT(x)	((x) / NDENTRIES)
160#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
161#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
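/*
 * Worked example of the macros above, assuming a 64-bit NDSLOTTYPE so that
 * NDENTRIES == 64:
 *
 *	NDSLOT(70)  == 70 / 64 == 1		(fd 70 lives in map word 1)
 *	NDBIT(70)   == (NDSLOTTYPE)1 << (70 % 64) == 1 << 6
 *	NDSLOTS(20) == (20 + 63) / 64 == 1	(NDFILE fits in a single word)
 *
 * Marking fd 70 as used is thus fd_map[1] |= NDBIT(70), which is exactly
 * what fdused() below does.
 */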
162
163/*
164 * SLIST entry used to keep track of ofiles which must be reclaimed when
165 * the process exits.
166 */
167struct freetable {
168	struct filedescent *ft_table;
169	SLIST_ENTRY(freetable) ft_next;
170};
171
172/*
173 * Initial allocation: a filedesc structure + the head of SLIST used to
174 * keep track of old ofiles + enough space for NDFILE descriptors.
175 */
176struct filedesc0 {
177	struct filedesc fd_fd;
178	SLIST_HEAD(, freetable) fd_free;
179	struct	filedescent fd_dfiles[NDFILE];
180	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
181};
182
183/*
184 * Descriptor management.
185 */
186volatile int openfiles;			/* actual number of open files */
187struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
188void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
189
190/* A mutex to protect the association between a proc and filedesc. */
191static struct mtx fdesc_mtx;
192
193/*
194 * If low >= size, just return low. Otherwise find the first zero bit in the
195 * given bitmap, starting at low and not exceeding size - 1. Return size if
196 * not found.
197 */
198static int
199fd_first_free(struct filedesc *fdp, int low, int size)
200{
201	NDSLOTTYPE *map = fdp->fd_map;
202	NDSLOTTYPE mask;
203	int off, maxoff;
204
205	if (low >= size)
206		return (low);
207
208	off = NDSLOT(low);
209	if (low % NDENTRIES) {
210		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
211		if ((mask &= ~map[off]) != 0UL)
212			return (off * NDENTRIES + ffsl(mask) - 1);
213		++off;
214	}
215	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
216		if (map[off] != ~0UL)
217			return (off * NDENTRIES + ffsl(~map[off]) - 1);
218	return (size);
219}
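/*
 * Worked example of the partial-word mask above, again assuming
 * NDENTRIES == 64.  For low == 3:
 *
 *	~(NDSLOTTYPE)0 >> (64 - 3) == 0x7	(the low 3 bits set)
 *	mask == ~0x7				(bits 3..63 set)
 *
 * ANDing the mask with ~map[off] keeps only the free descriptors at index
 * >= low within that word; ffsl() then returns the lowest such bit
 * (1-based), yielding the first free fd.  Whole words past that point are
 * scanned by the simpler loop in fd_first_free() above.
 */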
220
221/*
222 * Find the highest non-zero bit in the given bitmap, starting at 0 and
223 * not exceeding size - 1. Return -1 if not found.
224 */
225static int
226fd_last_used(struct filedesc *fdp, int size)
227{
228	NDSLOTTYPE *map = fdp->fd_map;
229	NDSLOTTYPE mask;
230	int off, minoff;
231
232	off = NDSLOT(size);
233	if (size % NDENTRIES) {
234		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
235		if ((mask &= map[off]) != 0)
236			return (off * NDENTRIES + flsl(mask) - 1);
237		--off;
238	}
239	for (minoff = NDSLOT(0); off >= minoff; --off)
240		if (map[off] != 0)
241			return (off * NDENTRIES + flsl(map[off]) - 1);
242	return (-1);
243}
244
245static int
246fdisused(struct filedesc *fdp, int fd)
247{
248
249	FILEDESC_LOCK_ASSERT(fdp);
250
251	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
252	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
253
254	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
255}
256
257/*
258 * Mark a file descriptor as used.
259 */
260static void
261fdused(struct filedesc *fdp, int fd)
262{
263
264	FILEDESC_XLOCK_ASSERT(fdp);
265
266	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
267
268	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
269	if (fd > fdp->fd_lastfile)
270		fdp->fd_lastfile = fd;
271	if (fd == fdp->fd_freefile)
272		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
273}
274
275/*
276 * Mark a file descriptor as unused.
277 */
278static void
279fdunused(struct filedesc *fdp, int fd)
280{
281
282	FILEDESC_XLOCK_ASSERT(fdp);
283
284	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
285	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
286	    ("fd=%d is still in use", fd));
287
288	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
289	if (fd < fdp->fd_freefile)
290		fdp->fd_freefile = fd;
291	if (fd == fdp->fd_lastfile)
292		fdp->fd_lastfile = fd_last_used(fdp, fd);
293}
294
295/*
296 * Free a file descriptor.
297 *
298 * Avoid some work if fdp is about to be destroyed.
299 */
300static inline void
301_fdfree(struct filedesc *fdp, int fd, int last)
302{
303	struct filedescent *fde;
304
305	fde = &fdp->fd_ofiles[fd];
306	filecaps_free(&fde->fde_caps);
307	if (last)
308		return;
309	bzero(fde, sizeof(*fde));
310	fdunused(fdp, fd);
311}
312
313static inline void
314fdfree(struct filedesc *fdp, int fd)
315{
316
317	_fdfree(fdp, fd, 0);
318}
319
320static inline void
321fdfree_last(struct filedesc *fdp, int fd)
322{
323
324	_fdfree(fdp, fd, 1);
325}
326
327/*
328 * System calls on descriptors.
329 */
330#ifndef _SYS_SYSPROTO_H_
331struct getdtablesize_args {
332	int	dummy;
333};
334#endif
335/* ARGSUSED */
336int
337sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
338{
339	struct proc *p = td->td_proc;
340	uint64_t lim;
341
342	PROC_LOCK(p);
343	td->td_retval[0] =
344	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
345	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
346	PROC_UNLOCK(p);
347	if (lim < td->td_retval[0])
348		td->td_retval[0] = lim;
349	return (0);
350}
351
352/*
353 * Duplicate a file descriptor to a particular value.
354 *
355 * Note: keep in mind that a potential race condition exists when closing
356 * descriptors from a shared descriptor table (via rfork).
357 */
358#ifndef _SYS_SYSPROTO_H_
359struct dup2_args {
360	u_int	from;
361	u_int	to;
362};
363#endif
364/* ARGSUSED */
365int
366sys_dup2(struct thread *td, struct dup2_args *uap)
367{
368
369	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
370		    td->td_retval));
371}
372
373/*
374 * Duplicate a file descriptor.
375 */
376#ifndef _SYS_SYSPROTO_H_
377struct dup_args {
378	u_int	fd;
379};
380#endif
381/* ARGSUSED */
382int
383sys_dup(struct thread *td, struct dup_args *uap)
384{
385
386	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
387}
388
389/*
390 * The file control system call.
391 */
392#ifndef _SYS_SYSPROTO_H_
393struct fcntl_args {
394	int	fd;
395	int	cmd;
396	long	arg;
397};
398#endif
399/* ARGSUSED */
400int
401sys_fcntl(struct thread *td, struct fcntl_args *uap)
402{
403	struct flock fl;
404	struct __oflock ofl;
405	intptr_t arg;
406	int error;
407	int cmd;
408
409	error = 0;
410	cmd = uap->cmd;
411	switch (uap->cmd) {
412	case F_OGETLK:
413	case F_OSETLK:
414	case F_OSETLKW:
415		/*
416		 * Convert old flock structure to new.
417		 */
418		error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
419		fl.l_start = ofl.l_start;
420		fl.l_len = ofl.l_len;
421		fl.l_pid = ofl.l_pid;
422		fl.l_type = ofl.l_type;
423		fl.l_whence = ofl.l_whence;
424		fl.l_sysid = 0;
425
426		switch (uap->cmd) {
427		case F_OGETLK:
428		    cmd = F_GETLK;
429		    break;
430		case F_OSETLK:
431		    cmd = F_SETLK;
432		    break;
433		case F_OSETLKW:
434		    cmd = F_SETLKW;
435		    break;
436		}
437		arg = (intptr_t)&fl;
438		break;
439	case F_GETLK:
440	case F_SETLK:
441	case F_SETLKW:
442	case F_SETLK_REMOTE:
443		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
444		arg = (intptr_t)&fl;
445		break;
446	default:
447		arg = uap->arg;
448		break;
449	}
450	if (error)
451		return (error);
452	error = kern_fcntl(td, uap->fd, cmd, arg);
453	if (error)
454		return (error);
455	if (uap->cmd == F_OGETLK) {
456		ofl.l_start = fl.l_start;
457		ofl.l_len = fl.l_len;
458		ofl.l_pid = fl.l_pid;
459		ofl.l_type = fl.l_type;
460		ofl.l_whence = fl.l_whence;
461		error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
462	} else if (uap->cmd == F_GETLK) {
463		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
464	}
465	return (error);
466}
467
468int
469kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
470{
471	struct filedesc *fdp;
472	struct flock *flp;
473	struct file *fp, *fp2;
474	struct filedescent *fde;
475	struct proc *p;
476	struct vnode *vp;
477	cap_rights_t rights;
478	int error, flg, tmp;
479	uint64_t bsize;
480	off_t foffset;
481
482	error = 0;
483	flg = F_POSIX;
484	p = td->td_proc;
485	fdp = p->p_fd;
486
487	switch (cmd) {
488	case F_DUPFD:
489		tmp = arg;
490		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
491		break;
492
493	case F_DUPFD_CLOEXEC:
494		tmp = arg;
495		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
496		    td->td_retval);
497		break;
498
499	case F_DUP2FD:
500		tmp = arg;
501		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
502		break;
503
504	case F_DUP2FD_CLOEXEC:
505		tmp = arg;
506		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
507		    td->td_retval);
508		break;
509
510	case F_GETFD:
511		FILEDESC_SLOCK(fdp);
512		if (fget_locked(fdp, fd) == NULL) {
513			FILEDESC_SUNLOCK(fdp);
514			error = EBADF;
515			break;
516		}
517		fde = &fdp->fd_ofiles[fd];
518		td->td_retval[0] =
519		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
520		FILEDESC_SUNLOCK(fdp);
521		break;
522
523	case F_SETFD:
524		FILEDESC_XLOCK(fdp);
525		if (fget_locked(fdp, fd) == NULL) {
526			FILEDESC_XUNLOCK(fdp);
527			error = EBADF;
528			break;
529		}
530		fde = &fdp->fd_ofiles[fd];
531		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
532		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
533		FILEDESC_XUNLOCK(fdp);
534		break;
535
536	case F_GETFL:
537		error = fget_unlocked(fdp, fd,
538		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
539		if (error != 0)
540			break;
541		td->td_retval[0] = OFLAGS(fp->f_flag);
542		fdrop(fp, td);
543		break;
544
545	case F_SETFL:
546		error = fget_unlocked(fdp, fd,
547		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
548		if (error != 0)
549			break;
550		do {
551			tmp = flg = fp->f_flag;
552			tmp &= ~FCNTLFLAGS;
553			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
554		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
555		tmp = fp->f_flag & FNONBLOCK;
556		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
557		if (error != 0) {
558			fdrop(fp, td);
559			break;
560		}
561		tmp = fp->f_flag & FASYNC;
562		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
563		if (error == 0) {
564			fdrop(fp, td);
565			break;
566		}
567		atomic_clear_int(&fp->f_flag, FNONBLOCK);
568		tmp = 0;
569		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
570		fdrop(fp, td);
571		break;
572
573	case F_GETOWN:
574		error = fget_unlocked(fdp, fd,
575		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
576		if (error != 0)
577			break;
578		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
579		if (error == 0)
580			td->td_retval[0] = tmp;
581		fdrop(fp, td);
582		break;
583
584	case F_SETOWN:
585		error = fget_unlocked(fdp, fd,
586		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
587		if (error != 0)
588			break;
589		tmp = arg;
590		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
591		fdrop(fp, td);
592		break;
593
594	case F_SETLK_REMOTE:
595		error = priv_check(td, PRIV_NFS_LOCKD);
596		if (error)
597			return (error);
598		flg = F_REMOTE;
599		goto do_setlk;
600
601	case F_SETLKW:
602		flg |= F_WAIT;
603		/* FALLTHROUGH F_SETLK */
604
605	case F_SETLK:
606	do_setlk:
607		cap_rights_init(&rights, CAP_FLOCK);
608		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
609		if (error != 0)
610			break;
611		if (fp->f_type != DTYPE_VNODE) {
612			error = EBADF;
613			fdrop(fp, td);
614			break;
615		}
616
617		flp = (struct flock *)arg;
618		if (flp->l_whence == SEEK_CUR) {
619			foffset = foffset_get(fp);
620			if (foffset < 0 ||
621			    (flp->l_start > 0 &&
622			     foffset > OFF_MAX - flp->l_start)) {
623				FILEDESC_SUNLOCK(fdp);
624				error = EOVERFLOW;
625				fdrop(fp, td);
626				break;
627			}
628			flp->l_start += foffset;
629		}
630
631		vp = fp->f_vnode;
632		switch (flp->l_type) {
633		case F_RDLCK:
634			if ((fp->f_flag & FREAD) == 0) {
635				error = EBADF;
636				break;
637			}
638			PROC_LOCK(p->p_leader);
639			p->p_leader->p_flag |= P_ADVLOCK;
640			PROC_UNLOCK(p->p_leader);
641			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
642			    flp, flg);
643			break;
644		case F_WRLCK:
645			if ((fp->f_flag & FWRITE) == 0) {
646				error = EBADF;
647				break;
648			}
649			PROC_LOCK(p->p_leader);
650			p->p_leader->p_flag |= P_ADVLOCK;
651			PROC_UNLOCK(p->p_leader);
652			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
653			    flp, flg);
654			break;
655		case F_UNLCK:
656			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
657			    flp, flg);
658			break;
659		case F_UNLCKSYS:
660			/*
661			 * Temporary api for testing remote lock
662			 * infrastructure.
663			 */
664			if (flg != F_REMOTE) {
665				error = EINVAL;
666				break;
667			}
668			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
669			    F_UNLCKSYS, flp, flg);
670			break;
671		default:
672			error = EINVAL;
673			break;
674		}
675		if (error != 0 || flp->l_type == F_UNLCK ||
676		    flp->l_type == F_UNLCKSYS) {
677			fdrop(fp, td);
678			break;
679		}
680
681		/*
682		 * Check for a race with close.
683		 *
684		 * The vnode is now advisory locked (or unlocked, but this case
685		 * is not really important) as the caller requested.
686		 * We had to drop the filedesc lock, so we need to recheck if
687		 * the descriptor is still valid: if it was closed in the
688		 * meantime, we need to remove the advisory lock from the
689		 * vnode, because a close on any descriptor referencing an
690		 * advisory-locked vnode removes that lock.
691		 * We will return 0 on purpose in that case, as the result of
692		 * successful advisory lock might have been externally visible
693		 * already. This is fine - effectively we pretend to the caller
694		 * that the closing thread was a bit slower and that the
695		 * advisory lock succeeded before the close.
696		 */
697		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
698		if (error != 0) {
699			fdrop(fp, td);
700			break;
701		}
702		if (fp != fp2) {
703			flp->l_whence = SEEK_SET;
704			flp->l_start = 0;
705			flp->l_len = 0;
706			flp->l_type = F_UNLCK;
707			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
708			    F_UNLCK, flp, F_POSIX);
709		}
710		fdrop(fp, td);
711		fdrop(fp2, td);
712		break;
713
714	case F_GETLK:
715		error = fget_unlocked(fdp, fd,
716		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
717		if (error != 0)
718			break;
719		if (fp->f_type != DTYPE_VNODE) {
720			error = EBADF;
721			fdrop(fp, td);
722			break;
723		}
724		flp = (struct flock *)arg;
725		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
726		    flp->l_type != F_UNLCK) {
727			error = EINVAL;
728			fdrop(fp, td);
729			break;
730		}
731		if (flp->l_whence == SEEK_CUR) {
732			foffset = foffset_get(fp);
733			if ((flp->l_start > 0 &&
734			    foffset > OFF_MAX - flp->l_start) ||
735			    (flp->l_start < 0 &&
736			     foffset < OFF_MIN - flp->l_start)) {
737				FILEDESC_SUNLOCK(fdp);
738				error = EOVERFLOW;
739				fdrop(fp, td);
740				break;
741			}
742			flp->l_start += foffset;
743		}
744		vp = fp->f_vnode;
745		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
746		    F_POSIX);
747		fdrop(fp, td);
748		break;
749
750	case F_RDAHEAD:
751		arg = arg ? 128 * 1024: 0;
752		/* FALLTHROUGH */
753	case F_READAHEAD:
754		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
755		if (error != 0)
756			break;
757		if (fp->f_type != DTYPE_VNODE) {
758			fdrop(fp, td);
759			error = EBADF;
760			break;
761		}
762		vp = fp->f_vnode;
763		/*
764		 * Exclusive lock synchronizes against f_seqcount reads and
765		 * writes in sequential_heuristic().
766		 */
767		error = vn_lock(vp, LK_EXCLUSIVE);
768		if (error != 0) {
769			fdrop(fp, td);
770			break;
771		}
772		if (arg >= 0) {
773			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
774			fp->f_seqcount = (arg + bsize - 1) / bsize;
775			atomic_set_int(&fp->f_flag, FRDAHEAD);
776		} else {
777			atomic_clear_int(&fp->f_flag, FRDAHEAD);
778		}
779		VOP_UNLOCK(vp, 0);
780		fdrop(fp, td);
781		break;
782
783	default:
784		error = EINVAL;
785		break;
786	}
787	return (error);
788}
789
790static int
791getmaxfd(struct proc *p)
792{
793	int maxfd;
794
795	PROC_LOCK(p);
796	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
797	PROC_UNLOCK(p);
798
799	return (maxfd);
800}
801
802/*
803 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
804 */
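/*
 * For reference, the callers in this file map onto the flags argument of
 * do_dup() as follows:
 *
 *	dup(fd)				do_dup(td, 0, fd, 0, ...)
 *	dup2(from, to)			do_dup(td, DUP_FIXED, from, to, ...)
 *	fcntl(F_DUPFD)			do_dup(td, DUP_FCNTL, fd, arg, ...)
 *	fcntl(F_DUPFD_CLOEXEC)		do_dup(td, DUP_FCNTL | DUP_CLOEXEC, ...)
 *	fcntl(F_DUP2FD)			do_dup(td, DUP_FIXED, fd, arg, ...)
 *	fcntl(F_DUP2FD_CLOEXEC)		do_dup(td, DUP_FIXED | DUP_CLOEXEC, ...)
 *
 * DUP_FIXED requests a specific target descriptor, DUP_FCNTL only changes
 * the out-of-range error from EBADF to EINVAL, and DUP_CLOEXEC additionally
 * sets UF_EXCLOSE on the new descriptor.
 */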
805int
806do_dup(struct thread *td, int flags, int old, int new,
807    register_t *retval)
808{
809	struct filedesc *fdp;
810	struct filedescent *oldfde, *newfde;
811	struct proc *p;
812	struct file *fp;
813	struct file *delfp;
814	int error, maxfd;
815
816	p = td->td_proc;
817	fdp = p->p_fd;
818
819	/*
820	 * Verify we have a valid descriptor to dup from and possibly to
821	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
822	 * return EINVAL when the new descriptor is out of bounds.
823	 */
824	if (old < 0)
825		return (EBADF);
826	if (new < 0)
827		return (flags & DUP_FCNTL ? EINVAL : EBADF);
828	maxfd = getmaxfd(p);
829	if (new >= maxfd)
830		return (flags & DUP_FCNTL ? EINVAL : EBADF);
831
832	FILEDESC_XLOCK(fdp);
833	if (fget_locked(fdp, old) == NULL) {
834		FILEDESC_XUNLOCK(fdp);
835		return (EBADF);
836	}
837	oldfde = &fdp->fd_ofiles[old];
838	if (flags & DUP_FIXED && old == new) {
839		*retval = new;
840		if (flags & DUP_CLOEXEC)
841			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
842		FILEDESC_XUNLOCK(fdp);
843		return (0);
844	}
845	fp = oldfde->fde_file;
846	fhold(fp);
847
848	/*
849	 * If the caller specified a file descriptor, make sure the file
850	 * table is large enough to hold it, and grab it.  Otherwise, just
851	 * allocate a new descriptor the usual way.
852	 */
853	if (flags & DUP_FIXED) {
854		if (new >= fdp->fd_nfiles) {
855			/*
856			 * The resource limits are here instead of e.g.
857			 * fdalloc(), because the file descriptor table may be
858			 * shared between processes, so we can't really use
859			 * racct_add()/racct_sub().  Instead of counting the
860			 * number of actually allocated descriptors, just put
861			 * the limit on the size of the file descriptor table.
862			 */
863#ifdef RACCT
864			PROC_LOCK(p);
865			error = racct_set(p, RACCT_NOFILE, new + 1);
866			PROC_UNLOCK(p);
867			if (error != 0) {
868				FILEDESC_XUNLOCK(fdp);
869				fdrop(fp, td);
870				return (EMFILE);
871			}
872#endif
873			fdgrowtable_exp(fdp, new + 1);
874			oldfde = &fdp->fd_ofiles[old];
875		}
876		newfde = &fdp->fd_ofiles[new];
877		if (newfde->fde_file == NULL)
878			fdused(fdp, new);
879	} else {
880		if ((error = fdalloc(td, new, &new)) != 0) {
881			FILEDESC_XUNLOCK(fdp);
882			fdrop(fp, td);
883			return (error);
884		}
885		newfde = &fdp->fd_ofiles[new];
886	}
887
888	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
889	KASSERT(old != new, ("new fd is same as old"));
890
891	delfp = newfde->fde_file;
892
893	/*
894	 * Duplicate the source descriptor.
895	 */
896	filecaps_free(&newfde->fde_caps);
897	*newfde = *oldfde;
898	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
899	if ((flags & DUP_CLOEXEC) != 0)
900		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
901	else
902		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
903	*retval = new;
904
905	if (delfp != NULL) {
906		(void) closefp(fdp, new, delfp, td, 1);
907		/* closefp() drops the FILEDESC lock for us. */
908	} else {
909		FILEDESC_XUNLOCK(fdp);
910	}
911
912	return (0);
913}
914
915/*
916 * If sigio is on the list associated with a process or process group,
917 * disable signalling from the device, remove sigio from the list and
918 * free sigio.
919 */
920void
921funsetown(struct sigio **sigiop)
922{
923	struct sigio *sigio;
924
925	SIGIO_LOCK();
926	sigio = *sigiop;
927	if (sigio == NULL) {
928		SIGIO_UNLOCK();
929		return;
930	}
931	*(sigio->sio_myref) = NULL;
932	if ((sigio)->sio_pgid < 0) {
933		struct pgrp *pg = (sigio)->sio_pgrp;
934		PGRP_LOCK(pg);
935		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
936			     sigio, sio_pgsigio);
937		PGRP_UNLOCK(pg);
938	} else {
939		struct proc *p = (sigio)->sio_proc;
940		PROC_LOCK(p);
941		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
942			     sigio, sio_pgsigio);
943		PROC_UNLOCK(p);
944	}
945	SIGIO_UNLOCK();
946	crfree(sigio->sio_ucred);
947	free(sigio, M_SIGIO);
948}
949
950/*
951 * Free a list of sigio structures.
952 * We only need to lock the SIGIO_LOCK because we have made ourselves
953 * inaccessible to callers of fsetown and therefore do not need to lock
954 * the proc or pgrp struct for the list manipulation.
955 */
956void
957funsetownlst(struct sigiolst *sigiolst)
958{
959	struct proc *p;
960	struct pgrp *pg;
961	struct sigio *sigio;
962
963	sigio = SLIST_FIRST(sigiolst);
964	if (sigio == NULL)
965		return;
966	p = NULL;
967	pg = NULL;
968
969	/*
970	 * Every entry of the list should belong
971	 * to a single proc or pgrp.
972	 */
973	if (sigio->sio_pgid < 0) {
974		pg = sigio->sio_pgrp;
975		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
976	} else /* if (sigio->sio_pgid > 0) */ {
977		p = sigio->sio_proc;
978		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
979	}
980
981	SIGIO_LOCK();
982	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
983		*(sigio->sio_myref) = NULL;
984		if (pg != NULL) {
985			KASSERT(sigio->sio_pgid < 0,
986			    ("Proc sigio in pgrp sigio list"));
987			KASSERT(sigio->sio_pgrp == pg,
988			    ("Bogus pgrp in sigio list"));
989			PGRP_LOCK(pg);
990			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
991			    sio_pgsigio);
992			PGRP_UNLOCK(pg);
993		} else /* if (p != NULL) */ {
994			KASSERT(sigio->sio_pgid > 0,
995			    ("Pgrp sigio in proc sigio list"));
996			KASSERT(sigio->sio_proc == p,
997			    ("Bogus proc in sigio list"));
998			PROC_LOCK(p);
999			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
1000			    sio_pgsigio);
1001			PROC_UNLOCK(p);
1002		}
1003		SIGIO_UNLOCK();
1004		crfree(sigio->sio_ucred);
1005		free(sigio, M_SIGIO);
1006		SIGIO_LOCK();
1007	}
1008	SIGIO_UNLOCK();
1009}
1010
1011/*
1012 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
1013 *
1014 * After permission checking, add a sigio structure to the sigio list for
1015 * the process or process group.
1016 */
1017int
1018fsetown(pid_t pgid, struct sigio **sigiop)
1019{
1020	struct proc *proc;
1021	struct pgrp *pgrp;
1022	struct sigio *sigio;
1023	int ret;
1024
1025	if (pgid == 0) {
1026		funsetown(sigiop);
1027		return (0);
1028	}
1029
1030	ret = 0;
1031
1032	/* Allocate and fill in the new sigio out of locks. */
1033	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
1034	sigio->sio_pgid = pgid;
1035	sigio->sio_ucred = crhold(curthread->td_ucred);
1036	sigio->sio_myref = sigiop;
1037
1038	sx_slock(&proctree_lock);
1039	if (pgid > 0) {
1040		proc = pfind(pgid);
1041		if (proc == NULL) {
1042			ret = ESRCH;
1043			goto fail;
1044		}
1045
1046		/*
1047		 * Policy - Don't allow a process to FSETOWN a process
1048		 * in another session.
1049		 *
1050		 * Remove this test to allow maximum flexibility or
1051		 * restrict FSETOWN to the current process or process
1052		 * group for maximum safety.
1053		 */
1054		PROC_UNLOCK(proc);
1055		if (proc->p_session != curthread->td_proc->p_session) {
1056			ret = EPERM;
1057			goto fail;
1058		}
1059
1060		pgrp = NULL;
1061	} else /* if (pgid < 0) */ {
1062		pgrp = pgfind(-pgid);
1063		if (pgrp == NULL) {
1064			ret = ESRCH;
1065			goto fail;
1066		}
1067		PGRP_UNLOCK(pgrp);
1068
1069		/*
1070		 * Policy - Don't allow a process to FSETOWN a process
1071		 * in another session.
1072		 *
1073		 * Remove this test to allow maximum flexibility or
1074		 * restrict FSETOWN to the current process or process
1075		 * group for maximum safety.
1076		 */
1077		if (pgrp->pg_session != curthread->td_proc->p_session) {
1078			ret = EPERM;
1079			goto fail;
1080		}
1081
1082		proc = NULL;
1083	}
1084	funsetown(sigiop);
1085	if (pgid > 0) {
1086		PROC_LOCK(proc);
1087		/*
1088		 * Since funsetownlst() is called without the proctree
1089		 * locked, we need to check for P_WEXIT.
1090		 * XXX: is ESRCH correct?
1091		 */
1092		if ((proc->p_flag & P_WEXIT) != 0) {
1093			PROC_UNLOCK(proc);
1094			ret = ESRCH;
1095			goto fail;
1096		}
1097		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
1098		sigio->sio_proc = proc;
1099		PROC_UNLOCK(proc);
1100	} else {
1101		PGRP_LOCK(pgrp);
1102		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
1103		sigio->sio_pgrp = pgrp;
1104		PGRP_UNLOCK(pgrp);
1105	}
1106	sx_sunlock(&proctree_lock);
1107	SIGIO_LOCK();
1108	*sigiop = sigio;
1109	SIGIO_UNLOCK();
1110	return (0);
1111
1112fail:
1113	sx_sunlock(&proctree_lock);
1114	crfree(sigio->sio_ucred);
1115	free(sigio, M_SIGIO);
1116	return (ret);
1117}
1118
1119/*
1120 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1121 */
1122pid_t
1123fgetown(sigiop)
1124	struct sigio **sigiop;
1125{
1126	pid_t pgid;
1127
1128	SIGIO_LOCK();
1129	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1130	SIGIO_UNLOCK();
1131	return (pgid);
1132}
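/*
 * The pgid convention used by fsetown()/fgetown() mirrors the argument of
 * fcntl(2)'s F_SETOWN:
 *
 *	pgid > 0	deliver SIGIO/SIGURG to the process with that pid
 *	pgid < 0	deliver to the process group whose id is -pgid
 *	pgid == 0	clear the owner (fsetown() calls funsetown())
 *
 * For example, fcntl(fd, F_SETOWN, -getpgrp()) arrives here with a negative
 * pgid and takes the pgfind() branch in fsetown() above.
 */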
1133
1134/*
1135 * Function drops the filedesc lock on return.
1136 */
1137static int
1138closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1139    int holdleaders)
1140{
1141	int error;
1142
1143	FILEDESC_XLOCK_ASSERT(fdp);
1144
1145	if (holdleaders) {
1146		if (td->td_proc->p_fdtol != NULL) {
1147			/*
1148			 * Ask fdfree() to sleep to ensure that all relevant
1149			 * process leaders can be traversed in closef().
1150			 */
1151			fdp->fd_holdleaderscount++;
1152		} else {
1153			holdleaders = 0;
1154		}
1155	}
1156
1157	/*
1158	 * We now hold the fp reference that used to be owned by the
1159	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1160	 * knote_fdclose to prevent a race where the fd is re-opened, a knote
1161	 * is added, and the knote for the new fd is then deleted.
1162	 */
1163	knote_fdclose(td, fd);
1164
1165	/*
1166	 * We need to notify mqueue if the object is of type mqueue.
1167	 */
1168	if (fp->f_type == DTYPE_MQUEUE)
1169		mq_fdclose(td, fd, fp);
1170	FILEDESC_XUNLOCK(fdp);
1171
1172	error = closef(fp, td);
1173	if (holdleaders) {
1174		FILEDESC_XLOCK(fdp);
1175		fdp->fd_holdleaderscount--;
1176		if (fdp->fd_holdleaderscount == 0 &&
1177		    fdp->fd_holdleaderswakeup != 0) {
1178			fdp->fd_holdleaderswakeup = 0;
1179			wakeup(&fdp->fd_holdleaderscount);
1180		}
1181		FILEDESC_XUNLOCK(fdp);
1182	}
1183	return (error);
1184}
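/*
 * The holdleaders counter manipulated above pairs with the retry loop in
 * fdescfree(): while a close is in flight, fd_holdleaderscount signals that
 * process leaders must remain traversable in closef(), and fdescfree()
 * sleeps on &fdp->fd_holdleaderscount until the count drops to zero before
 * tearing down the filedesc_to_leader structure.
 */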
1185
1186/*
1187 * Close a file descriptor.
1188 */
1189#ifndef _SYS_SYSPROTO_H_
1190struct close_args {
1191	int     fd;
1192};
1193#endif
1194/* ARGSUSED */
1195int
1196sys_close(td, uap)
1197	struct thread *td;
1198	struct close_args *uap;
1199{
1200
1201	return (kern_close(td, uap->fd));
1202}
1203
1204int
1205kern_close(td, fd)
1206	struct thread *td;
1207	int fd;
1208{
1209	struct filedesc *fdp;
1210	struct file *fp;
1211
1212	fdp = td->td_proc->p_fd;
1213
1214	AUDIT_SYSCLOSE(td, fd);
1215
1216	FILEDESC_XLOCK(fdp);
1217	if ((fp = fget_locked(fdp, fd)) == NULL) {
1218		FILEDESC_XUNLOCK(fdp);
1219		return (EBADF);
1220	}
1221	fdfree(fdp, fd);
1222
1223	/* closefp() drops the FILEDESC lock for us. */
1224	return (closefp(fdp, fd, fp, td, 1));
1225}
1226
1227/*
1228 * Close open file descriptors.
1229 */
1230#ifndef _SYS_SYSPROTO_H_
1231struct closefrom_args {
1232	int	lowfd;
1233};
1234#endif
1235/* ARGSUSED */
1236int
1237sys_closefrom(struct thread *td, struct closefrom_args *uap)
1238{
1239	struct filedesc *fdp;
1240	int fd;
1241
1242	fdp = td->td_proc->p_fd;
1243	AUDIT_ARG_FD(uap->lowfd);
1244
1245	/*
1246	 * Treat negative starting file descriptor values identically to
1247	 * closefrom(0), which closes all files.
1248	 */
1249	if (uap->lowfd < 0)
1250		uap->lowfd = 0;
1251	FILEDESC_SLOCK(fdp);
1252	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
1253		if (fdp->fd_ofiles[fd].fde_file != NULL) {
1254			FILEDESC_SUNLOCK(fdp);
1255			(void)kern_close(td, fd);
1256			FILEDESC_SLOCK(fdp);
1257		}
1258	}
1259	FILEDESC_SUNLOCK(fdp);
1260	return (0);
1261}
1262
1263#if defined(COMPAT_43)
1264/*
1265 * Return status information about a file descriptor.
1266 */
1267#ifndef _SYS_SYSPROTO_H_
1268struct ofstat_args {
1269	int	fd;
1270	struct	ostat *sb;
1271};
1272#endif
1273/* ARGSUSED */
1274int
1275ofstat(struct thread *td, struct ofstat_args *uap)
1276{
1277	struct ostat oub;
1278	struct stat ub;
1279	int error;
1280
1281	error = kern_fstat(td, uap->fd, &ub);
1282	if (error == 0) {
1283		cvtstat(&ub, &oub);
1284		error = copyout(&oub, uap->sb, sizeof(oub));
1285	}
1286	return (error);
1287}
1288#endif /* COMPAT_43 */
1289
1290/*
1291 * Return status information about a file descriptor.
1292 */
1293#ifndef _SYS_SYSPROTO_H_
1294struct fstat_args {
1295	int	fd;
1296	struct	stat *sb;
1297};
1298#endif
1299/* ARGSUSED */
1300int
1301sys_fstat(struct thread *td, struct fstat_args *uap)
1302{
1303	struct stat ub;
1304	int error;
1305
1306	error = kern_fstat(td, uap->fd, &ub);
1307	if (error == 0)
1308		error = copyout(&ub, uap->sb, sizeof(ub));
1309	return (error);
1310}
1311
1312int
1313kern_fstat(struct thread *td, int fd, struct stat *sbp)
1314{
1315	struct file *fp;
1316	cap_rights_t rights;
1317	int error;
1318
1319	AUDIT_ARG_FD(fd);
1320
1321	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
1322	if (error != 0)
1323		return (error);
1324
1325	AUDIT_ARG_FILE(td->td_proc, fp);
1326
1327	error = fo_stat(fp, sbp, td->td_ucred, td);
1328	fdrop(fp, td);
1329#ifdef KTRACE
1330	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1331		ktrstat(sbp);
1332#endif
1333	return (error);
1334}
1335
1336/*
1337 * Return status information about a file descriptor.
1338 */
1339#ifndef _SYS_SYSPROTO_H_
1340struct nfstat_args {
1341	int	fd;
1342	struct	nstat *sb;
1343};
1344#endif
1345/* ARGSUSED */
1346int
1347sys_nfstat(struct thread *td, struct nfstat_args *uap)
1348{
1349	struct nstat nub;
1350	struct stat ub;
1351	int error;
1352
1353	error = kern_fstat(td, uap->fd, &ub);
1354	if (error == 0) {
1355		cvtnstat(&ub, &nub);
1356		error = copyout(&nub, uap->sb, sizeof(nub));
1357	}
1358	return (error);
1359}
1360
1361/*
1362 * Return pathconf information about a file descriptor.
1363 */
1364#ifndef _SYS_SYSPROTO_H_
1365struct fpathconf_args {
1366	int	fd;
1367	int	name;
1368};
1369#endif
1370/* ARGSUSED */
1371int
1372sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1373{
1374	struct file *fp;
1375	struct vnode *vp;
1376	cap_rights_t rights;
1377	int error;
1378
1379	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
1380	if (error != 0)
1381		return (error);
1382
1383	/* If asynchronous I/O is available, it works for all descriptors. */
1384	if (uap->name == _PC_ASYNC_IO) {
1385		td->td_retval[0] = async_io_version;
1386		goto out;
1387	}
1388	vp = fp->f_vnode;
1389	if (vp != NULL) {
1390		vn_lock(vp, LK_SHARED | LK_RETRY);
1391		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1392		VOP_UNLOCK(vp, 0);
1393	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1394		if (uap->name != _PC_PIPE_BUF) {
1395			error = EINVAL;
1396		} else {
1397			td->td_retval[0] = PIPE_BUF;
1398			error = 0;
1399		}
1400	} else {
1401		error = EOPNOTSUPP;
1402	}
1403out:
1404	fdrop(fp, td);
1405	return (error);
1406}
1407
1408/*
1409 * Initialize filecaps structure.
1410 */
1411void
1412filecaps_init(struct filecaps *fcaps)
1413{
1414
1415	bzero(fcaps, sizeof(*fcaps));
1416	fcaps->fc_nioctls = -1;
1417}
1418
1419/*
1420 * Copy filecaps structure allocating memory for ioctls array if needed.
1421 */
1422void
1423filecaps_copy(const struct filecaps *src, struct filecaps *dst)
1424{
1425	size_t size;
1426
1427	*dst = *src;
1428	if (src->fc_ioctls != NULL) {
1429		KASSERT(src->fc_nioctls > 0,
1430		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1431
1432		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1433		dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1434		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1435	}
1436}
1437
1438/*
1439 * Move filecaps structure to the new place and clear the old place.
1440 */
1441void
1442filecaps_move(struct filecaps *src, struct filecaps *dst)
1443{
1444
1445	*dst = *src;
1446	bzero(src, sizeof(*src));
1447}
1448
1449/*
1450 * Fill the given filecaps structure with full rights.
1451 */
1452static void
1453filecaps_fill(struct filecaps *fcaps)
1454{
1455
1456	CAP_ALL(&fcaps->fc_rights);
1457	fcaps->fc_ioctls = NULL;
1458	fcaps->fc_nioctls = -1;
1459	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1460}
1461
1462/*
1463 * Free memory allocated within filecaps structure.
1464 */
1465void
1466filecaps_free(struct filecaps *fcaps)
1467{
1468
1469	free(fcaps->fc_ioctls, M_FILECAPS);
1470	bzero(fcaps, sizeof(*fcaps));
1471}
1472
1473/*
1474 * Validate the given filecaps structure.
1475 */
1476static void
1477filecaps_validate(const struct filecaps *fcaps, const char *func)
1478{
1479
1480	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
1481	    ("%s: invalid rights", func));
1482	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1483	    ("%s: invalid fcntls", func));
1484	KASSERT(fcaps->fc_fcntls == 0 ||
1485	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
1486	    ("%s: fcntls without CAP_FCNTL", func));
1487	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1488	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1489	    ("%s: invalid ioctls", func));
1490	KASSERT(fcaps->fc_nioctls == 0 ||
1491	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
1492	    ("%s: ioctls without CAP_IOCTL", func));
1493}
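/*
 * A sketch of the filecaps life cycle as used elsewhere in this file:
 *
 *	finstall(td, fp, &fd, flags, fcaps)
 *		-> filecaps_move(fcaps, &fde->fde_caps)	caller-supplied caps
 *		-> filecaps_fill(&fde->fde_caps)	otherwise: full rights
 *	do_dup() and fdcopy()
 *		-> filecaps_copy(&ofde->fde_caps, &nfde->fde_caps)
 *	fdfree() and friends
 *		-> filecaps_free(&fde->fde_caps)
 *
 * Only the ioctl list is dynamically allocated, which is why filecaps_copy()
 * and filecaps_free() are the only operations touching M_FILECAPS memory.
 */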
1494
1495static void
1496fdgrowtable_exp(struct filedesc *fdp, int nfd)
1497{
1498	int nfd1;
1499
1500	FILEDESC_XLOCK_ASSERT(fdp);
1501
1502	nfd1 = fdp->fd_nfiles * 2;
1503	if (nfd1 < nfd)
1504		nfd1 = nfd;
1505	fdgrowtable(fdp, nfd1);
1506}
1507
1508/*
1509 * Grow the file table to accommodate (at least) nfd descriptors.
1510 */
1511static void
1512fdgrowtable(struct filedesc *fdp, int nfd)
1513{
1514	struct filedesc0 *fdp0;
1515	struct freetable *ft;
1516	struct filedescent *ntable;
1517	struct filedescent *otable;
1518	int nnfiles, onfiles;
1519	NDSLOTTYPE *nmap, *omap;
1520
1521	FILEDESC_XLOCK_ASSERT(fdp);
1522
1523	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
1524
1525	/* save old values */
1526	onfiles = fdp->fd_nfiles;
1527	otable = fdp->fd_ofiles;
1528	omap = fdp->fd_map;
1529
1530	/* compute the size of the new table */
1531	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1532	if (nnfiles <= onfiles)
1533		/* the table is already large enough */
1534		return;
1535
1536	/*
1537	 * Allocate a new table.  We need enough space for the
1538	 * file entries themselves and the struct freetable we will use
1539	 * when we decommission the table and place it on the freelist.
1540	 * We place the struct freetable in the middle so we don't have
1541	 * to worry about padding.
1542	 */
1543	ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
1544	    M_FILEDESC, M_ZERO | M_WAITOK);
1545	/* copy the old data over and point at the new tables */
1546	memcpy(ntable, otable, onfiles * sizeof(*otable));
1547	fdp->fd_ofiles = ntable;
1548
1549	/*
1550	 * Allocate a new map only if the old is not large enough.  It will
1551	 * grow at a slower rate than the table as it can map more
1552	 * entries than the table can hold.
1553	 */
1554	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
1555		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
1556		    M_ZERO | M_WAITOK);
1557		/* copy over the old data and update the pointer */
1558		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
1559		fdp->fd_map = nmap;
1560	}
1561
1562	/*
1563	 * In order to have a valid pattern for fget_unlocked(),
1564	 * fdp->fd_nfiles must be the last member to be updated; otherwise
1565	 * fget_unlocked() consumers may reference a new, higher value for
1566	 * fdp->fd_nfiles before they access the fdp->fd_ofiles array,
1567	 * resulting in OOB accesses.
1568	 */
1569	atomic_store_rel_int(&fdp->fd_nfiles, nnfiles);
1570
1571	/*
1572	 * Do not free the old file table, as some threads may still
1573	 * reference entries within it.  Instead, place it on a freelist
1574	 * which will be processed when the struct filedesc is released.
1575	 *
1576	 * Note that if onfiles == NDFILE, we're dealing with the original
1577	 * static allocation contained within (struct filedesc0 *)fdp,
1578	 * which must not be freed.
1579	 */
1580	if (onfiles > NDFILE) {
1581		ft = (struct freetable *)&otable[onfiles];
1582		fdp0 = (struct filedesc0 *)fdp;
1583		ft->ft_table = otable;
1584		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
1585	}
1586	/*
1587	 * The map does not have the same possibility of threads still
1588	 * holding references to it.  So always free it as long as it
1589	 * does not reference the original static allocation.
1590	 */
1591	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
1592		free(omap, M_FILEDESC);
1593}
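/*
 * Worked example of the growth policy, assuming NDENTRIES == 64: a table
 * starts with NDFILE (20) entries.  The first time fdalloc() or do_dup()
 * needs more, fdgrowtable_exp() asks for twice the current size (40), which
 * fdgrowtable() rounds up to a whole map word, NDSLOTS(40) * 64 == 64
 * entries.  Subsequent growths yield 128, 256, ... entries.  The initial
 * 20-entry table is embedded in struct filedesc0 and is never freed; larger
 * tables that get replaced are parked on fd_free and released only in
 * fddrop().
 */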
1594
1595/*
1596 * Allocate a file descriptor for the process.
1597 */
1598int
1599fdalloc(struct thread *td, int minfd, int *result)
1600{
1601	struct proc *p = td->td_proc;
1602	struct filedesc *fdp = p->p_fd;
1603	int fd = -1, maxfd, allocfd;
1604#ifdef RACCT
1605	int error;
1606#endif
1607
1608	FILEDESC_XLOCK_ASSERT(fdp);
1609
1610	if (fdp->fd_freefile > minfd)
1611		minfd = fdp->fd_freefile;
1612
1613	maxfd = getmaxfd(p);
1614
1615	/*
1616	 * Search the bitmap for a free descriptor starting at minfd.
1617	 * If none is found, grow the file table.
1618	 */
1619	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1620	if (fd >= maxfd)
1621		return (EMFILE);
1622	if (fd >= fdp->fd_nfiles) {
1623		allocfd = min(fd * 2, maxfd);
1624#ifdef RACCT
1625		PROC_LOCK(p);
1626		error = racct_set(p, RACCT_NOFILE, allocfd);
1627		PROC_UNLOCK(p);
1628		if (error != 0)
1629			return (EMFILE);
1630#endif
1631		/*
1632		 * fd is already equal to first free descriptor >= minfd, so
1633		 * we only need to grow the table and we are done.
1634		 */
1635		fdgrowtable_exp(fdp, allocfd);
1636	}
1637
1638	/*
1639	 * Perform some sanity checks, then mark the file descriptor as
1640	 * used and return it to the caller.
1641	 */
1642	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
1643	    ("invalid descriptor %d", fd));
1644	KASSERT(!fdisused(fdp, fd),
1645	    ("fd_first_free() returned non-free descriptor"));
1646	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
1647	    ("file descriptor isn't free"));
1648	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
1649	fdused(fdp, fd);
1650	*result = fd;
1651	return (0);
1652}
1653
1654/*
1655 * Allocate n file descriptors for the process.
1656 */
1657int
1658fdallocn(struct thread *td, int minfd, int *fds, int n)
1659{
1660	struct proc *p = td->td_proc;
1661	struct filedesc *fdp = p->p_fd;
1662	int i;
1663
1664	FILEDESC_XLOCK_ASSERT(fdp);
1665
1666	for (i = 0; i < n; i++)
1667		if (fdalloc(td, 0, &fds[i]) != 0)
1668			break;
1669
1670	if (i < n) {
1671		for (i--; i >= 0; i--)
1672			fdunused(fdp, fds[i]);
1673		return (EMFILE);
1674	}
1675
1676	return (0);
1677}
1678
1679/*
1680 * Create a new open file structure and allocate a file descriptor for the
1681 * process that refers to it.  We add one reference to the file for the
1682 * descriptor table and one reference for resultfp.  This prevents us from
1683 * being preempted and having the entry in the descriptor table closed after
1684 * we release the FILEDESC lock.
1685 */
1686int
1687falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1688{
1689	struct file *fp;
1690	int error, fd;
1691
1692	error = falloc_noinstall(td, &fp);
1693	if (error)
1694		return (error);		/* no reference held on error */
1695
1696	error = finstall(td, fp, &fd, flags, NULL);
1697	if (error) {
1698		fdrop(fp, td);		/* one reference (fp only) */
1699		return (error);
1700	}
1701
1702	if (resultfp != NULL)
1703		*resultfp = fp;		/* copy out result */
1704	else
1705		fdrop(fp, td);		/* release local reference */
1706
1707	if (resultfd != NULL)
1708		*resultfd = fd;
1709
1710	return (0);
1711}
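/*
 * A sketch of how a syscall creating a new kind of file object typically
 * uses falloc().  finit() is declared in <sys/file.h>; DTYPE_FOO, foo_data
 * and foo_ops are placeholder names, not real identifiers:
 *
 *	error = falloc(td, &fp, &fd, 0);
 *	if (error != 0)
 *		return (error);
 *	finit(fp, FREAD | FWRITE, DTYPE_FOO, foo_data, &foo_ops);
 *	td->td_retval[0] = fd;
 *	fdrop(fp, td);		(drop the extra reference falloc() added)
 *	return (0);
 *
 * The reference returned via resultfp keeps fp valid even if another thread
 * sharing the table closes the new descriptor right away.
 */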
1712
1713/*
1714 * Create a new open file structure without allocating a file descriptor.
1715 */
1716int
1717falloc_noinstall(struct thread *td, struct file **resultfp)
1718{
1719	struct file *fp;
1720	int maxuserfiles = maxfiles - (maxfiles / 20);
1721	static struct timeval lastfail;
1722	static int curfail;
1723
1724	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1725
1726	if ((openfiles >= maxuserfiles &&
1727	    priv_check(td, PRIV_MAXFILES) != 0) ||
1728	    openfiles >= maxfiles) {
1729		if (ppsratecheck(&lastfail, &curfail, 1)) {
1730			printf("kern.maxfiles limit exceeded by uid %i, "
1731			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1732		}
1733		return (ENFILE);
1734	}
1735	atomic_add_int(&openfiles, 1);
1736	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1737	refcount_init(&fp->f_count, 1);
1738	fp->f_cred = crhold(td->td_ucred);
1739	fp->f_ops = &badfileops;
1740	fp->f_data = NULL;
1741	fp->f_vnode = NULL;
1742	*resultfp = fp;
1743	return (0);
1744}
1745
1746/*
1747 * Install a file in a file descriptor table.
1748 */
1749int
1750finstall(struct thread *td, struct file *fp, int *fd, int flags,
1751    struct filecaps *fcaps)
1752{
1753	struct filedesc *fdp = td->td_proc->p_fd;
1754	struct filedescent *fde;
1755	int error;
1756
1757	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1758	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1759	if (fcaps != NULL)
1760		filecaps_validate(fcaps, __func__);
1761
1762	FILEDESC_XLOCK(fdp);
1763	if ((error = fdalloc(td, 0, fd))) {
1764		FILEDESC_XUNLOCK(fdp);
1765		return (error);
1766	}
1767	fhold(fp);
1768	fde = &fdp->fd_ofiles[*fd];
1769	fde->fde_file = fp;
1770	if ((flags & O_CLOEXEC) != 0)
1771		fde->fde_flags |= UF_EXCLOSE;
1772	if (fcaps != NULL)
1773		filecaps_move(fcaps, &fde->fde_caps);
1774	else
1775		filecaps_fill(&fde->fde_caps);
1776	FILEDESC_XUNLOCK(fdp);
1777	return (0);
1778}
1779
1780/*
1781 * Build a new filedesc structure from another.
1782 * Copy the current, root, and jail root vnode references.
1783 */
1784struct filedesc *
1785fdinit(struct filedesc *fdp)
1786{
1787	struct filedesc0 *newfdp;
1788
1789	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1790	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1791	if (fdp != NULL) {
1792		FILEDESC_SLOCK(fdp);
1793		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1794		if (newfdp->fd_fd.fd_cdir)
1795			VREF(newfdp->fd_fd.fd_cdir);
1796		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1797		if (newfdp->fd_fd.fd_rdir)
1798			VREF(newfdp->fd_fd.fd_rdir);
1799		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1800		if (newfdp->fd_fd.fd_jdir)
1801			VREF(newfdp->fd_fd.fd_jdir);
1802		FILEDESC_SUNLOCK(fdp);
1803	}
1804
1805	/* Create the file descriptor table. */
1806	newfdp->fd_fd.fd_refcnt = 1;
1807	newfdp->fd_fd.fd_holdcnt = 1;
1808	newfdp->fd_fd.fd_cmask = CMASK;
1809	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1810	newfdp->fd_fd.fd_nfiles = NDFILE;
1811	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1812	newfdp->fd_fd.fd_lastfile = -1;
1813	return (&newfdp->fd_fd);
1814}
1815
1816static struct filedesc *
1817fdhold(struct proc *p)
1818{
1819	struct filedesc *fdp;
1820
1821	mtx_lock(&fdesc_mtx);
1822	fdp = p->p_fd;
1823	if (fdp != NULL)
1824		fdp->fd_holdcnt++;
1825	mtx_unlock(&fdesc_mtx);
1826	return (fdp);
1827}
1828
1829static void
1830fddrop(struct filedesc *fdp)
1831{
1832	struct filedesc0 *fdp0;
1833	struct freetable *ft;
1834	int i;
1835
1836	mtx_lock(&fdesc_mtx);
1837	i = --fdp->fd_holdcnt;
1838	mtx_unlock(&fdesc_mtx);
1839	if (i > 0)
1840		return;
1841
1842	FILEDESC_LOCK_DESTROY(fdp);
1843	fdp0 = (struct filedesc0 *)fdp;
1844	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1845		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1846		free(ft->ft_table, M_FILEDESC);
1847	}
1848	free(fdp, M_FILEDESC);
1849}
1850
1851/*
1852 * Share a filedesc structure.
1853 */
1854struct filedesc *
1855fdshare(struct filedesc *fdp)
1856{
1857
1858	FILEDESC_XLOCK(fdp);
1859	fdp->fd_refcnt++;
1860	FILEDESC_XUNLOCK(fdp);
1861	return (fdp);
1862}
1863
1864/*
1865 * Unshare a filedesc structure, if necessary by making a copy.
1866 */
1867void
1868fdunshare(struct thread *td)
1869{
1870	struct filedesc *tmp;
1871	struct proc *p = td->td_proc;
1872
1873	if (p->p_fd->fd_refcnt == 1)
1874		return;
1875
1876	tmp = fdcopy(p->p_fd);
1877	fdescfree(td);
1878	p->p_fd = tmp;
1879}
1880
1881/*
1882 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference;
1883 * this is to ease callers, not to catch errors.
1884 */
1885struct filedesc *
1886fdcopy(struct filedesc *fdp)
1887{
1888	struct filedesc *newfdp;
1889	struct filedescent *nfde, *ofde;
1890	int i;
1891
1892	/* Certain daemons might not have file descriptors. */
1893	if (fdp == NULL)
1894		return (NULL);
1895
1896	newfdp = fdinit(fdp);
1897	FILEDESC_SLOCK(fdp);
1898	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1899		FILEDESC_SUNLOCK(fdp);
1900		FILEDESC_XLOCK(newfdp);
1901		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1902		FILEDESC_XUNLOCK(newfdp);
1903		FILEDESC_SLOCK(fdp);
1904	}
1905	/* copy all passable descriptors (i.e. not kqueue) */
1906	newfdp->fd_freefile = -1;
1907	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1908		ofde = &fdp->fd_ofiles[i];
1909		if (fdisused(fdp, i) &&
1910		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
1911		    ofde->fde_file->f_ops != &badfileops) {
1912			nfde = &newfdp->fd_ofiles[i];
1913			*nfde = *ofde;
1914			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
1915			fhold(nfde->fde_file);
1916			newfdp->fd_lastfile = i;
1917		} else {
1918			if (newfdp->fd_freefile == -1)
1919				newfdp->fd_freefile = i;
1920		}
1921	}
1922	newfdp->fd_cmask = fdp->fd_cmask;
1923	FILEDESC_SUNLOCK(fdp);
1924	FILEDESC_XLOCK(newfdp);
1925	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
1926		if (newfdp->fd_ofiles[i].fde_file != NULL)
1927			fdused(newfdp, i);
1928	}
1929	if (newfdp->fd_freefile == -1)
1930		newfdp->fd_freefile = i;
1931	FILEDESC_XUNLOCK(newfdp);
1932	return (newfdp);
1933}
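/*
 * For reference, the three ways a process obtains a filedesc, as implemented
 * above:
 *
 *	fdinit(fdp)	fresh table with NDFILE slots; only the current, root
 *			and jail directory vnode references are inherited
 *	fdshare(fdp)	the same table, reference-counted via fd_refcnt
 *	fdcopy(fdp)	a new table holding copies of all passable descriptors
 *			and their capabilities, with kqueues left out
 *
 * fdunshare() turns a shared table into a private copy when needed.
 */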
1934
1935/*
1936 * Release a filedesc structure.
1937 */
1938void
1939fdescfree(struct thread *td)
1940{
1941	struct filedesc *fdp;
1942	int i;
1943	struct filedesc_to_leader *fdtol;
1944	struct file *fp;
1945	struct vnode *cdir, *jdir, *rdir, *vp;
1946	struct flock lf;
1947
1948	/* Certain daemons might not have file descriptors. */
1949	fdp = td->td_proc->p_fd;
1950	if (fdp == NULL)
1951		return;
1952
1953#ifdef RACCT
1954	PROC_LOCK(td->td_proc);
1955	racct_set(td->td_proc, RACCT_NOFILE, 0);
1956	PROC_UNLOCK(td->td_proc);
1957#endif
1958
1959	/* Check for special need to clear POSIX style locks */
1960	fdtol = td->td_proc->p_fdtol;
1961	if (fdtol != NULL) {
1962		FILEDESC_XLOCK(fdp);
1963		KASSERT(fdtol->fdl_refcount > 0,
1964		    ("filedesc_to_refcount botch: fdl_refcount=%d",
1965		    fdtol->fdl_refcount));
1966		if (fdtol->fdl_refcount == 1 &&
1967		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1968			for (i = 0; i <= fdp->fd_lastfile; i++) {
1969				fp = fdp->fd_ofiles[i].fde_file;
1970				if (fp == NULL || fp->f_type != DTYPE_VNODE)
1971					continue;
1972				fhold(fp);
1973				FILEDESC_XUNLOCK(fdp);
1974				lf.l_whence = SEEK_SET;
1975				lf.l_start = 0;
1976				lf.l_len = 0;
1977				lf.l_type = F_UNLCK;
1978				vp = fp->f_vnode;
1979				(void) VOP_ADVLOCK(vp,
1980				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
1981				    &lf, F_POSIX);
1982				FILEDESC_XLOCK(fdp);
1983				fdrop(fp, td);
1984			}
1985		}
1986	retry:
1987		if (fdtol->fdl_refcount == 1) {
1988			if (fdp->fd_holdleaderscount > 0 &&
1989			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1990				/*
1991				 * close() or do_dup() has cleared a reference
1992				 * in a shared file descriptor table.
1993				 */
1994				fdp->fd_holdleaderswakeup = 1;
1995				sx_sleep(&fdp->fd_holdleaderscount,
1996				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
1997				goto retry;
1998			}
1999			if (fdtol->fdl_holdcount > 0) {
2000				/*
2001				 * Ensure that fdtol->fdl_leader remains
2002				 * valid in closef().
2003				 */
2004				fdtol->fdl_wakeup = 1;
2005				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
2006				    "fdlhold", 0);
2007				goto retry;
2008			}
2009		}
2010		fdtol->fdl_refcount--;
2011		if (fdtol->fdl_refcount == 0 &&
2012		    fdtol->fdl_holdcount == 0) {
2013			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
2014			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
2015		} else
2016			fdtol = NULL;
2017		td->td_proc->p_fdtol = NULL;
2018		FILEDESC_XUNLOCK(fdp);
2019		if (fdtol != NULL)
2020			free(fdtol, M_FILEDESC_TO_LEADER);
2021	}
2022
2023	mtx_lock(&fdesc_mtx);
2024	td->td_proc->p_fd = NULL;
2025	mtx_unlock(&fdesc_mtx);
2026
2027	FILEDESC_XLOCK(fdp);
2028	i = --fdp->fd_refcnt;
2029	if (i > 0) {
2030		FILEDESC_XUNLOCK(fdp);
2031		return;
2032	}
2033
2034	cdir = fdp->fd_cdir;
2035	fdp->fd_cdir = NULL;
2036	rdir = fdp->fd_rdir;
2037	fdp->fd_rdir = NULL;
2038	jdir = fdp->fd_jdir;
2039	fdp->fd_jdir = NULL;
2040	FILEDESC_XUNLOCK(fdp);
2041
2042	for (i = 0; i <= fdp->fd_lastfile; i++) {
2043		fp = fdp->fd_ofiles[i].fde_file;
2044		if (fp != NULL) {
2045			fdfree_last(fdp, i);
2046			(void) closef(fp, td);
2047		}
2048	}
2049
2050	if (fdp->fd_nfiles > NDFILE)
2051		free(fdp->fd_ofiles, M_FILEDESC);
2052	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
2053		free(fdp->fd_map, M_FILEDESC);
2054
2055	if (cdir != NULL)
2056		vrele(cdir);
2057	if (rdir != NULL)
2058		vrele(rdir);
2059	if (jdir != NULL)
2060		vrele(jdir);
2061
2062	fddrop(fdp);
2063}
2064
2065/*
2066 * For setugid programs, we don't want people to use that setugidness
2067 * to generate error messages which write to a file which would
2068 * otherwise be off-limits to the process.  We check for filesystems where
2069 * the vnode can change out from under us after execve (like [lin]procfs).
2070 *
2071 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
2072 * sufficient.  We also don't check for setugidness since we know we are.
2073 */
2074static int
2075is_unsafe(struct file *fp)
2076{
2077	if (fp->f_type == DTYPE_VNODE) {
2078		struct vnode *vp = fp->f_vnode;
2079
2080		if ((vp->v_vflag & VV_PROCDEP) != 0)
2081			return (1);
2082	}
2083	return (0);
2084}
2085
2086/*
2087 * Make this setugid thing safe, if at all possible.
2088 */
2089void
2090setugidsafety(struct thread *td)
2091{
2092	struct filedesc *fdp;
2093	struct file *fp;
2094	int i;
2095
2096	fdp = td->td_proc->p_fd;
2097	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2098	FILEDESC_XLOCK(fdp);
2099	for (i = 0; i <= fdp->fd_lastfile; i++) {
2100		if (i > 2)
2101			break;
2102		fp = fdp->fd_ofiles[i].fde_file;
2103		if (fp != NULL && is_unsafe(fp)) {
2104			knote_fdclose(td, i);
2105			/*
2106			 * NULL-out descriptor prior to close to avoid
2107			 * a race while close blocks.
2108			 */
2109			fdfree(fdp, i);
2110			FILEDESC_XUNLOCK(fdp);
2111			(void) closef(fp, td);
2112			FILEDESC_XLOCK(fdp);
2113		}
2114	}
2115	FILEDESC_XUNLOCK(fdp);
2116}
2117
2118/*
2119 * If a specific file object occupies a specific file descriptor, close the
2120 * file descriptor entry and drop a reference on the file object.  This is a
2121 * convenience function for handling a subsequent error in a function that
2122 * calls falloc(); it copes with the race in which another thread closes the
2123 * file descriptor out from under the thread creating the file object.
2124 */
2125void
2126fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2127{
2128
2129	FILEDESC_XLOCK(fdp);
2130	if (fdp->fd_ofiles[idx].fde_file == fp) {
2131		fdfree(fdp, idx);
2132		FILEDESC_XUNLOCK(fdp);
2133		fdrop(fp, td);
2134	} else
2135		FILEDESC_XUNLOCK(fdp);
2136}
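
/*
 * A rough sketch of the falloc()/fdclose() error-handling pattern described
 * above; foo_setup(), foo_data and foo_ops are placeholders and the exact
 * sequence varies by caller:
 *
 *	fdp = td->td_proc->p_fd;
 *	error = falloc(td, &fp, &fd, 0);
 *	if (error != 0)
 *		return (error);
 *	error = foo_setup(fp);			(placeholder setup step)
 *	if (error != 0) {
 *		fdclose(fdp, fp, fd, td);	(release the descriptor slot)
 *		fdrop(fp, td);			(drop the caller's reference)
 *		return (error);
 *	}
 *	finit(fp, FREAD | FWRITE, ...);		(see finit() below)
 */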
2137
2138/*
2139 * Close any files marked close-on-exec (and mqueue descriptors) on exec.
2140 */
2141void
2142fdcloseexec(struct thread *td)
2143{
2144	struct filedesc *fdp;
2145	struct filedescent *fde;
2146	struct file *fp;
2147	int i;
2148
2149	fdp = td->td_proc->p_fd;
2150	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2151	FILEDESC_XLOCK(fdp);
2152	for (i = 0; i <= fdp->fd_lastfile; i++) {
2153		fde = &fdp->fd_ofiles[i];
2154		fp = fde->fde_file;
2155		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2156		    (fde->fde_flags & UF_EXCLOSE))) {
2157			fdfree(fdp, i);
2158			(void) closefp(fdp, i, fp, td, 0);
2159			/* closefp() drops the FILEDESC lock. */
2160			FILEDESC_XLOCK(fdp);
2161		}
2162	}
2163	FILEDESC_XUNLOCK(fdp);
2164}
2165
2166/*
2167 * It is unsafe for set[ug]id processes to be started with file
2168 * descriptors 0..2 closed, as these descriptors are given implicit
2169 * significance in the Standard C library.  fdcheckstd() will create a
2170 * descriptor referencing /dev/null for each of stdin, stdout, and
2171 * stderr that is not already open.
2172 */
2173int
2174fdcheckstd(struct thread *td)
2175{
2176	struct filedesc *fdp;
2177	register_t retval, save;
2178	int i, error, devnull;
2179
2180	fdp = td->td_proc->p_fd;
2181	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2182	devnull = -1;
2183	error = 0;
2184	for (i = 0; i < 3; i++) {
2185		if (fdp->fd_ofiles[i].fde_file != NULL)
2186			continue;
2187		if (devnull < 0) {
2188			save = td->td_retval[0];
2189			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2190			    O_RDWR, 0);
2191			devnull = td->td_retval[0];
2192			td->td_retval[0] = save;
2193			if (error)
2194				break;
2195			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2196		} else {
2197			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2198			if (error != 0)
2199				break;
2200		}
2201	}
2202	return (error);
2203}
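
/*
 * Example of the hazard fdcheckstd() guards against: if a set-uid binary is
 * exec'd with descriptor 2 closed, its first open(2) of, say, a private file
 * may be assigned descriptor 2, and a later write to stderr from the C
 * library would then scribble over that file.  Pre-opening /dev/null for any
 * missing std descriptor closes that hole.
 */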
2204
2205/*
2206 * Internal form of close.  Decrement reference count on file structure.
2207 * Note: td may be NULL when closing a file that was being passed in a
2208 * message.
2209 *
2210 * XXXRW: Giant is not required for the caller, but often will be held; this
2211 * makes it moderately likely the Giant will be recursed in the VFS case.
2212 */
2213int
2214closef(struct file *fp, struct thread *td)
2215{
2216	struct vnode *vp;
2217	struct flock lf;
2218	struct filedesc_to_leader *fdtol;
2219	struct filedesc *fdp;
2220
2221	/*
2222	 * POSIX record locking dictates that any close releases ALL
2223	 * locks owned by this process.  This is handled by setting
2224	 * a flag in the unlock to free ONLY locks obeying POSIX
2225	 * semantics, and not to free BSD-style file locks.
2226	 * If the descriptor was in a message, POSIX-style locks
2227	 * aren't passed with the descriptor, and the thread pointer
2228	 * will be NULL.  Callers should be careful only to pass a
2229	 * NULL thread pointer when there really is no owning
2230	 * context that might have locks, or the locks will be
2231	 * leaked.
2232	 */
2233	if (fp->f_type == DTYPE_VNODE && td != NULL) {
2234		vp = fp->f_vnode;
2235		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2236			lf.l_whence = SEEK_SET;
2237			lf.l_start = 0;
2238			lf.l_len = 0;
2239			lf.l_type = F_UNLCK;
2240			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2241			    F_UNLCK, &lf, F_POSIX);
2242		}
2243		fdtol = td->td_proc->p_fdtol;
2244		if (fdtol != NULL) {
2245			/*
2246			 * Handle special case where file descriptor table is
2247			 * shared between multiple process leaders.
2248			 */
2249			fdp = td->td_proc->p_fd;
2250			FILEDESC_XLOCK(fdp);
2251			for (fdtol = fdtol->fdl_next;
2252			     fdtol != td->td_proc->p_fdtol;
2253			     fdtol = fdtol->fdl_next) {
2254				if ((fdtol->fdl_leader->p_flag &
2255				     P_ADVLOCK) == 0)
2256					continue;
2257				fdtol->fdl_holdcount++;
2258				FILEDESC_XUNLOCK(fdp);
2259				lf.l_whence = SEEK_SET;
2260				lf.l_start = 0;
2261				lf.l_len = 0;
2262				lf.l_type = F_UNLCK;
2263				vp = fp->f_vnode;
2264				(void) VOP_ADVLOCK(vp,
2265				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
2266				    F_POSIX);
2267				FILEDESC_XLOCK(fdp);
2268				fdtol->fdl_holdcount--;
2269				if (fdtol->fdl_holdcount == 0 &&
2270				    fdtol->fdl_wakeup != 0) {
2271					fdtol->fdl_wakeup = 0;
2272					wakeup(fdtol);
2273				}
2274			}
2275			FILEDESC_XUNLOCK(fdp);
2276		}
2277	}
2278	return (fdrop(fp, td));
2279}
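
/*
 * Example of the distinction drawn in closef() above: a lock obtained with
 * fcntl(fd, F_SETLK, ...) (POSIX, F_POSIX) is released by any close of any
 * descriptor for the file within the owning process, while a lock obtained
 * with flock(fd, ...) (F_FLOCK) stays with the open file object and is only
 * released when the last reference to that object goes away or it is
 * explicitly unlocked.
 */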
2280
2281/*
2282 * Initialize the file pointer with the specified properties.
2283 *
2284 * The ops are set with release semantics to be certain that the flags, type,
2285 * and data are visible when ops is.  This is to prevent ops methods from being
2286 * called with bad data.
2287 */
2288void
2289finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2290{
2291	fp->f_data = data;
2292	fp->f_flag = flag;
2293	fp->f_type = type;
2294	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2295}
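
/*
 * Typical shape of a finit() call once the backing object exists (my_data,
 * my_ops and DTYPE_FOO stand in for whatever the caller provides):
 *
 *	finit(fp, FREAD | FWRITE, DTYPE_FOO, my_data, &my_ops);
 *
 * A consumer that reaches the file through f_ops afterwards is intended to
 * also observe the f_data, f_flag and f_type values stored above.
 */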
2296
2297int
2298fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
2299    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
2300{
2301#ifdef CAPABILITIES
2302	struct filedescent fde;
2303#endif
2304	struct file *fp;
2305	u_int count;
2306#ifdef CAPABILITIES
2307	cap_rights_t haverights;
2308	int error;
2309#endif
2310
2311	/*
2312	 * Do the bounds check with an acquire load so that it cannot be
2313	 * reordered after the access to fdp->fd_ofiles, which could be OOB.
2314	 */
2315	if (fd < 0 || fd >= atomic_load_acq_int(&fdp->fd_nfiles))
2316		return (EBADF);
2317	/*
2318	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
2319	 * never resurrecting a reference count that has dropped to 0; doing
2320	 * so requires a cmpset loop rather than an atomic_add.  The descriptor
2321	 * must be re-verified once we acquire a reference to be certain
2322	 * that the identity is still correct and we did not lose a race
2323	 * due to preemption.
2324	 */
2325	for (;;) {
2326#ifdef CAPABILITIES
2327		fde = fdp->fd_ofiles[fd];
2328		fp = fde.fde_file;
2329#else
2330		fp = fdp->fd_ofiles[fd].fde_file;
2331#endif
2332		if (fp == NULL)
2333			return (EBADF);
2334#ifdef CAPABILITIES
2335		haverights = *cap_rights_fde(&fde);
2336		if (needrightsp != NULL) {
2337			error = cap_check(&haverights, needrightsp);
2338			if (error != 0)
2339				return (error);
2340			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
2341				error = cap_fcntl_check_fde(&fde, needfcntl);
2342				if (error != 0)
2343					return (error);
2344			}
2345		}
2346#endif
2347		count = fp->f_count;
2348		if (count == 0)
2349			continue;
2350		/*
2351		 * Use an acquire barrier to prevent caching of fd_ofiles
2352		 * so it is refreshed for verification.
2353		 */
2354		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
2355			continue;
2356		if (fp == fdp->fd_ofiles[fd].fde_file)
2357			break;
2358		fdrop(fp, curthread);
2359	}
2360	*fpp = fp;
2361	if (haverightsp != NULL) {
2362#ifdef CAPABILITIES
2363		*haverightsp = haverights;
2364#else
2365		CAP_ALL(haverightsp);
2366#endif
2367	}
2368	return (0);
2369}
2370
2371/*
2372 * Extract the file pointer associated with the specified descriptor for the
2373 * current user process.
2374 *
2375 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2376 * returned.
2377 *
2378 * File's rights will be checked against the capability rights mask.
2379 *
2380 * If an error occurred, the non-zero error is returned and *fpp is set to
2381 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2382 * responsible for fdrop().
2383 */
2384static __inline int
2385_fget(struct thread *td, int fd, struct file **fpp, int flags,
2386    cap_rights_t *needrightsp, u_char *maxprotp)
2387{
2388	struct filedesc *fdp;
2389	struct file *fp;
2390	cap_rights_t haverights, needrights;
2391	int error;
2392
2393	*fpp = NULL;
2394	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2395		return (EBADF);
2396	if (needrightsp != NULL)
2397		needrights = *needrightsp;
2398	else
2399		cap_rights_init(&needrights);
2400	if (maxprotp != NULL)
2401		cap_rights_set(&needrights, CAP_MMAP);
2402	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
2403	if (error != 0)
2404		return (error);
2405	if (fp->f_ops == &badfileops) {
2406		fdrop(fp, td);
2407		return (EBADF);
2408	}
2409
2410#ifdef CAPABILITIES
2411	/*
2412	 * If requested, convert capability rights to access flags.
2413	 */
2414	if (maxprotp != NULL)
2415		*maxprotp = cap_rights_to_vmprot(&haverights);
2416#else /* !CAPABILITIES */
2417	if (maxprotp != NULL)
2418		*maxprotp = VM_PROT_ALL;
2419#endif /* CAPABILITIES */
2420
2421	/*
2422	 * FREAD and FWRITE failure return EBADF as per POSIX.
2423	 */
2424	error = 0;
2425	switch (flags) {
2426	case FREAD:
2427	case FWRITE:
2428		if ((fp->f_flag & flags) == 0)
2429			error = EBADF;
2430		break;
2431	case FEXEC:
2432		if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
2433		    ((fp->f_flag & FWRITE) != 0))
2434			error = EBADF;
2435		break;
2436	case 0:
2437		break;
2438	default:
2439		KASSERT(0, ("wrong flags"));
2440	}
2441
2442	if (error != 0) {
2443		fdrop(fp, td);
2444		return (error);
2445	}
2446
2447	*fpp = fp;
2448	return (0);
2449}
2450
2451int
2452fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2453{
2454
2455	return (_fget(td, fd, fpp, 0, rightsp, NULL));
2456}
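
/*
 * Sketch of the usual fget()/fdrop() pairing in a syscall handler; the
 * CAP_READ right is only an example of what a caller might request:
 *
 *	cap_rights_t rights;
 *	struct file *fp;
 *	int error;
 *
 *	error = fget(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
 *	if (error != 0)
 *		return (error);
 *	...use fp...
 *	fdrop(fp, td);
 */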
2457
2458int
2459fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
2460    struct file **fpp)
2461{
2462
2463	return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
2464}
2465
2466int
2467fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2468{
2469
2470	return (_fget(td, fd, fpp, FREAD, rightsp, NULL));
2471}
2472
2473int
2474fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2475{
2476
2477	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
2478}
2479
2480/*
2481 * Like fget() but loads the underlying vnode, or returns an error if the
2482 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2483 * never have VM objects.  The returned vnode will be vref()'d.
2484 *
2485 * XXX: what about the unused flags?
2486 */
2487static __inline int
2488_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
2489    struct vnode **vpp)
2490{
2491	struct file *fp;
2492	int error;
2493
2494	*vpp = NULL;
2495	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
2496	if (error != 0)
2497		return (error);
2498	if (fp->f_vnode == NULL) {
2499		error = EINVAL;
2500	} else {
2501		*vpp = fp->f_vnode;
2502		vref(*vpp);
2503	}
2504	fdrop(fp, td);
2505
2506	return (error);
2507}
2508
2509int
2510fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2511{
2512
2513	return (_fgetvp(td, fd, 0, rightsp, vpp));
2514}
2515
2516int
2517fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
2518    struct filecaps *havecaps, struct vnode **vpp)
2519{
2520	struct filedesc *fdp;
2521	struct file *fp;
2522#ifdef CAPABILITIES
2523	int error;
2524#endif
2525
2526	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2527		return (EBADF);
2528
2529	fp = fget_locked(fdp, fd);
2530	if (fp == NULL || fp->f_ops == &badfileops)
2531		return (EBADF);
2532
2533#ifdef CAPABILITIES
2534	if (needrightsp != NULL) {
2535		error = cap_check(cap_rights(fdp, fd), needrightsp);
2536		if (error != 0)
2537			return (error);
2538	}
2539#endif
2540
2541	if (fp->f_vnode == NULL)
2542		return (EINVAL);
2543
2544	*vpp = fp->f_vnode;
2545	vref(*vpp);
2546	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
2547
2548	return (0);
2549}
2550
2551int
2552fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2553{
2554
2555	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
2556}
2557
2558int
2559fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2560{
2561
2562	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
2563}
2564
2565#ifdef notyet
2566int
2567fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
2568    struct vnode **vpp)
2569{
2570
2571	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
2572}
2573#endif
2574
2575/*
2576 * Like fget() but loads the underlying socket, or returns an error if the
2577 * descriptor does not represent a socket.
2578 *
2579 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2580 * in the future.
2581 *
2582 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2583 * on their file descriptor reference to prevent the socket from being free'd
2584 * during use.
2585 */
2586int
2587fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
2588    u_int *fflagp)
2589{
2590	struct file *fp;
2591	int error;
2592
2593	*spp = NULL;
2594	if (fflagp != NULL)
2595		*fflagp = 0;
2596	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
2597		return (error);
2598	if (fp->f_type != DTYPE_SOCKET) {
2599		error = ENOTSOCK;
2600	} else {
2601		*spp = fp->f_data;
2602		if (fflagp)
2603			*fflagp = fp->f_flag;
2604		SOCK_LOCK(*spp);
2605		soref(*spp);
2606		SOCK_UNLOCK(*spp);
2607	}
2608	fdrop(fp, td);
2609
2610	return (error);
2611}
2612
2613/*
2614 * Drop the reference count on the socket and XXX release the SX lock in the
2615 * future.  The last reference closes the socket.
2616 *
2617 * Note: fputsock() is deprecated, see comment for fgetsock().
2618 */
2619void
2620fputsock(struct socket *so)
2621{
2622
2623	ACCEPT_LOCK();
2624	SOCK_LOCK(so);
2625	CURVNET_SET(so->so_vnet);
2626	sorele(so);
2627	CURVNET_RESTORE();
2628}
2629
2630/*
2631 * Handle the last reference to a file being closed.
2632 */
2633int
2634_fdrop(struct file *fp, struct thread *td)
2635{
2636	int error;
2637
2638	error = 0;
2639	if (fp->f_count != 0)
2640		panic("fdrop: count %d", fp->f_count);
2641	if (fp->f_ops != &badfileops)
2642		error = fo_close(fp, td);
2643	atomic_subtract_int(&openfiles, 1);
2644	crfree(fp->f_cred);
2645	free(fp->f_advice, M_FADVISE);
2646	uma_zfree(file_zone, fp);
2647
2648	return (error);
2649}
2650
2651/*
2652 * Apply an advisory lock on a file descriptor.
2653 *
2654 * Just attempt to get a record lock of the requested type on the entire file
2655 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2656 */
2657#ifndef _SYS_SYSPROTO_H_
2658struct flock_args {
2659	int	fd;
2660	int	how;
2661};
2662#endif
2663/* ARGSUSED */
2664int
2665sys_flock(struct thread *td, struct flock_args *uap)
2666{
2667	struct file *fp;
2668	struct vnode *vp;
2669	struct flock lf;
2670	cap_rights_t rights;
2671	int error;
2672
2673	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
2674	if (error != 0)
2675		return (error);
2676	if (fp->f_type != DTYPE_VNODE) {
2677		fdrop(fp, td);
2678		return (EOPNOTSUPP);
2679	}
2680
2681	vp = fp->f_vnode;
2682	lf.l_whence = SEEK_SET;
2683	lf.l_start = 0;
2684	lf.l_len = 0;
2685	if (uap->how & LOCK_UN) {
2686		lf.l_type = F_UNLCK;
2687		atomic_clear_int(&fp->f_flag, FHASLOCK);
2688		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2689		goto done2;
2690	}
2691	if (uap->how & LOCK_EX)
2692		lf.l_type = F_WRLCK;
2693	else if (uap->how & LOCK_SH)
2694		lf.l_type = F_RDLCK;
2695	else {
2696		error = EBADF;
2697		goto done2;
2698	}
2699	atomic_set_int(&fp->f_flag, FHASLOCK);
2700	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2701	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2702done2:
2703	fdrop(fp, td);
2704	return (error);
2705}
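
/*
 * For reference, the userland call this implements looks like:
 *
 *	if (flock(fd, LOCK_EX | LOCK_NB) == -1)
 *		warn("flock");
 *
 * i.e. a whole-file advisory lock, with LOCK_NB selecting the F_FLOCK
 * path without F_WAIT above.
 */
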
2706/*
2707 * Duplicate the specified descriptor to a free descriptor.
2708 */
2709int
2710dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
2711    int openerror, int *indxp)
2712{
2713	struct file *fp;
2714	int error, indx;
2715
2716	KASSERT(openerror == ENODEV || openerror == ENXIO,
2717	    ("unexpected error %d in %s", openerror, __func__));
2718
2719	/*
2720	 * If the to-be-dup'd fd number is greater than the allowed number
2721	 * of file descriptors, or the fd to be dup'd has already been
2722	 * closed, then reject.
2723	 */
2724	FILEDESC_XLOCK(fdp);
2725	if ((fp = fget_locked(fdp, dfd)) == NULL) {
2726		FILEDESC_XUNLOCK(fdp);
2727		return (EBADF);
2728	}
2729
2730	error = fdalloc(td, 0, &indx);
2731	if (error != 0) {
2732		FILEDESC_XUNLOCK(fdp);
2733		return (error);
2734	}
2735
2736	/*
2737	 * There are two cases of interest here.
2738	 *
2739	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2740	 *
2741	 * For ENXIO steal away the file structure from (dfd) and store it in
2742	 * (indx).  (dfd) is effectively closed by this operation.
2743	 */
2744	switch (openerror) {
2745	case ENODEV:
2746		/*
2747		 * Check that the mode the file is being opened for is a
2748		 * subset of the mode of the existing descriptor.
2749		 */
2750		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
2751			fdunused(fdp, indx);
2752			FILEDESC_XUNLOCK(fdp);
2753			return (EACCES);
2754		}
2755		fhold(fp);
2756		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2757		filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps,
2758		    &fdp->fd_ofiles[indx].fde_caps);
2759		break;
2760	case ENXIO:
2761		/*
2762		 * Steal away the file pointer from dfd and stuff it into indx.
2763		 */
2764		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2765		bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd]));
2766		fdunused(fdp, dfd);
2767		break;
2768	}
2769	FILEDESC_XUNLOCK(fdp);
2770	*indxp = indx;
2771	return (0);
2772}
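
/*
 * dupfdopen() is reached from the open(2) path: a device open routine
 * (such as fdopen() for /dev/fd near the end of this file) records the
 * descriptor to duplicate in td_dupfd and fails with ENODEV or ENXIO,
 * and the open code then calls here to complete the operation on the
 * freshly allocated descriptor slot.
 */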
2773
2774/*
2775 * Scan all active processes and prisons to see if any of them have a current
2776 * or root directory of `olddp'. If so, replace them with the new mount point.
2777 */
2778void
2779mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2780{
2781	struct filedesc *fdp;
2782	struct prison *pr;
2783	struct proc *p;
2784	int nrele;
2785
2786	if (vrefcnt(olddp) == 1)
2787		return;
2788	nrele = 0;
2789	sx_slock(&allproc_lock);
2790	FOREACH_PROC_IN_SYSTEM(p) {
2791		fdp = fdhold(p);
2792		if (fdp == NULL)
2793			continue;
2794		FILEDESC_XLOCK(fdp);
2795		if (fdp->fd_cdir == olddp) {
2796			vref(newdp);
2797			fdp->fd_cdir = newdp;
2798			nrele++;
2799		}
2800		if (fdp->fd_rdir == olddp) {
2801			vref(newdp);
2802			fdp->fd_rdir = newdp;
2803			nrele++;
2804		}
2805		if (fdp->fd_jdir == olddp) {
2806			vref(newdp);
2807			fdp->fd_jdir = newdp;
2808			nrele++;
2809		}
2810		FILEDESC_XUNLOCK(fdp);
2811		fddrop(fdp);
2812	}
2813	sx_sunlock(&allproc_lock);
2814	if (rootvnode == olddp) {
2815		vref(newdp);
2816		rootvnode = newdp;
2817		nrele++;
2818	}
2819	mtx_lock(&prison0.pr_mtx);
2820	if (prison0.pr_root == olddp) {
2821		vref(newdp);
2822		prison0.pr_root = newdp;
2823		nrele++;
2824	}
2825	mtx_unlock(&prison0.pr_mtx);
2826	sx_slock(&allprison_lock);
2827	TAILQ_FOREACH(pr, &allprison, pr_list) {
2828		mtx_lock(&pr->pr_mtx);
2829		if (pr->pr_root == olddp) {
2830			vref(newdp);
2831			pr->pr_root = newdp;
2832			nrele++;
2833		}
2834		mtx_unlock(&pr->pr_mtx);
2835	}
2836	sx_sunlock(&allprison_lock);
2837	while (nrele--)
2838		vrele(olddp);
2839}
2840
2841struct filedesc_to_leader *
2842filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2843{
2844	struct filedesc_to_leader *fdtol;
2845
2846	fdtol = malloc(sizeof(struct filedesc_to_leader),
2847	       M_FILEDESC_TO_LEADER,
2848	       M_WAITOK);
2849	fdtol->fdl_refcount = 1;
2850	fdtol->fdl_holdcount = 0;
2851	fdtol->fdl_wakeup = 0;
2852	fdtol->fdl_leader = leader;
2853	if (old != NULL) {
2854		FILEDESC_XLOCK(fdp);
2855		fdtol->fdl_next = old->fdl_next;
2856		fdtol->fdl_prev = old;
2857		old->fdl_next = fdtol;
2858		fdtol->fdl_next->fdl_prev = fdtol;
2859		FILEDESC_XUNLOCK(fdp);
2860	} else {
2861		fdtol->fdl_next = fdtol;
2862		fdtol->fdl_prev = fdtol;
2863	}
2864	return (fdtol);
2865}
2866
2867/*
2868 * Get file structures globally.
2869 */
2870static int
2871sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2872{
2873	struct xfile xf;
2874	struct filedesc *fdp;
2875	struct file *fp;
2876	struct proc *p;
2877	int error, n;
2878
2879	error = sysctl_wire_old_buffer(req, 0);
2880	if (error != 0)
2881		return (error);
2882	if (req->oldptr == NULL) {
2883		n = 0;
2884		sx_slock(&allproc_lock);
2885		FOREACH_PROC_IN_SYSTEM(p) {
2886			if (p->p_state == PRS_NEW)
2887				continue;
2888			fdp = fdhold(p);
2889			if (fdp == NULL)
2890				continue;
2891			/* overestimates sparse tables. */
2892			if (fdp->fd_lastfile > 0)
2893				n += fdp->fd_lastfile;
2894			fddrop(fdp);
2895		}
2896		sx_sunlock(&allproc_lock);
2897		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2898	}
2899	error = 0;
2900	bzero(&xf, sizeof(xf));
2901	xf.xf_size = sizeof(xf);
2902	sx_slock(&allproc_lock);
2903	FOREACH_PROC_IN_SYSTEM(p) {
2904		PROC_LOCK(p);
2905		if (p->p_state == PRS_NEW) {
2906			PROC_UNLOCK(p);
2907			continue;
2908		}
2909		if (p_cansee(req->td, p) != 0) {
2910			PROC_UNLOCK(p);
2911			continue;
2912		}
2913		xf.xf_pid = p->p_pid;
2914		xf.xf_uid = p->p_ucred->cr_uid;
2915		PROC_UNLOCK(p);
2916		fdp = fdhold(p);
2917		if (fdp == NULL)
2918			continue;
2919		FILEDESC_SLOCK(fdp);
2920		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
2921			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
2922				continue;
2923			xf.xf_fd = n;
2924			xf.xf_file = fp;
2925			xf.xf_data = fp->f_data;
2926			xf.xf_vnode = fp->f_vnode;
2927			xf.xf_type = fp->f_type;
2928			xf.xf_count = fp->f_count;
2929			xf.xf_msgcount = 0;
2930			xf.xf_offset = foffset_get(fp);
2931			xf.xf_flag = fp->f_flag;
2932			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2933			if (error)
2934				break;
2935		}
2936		FILEDESC_SUNLOCK(fdp);
2937		fddrop(fdp);
2938		if (error)
2939			break;
2940	}
2941	sx_sunlock(&allproc_lock);
2942	return (error);
2943}
2944
2945SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
2946    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
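
/*
 * One way a userland consumer can read this node: the buffer is an array
 * of fixed-size struct xfile records (<sys/user.h>), e.g. with error
 * handling omitted:
 *
 *	size_t i, len;
 *	struct xfile *buf;
 *
 *	sysctlbyname("kern.file", NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctlbyname("kern.file", buf, &len, NULL, 0);
 *	for (i = 0; i < len / sizeof(*buf); i++)
 *		...inspect buf[i].xf_pid, buf[i].xf_fd, buf[i].xf_type...
 */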
2947
2948#ifdef KINFO_OFILE_SIZE
2949CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
2950#endif
2951
2952#ifdef COMPAT_FREEBSD7
2953static int
2954export_vnode_for_osysctl(struct vnode *vp, int type,
2955    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
2956{
2957	int error;
2958	char *fullpath, *freepath;
2959
2960	bzero(kif, sizeof(*kif));
2961	kif->kf_structsize = sizeof(*kif);
2962
2963	vref(vp);
2964	kif->kf_fd = type;
2965	kif->kf_type = KF_TYPE_VNODE;
2966	/* This function only handles directories. */
2967	if (vp->v_type != VDIR) {
2968		vrele(vp);
2969		return (ENOTDIR);
2970	}
2971	kif->kf_vnode_type = KF_VTYPE_VDIR;
2972
2973	/*
2974	 * This is not a true file descriptor, so we set a bogus refcount
2975	 * and offset to indicate these fields should be ignored.
2976	 */
2977	kif->kf_ref_count = -1;
2978	kif->kf_offset = -1;
2979
2980	freepath = NULL;
2981	fullpath = "-";
2982	FILEDESC_SUNLOCK(fdp);
2983	vn_fullpath(curthread, vp, &fullpath, &freepath);
2984	vrele(vp);
2985	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2986	if (freepath != NULL)
2987		free(freepath, M_TEMP);
2988	error = SYSCTL_OUT(req, kif, sizeof(*kif));
2989	FILEDESC_SLOCK(fdp);
2990	return (error);
2991}
2992
2993/*
2994 * Get per-process file descriptors for use by procstat(1), et al.
2995 */
2996static int
2997sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
2998{
2999	char *fullpath, *freepath;
3000	struct kinfo_ofile *kif;
3001	struct filedesc *fdp;
3002	int error, i, *name;
3003	struct shmfd *shmfd;
3004	struct socket *so;
3005	struct vnode *vp;
3006	struct ksem *ks;
3007	struct file *fp;
3008	struct proc *p;
3009	struct tty *tp;
3010
3011	name = (int *)arg1;
3012	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3013	if (error != 0)
3014		return (error);
3015	fdp = fdhold(p);
3016	PROC_UNLOCK(p);
3017	if (fdp == NULL)
3018		return (ENOENT);
3019	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3020	FILEDESC_SLOCK(fdp);
3021	if (fdp->fd_cdir != NULL)
3022		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
3023				fdp, req);
3024	if (fdp->fd_rdir != NULL)
3025		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
3026				fdp, req);
3027	if (fdp->fd_jdir != NULL)
3028		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
3029				fdp, req);
3030	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3031		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3032			continue;
3033		bzero(kif, sizeof(*kif));
3034		kif->kf_structsize = sizeof(*kif);
3035		ks = NULL;
3036		vp = NULL;
3037		so = NULL;
3038		tp = NULL;
3039		shmfd = NULL;
3040		kif->kf_fd = i;
3041
3042		switch (fp->f_type) {
3043		case DTYPE_VNODE:
3044			kif->kf_type = KF_TYPE_VNODE;
3045			vp = fp->f_vnode;
3046			break;
3047
3048		case DTYPE_SOCKET:
3049			kif->kf_type = KF_TYPE_SOCKET;
3050			so = fp->f_data;
3051			break;
3052
3053		case DTYPE_PIPE:
3054			kif->kf_type = KF_TYPE_PIPE;
3055			break;
3056
3057		case DTYPE_FIFO:
3058			kif->kf_type = KF_TYPE_FIFO;
3059			vp = fp->f_vnode;
3060			break;
3061
3062		case DTYPE_KQUEUE:
3063			kif->kf_type = KF_TYPE_KQUEUE;
3064			break;
3065
3066		case DTYPE_CRYPTO:
3067			kif->kf_type = KF_TYPE_CRYPTO;
3068			break;
3069
3070		case DTYPE_MQUEUE:
3071			kif->kf_type = KF_TYPE_MQUEUE;
3072			break;
3073
3074		case DTYPE_SHM:
3075			kif->kf_type = KF_TYPE_SHM;
3076			shmfd = fp->f_data;
3077			break;
3078
3079		case DTYPE_SEM:
3080			kif->kf_type = KF_TYPE_SEM;
3081			ks = fp->f_data;
3082			break;
3083
3084		case DTYPE_PTS:
3085			kif->kf_type = KF_TYPE_PTS;
3086			tp = fp->f_data;
3087			break;
3088
3089		case DTYPE_PROCDESC:
3090			kif->kf_type = KF_TYPE_PROCDESC;
3091			break;
3092
3093		default:
3094			kif->kf_type = KF_TYPE_UNKNOWN;
3095			break;
3096		}
3097		kif->kf_ref_count = fp->f_count;
3098		if (fp->f_flag & FREAD)
3099			kif->kf_flags |= KF_FLAG_READ;
3100		if (fp->f_flag & FWRITE)
3101			kif->kf_flags |= KF_FLAG_WRITE;
3102		if (fp->f_flag & FAPPEND)
3103			kif->kf_flags |= KF_FLAG_APPEND;
3104		if (fp->f_flag & FASYNC)
3105			kif->kf_flags |= KF_FLAG_ASYNC;
3106		if (fp->f_flag & FFSYNC)
3107			kif->kf_flags |= KF_FLAG_FSYNC;
3108		if (fp->f_flag & FNONBLOCK)
3109			kif->kf_flags |= KF_FLAG_NONBLOCK;
3110		if (fp->f_flag & O_DIRECT)
3111			kif->kf_flags |= KF_FLAG_DIRECT;
3112		if (fp->f_flag & FHASLOCK)
3113			kif->kf_flags |= KF_FLAG_HASLOCK;
3114		kif->kf_offset = foffset_get(fp);
3115		if (vp != NULL) {
3116			vref(vp);
3117			switch (vp->v_type) {
3118			case VNON:
3119				kif->kf_vnode_type = KF_VTYPE_VNON;
3120				break;
3121			case VREG:
3122				kif->kf_vnode_type = KF_VTYPE_VREG;
3123				break;
3124			case VDIR:
3125				kif->kf_vnode_type = KF_VTYPE_VDIR;
3126				break;
3127			case VBLK:
3128				kif->kf_vnode_type = KF_VTYPE_VBLK;
3129				break;
3130			case VCHR:
3131				kif->kf_vnode_type = KF_VTYPE_VCHR;
3132				break;
3133			case VLNK:
3134				kif->kf_vnode_type = KF_VTYPE_VLNK;
3135				break;
3136			case VSOCK:
3137				kif->kf_vnode_type = KF_VTYPE_VSOCK;
3138				break;
3139			case VFIFO:
3140				kif->kf_vnode_type = KF_VTYPE_VFIFO;
3141				break;
3142			case VBAD:
3143				kif->kf_vnode_type = KF_VTYPE_VBAD;
3144				break;
3145			default:
3146				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
3147				break;
3148			}
3149			/*
3150			 * It is OK to drop the filedesc lock here as we will
3151			 * re-validate and re-evaluate its properties when
3152			 * the loop continues.
3153			 */
3154			freepath = NULL;
3155			fullpath = "-";
3156			FILEDESC_SUNLOCK(fdp);
3157			vn_fullpath(curthread, vp, &fullpath, &freepath);
3158			vrele(vp);
3159			strlcpy(kif->kf_path, fullpath,
3160			    sizeof(kif->kf_path));
3161			if (freepath != NULL)
3162				free(freepath, M_TEMP);
3163			FILEDESC_SLOCK(fdp);
3164		}
3165		if (so != NULL) {
3166			struct sockaddr *sa;
3167
3168			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
3169			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3170				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3171				free(sa, M_SONAME);
3172			}
3173			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
3174			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3175				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3176				free(sa, M_SONAME);
3177			}
3178			kif->kf_sock_domain =
3179			    so->so_proto->pr_domain->dom_family;
3180			kif->kf_sock_type = so->so_type;
3181			kif->kf_sock_protocol = so->so_proto->pr_protocol;
3182		}
3183		if (tp != NULL) {
3184			strlcpy(kif->kf_path, tty_devname(tp),
3185			    sizeof(kif->kf_path));
3186		}
3187		if (shmfd != NULL)
3188			shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
3189		if (ks != NULL && ksem_info != NULL)
3190			ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
3191		error = SYSCTL_OUT(req, kif, sizeof(*kif));
3192		if (error)
3193			break;
3194	}
3195	FILEDESC_SUNLOCK(fdp);
3196	fddrop(fdp);
3197	free(kif, M_TEMP);
3198	return (0);
3199}
3200
3201static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
3202    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
3203    "Process ofiledesc entries");
3204#endif	/* COMPAT_FREEBSD7 */
3205
3206#ifdef KINFO_FILE_SIZE
3207CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
3208#endif
3209
3210struct export_fd_buf {
3211	struct filedesc		*fdp;
3212	struct sbuf 		*sb;
3213	ssize_t			remainder;
3214	struct kinfo_file	kif;
3215};
3216
3217static int
3218export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
3219    int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
3220{
3221	struct {
3222		int	fflag;
3223		int	kf_fflag;
3224	} fflags_table[] = {
3225		{ FAPPEND, KF_FLAG_APPEND },
3226		{ FASYNC, KF_FLAG_ASYNC },
3227		{ FFSYNC, KF_FLAG_FSYNC },
3228		{ FHASLOCK, KF_FLAG_HASLOCK },
3229		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3230		{ FREAD, KF_FLAG_READ },
3231		{ FWRITE, KF_FLAG_WRITE },
3232		{ O_CREAT, KF_FLAG_CREAT },
3233		{ O_DIRECT, KF_FLAG_DIRECT },
3234		{ O_EXCL, KF_FLAG_EXCL },
3235		{ O_EXEC, KF_FLAG_EXEC },
3236		{ O_EXLOCK, KF_FLAG_EXLOCK },
3237		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3238		{ O_SHLOCK, KF_FLAG_SHLOCK },
3239		{ O_TRUNC, KF_FLAG_TRUNC }
3240	};
3241#define	NFFLAGS	(sizeof(fflags_table) / sizeof(*fflags_table))
3242	struct kinfo_file *kif;
3243	struct vnode *vp;
3244	int error, locked;
3245	unsigned int i;
3246
3247	if (efbuf->remainder == 0)
3248		return (0);
3249	kif = &efbuf->kif;
3250	bzero(kif, sizeof(*kif));
3251	locked = efbuf->fdp != NULL;
3252	switch (type) {
3253	case KF_TYPE_FIFO:
3254	case KF_TYPE_VNODE:
3255		if (locked) {
3256			FILEDESC_SUNLOCK(efbuf->fdp);
3257			locked = 0;
3258		}
3259		vp = (struct vnode *)data;
3260		error = fill_vnode_info(vp, kif);
3261		vrele(vp);
3262		break;
3263	case KF_TYPE_SOCKET:
3264		error = fill_socket_info((struct socket *)data, kif);
3265		break;
3266	case KF_TYPE_PIPE:
3267		error = fill_pipe_info((struct pipe *)data, kif);
3268		break;
3269	case KF_TYPE_PTS:
3270		error = fill_pts_info((struct tty *)data, kif);
3271		break;
3272	case KF_TYPE_PROCDESC:
3273		error = fill_procdesc_info((struct procdesc *)data, kif);
3274		break;
3275	case KF_TYPE_SEM:
3276		error = fill_sem_info((struct file *)data, kif);
3277		break;
3278	case KF_TYPE_SHM:
3279		error = fill_shm_info((struct file *)data, kif);
3280		break;
3281	default:
3282		error = 0;
3283	}
3284	if (error == 0)
3285		kif->kf_status |= KF_ATTR_VALID;
3286
3287	/*
3288	 * Translate file access flags.
3289	 */
3290	for (i = 0; i < NFFLAGS; i++)
3291		if (fflags & fflags_table[i].fflag)
3292			kif->kf_flags |=  fflags_table[i].kf_fflag;
3293	if (rightsp != NULL)
3294		kif->kf_cap_rights = *rightsp;
3295	else
3296		cap_rights_init(&kif->kf_cap_rights);
3297	kif->kf_fd = fd;
3298	kif->kf_type = type;
3299	kif->kf_ref_count = refcnt;
3300	kif->kf_offset = offset;
3301	/* Pack record size down */
3302	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3303	    strlen(kif->kf_path) + 1;
3304	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3305	if (efbuf->remainder != -1) {
3306		if (efbuf->remainder < kif->kf_structsize) {
3307			/* Terminate export. */
3308			efbuf->remainder = 0;
3309			if (efbuf->fdp != NULL && !locked)
3310				FILEDESC_SLOCK(efbuf->fdp);
3311			return (0);
3312		}
3313		efbuf->remainder -= kif->kf_structsize;
3314	}
3315	if (locked)
3316		FILEDESC_SUNLOCK(efbuf->fdp);
3317	error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize);
3318	if (efbuf->fdp != NULL)
3319		FILEDESC_SLOCK(efbuf->fdp);
3320	return (error);
3321}
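
/*
 * Because kf_structsize is packed down to cover only the used portion of
 * kf_path, the exported buffer consists of variable-length records; a
 * consumer walks it roughly as follows:
 *
 *	char *p = buf;
 *
 *	while (p < buf + len) {
 *		kif = (struct kinfo_file *)(void *)p;
 *		...inspect kif->kf_fd, kif->kf_type, kif->kf_path...
 *		p += kif->kf_structsize;
 *	}
 */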
3322
3323/*
3324 * Store a process's file descriptor information in an sbuf.
3325 *
3326 * Takes a locked proc as argument, and returns with the proc unlocked.
3327 */
3328int
3329kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
3330{
3331	struct file *fp;
3332	struct filedesc *fdp;
3333	struct export_fd_buf *efbuf;
3334	struct vnode *cttyvp, *textvp, *tracevp;
3335	int64_t offset;
3336	void *data;
3337	int error, i;
3338	int type, refcnt, fflags;
3339	cap_rights_t rights;
3340
3341	PROC_LOCK_ASSERT(p, MA_OWNED);
3342
3343	/* ktrace vnode */
3344	tracevp = p->p_tracevp;
3345	if (tracevp != NULL)
3346		vref(tracevp);
3347	/* text vnode */
3348	textvp = p->p_textvp;
3349	if (textvp != NULL)
3350		vref(textvp);
3351	/* Controlling tty. */
3352	cttyvp = NULL;
3353	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
3354		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
3355		if (cttyvp != NULL)
3356			vref(cttyvp);
3357	}
3358	fdp = fdhold(p);
3359	PROC_UNLOCK(p);
3360	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
3361	efbuf->fdp = NULL;
3362	efbuf->sb = sb;
3363	efbuf->remainder = maxlen;
3364	if (tracevp != NULL)
3365		export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
3366		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3367	if (textvp != NULL)
3368		export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
3369		    FREAD, -1, -1, NULL, efbuf);
3370	if (cttyvp != NULL)
3371		export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
3372		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3373	error = 0;
3374	if (fdp == NULL)
3375		goto fail;
3376	efbuf->fdp = fdp;
3377	FILEDESC_SLOCK(fdp);
3378	/* working directory */
3379	if (fdp->fd_cdir != NULL) {
3380		vref(fdp->fd_cdir);
3381		data = fdp->fd_cdir;
3382		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
3383		    FREAD, -1, -1, NULL, efbuf);
3384	}
3385	/* root directory */
3386	if (fdp->fd_rdir != NULL) {
3387		vref(fdp->fd_rdir);
3388		data = fdp->fd_rdir;
3389		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
3390		    FREAD, -1, -1, NULL, efbuf);
3391	}
3392	/* jail directory */
3393	if (fdp->fd_jdir != NULL) {
3394		vref(fdp->fd_jdir);
3395		data = fdp->fd_jdir;
3396		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
3397		    FREAD, -1, -1, NULL, efbuf);
3398	}
3399	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3400		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3401			continue;
3402		data = NULL;
3403#ifdef CAPABILITIES
3404		rights = *cap_rights(fdp, i);
3405#else /* !CAPABILITIES */
3406		cap_rights_init(&rights);
3407#endif
3408		switch (fp->f_type) {
3409		case DTYPE_VNODE:
3410			type = KF_TYPE_VNODE;
3411			vref(fp->f_vnode);
3412			data = fp->f_vnode;
3413			break;
3414
3415		case DTYPE_SOCKET:
3416			type = KF_TYPE_SOCKET;
3417			data = fp->f_data;
3418			break;
3419
3420		case DTYPE_PIPE:
3421			type = KF_TYPE_PIPE;
3422			data = fp->f_data;
3423			break;
3424
3425		case DTYPE_FIFO:
3426			type = KF_TYPE_FIFO;
3427			vref(fp->f_vnode);
3428			data = fp->f_vnode;
3429			break;
3430
3431		case DTYPE_KQUEUE:
3432			type = KF_TYPE_KQUEUE;
3433			break;
3434
3435		case DTYPE_CRYPTO:
3436			type = KF_TYPE_CRYPTO;
3437			break;
3438
3439		case DTYPE_MQUEUE:
3440			type = KF_TYPE_MQUEUE;
3441			break;
3442
3443		case DTYPE_SHM:
3444			type = KF_TYPE_SHM;
3445			data = fp;
3446			break;
3447
3448		case DTYPE_SEM:
3449			type = KF_TYPE_SEM;
3450			data = fp;
3451			break;
3452
3453		case DTYPE_PTS:
3454			type = KF_TYPE_PTS;
3455			data = fp->f_data;
3456			break;
3457
3458		case DTYPE_PROCDESC:
3459			type = KF_TYPE_PROCDESC;
3460			data = fp->f_data;
3461			break;
3462
3463		default:
3464			type = KF_TYPE_UNKNOWN;
3465			break;
3466		}
3467		refcnt = fp->f_count;
3468		fflags = fp->f_flag;
3469		offset = foffset_get(fp);
3470
3471		/*
3472		 * Create sysctl entry.
3473		 * It is OK to drop the filedesc lock here as we will
3474		 * re-validate and re-evaluate its properties when
3475		 * the loop continues.
3476		 */
3477		error = export_fd_to_sb(data, type, i, fflags, refcnt,
3478		    offset, &rights, efbuf);
3479		if (error != 0)
3480			break;
3481	}
3482	FILEDESC_SUNLOCK(fdp);
3483	fddrop(fdp);
3484fail:
3485	free(efbuf, M_TEMP);
3486	return (error);
3487}
3488
3489#define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
3490
3491/*
3492 * Get per-process file descriptors for use by procstat(1), et al.
3493 */
3494static int
3495sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
3496{
3497	struct sbuf sb;
3498	struct proc *p;
3499	ssize_t maxlen;
3500	int error, error2, *name;
3501
3502	name = (int *)arg1;
3503
3504	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
3505	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3506	if (error != 0) {
3507		sbuf_delete(&sb);
3508		return (error);
3509	}
3510	maxlen = req->oldptr != NULL ? req->oldlen : -1;
3511	error = kern_proc_filedesc_out(p, &sb, maxlen);
3512	error2 = sbuf_finish(&sb);
3513	sbuf_delete(&sb);
3514	return (error != 0 ? error : error2);
3515}
3516
3517int
3518vntype_to_kinfo(int vtype)
3519{
3520	struct {
3521		int	vtype;
3522		int	kf_vtype;
3523	} vtypes_table[] = {
3524		{ VBAD, KF_VTYPE_VBAD },
3525		{ VBLK, KF_VTYPE_VBLK },
3526		{ VCHR, KF_VTYPE_VCHR },
3527		{ VDIR, KF_VTYPE_VDIR },
3528		{ VFIFO, KF_VTYPE_VFIFO },
3529		{ VLNK, KF_VTYPE_VLNK },
3530		{ VNON, KF_VTYPE_VNON },
3531		{ VREG, KF_VTYPE_VREG },
3532		{ VSOCK, KF_VTYPE_VSOCK }
3533	};
3534#define	NVTYPES	(sizeof(vtypes_table) / sizeof(*vtypes_table))
3535	unsigned int i;
3536
3537	/*
3538	 * Perform vtype translation.
3539	 */
3540	for (i = 0; i < NVTYPES; i++)
3541		if (vtypes_table[i].vtype == vtype)
3542			break;
3543	if (i < NVTYPES)
3544		return (vtypes_table[i].kf_vtype);
3545
3546	return (KF_VTYPE_UNKNOWN);
3547}
3548
3549static int
3550fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
3551{
3552	struct vattr va;
3553	char *fullpath, *freepath;
3554	int error;
3555
3556	if (vp == NULL)
3557		return (1);
3558	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
3559	freepath = NULL;
3560	fullpath = "-";
3561	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
3562	if (error == 0) {
3563		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3564	}
3565	if (freepath != NULL)
3566		free(freepath, M_TEMP);
3567
3568	/*
3569	 * Retrieve vnode attributes.
3570	 */
3571	va.va_fsid = VNOVAL;
3572	va.va_rdev = NODEV;
3573	vn_lock(vp, LK_SHARED | LK_RETRY);
3574	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
3575	VOP_UNLOCK(vp, 0);
3576	if (error != 0)
3577		return (error);
3578	if (va.va_fsid != VNOVAL)
3579		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
3580	else
3581		kif->kf_un.kf_file.kf_file_fsid =
3582		    vp->v_mount->mnt_stat.f_fsid.val[0];
3583	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
3584	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
3585	kif->kf_un.kf_file.kf_file_size = va.va_size;
3586	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
3587	return (0);
3588}
3589
3590static int
3591fill_socket_info(struct socket *so, struct kinfo_file *kif)
3592{
3593	struct sockaddr *sa;
3594	struct inpcb *inpcb;
3595	struct unpcb *unpcb;
3596	int error;
3597
3598	if (so == NULL)
3599		return (1);
3600	kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
3601	kif->kf_sock_type = so->so_type;
3602	kif->kf_sock_protocol = so->so_proto->pr_protocol;
3603	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
3604	switch (kif->kf_sock_domain) {
3605	case AF_INET:
3606	case AF_INET6:
3607		if (kif->kf_sock_protocol == IPPROTO_TCP) {
3608			if (so->so_pcb != NULL) {
3609				inpcb = (struct inpcb *)(so->so_pcb);
3610				kif->kf_un.kf_sock.kf_sock_inpcb =
3611				    (uintptr_t)inpcb->inp_ppcb;
3612			}
3613		}
3614		break;
3615	case AF_UNIX:
3616		if (so->so_pcb != NULL) {
3617			unpcb = (struct unpcb *)(so->so_pcb);
3618			if (unpcb->unp_conn) {
3619				kif->kf_un.kf_sock.kf_sock_unpconn =
3620				    (uintptr_t)unpcb->unp_conn;
3621				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
3622				    so->so_rcv.sb_state;
3623				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
3624				    so->so_snd.sb_state;
3625			}
3626		}
3627		break;
3628	}
3629	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
3630	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3631		bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3632		free(sa, M_SONAME);
3633	}
3634	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
3635	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3636		bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3637		free(sa, M_SONAME);
3638	}
3639	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
3640	    sizeof(kif->kf_path));
3641	return (0);
3642}
3643
3644static int
3645fill_pts_info(struct tty *tp, struct kinfo_file *kif)
3646{
3647
3648	if (tp == NULL)
3649		return (1);
3650	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
3651	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
3652	return (0);
3653}
3654
3655static int
3656fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
3657{
3658
3659	if (pi == NULL)
3660		return (1);
3661	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
3662	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
3663	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
3664	return (0);
3665}
3666
3667static int
3668fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
3669{
3670
3671	if (pdp == NULL)
3672		return (1);
3673	kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
3674	return (0);
3675}
3676
3677static int
3678fill_sem_info(struct file *fp, struct kinfo_file *kif)
3679{
3680	struct thread *td;
3681	struct stat sb;
3682
3683	td = curthread;
3684	if (fp->f_data == NULL)
3685		return (1);
3686	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3687		return (1);
3688	if (ksem_info == NULL)
3689		return (1);
3690	ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
3691	    &kif->kf_un.kf_sem.kf_sem_value);
3692	kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
3693	return (0);
3694}
3695
3696static int
3697fill_shm_info(struct file *fp, struct kinfo_file *kif)
3698{
3699	struct thread *td;
3700	struct stat sb;
3701
3702	td = curthread;
3703	if (fp->f_data == NULL)
3704		return (1);
3705	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3706		return (1);
3707	shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
3708	kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
3709	kif->kf_un.kf_file.kf_file_size = sb.st_size;
3710	return (0);
3711}
3712
3713static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
3714    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
3715    "Process filedesc entries");
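
/*
 * This is the node behind procstat(1)'s open-file listing: "procstat -f
 * <pid>" ends up (via libprocstat) querying kern.proc.filedesc.<pid> and
 * decoding the kinfo_file records emitted above.
 */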
3716
3717#ifdef DDB
3718/*
3719 * For the purposes of debugging, generate a human-readable string for the
3720 * file type.
3721 */
3722static const char *
3723file_type_to_name(short type)
3724{
3725
3726	switch (type) {
3727	case 0:
3728		return ("zero");
3729	case DTYPE_VNODE:
3730		return ("vnod");
3731	case DTYPE_SOCKET:
3732		return ("sock");
3733	case DTYPE_PIPE:
3734		return ("pipe");
3735	case DTYPE_FIFO:
3736		return ("fifo");
3737	case DTYPE_KQUEUE:
3738		return ("kque");
3739	case DTYPE_CRYPTO:
3740		return ("crpt");
3741	case DTYPE_MQUEUE:
3742		return ("mque");
3743	case DTYPE_SHM:
3744		return ("shm");
3745	case DTYPE_SEM:
3746		return ("ksem");
3747	default:
3748		return ("unkn");
3749	}
3750}
3751
3752/*
3753 * For the purposes of debugging, identify a process (if any, perhaps one of
3754 * many) that references the passed file in its file descriptor array. Return
3755 * NULL if none.
3756 */
3757static struct proc *
3758file_to_first_proc(struct file *fp)
3759{
3760	struct filedesc *fdp;
3761	struct proc *p;
3762	int n;
3763
3764	FOREACH_PROC_IN_SYSTEM(p) {
3765		if (p->p_state == PRS_NEW)
3766			continue;
3767		fdp = p->p_fd;
3768		if (fdp == NULL)
3769			continue;
3770		for (n = 0; n <= fdp->fd_lastfile; n++) {
3771			if (fp == fdp->fd_ofiles[n].fde_file)
3772				return (p);
3773		}
3774	}
3775	return (NULL);
3776}
3777
3778static void
3779db_print_file(struct file *fp, int header)
3780{
3781	struct proc *p;
3782
3783	if (header)
3784		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3785		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3786		    "MCount", "Vnode", "FPID", "FCmd");
3787	p = file_to_first_proc(fp);
3788	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3789	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3790	    0, fp->f_count, 0, fp->f_vnode,
3791	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3792}
3793
3794DB_SHOW_COMMAND(file, db_show_file)
3795{
3796	struct file *fp;
3797
3798	if (!have_addr) {
3799		db_printf("usage: show file <addr>\n");
3800		return;
3801	}
3802	fp = (struct file *)addr;
3803	db_print_file(fp, 1);
3804}
3805
3806DB_SHOW_COMMAND(files, db_show_files)
3807{
3808	struct filedesc *fdp;
3809	struct file *fp;
3810	struct proc *p;
3811	int header;
3812	int n;
3813
3814	header = 1;
3815	FOREACH_PROC_IN_SYSTEM(p) {
3816		if (p->p_state == PRS_NEW)
3817			continue;
3818		if ((fdp = p->p_fd) == NULL)
3819			continue;
3820		for (n = 0; n <= fdp->fd_lastfile; ++n) {
3821			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
3822				continue;
3823			db_print_file(fp, header);
3824			header = 0;
3825		}
3826	}
3827}
3828#endif
3829
3830SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3831    &maxfilesperproc, 0, "Maximum files allowed open per process");
3832
3833SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3834    &maxfiles, 0, "Maximum number of files");
3835
3836SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3837    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3838
3839/* ARGSUSED*/
3840static void
3841filelistinit(void *dummy)
3842{
3843
3844	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
3845	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3846	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
3847	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
3848}
3849SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3850
3851/*-------------------------------------------------------------------*/
3852
3853static int
3854badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
3855    int flags, struct thread *td)
3856{
3857
3858	return (EBADF);
3859}
3860
3861static int
3862badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
3863    struct thread *td)
3864{
3865
3866	return (EINVAL);
3867}
3868
3869static int
3870badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
3871    struct thread *td)
3872{
3873
3874	return (EBADF);
3875}
3876
3877static int
3878badfo_poll(struct file *fp, int events, struct ucred *active_cred,
3879    struct thread *td)
3880{
3881
3882	return (0);
3883}
3884
3885static int
3886badfo_kqfilter(struct file *fp, struct knote *kn)
3887{
3888
3889	return (EBADF);
3890}
3891
3892static int
3893badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
3894    struct thread *td)
3895{
3896
3897	return (EBADF);
3898}
3899
3900static int
3901badfo_close(struct file *fp, struct thread *td)
3902{
3903
3904	return (EBADF);
3905}
3906
3907static int
3908badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
3909    struct thread *td)
3910{
3911
3912	return (EBADF);
3913}
3914
3915static int
3916badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
3917    struct thread *td)
3918{
3919
3920	return (EBADF);
3921}
3922
3923static int
3924badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
3925    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
3926    int kflags, struct sendfile_sync *sfs, struct thread *td)
3927{
3928
3929	return (EBADF);
3930}
3931
3932struct fileops badfileops = {
3933	.fo_read = badfo_readwrite,
3934	.fo_write = badfo_readwrite,
3935	.fo_truncate = badfo_truncate,
3936	.fo_ioctl = badfo_ioctl,
3937	.fo_poll = badfo_poll,
3938	.fo_kqfilter = badfo_kqfilter,
3939	.fo_stat = badfo_stat,
3940	.fo_close = badfo_close,
3941	.fo_chmod = badfo_chmod,
3942	.fo_chown = badfo_chown,
3943	.fo_sendfile = badfo_sendfile,
3944};
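
/*
 * badfileops is the placeholder for a struct file that has been allocated
 * but not yet initialized with finit(); _fget() and fgetvp_rights() above
 * reject such files with EBADF, so a half-constructed descriptor is not
 * handed out by the fget*() paths.
 */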
3945
3946int
3947invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
3948    struct thread *td)
3949{
3950
3951	return (EINVAL);
3952}
3953
3954int
3955invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
3956    struct thread *td)
3957{
3958
3959	return (EINVAL);
3960}
3961
3962int
3963invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
3964    struct thread *td)
3965{
3966
3967	return (EINVAL);
3968}
3969
3970int
3971invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
3972    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
3973    int kflags, struct sendfile_sync *sfs, struct thread *td)
3974{
3975
3976	return (EINVAL);
3977}
3978
3979/*-------------------------------------------------------------------*/
3980
3981/*
3982 * File Descriptor pseudo-device driver (/dev/fd/).
3983 *
3984 * Opening minor device N dup()s the file (if any) connected to file
3985 * descriptor N belonging to the calling process.  Note that this driver
3986 * consists of only the ``open()'' routine, because all subsequent
3987 * references to this file will be direct to the other driver.
3988 *
3989 * XXX: we could give this one a cloning event handler if necessary.
3990 */
3991
3992/* ARGSUSED */
3993static int
3994fdopen(struct cdev *dev, int mode, int type, struct thread *td)
3995{
3996
3997	/*
3998	 * XXX Kludge: set curthread->td_dupfd to contain the value of
3999	 * the file descriptor being sought for duplication. The error
4000	 * return ensures that the vnode for this device will be released
4001	 * by vn_open. Open will detect this special error and take the
4002	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
4003	 * will simply report the error.
4004	 */
4005	td->td_dupfd = dev2unit(dev);
4006	return (ENODEV);
4007}
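
/*
 * Resulting userland behaviour, roughly: with descriptor 3 open for
 * reading and writing, open("/dev/fd/3", O_RDONLY) returns a new
 * descriptor sharing the same open file, much as dup() on descriptor 3
 * would, whereas asking for a mode the original descriptor lacks fails
 * with EACCES (see the ENODEV case in dupfdopen() above).
 */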
4008
4009static struct cdevsw fildesc_cdevsw = {
4010	.d_version =	D_VERSION,
4011	.d_open =	fdopen,
4012	.d_name =	"FD",
4013};
4014
4015static void
4016fildesc_drvinit(void *unused)
4017{
4018	struct cdev *dev;
4019
4020	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
4021	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
4022	make_dev_alias(dev, "stdin");
4023	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
4024	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
4025	make_dev_alias(dev, "stdout");
4026	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
4027	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
4028	make_dev_alias(dev, "stderr");
4029}
4030
4031SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
4032