kern_descrip.c revision 255892
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 255892 2013-09-26 17:55:04Z jmg $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_ddb.h"
43#include "opt_ktrace.h"
44#include "opt_procdesc.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48
49#include <sys/capability.h>
50#include <sys/conf.h>
51#include <sys/domain.h>
52#include <sys/fcntl.h>
53#include <sys/file.h>
54#include <sys/filedesc.h>
55#include <sys/filio.h>
56#include <sys/jail.h>
57#include <sys/kernel.h>
58#include <sys/ksem.h>
59#include <sys/limits.h>
60#include <sys/lock.h>
61#include <sys/malloc.h>
62#include <sys/mman.h>
63#include <sys/mount.h>
64#include <sys/mqueue.h>
65#include <sys/mutex.h>
66#include <sys/namei.h>
67#include <sys/selinfo.h>
68#include <sys/pipe.h>
69#include <sys/priv.h>
70#include <sys/proc.h>
71#include <sys/procdesc.h>
72#include <sys/protosw.h>
73#include <sys/racct.h>
74#include <sys/resourcevar.h>
75#include <sys/sbuf.h>
76#include <sys/signalvar.h>
77#include <sys/socketvar.h>
78#include <sys/stat.h>
79#include <sys/sx.h>
80#include <sys/syscallsubr.h>
81#include <sys/sysctl.h>
82#include <sys/sysproto.h>
83#include <sys/tty.h>
84#include <sys/unistd.h>
85#include <sys/un.h>
86#include <sys/unpcb.h>
87#include <sys/user.h>
88#include <sys/vnode.h>
89#ifdef KTRACE
90#include <sys/ktrace.h>
91#endif
92
93#include <net/vnet.h>
94
95#include <netinet/in.h>
96#include <netinet/in_pcb.h>
97
98#include <security/audit/audit.h>
99
100#include <vm/uma.h>
101#include <vm/vm.h>
102
103#include <ddb/ddb.h>
104
105static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
106static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
107    "file desc to leader structures");
108static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
109MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
110
111MALLOC_DECLARE(M_FADVISE);
112
113static uma_zone_t file_zone;
114
115void	(*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
116
117static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
118		    struct thread *td, int holdleaders);
119static int	fd_first_free(struct filedesc *fdp, int low, int size);
120static int	fd_last_used(struct filedesc *fdp, int size);
121static void	fdgrowtable(struct filedesc *fdp, int nfd);
122static void	fdunused(struct filedesc *fdp, int fd);
123static void	fdused(struct filedesc *fdp, int fd);
124static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
125static int	fill_procdesc_info(struct procdesc *pdp,
126		    struct kinfo_file *kif);
127static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
128static int	fill_sem_info(struct file *fp, struct kinfo_file *kif);
129static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
130static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
131static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
132
133/*
134 * Each process has:
135 *
136 * - An array of open file descriptors (fd_ofiles)
137 * - An array of file flags (fd_ofileflags)
138 * - A bitmap recording which descriptors are in use (fd_map)
139 *
140 * A process starts out with NDFILE descriptors.  The value of NDFILE has
141 * been selected based on the historical limit of 20 open files, and an
142 * assumption that the majority of processes, especially short-lived
143 * processes like shells, will never need more.
144 *
145 * If this initial allocation is exhausted, a larger descriptor table and
146 * map are allocated dynamically, and the pointers in the process's struct
147 * filedesc are updated to point to those.  This is repeated every time
148 * the process runs out of file descriptors (provided it hasn't hit its
149 * resource limit).
150 *
151 * Since threads may hold references to individual descriptor table
152 * entries, the tables are never freed.  Instead, they are placed on a
153 * linked list and freed only when the struct filedesc is released.
154 */
155#define NDFILE		20
156#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
157#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
158#define NDSLOT(x)	((x) / NDENTRIES)
159#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
160#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
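
/*
 * Worked example (illustrative, not part of the original file): assuming
 * NDSLOTTYPE is the usual 64-bit u_long, NDENTRIES is 64, so descriptor 67
 * lives in bitmap word NDSLOT(67) == 1 at bit NDBIT(67) == (1 << 3), and a
 * table covering 67 descriptors needs NDSLOTS(67) == 2 words.
 */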
161
162/*
163 * SLIST entry used to keep track of ofiles which must be reclaimed when
164 * the process exits.
165 */
166struct freetable {
167	struct filedescent *ft_table;
168	SLIST_ENTRY(freetable) ft_next;
169};
170
171/*
172 * Initial allocation: a filedesc structure + the head of SLIST used to
173 * keep track of old ofiles + enough space for NDFILE descriptors.
174 */
175struct filedesc0 {
176	struct filedesc fd_fd;
177	SLIST_HEAD(, freetable) fd_free;
178	struct	filedescent fd_dfiles[NDFILE];
179	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
180};
181
182/*
183 * Descriptor management.
184 */
185volatile int openfiles;			/* actual number of open files */
186struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
187void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
188
189/* A mutex to protect the association between a proc and filedesc. */
190static struct mtx fdesc_mtx;
191
192/*
193 * If low >= size, just return low. Otherwise find the first zero bit in the
194 * given bitmap, starting at low and not exceeding size - 1. Return size if
195 * not found.
196 */
197static int
198fd_first_free(struct filedesc *fdp, int low, int size)
199{
200	NDSLOTTYPE *map = fdp->fd_map;
201	NDSLOTTYPE mask;
202	int off, maxoff;
203
204	if (low >= size)
205		return (low);
206
207	off = NDSLOT(low);
208	if (low % NDENTRIES) {
209		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
210		if ((mask &= ~map[off]) != 0UL)
211			return (off * NDENTRIES + ffsl(mask) - 1);
212		++off;
213	}
214	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
215		if (map[off] != ~0UL)
216			return (off * NDENTRIES + ffsl(~map[off]) - 1);
217	return (size);
218}
219
220/*
221 * Find the highest non-zero bit in the given bitmap, starting at 0 and
222 * not exceeding size - 1. Return -1 if not found.
223 */
224static int
225fd_last_used(struct filedesc *fdp, int size)
226{
227	NDSLOTTYPE *map = fdp->fd_map;
228	NDSLOTTYPE mask;
229	int off, minoff;
230
231	off = NDSLOT(size);
232	if (size % NDENTRIES) {
233		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
234		if ((mask &= map[off]) != 0)
235			return (off * NDENTRIES + flsl(mask) - 1);
236		--off;
237	}
238	for (minoff = NDSLOT(0); off >= minoff; --off)
239		if (map[off] != 0)
240			return (off * NDENTRIES + flsl(map[off]) - 1);
241	return (-1);
242}
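
/*
 * Example of the two helpers above (illustrative only): with descriptors
 * 0, 1, 2 and 5 in use, fd_first_free(fdp, 0, fdp->fd_nfiles) returns 3 and
 * fd_last_used(fdp, fdp->fd_nfiles) returns 5; these are exactly the values
 * cached in fd_freefile and fd_lastfile by fdused() and fdunused() below.
 */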
243
244static int
245fdisused(struct filedesc *fdp, int fd)
246{
247
248	FILEDESC_LOCK_ASSERT(fdp);
249
250	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
251	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
252
253	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
254}
255
256/*
257 * Mark a file descriptor as used.
258 */
259static void
260fdused(struct filedesc *fdp, int fd)
261{
262
263	FILEDESC_XLOCK_ASSERT(fdp);
264
265	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
266
267	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
268	if (fd > fdp->fd_lastfile)
269		fdp->fd_lastfile = fd;
270	if (fd == fdp->fd_freefile)
271		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
272}
273
274/*
275 * Mark a file descriptor as unused.
276 */
277static void
278fdunused(struct filedesc *fdp, int fd)
279{
280
281	FILEDESC_XLOCK_ASSERT(fdp);
282
283	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
284	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
285	    ("fd=%d is still in use", fd));
286
287	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
288	if (fd < fdp->fd_freefile)
289		fdp->fd_freefile = fd;
290	if (fd == fdp->fd_lastfile)
291		fdp->fd_lastfile = fd_last_used(fdp, fd);
292}
293
294/*
295 * Free a file descriptor.
296 */
297static inline void
298fdfree(struct filedesc *fdp, int fd)
299{
300	struct filedescent *fde;
301
302	fde = &fdp->fd_ofiles[fd];
303	filecaps_free(&fde->fde_caps);
304	bzero(fde, sizeof(*fde));
305	fdunused(fdp, fd);
306}
307
308/*
309 * System calls on descriptors.
310 */
311#ifndef _SYS_SYSPROTO_H_
312struct getdtablesize_args {
313	int	dummy;
314};
315#endif
316/* ARGSUSED */
317int
318sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
319{
320	struct proc *p = td->td_proc;
321	uint64_t lim;
322
323	PROC_LOCK(p);
324	td->td_retval[0] =
325	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
326	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
327	PROC_UNLOCK(p);
328	if (lim < td->td_retval[0])
329		td->td_retval[0] = lim;
330	return (0);
331}
332
333/*
334 * Duplicate a file descriptor to a particular value.
335 *
336 * Note: keep in mind that a potential race condition exists when closing
337 * descriptors from a shared descriptor table (via rfork).
338 */
339#ifndef _SYS_SYSPROTO_H_
340struct dup2_args {
341	u_int	from;
342	u_int	to;
343};
344#endif
345/* ARGSUSED */
346int
347sys_dup2(struct thread *td, struct dup2_args *uap)
348{
349
350	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
351		    td->td_retval));
352}
353
354/*
355 * Duplicate a file descriptor.
356 */
357#ifndef _SYS_SYSPROTO_H_
358struct dup_args {
359	u_int	fd;
360};
361#endif
362/* ARGSUSED */
363int
364sys_dup(struct thread *td, struct dup_args *uap)
365{
366
367	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
368}
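
/*
 * Userland sketch of the two entry points above (illustrative, not part of
 * this file):
 *
 *	fd = dup(logfd);		returns the lowest free descriptor
 *	dup2(logfd, STDOUT_FILENO);	atomically replaces descriptor 1
 *
 * Both land in do_dup() below; dup2() passes DUP_FIXED, dup() does not.
 */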
369
370/*
371 * The file control system call.
372 */
373#ifndef _SYS_SYSPROTO_H_
374struct fcntl_args {
375	int	fd;
376	int	cmd;
377	long	arg;
378};
379#endif
380/* ARGSUSED */
381int
382sys_fcntl(struct thread *td, struct fcntl_args *uap)
383{
384	struct flock fl;
385	struct __oflock ofl;
386	intptr_t arg;
387	int error;
388	int cmd;
389
390	error = 0;
391	cmd = uap->cmd;
392	switch (uap->cmd) {
393	case F_OGETLK:
394	case F_OSETLK:
395	case F_OSETLKW:
396		/*
397		 * Convert old flock structure to new.
398		 */
399		error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
400		fl.l_start = ofl.l_start;
401		fl.l_len = ofl.l_len;
402		fl.l_pid = ofl.l_pid;
403		fl.l_type = ofl.l_type;
404		fl.l_whence = ofl.l_whence;
405		fl.l_sysid = 0;
406
407		switch (uap->cmd) {
408		case F_OGETLK:
409		    cmd = F_GETLK;
410		    break;
411		case F_OSETLK:
412		    cmd = F_SETLK;
413		    break;
414		case F_OSETLKW:
415		    cmd = F_SETLKW;
416		    break;
417		}
418		arg = (intptr_t)&fl;
419		break;
420	case F_GETLK:
421	case F_SETLK:
422	case F_SETLKW:
423	case F_SETLK_REMOTE:
424		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
425		arg = (intptr_t)&fl;
426		break;
427	default:
428		arg = uap->arg;
429		break;
430	}
431	if (error)
432		return (error);
433	error = kern_fcntl(td, uap->fd, cmd, arg);
434	if (error)
435		return (error);
436	if (uap->cmd == F_OGETLK) {
437		ofl.l_start = fl.l_start;
438		ofl.l_len = fl.l_len;
439		ofl.l_pid = fl.l_pid;
440		ofl.l_type = fl.l_type;
441		ofl.l_whence = fl.l_whence;
442		error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
443	} else if (uap->cmd == F_GETLK) {
444		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
445	}
446	return (error);
447}
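
/*
 * Illustrative userland sequence handled by the wrapper above (not part of
 * this file): a whole-file write lock is requested with
 *
 *	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
 *	    .l_start = 0, .l_len = 0 };
 *	fcntl(fd, F_SETLKW, &fl);
 *
 * The structure is copied in above, kern_fcntl() operates on the kernel
 * copy, and for F_GETLK/F_OGETLK the result is copied back to userland.
 */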
448
449int
450kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
451{
452	struct filedesc *fdp;
453	struct flock *flp;
454	struct file *fp, *fp2;
455	struct filedescent *fde;
456	struct proc *p;
457	struct vnode *vp;
458	cap_rights_t rights;
459	int error, flg, tmp;
460	u_int old, new;
461	uint64_t bsize;
462	off_t foffset;
463
464	error = 0;
465	flg = F_POSIX;
466	p = td->td_proc;
467	fdp = p->p_fd;
468
469	switch (cmd) {
470	case F_DUPFD:
471		tmp = arg;
472		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
473		break;
474
475	case F_DUPFD_CLOEXEC:
476		tmp = arg;
477		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
478		    td->td_retval);
479		break;
480
481	case F_DUP2FD:
482		tmp = arg;
483		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
484		break;
485
486	case F_DUP2FD_CLOEXEC:
487		tmp = arg;
488		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
489		    td->td_retval);
490		break;
491
492	case F_GETFD:
493		FILEDESC_SLOCK(fdp);
494		if ((fp = fget_locked(fdp, fd)) == NULL) {
495			FILEDESC_SUNLOCK(fdp);
496			error = EBADF;
497			break;
498		}
499		fde = &fdp->fd_ofiles[fd];
500		td->td_retval[0] =
501		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
502		FILEDESC_SUNLOCK(fdp);
503		break;
504
505	case F_SETFD:
506		FILEDESC_XLOCK(fdp);
507		if ((fp = fget_locked(fdp, fd)) == NULL) {
508			FILEDESC_XUNLOCK(fdp);
509			error = EBADF;
510			break;
511		}
512		fde = &fdp->fd_ofiles[fd];
513		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
514		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
515		FILEDESC_XUNLOCK(fdp);
516		break;
517
518	case F_GETFL:
519		error = fget_unlocked(fdp, fd,
520		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
521		if (error != 0)
522			break;
523		td->td_retval[0] = OFLAGS(fp->f_flag);
524		fdrop(fp, td);
525		break;
526
527	case F_SETFL:
528		error = fget_unlocked(fdp, fd,
529		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
530		if (error != 0)
531			break;
532		do {
533			tmp = flg = fp->f_flag;
534			tmp &= ~FCNTLFLAGS;
535			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
536		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
537		tmp = fp->f_flag & FNONBLOCK;
538		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
539		if (error != 0) {
540			fdrop(fp, td);
541			break;
542		}
543		tmp = fp->f_flag & FASYNC;
544		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
545		if (error == 0) {
546			fdrop(fp, td);
547			break;
548		}
549		atomic_clear_int(&fp->f_flag, FNONBLOCK);
550		tmp = 0;
551		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
552		fdrop(fp, td);
553		break;
554
555	case F_GETOWN:
556		error = fget_unlocked(fdp, fd,
557		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
558		if (error != 0)
559			break;
560		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
561		if (error == 0)
562			td->td_retval[0] = tmp;
563		fdrop(fp, td);
564		break;
565
566	case F_SETOWN:
567		error = fget_unlocked(fdp, fd,
568		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
569		if (error != 0)
570			break;
571		tmp = arg;
572		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
573		fdrop(fp, td);
574		break;
575
576	case F_SETLK_REMOTE:
577		error = priv_check(td, PRIV_NFS_LOCKD);
578		if (error)
579			return (error);
580		flg = F_REMOTE;
581		goto do_setlk;
582
583	case F_SETLKW:
584		flg |= F_WAIT;
585		/* FALLTHROUGH F_SETLK */
586
587	case F_SETLK:
588	do_setlk:
589		cap_rights_init(&rights, CAP_FLOCK);
590		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
591		if (error != 0)
592			break;
593		if (fp->f_type != DTYPE_VNODE) {
594			error = EBADF;
595			fdrop(fp, td);
596			break;
597		}
598
599		flp = (struct flock *)arg;
600		if (flp->l_whence == SEEK_CUR) {
601			foffset = foffset_get(fp);
602			if (foffset < 0 ||
603			    (flp->l_start > 0 &&
604			     foffset > OFF_MAX - flp->l_start)) {
605				FILEDESC_SUNLOCK(fdp);
606				error = EOVERFLOW;
607				fdrop(fp, td);
608				break;
609			}
610			flp->l_start += foffset;
611		}
612
613		vp = fp->f_vnode;
614		switch (flp->l_type) {
615		case F_RDLCK:
616			if ((fp->f_flag & FREAD) == 0) {
617				error = EBADF;
618				break;
619			}
620			PROC_LOCK(p->p_leader);
621			p->p_leader->p_flag |= P_ADVLOCK;
622			PROC_UNLOCK(p->p_leader);
623			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
624			    flp, flg);
625			break;
626		case F_WRLCK:
627			if ((fp->f_flag & FWRITE) == 0) {
628				error = EBADF;
629				break;
630			}
631			PROC_LOCK(p->p_leader);
632			p->p_leader->p_flag |= P_ADVLOCK;
633			PROC_UNLOCK(p->p_leader);
634			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
635			    flp, flg);
636			break;
637		case F_UNLCK:
638			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
639			    flp, flg);
640			break;
641		case F_UNLCKSYS:
642			/*
643			 * Temporary api for testing remote lock
644			 * infrastructure.
645			 */
646			if (flg != F_REMOTE) {
647				error = EINVAL;
648				break;
649			}
650			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
651			    F_UNLCKSYS, flp, flg);
652			break;
653		default:
654			error = EINVAL;
655			break;
656		}
657		if (error != 0 || flp->l_type == F_UNLCK ||
658		    flp->l_type == F_UNLCKSYS) {
659			fdrop(fp, td);
660			break;
661		}
662
663		/*
664		 * Check for a race with close.
665		 *
666		 * The vnode is now advisory locked (or unlocked, but this case
667		 * is not really important) as the caller requested.
668		 * We had to drop the filedesc lock, so we need to recheck if
669		 * the descriptor is still valid, because if it was closed
670		 * in the meantime we need to remove advisory lock from the
671		 * vnode - close on any descriptor leading to an advisory
672		 * locked vnode, removes that lock.
673		 * We will return 0 on purpose in that case, as the result of
674		 * successful advisory lock might have been externally visible
675		 * already. This is fine - effectively we pretend to the caller
676		 * that the closing thread was a bit slower and that the
677		 * advisory lock succeeded before the close.
678		 */
679		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
680		if (error != 0) {
681			fdrop(fp, td);
682			break;
683		}
684		if (fp != fp2) {
685			flp->l_whence = SEEK_SET;
686			flp->l_start = 0;
687			flp->l_len = 0;
688			flp->l_type = F_UNLCK;
689			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
690			    F_UNLCK, flp, F_POSIX);
691		}
692		fdrop(fp, td);
693		fdrop(fp2, td);
694		break;
695
696	case F_GETLK:
697		error = fget_unlocked(fdp, fd,
698		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
699		if (error != 0)
700			break;
701		if (fp->f_type != DTYPE_VNODE) {
702			error = EBADF;
703			fdrop(fp, td);
704			break;
705		}
706		flp = (struct flock *)arg;
707		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
708		    flp->l_type != F_UNLCK) {
709			error = EINVAL;
710			fdrop(fp, td);
711			break;
712		}
713		if (flp->l_whence == SEEK_CUR) {
714			foffset = foffset_get(fp);
715			if ((flp->l_start > 0 &&
716			    foffset > OFF_MAX - flp->l_start) ||
717			    (flp->l_start < 0 &&
718			     foffset < OFF_MIN - flp->l_start)) {
719				FILEDESC_SUNLOCK(fdp);
720				error = EOVERFLOW;
721				fdrop(fp, td);
722				break;
723			}
724			flp->l_start += foffset;
725		}
726		vp = fp->f_vnode;
727		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
728		    F_POSIX);
729		fdrop(fp, td);
730		break;
731
732	case F_RDAHEAD:
733		arg = arg ? 128 * 1024: 0;
734		/* FALLTHROUGH */
735	case F_READAHEAD:
736		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
737		if (error != 0)
738			break;
739		if (fp->f_type != DTYPE_VNODE) {
740			fdrop(fp, td);
741			error = EBADF;
742			break;
743		}
744		if (arg >= 0) {
745			vp = fp->f_vnode;
746			error = vn_lock(vp, LK_SHARED);
747			if (error != 0) {
748				fdrop(fp, td);
749				break;
750			}
751			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
752			VOP_UNLOCK(vp, 0);
753			fp->f_seqcount = (arg + bsize - 1) / bsize;
754			do {
755				new = old = fp->f_flag;
756				new |= FRDAHEAD;
757			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
758		} else {
759			do {
760				new = old = fp->f_flag;
761				new &= ~FRDAHEAD;
762			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
763		}
764		fdrop(fp, td);
765		break;
766
767	default:
768		error = EINVAL;
769		break;
770	}
771	return (error);
772}
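
/*
 * Worked example for the F_RDAHEAD/F_READAHEAD handling above (illustrative
 * only, assuming a filesystem f_iosize of 32768): fcntl(fd, F_READAHEAD,
 * 131072) sets fp->f_seqcount to (131072 + 32767) / 32768 == 4 and turns on
 * FRDAHEAD, while a negative argument clears FRDAHEAD; F_RDAHEAD with a
 * non-zero argument always requests 128 kB of read-ahead.
 */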
773
774/*
775 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
776 */
777int
778do_dup(struct thread *td, int flags, int old, int new,
779    register_t *retval)
780{
781	struct filedesc *fdp;
782	struct filedescent *oldfde, *newfde;
783	struct proc *p;
784	struct file *fp;
785	struct file *delfp;
786	int error, maxfd;
787
788	p = td->td_proc;
789	fdp = p->p_fd;
790
791	/*
792	 * Verify we have a valid descriptor to dup from and possibly to
793	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
794	 * return EINVAL when the new descriptor is out of bounds.
795	 */
796	if (old < 0)
797		return (EBADF);
798	if (new < 0)
799		return (flags & DUP_FCNTL ? EINVAL : EBADF);
800	PROC_LOCK(p);
801	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
802	PROC_UNLOCK(p);
803	if (new >= maxfd)
804		return (flags & DUP_FCNTL ? EINVAL : EBADF);
805
806	FILEDESC_XLOCK(fdp);
807	if (fget_locked(fdp, old) == NULL) {
808		FILEDESC_XUNLOCK(fdp);
809		return (EBADF);
810	}
811	oldfde = &fdp->fd_ofiles[old];
812	if (flags & DUP_FIXED && old == new) {
813		*retval = new;
814		if (flags & DUP_CLOEXEC)
815			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
816		FILEDESC_XUNLOCK(fdp);
817		return (0);
818	}
819	fp = oldfde->fde_file;
820	fhold(fp);
821
822	/*
823	 * If the caller specified a file descriptor, make sure the file
824	 * table is large enough to hold it, and grab it.  Otherwise, just
825	 * allocate a new descriptor the usual way.
826	 */
827	if (flags & DUP_FIXED) {
828		if (new >= fdp->fd_nfiles) {
829			/*
830			 * The resource limits are here instead of e.g.
831			 * fdalloc(), because the file descriptor table may be
832			 * shared between processes, so we can't really use
833			 * racct_add()/racct_sub().  Instead of counting the
834			 * number of actually allocated descriptors, just put
835			 * the limit on the size of the file descriptor table.
836			 */
837#ifdef RACCT
838			PROC_LOCK(p);
839			error = racct_set(p, RACCT_NOFILE, new + 1);
840			PROC_UNLOCK(p);
841			if (error != 0) {
842				FILEDESC_XUNLOCK(fdp);
843				fdrop(fp, td);
844				return (EMFILE);
845			}
846#endif
847			fdgrowtable(fdp, new + 1);
848			oldfde = &fdp->fd_ofiles[old];
849		}
850		newfde = &fdp->fd_ofiles[new];
851		if (newfde->fde_file == NULL)
852			fdused(fdp, new);
853	} else {
854		if ((error = fdalloc(td, new, &new)) != 0) {
855			FILEDESC_XUNLOCK(fdp);
856			fdrop(fp, td);
857			return (error);
858		}
859		newfde = &fdp->fd_ofiles[new];
860	}
861
862	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
863	KASSERT(old != new, ("new fd is same as old"));
864
865	delfp = newfde->fde_file;
866
867	/*
868	 * Duplicate the source descriptor.
869	 */
870	*newfde = *oldfde;
871	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
872	if ((flags & DUP_CLOEXEC) != 0)
873		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
874	else
875		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
876	if (new > fdp->fd_lastfile)
877		fdp->fd_lastfile = new;
878	*retval = new;
879
880	if (delfp != NULL) {
881		(void) closefp(fdp, new, delfp, td, 1);
882		/* closefp() drops the FILEDESC lock for us. */
883	} else {
884		FILEDESC_XUNLOCK(fdp);
885	}
886
887	return (0);
888}
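
/*
 * Behavioural sketch of the cases handled above (illustrative only): on a
 * valid descriptor, dup2(3, 3) simply returns 3 (the DUP_FIXED, old == new
 * path); dup2(3, n) for n beyond the RLIMIT_NOFILE/maxfilesperproc bound
 * fails with EBADF, whereas fcntl(3, F_DUPFD, n) fails with EINVAL for the
 * same condition, as the comment above notes.
 */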
889
890/*
891 * If sigio is on the list associated with a process or process group,
892 * disable signalling from the device, remove sigio from the list and
893 * free sigio.
894 */
895void
896funsetown(struct sigio **sigiop)
897{
898	struct sigio *sigio;
899
900	SIGIO_LOCK();
901	sigio = *sigiop;
902	if (sigio == NULL) {
903		SIGIO_UNLOCK();
904		return;
905	}
906	*(sigio->sio_myref) = NULL;
907	if ((sigio)->sio_pgid < 0) {
908		struct pgrp *pg = (sigio)->sio_pgrp;
909		PGRP_LOCK(pg);
910		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
911			     sigio, sio_pgsigio);
912		PGRP_UNLOCK(pg);
913	} else {
914		struct proc *p = (sigio)->sio_proc;
915		PROC_LOCK(p);
916		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
917			     sigio, sio_pgsigio);
918		PROC_UNLOCK(p);
919	}
920	SIGIO_UNLOCK();
921	crfree(sigio->sio_ucred);
922	free(sigio, M_SIGIO);
923}
924
925/*
926 * Free a list of sigio structures.
927 * We only need to lock the SIGIO_LOCK because we have made ourselves
928 * inaccessible to callers of fsetown and therefore do not need to lock
929 * the proc or pgrp struct for the list manipulation.
930 */
931void
932funsetownlst(struct sigiolst *sigiolst)
933{
934	struct proc *p;
935	struct pgrp *pg;
936	struct sigio *sigio;
937
938	sigio = SLIST_FIRST(sigiolst);
939	if (sigio == NULL)
940		return;
941	p = NULL;
942	pg = NULL;
943
944	/*
945	 * Every entry of the list should belong
946	 * to a single proc or pgrp.
947	 */
948	if (sigio->sio_pgid < 0) {
949		pg = sigio->sio_pgrp;
950		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
951	} else /* if (sigio->sio_pgid > 0) */ {
952		p = sigio->sio_proc;
953		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
954	}
955
956	SIGIO_LOCK();
957	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
958		*(sigio->sio_myref) = NULL;
959		if (pg != NULL) {
960			KASSERT(sigio->sio_pgid < 0,
961			    ("Proc sigio in pgrp sigio list"));
962			KASSERT(sigio->sio_pgrp == pg,
963			    ("Bogus pgrp in sigio list"));
964			PGRP_LOCK(pg);
965			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
966			    sio_pgsigio);
967			PGRP_UNLOCK(pg);
968		} else /* if (p != NULL) */ {
969			KASSERT(sigio->sio_pgid > 0,
970			    ("Pgrp sigio in proc sigio list"));
971			KASSERT(sigio->sio_proc == p,
972			    ("Bogus proc in sigio list"));
973			PROC_LOCK(p);
974			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
975			    sio_pgsigio);
976			PROC_UNLOCK(p);
977		}
978		SIGIO_UNLOCK();
979		crfree(sigio->sio_ucred);
980		free(sigio, M_SIGIO);
981		SIGIO_LOCK();
982	}
983	SIGIO_UNLOCK();
984}
985
986/*
987 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
988 *
989 * After permission checking, add a sigio structure to the sigio list for
990 * the process or process group.
991 */
992int
993fsetown(pid_t pgid, struct sigio **sigiop)
994{
995	struct proc *proc;
996	struct pgrp *pgrp;
997	struct sigio *sigio;
998	int ret;
999
1000	if (pgid == 0) {
1001		funsetown(sigiop);
1002		return (0);
1003	}
1004
1005	ret = 0;
1006
1007	/* Allocate and fill in the new sigio outside of any locks. */
1008	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
1009	sigio->sio_pgid = pgid;
1010	sigio->sio_ucred = crhold(curthread->td_ucred);
1011	sigio->sio_myref = sigiop;
1012
1013	sx_slock(&proctree_lock);
1014	if (pgid > 0) {
1015		proc = pfind(pgid);
1016		if (proc == NULL) {
1017			ret = ESRCH;
1018			goto fail;
1019		}
1020
1021		/*
1022		 * Policy - Don't allow a process to FSETOWN a process
1023		 * in another session.
1024		 *
1025		 * Remove this test to allow maximum flexibility or
1026		 * restrict FSETOWN to the current process or process
1027		 * group for maximum safety.
1028		 */
1029		PROC_UNLOCK(proc);
1030		if (proc->p_session != curthread->td_proc->p_session) {
1031			ret = EPERM;
1032			goto fail;
1033		}
1034
1035		pgrp = NULL;
1036	} else /* if (pgid < 0) */ {
1037		pgrp = pgfind(-pgid);
1038		if (pgrp == NULL) {
1039			ret = ESRCH;
1040			goto fail;
1041		}
1042		PGRP_UNLOCK(pgrp);
1043
1044		/*
1045		 * Policy - Don't allow a process to FSETOWN a process
1046		 * in another session.
1047		 *
1048		 * Remove this test to allow maximum flexibility or
1049		 * restrict FSETOWN to the current process or process
1050		 * group for maximum safety.
1051		 */
1052		if (pgrp->pg_session != curthread->td_proc->p_session) {
1053			ret = EPERM;
1054			goto fail;
1055		}
1056
1057		proc = NULL;
1058	}
1059	funsetown(sigiop);
1060	if (pgid > 0) {
1061		PROC_LOCK(proc);
1062		/*
1063		 * Since funsetownlst() is called without the proctree
1064		 * locked, we need to check for P_WEXIT.
1065		 * XXX: is ESRCH correct?
1066		 */
1067		if ((proc->p_flag & P_WEXIT) != 0) {
1068			PROC_UNLOCK(proc);
1069			ret = ESRCH;
1070			goto fail;
1071		}
1072		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
1073		sigio->sio_proc = proc;
1074		PROC_UNLOCK(proc);
1075	} else {
1076		PGRP_LOCK(pgrp);
1077		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
1078		sigio->sio_pgrp = pgrp;
1079		PGRP_UNLOCK(pgrp);
1080	}
1081	sx_sunlock(&proctree_lock);
1082	SIGIO_LOCK();
1083	*sigiop = sigio;
1084	SIGIO_UNLOCK();
1085	return (0);
1086
1087fail:
1088	sx_sunlock(&proctree_lock);
1089	crfree(sigio->sio_ucred);
1090	free(sigio, M_SIGIO);
1091	return (ret);
1092}
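
/*
 * Usage sketch (illustrative, not part of this file):
 *
 *	fcntl(fd, F_SETOWN, getpid());	deliver SIGIO to this process
 *	fcntl(fd, F_SETOWN, -pgid);	deliver SIGIO to a process group
 *
 * Both reach fsetown() via the FIOSETOWN ioctl path in kern_fcntl(); the
 * session checks above reject targets outside the caller's session.
 */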
1093
1094/*
1095 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1096 */
1097pid_t
1098fgetown(sigiop)
1099	struct sigio **sigiop;
1100{
1101	pid_t pgid;
1102
1103	SIGIO_LOCK();
1104	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1105	SIGIO_UNLOCK();
1106	return (pgid);
1107}
1108
1109/*
1110 * Function drops the filedesc lock on return.
1111 */
1112static int
1113closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1114    int holdleaders)
1115{
1116	int error;
1117
1118	FILEDESC_XLOCK_ASSERT(fdp);
1119
1120	if (holdleaders) {
1121		if (td->td_proc->p_fdtol != NULL) {
1122			/*
1123			 * Ask fdfree() to sleep to ensure that all relevant
1124			 * process leaders can be traversed in closef().
1125			 */
1126			fdp->fd_holdleaderscount++;
1127		} else {
1128			holdleaders = 0;
1129		}
1130	}
1131
1132	/*
1133	 * We now hold the fp reference that used to be owned by the
1134	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1135	 * knote_fdclose to prevent a race of the fd getting opened, a knote
1136	 * added, and deleting a knote for the new fd.
1137	 */
1138	knote_fdclose(td, fd);
1139
1140	/*
1141	 * We need to notify mqueue if the object is of type mqueue.
1142	 */
1143	if (fp->f_type == DTYPE_MQUEUE)
1144		mq_fdclose(td, fd, fp);
1145	FILEDESC_XUNLOCK(fdp);
1146
1147	error = closef(fp, td);
1148	if (holdleaders) {
1149		FILEDESC_XLOCK(fdp);
1150		fdp->fd_holdleaderscount--;
1151		if (fdp->fd_holdleaderscount == 0 &&
1152		    fdp->fd_holdleaderswakeup != 0) {
1153			fdp->fd_holdleaderswakeup = 0;
1154			wakeup(&fdp->fd_holdleaderscount);
1155		}
1156		FILEDESC_XUNLOCK(fdp);
1157	}
1158	return (error);
1159}
1160
1161/*
1162 * Close a file descriptor.
1163 */
1164#ifndef _SYS_SYSPROTO_H_
1165struct close_args {
1166	int     fd;
1167};
1168#endif
1169/* ARGSUSED */
1170int
1171sys_close(td, uap)
1172	struct thread *td;
1173	struct close_args *uap;
1174{
1175
1176	return (kern_close(td, uap->fd));
1177}
1178
1179int
1180kern_close(td, fd)
1181	struct thread *td;
1182	int fd;
1183{
1184	struct filedesc *fdp;
1185	struct file *fp;
1186
1187	fdp = td->td_proc->p_fd;
1188
1189	AUDIT_SYSCLOSE(td, fd);
1190
1191	FILEDESC_XLOCK(fdp);
1192	if ((fp = fget_locked(fdp, fd)) == NULL) {
1193		FILEDESC_XUNLOCK(fdp);
1194		return (EBADF);
1195	}
1196	fdfree(fdp, fd);
1197
1198	/* closefp() drops the FILEDESC lock for us. */
1199	return (closefp(fdp, fd, fp, td, 1));
1200}
1201
1202/*
1203 * Close open file descriptors.
1204 */
1205#ifndef _SYS_SYSPROTO_H_
1206struct closefrom_args {
1207	int	lowfd;
1208};
1209#endif
1210/* ARGSUSED */
1211int
1212sys_closefrom(struct thread *td, struct closefrom_args *uap)
1213{
1214	struct filedesc *fdp;
1215	int fd;
1216
1217	fdp = td->td_proc->p_fd;
1218	AUDIT_ARG_FD(uap->lowfd);
1219
1220	/*
1221	 * Treat negative starting file descriptor values identically to
1222	 * closefrom(0), which closes all files.
1223	 */
1224	if (uap->lowfd < 0)
1225		uap->lowfd = 0;
1226	FILEDESC_SLOCK(fdp);
1227	for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
1228		if (fdp->fd_ofiles[fd].fde_file != NULL) {
1229			FILEDESC_SUNLOCK(fdp);
1230			(void)kern_close(td, fd);
1231			FILEDESC_SLOCK(fdp);
1232		}
1233	}
1234	FILEDESC_SUNLOCK(fdp);
1235	return (0);
1236}
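
/*
 * Illustrative userland use (not part of this file): daemons typically call
 * closefrom(2) once stdin, stdout and stderr are set up so that no inherited
 * descriptors leak across a later exec; every descriptor at or above lowfd
 * that the loop above finds open is closed through kern_close().
 */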
1237
1238#if defined(COMPAT_43)
1239/*
1240 * Return status information about a file descriptor.
1241 */
1242#ifndef _SYS_SYSPROTO_H_
1243struct ofstat_args {
1244	int	fd;
1245	struct	ostat *sb;
1246};
1247#endif
1248/* ARGSUSED */
1249int
1250ofstat(struct thread *td, struct ofstat_args *uap)
1251{
1252	struct ostat oub;
1253	struct stat ub;
1254	int error;
1255
1256	error = kern_fstat(td, uap->fd, &ub);
1257	if (error == 0) {
1258		cvtstat(&ub, &oub);
1259		error = copyout(&oub, uap->sb, sizeof(oub));
1260	}
1261	return (error);
1262}
1263#endif /* COMPAT_43 */
1264
1265/*
1266 * Return status information about a file descriptor.
1267 */
1268#ifndef _SYS_SYSPROTO_H_
1269struct fstat_args {
1270	int	fd;
1271	struct	stat *sb;
1272};
1273#endif
1274/* ARGSUSED */
1275int
1276sys_fstat(struct thread *td, struct fstat_args *uap)
1277{
1278	struct stat ub;
1279	int error;
1280
1281	error = kern_fstat(td, uap->fd, &ub);
1282	if (error == 0)
1283		error = copyout(&ub, uap->sb, sizeof(ub));
1284	return (error);
1285}
1286
1287int
1288kern_fstat(struct thread *td, int fd, struct stat *sbp)
1289{
1290	struct file *fp;
1291	cap_rights_t rights;
1292	int error;
1293
1294	AUDIT_ARG_FD(fd);
1295
1296	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
1297	if (error != 0)
1298		return (error);
1299
1300	AUDIT_ARG_FILE(td->td_proc, fp);
1301
1302	error = fo_stat(fp, sbp, td->td_ucred, td);
1303	fdrop(fp, td);
1304#ifdef KTRACE
1305	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1306		ktrstat(sbp);
1307#endif
1308	return (error);
1309}
1310
1311/*
1312 * Return status information about a file descriptor.
1313 */
1314#ifndef _SYS_SYSPROTO_H_
1315struct nfstat_args {
1316	int	fd;
1317	struct	nstat *sb;
1318};
1319#endif
1320/* ARGSUSED */
1321int
1322sys_nfstat(struct thread *td, struct nfstat_args *uap)
1323{
1324	struct nstat nub;
1325	struct stat ub;
1326	int error;
1327
1328	error = kern_fstat(td, uap->fd, &ub);
1329	if (error == 0) {
1330		cvtnstat(&ub, &nub);
1331		error = copyout(&nub, uap->sb, sizeof(nub));
1332	}
1333	return (error);
1334}
1335
1336/*
1337 * Return pathconf information about a file descriptor.
1338 */
1339#ifndef _SYS_SYSPROTO_H_
1340struct fpathconf_args {
1341	int	fd;
1342	int	name;
1343};
1344#endif
1345/* ARGSUSED */
1346int
1347sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1348{
1349	struct file *fp;
1350	struct vnode *vp;
1351	cap_rights_t rights;
1352	int error;
1353
1354	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
1355	if (error != 0)
1356		return (error);
1357
1358	/* If asynchronous I/O is available, it works for all descriptors. */
1359	if (uap->name == _PC_ASYNC_IO) {
1360		td->td_retval[0] = async_io_version;
1361		goto out;
1362	}
1363	vp = fp->f_vnode;
1364	if (vp != NULL) {
1365		vn_lock(vp, LK_SHARED | LK_RETRY);
1366		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1367		VOP_UNLOCK(vp, 0);
1368	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1369		if (uap->name != _PC_PIPE_BUF) {
1370			error = EINVAL;
1371		} else {
1372			td->td_retval[0] = PIPE_BUF;
1373			error = 0;
1374		}
1375	} else {
1376		error = EOPNOTSUPP;
1377	}
1378out:
1379	fdrop(fp, td);
1380	return (error);
1381}
1382
1383/*
1384 * Initialize filecaps structure.
1385 */
1386void
1387filecaps_init(struct filecaps *fcaps)
1388{
1389
1390	bzero(fcaps, sizeof(*fcaps));
1391	fcaps->fc_nioctls = -1;
1392}
1393
1394/*
1395 * Copy a filecaps structure, allocating memory for ioctls array if needed.
1396 */
1397void
1398filecaps_copy(const struct filecaps *src, struct filecaps *dst)
1399{
1400	size_t size;
1401
1402	*dst = *src;
1403	if (src->fc_ioctls != NULL) {
1404		KASSERT(src->fc_nioctls > 0,
1405		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1406
1407		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1408		dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1409		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1410	}
1411}
1412
1413/*
1414 * Move filecaps structure to the new place and clear the old place.
1415 */
1416void
1417filecaps_move(struct filecaps *src, struct filecaps *dst)
1418{
1419
1420	*dst = *src;
1421	bzero(src, sizeof(*src));
1422}
1423
1424/*
1425 * Fill the given filecaps structure with full rights.
1426 */
1427static void
1428filecaps_fill(struct filecaps *fcaps)
1429{
1430
1431	CAP_ALL(&fcaps->fc_rights);
1432	fcaps->fc_ioctls = NULL;
1433	fcaps->fc_nioctls = -1;
1434	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1435}
1436
1437/*
1438 * Free memory allocated within filecaps structure.
1439 */
1440void
1441filecaps_free(struct filecaps *fcaps)
1442{
1443
1444	free(fcaps->fc_ioctls, M_FILECAPS);
1445	bzero(fcaps, sizeof(*fcaps));
1446}
1447
1448/*
1449 * Validate the given filecaps structure.
1450 */
1451static void
1452filecaps_validate(const struct filecaps *fcaps, const char *func)
1453{
1454
1455	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
1456	    ("%s: invalid rights", func));
1457	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1458	    ("%s: invalid fcntls", func));
1459	KASSERT(fcaps->fc_fcntls == 0 ||
1460	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
1461	    ("%s: fcntls without CAP_FCNTL", func));
1462	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1463	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1464	    ("%s: invalid ioctls", func));
1465	KASSERT(fcaps->fc_nioctls == 0 ||
1466	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
1467	    ("%s: ioctls without CAP_IOCTL", func));
1468}
1469
1470/*
1471 * Grow the file table to accommodate (at least) nfd descriptors.
1472 */
1473static void
1474fdgrowtable(struct filedesc *fdp, int nfd)
1475{
1476	struct filedesc0 *fdp0;
1477	struct freetable *ft;
1478	struct filedescent *ntable;
1479	struct filedescent *otable;
1480	int nnfiles, onfiles;
1481	NDSLOTTYPE *nmap, *omap;
1482
1483	FILEDESC_XLOCK_ASSERT(fdp);
1484
1485	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
1486
1487	/* save old values */
1488	onfiles = fdp->fd_nfiles;
1489	otable = fdp->fd_ofiles;
1490	omap = fdp->fd_map;
1491
1492	/* compute the size of the new table */
1493	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1494	if (nnfiles <= onfiles)
1495		/* the table is already large enough */
1496		return;
1497
1498	/*
1499	 * Allocate a new table and map.  We need enough space for the
1500	 * file entries themselves and the struct freetable we will use
1501	 * when we decommission the table and place it on the freelist.
1502	 * We place the struct freetable in the middle so we don't have
1503	 * to worry about padding.
1504	 */
1505	ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
1506	    M_FILEDESC, M_ZERO | M_WAITOK);
1507	nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
1508	    M_ZERO | M_WAITOK);
1509
1510	/* copy the old data over and point at the new tables */
1511	memcpy(ntable, otable, onfiles * sizeof(*otable));
1512	memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
1513
1514	/* update the pointers and counters */
1516	fdp->fd_ofiles = ntable;
1517	fdp->fd_map = nmap;
1518
1519	/*
1520	 * In order to have a valid pattern for fget_unlocked()
1521	 * fdp->fd_nfiles must be the last member to be updated, otherwise
1522	 * fget_unlocked() consumers may reference a new, higher value for
1523	 * fdp->fd_nfiles before accessing the fdp->fd_ofiles array,
1524	 * resulting in OOB accesses.
1525	 */
1526	atomic_store_rel_int(&fdp->fd_nfiles, nnfiles);
1527
1528	/*
1529	 * Do not free the old file table, as some threads may still
1530	 * reference entries within it.  Instead, place it on a freelist
1531	 * which will be processed when the struct filedesc is released.
1532	 *
1533	 * Do, however, free the old map.
1534	 *
1535	 * Note that if onfiles == NDFILE, we're dealing with the original
1536	 * static allocation contained within (struct filedesc0 *)fdp,
1537	 * which must not be freed.
1538	 */
1539	if (onfiles > NDFILE) {
1540		ft = (struct freetable *)&otable[onfiles];
1541		fdp0 = (struct filedesc0 *)fdp;
1542		ft->ft_table = otable;
1543		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
1544		free(omap, M_FILEDESC);
1545	}
1546}
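
/*
 * Worked example (illustrative, assuming a 64-bit NDSLOTTYPE): a process
 * still on its static 20-entry table that dup2()s onto descriptor 300
 * (within its resource limits) gets here with nfd == 301, so NDSLOTS(301)
 * is 5 and the new table holds 5 * 64 == 320 entries.  Because onfiles ==
 * NDFILE, the old table and map are the static arrays embedded in struct
 * filedesc0 and are left alone; the pointers are switched to the new
 * allocations and fd_nfiles is finally bumped to 320.
 */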
1547
1548/*
1549 * Allocate a file descriptor for the process.
1550 */
1551int
1552fdalloc(struct thread *td, int minfd, int *result)
1553{
1554	struct proc *p = td->td_proc;
1555	struct filedesc *fdp = p->p_fd;
1556	int fd = -1, maxfd, allocfd;
1557#ifdef RACCT
1558	int error;
1559#endif
1560
1561	FILEDESC_XLOCK_ASSERT(fdp);
1562
1563	if (fdp->fd_freefile > minfd)
1564		minfd = fdp->fd_freefile;
1565
1566	PROC_LOCK(p);
1567	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1568	PROC_UNLOCK(p);
1569
1570	/*
1571	 * Search the bitmap for a free descriptor starting at minfd.
1572	 * If none is found, grow the file table.
1573	 */
1574	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1575	if (fd >= maxfd)
1576		return (EMFILE);
1577	if (fd >= fdp->fd_nfiles) {
1578		allocfd = min(fd * 2, maxfd);
1579#ifdef RACCT
1580		PROC_LOCK(p);
1581		error = racct_set(p, RACCT_NOFILE, allocfd);
1582		PROC_UNLOCK(p);
1583		if (error != 0)
1584			return (EMFILE);
1585#endif
1586		/*
1587		 * fd is already equal to first free descriptor >= minfd, so
1588		 * we only need to grow the table and we are done.
1589		 */
1590		fdgrowtable(fdp, allocfd);
1591	}
1592
1593	/*
1594	 * Perform some sanity checks, then mark the file descriptor as
1595	 * used and return it to the caller.
1596	 */
1597	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
1598	    ("invalid descriptor %d", fd));
1599	KASSERT(!fdisused(fdp, fd),
1600	    ("fd_first_free() returned non-free descriptor"));
1601	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
1602	    ("file descriptor isn't free"));
1603	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
1604	fdused(fdp, fd);
1605	*result = fd;
1606	return (0);
1607}
1608
1609/*
1610 * Allocate n file descriptors for the process.
1611 */
1612int
1613fdallocn(struct thread *td, int minfd, int *fds, int n)
1614{
1615	struct proc *p = td->td_proc;
1616	struct filedesc *fdp = p->p_fd;
1617	int i;
1618
1619	FILEDESC_XLOCK_ASSERT(fdp);
1620
1621	if (!fdavail(td, n))
1622		return (EMFILE);
1623
1624	for (i = 0; i < n; i++)
1625		if (fdalloc(td, 0, &fds[i]) != 0)
1626			break;
1627
1628	if (i < n) {
1629		for (i--; i >= 0; i--)
1630			fdunused(fdp, fds[i]);
1631		return (EMFILE);
1632	}
1633
1634	return (0);
1635}
1636
1637/*
1638 * Check to see whether n user file descriptors are available to the process
1639 * p.
1640 */
1641int
1642fdavail(struct thread *td, int n)
1643{
1644	struct proc *p = td->td_proc;
1645	struct filedesc *fdp = td->td_proc->p_fd;
1646	int i, lim, last;
1647
1648	FILEDESC_LOCK_ASSERT(fdp);
1649
1650	/*
1651	 * XXX: This is only called from uipc_usrreq.c:unp_externalize();
1652	 *      call racct_add() from there instead of dealing with containers
1653	 *      here.
1654	 */
1655	PROC_LOCK(p);
1656	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1657	PROC_UNLOCK(p);
1658	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1659		return (1);
1660	last = min(fdp->fd_nfiles, lim);
1661	for (i = fdp->fd_freefile; i < last; i++) {
1662		if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
1663			return (1);
1664	}
1665	return (0);
1666}
1667
1668/*
1669 * Create a new open file structure and allocate a file descriptor for the
1670 * process that refers to it.  We add one reference to the file for the
1671 * descriptor table and one reference for resultfp.  This is to prevent us
1672 * from being preempted and the entry in the descriptor table closed after we
1673 * release the FILEDESC lock.
1674 */
1675int
1676falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1677{
1678	struct file *fp;
1679	int error, fd;
1680
1681	error = falloc_noinstall(td, &fp);
1682	if (error)
1683		return (error);		/* no reference held on error */
1684
1685	error = finstall(td, fp, &fd, flags, NULL);
1686	if (error) {
1687		fdrop(fp, td);		/* one reference (fp only) */
1688		return (error);
1689	}
1690
1691	if (resultfp != NULL)
1692		*resultfp = fp;		/* copy out result */
1693	else
1694		fdrop(fp, td);		/* release local reference */
1695
1696	if (resultfd != NULL)
1697		*resultfd = fd;
1698
1699	return (0);
1700}
1701
1702/*
1703 * Create a new open file structure without allocating a file descriptor.
1704 */
1705int
1706falloc_noinstall(struct thread *td, struct file **resultfp)
1707{
1708	struct file *fp;
1709	int maxuserfiles = maxfiles - (maxfiles / 20);
1710	static struct timeval lastfail;
1711	static int curfail;
1712
1713	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1714
1715	if ((openfiles >= maxuserfiles &&
1716	    priv_check(td, PRIV_MAXFILES) != 0) ||
1717	    openfiles >= maxfiles) {
1718		if (ppsratecheck(&lastfail, &curfail, 1)) {
1719			printf("kern.maxfiles limit exceeded by uid %i, "
1720			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1721		}
1722		return (ENFILE);
1723	}
1724	atomic_add_int(&openfiles, 1);
1725	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1726	refcount_init(&fp->f_count, 1);
1727	fp->f_cred = crhold(td->td_ucred);
1728	fp->f_ops = &badfileops;
1729	fp->f_data = NULL;
1730	fp->f_vnode = NULL;
1731	*resultfp = fp;
1732	return (0);
1733}
1734
1735/*
1736 * Install a file in a file descriptor table.
1737 */
1738int
1739finstall(struct thread *td, struct file *fp, int *fd, int flags,
1740    struct filecaps *fcaps)
1741{
1742	struct filedesc *fdp = td->td_proc->p_fd;
1743	struct filedescent *fde;
1744	int error;
1745
1746	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1747	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1748	if (fcaps != NULL)
1749		filecaps_validate(fcaps, __func__);
1750
1751	FILEDESC_XLOCK(fdp);
1752	if ((error = fdalloc(td, 0, fd))) {
1753		FILEDESC_XUNLOCK(fdp);
1754		return (error);
1755	}
1756	fhold(fp);
1757	fde = &fdp->fd_ofiles[*fd];
1758	fde->fde_file = fp;
1759	if ((flags & O_CLOEXEC) != 0)
1760		fde->fde_flags |= UF_EXCLOSE;
1761	if (fcaps != NULL)
1762		filecaps_move(fcaps, &fde->fde_caps);
1763	else
1764		filecaps_fill(&fde->fde_caps);
1765	FILEDESC_XUNLOCK(fdp);
1766	return (0);
1767}
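
/*
 * Sketch of a typical in-kernel consumer of the falloc_noinstall() and
 * finstall() pair above.  This block is illustrative only and is not
 * compiled; DTYPE_FOO, foo_softc and fooops stand in for a hypothetical
 * file type's own constants, private data and struct fileops.
 */
#if 0
static int
foo_create_fd(struct thread *td, void *foo_softc, int *fdout)
{
	struct file *fp;
	int error, fd;

	error = falloc_noinstall(td, &fp);	/* one reference on fp */
	if (error != 0)
		return (error);
	finit(fp, FREAD | FWRITE, DTYPE_FOO, foo_softc, &fooops);
	error = finstall(td, fp, &fd, 0, NULL);	/* table takes a second ref */
	fdrop(fp, td);				/* drop the local reference */
	if (error != 0)
		return (error);
	*fdout = fd;
	return (0);
}
#endif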
1768
1769/*
1770 * Build a new filedesc structure from another.
1771 * Copy the current, root, and jail root vnode references.
1772 */
1773struct filedesc *
1774fdinit(struct filedesc *fdp)
1775{
1776	struct filedesc0 *newfdp;
1777
1778	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1779	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1780	if (fdp != NULL) {
1781		FILEDESC_XLOCK(fdp);
1782		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1783		if (newfdp->fd_fd.fd_cdir)
1784			VREF(newfdp->fd_fd.fd_cdir);
1785		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1786		if (newfdp->fd_fd.fd_rdir)
1787			VREF(newfdp->fd_fd.fd_rdir);
1788		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1789		if (newfdp->fd_fd.fd_jdir)
1790			VREF(newfdp->fd_fd.fd_jdir);
1791		FILEDESC_XUNLOCK(fdp);
1792	}
1793
1794	/* Create the file descriptor table. */
1795	newfdp->fd_fd.fd_refcnt = 1;
1796	newfdp->fd_fd.fd_holdcnt = 1;
1797	newfdp->fd_fd.fd_cmask = CMASK;
1798	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1799	newfdp->fd_fd.fd_nfiles = NDFILE;
1800	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1801	newfdp->fd_fd.fd_lastfile = -1;
1802	return (&newfdp->fd_fd);
1803}
1804
1805static struct filedesc *
1806fdhold(struct proc *p)
1807{
1808	struct filedesc *fdp;
1809
1810	mtx_lock(&fdesc_mtx);
1811	fdp = p->p_fd;
1812	if (fdp != NULL)
1813		fdp->fd_holdcnt++;
1814	mtx_unlock(&fdesc_mtx);
1815	return (fdp);
1816}
1817
1818static void
1819fddrop(struct filedesc *fdp)
1820{
1821	struct filedesc0 *fdp0;
1822	struct freetable *ft;
1823	int i;
1824
1825	mtx_lock(&fdesc_mtx);
1826	i = --fdp->fd_holdcnt;
1827	mtx_unlock(&fdesc_mtx);
1828	if (i > 0)
1829		return;
1830
1831	FILEDESC_LOCK_DESTROY(fdp);
1832	fdp0 = (struct filedesc0 *)fdp;
1833	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1834		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1835		free(ft->ft_table, M_FILEDESC);
1836	}
1837	free(fdp, M_FILEDESC);
1838}
1839
1840/*
1841 * Share a filedesc structure.
1842 */
1843struct filedesc *
1844fdshare(struct filedesc *fdp)
1845{
1846
1847	FILEDESC_XLOCK(fdp);
1848	fdp->fd_refcnt++;
1849	FILEDESC_XUNLOCK(fdp);
1850	return (fdp);
1851}
1852
1853/*
1854 * Unshare a filedesc structure, if necessary by making a copy
1855 */
1856void
1857fdunshare(struct proc *p, struct thread *td)
1858{
1859
1860	FILEDESC_XLOCK(p->p_fd);
1861	if (p->p_fd->fd_refcnt > 1) {
1862		struct filedesc *tmp;
1863
1864		FILEDESC_XUNLOCK(p->p_fd);
1865		tmp = fdcopy(p->p_fd);
1866		fdescfree(td);
1867		p->p_fd = tmp;
1868	} else
1869		FILEDESC_XUNLOCK(p->p_fd);
1870}
1871
1872/*
1873 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference;
1874 * this is to ease callers, not catch errors.
1875 */
1876struct filedesc *
1877fdcopy(struct filedesc *fdp)
1878{
1879	struct filedesc *newfdp;
1880	struct filedescent *nfde, *ofde;
1881	int i;
1882
1883	/* Certain daemons might not have file descriptors. */
1884	if (fdp == NULL)
1885		return (NULL);
1886
1887	newfdp = fdinit(fdp);
1888	FILEDESC_SLOCK(fdp);
1889	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1890		FILEDESC_SUNLOCK(fdp);
1891		FILEDESC_XLOCK(newfdp);
1892		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1893		FILEDESC_XUNLOCK(newfdp);
1894		FILEDESC_SLOCK(fdp);
1895	}
1896	/* copy all passable descriptors (i.e. not kqueue) */
1897	newfdp->fd_freefile = -1;
1898	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1899		ofde = &fdp->fd_ofiles[i];
1900		if (fdisused(fdp, i) &&
1901		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
1902		    ofde->fde_file->f_ops != &badfileops) {
1903			nfde = &newfdp->fd_ofiles[i];
1904			*nfde = *ofde;
1905			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
1906			fhold(nfde->fde_file);
1907			newfdp->fd_lastfile = i;
1908		} else {
1909			if (newfdp->fd_freefile == -1)
1910				newfdp->fd_freefile = i;
1911		}
1912	}
1913	newfdp->fd_cmask = fdp->fd_cmask;
1914	FILEDESC_SUNLOCK(fdp);
1915	FILEDESC_XLOCK(newfdp);
1916	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
1917		if (newfdp->fd_ofiles[i].fde_file != NULL)
1918			fdused(newfdp, i);
1919	}
1920	if (newfdp->fd_freefile == -1)
1921		newfdp->fd_freefile = i;
1922	FILEDESC_XUNLOCK(newfdp);
1923	return (newfdp);
1924}
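
/*
 * Illustrative relationship to fork(2)/rfork(2), not spelled out in this
 * file: fork1() roughly uses fdinit() for RFCFDG, fdcopy() when RFFDG is
 * passed (the plain fork(2) case), and fdshare() otherwise, so a child ends
 * up with a fresh, a copied, or a reference-counted shared descriptor table
 * respectively.
 */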
1925
1926/*
1927 * Release a filedesc structure.
1928 */
1929void
1930fdescfree(struct thread *td)
1931{
1932	struct filedesc *fdp;
1933	int i;
1934	struct filedesc_to_leader *fdtol;
1935	struct file *fp;
1936	struct vnode *cdir, *jdir, *rdir, *vp;
1937	struct flock lf;
1938
1939	/* Certain daemons might not have file descriptors. */
1940	fdp = td->td_proc->p_fd;
1941	if (fdp == NULL)
1942		return;
1943
1944#ifdef RACCT
1945	PROC_LOCK(td->td_proc);
1946	racct_set(td->td_proc, RACCT_NOFILE, 0);
1947	PROC_UNLOCK(td->td_proc);
1948#endif
1949
1950	/* Check for special need to clear POSIX style locks */
1951	fdtol = td->td_proc->p_fdtol;
1952	if (fdtol != NULL) {
1953		FILEDESC_XLOCK(fdp);
1954		KASSERT(fdtol->fdl_refcount > 0,
1955		    ("filedesc_to_refcount botch: fdl_refcount=%d",
1956		    fdtol->fdl_refcount));
1957		if (fdtol->fdl_refcount == 1 &&
1958		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1959			for (i = 0; i <= fdp->fd_lastfile; i++) {
1960				fp = fdp->fd_ofiles[i].fde_file;
1961				if (fp == NULL || fp->f_type != DTYPE_VNODE)
1962					continue;
1963				fhold(fp);
1964				FILEDESC_XUNLOCK(fdp);
1965				lf.l_whence = SEEK_SET;
1966				lf.l_start = 0;
1967				lf.l_len = 0;
1968				lf.l_type = F_UNLCK;
1969				vp = fp->f_vnode;
1970				(void) VOP_ADVLOCK(vp,
1971				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
1972				    &lf, F_POSIX);
1973				FILEDESC_XLOCK(fdp);
1974				fdrop(fp, td);
1975			}
1976		}
1977	retry:
1978		if (fdtol->fdl_refcount == 1) {
1979			if (fdp->fd_holdleaderscount > 0 &&
1980			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1981				/*
1982				 * close() or do_dup() has cleared a reference
1983				 * in a shared file descriptor table.
1984				 */
1985				fdp->fd_holdleaderswakeup = 1;
1986				sx_sleep(&fdp->fd_holdleaderscount,
1987				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
1988				goto retry;
1989			}
1990			if (fdtol->fdl_holdcount > 0) {
1991				/*
1992				 * Ensure that fdtol->fdl_leader remains
1993				 * valid in closef().
1994				 */
1995				fdtol->fdl_wakeup = 1;
1996				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
1997				    "fdlhold", 0);
1998				goto retry;
1999			}
2000		}
2001		fdtol->fdl_refcount--;
2002		if (fdtol->fdl_refcount == 0 &&
2003		    fdtol->fdl_holdcount == 0) {
2004			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
2005			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
2006		} else
2007			fdtol = NULL;
2008		td->td_proc->p_fdtol = NULL;
2009		FILEDESC_XUNLOCK(fdp);
2010		if (fdtol != NULL)
2011			free(fdtol, M_FILEDESC_TO_LEADER);
2012	}
2013	FILEDESC_XLOCK(fdp);
2014	i = --fdp->fd_refcnt;
2015	FILEDESC_XUNLOCK(fdp);
2016	if (i > 0)
2017		return;
2018
2019	for (i = 0; i <= fdp->fd_lastfile; i++) {
2020		fp = fdp->fd_ofiles[i].fde_file;
2021		if (fp != NULL) {
2022			FILEDESC_XLOCK(fdp);
2023			fdfree(fdp, i);
2024			FILEDESC_XUNLOCK(fdp);
2025			(void) closef(fp, td);
2026		}
2027	}
2028	FILEDESC_XLOCK(fdp);
2029
2030	/* XXX This should happen earlier. */
2031	mtx_lock(&fdesc_mtx);
2032	td->td_proc->p_fd = NULL;
2033	mtx_unlock(&fdesc_mtx);
2034
2035	if (fdp->fd_nfiles > NDFILE)
2036		free(fdp->fd_ofiles, M_FILEDESC);
2037	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
2038		free(fdp->fd_map, M_FILEDESC);
2039
2040	fdp->fd_nfiles = 0;
2041
2042	cdir = fdp->fd_cdir;
2043	fdp->fd_cdir = NULL;
2044	rdir = fdp->fd_rdir;
2045	fdp->fd_rdir = NULL;
2046	jdir = fdp->fd_jdir;
2047	fdp->fd_jdir = NULL;
2048	FILEDESC_XUNLOCK(fdp);
2049
2050	if (cdir != NULL)
2051		vrele(cdir);
2052	if (rdir != NULL)
2053		vrele(rdir);
2054	if (jdir != NULL)
2055		vrele(jdir);
2056
2057	fddrop(fdp);
2058}
2059
2060/*
2061 * For setugid programs, we don't want people to use that setugidness
2062 * to generate error messages which write to a file which would
2063 * otherwise be off-limits to the process.  We check for filesystems where
2064 * the vnode can change out from under us after execve (like [lin]procfs).
2065 *
2066 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
2067 * sufficient.  We also don't check for setugidness since we know we are.
2068 */
2069static int
2070is_unsafe(struct file *fp)
2071{
2072	if (fp->f_type == DTYPE_VNODE) {
2073		struct vnode *vp = fp->f_vnode;
2074
2075		if ((vp->v_vflag & VV_PROCDEP) != 0)
2076			return (1);
2077	}
2078	return (0);
2079}
2080
2081/*
2082 * Make this setguid thing safe, if at all possible.
2083 */
2084void
2085setugidsafety(struct thread *td)
2086{
2087	struct filedesc *fdp;
2088	struct file *fp;
2089	int i;
2090
2091	/* Certain daemons might not have file descriptors. */
2092	fdp = td->td_proc->p_fd;
2093	if (fdp == NULL)
2094		return;
2095
2096	/*
2097	 * Note: fdp->fd_ofiles may be reallocated out from under us while
2098	 * we are blocked in a close.  Be careful!
2099	 */
2100	FILEDESC_XLOCK(fdp);
2101	for (i = 0; i <= fdp->fd_lastfile; i++) {
2102		if (i > 2)
2103			break;
2104		fp = fdp->fd_ofiles[i].fde_file;
2105		if (fp != NULL && is_unsafe(fp)) {
2106			knote_fdclose(td, i);
2107			/*
2108			 * NULL-out descriptor prior to close to avoid
2109			 * a race while close blocks.
2110			 */
2111			fdfree(fdp, i);
2112			FILEDESC_XUNLOCK(fdp);
2113			(void) closef(fp, td);
2114			FILEDESC_XLOCK(fdp);
2115		}
2116	}
2117	FILEDESC_XUNLOCK(fdp);
2118}
2119
2120/*
2121 * If a specific file object occupies a specific file descriptor, close the
2122 * file descriptor entry and drop a reference on the file object.  This is a
2123 * convenience function for handling a subsequent error in a function that
2124 * calls falloc(); it handles the race where another thread may have closed
2125 * the file descriptor out from under the thread creating the file object.
2126 */
2127void
2128fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2129{
2130
2131	FILEDESC_XLOCK(fdp);
2132	if (fdp->fd_ofiles[idx].fde_file == fp) {
2133		fdfree(fdp, idx);
2134		FILEDESC_XUNLOCK(fdp);
2135		fdrop(fp, td);
2136	} else
2137		FILEDESC_XUNLOCK(fdp);
2138}
2139
2140/*
2141 * Close any files marked close-on-exec, as well as mqueue descriptors, on exec.
2142 */
2143void
2144fdcloseexec(struct thread *td)
2145{
2146	struct filedesc *fdp;
2147	struct filedescent *fde;
2148	struct file *fp;
2149	int i;
2150
2151	/* Certain daemons might not have file descriptors. */
2152	fdp = td->td_proc->p_fd;
2153	if (fdp == NULL)
2154		return;
2155
2156	/*
2157	 * We cannot cache fd_ofiles since operations
2158	 * may block and rip them out from under us.
2159	 */
2160	FILEDESC_XLOCK(fdp);
2161	for (i = 0; i <= fdp->fd_lastfile; i++) {
2162		fde = &fdp->fd_ofiles[i];
2163		fp = fde->fde_file;
2164		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2165		    (fde->fde_flags & UF_EXCLOSE))) {
2166			fdfree(fdp, i);
2167			(void) closefp(fdp, i, fp, td, 0);
2168			/* closefp() drops the FILEDESC lock. */
2169			FILEDESC_XLOCK(fdp);
2170		}
2171	}
2172	FILEDESC_XUNLOCK(fdp);
2173}
2174
2175/*
2176 * It is unsafe for set[ug]id processes to be started with file
2177 * descriptors 0..2 closed, as these descriptors are given implicit
2178 * significance in the Standard C library.  fdcheckstd() will create a
2179 * descriptor referencing /dev/null for each of stdin, stdout, and
2180 * stderr that is not already open.
2181 */
2182int
2183fdcheckstd(struct thread *td)
2184{
2185	struct filedesc *fdp;
2186	register_t retval, save;
2187	int i, error, devnull;
2188
2189	fdp = td->td_proc->p_fd;
2190	if (fdp == NULL)
2191		return (0);
2192	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2193	devnull = -1;
2194	error = 0;
2195	for (i = 0; i < 3; i++) {
2196		if (fdp->fd_ofiles[i].fde_file != NULL)
2197			continue;
2198		if (devnull < 0) {
2199			save = td->td_retval[0];
2200			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2201			    O_RDWR, 0);
2202			devnull = td->td_retval[0];
2203			td->td_retval[0] = save;
2204			if (error)
2205				break;
2206			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2207		} else {
2208			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2209			if (error != 0)
2210				break;
2211		}
2212	}
2213	return (error);
2214}
2215
2216/*
2217 * Internal form of close.  Decrement reference count on file structure.
2218 * Note: td may be NULL when closing a file that was being passed in a
2219 * message.
2220 *
2221 * XXXRW: Giant is not required for the caller, but often will be held; this
2222 * makes it moderately likely that Giant will be recursed in the VFS case.
2223 */
2224int
2225closef(struct file *fp, struct thread *td)
2226{
2227	struct vnode *vp;
2228	struct flock lf;
2229	struct filedesc_to_leader *fdtol;
2230	struct filedesc *fdp;
2231
2232	/*
2233	 * POSIX record locking dictates that any close releases ALL
2234	 * locks owned by this process.  This is handled by setting
2235	 * a flag in the unlock to free ONLY locks obeying POSIX
2236	 * semantics, and not to free BSD-style file locks.
2237	 * If the descriptor was in a message, POSIX-style locks
2238	 * aren't passed with the descriptor, and the thread pointer
2239	 * will be NULL.  Callers should be careful only to pass a
2240	 * NULL thread pointer when there really is no owning
2241	 * context that might have locks, or the locks will be
2242	 * leaked.
2243	 */
2244	if (fp->f_type == DTYPE_VNODE && td != NULL) {
2245		vp = fp->f_vnode;
2246		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2247			lf.l_whence = SEEK_SET;
2248			lf.l_start = 0;
2249			lf.l_len = 0;
2250			lf.l_type = F_UNLCK;
2251			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2252			    F_UNLCK, &lf, F_POSIX);
2253		}
2254		fdtol = td->td_proc->p_fdtol;
2255		if (fdtol != NULL) {
2256			/*
2257			 * Handle special case where file descriptor table is
2258			 * shared between multiple process leaders.
2259			 */
2260			fdp = td->td_proc->p_fd;
2261			FILEDESC_XLOCK(fdp);
2262			for (fdtol = fdtol->fdl_next;
2263			     fdtol != td->td_proc->p_fdtol;
2264			     fdtol = fdtol->fdl_next) {
2265				if ((fdtol->fdl_leader->p_flag &
2266				     P_ADVLOCK) == 0)
2267					continue;
2268				fdtol->fdl_holdcount++;
2269				FILEDESC_XUNLOCK(fdp);
2270				lf.l_whence = SEEK_SET;
2271				lf.l_start = 0;
2272				lf.l_len = 0;
2273				lf.l_type = F_UNLCK;
2274				vp = fp->f_vnode;
2275				(void) VOP_ADVLOCK(vp,
2276				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
2277				    F_POSIX);
2278				FILEDESC_XLOCK(fdp);
2279				fdtol->fdl_holdcount--;
2280				if (fdtol->fdl_holdcount == 0 &&
2281				    fdtol->fdl_wakeup != 0) {
2282					fdtol->fdl_wakeup = 0;
2283					wakeup(fdtol);
2284				}
2285			}
2286			FILEDESC_XUNLOCK(fdp);
2287		}
2288	}
2289	return (fdrop(fp, td));
2290}
2291
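/*
 * Illustrative sketch (not part of this file): the unlock-on-any-close
 * behaviour handled above is what makes a userland sequence such as the
 * following (with a hypothetical path and struct flock) lose its lock:
 *
 *	fd1 = open("/tmp/somefile", O_RDWR);
 *	fd2 = open("/tmp/somefile", O_RDWR);
 *	fcntl(fd1, F_SETLK, &wrlock);	set a POSIX write lock via fd1
 *	close(fd2);			all POSIX locks the process holds
 *					on the file are released
 *
 * BSD-style flock() locks, by contrast, follow the file object and are
 * released only when its last reference goes away.
 */
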
2292/*
2293 * Initialize the file pointer with the specified properties.
2294 *
2295 * The ops are set with release semantics to be certain that the flags, type,
2296 * and data are visible when ops is.  This is to prevent ops methods from being
2297 * called with bad data.
2298 */
2299void
2300finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2301{
2302	fp->f_data = data;
2303	fp->f_flag = flag;
2304	fp->f_type = type;
2305	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2306}
2307
2308int
2309fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
2310    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
2311{
2312	struct file *fp;
2313	u_int count;
2314#ifdef CAPABILITIES
2315	cap_rights_t haverights;
2316	int error;
2317#endif
2318
2319	/*
2320	 * Avoid read reordering so that the first access to the
2321	 * fdp->fd_ofiles table cannot be an out-of-bounds operation.
2322	 */
2323	if (fd < 0 || fd >= atomic_load_acq_int(&fdp->fd_nfiles))
2324		return (EBADF);
2325	/*
2326	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
2327	 * never raising a refcount above 0.  To accomplish this we have
2328	 * to use a cmpset loop rather than an atomic_add.  The descriptor
2329	 * must be re-verified once we acquire a reference to be certain
2330	 * that the identity is still correct and we did not lose a race
2331	 * due to preemption.
2332	 */
2333	for (;;) {
2334		fp = fdp->fd_ofiles[fd].fde_file;
2335		if (fp == NULL)
2336			return (EBADF);
2337#ifdef CAPABILITIES
2338		haverights = *cap_rights(fdp, fd);
2339		if (needrightsp != NULL) {
2340			error = cap_check(&haverights, needrightsp);
2341			if (error != 0)
2342				return (error);
2343			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
2344				error = cap_fcntl_check(fdp, fd, needfcntl);
2345				if (error != 0)
2346					return (error);
2347			}
2348		}
2349#endif
2350		count = fp->f_count;
2351		if (count == 0)
2352			continue;
2353		/*
2354		 * Use an acquire barrier to prevent caching of fd_ofiles
2355		 * so it is refreshed for verification.
2356		 */
2357		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
2358			continue;
2359		if (fp == fdp->fd_ofiles[fd].fde_file)
2360			break;
2361		fdrop(fp, curthread);
2362	}
2363	*fpp = fp;
2364	if (haverightsp != NULL) {
2365#ifdef CAPABILITIES
2366		*haverightsp = haverights;
2367#else
2368		CAP_ALL(haverightsp);
2369#endif
2370	}
2371	return (0);
2372}
2373
2374/*
2375 * Extract the file pointer associated with the specified descriptor for the
2376 * current user process.
2377 *
2378 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2379 * returned.
2380 *
2381 * File's rights will be checked against the capability rights mask.
2382 *
2383 * If an error occurred, the non-zero error is returned and *fpp is set to
2384 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2385 * responsible for fdrop().
2386 */
2387static __inline int
2388_fget(struct thread *td, int fd, struct file **fpp, int flags,
2389    cap_rights_t *needrightsp, u_char *maxprotp)
2390{
2391	struct filedesc *fdp;
2392	struct file *fp;
2393	cap_rights_t haverights, needrights;
2394	int error;
2395
2396	*fpp = NULL;
2397	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2398		return (EBADF);
2399	if (needrightsp != NULL)
2400		needrights = *needrightsp;
2401	else
2402		cap_rights_init(&needrights);
2403	if (maxprotp != NULL)
2404		cap_rights_set(&needrights, CAP_MMAP);
2405	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
2406	if (error != 0)
2407		return (error);
2408	if (fp->f_ops == &badfileops) {
2409		fdrop(fp, td);
2410		return (EBADF);
2411	}
2412
2413#ifdef CAPABILITIES
2414	/*
2415	 * If requested, convert capability rights to access flags.
2416	 */
2417	if (maxprotp != NULL)
2418		*maxprotp = cap_rights_to_vmprot(&haverights);
2419#else /* !CAPABILITIES */
2420	if (maxprotp != NULL)
2421		*maxprotp = VM_PROT_ALL;
2422#endif /* CAPABILITIES */
2423
2424	/*
2425	 * FREAD and FWRITE failure return EBADF as per POSIX.
2426	 */
2427	error = 0;
2428	switch (flags) {
2429	case FREAD:
2430	case FWRITE:
2431		if ((fp->f_flag & flags) == 0)
2432			error = EBADF;
2433		break;
2434	case FEXEC:
2435		if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
2436		    ((fp->f_flag & FWRITE) != 0))
2437			error = EBADF;
2438		break;
2439	case 0:
2440		break;
2441	default:
2442		KASSERT(0, ("wrong flags"));
2443	}
2444
2445	if (error != 0) {
2446		fdrop(fp, td);
2447		return (error);
2448	}
2449
2450	*fpp = fp;
2451	return (0);
2452}
2453
2454int
2455fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2456{
2457
2458	return (_fget(td, fd, fpp, 0, rightsp, NULL));
2459}
2460
2461int
2462fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
2463    struct file **fpp)
2464{
2465
2466	return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
2467}
2468
2469int
2470fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2471{
2472
2473	return (_fget(td, fd, fpp, FREAD, rightsp, NULL));
2474}
2475
2476int
2477fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2478{
2479
2480	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
2481}
2482
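/*
 * Illustrative sketch (not part of this file): a typical in-kernel consumer
 * of the fget() family pairs each successful call with fdrop(), e.g.:
 *
 *	cap_rights_t rights;
 *	struct file *fp;
 *	int error;
 *
 *	error = fget(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
 *	if (error != 0)
 *		return (error);
 *	... operate on fp ...
 *	fdrop(fp, td);
 *
 * CAP_READ is just an example right here; callers request whatever rights
 * they actually need.
 */
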
2483/*
2484 * Like fget() but loads the underlying vnode, or returns an error if the
2485 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2486 * never have VM objects.  The returned vnode will be vref()'d.
2487 *
2488 * XXX: what about the unused flags?
2489 */
2490static __inline int
2491_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
2492    struct vnode **vpp)
2493{
2494	struct file *fp;
2495	int error;
2496
2497	*vpp = NULL;
2498	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
2499	if (error != 0)
2500		return (error);
2501	if (fp->f_vnode == NULL) {
2502		error = EINVAL;
2503	} else {
2504		*vpp = fp->f_vnode;
2505		vref(*vpp);
2506	}
2507	fdrop(fp, td);
2508
2509	return (error);
2510}
2511
2512int
2513fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2514{
2515
2516	return (_fgetvp(td, fd, 0, rightsp, vpp));
2517}
2518
2519int
2520fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
2521    struct filecaps *havecaps, struct vnode **vpp)
2522{
2523	struct filedesc *fdp;
2524	struct file *fp;
2525#ifdef CAPABILITIES
2526	int error;
2527#endif
2528
2529	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2530		return (EBADF);
2531
2532	fp = fget_locked(fdp, fd);
2533	if (fp == NULL || fp->f_ops == &badfileops)
2534		return (EBADF);
2535
2536#ifdef CAPABILITIES
2537	if (needrightsp != NULL) {
2538		error = cap_check(cap_rights(fdp, fd), needrightsp);
2539		if (error != 0)
2540			return (error);
2541	}
2542#endif
2543
2544	if (fp->f_vnode == NULL)
2545		return (EINVAL);
2546
2547	*vpp = fp->f_vnode;
2548	vref(*vpp);
2549	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
2550
2551	return (0);
2552}
2553
2554int
2555fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2556{
2557
2558	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
2559}
2560
2561int
2562fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2563{
2564
2565	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
2566}
2567
2568#ifdef notyet
2569int
2570fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
2571    struct vnode **vpp)
2572{
2573
2574	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
2575}
2576#endif
2577
2578/*
2579 * Like fget() but loads the underlying socket, or returns an error if the
2580 * descriptor does not represent a socket.
2581 *
2582 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2583 * in the future.
2584 *
2585 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2586 * on their file descriptor reference to prevent the socket from being freed
2587 * during use.
2588 */
2589int
2590fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
2591    u_int *fflagp)
2592{
2593	struct file *fp;
2594	int error;
2595
2596	*spp = NULL;
2597	if (fflagp != NULL)
2598		*fflagp = 0;
2599	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
2600		return (error);
2601	if (fp->f_type != DTYPE_SOCKET) {
2602		error = ENOTSOCK;
2603	} else {
2604		*spp = fp->f_data;
2605		if (fflagp)
2606			*fflagp = fp->f_flag;
2607		SOCK_LOCK(*spp);
2608		soref(*spp);
2609		SOCK_UNLOCK(*spp);
2610	}
2611	fdrop(fp, td);
2612
2613	return (error);
2614}
2615
2616/*
2617 * Drop the reference count on the socket and XXX release the SX lock in the
2618 * future.  The last reference closes the socket.
2619 *
2620 * Note: fputsock() is deprecated, see comment for fgetsock().
2621 */
2622void
2623fputsock(struct socket *so)
2624{
2625
2626	ACCEPT_LOCK();
2627	SOCK_LOCK(so);
2628	CURVNET_SET(so->so_vnet);
2629	sorele(so);
2630	CURVNET_RESTORE();
2631}
2632
2633/*
2634 * Handle the last reference to a file being closed.
2635 */
2636int
2637_fdrop(struct file *fp, struct thread *td)
2638{
2639	int error;
2640
2641	error = 0;
2642	if (fp->f_count != 0)
2643		panic("fdrop: count %d", fp->f_count);
2644	if (fp->f_ops != &badfileops)
2645		error = fo_close(fp, td);
2646	atomic_subtract_int(&openfiles, 1);
2647	crfree(fp->f_cred);
2648	free(fp->f_advice, M_FADVISE);
2649	uma_zfree(file_zone, fp);
2650
2651	return (error);
2652}
2653
2654/*
2655 * Apply an advisory lock on a file descriptor.
2656 *
2657 * Just attempt to get a record lock of the requested type on the entire file
2658 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2659 */
2660#ifndef _SYS_SYSPROTO_H_
2661struct flock_args {
2662	int	fd;
2663	int	how;
2664};
2665#endif
2666/* ARGSUSED */
2667int
2668sys_flock(struct thread *td, struct flock_args *uap)
2669{
2670	struct file *fp;
2671	struct vnode *vp;
2672	struct flock lf;
2673	cap_rights_t rights;
2674	int error;
2675
2676	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
2677	if (error != 0)
2678		return (error);
2679	if (fp->f_type != DTYPE_VNODE) {
2680		fdrop(fp, td);
2681		return (EOPNOTSUPP);
2682	}
2683
2684	vp = fp->f_vnode;
2685	lf.l_whence = SEEK_SET;
2686	lf.l_start = 0;
2687	lf.l_len = 0;
2688	if (uap->how & LOCK_UN) {
2689		lf.l_type = F_UNLCK;
2690		atomic_clear_int(&fp->f_flag, FHASLOCK);
2691		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2692		goto done2;
2693	}
2694	if (uap->how & LOCK_EX)
2695		lf.l_type = F_WRLCK;
2696	else if (uap->how & LOCK_SH)
2697		lf.l_type = F_RDLCK;
2698	else {
2699		error = EBADF;
2700		goto done2;
2701	}
2702	atomic_set_int(&fp->f_flag, FHASLOCK);
2703	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2704	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2705done2:
2706	fdrop(fp, td);
2707	return (error);
2708}
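
/*
 * Usage note (illustrative, not part of this file): a userland call such as
 *
 *	flock(fd, LOCK_EX | LOCK_NB);
 *
 * reaches sys_flock() above and is turned into a whole-file F_WRLCK advisory
 * lock requested with F_FLOCK semantics and without F_WAIT, so a conflicting
 * lock makes the call fail (EWOULDBLOCK) instead of sleeping.
 */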
2709/*
2710 * Duplicate the specified descriptor to a free descriptor.
2711 */
2712int
2713dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
2714    int openerror, int *indxp)
2715{
2716	struct file *fp;
2717	int error, indx;
2718
2719	KASSERT(openerror == ENODEV || openerror == ENXIO,
2720	    ("unexpected error %d in %s", openerror, __func__));
2721
2722	/*
2723	 * If the to-be-dup'd fd number is greater than the allowed number
2724	 * of file descriptors, or the fd to be dup'd has already been
2725	 * closed, then reject.
2726	 */
2727	FILEDESC_XLOCK(fdp);
2728	if ((fp = fget_locked(fdp, dfd)) == NULL) {
2729		FILEDESC_XUNLOCK(fdp);
2730		return (EBADF);
2731	}
2732
2733	error = fdalloc(td, 0, &indx);
2734	if (error != 0) {
2735		FILEDESC_XUNLOCK(fdp);
2736		return (error);
2737	}
2738
2739	/*
2740	 * There are two cases of interest here.
2741	 *
2742	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2743	 *
2744	 * For ENXIO steal away the file structure from (dfd) and store it in
2745	 * (indx).  (dfd) is effectively closed by this operation.
2746	 */
2747	switch (openerror) {
2748	case ENODEV:
2749		/*
2750		 * Check that the mode the file is being opened for is a
2751		 * subset of the mode of the existing descriptor.
2752		 */
2753		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
2754			fdunused(fdp, indx);
2755			FILEDESC_XUNLOCK(fdp);
2756			return (EACCES);
2757		}
2758		fhold(fp);
2759		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2760		filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps,
2761		    &fdp->fd_ofiles[indx].fde_caps);
2762		break;
2763	case ENXIO:
2764		/*
2765		 * Steal away the file pointer from dfd and stuff it into indx.
2766		 */
2767		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2768		bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd]));
2769		fdunused(fdp, dfd);
2770		break;
2771	}
2772	FILEDESC_XUNLOCK(fdp);
2773	*indxp = indx;
2774	return (0);
2775}
2776
2777/*
2778 * Scan all active processes and prisons to see if any of them have a current
2779 * or root directory of `olddp'. If so, replace them with the new mount point.
2780 */
2781void
2782mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2783{
2784	struct filedesc *fdp;
2785	struct prison *pr;
2786	struct proc *p;
2787	int nrele;
2788
2789	if (vrefcnt(olddp) == 1)
2790		return;
2791	nrele = 0;
2792	sx_slock(&allproc_lock);
2793	FOREACH_PROC_IN_SYSTEM(p) {
2794		fdp = fdhold(p);
2795		if (fdp == NULL)
2796			continue;
2797		FILEDESC_XLOCK(fdp);
2798		if (fdp->fd_cdir == olddp) {
2799			vref(newdp);
2800			fdp->fd_cdir = newdp;
2801			nrele++;
2802		}
2803		if (fdp->fd_rdir == olddp) {
2804			vref(newdp);
2805			fdp->fd_rdir = newdp;
2806			nrele++;
2807		}
2808		if (fdp->fd_jdir == olddp) {
2809			vref(newdp);
2810			fdp->fd_jdir = newdp;
2811			nrele++;
2812		}
2813		FILEDESC_XUNLOCK(fdp);
2814		fddrop(fdp);
2815	}
2816	sx_sunlock(&allproc_lock);
2817	if (rootvnode == olddp) {
2818		vref(newdp);
2819		rootvnode = newdp;
2820		nrele++;
2821	}
2822	mtx_lock(&prison0.pr_mtx);
2823	if (prison0.pr_root == olddp) {
2824		vref(newdp);
2825		prison0.pr_root = newdp;
2826		nrele++;
2827	}
2828	mtx_unlock(&prison0.pr_mtx);
2829	sx_slock(&allprison_lock);
2830	TAILQ_FOREACH(pr, &allprison, pr_list) {
2831		mtx_lock(&pr->pr_mtx);
2832		if (pr->pr_root == olddp) {
2833			vref(newdp);
2834			pr->pr_root = newdp;
2835			nrele++;
2836		}
2837		mtx_unlock(&pr->pr_mtx);
2838	}
2839	sx_sunlock(&allprison_lock);
2840	while (nrele--)
2841		vrele(olddp);
2842}
2843
2844struct filedesc_to_leader *
2845filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2846{
2847	struct filedesc_to_leader *fdtol;
2848
2849	fdtol = malloc(sizeof(struct filedesc_to_leader),
2850	    M_FILEDESC_TO_LEADER,
2851	    M_WAITOK);
2852	fdtol->fdl_refcount = 1;
2853	fdtol->fdl_holdcount = 0;
2854	fdtol->fdl_wakeup = 0;
2855	fdtol->fdl_leader = leader;
2856	if (old != NULL) {
2857		FILEDESC_XLOCK(fdp);
2858		fdtol->fdl_next = old->fdl_next;
2859		fdtol->fdl_prev = old;
2860		old->fdl_next = fdtol;
2861		fdtol->fdl_next->fdl_prev = fdtol;
2862		FILEDESC_XUNLOCK(fdp);
2863	} else {
2864		fdtol->fdl_next = fdtol;
2865		fdtol->fdl_prev = fdtol;
2866	}
2867	return (fdtol);
2868}
2869
2870/*
2871 * Get file structures globally.
2872 */
2873static int
2874sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2875{
2876	struct xfile xf;
2877	struct filedesc *fdp;
2878	struct file *fp;
2879	struct proc *p;
2880	int error, n;
2881
2882	error = sysctl_wire_old_buffer(req, 0);
2883	if (error != 0)
2884		return (error);
2885	if (req->oldptr == NULL) {
2886		n = 0;
2887		sx_slock(&allproc_lock);
2888		FOREACH_PROC_IN_SYSTEM(p) {
2889			if (p->p_state == PRS_NEW)
2890				continue;
2891			fdp = fdhold(p);
2892			if (fdp == NULL)
2893				continue;
2894			/* overestimates sparse tables. */
2895			if (fdp->fd_lastfile > 0)
2896				n += fdp->fd_lastfile;
2897			fddrop(fdp);
2898		}
2899		sx_sunlock(&allproc_lock);
2900		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2901	}
2902	error = 0;
2903	bzero(&xf, sizeof(xf));
2904	xf.xf_size = sizeof(xf);
2905	sx_slock(&allproc_lock);
2906	FOREACH_PROC_IN_SYSTEM(p) {
2907		PROC_LOCK(p);
2908		if (p->p_state == PRS_NEW) {
2909			PROC_UNLOCK(p);
2910			continue;
2911		}
2912		if (p_cansee(req->td, p) != 0) {
2913			PROC_UNLOCK(p);
2914			continue;
2915		}
2916		xf.xf_pid = p->p_pid;
2917		xf.xf_uid = p->p_ucred->cr_uid;
2918		PROC_UNLOCK(p);
2919		fdp = fdhold(p);
2920		if (fdp == NULL)
2921			continue;
2922		FILEDESC_SLOCK(fdp);
2923		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
2924			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
2925				continue;
2926			xf.xf_fd = n;
2927			xf.xf_file = fp;
2928			xf.xf_data = fp->f_data;
2929			xf.xf_vnode = fp->f_vnode;
2930			xf.xf_type = fp->f_type;
2931			xf.xf_count = fp->f_count;
2932			xf.xf_msgcount = 0;
2933			xf.xf_offset = foffset_get(fp);
2934			xf.xf_flag = fp->f_flag;
2935			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2936			if (error)
2937				break;
2938		}
2939		FILEDESC_SUNLOCK(fdp);
2940		fddrop(fdp);
2941		if (error)
2942			break;
2943	}
2944	sx_sunlock(&allproc_lock);
2945	return (error);
2946}
2947
2948SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
2949    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2950
2951#ifdef KINFO_OFILE_SIZE
2952CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
2953#endif
2954
2955#ifdef COMPAT_FREEBSD7
2956static int
2957export_vnode_for_osysctl(struct vnode *vp, int type,
2958    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
2959{
2960	int error;
2961	char *fullpath, *freepath;
2962
2963	bzero(kif, sizeof(*kif));
2964	kif->kf_structsize = sizeof(*kif);
2965
2966	vref(vp);
2967	kif->kf_fd = type;
2968	kif->kf_type = KF_TYPE_VNODE;
2969	/* This function only handles directories. */
2970	if (vp->v_type != VDIR) {
2971		vrele(vp);
2972		return (ENOTDIR);
2973	}
2974	kif->kf_vnode_type = KF_VTYPE_VDIR;
2975
2976	/*
2977	 * This is not a true file descriptor, so we set a bogus refcount
2978	 * and offset to indicate these fields should be ignored.
2979	 */
2980	kif->kf_ref_count = -1;
2981	kif->kf_offset = -1;
2982
2983	freepath = NULL;
2984	fullpath = "-";
2985	FILEDESC_SUNLOCK(fdp);
2986	vn_fullpath(curthread, vp, &fullpath, &freepath);
2987	vrele(vp);
2988	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2989	if (freepath != NULL)
2990		free(freepath, M_TEMP);
2991	error = SYSCTL_OUT(req, kif, sizeof(*kif));
2992	FILEDESC_SLOCK(fdp);
2993	return (error);
2994}
2995
2996/*
2997 * Get per-process file descriptors for use by procstat(1), et al.
2998 */
2999static int
3000sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
3001{
3002	char *fullpath, *freepath;
3003	struct kinfo_ofile *kif;
3004	struct filedesc *fdp;
3005	int error, i, *name;
3006	struct shmfd *shmfd;
3007	struct socket *so;
3008	struct vnode *vp;
3009	struct ksem *ks;
3010	struct file *fp;
3011	struct proc *p;
3012	struct tty *tp;
3013
3014	name = (int *)arg1;
3015	error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
3016	if (error != 0)
3017		return (error);
3018	fdp = fdhold(p);
3019	PROC_UNLOCK(p);
3020	if (fdp == NULL)
3021		return (ENOENT);
3022	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3023	FILEDESC_SLOCK(fdp);
3024	if (fdp->fd_cdir != NULL)
3025		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
3026				fdp, req);
3027	if (fdp->fd_rdir != NULL)
3028		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
3029				fdp, req);
3030	if (fdp->fd_jdir != NULL)
3031		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
3032				fdp, req);
3033	for (i = 0; i < fdp->fd_nfiles; i++) {
3034		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3035			continue;
3036		bzero(kif, sizeof(*kif));
3037		kif->kf_structsize = sizeof(*kif);
3038		ks = NULL;
3039		vp = NULL;
3040		so = NULL;
3041		tp = NULL;
3042		shmfd = NULL;
3043		kif->kf_fd = i;
3044
3045		switch (fp->f_type) {
3046		case DTYPE_VNODE:
3047			kif->kf_type = KF_TYPE_VNODE;
3048			vp = fp->f_vnode;
3049			break;
3050
3051		case DTYPE_SOCKET:
3052			kif->kf_type = KF_TYPE_SOCKET;
3053			so = fp->f_data;
3054			break;
3055
3056		case DTYPE_PIPE:
3057			kif->kf_type = KF_TYPE_PIPE;
3058			break;
3059
3060		case DTYPE_FIFO:
3061			kif->kf_type = KF_TYPE_FIFO;
3062			vp = fp->f_vnode;
3063			break;
3064
3065		case DTYPE_KQUEUE:
3066			kif->kf_type = KF_TYPE_KQUEUE;
3067			break;
3068
3069		case DTYPE_CRYPTO:
3070			kif->kf_type = KF_TYPE_CRYPTO;
3071			break;
3072
3073		case DTYPE_MQUEUE:
3074			kif->kf_type = KF_TYPE_MQUEUE;
3075			break;
3076
3077		case DTYPE_SHM:
3078			kif->kf_type = KF_TYPE_SHM;
3079			shmfd = fp->f_data;
3080			break;
3081
3082		case DTYPE_SEM:
3083			kif->kf_type = KF_TYPE_SEM;
3084			ks = fp->f_data;
3085			break;
3086
3087		case DTYPE_PTS:
3088			kif->kf_type = KF_TYPE_PTS;
3089			tp = fp->f_data;
3090			break;
3091
3092#ifdef PROCDESC
3093		case DTYPE_PROCDESC:
3094			kif->kf_type = KF_TYPE_PROCDESC;
3095			break;
3096#endif
3097
3098		default:
3099			kif->kf_type = KF_TYPE_UNKNOWN;
3100			break;
3101		}
3102		kif->kf_ref_count = fp->f_count;
3103		if (fp->f_flag & FREAD)
3104			kif->kf_flags |= KF_FLAG_READ;
3105		if (fp->f_flag & FWRITE)
3106			kif->kf_flags |= KF_FLAG_WRITE;
3107		if (fp->f_flag & FAPPEND)
3108			kif->kf_flags |= KF_FLAG_APPEND;
3109		if (fp->f_flag & FASYNC)
3110			kif->kf_flags |= KF_FLAG_ASYNC;
3111		if (fp->f_flag & FFSYNC)
3112			kif->kf_flags |= KF_FLAG_FSYNC;
3113		if (fp->f_flag & FNONBLOCK)
3114			kif->kf_flags |= KF_FLAG_NONBLOCK;
3115		if (fp->f_flag & O_DIRECT)
3116			kif->kf_flags |= KF_FLAG_DIRECT;
3117		if (fp->f_flag & FHASLOCK)
3118			kif->kf_flags |= KF_FLAG_HASLOCK;
3119		kif->kf_offset = foffset_get(fp);
3120		if (vp != NULL) {
3121			vref(vp);
3122			switch (vp->v_type) {
3123			case VNON:
3124				kif->kf_vnode_type = KF_VTYPE_VNON;
3125				break;
3126			case VREG:
3127				kif->kf_vnode_type = KF_VTYPE_VREG;
3128				break;
3129			case VDIR:
3130				kif->kf_vnode_type = KF_VTYPE_VDIR;
3131				break;
3132			case VBLK:
3133				kif->kf_vnode_type = KF_VTYPE_VBLK;
3134				break;
3135			case VCHR:
3136				kif->kf_vnode_type = KF_VTYPE_VCHR;
3137				break;
3138			case VLNK:
3139				kif->kf_vnode_type = KF_VTYPE_VLNK;
3140				break;
3141			case VSOCK:
3142				kif->kf_vnode_type = KF_VTYPE_VSOCK;
3143				break;
3144			case VFIFO:
3145				kif->kf_vnode_type = KF_VTYPE_VFIFO;
3146				break;
3147			case VBAD:
3148				kif->kf_vnode_type = KF_VTYPE_VBAD;
3149				break;
3150			default:
3151				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
3152				break;
3153			}
3154			/*
3155			 * It is OK to drop the filedesc lock here as we will
3156			 * re-validate and re-evaluate its properties when
3157			 * the loop continues.
3158			 */
3159			freepath = NULL;
3160			fullpath = "-";
3161			FILEDESC_SUNLOCK(fdp);
3162			vn_fullpath(curthread, vp, &fullpath, &freepath);
3163			vrele(vp);
3164			strlcpy(kif->kf_path, fullpath,
3165			    sizeof(kif->kf_path));
3166			if (freepath != NULL)
3167				free(freepath, M_TEMP);
3168			FILEDESC_SLOCK(fdp);
3169		}
3170		if (so != NULL) {
3171			struct sockaddr *sa;
3172
3173			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
3174			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3175				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3176				free(sa, M_SONAME);
3177			}
3178			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
3179			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3180				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3181				free(sa, M_SONAME);
3182			}
3183			kif->kf_sock_domain =
3184			    so->so_proto->pr_domain->dom_family;
3185			kif->kf_sock_type = so->so_type;
3186			kif->kf_sock_protocol = so->so_proto->pr_protocol;
3187		}
3188		if (tp != NULL) {
3189			strlcpy(kif->kf_path, tty_devname(tp),
3190			    sizeof(kif->kf_path));
3191		}
3192		if (shmfd != NULL)
3193			shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
3194		if (ks != NULL && ksem_info != NULL)
3195			ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
3196		error = SYSCTL_OUT(req, kif, sizeof(*kif));
3197		if (error)
3198			break;
3199	}
3200	FILEDESC_SUNLOCK(fdp);
3201	fddrop(fdp);
3202	free(kif, M_TEMP);
3203	return (0);
3204}
3205
3206static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
3207    sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
3208#endif	/* COMPAT_FREEBSD7 */
3209
3210#ifdef KINFO_FILE_SIZE
3211CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
3212#endif
3213
3214struct export_fd_buf {
3215	struct filedesc		*fdp;
3216	struct sbuf 		*sb;
3217	ssize_t			remainder;
3218	struct kinfo_file	kif;
3219};
3220
3221static int
3222export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
3223    int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
3224{
3225	struct {
3226		int	fflag;
3227		int	kf_fflag;
3228	} fflags_table[] = {
3229		{ FAPPEND, KF_FLAG_APPEND },
3230		{ FASYNC, KF_FLAG_ASYNC },
3231		{ FFSYNC, KF_FLAG_FSYNC },
3232		{ FHASLOCK, KF_FLAG_HASLOCK },
3233		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3234		{ FREAD, KF_FLAG_READ },
3235		{ FWRITE, KF_FLAG_WRITE },
3236		{ O_CREAT, KF_FLAG_CREAT },
3237		{ O_DIRECT, KF_FLAG_DIRECT },
3238		{ O_EXCL, KF_FLAG_EXCL },
3239		{ O_EXEC, KF_FLAG_EXEC },
3240		{ O_EXLOCK, KF_FLAG_EXLOCK },
3241		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3242		{ O_SHLOCK, KF_FLAG_SHLOCK },
3243		{ O_TRUNC, KF_FLAG_TRUNC }
3244	};
3245#define	NFFLAGS	(sizeof(fflags_table) / sizeof(*fflags_table))
3246	struct kinfo_file *kif;
3247	struct vnode *vp;
3248	int error, locked;
3249	unsigned int i;
3250
3251	if (efbuf->remainder == 0)
3252		return (0);
3253	kif = &efbuf->kif;
3254	bzero(kif, sizeof(*kif));
3255	locked = efbuf->fdp != NULL;
3256	switch (type) {
3257	case KF_TYPE_FIFO:
3258	case KF_TYPE_VNODE:
3259		if (locked) {
3260			FILEDESC_SUNLOCK(efbuf->fdp);
3261			locked = 0;
3262		}
3263		vp = (struct vnode *)data;
3264		error = fill_vnode_info(vp, kif);
3265		vrele(vp);
3266		break;
3267	case KF_TYPE_SOCKET:
3268		error = fill_socket_info((struct socket *)data, kif);
3269		break;
3270	case KF_TYPE_PIPE:
3271		error = fill_pipe_info((struct pipe *)data, kif);
3272		break;
3273	case KF_TYPE_PTS:
3274		error = fill_pts_info((struct tty *)data, kif);
3275		break;
3276	case KF_TYPE_PROCDESC:
3277		error = fill_procdesc_info((struct procdesc *)data, kif);
3278		break;
3279	case KF_TYPE_SEM:
3280		error = fill_sem_info((struct file *)data, kif);
3281		break;
3282	case KF_TYPE_SHM:
3283		error = fill_shm_info((struct file *)data, kif);
3284		break;
3285	default:
3286		error = 0;
3287	}
3288	if (error == 0)
3289		kif->kf_status |= KF_ATTR_VALID;
3290
3291	/*
3292	 * Translate file access flags.
3293	 */
3294	for (i = 0; i < NFFLAGS; i++)
3295		if (fflags & fflags_table[i].fflag)
3296			kif->kf_flags |=  fflags_table[i].kf_fflag;
3297	if (rightsp != NULL)
3298		kif->kf_cap_rights = *rightsp;
3299	else
3300		cap_rights_init(&kif->kf_cap_rights);
3301	kif->kf_fd = fd;
3302	kif->kf_type = type;
3303	kif->kf_ref_count = refcnt;
3304	kif->kf_offset = offset;
3305	/* Pack record size down */
3306	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3307	    strlen(kif->kf_path) + 1;
3308	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3309	if (efbuf->remainder != -1) {
3310		if (efbuf->remainder < kif->kf_structsize) {
3311			/* Terminate export. */
3312			efbuf->remainder = 0;
3313			if (efbuf->fdp != NULL && !locked)
3314				FILEDESC_SLOCK(efbuf->fdp);
3315			return (0);
3316		}
3317		efbuf->remainder -= kif->kf_structsize;
3318	}
3319	if (locked)
3320		FILEDESC_SUNLOCK(efbuf->fdp);
3321	error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize);
3322	if (efbuf->fdp != NULL)
3323		FILEDESC_SLOCK(efbuf->fdp);
3324	return (error);
3325}
3326
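/*
 * Illustrative sketch (not part of this file): because export_fd_to_sb()
 * packs each record down to kf_structsize bytes before appending it to the
 * sbuf, a consumer of the raw buffer is expected to advance by that field
 * rather than by sizeof(struct kinfo_file):
 *
 *	char *p = buf;
 *
 *	while (p < buf + len) {
 *		struct kinfo_file *kf = (struct kinfo_file *)(void *)p;
 *
 *		... inspect kf->kf_fd, kf->kf_type, kf->kf_path ...
 *		p += kf->kf_structsize;
 *	}
 */
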
3327/*
3328 * Store a process file descriptor information to sbuf.
3329 *
3330 * Takes a locked proc as argument, and returns with the proc unlocked.
3331 */
3332int
3333kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
3334{
3335	struct file *fp;
3336	struct filedesc *fdp;
3337	struct export_fd_buf *efbuf;
3338	struct vnode *cttyvp, *textvp, *tracevp;
3339	int64_t offset;
3340	void *data;
3341	int error, i;
3342	int type, refcnt, fflags;
3343	cap_rights_t rights;
3344
3345	PROC_LOCK_ASSERT(p, MA_OWNED);
3346
3347	/* ktrace vnode */
3348	tracevp = p->p_tracevp;
3349	if (tracevp != NULL)
3350		vref(tracevp);
3351	/* text vnode */
3352	textvp = p->p_textvp;
3353	if (textvp != NULL)
3354		vref(textvp);
3355	/* Controlling tty. */
3356	cttyvp = NULL;
3357	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
3358		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
3359		if (cttyvp != NULL)
3360			vref(cttyvp);
3361	}
3362	fdp = fdhold(p);
3363	PROC_UNLOCK(p);
3364	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
3365	efbuf->fdp = NULL;
3366	efbuf->sb = sb;
3367	efbuf->remainder = maxlen;
3368	if (tracevp != NULL)
3369		export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
3370		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3371	if (textvp != NULL)
3372		export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
3373		    FREAD, -1, -1, NULL, efbuf);
3374	if (cttyvp != NULL)
3375		export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
3376		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3377	error = 0;
3378	if (fdp == NULL)
3379		goto fail;
3380	efbuf->fdp = fdp;
3381	FILEDESC_SLOCK(fdp);
3382	/* working directory */
3383	if (fdp->fd_cdir != NULL) {
3384		vref(fdp->fd_cdir);
3385		data = fdp->fd_cdir;
3386		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
3387		    FREAD, -1, -1, NULL, efbuf);
3388	}
3389	/* root directory */
3390	if (fdp->fd_rdir != NULL) {
3391		vref(fdp->fd_rdir);
3392		data = fdp->fd_rdir;
3393		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
3394		    FREAD, -1, -1, NULL, efbuf);
3395	}
3396	/* jail directory */
3397	if (fdp->fd_jdir != NULL) {
3398		vref(fdp->fd_jdir);
3399		data = fdp->fd_jdir;
3400		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
3401		    FREAD, -1, -1, NULL, efbuf);
3402	}
3403	for (i = 0; i < fdp->fd_nfiles; i++) {
3404		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3405			continue;
3406		data = NULL;
3407#ifdef CAPABILITIES
3408		rights = *cap_rights(fdp, i);
3409#else /* !CAPABILITIES */
3410		cap_rights_init(&rights);
3411#endif
3412		switch (fp->f_type) {
3413		case DTYPE_VNODE:
3414			type = KF_TYPE_VNODE;
3415			vref(fp->f_vnode);
3416			data = fp->f_vnode;
3417			break;
3418
3419		case DTYPE_SOCKET:
3420			type = KF_TYPE_SOCKET;
3421			data = fp->f_data;
3422			break;
3423
3424		case DTYPE_PIPE:
3425			type = KF_TYPE_PIPE;
3426			data = fp->f_data;
3427			break;
3428
3429		case DTYPE_FIFO:
3430			type = KF_TYPE_FIFO;
3431			vref(fp->f_vnode);
3432			data = fp->f_vnode;
3433			break;
3434
3435		case DTYPE_KQUEUE:
3436			type = KF_TYPE_KQUEUE;
3437			break;
3438
3439		case DTYPE_CRYPTO:
3440			type = KF_TYPE_CRYPTO;
3441			break;
3442
3443		case DTYPE_MQUEUE:
3444			type = KF_TYPE_MQUEUE;
3445			break;
3446
3447		case DTYPE_SHM:
3448			type = KF_TYPE_SHM;
3449			data = fp;
3450			break;
3451
3452		case DTYPE_SEM:
3453			type = KF_TYPE_SEM;
3454			data = fp;
3455			break;
3456
3457		case DTYPE_PTS:
3458			type = KF_TYPE_PTS;
3459			data = fp->f_data;
3460			break;
3461
3462#ifdef PROCDESC
3463		case DTYPE_PROCDESC:
3464			type = KF_TYPE_PROCDESC;
3465			data = fp->f_data;
3466			break;
3467#endif
3468
3469		default:
3470			type = KF_TYPE_UNKNOWN;
3471			break;
3472		}
3473		refcnt = fp->f_count;
3474		fflags = fp->f_flag;
3475		offset = foffset_get(fp);
3476
3477		/*
3478		 * Create sysctl entry.
3479		 * It is OK to drop the filedesc lock here as we will
3480		 * re-validate and re-evaluate its properties when
3481		 * the loop continues.
3482		 */
3483		error = export_fd_to_sb(data, type, i, fflags, refcnt,
3484		    offset, &rights, efbuf);
3485		if (error != 0)
3486			break;
3487	}
3488	FILEDESC_SUNLOCK(fdp);
3489	fddrop(fdp);
3490fail:
3491	free(efbuf, M_TEMP);
3492	return (error);
3493}
3494
3495#define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
3496
3497/*
3498 * Get per-process file descriptors for use by procstat(1), et al.
3499 */
3500static int
3501sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
3502{
3503	struct sbuf sb;
3504	struct proc *p;
3505	ssize_t maxlen;
3506	int error, error2, *name;
3507
3508	name = (int *)arg1;
3509
3510	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
3511	error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
3512	if (error != 0) {
3513		sbuf_delete(&sb);
3514		return (error);
3515	}
3516	maxlen = req->oldptr != NULL ? req->oldlen : -1;
3517	error = kern_proc_filedesc_out(p, &sb, maxlen);
3518	error2 = sbuf_finish(&sb);
3519	sbuf_delete(&sb);
3520	return (error != 0 ? error : error2);
3521}
3522
3523int
3524vntype_to_kinfo(int vtype)
3525{
3526	struct {
3527		int	vtype;
3528		int	kf_vtype;
3529	} vtypes_table[] = {
3530		{ VBAD, KF_VTYPE_VBAD },
3531		{ VBLK, KF_VTYPE_VBLK },
3532		{ VCHR, KF_VTYPE_VCHR },
3533		{ VDIR, KF_VTYPE_VDIR },
3534		{ VFIFO, KF_VTYPE_VFIFO },
3535		{ VLNK, KF_VTYPE_VLNK },
3536		{ VNON, KF_VTYPE_VNON },
3537		{ VREG, KF_VTYPE_VREG },
3538		{ VSOCK, KF_VTYPE_VSOCK }
3539	};
3540#define	NVTYPES	(sizeof(vtypes_table) / sizeof(*vtypes_table))
3541	unsigned int i;
3542
3543	/*
3544	 * Perform vtype translation.
3545	 */
3546	for (i = 0; i < NVTYPES; i++)
3547		if (vtypes_table[i].vtype == vtype)
3548			break;
3549	if (i < NVTYPES)
3550		return (vtypes_table[i].kf_vtype);
3551
3552	return (KF_VTYPE_UNKNOWN);
3553}
3554
3555static int
3556fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
3557{
3558	struct vattr va;
3559	char *fullpath, *freepath;
3560	int error;
3561
3562	if (vp == NULL)
3563		return (1);
3564	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
3565	freepath = NULL;
3566	fullpath = "-";
3567	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
3568	if (error == 0) {
3569		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3570	}
3571	if (freepath != NULL)
3572		free(freepath, M_TEMP);
3573
3574	/*
3575	 * Retrieve vnode attributes.
3576	 */
3577	va.va_fsid = VNOVAL;
3578	va.va_rdev = NODEV;
3579	vn_lock(vp, LK_SHARED | LK_RETRY);
3580	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
3581	VOP_UNLOCK(vp, 0);
3582	if (error != 0)
3583		return (error);
3584	if (va.va_fsid != VNOVAL)
3585		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
3586	else
3587		kif->kf_un.kf_file.kf_file_fsid =
3588		    vp->v_mount->mnt_stat.f_fsid.val[0];
3589	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
3590	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
3591	kif->kf_un.kf_file.kf_file_size = va.va_size;
3592	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
3593	return (0);
3594}
3595
3596static int
3597fill_socket_info(struct socket *so, struct kinfo_file *kif)
3598{
3599	struct sockaddr *sa;
3600	struct inpcb *inpcb;
3601	struct unpcb *unpcb;
3602	int error;
3603
3604	if (so == NULL)
3605		return (1);
3606	kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
3607	kif->kf_sock_type = so->so_type;
3608	kif->kf_sock_protocol = so->so_proto->pr_protocol;
3609	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
3610	switch(kif->kf_sock_domain) {
3611	case AF_INET:
3612	case AF_INET6:
3613		if (kif->kf_sock_protocol == IPPROTO_TCP) {
3614			if (so->so_pcb != NULL) {
3615				inpcb = (struct inpcb *)(so->so_pcb);
3616				kif->kf_un.kf_sock.kf_sock_inpcb =
3617				    (uintptr_t)inpcb->inp_ppcb;
3618			}
3619		}
3620		break;
3621	case AF_UNIX:
3622		if (so->so_pcb != NULL) {
3623			unpcb = (struct unpcb *)(so->so_pcb);
3624			if (unpcb->unp_conn) {
3625				kif->kf_un.kf_sock.kf_sock_unpconn =
3626				    (uintptr_t)unpcb->unp_conn;
3627				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
3628				    so->so_rcv.sb_state;
3629				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
3630				    so->so_snd.sb_state;
3631			}
3632		}
3633		break;
3634	}
3635	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
3636	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3637		bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3638		free(sa, M_SONAME);
3639	}
3640	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
3641	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3642		bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3643		free(sa, M_SONAME);
3644	}
3645	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
3646	    sizeof(kif->kf_path));
3647	return (0);
3648}
3649
3650static int
3651fill_pts_info(struct tty *tp, struct kinfo_file *kif)
3652{
3653
3654	if (tp == NULL)
3655		return (1);
3656	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
3657	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
3658	return (0);
3659}
3660
3661static int
3662fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
3663{
3664
3665	if (pi == NULL)
3666		return (1);
3667	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
3668	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
3669	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
3670	return (0);
3671}
3672
3673static int
3674fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
3675{
3676
3677	if (pdp == NULL)
3678		return (1);
3679	kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
3680	return (0);
3681}
3682
3683static int
3684fill_sem_info(struct file *fp, struct kinfo_file *kif)
3685{
3686	struct thread *td;
3687	struct stat sb;
3688
3689	td = curthread;
3690	if (fp->f_data == NULL)
3691		return (1);
3692	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3693		return (1);
3694	if (ksem_info == NULL)
3695		return (1);
3696	ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
3697	    &kif->kf_un.kf_sem.kf_sem_value);
3698	kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
3699	return (0);
3700}
3701
3702static int
3703fill_shm_info(struct file *fp, struct kinfo_file *kif)
3704{
3705	struct thread *td;
3706	struct stat sb;
3707
3708	td = curthread;
3709	if (fp->f_data == NULL)
3710		return (1);
3711	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3712		return (1);
3713	shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
3714	kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
3715	kif->kf_un.kf_file.kf_file_size = sb.st_size;
3716	return (0);
3717}
3718
3719static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
3720    sysctl_kern_proc_filedesc, "Process filedesc entries");
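
/*
 * Illustrative sketch (not part of this file): userland normally reaches the
 * node registered above through libutil rather than raw sysctl(3), e.g.:
 *
 *	struct kinfo_file *files;
 *	int cnt;
 *
 *	files = kinfo_getfile(pid, &cnt);
 *	...
 *	free(files);
 *
 * kinfo_getfile(3) wraps the kern.proc.filedesc MIB and unpacks the
 * variable-size kinfo_file records produced by export_fd_to_sb().
 */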
3721
3722#ifdef DDB
3723/*
3724 * For the purposes of debugging, generate a human-readable string for the
3725 * file type.
3726 */
3727static const char *
3728file_type_to_name(short type)
3729{
3730
3731	switch (type) {
3732	case 0:
3733		return ("zero");
3734	case DTYPE_VNODE:
3735		return ("vnod");
3736	case DTYPE_SOCKET:
3737		return ("sock");
3738	case DTYPE_PIPE:
3739		return ("pipe");
3740	case DTYPE_FIFO:
3741		return ("fifo");
3742	case DTYPE_KQUEUE:
3743		return ("kque");
3744	case DTYPE_CRYPTO:
3745		return ("crpt");
3746	case DTYPE_MQUEUE:
3747		return ("mque");
3748	case DTYPE_SHM:
3749		return ("shm");
3750	case DTYPE_SEM:
3751		return ("ksem");
3752	default:
3753		return ("unkn");
3754	}
3755}
3756
3757/*
3758 * For the purposes of debugging, identify a process (if any, perhaps one of
3759 * many) that references the passed file in its file descriptor array. Return
3760 * NULL if none.
3761 */
3762static struct proc *
3763file_to_first_proc(struct file *fp)
3764{
3765	struct filedesc *fdp;
3766	struct proc *p;
3767	int n;
3768
3769	FOREACH_PROC_IN_SYSTEM(p) {
3770		if (p->p_state == PRS_NEW)
3771			continue;
3772		fdp = p->p_fd;
3773		if (fdp == NULL)
3774			continue;
3775		for (n = 0; n < fdp->fd_nfiles; n++) {
3776			if (fp == fdp->fd_ofiles[n].fde_file)
3777				return (p);
3778		}
3779	}
3780	return (NULL);
3781}
3782
3783static void
3784db_print_file(struct file *fp, int header)
3785{
3786	struct proc *p;
3787
3788	if (header)
3789		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3790		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3791		    "MCount", "Vnode", "FPID", "FCmd");
3792	p = file_to_first_proc(fp);
3793	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3794	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3795	    0, fp->f_count, 0, fp->f_vnode,
3796	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3797}
3798
3799DB_SHOW_COMMAND(file, db_show_file)
3800{
3801	struct file *fp;
3802
3803	if (!have_addr) {
3804		db_printf("usage: show file <addr>\n");
3805		return;
3806	}
3807	fp = (struct file *)addr;
3808	db_print_file(fp, 1);
3809}
3810
3811DB_SHOW_COMMAND(files, db_show_files)
3812{
3813	struct filedesc *fdp;
3814	struct file *fp;
3815	struct proc *p;
3816	int header;
3817	int n;
3818
3819	header = 1;
3820	FOREACH_PROC_IN_SYSTEM(p) {
3821		if (p->p_state == PRS_NEW)
3822			continue;
3823		if ((fdp = p->p_fd) == NULL)
3824			continue;
3825		for (n = 0; n < fdp->fd_nfiles; ++n) {
3826			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
3827				continue;
3828			db_print_file(fp, header);
3829			header = 0;
3830		}
3831	}
3832}
3833#endif
3834
3835SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3836    &maxfilesperproc, 0, "Maximum files allowed open per process");
3837
3838SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3839    &maxfiles, 0, "Maximum number of files");
3840
3841SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3842    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3843
3844/* ARGSUSED*/
3845static void
3846filelistinit(void *dummy)
3847{
3848
3849	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
3850	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3851	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
3852	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
3853}
3854SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3855
3856/*-------------------------------------------------------------------*/
3857
3858static int
3859badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
3860    int flags, struct thread *td)
3861{
3862
3863	return (EBADF);
3864}
3865
3866static int
3867badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
3868    struct thread *td)
3869{
3870
3871	return (EINVAL);
3872}
3873
3874static int
3875badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
3876    struct thread *td)
3877{
3878
3879	return (EBADF);
3880}
3881
3882static int
3883badfo_poll(struct file *fp, int events, struct ucred *active_cred,
3884    struct thread *td)
3885{
3886
3887	return (0);
3888}
3889
3890static int
3891badfo_kqfilter(struct file *fp, struct knote *kn)
3892{
3893
3894	return (EBADF);
3895}
3896
3897static int
3898badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
3899    struct thread *td)
3900{
3901
3902	return (EBADF);
3903}
3904
3905static int
3906badfo_close(struct file *fp, struct thread *td)
3907{
3908
3909	return (EBADF);
3910}
3911
3912static int
3913badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
3914    struct thread *td)
3915{
3916
3917	return (EBADF);
3918}
3919
3920static int
3921badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
3922    struct thread *td)
3923{
3924
3925	return (EBADF);
3926}
3927
3928static int
3929badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
3930    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
3931    int kflags, struct thread *td)
3932{
3933
3934	return (EBADF);
3935}
3936
3937struct fileops badfileops = {
3938	.fo_read = badfo_readwrite,
3939	.fo_write = badfo_readwrite,
3940	.fo_truncate = badfo_truncate,
3941	.fo_ioctl = badfo_ioctl,
3942	.fo_poll = badfo_poll,
3943	.fo_kqfilter = badfo_kqfilter,
3944	.fo_stat = badfo_stat,
3945	.fo_close = badfo_close,
3946	.fo_chmod = badfo_chmod,
3947	.fo_chown = badfo_chown,
3948	.fo_sendfile = badfo_sendfile,
3949};
3950
3951int
3952invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
3953    struct thread *td)
3954{
3955
3956	return (EINVAL);
3957}
3958
3959int
3960invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
3961    struct thread *td)
3962{
3963
3964	return (EINVAL);
3965}
3966
3967int
3968invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
3969    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
3970    int kflags, struct thread *td)
3971{
3972
3973	return (EINVAL);
3974}
3975
3976/*-------------------------------------------------------------------*/
3977
3978/*
3979 * File Descriptor pseudo-device driver (/dev/fd/).
3980 *
3981 * Opening minor device N dup()s the file (if any) connected to file
3982 * descriptor N belonging to the calling process.  Note that this driver
3983 * consists of only the ``open()'' routine, because all subsequent
3984 * references to this file will be direct to the other driver.
3985 *
3986 * XXX: we could give this one a cloning event handler if necessary.
3987 */
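
/*
 * Illustrative sketch (not part of this file): with the fd/0..2 nodes created
 * by fildesc_drvinit() below,
 *
 *	nfd = open("/dev/fd/0", O_RDONLY);
 *
 * effectively duplicates the caller's descriptor 0, much like dup(2);
 * fdopen() stashes the unit number in td_dupfd and fails with ENODEV so that
 * the open path can finish the job in dupfdopen().
 */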
3988
3989/* ARGSUSED */
3990static int
3991fdopen(struct cdev *dev, int mode, int type, struct thread *td)
3992{
3993
3994	/*
3995	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
3996	 * file descriptor being sought for duplication. The error
3997	 * return ensures that the vnode for this device will be released
3998	 * by vn_open. Open will detect this special error and take the
3999	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
4000	 * will simply report the error.
4001	 */
4002	td->td_dupfd = dev2unit(dev);
4003	return (ENODEV);
4004}
4005
4006static struct cdevsw fildesc_cdevsw = {
4007	.d_version =	D_VERSION,
4008	.d_open =	fdopen,
4009	.d_name =	"FD",
4010};
4011
4012static void
4013fildesc_drvinit(void *unused)
4014{
4015	struct cdev *dev;
4016
4017	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
4018	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
4019	make_dev_alias(dev, "stdin");
4020	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
4021	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
4022	make_dev_alias(dev, "stdout");
4023	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
4024	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
4025	make_dev_alias(dev, "stderr");
4026}
4027
4028SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
4029