kern_descrip.c revision 273843
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 273843 2014-10-30 05:21:12Z mjg $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_ddb.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47
48#include <sys/capsicum.h>
49#include <sys/conf.h>
50#include <sys/fcntl.h>
51#include <sys/file.h>
52#include <sys/filedesc.h>
53#include <sys/filio.h>
54#include <sys/jail.h>
55#include <sys/kernel.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/namei.h>
62#include <sys/selinfo.h>
63#include <sys/priv.h>
64#include <sys/proc.h>
65#include <sys/protosw.h>
66#include <sys/racct.h>
67#include <sys/resourcevar.h>
68#include <sys/sbuf.h>
69#include <sys/signalvar.h>
70#include <sys/socketvar.h>
71#include <sys/stat.h>
72#include <sys/sx.h>
73#include <sys/syscallsubr.h>
74#include <sys/sysctl.h>
75#include <sys/sysproto.h>
76#include <sys/unistd.h>
77#include <sys/user.h>
78#include <sys/vnode.h>
79#ifdef KTRACE
80#include <sys/ktrace.h>
81#endif
82
83#include <net/vnet.h>
84
85#include <security/audit/audit.h>
86
87#include <vm/uma.h>
88#include <vm/vm.h>
89
90#include <ddb/ddb.h>
91
92static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
93static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
94    "file desc to leader structures");
95static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
96MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
97
98MALLOC_DECLARE(M_FADVISE);
99
100static uma_zone_t file_zone;
101
102static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
103		    struct thread *td, int holdleaders);
104static int	do_dup(struct thread *td, int flags, int old, int new,
105		    register_t *retval);
106static int	fd_first_free(struct filedesc *fdp, int low, int size);
107static int	fd_last_used(struct filedesc *fdp, int size);
108static void	fdgrowtable(struct filedesc *fdp, int nfd);
109static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
110static void	fdunused(struct filedesc *fdp, int fd);
111static void	fdused(struct filedesc *fdp, int fd);
112static int	getmaxfd(struct proc *p);
113
/*
 * Behaviour flags accepted by do_dup(); they may be OR-ed together.
 */
#define	DUP_FIXED	0x1	/* Caller names the target descriptor. */
#define	DUP_FCNTL	0x2	/* Report range errors as EINVAL (fcntl). */
#define	DUP_CLOEXEC	0x4	/* Set UF_EXCLOSE on the new descriptor. */
118
119/*
120 * Each process has:
121 *
122 * - An array of open file descriptors (fd_ofiles)
123 * - An array of file flags (fd_ofileflags)
124 * - A bitmap recording which descriptors are in use (fd_map)
125 *
126 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
128 * assumption that the majority of processes, especially short-lived
129 * processes like shells, will never need more.
130 *
131 * If this initial allocation is exhausted, a larger descriptor table and
132 * map are allocated dynamically, and the pointers in the process's struct
133 * filedesc are updated to point to those.  This is repeated every time
134 * the process runs out of file descriptors (provided it hasn't hit its
135 * resource limit).
136 *
137 * Since threads may hold references to individual descriptor table
138 * entries, the tables are never freed.  Instead, they are placed on a
139 * linked list and freed only when the struct filedesc is released.
140 */
/* Number of descriptor slots embedded in the initial allocation. */
#define	NDFILE		20
/* Size in bytes, and capacity in bits, of one fd_map bitmap word. */
#define	NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
/* Bitmap word index and single-bit mask for descriptor x. */
#define	NDSLOT(x)	((x) / NDENTRIES)
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
/* Number of bitmap words needed to cover x descriptors (rounded up). */
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
147
148/*
149 * SLIST entry used to keep track of ofiles which must be reclaimed when
150 * the process exits.
151 */
152struct freetable {
153	struct fdescenttbl *ft_table;
154	SLIST_ENTRY(freetable) ft_next;
155};
156
157/*
158 * Initial allocation: a filedesc structure + the head of SLIST used to
159 * keep track of old ofiles + enough space for NDFILE descriptors.
160 */
161
162struct fdescenttbl0 {
163	int	fdt_nfiles;
164	struct	filedescent fdt_ofiles[NDFILE];
165};
166
167struct filedesc0 {
168	struct filedesc fd_fd;
169	SLIST_HEAD(, freetable) fd_free;
170	struct	fdescenttbl0 fd_dfiles;
171	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
172};
173
174/*
175 * Descriptor management.
176 */
177volatile int openfiles;			/* actual number of open files */
178struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
179void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
180
181/* A mutex to protect the association between a proc and filedesc. */
182static struct mtx fdesc_mtx;
183
184/*
185 * If low >= size, just return low. Otherwise find the first zero bit in the
186 * given bitmap, starting at low and not exceeding size - 1. Return size if
187 * not found.
188 */
189static int
190fd_first_free(struct filedesc *fdp, int low, int size)
191{
192	NDSLOTTYPE *map = fdp->fd_map;
193	NDSLOTTYPE mask;
194	int off, maxoff;
195
196	if (low >= size)
197		return (low);
198
199	off = NDSLOT(low);
200	if (low % NDENTRIES) {
201		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
202		if ((mask &= ~map[off]) != 0UL)
203			return (off * NDENTRIES + ffsl(mask) - 1);
204		++off;
205	}
206	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
207		if (map[off] != ~0UL)
208			return (off * NDENTRIES + ffsl(~map[off]) - 1);
209	return (size);
210}
211
212/*
213 * Find the highest non-zero bit in the given bitmap, starting at 0 and
214 * not exceeding size - 1. Return -1 if not found.
215 */
216static int
217fd_last_used(struct filedesc *fdp, int size)
218{
219	NDSLOTTYPE *map = fdp->fd_map;
220	NDSLOTTYPE mask;
221	int off, minoff;
222
223	off = NDSLOT(size);
224	if (size % NDENTRIES) {
225		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
226		if ((mask &= map[off]) != 0)
227			return (off * NDENTRIES + flsl(mask) - 1);
228		--off;
229	}
230	for (minoff = NDSLOT(0); off >= minoff; --off)
231		if (map[off] != 0)
232			return (off * NDENTRIES + flsl(map[off]) - 1);
233	return (-1);
234}
235
236static int
237fdisused(struct filedesc *fdp, int fd)
238{
239
240	FILEDESC_LOCK_ASSERT(fdp);
241
242	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
243	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
244
245	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
246}
247
248/*
249 * Mark a file descriptor as used.
250 */
251static void
252fdused(struct filedesc *fdp, int fd)
253{
254
255	FILEDESC_XLOCK_ASSERT(fdp);
256
257	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
258
259	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
260	if (fd > fdp->fd_lastfile)
261		fdp->fd_lastfile = fd;
262	if (fd == fdp->fd_freefile)
263		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
264}
265
266/*
267 * Mark a file descriptor as unused.
268 */
269static void
270fdunused(struct filedesc *fdp, int fd)
271{
272
273	FILEDESC_XLOCK_ASSERT(fdp);
274
275	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
276	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
277	    ("fd=%d is still in use", fd));
278
279	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
280	if (fd < fdp->fd_freefile)
281		fdp->fd_freefile = fd;
282	if (fd == fdp->fd_lastfile)
283		fdp->fd_lastfile = fd_last_used(fdp, fd);
284}
285
286/*
287 * Free a file descriptor.
288 *
289 * Avoid some work if fdp is about to be destroyed.
290 */
291static inline void
292_fdfree(struct filedesc *fdp, int fd, int last)
293{
294	struct filedescent *fde;
295
296	fde = &fdp->fd_ofiles[fd];
297#ifdef CAPABILITIES
298	if (!last)
299		seq_write_begin(&fde->fde_seq);
300#endif
301	filecaps_free(&fde->fde_caps);
302	if (last)
303		return;
304	bzero(fde, fde_change_size);
305	fdunused(fdp, fd);
306#ifdef CAPABILITIES
307	seq_write_end(&fde->fde_seq);
308#endif
309}
310
/*
 * Release descriptor fd in a table that remains in use: the entry is
 * cleared and the descriptor marked unused (see _fdfree() with last == 0).
 */
static inline void
fdfree(struct filedesc *fdp, int fd)
{
	const int last = 0;

	_fdfree(fdp, fd, last);
}
317
/*
 * Release descriptor fd in a table that is about to be destroyed: only
 * the entry's capability data is freed (see _fdfree() with last == 1).
 */
static inline void
fdfree_last(struct filedesc *fdp, int fd)
{
	const int last = 1;

	_fdfree(fdp, fd, last);
}
324
325/*
326 * System calls on descriptors.
327 */
328#ifndef _SYS_SYSPROTO_H_
329struct getdtablesize_args {
330	int	dummy;
331};
332#endif
333/* ARGSUSED */
334int
335sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
336{
337	struct proc *p = td->td_proc;
338	uint64_t lim;
339
340	PROC_LOCK(p);
341	td->td_retval[0] =
342	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
343	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
344	PROC_UNLOCK(p);
345	if (lim < td->td_retval[0])
346		td->td_retval[0] = lim;
347	return (0);
348}
349
350/*
351 * Duplicate a file descriptor to a particular value.
352 *
353 * Note: keep in mind that a potential race condition exists when closing
354 * descriptors from a shared descriptor table (via rfork).
355 */
356#ifndef _SYS_SYSPROTO_H_
357struct dup2_args {
358	u_int	from;
359	u_int	to;
360};
361#endif
362/* ARGSUSED */
363int
364sys_dup2(struct thread *td, struct dup2_args *uap)
365{
366
367	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
368		    td->td_retval));
369}
370
371/*
372 * Duplicate a file descriptor.
373 */
374#ifndef _SYS_SYSPROTO_H_
375struct dup_args {
376	u_int	fd;
377};
378#endif
379/* ARGSUSED */
380int
381sys_dup(struct thread *td, struct dup_args *uap)
382{
383
384	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
385}
386
/*
 * The file control system call.  Unpacks the syscall arguments and hands
 * them to kern_fcntl_freebsd(), whose result is returned unchanged.
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
	int error;

	error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
	return (error);
}
404
405int
406kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
407{
408	struct flock fl;
409	struct __oflock ofl;
410	intptr_t arg1;
411	int error;
412
413	error = 0;
414	switch (cmd) {
415	case F_OGETLK:
416	case F_OSETLK:
417	case F_OSETLKW:
418		/*
419		 * Convert old flock structure to new.
420		 */
421		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
422		fl.l_start = ofl.l_start;
423		fl.l_len = ofl.l_len;
424		fl.l_pid = ofl.l_pid;
425		fl.l_type = ofl.l_type;
426		fl.l_whence = ofl.l_whence;
427		fl.l_sysid = 0;
428
429		switch (cmd) {
430		case F_OGETLK:
431		    cmd = F_GETLK;
432		    break;
433		case F_OSETLK:
434		    cmd = F_SETLK;
435		    break;
436		case F_OSETLKW:
437		    cmd = F_SETLKW;
438		    break;
439		}
440		arg1 = (intptr_t)&fl;
441		break;
442        case F_GETLK:
443        case F_SETLK:
444        case F_SETLKW:
445	case F_SETLK_REMOTE:
446                error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
447                arg1 = (intptr_t)&fl;
448                break;
449	default:
450		arg1 = arg;
451		break;
452	}
453	if (error)
454		return (error);
455	error = kern_fcntl(td, fd, cmd, arg1);
456	if (error)
457		return (error);
458	if (cmd == F_OGETLK) {
459		ofl.l_start = fl.l_start;
460		ofl.l_len = fl.l_len;
461		ofl.l_pid = fl.l_pid;
462		ofl.l_type = fl.l_type;
463		ofl.l_whence = fl.l_whence;
464		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
465	} else if (cmd == F_GETLK) {
466		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
467	}
468	return (error);
469}
470
471int
472kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
473{
474	struct filedesc *fdp;
475	struct flock *flp;
476	struct file *fp, *fp2;
477	struct filedescent *fde;
478	struct proc *p;
479	struct vnode *vp;
480	cap_rights_t rights;
481	int error, flg, tmp;
482	uint64_t bsize;
483	off_t foffset;
484
485	error = 0;
486	flg = F_POSIX;
487	p = td->td_proc;
488	fdp = p->p_fd;
489
490	switch (cmd) {
491	case F_DUPFD:
492		tmp = arg;
493		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
494		break;
495
496	case F_DUPFD_CLOEXEC:
497		tmp = arg;
498		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
499		    td->td_retval);
500		break;
501
502	case F_DUP2FD:
503		tmp = arg;
504		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
505		break;
506
507	case F_DUP2FD_CLOEXEC:
508		tmp = arg;
509		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
510		    td->td_retval);
511		break;
512
513	case F_GETFD:
514		FILEDESC_SLOCK(fdp);
515		if (fget_locked(fdp, fd) == NULL) {
516			FILEDESC_SUNLOCK(fdp);
517			error = EBADF;
518			break;
519		}
520		fde = &fdp->fd_ofiles[fd];
521		td->td_retval[0] =
522		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
523		FILEDESC_SUNLOCK(fdp);
524		break;
525
526	case F_SETFD:
527		FILEDESC_XLOCK(fdp);
528		if (fget_locked(fdp, fd) == NULL) {
529			FILEDESC_XUNLOCK(fdp);
530			error = EBADF;
531			break;
532		}
533		fde = &fdp->fd_ofiles[fd];
534		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
535		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
536		FILEDESC_XUNLOCK(fdp);
537		break;
538
539	case F_GETFL:
540		error = fget_unlocked(fdp, fd,
541		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
542		if (error != 0)
543			break;
544		td->td_retval[0] = OFLAGS(fp->f_flag);
545		fdrop(fp, td);
546		break;
547
548	case F_SETFL:
549		error = fget_unlocked(fdp, fd,
550		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
551		if (error != 0)
552			break;
553		do {
554			tmp = flg = fp->f_flag;
555			tmp &= ~FCNTLFLAGS;
556			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
557		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
558		tmp = fp->f_flag & FNONBLOCK;
559		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
560		if (error != 0) {
561			fdrop(fp, td);
562			break;
563		}
564		tmp = fp->f_flag & FASYNC;
565		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
566		if (error == 0) {
567			fdrop(fp, td);
568			break;
569		}
570		atomic_clear_int(&fp->f_flag, FNONBLOCK);
571		tmp = 0;
572		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
573		fdrop(fp, td);
574		break;
575
576	case F_GETOWN:
577		error = fget_unlocked(fdp, fd,
578		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
579		if (error != 0)
580			break;
581		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
582		if (error == 0)
583			td->td_retval[0] = tmp;
584		fdrop(fp, td);
585		break;
586
587	case F_SETOWN:
588		error = fget_unlocked(fdp, fd,
589		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
590		if (error != 0)
591			break;
592		tmp = arg;
593		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
594		fdrop(fp, td);
595		break;
596
597	case F_SETLK_REMOTE:
598		error = priv_check(td, PRIV_NFS_LOCKD);
599		if (error)
600			return (error);
601		flg = F_REMOTE;
602		goto do_setlk;
603
604	case F_SETLKW:
605		flg |= F_WAIT;
606		/* FALLTHROUGH F_SETLK */
607
608	case F_SETLK:
609	do_setlk:
610		cap_rights_init(&rights, CAP_FLOCK);
611		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
612		if (error != 0)
613			break;
614		if (fp->f_type != DTYPE_VNODE) {
615			error = EBADF;
616			fdrop(fp, td);
617			break;
618		}
619
620		flp = (struct flock *)arg;
621		if (flp->l_whence == SEEK_CUR) {
622			foffset = foffset_get(fp);
623			if (foffset < 0 ||
624			    (flp->l_start > 0 &&
625			     foffset > OFF_MAX - flp->l_start)) {
626				FILEDESC_SUNLOCK(fdp);
627				error = EOVERFLOW;
628				fdrop(fp, td);
629				break;
630			}
631			flp->l_start += foffset;
632		}
633
634		vp = fp->f_vnode;
635		switch (flp->l_type) {
636		case F_RDLCK:
637			if ((fp->f_flag & FREAD) == 0) {
638				error = EBADF;
639				break;
640			}
641			PROC_LOCK(p->p_leader);
642			p->p_leader->p_flag |= P_ADVLOCK;
643			PROC_UNLOCK(p->p_leader);
644			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
645			    flp, flg);
646			break;
647		case F_WRLCK:
648			if ((fp->f_flag & FWRITE) == 0) {
649				error = EBADF;
650				break;
651			}
652			PROC_LOCK(p->p_leader);
653			p->p_leader->p_flag |= P_ADVLOCK;
654			PROC_UNLOCK(p->p_leader);
655			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
656			    flp, flg);
657			break;
658		case F_UNLCK:
659			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
660			    flp, flg);
661			break;
662		case F_UNLCKSYS:
663			/*
664			 * Temporary api for testing remote lock
665			 * infrastructure.
666			 */
667			if (flg != F_REMOTE) {
668				error = EINVAL;
669				break;
670			}
671			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
672			    F_UNLCKSYS, flp, flg);
673			break;
674		default:
675			error = EINVAL;
676			break;
677		}
678		if (error != 0 || flp->l_type == F_UNLCK ||
679		    flp->l_type == F_UNLCKSYS) {
680			fdrop(fp, td);
681			break;
682		}
683
684		/*
685		 * Check for a race with close.
686		 *
687		 * The vnode is now advisory locked (or unlocked, but this case
688		 * is not really important) as the caller requested.
689		 * We had to drop the filedesc lock, so we need to recheck if
690		 * the descriptor is still valid, because if it was closed
691		 * in the meantime we need to remove advisory lock from the
692		 * vnode - close on any descriptor leading to an advisory
693		 * locked vnode, removes that lock.
694		 * We will return 0 on purpose in that case, as the result of
695		 * successful advisory lock might have been externally visible
696		 * already. This is fine - effectively we pretend to the caller
697		 * that the closing thread was a bit slower and that the
698		 * advisory lock succeeded before the close.
699		 */
700		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
701		if (error != 0) {
702			fdrop(fp, td);
703			break;
704		}
705		if (fp != fp2) {
706			flp->l_whence = SEEK_SET;
707			flp->l_start = 0;
708			flp->l_len = 0;
709			flp->l_type = F_UNLCK;
710			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
711			    F_UNLCK, flp, F_POSIX);
712		}
713		fdrop(fp, td);
714		fdrop(fp2, td);
715		break;
716
717	case F_GETLK:
718		error = fget_unlocked(fdp, fd,
719		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
720		if (error != 0)
721			break;
722		if (fp->f_type != DTYPE_VNODE) {
723			error = EBADF;
724			fdrop(fp, td);
725			break;
726		}
727		flp = (struct flock *)arg;
728		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
729		    flp->l_type != F_UNLCK) {
730			error = EINVAL;
731			fdrop(fp, td);
732			break;
733		}
734		if (flp->l_whence == SEEK_CUR) {
735			foffset = foffset_get(fp);
736			if ((flp->l_start > 0 &&
737			    foffset > OFF_MAX - flp->l_start) ||
738			    (flp->l_start < 0 &&
739			     foffset < OFF_MIN - flp->l_start)) {
740				FILEDESC_SUNLOCK(fdp);
741				error = EOVERFLOW;
742				fdrop(fp, td);
743				break;
744			}
745			flp->l_start += foffset;
746		}
747		vp = fp->f_vnode;
748		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
749		    F_POSIX);
750		fdrop(fp, td);
751		break;
752
753	case F_RDAHEAD:
754		arg = arg ? 128 * 1024: 0;
755		/* FALLTHROUGH */
756	case F_READAHEAD:
757		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
758		if (error != 0)
759			break;
760		if (fp->f_type != DTYPE_VNODE) {
761			fdrop(fp, td);
762			error = EBADF;
763			break;
764		}
765		vp = fp->f_vnode;
766		/*
767		 * Exclusive lock synchronizes against f_seqcount reads and
768		 * writes in sequential_heuristic().
769		 */
770		error = vn_lock(vp, LK_EXCLUSIVE);
771		if (error != 0) {
772			fdrop(fp, td);
773			break;
774		}
775		if (arg >= 0) {
776			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
777			fp->f_seqcount = (arg + bsize - 1) / bsize;
778			atomic_set_int(&fp->f_flag, FRDAHEAD);
779		} else {
780			atomic_clear_int(&fp->f_flag, FRDAHEAD);
781		}
782		VOP_UNLOCK(vp, 0);
783		fdrop(fp, td);
784		break;
785
786	default:
787		error = EINVAL;
788		break;
789	}
790	return (error);
791}
792
793static int
794getmaxfd(struct proc *p)
795{
796	int maxfd;
797
798	PROC_LOCK(p);
799	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
800	PROC_UNLOCK(p);
801
802	return (maxfd);
803}
804
805/*
806 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
807 */
808static int
809do_dup(struct thread *td, int flags, int old, int new,
810    register_t *retval)
811{
812	struct filedesc *fdp;
813	struct filedescent *oldfde, *newfde;
814	struct proc *p;
815	struct file *fp;
816	struct file *delfp;
817	int error, maxfd;
818
819	p = td->td_proc;
820	fdp = p->p_fd;
821
822	/*
823	 * Verify we have a valid descriptor to dup from and possibly to
824	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
825	 * return EINVAL when the new descriptor is out of bounds.
826	 */
827	if (old < 0)
828		return (EBADF);
829	if (new < 0)
830		return (flags & DUP_FCNTL ? EINVAL : EBADF);
831	maxfd = getmaxfd(p);
832	if (new >= maxfd)
833		return (flags & DUP_FCNTL ? EINVAL : EBADF);
834
835	FILEDESC_XLOCK(fdp);
836	if (fget_locked(fdp, old) == NULL) {
837		FILEDESC_XUNLOCK(fdp);
838		return (EBADF);
839	}
840	oldfde = &fdp->fd_ofiles[old];
841	if (flags & DUP_FIXED && old == new) {
842		*retval = new;
843		if (flags & DUP_CLOEXEC)
844			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
845		FILEDESC_XUNLOCK(fdp);
846		return (0);
847	}
848	fp = oldfde->fde_file;
849	fhold(fp);
850
851	/*
852	 * If the caller specified a file descriptor, make sure the file
853	 * table is large enough to hold it, and grab it.  Otherwise, just
854	 * allocate a new descriptor the usual way.
855	 */
856	if (flags & DUP_FIXED) {
857		if (new >= fdp->fd_nfiles) {
858			/*
859			 * The resource limits are here instead of e.g.
860			 * fdalloc(), because the file descriptor table may be
861			 * shared between processes, so we can't really use
862			 * racct_add()/racct_sub().  Instead of counting the
863			 * number of actually allocated descriptors, just put
864			 * the limit on the size of the file descriptor table.
865			 */
866#ifdef RACCT
867			PROC_LOCK(p);
868			error = racct_set(p, RACCT_NOFILE, new + 1);
869			PROC_UNLOCK(p);
870			if (error != 0) {
871				FILEDESC_XUNLOCK(fdp);
872				fdrop(fp, td);
873				return (EMFILE);
874			}
875#endif
876			fdgrowtable_exp(fdp, new + 1);
877			oldfde = &fdp->fd_ofiles[old];
878		}
879		newfde = &fdp->fd_ofiles[new];
880		if (newfde->fde_file == NULL)
881			fdused(fdp, new);
882	} else {
883		if ((error = fdalloc(td, new, &new)) != 0) {
884			FILEDESC_XUNLOCK(fdp);
885			fdrop(fp, td);
886			return (error);
887		}
888		newfde = &fdp->fd_ofiles[new];
889	}
890
891	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
892	KASSERT(old != new, ("new fd is same as old"));
893
894	delfp = newfde->fde_file;
895
896	/*
897	 * Duplicate the source descriptor.
898	 */
899#ifdef CAPABILITIES
900	seq_write_begin(&newfde->fde_seq);
901#endif
902	filecaps_free(&newfde->fde_caps);
903	memcpy(newfde, oldfde, fde_change_size);
904	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
905	if ((flags & DUP_CLOEXEC) != 0)
906		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
907	else
908		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
909#ifdef CAPABILITIES
910	seq_write_end(&newfde->fde_seq);
911#endif
912	*retval = new;
913
914	if (delfp != NULL) {
915		(void) closefp(fdp, new, delfp, td, 1);
916		/* closefp() drops the FILEDESC lock for us. */
917	} else {
918		FILEDESC_XUNLOCK(fdp);
919	}
920
921	return (0);
922}
923
924/*
925 * If sigio is on the list associated with a process or process group,
926 * disable signalling from the device, remove sigio from the list and
927 * free sigio.
928 */
929void
930funsetown(struct sigio **sigiop)
931{
932	struct sigio *sigio;
933
934	SIGIO_LOCK();
935	sigio = *sigiop;
936	if (sigio == NULL) {
937		SIGIO_UNLOCK();
938		return;
939	}
940	*(sigio->sio_myref) = NULL;
941	if ((sigio)->sio_pgid < 0) {
942		struct pgrp *pg = (sigio)->sio_pgrp;
943		PGRP_LOCK(pg);
944		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
945			     sigio, sio_pgsigio);
946		PGRP_UNLOCK(pg);
947	} else {
948		struct proc *p = (sigio)->sio_proc;
949		PROC_LOCK(p);
950		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
951			     sigio, sio_pgsigio);
952		PROC_UNLOCK(p);
953	}
954	SIGIO_UNLOCK();
955	crfree(sigio->sio_ucred);
956	free(sigio, M_SIGIO);
957}
958
959/*
960 * Free a list of sigio structures.
961 * We only need to lock the SIGIO_LOCK because we have made ourselves
962 * inaccessible to callers of fsetown and therefore do not need to lock
963 * the proc or pgrp struct for the list manipulation.
964 */
965void
966funsetownlst(struct sigiolst *sigiolst)
967{
968	struct proc *p;
969	struct pgrp *pg;
970	struct sigio *sigio;
971
972	sigio = SLIST_FIRST(sigiolst);
973	if (sigio == NULL)
974		return;
975	p = NULL;
976	pg = NULL;
977
978	/*
979	 * Every entry of the list should belong
980	 * to a single proc or pgrp.
981	 */
982	if (sigio->sio_pgid < 0) {
983		pg = sigio->sio_pgrp;
984		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
985	} else /* if (sigio->sio_pgid > 0) */ {
986		p = sigio->sio_proc;
987		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
988	}
989
990	SIGIO_LOCK();
991	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
992		*(sigio->sio_myref) = NULL;
993		if (pg != NULL) {
994			KASSERT(sigio->sio_pgid < 0,
995			    ("Proc sigio in pgrp sigio list"));
996			KASSERT(sigio->sio_pgrp == pg,
997			    ("Bogus pgrp in sigio list"));
998			PGRP_LOCK(pg);
999			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
1000			    sio_pgsigio);
1001			PGRP_UNLOCK(pg);
1002		} else /* if (p != NULL) */ {
1003			KASSERT(sigio->sio_pgid > 0,
1004			    ("Pgrp sigio in proc sigio list"));
1005			KASSERT(sigio->sio_proc == p,
1006			    ("Bogus proc in sigio list"));
1007			PROC_LOCK(p);
1008			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
1009			    sio_pgsigio);
1010			PROC_UNLOCK(p);
1011		}
1012		SIGIO_UNLOCK();
1013		crfree(sigio->sio_ucred);
1014		free(sigio, M_SIGIO);
1015		SIGIO_LOCK();
1016	}
1017	SIGIO_UNLOCK();
1018}
1019
1020/*
1021 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
1022 *
1023 * After permission checking, add a sigio structure to the sigio list for
1024 * the process or process group.
1025 */
1026int
1027fsetown(pid_t pgid, struct sigio **sigiop)
1028{
1029	struct proc *proc;
1030	struct pgrp *pgrp;
1031	struct sigio *sigio;
1032	int ret;
1033
1034	if (pgid == 0) {
1035		funsetown(sigiop);
1036		return (0);
1037	}
1038
1039	ret = 0;
1040
1041	/* Allocate and fill in the new sigio out of locks. */
1042	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
1043	sigio->sio_pgid = pgid;
1044	sigio->sio_ucred = crhold(curthread->td_ucred);
1045	sigio->sio_myref = sigiop;
1046
1047	sx_slock(&proctree_lock);
1048	if (pgid > 0) {
1049		proc = pfind(pgid);
1050		if (proc == NULL) {
1051			ret = ESRCH;
1052			goto fail;
1053		}
1054
1055		/*
1056		 * Policy - Don't allow a process to FSETOWN a process
1057		 * in another session.
1058		 *
1059		 * Remove this test to allow maximum flexibility or
1060		 * restrict FSETOWN to the current process or process
1061		 * group for maximum safety.
1062		 */
1063		PROC_UNLOCK(proc);
1064		if (proc->p_session != curthread->td_proc->p_session) {
1065			ret = EPERM;
1066			goto fail;
1067		}
1068
1069		pgrp = NULL;
1070	} else /* if (pgid < 0) */ {
1071		pgrp = pgfind(-pgid);
1072		if (pgrp == NULL) {
1073			ret = ESRCH;
1074			goto fail;
1075		}
1076		PGRP_UNLOCK(pgrp);
1077
1078		/*
1079		 * Policy - Don't allow a process to FSETOWN a process
1080		 * in another session.
1081		 *
1082		 * Remove this test to allow maximum flexibility or
1083		 * restrict FSETOWN to the current process or process
1084		 * group for maximum safety.
1085		 */
1086		if (pgrp->pg_session != curthread->td_proc->p_session) {
1087			ret = EPERM;
1088			goto fail;
1089		}
1090
1091		proc = NULL;
1092	}
1093	funsetown(sigiop);
1094	if (pgid > 0) {
1095		PROC_LOCK(proc);
1096		/*
1097		 * Since funsetownlst() is called without the proctree
1098		 * locked, we need to check for P_WEXIT.
1099		 * XXX: is ESRCH correct?
1100		 */
1101		if ((proc->p_flag & P_WEXIT) != 0) {
1102			PROC_UNLOCK(proc);
1103			ret = ESRCH;
1104			goto fail;
1105		}
1106		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
1107		sigio->sio_proc = proc;
1108		PROC_UNLOCK(proc);
1109	} else {
1110		PGRP_LOCK(pgrp);
1111		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
1112		sigio->sio_pgrp = pgrp;
1113		PGRP_UNLOCK(pgrp);
1114	}
1115	sx_sunlock(&proctree_lock);
1116	SIGIO_LOCK();
1117	*sigiop = sigio;
1118	SIGIO_UNLOCK();
1119	return (0);
1120
1121fail:
1122	sx_sunlock(&proctree_lock);
1123	crfree(sigio->sio_ucred);
1124	free(sigio, M_SIGIO);
1125	return (ret);
1126}
1127
1128/*
1129 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1130 */
1131pid_t
1132fgetown(sigiop)
1133	struct sigio **sigiop;
1134{
1135	pid_t pgid;
1136
1137	SIGIO_LOCK();
1138	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1139	SIGIO_UNLOCK();
1140	return (pgid);
1141}
1142
1143/*
1144 * Function drops the filedesc lock on return.
1145 */
1146static int
1147closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1148    int holdleaders)
1149{
1150	int error;
1151
1152	FILEDESC_XLOCK_ASSERT(fdp);
1153
1154	if (holdleaders) {
1155		if (td->td_proc->p_fdtol != NULL) {
1156			/*
1157			 * Ask fdfree() to sleep to ensure that all relevant
1158			 * process leaders can be traversed in closef().
1159			 */
1160			fdp->fd_holdleaderscount++;
1161		} else {
1162			holdleaders = 0;
1163		}
1164	}
1165
1166	/*
1167	 * We now hold the fp reference that used to be owned by the
1168	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1169	 * knote_fdclose to prevent a race of the fd getting opened, a knote
1170	 * added, and deleteing a knote for the new fd.
1171	 */
1172	knote_fdclose(td, fd);
1173
1174	/*
1175	 * We need to notify mqueue if the object is of type mqueue.
1176	 */
1177	if (fp->f_type == DTYPE_MQUEUE)
1178		mq_fdclose(td, fd, fp);
1179	FILEDESC_XUNLOCK(fdp);
1180
1181	error = closef(fp, td);
1182	if (holdleaders) {
1183		FILEDESC_XLOCK(fdp);
1184		fdp->fd_holdleaderscount--;
1185		if (fdp->fd_holdleaderscount == 0 &&
1186		    fdp->fd_holdleaderswakeup != 0) {
1187			fdp->fd_holdleaderswakeup = 0;
1188			wakeup(&fdp->fd_holdleaderscount);
1189		}
1190		FILEDESC_XUNLOCK(fdp);
1191	}
1192	return (error);
1193}
1194
/*
 * close(2) system call entry point: close a file descriptor.
 *
 * All of the work is done by kern_close(); its return value is passed
 * straight back to the caller.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int     fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}
1212
1213int
1214kern_close(td, fd)
1215	struct thread *td;
1216	int fd;
1217{
1218	struct filedesc *fdp;
1219	struct file *fp;
1220
1221	fdp = td->td_proc->p_fd;
1222
1223	AUDIT_SYSCLOSE(td, fd);
1224
1225	FILEDESC_XLOCK(fdp);
1226	if ((fp = fget_locked(fdp, fd)) == NULL) {
1227		FILEDESC_XUNLOCK(fdp);
1228		return (EBADF);
1229	}
1230	fdfree(fdp, fd);
1231
1232	/* closefp() drops the FILEDESC lock for us. */
1233	return (closefp(fdp, fd, fp, td, 1));
1234}
1235
1236/*
1237 * Close open file descriptors.
1238 */
1239#ifndef _SYS_SYSPROTO_H_
1240struct closefrom_args {
1241	int	lowfd;
1242};
1243#endif
1244/* ARGSUSED */
1245int
1246sys_closefrom(struct thread *td, struct closefrom_args *uap)
1247{
1248	struct filedesc *fdp;
1249	int fd;
1250
1251	fdp = td->td_proc->p_fd;
1252	AUDIT_ARG_FD(uap->lowfd);
1253
1254	/*
1255	 * Treat negative starting file descriptor values identical to
1256	 * closefrom(0) which closes all files.
1257	 */
1258	if (uap->lowfd < 0)
1259		uap->lowfd = 0;
1260	FILEDESC_SLOCK(fdp);
1261	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
1262		if (fdp->fd_ofiles[fd].fde_file != NULL) {
1263			FILEDESC_SUNLOCK(fdp);
1264			(void)kern_close(td, fd);
1265			FILEDESC_SLOCK(fdp);
1266		}
1267	}
1268	FILEDESC_SUNLOCK(fdp);
1269	return (0);
1270}
1271
#if defined(COMPAT_43)
/*
 * COMPAT_43 flavour of fstat: fetch the status of a descriptor with
 * kern_fstat(), convert it to a struct ostat with cvtstat() and copy the
 * result out to uap->sb.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct ostat oub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error != 0)
		return (error);
	cvtstat(&ub, &oub);
	return (copyout(&oub, uap->sb, sizeof(oub)));
}
#endif /* COMPAT_43 */
1298
1299/*
1300 * Return status information about a file descriptor.
1301 */
1302#ifndef _SYS_SYSPROTO_H_
1303struct fstat_args {
1304	int	fd;
1305	struct	stat *sb;
1306};
1307#endif
1308/* ARGSUSED */
1309int
1310sys_fstat(struct thread *td, struct fstat_args *uap)
1311{
1312	struct stat ub;
1313	int error;
1314
1315	error = kern_fstat(td, uap->fd, &ub);
1316	if (error == 0)
1317		error = copyout(&ub, uap->sb, sizeof(ub));
1318	return (error);
1319}
1320
1321int
1322kern_fstat(struct thread *td, int fd, struct stat *sbp)
1323{
1324	struct file *fp;
1325	cap_rights_t rights;
1326	int error;
1327
1328	AUDIT_ARG_FD(fd);
1329
1330	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
1331	if (error != 0)
1332		return (error);
1333
1334	AUDIT_ARG_FILE(td->td_proc, fp);
1335
1336	error = fo_stat(fp, sbp, td->td_ucred, td);
1337	fdrop(fp, td);
1338#ifdef KTRACE
1339	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1340		ktrstat(sbp);
1341#endif
1342	return (error);
1343}
1344
1345/*
1346 * Return status information about a file descriptor.
1347 */
1348#ifndef _SYS_SYSPROTO_H_
1349struct nfstat_args {
1350	int	fd;
1351	struct	nstat *sb;
1352};
1353#endif
1354/* ARGSUSED */
1355int
1356sys_nfstat(struct thread *td, struct nfstat_args *uap)
1357{
1358	struct nstat nub;
1359	struct stat ub;
1360	int error;
1361
1362	error = kern_fstat(td, uap->fd, &ub);
1363	if (error == 0) {
1364		cvtnstat(&ub, &nub);
1365		error = copyout(&nub, uap->sb, sizeof(nub));
1366	}
1367	return (error);
1368}
1369
1370/*
1371 * Return pathconf information about a file descriptor.
1372 */
1373#ifndef _SYS_SYSPROTO_H_
1374struct fpathconf_args {
1375	int	fd;
1376	int	name;
1377};
1378#endif
1379/* ARGSUSED */
1380int
1381sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1382{
1383	struct file *fp;
1384	struct vnode *vp;
1385	cap_rights_t rights;
1386	int error;
1387
1388	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
1389	if (error != 0)
1390		return (error);
1391
1392	/* If asynchronous I/O is available, it works for all descriptors. */
1393	if (uap->name == _PC_ASYNC_IO) {
1394		td->td_retval[0] = async_io_version;
1395		goto out;
1396	}
1397	vp = fp->f_vnode;
1398	if (vp != NULL) {
1399		vn_lock(vp, LK_SHARED | LK_RETRY);
1400		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1401		VOP_UNLOCK(vp, 0);
1402	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1403		if (uap->name != _PC_PIPE_BUF) {
1404			error = EINVAL;
1405		} else {
1406			td->td_retval[0] = PIPE_BUF;
1407			error = 0;
1408		}
1409	} else {
1410		error = EOPNOTSUPP;
1411	}
1412out:
1413	fdrop(fp, td);
1414	return (error);
1415}
1416
1417/*
1418 * Initialize filecaps structure.
1419 */
1420void
1421filecaps_init(struct filecaps *fcaps)
1422{
1423
1424	bzero(fcaps, sizeof(*fcaps));
1425	fcaps->fc_nioctls = -1;
1426}
1427
1428/*
1429 * Copy filecaps structure allocating memory for ioctls array if needed.
1430 */
1431void
1432filecaps_copy(const struct filecaps *src, struct filecaps *dst)
1433{
1434	size_t size;
1435
1436	*dst = *src;
1437	if (src->fc_ioctls != NULL) {
1438		KASSERT(src->fc_nioctls > 0,
1439		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1440
1441		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1442		dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1443		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1444	}
1445}
1446
1447/*
1448 * Move filecaps structure to the new place and clear the old place.
1449 */
1450void
1451filecaps_move(struct filecaps *src, struct filecaps *dst)
1452{
1453
1454	*dst = *src;
1455	bzero(src, sizeof(*src));
1456}
1457
1458/*
1459 * Fill the given filecaps structure with full rights.
1460 */
1461static void
1462filecaps_fill(struct filecaps *fcaps)
1463{
1464
1465	CAP_ALL(&fcaps->fc_rights);
1466	fcaps->fc_ioctls = NULL;
1467	fcaps->fc_nioctls = -1;
1468	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1469}
1470
1471/*
1472 * Free memory allocated within filecaps structure.
1473 */
1474void
1475filecaps_free(struct filecaps *fcaps)
1476{
1477
1478	free(fcaps->fc_ioctls, M_FILECAPS);
1479	bzero(fcaps, sizeof(*fcaps));
1480}
1481
1482/*
1483 * Validate the given filecaps structure.
1484 */
1485static void
1486filecaps_validate(const struct filecaps *fcaps, const char *func)
1487{
1488
1489	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
1490	    ("%s: invalid rights", func));
1491	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1492	    ("%s: invalid fcntls", func));
1493	KASSERT(fcaps->fc_fcntls == 0 ||
1494	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
1495	    ("%s: fcntls without CAP_FCNTL", func));
1496	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1497	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1498	    ("%s: invalid ioctls", func));
1499	KASSERT(fcaps->fc_nioctls == 0 ||
1500	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
1501	    ("%s: ioctls without CAP_IOCTL", func));
1502}
1503
1504static void
1505fdgrowtable_exp(struct filedesc *fdp, int nfd)
1506{
1507	int nfd1;
1508
1509	FILEDESC_XLOCK_ASSERT(fdp);
1510
1511	nfd1 = fdp->fd_nfiles * 2;
1512	if (nfd1 < nfd)
1513		nfd1 = nfd;
1514	fdgrowtable(fdp, nfd1);
1515}
1516
/*
 * Grow the file table to accommodate (at least) nfd descriptors.
 *
 * The caller must hold the filedesc lock exclusively.  The requested size
 * is rounded up to a multiple of NDENTRIES; if the current table is
 * already that large nothing happens.  Otherwise a new, zeroed table is
 * allocated, the old entries are copied over, the in-use bitmap is
 * enlarged when it needs more slots, and the new table is published with
 * a release store.  Allocations use M_WAITOK.
 */
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
	struct filedesc0 *fdp0;
	struct freetable *ft;
	struct fdescenttbl *ntable;
	struct fdescenttbl *otable;
	int nnfiles, onfiles;
	NDSLOTTYPE *nmap, *omap;

	FILEDESC_XLOCK_ASSERT(fdp);

	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));

	/* save old values */
	onfiles = fdp->fd_nfiles;
	otable = fdp->fd_files;
	omap = fdp->fd_map;

	/* compute the size of the new table */
	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
	if (nnfiles <= onfiles)
		/* the table is already large enough */
		return;

	/*
	 * Allocate a new table.  We need enough space for the table header
	 * (everything up to fdt_ofiles), the file entries themselves and
	 * the struct freetable we will use when we decommission the table
	 * and place it on the freelist.  The struct freetable lives
	 * directly behind the last file entry (see the decommissioning
	 * code at the end of this function).
	 */
	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
	    sizeof(struct freetable),
	    M_FILEDESC, M_ZERO | M_WAITOK);
	/* copy the old data */
	ntable->fdt_nfiles = nnfiles;
	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
	    onfiles * sizeof(ntable->fdt_ofiles[0]));

	/*
	 * Allocate a new map only if the old is not large enough.  It will
	 * grow at a slower rate than the table as it can map more
	 * entries than the table can hold.
	 */
	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
		    M_ZERO | M_WAITOK);
		/* copy over the old data and update the pointer */
		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
		fdp->fd_map = nmap;
	}

	/*
	 * Make sure that ntable is correctly initialized before we replace
	 * the fd_files pointer.  Otherwise fget_unlocked() may see
	 * inconsistent data.
	 *
	 * NOTE(review): fd_nfiles is never assigned in this function;
	 * presumably it resolves through fd_files->fdt_nfiles, so this
	 * store also publishes the new size -- confirm against the
	 * struct filedesc definition.
	 */
	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);

	/*
	 * Do not free the old file table, as some threads may still
	 * reference entries within it.  Instead, place it on a freelist
	 * which will be processed when the struct filedesc is released.
	 *
	 * Note that if onfiles == NDFILE, we're dealing with the original
	 * static allocation contained within (struct filedesc0 *)fdp,
	 * which must not be freed.
	 */
	if (onfiles > NDFILE) {
		/* The freetable record sits right behind the old entries. */
		ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
		fdp0 = (struct filedesc0 *)fdp;
		ft->ft_table = otable;
		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
	}
	/*
	 * The map does not have the same possibility of threads still
	 * holding references to it.  So always free it as long as it
	 * does not reference the original static allocation.
	 */
	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
		free(omap, M_FILEDESC);
}
1604
1605/*
1606 * Allocate a file descriptor for the process.
1607 */
1608int
1609fdalloc(struct thread *td, int minfd, int *result)
1610{
1611	struct proc *p = td->td_proc;
1612	struct filedesc *fdp = p->p_fd;
1613	int fd = -1, maxfd, allocfd;
1614#ifdef RACCT
1615	int error;
1616#endif
1617
1618	FILEDESC_XLOCK_ASSERT(fdp);
1619
1620	if (fdp->fd_freefile > minfd)
1621		minfd = fdp->fd_freefile;
1622
1623	maxfd = getmaxfd(p);
1624
1625	/*
1626	 * Search the bitmap for a free descriptor starting at minfd.
1627	 * If none is found, grow the file table.
1628	 */
1629	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1630	if (fd >= maxfd)
1631		return (EMFILE);
1632	if (fd >= fdp->fd_nfiles) {
1633		allocfd = min(fd * 2, maxfd);
1634#ifdef RACCT
1635		PROC_LOCK(p);
1636		error = racct_set(p, RACCT_NOFILE, allocfd);
1637		PROC_UNLOCK(p);
1638		if (error != 0)
1639			return (EMFILE);
1640#endif
1641		/*
1642		 * fd is already equal to first free descriptor >= minfd, so
1643		 * we only need to grow the table and we are done.
1644		 */
1645		fdgrowtable_exp(fdp, allocfd);
1646	}
1647
1648	/*
1649	 * Perform some sanity checks, then mark the file descriptor as
1650	 * used and return it to the caller.
1651	 */
1652	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
1653	    ("invalid descriptor %d", fd));
1654	KASSERT(!fdisused(fdp, fd),
1655	    ("fd_first_free() returned non-free descriptor"));
1656	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
1657	    ("file descriptor isn't free"));
1658	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
1659	fdused(fdp, fd);
1660	*result = fd;
1661	return (0);
1662}
1663
1664/*
1665 * Allocate n file descriptors for the process.
1666 */
1667int
1668fdallocn(struct thread *td, int minfd, int *fds, int n)
1669{
1670	struct proc *p = td->td_proc;
1671	struct filedesc *fdp = p->p_fd;
1672	int i;
1673
1674	FILEDESC_XLOCK_ASSERT(fdp);
1675
1676	for (i = 0; i < n; i++)
1677		if (fdalloc(td, 0, &fds[i]) != 0)
1678			break;
1679
1680	if (i < n) {
1681		for (i--; i >= 0; i--)
1682			fdunused(fdp, fds[i]);
1683		return (EMFILE);
1684	}
1685
1686	return (0);
1687}
1688
1689/*
1690 * Create a new open file structure and allocate a file decriptor for the
1691 * process that refers to it.  We add one reference to the file for the
1692 * descriptor table and one reference for resultfp. This is to prevent us
1693 * being preempted and the entry in the descriptor table closed after we
1694 * release the FILEDESC lock.
1695 */
1696int
1697falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1698{
1699	struct file *fp;
1700	int error, fd;
1701
1702	error = falloc_noinstall(td, &fp);
1703	if (error)
1704		return (error);		/* no reference held on error */
1705
1706	error = finstall(td, fp, &fd, flags, NULL);
1707	if (error) {
1708		fdrop(fp, td);		/* one reference (fp only) */
1709		return (error);
1710	}
1711
1712	if (resultfp != NULL)
1713		*resultfp = fp;		/* copy out result */
1714	else
1715		fdrop(fp, td);		/* release local reference */
1716
1717	if (resultfd != NULL)
1718		*resultfd = fd;
1719
1720	return (0);
1721}
1722
1723/*
1724 * Create a new open file structure without allocating a file descriptor.
1725 */
1726int
1727falloc_noinstall(struct thread *td, struct file **resultfp)
1728{
1729	struct file *fp;
1730	int maxuserfiles = maxfiles - (maxfiles / 20);
1731	static struct timeval lastfail;
1732	static int curfail;
1733
1734	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1735
1736	if ((openfiles >= maxuserfiles &&
1737	    priv_check(td, PRIV_MAXFILES) != 0) ||
1738	    openfiles >= maxfiles) {
1739		if (ppsratecheck(&lastfail, &curfail, 1)) {
1740			printf("kern.maxfiles limit exceeded by uid %i, "
1741			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1742		}
1743		return (ENFILE);
1744	}
1745	atomic_add_int(&openfiles, 1);
1746	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1747	refcount_init(&fp->f_count, 1);
1748	fp->f_cred = crhold(td->td_ucred);
1749	fp->f_ops = &badfileops;
1750	*resultfp = fp;
1751	return (0);
1752}
1753
1754/*
1755 * Install a file in a file descriptor table.
1756 */
1757int
1758finstall(struct thread *td, struct file *fp, int *fd, int flags,
1759    struct filecaps *fcaps)
1760{
1761	struct filedesc *fdp = td->td_proc->p_fd;
1762	struct filedescent *fde;
1763	int error;
1764
1765	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1766	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1767	if (fcaps != NULL)
1768		filecaps_validate(fcaps, __func__);
1769
1770	FILEDESC_XLOCK(fdp);
1771	if ((error = fdalloc(td, 0, fd))) {
1772		FILEDESC_XUNLOCK(fdp);
1773		return (error);
1774	}
1775	fhold(fp);
1776	fde = &fdp->fd_ofiles[*fd];
1777#ifdef CAPABILITIES
1778	seq_write_begin(&fde->fde_seq);
1779#endif
1780	fde->fde_file = fp;
1781	if ((flags & O_CLOEXEC) != 0)
1782		fde->fde_flags |= UF_EXCLOSE;
1783	if (fcaps != NULL)
1784		filecaps_move(fcaps, &fde->fde_caps);
1785	else
1786		filecaps_fill(&fde->fde_caps);
1787#ifdef CAPABILITIES
1788	seq_write_end(&fde->fde_seq);
1789#endif
1790	FILEDESC_XUNLOCK(fdp);
1791	return (0);
1792}
1793
1794/*
1795 * Build a new filedesc structure from another.
1796 * Copy the current, root, and jail root vnode references.
1797 */
1798struct filedesc *
1799fdinit(struct filedesc *fdp)
1800{
1801	struct filedesc0 *newfdp;
1802
1803	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1804	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1805	if (fdp != NULL) {
1806		FILEDESC_SLOCK(fdp);
1807		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1808		if (newfdp->fd_fd.fd_cdir)
1809			VREF(newfdp->fd_fd.fd_cdir);
1810		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1811		if (newfdp->fd_fd.fd_rdir)
1812			VREF(newfdp->fd_fd.fd_rdir);
1813		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1814		if (newfdp->fd_fd.fd_jdir)
1815			VREF(newfdp->fd_fd.fd_jdir);
1816		FILEDESC_SUNLOCK(fdp);
1817	}
1818
1819	/* Create the file descriptor table. */
1820	newfdp->fd_fd.fd_refcnt = 1;
1821	newfdp->fd_fd.fd_holdcnt = 1;
1822	newfdp->fd_fd.fd_cmask = CMASK;
1823	newfdp->fd_dfiles.fdt_nfiles = NDFILE;
1824	newfdp->fd_fd.fd_files = (struct fdescenttbl *)&newfdp->fd_dfiles;
1825	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1826	newfdp->fd_fd.fd_lastfile = -1;
1827	return (&newfdp->fd_fd);
1828}
1829
1830static struct filedesc *
1831fdhold(struct proc *p)
1832{
1833	struct filedesc *fdp;
1834
1835	mtx_lock(&fdesc_mtx);
1836	fdp = p->p_fd;
1837	if (fdp != NULL)
1838		fdp->fd_holdcnt++;
1839	mtx_unlock(&fdesc_mtx);
1840	return (fdp);
1841}
1842
1843static void
1844fddrop(struct filedesc *fdp)
1845{
1846	struct filedesc0 *fdp0;
1847	struct freetable *ft;
1848	int i;
1849
1850	mtx_lock(&fdesc_mtx);
1851	i = --fdp->fd_holdcnt;
1852	mtx_unlock(&fdesc_mtx);
1853	if (i > 0)
1854		return;
1855
1856	FILEDESC_LOCK_DESTROY(fdp);
1857	fdp0 = (struct filedesc0 *)fdp;
1858	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1859		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1860		free(ft->ft_table, M_FILEDESC);
1861	}
1862	free(fdp, M_FILEDESC);
1863}
1864
1865/*
1866 * Share a filedesc structure.
1867 */
1868struct filedesc *
1869fdshare(struct filedesc *fdp)
1870{
1871
1872	FILEDESC_XLOCK(fdp);
1873	fdp->fd_refcnt++;
1874	FILEDESC_XUNLOCK(fdp);
1875	return (fdp);
1876}
1877
1878/*
1879 * Unshare a filedesc structure, if necessary by making a copy
1880 */
1881void
1882fdunshare(struct thread *td)
1883{
1884	struct filedesc *tmp;
1885	struct proc *p = td->td_proc;
1886
1887	if (p->p_fd->fd_refcnt == 1)
1888		return;
1889
1890	tmp = fdcopy(p->p_fd);
1891	fdescfree(td);
1892	p->p_fd = tmp;
1893}
1894
1895/*
1896 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
1897 * this is to ease callers, not catch errors.
1898 */
1899struct filedesc *
1900fdcopy(struct filedesc *fdp)
1901{
1902	struct filedesc *newfdp;
1903	struct filedescent *nfde, *ofde;
1904	int i;
1905
1906	/* Certain daemons might not have file descriptors. */
1907	if (fdp == NULL)
1908		return (NULL);
1909
1910	newfdp = fdinit(fdp);
1911	FILEDESC_SLOCK(fdp);
1912	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1913		FILEDESC_SUNLOCK(fdp);
1914		FILEDESC_XLOCK(newfdp);
1915		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1916		FILEDESC_XUNLOCK(newfdp);
1917		FILEDESC_SLOCK(fdp);
1918	}
1919	/* copy all passable descriptors (i.e. not kqueue) */
1920	newfdp->fd_freefile = -1;
1921	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1922		ofde = &fdp->fd_ofiles[i];
1923		if (fdisused(fdp, i) &&
1924		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
1925		    ofde->fde_file->f_ops != &badfileops) {
1926			nfde = &newfdp->fd_ofiles[i];
1927			*nfde = *ofde;
1928			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
1929			fhold(nfde->fde_file);
1930			newfdp->fd_lastfile = i;
1931		} else {
1932			if (newfdp->fd_freefile == -1)
1933				newfdp->fd_freefile = i;
1934		}
1935	}
1936	newfdp->fd_cmask = fdp->fd_cmask;
1937	FILEDESC_SUNLOCK(fdp);
1938	FILEDESC_XLOCK(newfdp);
1939	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
1940		if (newfdp->fd_ofiles[i].fde_file != NULL)
1941			fdused(newfdp, i);
1942	}
1943	if (newfdp->fd_freefile == -1)
1944		newfdp->fd_freefile = i;
1945	FILEDESC_XUNLOCK(newfdp);
1946	return (newfdp);
1947}
1948
/*
 * Release a filedesc structure.
 *
 * Detaches the descriptor table from the process of 'td' and drops one
 * reference on it.  The work is done in three stages:
 *
 *  1. If the process has a filedesc_to_leader (p_fdtol), POSIX advisory
 *     locks held on behalf of the leader are released when this is the
 *     last fdtol reference, the function waits for concurrent closers
 *     that still need the leader, and the fdtol reference is dropped
 *     (unlinking and freeing the fdtol when nothing refers to it).
 *  2. p_fd is cleared under fdesc_mtx and fd_refcnt is decremented.  If
 *     other references remain the function returns here.
 *  3. For the last reference every open file is closed, the dynamically
 *     allocated map and table are freed, the directory vnodes are
 *     released and the hold taken at creation is dropped via fddrop().
 *
 * Does nothing if the process has no descriptor table.
 */
void
fdescfree(struct thread *td)
{
	struct filedesc *fdp;
	int i;
	struct filedesc_to_leader *fdtol;
	struct file *fp;
	struct vnode *cdir, *jdir, *rdir, *vp;
	struct flock lf;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

#ifdef RACCT
	PROC_LOCK(td->td_proc);
	racct_set(td->td_proc, RACCT_NOFILE, 0);
	PROC_UNLOCK(td->td_proc);
#endif

	/* Check for special need to clear POSIX style locks */
	fdtol = td->td_proc->p_fdtol;
	if (fdtol != NULL) {
		FILEDESC_XLOCK(fdp);
		KASSERT(fdtol->fdl_refcount > 0,
		    ("filedesc_to_refcount botch: fdl_refcount=%d",
		    fdtol->fdl_refcount));
		if (fdtol->fdl_refcount == 1 &&
		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
			for (i = 0; i <= fdp->fd_lastfile; i++) {
				fp = fdp->fd_ofiles[i].fde_file;
				if (fp == NULL || fp->f_type != DTYPE_VNODE)
					continue;
				/*
				 * Keep the file alive while the table lock
				 * is dropped around VOP_ADVLOCK().
				 */
				fhold(fp);
				FILEDESC_XUNLOCK(fdp);
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = fp->f_vnode;
				(void) VOP_ADVLOCK(vp,
				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
				    &lf, F_POSIX);
				FILEDESC_XLOCK(fdp);
				fdrop(fp, td);
			}
		}
	retry:
		/*
		 * Both sleeps below release the filedesc lock, so the
		 * conditions are re-evaluated from the top after waking.
		 */
		if (fdtol->fdl_refcount == 1) {
			if (fdp->fd_holdleaderscount > 0 &&
			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
				/*
				 * close() or do_dup() has cleared a reference
				 * in a shared file descriptor table.
				 */
				fdp->fd_holdleaderswakeup = 1;
				sx_sleep(&fdp->fd_holdleaderscount,
				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
				goto retry;
			}
			if (fdtol->fdl_holdcount > 0) {
				/*
				 * Ensure that fdtol->fdl_leader remains
				 * valid in closef().
				 */
				fdtol->fdl_wakeup = 1;
				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
				    "fdlhold", 0);
				goto retry;
			}
		}
		fdtol->fdl_refcount--;
		if (fdtol->fdl_refcount == 0 &&
		    fdtol->fdl_holdcount == 0) {
			/* Last user: unlink from the circular fdtol list. */
			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
		} else
			fdtol = NULL;	/* still in use; do not free below */
		td->td_proc->p_fdtol = NULL;
		FILEDESC_XUNLOCK(fdp);
		if (fdtol != NULL)
			free(fdtol, M_FILEDESC_TO_LEADER);
	}

	/* Detach the table from the process; fdhold() reads p_fd under this mutex. */
	mtx_lock(&fdesc_mtx);
	td->td_proc->p_fd = NULL;
	mtx_unlock(&fdesc_mtx);

	FILEDESC_XLOCK(fdp);
	i = --fdp->fd_refcnt;
	if (i > 0) {
		/* Other sharers remain; they own the contents now. */
		FILEDESC_XUNLOCK(fdp);
		return;
	}

	/* Take the directory vnodes out while the lock is still held. */
	cdir = fdp->fd_cdir;
	fdp->fd_cdir = NULL;
	rdir = fdp->fd_rdir;
	fdp->fd_rdir = NULL;
	jdir = fdp->fd_jdir;
	fdp->fd_jdir = NULL;
	FILEDESC_XUNLOCK(fdp);

	/* Close every file that is still open. */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		fp = fdp->fd_ofiles[i].fde_file;
		if (fp != NULL) {
			fdfree_last(fdp, i);
			(void) closef(fp, td);
		}
	}

	/* Free the map and table unless they are the built-in ones. */
	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
		free(fdp->fd_map, M_FILEDESC);
	if (fdp->fd_nfiles > NDFILE)
		free(fdp->fd_files, M_FILEDESC);

	if (cdir != NULL)
		vrele(cdir);
	if (rdir != NULL)
		vrele(rdir);
	if (jdir != NULL)
		vrele(jdir);

	fddrop(fdp);
}
2078
2079/*
2080 * For setugid programs, we don't want to people to use that setugidness
2081 * to generate error messages which write to a file which otherwise would
2082 * otherwise be off-limits to the process.  We check for filesystems where
2083 * the vnode can change out from under us after execve (like [lin]procfs).
2084 *
2085 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
2086 * sufficient.  We also don't check for setugidness since we know we are.
2087 */
2088static bool
2089is_unsafe(struct file *fp)
2090{
2091	struct vnode *vp;
2092
2093	if (fp->f_type != DTYPE_VNODE)
2094		return (false);
2095
2096	vp = fp->f_vnode;
2097	return ((vp->v_vflag & VV_PROCDEP) != 0);
2098}
2099
2100/*
2101 * Make this setguid thing safe, if at all possible.
2102 */
2103void
2104fdsetugidsafety(struct thread *td)
2105{
2106	struct filedesc *fdp;
2107	struct file *fp;
2108	int i;
2109
2110	fdp = td->td_proc->p_fd;
2111	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2112	MPASS(fdp->fd_nfiles >= 3);
2113	for (i = 0; i <= 2; i++) {
2114		fp = fdp->fd_ofiles[i].fde_file;
2115		if (fp != NULL && is_unsafe(fp)) {
2116			FILEDESC_XLOCK(fdp);
2117			knote_fdclose(td, i);
2118			/*
2119			 * NULL-out descriptor prior to close to avoid
2120			 * a race while close blocks.
2121			 */
2122			fdfree(fdp, i);
2123			FILEDESC_XUNLOCK(fdp);
2124			(void) closef(fp, td);
2125		}
2126	}
2127}
2128
2129/*
2130 * If a specific file object occupies a specific file descriptor, close the
2131 * file descriptor entry and drop a reference on the file object.  This is a
2132 * convenience function to handle a subsequent error in a function that calls
2133 * falloc() that handles the race that another thread might have closed the
2134 * file descriptor out from under the thread creating the file object.
2135 */
2136void
2137fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2138{
2139
2140	FILEDESC_XLOCK(fdp);
2141	if (fdp->fd_ofiles[idx].fde_file == fp) {
2142		fdfree(fdp, idx);
2143		FILEDESC_XUNLOCK(fdp);
2144		fdrop(fp, td);
2145	} else
2146		FILEDESC_XUNLOCK(fdp);
2147}
2148
2149/*
2150 * Close any files on exec?
2151 */
2152void
2153fdcloseexec(struct thread *td)
2154{
2155	struct filedesc *fdp;
2156	struct filedescent *fde;
2157	struct file *fp;
2158	int i;
2159
2160	fdp = td->td_proc->p_fd;
2161	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2162	FILEDESC_XLOCK(fdp);
2163	for (i = 0; i <= fdp->fd_lastfile; i++) {
2164		fde = &fdp->fd_ofiles[i];
2165		fp = fde->fde_file;
2166		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2167		    (fde->fde_flags & UF_EXCLOSE))) {
2168			fdfree(fdp, i);
2169			(void) closefp(fdp, i, fp, td, 0);
2170			/* closefp() drops the FILEDESC lock. */
2171			FILEDESC_XLOCK(fdp);
2172		}
2173	}
2174	FILEDESC_XUNLOCK(fdp);
2175}
2176
2177/*
2178 * It is unsafe for set[ug]id processes to be started with file
2179 * descriptors 0..2 closed, as these descriptors are given implicit
2180 * significance in the Standard C library.  fdcheckstd() will create a
2181 * descriptor referencing /dev/null for each of stdin, stdout, and
2182 * stderr that is not already open.
2183 */
2184int
2185fdcheckstd(struct thread *td)
2186{
2187	struct filedesc *fdp;
2188	register_t retval, save;
2189	int i, error, devnull;
2190
2191	fdp = td->td_proc->p_fd;
2192	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2193	devnull = -1;
2194	error = 0;
2195	for (i = 0; i < 3; i++) {
2196		if (fdp->fd_ofiles[i].fde_file != NULL)
2197			continue;
2198		if (devnull < 0) {
2199			save = td->td_retval[0];
2200			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2201			    O_RDWR, 0);
2202			devnull = td->td_retval[0];
2203			td->td_retval[0] = save;
2204			if (error)
2205				break;
2206			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2207		} else {
2208			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2209			if (error != 0)
2210				break;
2211		}
2212	}
2213	return (error);
2214}
2215
/*
 * Internal form of close.  Decrement reference count on file structure.
 * Note: td may be NULL when closing a file that was being passed in a
 * message.
 *
 * For vnode-backed files closed on behalf of a thread, POSIX advisory
 * locks owned by the process leader are released first, and, when the
 * descriptor table is shared between several leaders (p_fdtol is set),
 * the locks owned by each of the other leaders as well.
 *
 * Returns the result of fdrop().
 *
 * XXXRW: Giant is not required for the caller, but often will be held; this
 * makes it moderately likely the Giant will be recursed in the VFS case.
 */
int
closef(struct file *fp, struct thread *td)
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;
	struct filedesc *fdp;

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor, and the thread pointer
	 * will be NULL.  Callers should be careful only to pass a
	 * NULL thread pointer when there really is no owning
	 * context that might have locks, or the locks will be
	 * leaked.
	 */
	if (fp->f_type == DTYPE_VNODE && td != NULL) {
		vp = fp->f_vnode;
		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
			/* Unlock the whole file (start 0, length 0). */
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
			    F_UNLCK, &lf, F_POSIX);
		}
		fdtol = td->td_proc->p_fdtol;
		if (fdtol != NULL) {
			/*
			 * Handle special case where file descriptor table is
			 * shared between multiple process leaders.
			 */
			fdp = td->td_proc->p_fd;
			FILEDESC_XLOCK(fdp);
			/* Walk the circular list, skipping our own entry. */
			for (fdtol = fdtol->fdl_next;
			     fdtol != td->td_proc->p_fdtol;
			     fdtol = fdtol->fdl_next) {
				if ((fdtol->fdl_leader->p_flag &
				     P_ADVLOCK) == 0)
					continue;
				/*
				 * Pin this entry while the lock is dropped;
				 * fdescfree() sleeps while fdl_holdcount is
				 * non-zero.
				 */
				fdtol->fdl_holdcount++;
				FILEDESC_XUNLOCK(fdp);
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = fp->f_vnode;
				(void) VOP_ADVLOCK(vp,
				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
				    F_POSIX);
				FILEDESC_XLOCK(fdp);
				fdtol->fdl_holdcount--;
				/* Wake a waiter once the last hold is gone. */
				if (fdtol->fdl_holdcount == 0 &&
				    fdtol->fdl_wakeup != 0) {
					fdtol->fdl_wakeup = 0;
					wakeup(fdtol);
				}
			}
			FILEDESC_XUNLOCK(fdp);
		}
	}
	return (fdrop(fp, td));
}
2291
2292/*
2293 * Initialize the file pointer with the specified properties.
2294 *
2295 * The ops are set with release semantics to be certain that the flags, type,
2296 * and data are visible when ops is.  This is to prevent ops methods from being
2297 * called with bad data.
2298 */
2299void
2300finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2301{
2302	fp->f_data = data;
2303	fp->f_flag = flag;
2304	fp->f_type = type;
2305	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2306}
2307
/*
 * Look up descriptor 'fd' in 'fdp' without taking the filedesc lock and
 * return the file with an extra reference.
 *
 * fdp          descriptor table to search
 * fd           descriptor number
 * needrightsp  rights the caller requires; only examined when the kernel
 *              is built with CAPABILITIES, and may be NULL to skip the
 *              check
 * needfcntl    fcntl command checked via cap_fcntl_check_fde() when
 *              CAP_FCNTL is among the required rights (CAPABILITIES only)
 * fpp          receives the referenced file on success
 * haverightsp  if not NULL, receives the rights of the descriptor (all
 *              rights when built without CAPABILITIES)
 *
 * Returns 0 on success, EBADF when 'fd' is out of range or the slot is
 * empty, or the error reported by the capability checks.  On success the
 * caller owns a reference and must fdrop() it.
 */
int
fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
{
#ifdef CAPABILITIES
	struct filedescent fde;
#endif
	struct fdescenttbl *fdt;
	struct file *fp;
	u_int count;
#ifdef CAPABILITIES
	seq_t seq;
	cap_rights_t haverights;
	int error;
#endif

	fdt = fdp->fd_files;
	if (fd < 0 || fd >= fdt->fdt_nfiles)
		return (EBADF);
	/*
	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
	 * never raising a refcount above 0.  To accomplish this we have
	 * to use a cmpset loop rather than an atomic_add.  The descriptor
	 * must be re-verified once we acquire a reference to be certain
	 * that the identity is still correct and we did not lose a race
	 * due to preemption.
	 */
	for (;;) {
#ifdef CAPABILITIES
		/*
		 * Take a snapshot of the whole entry and retry until the
		 * sequence counter shows it was not modified meanwhile.
		 */
		seq = seq_read(fd_seq(fdt, fd));
		fde = fdt->fdt_ofiles[fd];
		if (!seq_consistent(fd_seq(fdt, fd), seq)) {
			cpu_spinwait();
			continue;
		}
		fp = fde.fde_file;
#else
		fp = fdt->fdt_ofiles[fd].fde_file;
#endif
		if (fp == NULL)
			return (EBADF);
#ifdef CAPABILITIES
		/* The rights are checked against the snapshot, not the live entry. */
		haverights = *cap_rights_fde(&fde);
		if (needrightsp != NULL) {
			error = cap_check(&haverights, needrightsp);
			if (error != 0)
				return (error);
			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
				error = cap_fcntl_check_fde(&fde, needfcntl);
				if (error != 0)
					return (error);
			}
		}
#endif
	retry:
		count = fp->f_count;
		if (count == 0) {
			/*
			 * The file is on its way out; reload the table
			 * pointer and look the descriptor up again.
			 */
			fdt = fdp->fd_files;
			continue;
		}
		/*
		 * Use an acquire barrier to force re-reading of fdt so it is
		 * refreshed for verification.
		 */
		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0)
			goto retry;
		fdt = fdp->fd_files;
		/* Verify that the slot still refers to what we referenced. */
#ifdef	CAPABILITIES
		if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
#else
		if (fp == fdt->fdt_ofiles[fd].fde_file)
#endif
			break;
		/* Lost a race: give the reference back and start over. */
		fdrop(fp, curthread);
	}
	*fpp = fp;
	if (haverightsp != NULL) {
#ifdef CAPABILITIES
		*haverightsp = haverights;
#else
		CAP_ALL(haverightsp);
#endif
	}
	return (0);
}
2393
2394/*
2395 * Extract the file pointer associated with the specified descriptor for the
2396 * current user process.
2397 *
2398 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2399 * returned.
2400 *
2401 * File's rights will be checked against the capability rights mask.
2402 *
2403 * If an error occured the non-zero error is returned and *fpp is set to
2404 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2405 * responsible for fdrop().
2406 */
2407static __inline int
2408_fget(struct thread *td, int fd, struct file **fpp, int flags,
2409    cap_rights_t *needrightsp, u_char *maxprotp)
2410{
2411	struct filedesc *fdp;
2412	struct file *fp;
2413	cap_rights_t haverights, needrights;
2414	int error;
2415
2416	*fpp = NULL;
2417	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2418		return (EBADF);
2419	if (needrightsp != NULL)
2420		needrights = *needrightsp;
2421	else
2422		cap_rights_init(&needrights);
2423	if (maxprotp != NULL)
2424		cap_rights_set(&needrights, CAP_MMAP);
2425	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
2426	if (error != 0)
2427		return (error);
2428	if (fp->f_ops == &badfileops) {
2429		fdrop(fp, td);
2430		return (EBADF);
2431	}
2432
2433#ifdef CAPABILITIES
2434	/*
2435	 * If requested, convert capability rights to access flags.
2436	 */
2437	if (maxprotp != NULL)
2438		*maxprotp = cap_rights_to_vmprot(&haverights);
2439#else /* !CAPABILITIES */
2440	if (maxprotp != NULL)
2441		*maxprotp = VM_PROT_ALL;
2442#endif /* CAPABILITIES */
2443
2444	/*
2445	 * FREAD and FWRITE failure return EBADF as per POSIX.
2446	 */
2447	error = 0;
2448	switch (flags) {
2449	case FREAD:
2450	case FWRITE:
2451		if ((fp->f_flag & flags) == 0)
2452			error = EBADF;
2453		break;
2454	case FEXEC:
2455	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
2456		    ((fp->f_flag & FWRITE) != 0))
2457			error = EBADF;
2458		break;
2459	case 0:
2460		break;
2461	default:
2462		KASSERT(0, ("wrong flags"));
2463	}
2464
2465	if (error != 0) {
2466		fdrop(fp, td);
2467		return (error);
2468	}
2469
2470	*fpp = fp;
2471	return (0);
2472}
2473
2474int
2475fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2476{
2477
2478	return(_fget(td, fd, fpp, 0, rightsp, NULL));
2479}
2480
2481int
2482fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
2483    struct file **fpp)
2484{
2485
2486	return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
2487}
2488
2489int
2490fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2491{
2492
2493	return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
2494}
2495
2496int
2497fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2498{
2499
2500	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
2501}
2502
2503/*
2504 * Like fget() but loads the underlying vnode, or returns an error if the
2505 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2506 * never have VM objects.  The returned vnode will be vref()'d.
2507 *
2508 * XXX: what about the unused flags ?
2509 */
2510static __inline int
2511_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
2512    struct vnode **vpp)
2513{
2514	struct file *fp;
2515	int error;
2516
2517	*vpp = NULL;
2518	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
2519	if (error != 0)
2520		return (error);
2521	if (fp->f_vnode == NULL) {
2522		error = EINVAL;
2523	} else {
2524		*vpp = fp->f_vnode;
2525		vref(*vpp);
2526	}
2527	fdrop(fp, td);
2528
2529	return (error);
2530}
2531
2532int
2533fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2534{
2535
2536	return (_fgetvp(td, fd, 0, rightsp, vpp));
2537}
2538
2539int
2540fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
2541    struct filecaps *havecaps, struct vnode **vpp)
2542{
2543	struct filedesc *fdp;
2544	struct file *fp;
2545#ifdef CAPABILITIES
2546	int error;
2547#endif
2548
2549	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2550		return (EBADF);
2551
2552	fp = fget_locked(fdp, fd);
2553	if (fp == NULL || fp->f_ops == &badfileops)
2554		return (EBADF);
2555
2556#ifdef CAPABILITIES
2557	if (needrightsp != NULL) {
2558		error = cap_check(cap_rights(fdp, fd), needrightsp);
2559		if (error != 0)
2560			return (error);
2561	}
2562#endif
2563
2564	if (fp->f_vnode == NULL)
2565		return (EINVAL);
2566
2567	*vpp = fp->f_vnode;
2568	vref(*vpp);
2569	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
2570
2571	return (0);
2572}
2573
2574int
2575fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2576{
2577
2578	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
2579}
2580
2581int
2582fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2583{
2584
2585	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
2586}
2587
#ifdef notyet
/*
 * Return the referenced vnode for 'fd'; the file must be open for writing.
 * Only compiled when 'notyet' is defined.
 */
int
fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
    struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, FWRITE, rightsp, vpp);
	return (error);
}
#endif
2597
2598/*
2599 * Like fget() but loads the underlying socket, or returns an error if the
2600 * descriptor does not represent a socket.
2601 *
2602 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2603 * in the future.
2604 *
2605 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2606 * on their file descriptor reference to prevent the socket from being free'd
2607 * during use.
2608 */
2609int
2610fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
2611    u_int *fflagp)
2612{
2613	struct file *fp;
2614	int error;
2615
2616	*spp = NULL;
2617	if (fflagp != NULL)
2618		*fflagp = 0;
2619	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
2620		return (error);
2621	if (fp->f_type != DTYPE_SOCKET) {
2622		error = ENOTSOCK;
2623	} else {
2624		*spp = fp->f_data;
2625		if (fflagp)
2626			*fflagp = fp->f_flag;
2627		SOCK_LOCK(*spp);
2628		soref(*spp);
2629		SOCK_UNLOCK(*spp);
2630	}
2631	fdrop(fp, td);
2632
2633	return (error);
2634}
2635
2636/*
2637 * Drop the reference count on the socket and XXX release the SX lock in the
2638 * future.  The last reference closes the socket.
2639 *
2640 * Note: fputsock() is deprecated, see comment for fgetsock().
2641 */
2642void
2643fputsock(struct socket *so)
2644{
2645
2646	ACCEPT_LOCK();
2647	SOCK_LOCK(so);
2648	CURVNET_SET(so->so_vnet);
2649	sorele(so);
2650	CURVNET_RESTORE();
2651}
2652
2653/*
2654 * Handle the last reference to a file being closed.
2655 */
2656int
2657_fdrop(struct file *fp, struct thread *td)
2658{
2659	int error;
2660
2661	error = 0;
2662	if (fp->f_count != 0)
2663		panic("fdrop: count %d", fp->f_count);
2664	if (fp->f_ops != &badfileops)
2665		error = fo_close(fp, td);
2666	atomic_subtract_int(&openfiles, 1);
2667	crfree(fp->f_cred);
2668	free(fp->f_advice, M_FADVISE);
2669	uma_zfree(file_zone, fp);
2670
2671	return (error);
2672}
2673
2674/*
2675 * Apply an advisory lock on a file descriptor.
2676 *
2677 * Just attempt to get a record lock of the requested type on the entire file
2678 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2679 */
2680#ifndef _SYS_SYSPROTO_H_
2681struct flock_args {
2682	int	fd;
2683	int	how;
2684};
2685#endif
2686/* ARGSUSED */
2687int
2688sys_flock(struct thread *td, struct flock_args *uap)
2689{
2690	struct file *fp;
2691	struct vnode *vp;
2692	struct flock lf;
2693	cap_rights_t rights;
2694	int error;
2695
2696	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
2697	if (error != 0)
2698		return (error);
2699	if (fp->f_type != DTYPE_VNODE) {
2700		fdrop(fp, td);
2701		return (EOPNOTSUPP);
2702	}
2703
2704	vp = fp->f_vnode;
2705	lf.l_whence = SEEK_SET;
2706	lf.l_start = 0;
2707	lf.l_len = 0;
2708	if (uap->how & LOCK_UN) {
2709		lf.l_type = F_UNLCK;
2710		atomic_clear_int(&fp->f_flag, FHASLOCK);
2711		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2712		goto done2;
2713	}
2714	if (uap->how & LOCK_EX)
2715		lf.l_type = F_WRLCK;
2716	else if (uap->how & LOCK_SH)
2717		lf.l_type = F_RDLCK;
2718	else {
2719		error = EBADF;
2720		goto done2;
2721	}
2722	atomic_set_int(&fp->f_flag, FHASLOCK);
2723	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2724	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2725done2:
2726	fdrop(fp, td);
2727	return (error);
2728}
2729/*
2730 * Duplicate the specified descriptor to a free descriptor.
2731 */
2732int
2733dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
2734    int openerror, int *indxp)
2735{
2736	struct filedescent *newfde, *oldfde;
2737	struct file *fp;
2738	int error, indx;
2739
2740	KASSERT(openerror == ENODEV || openerror == ENXIO,
2741	    ("unexpected error %d in %s", openerror, __func__));
2742
2743	/*
2744	 * If the to-be-dup'd fd number is greater than the allowed number
2745	 * of file descriptors, or the fd to be dup'd has already been
2746	 * closed, then reject.
2747	 */
2748	FILEDESC_XLOCK(fdp);
2749	if ((fp = fget_locked(fdp, dfd)) == NULL) {
2750		FILEDESC_XUNLOCK(fdp);
2751		return (EBADF);
2752	}
2753
2754	error = fdalloc(td, 0, &indx);
2755	if (error != 0) {
2756		FILEDESC_XUNLOCK(fdp);
2757		return (error);
2758	}
2759
2760	/*
2761	 * There are two cases of interest here.
2762	 *
2763	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2764	 *
2765	 * For ENXIO steal away the file structure from (dfd) and store it in
2766	 * (indx).  (dfd) is effectively closed by this operation.
2767	 */
2768	switch (openerror) {
2769	case ENODEV:
2770		/*
2771		 * Check that the mode the file is being opened for is a
2772		 * subset of the mode of the existing descriptor.
2773		 */
2774		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
2775			fdunused(fdp, indx);
2776			FILEDESC_XUNLOCK(fdp);
2777			return (EACCES);
2778		}
2779		fhold(fp);
2780		newfde = &fdp->fd_ofiles[indx];
2781		oldfde = &fdp->fd_ofiles[dfd];
2782#ifdef CAPABILITIES
2783		seq_write_begin(&newfde->fde_seq);
2784#endif
2785		memcpy(newfde, oldfde, fde_change_size);
2786		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
2787#ifdef CAPABILITIES
2788		seq_write_end(&newfde->fde_seq);
2789#endif
2790		break;
2791	case ENXIO:
2792		/*
2793		 * Steal away the file pointer from dfd and stuff it into indx.
2794		 */
2795		newfde = &fdp->fd_ofiles[indx];
2796		oldfde = &fdp->fd_ofiles[dfd];
2797#ifdef CAPABILITIES
2798		seq_write_begin(&newfde->fde_seq);
2799#endif
2800		memcpy(newfde, oldfde, fde_change_size);
2801		bzero(oldfde, fde_change_size);
2802		fdunused(fdp, dfd);
2803#ifdef CAPABILITIES
2804		seq_write_end(&newfde->fde_seq);
2805#endif
2806		break;
2807	}
2808	FILEDESC_XUNLOCK(fdp);
2809	*indxp = indx;
2810	return (0);
2811}
2812
2813/*
2814 * Scan all active processes and prisons to see if any of them have a current
2815 * or root directory of `olddp'. If so, replace them with the new mount point.
2816 */
2817void
2818mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2819{
2820	struct filedesc *fdp;
2821	struct prison *pr;
2822	struct proc *p;
2823	int nrele;
2824
2825	if (vrefcnt(olddp) == 1)
2826		return;
2827	nrele = 0;
2828	sx_slock(&allproc_lock);
2829	FOREACH_PROC_IN_SYSTEM(p) {
2830		fdp = fdhold(p);
2831		if (fdp == NULL)
2832			continue;
2833		FILEDESC_XLOCK(fdp);
2834		if (fdp->fd_cdir == olddp) {
2835			vref(newdp);
2836			fdp->fd_cdir = newdp;
2837			nrele++;
2838		}
2839		if (fdp->fd_rdir == olddp) {
2840			vref(newdp);
2841			fdp->fd_rdir = newdp;
2842			nrele++;
2843		}
2844		if (fdp->fd_jdir == olddp) {
2845			vref(newdp);
2846			fdp->fd_jdir = newdp;
2847			nrele++;
2848		}
2849		FILEDESC_XUNLOCK(fdp);
2850		fddrop(fdp);
2851	}
2852	sx_sunlock(&allproc_lock);
2853	if (rootvnode == olddp) {
2854		vref(newdp);
2855		rootvnode = newdp;
2856		nrele++;
2857	}
2858	mtx_lock(&prison0.pr_mtx);
2859	if (prison0.pr_root == olddp) {
2860		vref(newdp);
2861		prison0.pr_root = newdp;
2862		nrele++;
2863	}
2864	mtx_unlock(&prison0.pr_mtx);
2865	sx_slock(&allprison_lock);
2866	TAILQ_FOREACH(pr, &allprison, pr_list) {
2867		mtx_lock(&pr->pr_mtx);
2868		if (pr->pr_root == olddp) {
2869			vref(newdp);
2870			pr->pr_root = newdp;
2871			nrele++;
2872		}
2873		mtx_unlock(&pr->pr_mtx);
2874	}
2875	sx_sunlock(&allprison_lock);
2876	while (nrele--)
2877		vrele(olddp);
2878}
2879
2880struct filedesc_to_leader *
2881filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2882{
2883	struct filedesc_to_leader *fdtol;
2884
2885	fdtol = malloc(sizeof(struct filedesc_to_leader),
2886	       M_FILEDESC_TO_LEADER,
2887	       M_WAITOK);
2888	fdtol->fdl_refcount = 1;
2889	fdtol->fdl_holdcount = 0;
2890	fdtol->fdl_wakeup = 0;
2891	fdtol->fdl_leader = leader;
2892	if (old != NULL) {
2893		FILEDESC_XLOCK(fdp);
2894		fdtol->fdl_next = old->fdl_next;
2895		fdtol->fdl_prev = old;
2896		old->fdl_next = fdtol;
2897		fdtol->fdl_next->fdl_prev = fdtol;
2898		FILEDESC_XUNLOCK(fdp);
2899	} else {
2900		fdtol->fdl_next = fdtol;
2901		fdtol->fdl_prev = fdtol;
2902	}
2903	return (fdtol);
2904}
2905
2906/*
2907 * Get file structures globally.
2908 */
2909static int
2910sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2911{
2912	struct xfile xf;
2913	struct filedesc *fdp;
2914	struct file *fp;
2915	struct proc *p;
2916	int error, n;
2917
2918	error = sysctl_wire_old_buffer(req, 0);
2919	if (error != 0)
2920		return (error);
2921	if (req->oldptr == NULL) {
2922		n = 0;
2923		sx_slock(&allproc_lock);
2924		FOREACH_PROC_IN_SYSTEM(p) {
2925			if (p->p_state == PRS_NEW)
2926				continue;
2927			fdp = fdhold(p);
2928			if (fdp == NULL)
2929				continue;
2930			/* overestimates sparse tables. */
2931			if (fdp->fd_lastfile > 0)
2932				n += fdp->fd_lastfile;
2933			fddrop(fdp);
2934		}
2935		sx_sunlock(&allproc_lock);
2936		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2937	}
2938	error = 0;
2939	bzero(&xf, sizeof(xf));
2940	xf.xf_size = sizeof(xf);
2941	sx_slock(&allproc_lock);
2942	FOREACH_PROC_IN_SYSTEM(p) {
2943		PROC_LOCK(p);
2944		if (p->p_state == PRS_NEW) {
2945			PROC_UNLOCK(p);
2946			continue;
2947		}
2948		if (p_cansee(req->td, p) != 0) {
2949			PROC_UNLOCK(p);
2950			continue;
2951		}
2952		xf.xf_pid = p->p_pid;
2953		xf.xf_uid = p->p_ucred->cr_uid;
2954		PROC_UNLOCK(p);
2955		fdp = fdhold(p);
2956		if (fdp == NULL)
2957			continue;
2958		FILEDESC_SLOCK(fdp);
2959		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
2960			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
2961				continue;
2962			xf.xf_fd = n;
2963			xf.xf_file = fp;
2964			xf.xf_data = fp->f_data;
2965			xf.xf_vnode = fp->f_vnode;
2966			xf.xf_type = fp->f_type;
2967			xf.xf_count = fp->f_count;
2968			xf.xf_msgcount = 0;
2969			xf.xf_offset = foffset_get(fp);
2970			xf.xf_flag = fp->f_flag;
2971			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2972			if (error)
2973				break;
2974		}
2975		FILEDESC_SUNLOCK(fdp);
2976		fddrop(fdp);
2977		if (error)
2978			break;
2979	}
2980	sx_sunlock(&allproc_lock);
2981	return (error);
2982}
2983
2984SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
2985    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2986
/*
 * When KINFO_FILE_SIZE is defined, fail the build if struct kinfo_file does
 * not have exactly that size.
 */
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
2990
2991static int
2992xlate_fflags(int fflags)
2993{
2994	static const struct {
2995		int	fflag;
2996		int	kf_fflag;
2997	} fflags_table[] = {
2998		{ FAPPEND, KF_FLAG_APPEND },
2999		{ FASYNC, KF_FLAG_ASYNC },
3000		{ FFSYNC, KF_FLAG_FSYNC },
3001		{ FHASLOCK, KF_FLAG_HASLOCK },
3002		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3003		{ FREAD, KF_FLAG_READ },
3004		{ FWRITE, KF_FLAG_WRITE },
3005		{ O_CREAT, KF_FLAG_CREAT },
3006		{ O_DIRECT, KF_FLAG_DIRECT },
3007		{ O_EXCL, KF_FLAG_EXCL },
3008		{ O_EXEC, KF_FLAG_EXEC },
3009		{ O_EXLOCK, KF_FLAG_EXLOCK },
3010		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3011		{ O_SHLOCK, KF_FLAG_SHLOCK },
3012		{ O_TRUNC, KF_FLAG_TRUNC }
3013	};
3014	unsigned int i;
3015	int kflags;
3016
3017	kflags = 0;
3018	for (i = 0; i < nitems(fflags_table); i++)
3019		if (fflags & fflags_table[i].fflag)
3020			kflags |=  fflags_table[i].kf_fflag;
3021	return (kflags);
3022}
3023
3024/* Trim unused data from kf_path by truncating the structure size. */
3025static void
3026pack_kinfo(struct kinfo_file *kif)
3027{
3028
3029	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3030	    strlen(kif->kf_path) + 1;
3031	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3032}
3033
3034static void
3035export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
3036    struct kinfo_file *kif, struct filedesc *fdp)
3037{
3038	int error;
3039
3040	bzero(kif, sizeof(*kif));
3041
3042	/* Set a default type to allow for empty fill_kinfo() methods. */
3043	kif->kf_type = KF_TYPE_UNKNOWN;
3044	kif->kf_flags = xlate_fflags(fp->f_flag);
3045	if (rightsp != NULL)
3046		kif->kf_cap_rights = *rightsp;
3047	else
3048		cap_rights_init(&kif->kf_cap_rights);
3049	kif->kf_fd = fd;
3050	kif->kf_ref_count = fp->f_count;
3051	kif->kf_offset = foffset_get(fp);
3052
3053	/*
3054	 * This may drop the filedesc lock, so the 'fp' cannot be
3055	 * accessed after this call.
3056	 */
3057	error = fo_fill_kinfo(fp, kif, fdp);
3058	if (error == 0)
3059		kif->kf_status |= KF_ATTR_VALID;
3060	pack_kinfo(kif);
3061}
3062
3063static void
3064export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
3065    struct kinfo_file *kif)
3066{
3067	int error;
3068
3069	bzero(kif, sizeof(*kif));
3070
3071	kif->kf_type = KF_TYPE_VNODE;
3072	error = vn_fill_kinfo_vnode(vp, kif);
3073	if (error == 0)
3074		kif->kf_status |= KF_ATTR_VALID;
3075	kif->kf_flags = xlate_fflags(fflags);
3076	kif->kf_fd = fd;
3077	kif->kf_ref_count = -1;
3078	kif->kf_offset = -1;
3079	pack_kinfo(kif);
3080	vrele(vp);
3081}
3082
3083struct export_fd_buf {
3084	struct filedesc		*fdp;
3085	struct sbuf 		*sb;
3086	ssize_t			remainder;
3087	struct kinfo_file	kif;
3088};
3089
3090static int
3091export_kinfo_to_sb(struct export_fd_buf *efbuf)
3092{
3093	struct kinfo_file *kif;
3094
3095	kif = &efbuf->kif;
3096	if (efbuf->remainder != -1) {
3097		if (efbuf->remainder < kif->kf_structsize) {
3098			/* Terminate export. */
3099			efbuf->remainder = 0;
3100			return (0);
3101		}
3102		efbuf->remainder -= kif->kf_structsize;
3103	}
3104	return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
3105}
3106
3107static int
3108export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
3109    struct export_fd_buf *efbuf)
3110{
3111	int error;
3112
3113	if (efbuf->remainder == 0)
3114		return (0);
3115	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp);
3116	FILEDESC_SUNLOCK(efbuf->fdp);
3117	error = export_kinfo_to_sb(efbuf);
3118	FILEDESC_SLOCK(efbuf->fdp);
3119	return (error);
3120}
3121
3122static int
3123export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
3124    struct export_fd_buf *efbuf)
3125{
3126	int error;
3127
3128	if (efbuf->remainder == 0)
3129		return (0);
3130	if (efbuf->fdp != NULL)
3131		FILEDESC_SUNLOCK(efbuf->fdp);
3132	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif);
3133	error = export_kinfo_to_sb(efbuf);
3134	if (efbuf->fdp != NULL)
3135		FILEDESC_SLOCK(efbuf->fdp);
3136	return (error);
3137}
3138
3139/*
3140 * Store a process file descriptor information to sbuf.
3141 *
3142 * Takes a locked proc as argument, and returns with the proc unlocked.
3143 */
3144int
3145kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
3146{
3147	struct file *fp;
3148	struct filedesc *fdp;
3149	struct export_fd_buf *efbuf;
3150	struct vnode *cttyvp, *textvp, *tracevp;
3151	int error, i;
3152	cap_rights_t rights;
3153
3154	PROC_LOCK_ASSERT(p, MA_OWNED);
3155
3156	/* ktrace vnode */
3157	tracevp = p->p_tracevp;
3158	if (tracevp != NULL)
3159		vref(tracevp);
3160	/* text vnode */
3161	textvp = p->p_textvp;
3162	if (textvp != NULL)
3163		vref(textvp);
3164	/* Controlling tty. */
3165	cttyvp = NULL;
3166	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
3167		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
3168		if (cttyvp != NULL)
3169			vref(cttyvp);
3170	}
3171	fdp = fdhold(p);
3172	PROC_UNLOCK(p);
3173	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
3174	efbuf->fdp = NULL;
3175	efbuf->sb = sb;
3176	efbuf->remainder = maxlen;
3177	if (tracevp != NULL)
3178		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
3179		    efbuf);
3180	if (textvp != NULL)
3181		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
3182	if (cttyvp != NULL)
3183		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
3184		    efbuf);
3185	error = 0;
3186	if (fdp == NULL)
3187		goto fail;
3188	efbuf->fdp = fdp;
3189	FILEDESC_SLOCK(fdp);
3190	/* working directory */
3191	if (fdp->fd_cdir != NULL) {
3192		vref(fdp->fd_cdir);
3193		export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
3194	}
3195	/* root directory */
3196	if (fdp->fd_rdir != NULL) {
3197		vref(fdp->fd_rdir);
3198		export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
3199	}
3200	/* jail directory */
3201	if (fdp->fd_jdir != NULL) {
3202		vref(fdp->fd_jdir);
3203		export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
3204	}
3205	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3206		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3207			continue;
3208#ifdef CAPABILITIES
3209		rights = *cap_rights(fdp, i);
3210#else /* !CAPABILITIES */
3211		cap_rights_init(&rights);
3212#endif
3213		/*
3214		 * Create sysctl entry.  It is OK to drop the filedesc
3215		 * lock inside of export_file_to_sb() as we will
3216		 * re-validate and re-evaluate its properties when the
3217		 * loop continues.
3218		 */
3219		error = export_file_to_sb(fp, i, &rights, efbuf);
3220		if (error != 0 || efbuf->remainder == 0)
3221			break;
3222	}
3223	FILEDESC_SUNLOCK(fdp);
3224	fddrop(fdp);
3225fail:
3226	free(efbuf, M_TEMP);
3227	return (error);
3228}
3229
3230#define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
3231
3232/*
3233 * Get per-process file descriptors for use by procstat(1), et al.
3234 */
3235static int
3236sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
3237{
3238	struct sbuf sb;
3239	struct proc *p;
3240	ssize_t maxlen;
3241	int error, error2, *name;
3242
3243	name = (int *)arg1;
3244
3245	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
3246	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3247	if (error != 0) {
3248		sbuf_delete(&sb);
3249		return (error);
3250	}
3251	maxlen = req->oldptr != NULL ? req->oldlen : -1;
3252	error = kern_proc_filedesc_out(p, &sb, maxlen);
3253	error2 = sbuf_finish(&sb);
3254	sbuf_delete(&sb);
3255	return (error != 0 ? error : error2);
3256}
3257
/*
 * When KINFO_OFILE_SIZE is defined, fail the build if struct kinfo_ofile
 * does not have exactly that size.
 */
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif
3261
3262#ifdef COMPAT_FREEBSD7
3263static void
3264kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
3265{
3266
3267	okif->kf_structsize = sizeof(*okif);
3268	okif->kf_type = kif->kf_type;
3269	okif->kf_fd = kif->kf_fd;
3270	okif->kf_ref_count = kif->kf_ref_count;
3271	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
3272	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
3273	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
3274	okif->kf_offset = kif->kf_offset;
3275	okif->kf_vnode_type = kif->kf_vnode_type;
3276	okif->kf_sock_domain = kif->kf_sock_domain;
3277	okif->kf_sock_type = kif->kf_sock_type;
3278	okif->kf_sock_protocol = kif->kf_sock_protocol;
3279	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
3280	okif->kf_sa_local = kif->kf_sa_local;
3281	okif->kf_sa_peer = kif->kf_sa_peer;
3282}
3283
3284static int
3285export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
3286    struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
3287{
3288	int error;
3289
3290	vref(vp);
3291	FILEDESC_SUNLOCK(fdp);
3292	export_vnode_to_kinfo(vp, type, 0, kif);
3293	kinfo_to_okinfo(kif, okif);
3294	error = SYSCTL_OUT(req, okif, sizeof(*okif));
3295	FILEDESC_SLOCK(fdp);
3296	return (error);
3297}
3298
3299/*
3300 * Get per-process file descriptors for use by procstat(1), et al.
3301 */
3302static int
3303sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
3304{
3305	struct kinfo_ofile *okif;
3306	struct kinfo_file *kif;
3307	struct filedesc *fdp;
3308	int error, i, *name;
3309	struct file *fp;
3310	struct proc *p;
3311
3312	name = (int *)arg1;
3313	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3314	if (error != 0)
3315		return (error);
3316	fdp = fdhold(p);
3317	PROC_UNLOCK(p);
3318	if (fdp == NULL)
3319		return (ENOENT);
3320	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3321	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
3322	FILEDESC_SLOCK(fdp);
3323	if (fdp->fd_cdir != NULL)
3324		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
3325		    okif, fdp, req);
3326	if (fdp->fd_rdir != NULL)
3327		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
3328		    okif, fdp, req);
3329	if (fdp->fd_jdir != NULL)
3330		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
3331		    okif, fdp, req);
3332	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3333		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3334			continue;
3335		export_file_to_kinfo(fp, i, NULL, kif, fdp);
3336		FILEDESC_SUNLOCK(fdp);
3337		kinfo_to_okinfo(kif, okif);
3338		error = SYSCTL_OUT(req, okif, sizeof(*okif));
3339		FILEDESC_SLOCK(fdp);
3340		if (error)
3341			break;
3342	}
3343	FILEDESC_SUNLOCK(fdp);
3344	fddrop(fdp);
3345	free(kif, M_TEMP);
3346	free(okif, M_TEMP);
3347	return (0);
3348}
3349
3350static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
3351    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
3352    "Process ofiledesc entries");
3353#endif	/* COMPAT_FREEBSD7 */
3354
3355int
3356vntype_to_kinfo(int vtype)
3357{
3358	struct {
3359		int	vtype;
3360		int	kf_vtype;
3361	} vtypes_table[] = {
3362		{ VBAD, KF_VTYPE_VBAD },
3363		{ VBLK, KF_VTYPE_VBLK },
3364		{ VCHR, KF_VTYPE_VCHR },
3365		{ VDIR, KF_VTYPE_VDIR },
3366		{ VFIFO, KF_VTYPE_VFIFO },
3367		{ VLNK, KF_VTYPE_VLNK },
3368		{ VNON, KF_VTYPE_VNON },
3369		{ VREG, KF_VTYPE_VREG },
3370		{ VSOCK, KF_VTYPE_VSOCK }
3371	};
3372	unsigned int i;
3373
3374	/*
3375	 * Perform vtype translation.
3376	 */
3377	for (i = 0; i < nitems(vtypes_table); i++)
3378		if (vtypes_table[i].vtype == vtype)
3379			return (vtypes_table[i].kf_vtype);
3380
3381	return (KF_VTYPE_UNKNOWN);
3382}
3383
3384static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
3385    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
3386    "Process filedesc entries");
3387
3388#ifdef DDB
3389/*
3390 * For the purposes of debugging, generate a human-readable string for the
3391 * file type.
3392 */
3393static const char *
3394file_type_to_name(short type)
3395{
3396
3397	switch (type) {
3398	case 0:
3399		return ("zero");
3400	case DTYPE_VNODE:
3401		return ("vnod");
3402	case DTYPE_SOCKET:
3403		return ("sock");
3404	case DTYPE_PIPE:
3405		return ("pipe");
3406	case DTYPE_FIFO:
3407		return ("fifo");
3408	case DTYPE_KQUEUE:
3409		return ("kque");
3410	case DTYPE_CRYPTO:
3411		return ("crpt");
3412	case DTYPE_MQUEUE:
3413		return ("mque");
3414	case DTYPE_SHM:
3415		return ("shm");
3416	case DTYPE_SEM:
3417		return ("ksem");
3418	default:
3419		return ("unkn");
3420	}
3421}
3422
3423/*
3424 * For the purposes of debugging, identify a process (if any, perhaps one of
3425 * many) that references the passed file in its file descriptor array. Return
3426 * NULL if none.
3427 */
3428static struct proc *
3429file_to_first_proc(struct file *fp)
3430{
3431	struct filedesc *fdp;
3432	struct proc *p;
3433	int n;
3434
3435	FOREACH_PROC_IN_SYSTEM(p) {
3436		if (p->p_state == PRS_NEW)
3437			continue;
3438		fdp = p->p_fd;
3439		if (fdp == NULL)
3440			continue;
3441		for (n = 0; n <= fdp->fd_lastfile; n++) {
3442			if (fp == fdp->fd_ofiles[n].fde_file)
3443				return (p);
3444		}
3445	}
3446	return (NULL);
3447}
3448
3449static void
3450db_print_file(struct file *fp, int header)
3451{
3452	struct proc *p;
3453
3454	if (header)
3455		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3456		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3457		    "MCount", "Vnode", "FPID", "FCmd");
3458	p = file_to_first_proc(fp);
3459	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3460	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3461	    0, fp->f_count, 0, fp->f_vnode,
3462	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3463}
3464
3465DB_SHOW_COMMAND(file, db_show_file)
3466{
3467	struct file *fp;
3468
3469	if (!have_addr) {
3470		db_printf("usage: show file <addr>\n");
3471		return;
3472	}
3473	fp = (struct file *)addr;
3474	db_print_file(fp, 1);
3475}
3476
3477DB_SHOW_COMMAND(files, db_show_files)
3478{
3479	struct filedesc *fdp;
3480	struct file *fp;
3481	struct proc *p;
3482	int header;
3483	int n;
3484
3485	header = 1;
3486	FOREACH_PROC_IN_SYSTEM(p) {
3487		if (p->p_state == PRS_NEW)
3488			continue;
3489		if ((fdp = p->p_fd) == NULL)
3490			continue;
3491		for (n = 0; n <= fdp->fd_lastfile; ++n) {
3492			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
3493				continue;
3494			db_print_file(fp, header);
3495			header = 0;
3496		}
3497	}
3498}
3499#endif
3500
3501SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3502    &maxfilesperproc, 0, "Maximum files allowed open per process");
3503
3504SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3505    &maxfiles, 0, "Maximum number of files");
3506
3507SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3508    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3509
3510/* ARGSUSED*/
3511static void
3512filelistinit(void *dummy)
3513{
3514
3515	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
3516	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3517	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
3518	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
3519}
3520SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3521
3522/*-------------------------------------------------------------------*/
3523
/*
 * badfileops read and write method: always fails with EBADF.
 */
static int
badfo_readwrite(struct file *fp, struct uio *uio,
    struct ucred *active_cred, int flags, struct thread *td)
{

	return (EBADF);
}
3531
/*
 * badfileops truncate method: always fails with EINVAL.
 */
static int
badfo_truncate(struct file *fp, off_t length,
    struct ucred *active_cred, struct thread *td)
{

	return (EINVAL);
}
3539
3540static int
3541badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
3542    struct thread *td)
3543{
3544
3545	return (EBADF);
3546}
3547
/*
 * badfileops poll method: always returns 0.
 */
static int
badfo_poll(struct file *fp, int events,
    struct ucred *active_cred, struct thread *td)
{

	return (0);
}
3555
/*
 * badfileops kqfilter method: always fails with EBADF.
 */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}
3562
/*
 * badfileops stat method: always fails with EBADF; 'sb' is not written.
 */
static int
badfo_stat(struct file *fp, struct stat *sb,
    struct ucred *active_cred, struct thread *td)
{

	return (EBADF);
}
3570
/*
 * Close stub for badfileops.  Performs no work and always reports EBADF.
 */
static int
badfo_close(struct file *fp,
    struct thread *td)
{

	return (EBADF);
}
3577
/*
 * Chmod stub for badfileops.  The requested mode is ignored; the call
 * always fails with EBADF.
 */
static int
badfo_chmod(struct file *fp, mode_t mode,
    struct ucred *active_cred, struct thread *td)
{

	return (EBADF);
}
3585
/*
 * Chown stub for badfileops.  The uid/gid pair is ignored; the call
 * always fails with EBADF.
 */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid,
    struct ucred *active_cred, struct thread *td)
{

	return (EBADF);
}
3593
/*
 * Sendfile stub for badfileops.  No data is transferred, *sent is not
 * written, and the call always fails with EBADF.
 */
static int
badfo_sendfile(struct file *fp, int sockfd,
    struct uio *hdr_uio, struct uio *trl_uio,
    off_t offset, size_t nbytes, off_t *sent,
    int flags, int kflags,
    struct sendfile_sync *sfs, struct thread *td)
{

	return (EBADF);
}
3602
/*
 * kinfo_file fill stub for badfileops.  Leaves *kif unmodified and
 * returns 0.
 */
static int
badfo_fill_kinfo(struct file *fp,
    struct kinfo_file *kif, struct filedesc *fdp)
{

	return (0);
}
3609
3610struct fileops badfileops = {
3611	.fo_read = badfo_readwrite,
3612	.fo_write = badfo_readwrite,
3613	.fo_truncate = badfo_truncate,
3614	.fo_ioctl = badfo_ioctl,
3615	.fo_poll = badfo_poll,
3616	.fo_kqfilter = badfo_kqfilter,
3617	.fo_stat = badfo_stat,
3618	.fo_close = badfo_close,
3619	.fo_chmod = badfo_chmod,
3620	.fo_chown = badfo_chown,
3621	.fo_sendfile = badfo_sendfile,
3622	.fo_fill_kinfo = badfo_fill_kinfo,
3623};
3624
/*
 * Generic "operation not supported" read/write handler.  Ignores all of
 * its arguments and always returns EOPNOTSUPP.  It has external linkage,
 * so it can be referenced from fileops tables outside this file.
 */
int
invfo_rdwr(struct file *fp, struct uio *uio,
    struct ucred *active_cred, int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}
3632
/*
 * Generic truncate handler for file types that cannot be truncated.
 * Ignores all of its arguments and always returns EINVAL.
 */
int
invfo_truncate(struct file *fp, off_t length,
    struct ucred *active_cred, struct thread *td)
{

	return (EINVAL);
}
3640
3641int
3642invfo_ioctl(struct file *fp, u_long com, void *data,
3643    struct ucred *active_cred, struct thread *td)
3644{
3645
3646	return (ENOTTY);
3647}
3648
/*
 * Generic poll handler for file types without poll support.  The result
 * is whatever poll_no_poll() yields for the requested event mask; fp,
 * active_cred and td are not consulted.
 */
int
invfo_poll(struct file *fp, int events,
    struct ucred *active_cred, struct thread *td)
{
	int revents;

	revents = poll_no_poll(events);
	return (revents);
}
3656
/*
 * Generic kqueue filter handler for file types that cannot be monitored
 * with kqueue.  The knote is left untouched; always returns EINVAL.
 */
int
invfo_kqfilter(struct file *fp,
    struct knote *kn)
{

	return (EINVAL);
}
3663
/*
 * Generic chmod handler for file types that have no mode to change.
 * Ignores all of its arguments and always returns EINVAL.
 */
int
invfo_chmod(struct file *fp, mode_t mode,
    struct ucred *active_cred, struct thread *td)
{

	return (EINVAL);
}
3671
/*
 * Generic chown handler for file types that have no ownership to change.
 * Ignores all of its arguments and always returns EINVAL.
 */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid,
    struct ucred *active_cred, struct thread *td)
{

	return (EINVAL);
}
3679
/*
 * Generic sendfile handler for file types that cannot act as a sendfile
 * source.  No data is transferred, *sent is not written, and the call
 * always returns EINVAL.
 */
int
invfo_sendfile(struct file *fp, int sockfd,
    struct uio *hdr_uio, struct uio *trl_uio,
    off_t offset, size_t nbytes, off_t *sent,
    int flags, int kflags,
    struct sendfile_sync *sfs, struct thread *td)
{

	return (EINVAL);
}
3688
3689/*-------------------------------------------------------------------*/
3690
3691/*
3692 * File Descriptor pseudo-device driver (/dev/fd/).
3693 *
3694 * Opening minor device N dup()s the file (if any) connected to file
3695 * descriptor N belonging to the calling process.  Note that this driver
3696 * consists of only the ``open()'' routine, because all subsequent
3697 * references to this file will be direct to the other driver.
3698 *
3699 * XXX: we could give this one a cloning event handler if necessary.
3700 */
3701
3702/* ARGSUSED */
3703static int
3704fdopen(struct cdev *dev, int mode, int type, struct thread *td)
3705{
3706
3707	/*
3708	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
3709	 * the file descriptor being sought for duplication. The error
3710	 * return ensures that the vnode for this device will be released
3711	 * by vn_open. Open will detect this special error and take the
3712	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
3713	 * will simply report the error.
3714	 */
3715	td->td_dupfd = dev2unit(dev);
3716	return (ENODEV);
3717}
3718
/*
 * Character device switch for the file descriptor pseudo-device.  Besides
 * the version and name, the only entry point supplied is d_open (fdopen).
 */
static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};
3724
3725static void
3726fildesc_drvinit(void *unused)
3727{
3728	struct cdev *dev;
3729
3730	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
3731	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
3732	make_dev_alias(dev, "stdin");
3733	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
3734	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
3735	make_dev_alias(dev, "stdout");
3736	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
3737	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
3738	make_dev_alias(dev, "stderr");
3739}
3740
3741SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
3742