kern_descrip.c revision 273956
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 273956 2014-11-02 01:13:11Z mjg $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_ddb.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47
48#include <sys/capsicum.h>
49#include <sys/conf.h>
50#include <sys/fcntl.h>
51#include <sys/file.h>
52#include <sys/filedesc.h>
53#include <sys/filio.h>
54#include <sys/jail.h>
55#include <sys/kernel.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/namei.h>
62#include <sys/selinfo.h>
63#include <sys/priv.h>
64#include <sys/proc.h>
65#include <sys/protosw.h>
66#include <sys/racct.h>
67#include <sys/resourcevar.h>
68#include <sys/sbuf.h>
69#include <sys/signalvar.h>
70#include <sys/socketvar.h>
71#include <sys/stat.h>
72#include <sys/sx.h>
73#include <sys/syscallsubr.h>
74#include <sys/sysctl.h>
75#include <sys/sysproto.h>
76#include <sys/unistd.h>
77#include <sys/user.h>
78#include <sys/vnode.h>
79#ifdef KTRACE
80#include <sys/ktrace.h>
81#endif
82
83#include <net/vnet.h>
84
85#include <security/audit/audit.h>
86
87#include <vm/uma.h>
88#include <vm/vm.h>
89
90#include <ddb/ddb.h>
91
92static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
93static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
94    "file desc to leader structures");
95static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
96MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
97
98MALLOC_DECLARE(M_FADVISE);
99
100static uma_zone_t file_zone;
101
102static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
103		    struct thread *td, int holdleaders);
104static int	do_dup(struct thread *td, int flags, int old, int new);
105static int	fd_first_free(struct filedesc *fdp, int low, int size);
106static int	fd_last_used(struct filedesc *fdp, int size);
107static void	fdgrowtable(struct filedesc *fdp, int nfd);
108static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
109static void	fdunused(struct filedesc *fdp, int fd);
110static void	fdused(struct filedesc *fdp, int fd);
111static int	getmaxfd(struct proc *p);
112
113/* Flags for do_dup() */
114#define	DUP_FIXED	0x1	/* Force fixed allocation. */
115#define	DUP_FCNTL	0x2	/* fcntl()-style errors. */
116#define	DUP_CLOEXEC	0x4	/* Atomically set FD_CLOEXEC. */
117
118/*
119 * Each process has:
120 *
121 * - An array of open file descriptors (fd_ofiles)
122 * - An array of file flags (fd_ofileflags)
123 * - A bitmap recording which descriptors are in use (fd_map)
124 *
125 * A process starts out with NDFILE descriptors.  The value of NDFILE has
126 * been selected based the historical limit of 20 open files, and an
127 * assumption that the majority of processes, especially short-lived
128 * processes like shells, will never need more.
129 *
130 * If this initial allocation is exhausted, a larger descriptor table and
131 * map are allocated dynamically, and the pointers in the process's struct
132 * filedesc are updated to point to those.  This is repeated every time
133 * the process runs out of file descriptors (provided it hasn't hit its
134 * resource limit).
135 *
136 * Since threads may hold references to individual descriptor table
137 * entries, the tables are never freed.  Instead, they are placed on a
138 * linked list and freed only when the struct filedesc is released.
139 */
140#define NDFILE		20
141#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
142#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
143#define NDSLOT(x)	((x) / NDENTRIES)
144#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
145#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
146
147/*
148 * SLIST entry used to keep track of ofiles which must be reclaimed when
149 * the process exits.
150 */
151struct freetable {
152	struct fdescenttbl *ft_table;
153	SLIST_ENTRY(freetable) ft_next;
154};
155
156/*
157 * Initial allocation: a filedesc structure + the head of SLIST used to
158 * keep track of old ofiles + enough space for NDFILE descriptors.
159 */
160
161struct fdescenttbl0 {
162	int	fdt_nfiles;
163	struct	filedescent fdt_ofiles[NDFILE];
164};
165
166struct filedesc0 {
167	struct filedesc fd_fd;
168	SLIST_HEAD(, freetable) fd_free;
169	struct	fdescenttbl0 fd_dfiles;
170	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
171};
172
173/*
174 * Descriptor management.
175 */
176volatile int openfiles;			/* actual number of open files */
177struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
178void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
179
180/* A mutex to protect the association between a proc and filedesc. */
181static struct mtx fdesc_mtx;
182
183/*
184 * If low >= size, just return low. Otherwise find the first zero bit in the
185 * given bitmap, starting at low and not exceeding size - 1. Return size if
186 * not found.
187 */
188static int
189fd_first_free(struct filedesc *fdp, int low, int size)
190{
191	NDSLOTTYPE *map = fdp->fd_map;
192	NDSLOTTYPE mask;
193	int off, maxoff;
194
195	if (low >= size)
196		return (low);
197
198	off = NDSLOT(low);
199	if (low % NDENTRIES) {
200		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
201		if ((mask &= ~map[off]) != 0UL)
202			return (off * NDENTRIES + ffsl(mask) - 1);
203		++off;
204	}
205	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
206		if (map[off] != ~0UL)
207			return (off * NDENTRIES + ffsl(~map[off]) - 1);
208	return (size);
209}
210
211/*
212 * Find the highest non-zero bit in the given bitmap, starting at 0 and
213 * not exceeding size - 1. Return -1 if not found.
214 */
215static int
216fd_last_used(struct filedesc *fdp, int size)
217{
218	NDSLOTTYPE *map = fdp->fd_map;
219	NDSLOTTYPE mask;
220	int off, minoff;
221
222	off = NDSLOT(size);
223	if (size % NDENTRIES) {
224		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
225		if ((mask &= map[off]) != 0)
226			return (off * NDENTRIES + flsl(mask) - 1);
227		--off;
228	}
229	for (minoff = NDSLOT(0); off >= minoff; --off)
230		if (map[off] != 0)
231			return (off * NDENTRIES + flsl(map[off]) - 1);
232	return (-1);
233}
234
#ifdef INVARIANTS
/*
 * Return non-zero if descriptor 'fd' is marked in-use in the fd bitmap.
 * Compiled only under INVARIANTS; backs the KASSERTs in fdused_init()
 * and fdunused().
 */
static int
fdisused(struct filedesc *fdp, int fd)
{

	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));

	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}
#endif
246
/*
 * Mark a file descriptor as used.
 *
 * Sets only the bitmap bit; callers that need fd_lastfile/fd_freefile
 * maintained use fdused() instead.  No lock assertion: this is also
 * used while initializing a not-yet-shared table.
 */
static void
fdused_init(struct filedesc *fdp, int fd)
{

	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));

	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
}
258
/*
 * Mark a file descriptor as used and update the allocation hints:
 * fd_lastfile tracks the highest fd in use, fd_freefile the lowest
 * free fd.  Requires the filedesc exclusive lock.
 */
static void
fdused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	fdused_init(fdp, fd);
	if (fd > fdp->fd_lastfile)
		fdp->fd_lastfile = fd;
	/* If we just consumed the lowest free slot, search for the next. */
	if (fd == fdp->fd_freefile)
		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
}
271
/*
 * Mark a file descriptor as unused.
 *
 * The entry's file pointer must already have been cleared; this only
 * updates the bitmap and the fd_freefile/fd_lastfile hints.  Requires
 * the filedesc exclusive lock.
 */
static void
fdunused(struct filedesc *fdp, int fd)
{

	FILEDESC_XLOCK_ASSERT(fdp);

	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
	    ("fd=%d is still in use", fd));

	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	/* If this was the highest fd, scan down for the new high mark. */
	if (fd == fdp->fd_lastfile)
		fdp->fd_lastfile = fd_last_used(fdp, fd);
}
291
/*
 * Free a file descriptor.
 *
 * Avoid some work if fdp is about to be destroyed: only the capability
 * state attached to the entry needs freeing here.
 */
static inline void
fdefree_last(struct filedescent *fde)
{

	filecaps_free(&fde->fde_caps);
}
303
/*
 * Release the table entry for descriptor 'fd': free its capabilities,
 * clear the entry, and return the slot to the free pool.  Under
 * CAPABILITIES the update is bracketed by a seq write section so that
 * lockless readers (fget_unlocked()) retry instead of seeing a torn
 * entry.
 */
static inline void
fdfree(struct filedesc *fdp, int fd)
{
	struct filedescent *fde;

	fde = &fdp->fd_ofiles[fd];
#ifdef CAPABILITIES
	seq_write_begin(&fde->fde_seq);
#endif
	fdefree_last(fde);
	/* Only the mutable prefix of the entry is cleared (fde_change_size). */
	bzero(fde, fde_change_size);
	fdunused(fdp, fd);
#ifdef CAPABILITIES
	seq_write_end(&fde->fde_seq);
#endif
}
320
321/*
322 * System calls on descriptors.
323 */
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
	int	dummy;
};
#endif
/* ARGSUSED */
/*
 * getdtablesize(2): report the per-process descriptor limit, which is
 * the RLIMIT_NOFILE soft limit clamped to maxfilesperproc and further
 * clamped to the RACCT_NOFILE limit if that is tighter.
 */
int
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
	struct proc *p = td->td_proc;
	uint64_t lim;

	PROC_LOCK(p);
	td->td_retval[0] =
	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
	PROC_UNLOCK(p);
	/* The racct limit may be tighter than the rlimit-derived value. */
	if (lim < td->td_retval[0])
		td->td_retval[0] = lim;
	return (0);
}
345
/*
 * Duplicate a file descriptor to a particular value.
 *
 * Note: keep in mind that a potential race condition exists when closing
 * descriptors from a shared descriptor table (via rfork).
 */
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
	u_int	from;
	u_int	to;
};
#endif
/* ARGSUSED */
int
sys_dup2(struct thread *td, struct dup2_args *uap)
{

	/* DUP_FIXED: place the copy exactly at 'to', closing any occupant. */
	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to));
}
365
/*
 * Duplicate a file descriptor onto the lowest available slot.
 */
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
	u_int	fd;
};
#endif
/* ARGSUSED */
int
sys_dup(struct thread *td, struct dup_args *uap)
{

	/* No flags: do_dup() allocates the new descriptor the usual way. */
	return (do_dup(td, 0, (int)uap->fd, 0));
}
381
/*
 * The file control system call.  Thin wrapper: all argument translation
 * happens in kern_fcntl_freebsd().
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{

	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
}
399
/*
 * FreeBSD ABI layer for fcntl(2): translate the userland 'arg' for the
 * locking commands (copying struct flock in/out, and converting the
 * historic __oflock layout for the F_O* compat commands) before and
 * after calling kern_fcntl().  For all other commands 'arg' is passed
 * through unchanged.
 */
int
kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
{
	struct flock fl;
	struct __oflock ofl;
	intptr_t arg1;
	int error;

	error = 0;
	switch (cmd) {
	case F_OGETLK:
	case F_OSETLK:
	case F_OSETLKW:
		/*
		 * Convert old flock structure to new.
		 */
		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
		fl.l_start = ofl.l_start;
		fl.l_len = ofl.l_len;
		fl.l_pid = ofl.l_pid;
		fl.l_type = ofl.l_type;
		fl.l_whence = ofl.l_whence;
		fl.l_sysid = 0;

		/* Map each compat command onto its modern equivalent. */
		switch (cmd) {
		case F_OGETLK:
		    cmd = F_GETLK;
		    break;
		case F_OSETLK:
		    cmd = F_SETLK;
		    break;
		case F_OSETLKW:
		    cmd = F_SETLKW;
		    break;
		}
		arg1 = (intptr_t)&fl;
		break;
        case F_GETLK:
        case F_SETLK:
        case F_SETLKW:
	case F_SETLK_REMOTE:
                error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
                arg1 = (intptr_t)&fl;
                break;
	default:
		arg1 = arg;
		break;
	}
	/* A copyin() failure from either branch above is reported here. */
	if (error)
		return (error);
	error = kern_fcntl(td, fd, cmd, arg1);
	if (error)
		return (error);
	/* Copy the lock description back out for the GETLK flavors. */
	if (cmd == F_OGETLK) {
		ofl.l_start = fl.l_start;
		ofl.l_len = fl.l_len;
		ofl.l_pid = fl.l_pid;
		ofl.l_type = fl.l_type;
		ofl.l_whence = fl.l_whence;
		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
	} else if (cmd == F_GETLK) {
		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
	}
	return (error);
}
465
466int
467kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
468{
469	struct filedesc *fdp;
470	struct flock *flp;
471	struct file *fp, *fp2;
472	struct filedescent *fde;
473	struct proc *p;
474	struct vnode *vp;
475	cap_rights_t rights;
476	int error, flg, tmp;
477	uint64_t bsize;
478	off_t foffset;
479
480	error = 0;
481	flg = F_POSIX;
482	p = td->td_proc;
483	fdp = p->p_fd;
484
485	switch (cmd) {
486	case F_DUPFD:
487		tmp = arg;
488		error = do_dup(td, DUP_FCNTL, fd, tmp);
489		break;
490
491	case F_DUPFD_CLOEXEC:
492		tmp = arg;
493		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp);
494		break;
495
496	case F_DUP2FD:
497		tmp = arg;
498		error = do_dup(td, DUP_FIXED, fd, tmp);
499		break;
500
501	case F_DUP2FD_CLOEXEC:
502		tmp = arg;
503		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp);
504		break;
505
506	case F_GETFD:
507		FILEDESC_SLOCK(fdp);
508		if (fget_locked(fdp, fd) == NULL) {
509			FILEDESC_SUNLOCK(fdp);
510			error = EBADF;
511			break;
512		}
513		fde = &fdp->fd_ofiles[fd];
514		td->td_retval[0] =
515		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
516		FILEDESC_SUNLOCK(fdp);
517		break;
518
519	case F_SETFD:
520		FILEDESC_XLOCK(fdp);
521		if (fget_locked(fdp, fd) == NULL) {
522			FILEDESC_XUNLOCK(fdp);
523			error = EBADF;
524			break;
525		}
526		fde = &fdp->fd_ofiles[fd];
527		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
528		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
529		FILEDESC_XUNLOCK(fdp);
530		break;
531
532	case F_GETFL:
533		error = fget_unlocked(fdp, fd,
534		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
535		if (error != 0)
536			break;
537		td->td_retval[0] = OFLAGS(fp->f_flag);
538		fdrop(fp, td);
539		break;
540
541	case F_SETFL:
542		error = fget_unlocked(fdp, fd,
543		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
544		if (error != 0)
545			break;
546		do {
547			tmp = flg = fp->f_flag;
548			tmp &= ~FCNTLFLAGS;
549			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
550		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
551		tmp = fp->f_flag & FNONBLOCK;
552		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
553		if (error != 0) {
554			fdrop(fp, td);
555			break;
556		}
557		tmp = fp->f_flag & FASYNC;
558		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
559		if (error == 0) {
560			fdrop(fp, td);
561			break;
562		}
563		atomic_clear_int(&fp->f_flag, FNONBLOCK);
564		tmp = 0;
565		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
566		fdrop(fp, td);
567		break;
568
569	case F_GETOWN:
570		error = fget_unlocked(fdp, fd,
571		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
572		if (error != 0)
573			break;
574		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
575		if (error == 0)
576			td->td_retval[0] = tmp;
577		fdrop(fp, td);
578		break;
579
580	case F_SETOWN:
581		error = fget_unlocked(fdp, fd,
582		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
583		if (error != 0)
584			break;
585		tmp = arg;
586		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
587		fdrop(fp, td);
588		break;
589
590	case F_SETLK_REMOTE:
591		error = priv_check(td, PRIV_NFS_LOCKD);
592		if (error)
593			return (error);
594		flg = F_REMOTE;
595		goto do_setlk;
596
597	case F_SETLKW:
598		flg |= F_WAIT;
599		/* FALLTHROUGH F_SETLK */
600
601	case F_SETLK:
602	do_setlk:
603		cap_rights_init(&rights, CAP_FLOCK);
604		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
605		if (error != 0)
606			break;
607		if (fp->f_type != DTYPE_VNODE) {
608			error = EBADF;
609			fdrop(fp, td);
610			break;
611		}
612
613		flp = (struct flock *)arg;
614		if (flp->l_whence == SEEK_CUR) {
615			foffset = foffset_get(fp);
616			if (foffset < 0 ||
617			    (flp->l_start > 0 &&
618			     foffset > OFF_MAX - flp->l_start)) {
619				FILEDESC_SUNLOCK(fdp);
620				error = EOVERFLOW;
621				fdrop(fp, td);
622				break;
623			}
624			flp->l_start += foffset;
625		}
626
627		vp = fp->f_vnode;
628		switch (flp->l_type) {
629		case F_RDLCK:
630			if ((fp->f_flag & FREAD) == 0) {
631				error = EBADF;
632				break;
633			}
634			PROC_LOCK(p->p_leader);
635			p->p_leader->p_flag |= P_ADVLOCK;
636			PROC_UNLOCK(p->p_leader);
637			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
638			    flp, flg);
639			break;
640		case F_WRLCK:
641			if ((fp->f_flag & FWRITE) == 0) {
642				error = EBADF;
643				break;
644			}
645			PROC_LOCK(p->p_leader);
646			p->p_leader->p_flag |= P_ADVLOCK;
647			PROC_UNLOCK(p->p_leader);
648			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
649			    flp, flg);
650			break;
651		case F_UNLCK:
652			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
653			    flp, flg);
654			break;
655		case F_UNLCKSYS:
656			/*
657			 * Temporary api for testing remote lock
658			 * infrastructure.
659			 */
660			if (flg != F_REMOTE) {
661				error = EINVAL;
662				break;
663			}
664			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
665			    F_UNLCKSYS, flp, flg);
666			break;
667		default:
668			error = EINVAL;
669			break;
670		}
671		if (error != 0 || flp->l_type == F_UNLCK ||
672		    flp->l_type == F_UNLCKSYS) {
673			fdrop(fp, td);
674			break;
675		}
676
677		/*
678		 * Check for a race with close.
679		 *
680		 * The vnode is now advisory locked (or unlocked, but this case
681		 * is not really important) as the caller requested.
682		 * We had to drop the filedesc lock, so we need to recheck if
683		 * the descriptor is still valid, because if it was closed
684		 * in the meantime we need to remove advisory lock from the
685		 * vnode - close on any descriptor leading to an advisory
686		 * locked vnode, removes that lock.
687		 * We will return 0 on purpose in that case, as the result of
688		 * successful advisory lock might have been externally visible
689		 * already. This is fine - effectively we pretend to the caller
690		 * that the closing thread was a bit slower and that the
691		 * advisory lock succeeded before the close.
692		 */
693		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
694		if (error != 0) {
695			fdrop(fp, td);
696			break;
697		}
698		if (fp != fp2) {
699			flp->l_whence = SEEK_SET;
700			flp->l_start = 0;
701			flp->l_len = 0;
702			flp->l_type = F_UNLCK;
703			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
704			    F_UNLCK, flp, F_POSIX);
705		}
706		fdrop(fp, td);
707		fdrop(fp2, td);
708		break;
709
710	case F_GETLK:
711		error = fget_unlocked(fdp, fd,
712		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
713		if (error != 0)
714			break;
715		if (fp->f_type != DTYPE_VNODE) {
716			error = EBADF;
717			fdrop(fp, td);
718			break;
719		}
720		flp = (struct flock *)arg;
721		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
722		    flp->l_type != F_UNLCK) {
723			error = EINVAL;
724			fdrop(fp, td);
725			break;
726		}
727		if (flp->l_whence == SEEK_CUR) {
728			foffset = foffset_get(fp);
729			if ((flp->l_start > 0 &&
730			    foffset > OFF_MAX - flp->l_start) ||
731			    (flp->l_start < 0 &&
732			     foffset < OFF_MIN - flp->l_start)) {
733				FILEDESC_SUNLOCK(fdp);
734				error = EOVERFLOW;
735				fdrop(fp, td);
736				break;
737			}
738			flp->l_start += foffset;
739		}
740		vp = fp->f_vnode;
741		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
742		    F_POSIX);
743		fdrop(fp, td);
744		break;
745
746	case F_RDAHEAD:
747		arg = arg ? 128 * 1024: 0;
748		/* FALLTHROUGH */
749	case F_READAHEAD:
750		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
751		if (error != 0)
752			break;
753		if (fp->f_type != DTYPE_VNODE) {
754			fdrop(fp, td);
755			error = EBADF;
756			break;
757		}
758		vp = fp->f_vnode;
759		/*
760		 * Exclusive lock synchronizes against f_seqcount reads and
761		 * writes in sequential_heuristic().
762		 */
763		error = vn_lock(vp, LK_EXCLUSIVE);
764		if (error != 0) {
765			fdrop(fp, td);
766			break;
767		}
768		if (arg >= 0) {
769			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
770			fp->f_seqcount = (arg + bsize - 1) / bsize;
771			atomic_set_int(&fp->f_flag, FRDAHEAD);
772		} else {
773			atomic_clear_int(&fp->f_flag, FRDAHEAD);
774		}
775		VOP_UNLOCK(vp, 0);
776		fdrop(fp, td);
777		break;
778
779	default:
780		error = EINVAL;
781		break;
782	}
783	return (error);
784}
785
786static int
787getmaxfd(struct proc *p)
788{
789	int maxfd;
790
791	PROC_LOCK(p);
792	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
793	PROC_UNLOCK(p);
794
795	return (maxfd);
796}
797
/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 *
 * 'flags' is a mask of DUP_FIXED (place the copy at exactly 'new',
 * closing any current occupant), DUP_FCNTL (report fcntl()-style
 * EINVAL instead of EBADF for an out-of-range 'new') and DUP_CLOEXEC
 * (atomically set close-on-exec on the new descriptor).  On success
 * the new descriptor number is returned in td->td_retval[0].
 */
static int
do_dup(struct thread *td, int flags, int old, int new)
{
	struct filedesc *fdp;
	struct filedescent *oldfde, *newfde;
	struct proc *p;
	struct file *fp;
	struct file *delfp;
	int error, maxfd;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
	 * return EINVAL when the new descriptor is out of bounds.
	 */
	if (old < 0)
		return (EBADF);
	if (new < 0)
		return (flags & DUP_FCNTL ? EINVAL : EBADF);
	maxfd = getmaxfd(p);
	if (new >= maxfd)
		return (flags & DUP_FCNTL ? EINVAL : EBADF);

	FILEDESC_XLOCK(fdp);
	if (fget_locked(fdp, old) == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}
	oldfde = &fdp->fd_ofiles[old];
	/* dup2(fd, fd) is a no-op apart from possibly setting UF_EXCLOSE. */
	if (flags & DUP_FIXED && old == new) {
		td->td_retval[0] = new;
		if (flags & DUP_CLOEXEC)
			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		return (0);
	}
	fp = oldfde->fde_file;
	/* Take the reference that the new descriptor slot will own. */
	fhold(fp);

	/*
	 * If the caller specified a file descriptor, make sure the file
	 * table is large enough to hold it, and grab it.  Otherwise, just
	 * allocate a new descriptor the usual way.
	 */
	if (flags & DUP_FIXED) {
		if (new >= fdp->fd_nfiles) {
			/*
			 * The resource limits are here instead of e.g.
			 * fdalloc(), because the file descriptor table may be
			 * shared between processes, so we can't really use
			 * racct_add()/racct_sub().  Instead of counting the
			 * number of actually allocated descriptors, just put
			 * the limit on the size of the file descriptor table.
			 */
#ifdef RACCT
			PROC_LOCK(p);
			error = racct_set(p, RACCT_NOFILE, new + 1);
			PROC_UNLOCK(p);
			if (error != 0) {
				FILEDESC_XUNLOCK(fdp);
				fdrop(fp, td);
				return (EMFILE);
			}
#endif
			fdgrowtable_exp(fdp, new + 1);
			/* The table may have moved; refetch the old entry. */
			oldfde = &fdp->fd_ofiles[old];
		}
		newfde = &fdp->fd_ofiles[new];
		if (newfde->fde_file == NULL)
			fdused(fdp, new);
	} else {
		if ((error = fdalloc(td, new, &new)) != 0) {
			FILEDESC_XUNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
		newfde = &fdp->fd_ofiles[new];
	}

	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
	KASSERT(old != new, ("new fd is same as old"));

	/* Any file already installed at 'new' is closed below. */
	delfp = newfde->fde_file;

	/*
	 * Duplicate the source descriptor.  Under CAPABILITIES, bracket
	 * the update with a seq write section so lockless readers retry.
	 */
#ifdef CAPABILITIES
	seq_write_begin(&newfde->fde_seq);
#endif
	filecaps_free(&newfde->fde_caps);
	memcpy(newfde, oldfde, fde_change_size);
	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
	if ((flags & DUP_CLOEXEC) != 0)
		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
	else
		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
#ifdef CAPABILITIES
	seq_write_end(&newfde->fde_seq);
#endif
	td->td_retval[0] = new;

	if (delfp != NULL) {
		(void) closefp(fdp, new, delfp, td, 1);
		/* closefp() drops the FILEDESC lock for us. */
	} else {
		FILEDESC_XUNLOCK(fdp);
	}

	return (0);
}
915
/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 *
 * 'sigiop' is the owner's back-pointer slot (sio_myref); it is cleared
 * under SIGIO_LOCK so concurrent fsetown()/fgetown() callers see a
 * consistent state.
 */
void
funsetown(struct sigio **sigiop)
{
	struct sigio *sigio;

	SIGIO_LOCK();
	sigio = *sigiop;
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}
	*(sigio->sio_myref) = NULL;
	/* Negative sio_pgid means the target is a process group. */
	if ((sigio)->sio_pgid < 0) {
		struct pgrp *pg = (sigio)->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		struct proc *p = (sigio)->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	SIGIO_UNLOCK();
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
}
950
/*
 * Free a list of sigio structures.
 * We only need to lock the SIGIO_LOCK because we have made ourselves
 * inaccessible to callers of fsetown and therefore do not need to lock
 * the proc or pgrp struct for the list manipulation.
 */
void
funsetownlst(struct sigiolst *sigiolst)
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio;

	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;
	p = NULL;
	pg = NULL;

	/*
	 * Every entry of the list should belong
	 * to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
	}

	SIGIO_LOCK();
	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
		*(sigio->sio_myref) = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
			PGRP_LOCK(pg);
			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PGRP_UNLOCK(pg);
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
			PROC_LOCK(p);
			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PROC_UNLOCK(p);
		}
		/*
		 * Drop SIGIO_LOCK around the free: crfree()/free() must
		 * not be called with it held.  Re-acquire before testing
		 * the list head again.
		 */
		SIGIO_UNLOCK();
		crfree(sigio->sio_ucred);
		free(sigio, M_SIGIO);
		SIGIO_LOCK();
	}
	SIGIO_UNLOCK();
}
1011
/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.  pgid > 0 names a process, pgid < 0 a
 * process group (-pgid), and pgid == 0 clears the current owner.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	sx_slock(&proctree_lock);
	if (pgid > 0) {
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		/*
		 * pfind() returns the process locked; p_session is
		 * stable under proctree_lock, so drop the proc lock
		 * before the check.
		 */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Detach any previous owner before installing the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
	return (ret);
}
1119
1120/*
1121 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1122 */
1123pid_t
1124fgetown(sigiop)
1125	struct sigio **sigiop;
1126{
1127	pid_t pgid;
1128
1129	SIGIO_LOCK();
1130	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1131	SIGIO_UNLOCK();
1132	return (pgid);
1133}
1134
/*
 * Function drops the filedesc lock on return.
 *
 * Finish closing descriptor 'fd' whose table entry has already been
 * cleared: detach kevents and mqueue state while still locked, then
 * drop the lock and release the file reference via closef().  With
 * 'holdleaders' set, keep fdp->fd_holdleaderscount elevated across
 * closef() so process-leader teardown can wait for us.
 */
static int
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
    int holdleaders)
{
	int error;

	FILEDESC_XLOCK_ASSERT(fdp);

	if (holdleaders) {
		if (td->td_proc->p_fdtol != NULL) {
			/*
			 * Ask fdfree() to sleep to ensure that all relevant
			 * process leaders can be traversed in closef().
			 */
			fdp->fd_holdleaderscount++;
		} else {
			holdleaders = 0;
		}
	}

	/*
	 * We now hold the fp reference that used to be owned by the
	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
	 * knote_fdclose to prevent a race of the fd getting opened, a knote
	 * added, and deleteing a knote for the new fd.
	 */
	knote_fdclose(td, fd);

	/*
	 * We need to notify mqueue if the object is of type mqueue.
	 */
	if (fp->f_type == DTYPE_MQUEUE)
		mq_fdclose(td, fd, fp);
	FILEDESC_XUNLOCK(fdp);

	error = closef(fp, td);
	if (holdleaders) {
		FILEDESC_XLOCK(fdp);
		fdp->fd_holdleaderscount--;
		/* Wake anyone in fdfree() waiting for the count to drain. */
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_XUNLOCK(fdp);
	}
	return (error);
}
1186
1187/*
1188 * Close a file descriptor.
1189 */
1190#ifndef _SYS_SYSPROTO_H_
1191struct close_args {
1192	int     fd;
1193};
1194#endif
1195/* ARGSUSED */
1196int
1197sys_close(td, uap)
1198	struct thread *td;
1199	struct close_args *uap;
1200{
1201
1202	return (kern_close(td, uap->fd));
1203}
1204
1205int
1206kern_close(td, fd)
1207	struct thread *td;
1208	int fd;
1209{
1210	struct filedesc *fdp;
1211	struct file *fp;
1212
1213	fdp = td->td_proc->p_fd;
1214
1215	AUDIT_SYSCLOSE(td, fd);
1216
1217	FILEDESC_XLOCK(fdp);
1218	if ((fp = fget_locked(fdp, fd)) == NULL) {
1219		FILEDESC_XUNLOCK(fdp);
1220		return (EBADF);
1221	}
1222	fdfree(fdp, fd);
1223
1224	/* closefp() drops the FILEDESC lock for us. */
1225	return (closefp(fdp, fd, fp, td, 1));
1226}
1227
1228/*
1229 * Close open file descriptors.
1230 */
1231#ifndef _SYS_SYSPROTO_H_
1232struct closefrom_args {
1233	int	lowfd;
1234};
1235#endif
1236/* ARGSUSED */
1237int
1238sys_closefrom(struct thread *td, struct closefrom_args *uap)
1239{
1240	struct filedesc *fdp;
1241	int fd;
1242
1243	fdp = td->td_proc->p_fd;
1244	AUDIT_ARG_FD(uap->lowfd);
1245
1246	/*
1247	 * Treat negative starting file descriptor values identical to
1248	 * closefrom(0) which closes all files.
1249	 */
1250	if (uap->lowfd < 0)
1251		uap->lowfd = 0;
1252	FILEDESC_SLOCK(fdp);
1253	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
1254		if (fdp->fd_ofiles[fd].fde_file != NULL) {
1255			FILEDESC_SUNLOCK(fdp);
1256			(void)kern_close(td, fd);
1257			FILEDESC_SLOCK(fdp);
1258		}
1259	}
1260	FILEDESC_SUNLOCK(fdp);
1261	return (0);
1262}
1263
1264#if defined(COMPAT_43)
1265/*
1266 * Return status information about a file descriptor.
1267 */
1268#ifndef _SYS_SYSPROTO_H_
1269struct ofstat_args {
1270	int	fd;
1271	struct	ostat *sb;
1272};
1273#endif
1274/* ARGSUSED */
1275int
1276ofstat(struct thread *td, struct ofstat_args *uap)
1277{
1278	struct ostat oub;
1279	struct stat ub;
1280	int error;
1281
1282	error = kern_fstat(td, uap->fd, &ub);
1283	if (error == 0) {
1284		cvtstat(&ub, &oub);
1285		error = copyout(&oub, uap->sb, sizeof(oub));
1286	}
1287	return (error);
1288}
1289#endif /* COMPAT_43 */
1290
1291/*
1292 * Return status information about a file descriptor.
1293 */
1294#ifndef _SYS_SYSPROTO_H_
1295struct fstat_args {
1296	int	fd;
1297	struct	stat *sb;
1298};
1299#endif
1300/* ARGSUSED */
1301int
1302sys_fstat(struct thread *td, struct fstat_args *uap)
1303{
1304	struct stat ub;
1305	int error;
1306
1307	error = kern_fstat(td, uap->fd, &ub);
1308	if (error == 0)
1309		error = copyout(&ub, uap->sb, sizeof(ub));
1310	return (error);
1311}
1312
/*
 * Common backend for fstat(2) and its compat variants: look up "fd"
 * (requires CAP_FSTAT), invoke the file's fo_stat method to fill *sbp
 * and return 0 or an errno value.
 */
int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);

	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
	if (error != 0)
		return (error);

	AUDIT_ARG_FILE(td->td_proc, fp);

	error = fo_stat(fp, sbp, td->td_ucred, td);
	fdrop(fp, td);
#ifdef KTRACE
	/* Record the returned stat structure for ktrace on success. */
	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
		ktrstat(sbp);
#endif
	return (error);
}
1336
1337/*
1338 * Return status information about a file descriptor.
1339 */
1340#ifndef _SYS_SYSPROTO_H_
1341struct nfstat_args {
1342	int	fd;
1343	struct	nstat *sb;
1344};
1345#endif
1346/* ARGSUSED */
1347int
1348sys_nfstat(struct thread *td, struct nfstat_args *uap)
1349{
1350	struct nstat nub;
1351	struct stat ub;
1352	int error;
1353
1354	error = kern_fstat(td, uap->fd, &ub);
1355	if (error == 0) {
1356		cvtnstat(&ub, &nub);
1357		error = copyout(&nub, uap->sb, sizeof(nub));
1358	}
1359	return (error);
1360}
1361
1362/*
1363 * Return pathconf information about a file descriptor.
1364 */
1365#ifndef _SYS_SYSPROTO_H_
1366struct fpathconf_args {
1367	int	fd;
1368	int	name;
1369};
1370#endif
1371/* ARGSUSED */
1372int
1373sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1374{
1375	struct file *fp;
1376	struct vnode *vp;
1377	cap_rights_t rights;
1378	int error;
1379
1380	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
1381	if (error != 0)
1382		return (error);
1383
1384	/* If asynchronous I/O is available, it works for all descriptors. */
1385	if (uap->name == _PC_ASYNC_IO) {
1386		td->td_retval[0] = async_io_version;
1387		goto out;
1388	}
1389	vp = fp->f_vnode;
1390	if (vp != NULL) {
1391		vn_lock(vp, LK_SHARED | LK_RETRY);
1392		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1393		VOP_UNLOCK(vp, 0);
1394	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1395		if (uap->name != _PC_PIPE_BUF) {
1396			error = EINVAL;
1397		} else {
1398			td->td_retval[0] = PIPE_BUF;
1399			error = 0;
1400		}
1401	} else {
1402		error = EOPNOTSUPP;
1403	}
1404out:
1405	fdrop(fp, td);
1406	return (error);
1407}
1408
1409/*
1410 * Initialize filecaps structure.
1411 */
1412void
1413filecaps_init(struct filecaps *fcaps)
1414{
1415
1416	bzero(fcaps, sizeof(*fcaps));
1417	fcaps->fc_nioctls = -1;
1418}
1419
1420/*
1421 * Copy filecaps structure allocating memory for ioctls array if needed.
1422 */
1423void
1424filecaps_copy(const struct filecaps *src, struct filecaps *dst)
1425{
1426	size_t size;
1427
1428	*dst = *src;
1429	if (src->fc_ioctls != NULL) {
1430		KASSERT(src->fc_nioctls > 0,
1431		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1432
1433		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1434		dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1435		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1436	}
1437}
1438
1439/*
1440 * Move filecaps structure to the new place and clear the old place.
1441 */
1442void
1443filecaps_move(struct filecaps *src, struct filecaps *dst)
1444{
1445
1446	*dst = *src;
1447	bzero(src, sizeof(*src));
1448}
1449
1450/*
1451 * Fill the given filecaps structure with full rights.
1452 */
1453static void
1454filecaps_fill(struct filecaps *fcaps)
1455{
1456
1457	CAP_ALL(&fcaps->fc_rights);
1458	fcaps->fc_ioctls = NULL;
1459	fcaps->fc_nioctls = -1;
1460	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1461}
1462
1463/*
1464 * Free memory allocated within filecaps structure.
1465 */
1466void
1467filecaps_free(struct filecaps *fcaps)
1468{
1469
1470	free(fcaps->fc_ioctls, M_FILECAPS);
1471	bzero(fcaps, sizeof(*fcaps));
1472}
1473
1474/*
1475 * Validate the given filecaps structure.
1476 */
1477static void
1478filecaps_validate(const struct filecaps *fcaps, const char *func)
1479{
1480
1481	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
1482	    ("%s: invalid rights", func));
1483	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1484	    ("%s: invalid fcntls", func));
1485	KASSERT(fcaps->fc_fcntls == 0 ||
1486	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
1487	    ("%s: fcntls without CAP_FCNTL", func));
1488	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1489	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1490	    ("%s: invalid ioctls", func));
1491	KASSERT(fcaps->fc_nioctls == 0 ||
1492	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
1493	    ("%s: ioctls without CAP_IOCTL", func));
1494}
1495
1496static void
1497fdgrowtable_exp(struct filedesc *fdp, int nfd)
1498{
1499	int nfd1;
1500
1501	FILEDESC_XLOCK_ASSERT(fdp);
1502
1503	nfd1 = fdp->fd_nfiles * 2;
1504	if (nfd1 < nfd)
1505		nfd1 = nfd;
1506	fdgrowtable(fdp, nfd1);
1507}
1508
1509/*
1510 * Grow the file table to accomodate (at least) nfd descriptors.
1511 */
1512static void
1513fdgrowtable(struct filedesc *fdp, int nfd)
1514{
1515	struct filedesc0 *fdp0;
1516	struct freetable *ft;
1517	struct fdescenttbl *ntable;
1518	struct fdescenttbl *otable;
1519	int nnfiles, onfiles;
1520	NDSLOTTYPE *nmap, *omap;
1521
1522	/*
1523	 * If lastfile is -1 this struct filedesc was just allocated and we are
1524	 * growing it to accomodate for the one we are going to copy from. There
1525	 * is no need to have a lock on this one as it's not visible to anyone.
1526	 */
1527	if (fdp->fd_lastfile != -1)
1528		FILEDESC_XLOCK_ASSERT(fdp);
1529
1530	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
1531
1532	/* save old values */
1533	onfiles = fdp->fd_nfiles;
1534	otable = fdp->fd_files;
1535	omap = fdp->fd_map;
1536
1537	/* compute the size of the new table */
1538	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1539	if (nnfiles <= onfiles)
1540		/* the table is already large enough */
1541		return;
1542
1543	/*
1544	 * Allocate a new table.  We need enough space for the number of
1545	 * entries, file entries themselves and the struct freetable we will use
1546	 * when we decommission the table and place it on the freelist.
1547	 * We place the struct freetable in the middle so we don't have
1548	 * to worry about padding.
1549	 */
1550	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
1551	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
1552	    sizeof(struct freetable),
1553	    M_FILEDESC, M_ZERO | M_WAITOK);
1554	/* copy the old data */
1555	ntable->fdt_nfiles = nnfiles;
1556	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
1557	    onfiles * sizeof(ntable->fdt_ofiles[0]));
1558
1559	/*
1560	 * Allocate a new map only if the old is not large enough.  It will
1561	 * grow at a slower rate than the table as it can map more
1562	 * entries than the table can hold.
1563	 */
1564	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
1565		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
1566		    M_ZERO | M_WAITOK);
1567		/* copy over the old data and update the pointer */
1568		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
1569		fdp->fd_map = nmap;
1570	}
1571
1572	/*
1573	 * Make sure that ntable is correctly initialized before we replace
1574	 * fd_files poiner. Otherwise fget_unlocked() may see inconsistent
1575	 * data.
1576	 */
1577	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
1578
1579	/*
1580	 * Do not free the old file table, as some threads may still
1581	 * reference entries within it.  Instead, place it on a freelist
1582	 * which will be processed when the struct filedesc is released.
1583	 *
1584	 * Note that if onfiles == NDFILE, we're dealing with the original
1585	 * static allocation contained within (struct filedesc0 *)fdp,
1586	 * which must not be freed.
1587	 */
1588	if (onfiles > NDFILE) {
1589		ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
1590		fdp0 = (struct filedesc0 *)fdp;
1591		ft->ft_table = otable;
1592		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
1593	}
1594	/*
1595	 * The map does not have the same possibility of threads still
1596	 * holding references to it.  So always free it as long as it
1597	 * does not reference the original static allocation.
1598	 */
1599	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
1600		free(omap, M_FILEDESC);
1601}
1602
1603/*
1604 * Allocate a file descriptor for the process.
1605 */
1606int
1607fdalloc(struct thread *td, int minfd, int *result)
1608{
1609	struct proc *p = td->td_proc;
1610	struct filedesc *fdp = p->p_fd;
1611	int fd = -1, maxfd, allocfd;
1612#ifdef RACCT
1613	int error;
1614#endif
1615
1616	FILEDESC_XLOCK_ASSERT(fdp);
1617
1618	if (fdp->fd_freefile > minfd)
1619		minfd = fdp->fd_freefile;
1620
1621	maxfd = getmaxfd(p);
1622
1623	/*
1624	 * Search the bitmap for a free descriptor starting at minfd.
1625	 * If none is found, grow the file table.
1626	 */
1627	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1628	if (fd >= maxfd)
1629		return (EMFILE);
1630	if (fd >= fdp->fd_nfiles) {
1631		allocfd = min(fd * 2, maxfd);
1632#ifdef RACCT
1633		PROC_LOCK(p);
1634		error = racct_set(p, RACCT_NOFILE, allocfd);
1635		PROC_UNLOCK(p);
1636		if (error != 0)
1637			return (EMFILE);
1638#endif
1639		/*
1640		 * fd is already equal to first free descriptor >= minfd, so
1641		 * we only need to grow the table and we are done.
1642		 */
1643		fdgrowtable_exp(fdp, allocfd);
1644	}
1645
1646	/*
1647	 * Perform some sanity checks, then mark the file descriptor as
1648	 * used and return it to the caller.
1649	 */
1650	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
1651	    ("invalid descriptor %d", fd));
1652	KASSERT(!fdisused(fdp, fd),
1653	    ("fd_first_free() returned non-free descriptor"));
1654	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
1655	    ("file descriptor isn't free"));
1656	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
1657	fdused(fdp, fd);
1658	*result = fd;
1659	return (0);
1660}
1661
1662/*
1663 * Allocate n file descriptors for the process.
1664 */
1665int
1666fdallocn(struct thread *td, int minfd, int *fds, int n)
1667{
1668	struct proc *p = td->td_proc;
1669	struct filedesc *fdp = p->p_fd;
1670	int i;
1671
1672	FILEDESC_XLOCK_ASSERT(fdp);
1673
1674	for (i = 0; i < n; i++)
1675		if (fdalloc(td, 0, &fds[i]) != 0)
1676			break;
1677
1678	if (i < n) {
1679		for (i--; i >= 0; i--)
1680			fdunused(fdp, fds[i]);
1681		return (EMFILE);
1682	}
1683
1684	return (0);
1685}
1686
1687/*
1688 * Create a new open file structure and allocate a file decriptor for the
1689 * process that refers to it.  We add one reference to the file for the
1690 * descriptor table and one reference for resultfp. This is to prevent us
1691 * being preempted and the entry in the descriptor table closed after we
1692 * release the FILEDESC lock.
1693 */
1694int
1695falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1696{
1697	struct file *fp;
1698	int error, fd;
1699
1700	error = falloc_noinstall(td, &fp);
1701	if (error)
1702		return (error);		/* no reference held on error */
1703
1704	error = finstall(td, fp, &fd, flags, NULL);
1705	if (error) {
1706		fdrop(fp, td);		/* one reference (fp only) */
1707		return (error);
1708	}
1709
1710	if (resultfp != NULL)
1711		*resultfp = fp;		/* copy out result */
1712	else
1713		fdrop(fp, td);		/* release local reference */
1714
1715	if (resultfd != NULL)
1716		*resultfd = fd;
1717
1718	return (0);
1719}
1720
1721/*
1722 * Create a new open file structure without allocating a file descriptor.
1723 */
1724int
1725falloc_noinstall(struct thread *td, struct file **resultfp)
1726{
1727	struct file *fp;
1728	int maxuserfiles = maxfiles - (maxfiles / 20);
1729	static struct timeval lastfail;
1730	static int curfail;
1731
1732	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1733
1734	if ((openfiles >= maxuserfiles &&
1735	    priv_check(td, PRIV_MAXFILES) != 0) ||
1736	    openfiles >= maxfiles) {
1737		if (ppsratecheck(&lastfail, &curfail, 1)) {
1738			printf("kern.maxfiles limit exceeded by uid %i, "
1739			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1740		}
1741		return (ENFILE);
1742	}
1743	atomic_add_int(&openfiles, 1);
1744	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1745	refcount_init(&fp->f_count, 1);
1746	fp->f_cred = crhold(td->td_ucred);
1747	fp->f_ops = &badfileops;
1748	*resultfp = fp;
1749	return (0);
1750}
1751
1752/*
1753 * Install a file in a file descriptor table.
1754 */
1755int
1756finstall(struct thread *td, struct file *fp, int *fd, int flags,
1757    struct filecaps *fcaps)
1758{
1759	struct filedesc *fdp = td->td_proc->p_fd;
1760	struct filedescent *fde;
1761	int error;
1762
1763	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1764	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1765	if (fcaps != NULL)
1766		filecaps_validate(fcaps, __func__);
1767
1768	FILEDESC_XLOCK(fdp);
1769	if ((error = fdalloc(td, 0, fd))) {
1770		FILEDESC_XUNLOCK(fdp);
1771		return (error);
1772	}
1773	fhold(fp);
1774	fde = &fdp->fd_ofiles[*fd];
1775#ifdef CAPABILITIES
1776	seq_write_begin(&fde->fde_seq);
1777#endif
1778	fde->fde_file = fp;
1779	if ((flags & O_CLOEXEC) != 0)
1780		fde->fde_flags |= UF_EXCLOSE;
1781	if (fcaps != NULL)
1782		filecaps_move(fcaps, &fde->fde_caps);
1783	else
1784		filecaps_fill(&fde->fde_caps);
1785#ifdef CAPABILITIES
1786	seq_write_end(&fde->fde_seq);
1787#endif
1788	FILEDESC_XUNLOCK(fdp);
1789	return (0);
1790}
1791
1792/*
1793 * Build a new filedesc structure from another.
1794 * Copy the current, root, and jail root vnode references.
1795 *
1796 * If fdp is not NULL, return with it shared locked.
1797 */
1798struct filedesc *
1799fdinit(struct filedesc *fdp)
1800{
1801	struct filedesc0 *newfdp0;
1802	struct filedesc *newfdp;
1803
1804	newfdp0 = malloc(sizeof *newfdp0, M_FILEDESC, M_WAITOK | M_ZERO);
1805	newfdp = &newfdp0->fd_fd;
1806
1807	/* Create the file descriptor table. */
1808	FILEDESC_LOCK_INIT(newfdp);
1809	newfdp->fd_refcnt = 1;
1810	newfdp->fd_holdcnt = 1;
1811	newfdp->fd_cmask = CMASK;
1812	newfdp->fd_map = newfdp0->fd_dmap;
1813	newfdp->fd_lastfile = -1;
1814	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
1815	newfdp->fd_files->fdt_nfiles = NDFILE;
1816
1817	if (fdp == NULL)
1818		return (newfdp);
1819
1820	if (fdp->fd_lastfile >= newfdp->fd_nfiles)
1821		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1822
1823	FILEDESC_SLOCK(fdp);
1824	newfdp->fd_cdir = fdp->fd_cdir;
1825	if (newfdp->fd_cdir)
1826		VREF(newfdp->fd_cdir);
1827	newfdp->fd_rdir = fdp->fd_rdir;
1828	if (newfdp->fd_rdir)
1829		VREF(newfdp->fd_rdir);
1830	newfdp->fd_jdir = fdp->fd_jdir;
1831	if (newfdp->fd_jdir)
1832		VREF(newfdp->fd_jdir);
1833
1834	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1835		FILEDESC_SUNLOCK(fdp);
1836		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1837		FILEDESC_SLOCK(fdp);
1838	}
1839
1840	return (newfdp);
1841}
1842
1843static struct filedesc *
1844fdhold(struct proc *p)
1845{
1846	struct filedesc *fdp;
1847
1848	mtx_lock(&fdesc_mtx);
1849	fdp = p->p_fd;
1850	if (fdp != NULL)
1851		fdp->fd_holdcnt++;
1852	mtx_unlock(&fdesc_mtx);
1853	return (fdp);
1854}
1855
1856static void
1857fddrop(struct filedesc *fdp)
1858{
1859	struct filedesc0 *fdp0;
1860	struct freetable *ft;
1861	int i;
1862
1863	mtx_lock(&fdesc_mtx);
1864	i = --fdp->fd_holdcnt;
1865	mtx_unlock(&fdesc_mtx);
1866	if (i > 0)
1867		return;
1868
1869	FILEDESC_LOCK_DESTROY(fdp);
1870	fdp0 = (struct filedesc0 *)fdp;
1871	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1872		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1873		free(ft->ft_table, M_FILEDESC);
1874	}
1875	free(fdp, M_FILEDESC);
1876}
1877
1878/*
1879 * Share a filedesc structure.
1880 */
1881struct filedesc *
1882fdshare(struct filedesc *fdp)
1883{
1884
1885	FILEDESC_XLOCK(fdp);
1886	fdp->fd_refcnt++;
1887	FILEDESC_XUNLOCK(fdp);
1888	return (fdp);
1889}
1890
1891/*
1892 * Unshare a filedesc structure, if necessary by making a copy
1893 */
1894void
1895fdunshare(struct thread *td)
1896{
1897	struct filedesc *tmp;
1898	struct proc *p = td->td_proc;
1899
1900	if (p->p_fd->fd_refcnt == 1)
1901		return;
1902
1903	tmp = fdcopy(p->p_fd);
1904	fdescfree(td);
1905	p->p_fd = tmp;
1906}
1907
1908/*
1909 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
1910 * this is to ease callers, not catch errors.
1911 */
1912struct filedesc *
1913fdcopy(struct filedesc *fdp)
1914{
1915	struct filedesc *newfdp;
1916	struct filedescent *nfde, *ofde;
1917	int i;
1918
1919	/* Certain daemons might not have file descriptors. */
1920	if (fdp == NULL)
1921		return (NULL);
1922
1923	newfdp = fdinit(fdp);
1924	/* copy all passable descriptors (i.e. not kqueue) */
1925	newfdp->fd_freefile = -1;
1926	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1927		ofde = &fdp->fd_ofiles[i];
1928		if (ofde->fde_file == NULL ||
1929		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
1930			if (newfdp->fd_freefile == -1)
1931				newfdp->fd_freefile = i;
1932			continue;
1933		}
1934		nfde = &newfdp->fd_ofiles[i];
1935		*nfde = *ofde;
1936		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
1937		fhold(nfde->fde_file);
1938		fdused_init(newfdp, i);
1939		newfdp->fd_lastfile = i;
1940	}
1941	if (newfdp->fd_freefile == -1)
1942		newfdp->fd_freefile = i;
1943	newfdp->fd_cmask = fdp->fd_cmask;
1944	FILEDESC_SUNLOCK(fdp);
1945	return (newfdp);
1946}
1947
1948/*
1949 * Release a filedesc structure.
1950 */
1951void
1952fdescfree(struct thread *td)
1953{
1954	struct filedesc *fdp;
1955	int i;
1956	struct filedesc_to_leader *fdtol;
1957	struct filedescent *fde;
1958	struct file *fp;
1959	struct vnode *cdir, *jdir, *rdir, *vp;
1960	struct flock lf;
1961
1962	/* Certain daemons might not have file descriptors. */
1963	fdp = td->td_proc->p_fd;
1964	if (fdp == NULL)
1965		return;
1966
1967#ifdef RACCT
1968	PROC_LOCK(td->td_proc);
1969	racct_set(td->td_proc, RACCT_NOFILE, 0);
1970	PROC_UNLOCK(td->td_proc);
1971#endif
1972
1973	/* Check for special need to clear POSIX style locks */
1974	fdtol = td->td_proc->p_fdtol;
1975	if (fdtol != NULL) {
1976		FILEDESC_XLOCK(fdp);
1977		KASSERT(fdtol->fdl_refcount > 0,
1978		    ("filedesc_to_refcount botch: fdl_refcount=%d",
1979		    fdtol->fdl_refcount));
1980		if (fdtol->fdl_refcount == 1 &&
1981		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1982			for (i = 0; i <= fdp->fd_lastfile; i++) {
1983				fp = fdp->fd_ofiles[i].fde_file;
1984				if (fp == NULL || fp->f_type != DTYPE_VNODE)
1985					continue;
1986				fhold(fp);
1987				FILEDESC_XUNLOCK(fdp);
1988				lf.l_whence = SEEK_SET;
1989				lf.l_start = 0;
1990				lf.l_len = 0;
1991				lf.l_type = F_UNLCK;
1992				vp = fp->f_vnode;
1993				(void) VOP_ADVLOCK(vp,
1994				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
1995				    &lf, F_POSIX);
1996				FILEDESC_XLOCK(fdp);
1997				fdrop(fp, td);
1998			}
1999		}
2000	retry:
2001		if (fdtol->fdl_refcount == 1) {
2002			if (fdp->fd_holdleaderscount > 0 &&
2003			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2004				/*
2005				 * close() or do_dup() has cleared a reference
2006				 * in a shared file descriptor table.
2007				 */
2008				fdp->fd_holdleaderswakeup = 1;
2009				sx_sleep(&fdp->fd_holdleaderscount,
2010				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
2011				goto retry;
2012			}
2013			if (fdtol->fdl_holdcount > 0) {
2014				/*
2015				 * Ensure that fdtol->fdl_leader remains
2016				 * valid in closef().
2017				 */
2018				fdtol->fdl_wakeup = 1;
2019				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
2020				    "fdlhold", 0);
2021				goto retry;
2022			}
2023		}
2024		fdtol->fdl_refcount--;
2025		if (fdtol->fdl_refcount == 0 &&
2026		    fdtol->fdl_holdcount == 0) {
2027			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
2028			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
2029		} else
2030			fdtol = NULL;
2031		td->td_proc->p_fdtol = NULL;
2032		FILEDESC_XUNLOCK(fdp);
2033		if (fdtol != NULL)
2034			free(fdtol, M_FILEDESC_TO_LEADER);
2035	}
2036
2037	mtx_lock(&fdesc_mtx);
2038	td->td_proc->p_fd = NULL;
2039	mtx_unlock(&fdesc_mtx);
2040
2041	FILEDESC_XLOCK(fdp);
2042	i = --fdp->fd_refcnt;
2043	if (i > 0) {
2044		FILEDESC_XUNLOCK(fdp);
2045		return;
2046	}
2047
2048	cdir = fdp->fd_cdir;
2049	fdp->fd_cdir = NULL;
2050	rdir = fdp->fd_rdir;
2051	fdp->fd_rdir = NULL;
2052	jdir = fdp->fd_jdir;
2053	fdp->fd_jdir = NULL;
2054	FILEDESC_XUNLOCK(fdp);
2055
2056	for (i = 0; i <= fdp->fd_lastfile; i++) {
2057		fde = &fdp->fd_ofiles[i];
2058		fp = fde->fde_file;
2059		if (fp != NULL) {
2060			fdefree_last(fde);
2061			(void) closef(fp, td);
2062		}
2063	}
2064
2065	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
2066		free(fdp->fd_map, M_FILEDESC);
2067	if (fdp->fd_nfiles > NDFILE)
2068		free(fdp->fd_files, M_FILEDESC);
2069
2070	if (cdir != NULL)
2071		vrele(cdir);
2072	if (rdir != NULL)
2073		vrele(rdir);
2074	if (jdir != NULL)
2075		vrele(jdir);
2076
2077	fddrop(fdp);
2078}
2079
2080/*
2081 * For setugid programs, we don't want to people to use that setugidness
2082 * to generate error messages which write to a file which otherwise would
2083 * otherwise be off-limits to the process.  We check for filesystems where
2084 * the vnode can change out from under us after execve (like [lin]procfs).
2085 *
2086 * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
2087 * sufficient.  We also don't check for setugidness since we know we are.
2088 */
2089static bool
2090is_unsafe(struct file *fp)
2091{
2092	struct vnode *vp;
2093
2094	if (fp->f_type != DTYPE_VNODE)
2095		return (false);
2096
2097	vp = fp->f_vnode;
2098	return ((vp->v_vflag & VV_PROCDEP) != 0);
2099}
2100
2101/*
2102 * Make this setguid thing safe, if at all possible.
2103 */
2104void
2105fdsetugidsafety(struct thread *td)
2106{
2107	struct filedesc *fdp;
2108	struct file *fp;
2109	int i;
2110
2111	fdp = td->td_proc->p_fd;
2112	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2113	MPASS(fdp->fd_nfiles >= 3);
2114	for (i = 0; i <= 2; i++) {
2115		fp = fdp->fd_ofiles[i].fde_file;
2116		if (fp != NULL && is_unsafe(fp)) {
2117			FILEDESC_XLOCK(fdp);
2118			knote_fdclose(td, i);
2119			/*
2120			 * NULL-out descriptor prior to close to avoid
2121			 * a race while close blocks.
2122			 */
2123			fdfree(fdp, i);
2124			FILEDESC_XUNLOCK(fdp);
2125			(void) closef(fp, td);
2126		}
2127	}
2128}
2129
2130/*
2131 * If a specific file object occupies a specific file descriptor, close the
2132 * file descriptor entry and drop a reference on the file object.  This is a
2133 * convenience function to handle a subsequent error in a function that calls
2134 * falloc() that handles the race that another thread might have closed the
2135 * file descriptor out from under the thread creating the file object.
2136 */
2137void
2138fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2139{
2140
2141	FILEDESC_XLOCK(fdp);
2142	if (fdp->fd_ofiles[idx].fde_file == fp) {
2143		fdfree(fdp, idx);
2144		FILEDESC_XUNLOCK(fdp);
2145		fdrop(fp, td);
2146	} else
2147		FILEDESC_XUNLOCK(fdp);
2148}
2149
2150/*
2151 * Close any files on exec?
2152 */
2153void
2154fdcloseexec(struct thread *td)
2155{
2156	struct filedesc *fdp;
2157	struct filedescent *fde;
2158	struct file *fp;
2159	int i;
2160
2161	fdp = td->td_proc->p_fd;
2162	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2163	for (i = 0; i <= fdp->fd_lastfile; i++) {
2164		fde = &fdp->fd_ofiles[i];
2165		fp = fde->fde_file;
2166		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2167		    (fde->fde_flags & UF_EXCLOSE))) {
2168			FILEDESC_XLOCK(fdp);
2169			fdfree(fdp, i);
2170			(void) closefp(fdp, i, fp, td, 0);
2171			/* closefp() drops the FILEDESC lock. */
2172		}
2173	}
2174}
2175
2176/*
2177 * It is unsafe for set[ug]id processes to be started with file
2178 * descriptors 0..2 closed, as these descriptors are given implicit
2179 * significance in the Standard C library.  fdcheckstd() will create a
2180 * descriptor referencing /dev/null for each of stdin, stdout, and
2181 * stderr that is not already open.
2182 */
2183int
2184fdcheckstd(struct thread *td)
2185{
2186	struct filedesc *fdp;
2187	register_t save;
2188	int i, error, devnull;
2189
2190	fdp = td->td_proc->p_fd;
2191	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2192	devnull = -1;
2193	error = 0;
2194	for (i = 0; i < 3; i++) {
2195		if (fdp->fd_ofiles[i].fde_file != NULL)
2196			continue;
2197		if (devnull < 0) {
2198			save = td->td_retval[0];
2199			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2200			    O_RDWR, 0);
2201			devnull = td->td_retval[0];
2202			td->td_retval[0] = save;
2203			if (error)
2204				break;
2205			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2206		} else {
2207			save = td->td_retval[0];
2208			error = do_dup(td, DUP_FIXED, devnull, i);
2209			td->td_retval[0] = save;
2210			if (error != 0)
2211				break;
2212		}
2213	}
2214	return (error);
2215}
2216
2217/*
2218 * Internal form of close.  Decrement reference count on file structure.
2219 * Note: td may be NULL when closing a file that was being passed in a
2220 * message.
2221 *
2222 * XXXRW: Giant is not required for the caller, but often will be held; this
2223 * makes it moderately likely the Giant will be recursed in the VFS case.
2224 */
2225int
2226closef(struct file *fp, struct thread *td)
2227{
2228	struct vnode *vp;
2229	struct flock lf;
2230	struct filedesc_to_leader *fdtol;
2231	struct filedesc *fdp;
2232
2233	/*
2234	 * POSIX record locking dictates that any close releases ALL
2235	 * locks owned by this process.  This is handled by setting
2236	 * a flag in the unlock to free ONLY locks obeying POSIX
2237	 * semantics, and not to free BSD-style file locks.
2238	 * If the descriptor was in a message, POSIX-style locks
2239	 * aren't passed with the descriptor, and the thread pointer
2240	 * will be NULL.  Callers should be careful only to pass a
2241	 * NULL thread pointer when there really is no owning
2242	 * context that might have locks, or the locks will be
2243	 * leaked.
2244	 */
2245	if (fp->f_type == DTYPE_VNODE && td != NULL) {
2246		vp = fp->f_vnode;
2247		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2248			lf.l_whence = SEEK_SET;
2249			lf.l_start = 0;
2250			lf.l_len = 0;
2251			lf.l_type = F_UNLCK;
2252			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2253			    F_UNLCK, &lf, F_POSIX);
2254		}
2255		fdtol = td->td_proc->p_fdtol;
2256		if (fdtol != NULL) {
2257			/*
2258			 * Handle special case where file descriptor table is
2259			 * shared between multiple process leaders.
2260			 */
2261			fdp = td->td_proc->p_fd;
2262			FILEDESC_XLOCK(fdp);
2263			for (fdtol = fdtol->fdl_next;
2264			     fdtol != td->td_proc->p_fdtol;
2265			     fdtol = fdtol->fdl_next) {
2266				if ((fdtol->fdl_leader->p_flag &
2267				     P_ADVLOCK) == 0)
2268					continue;
2269				fdtol->fdl_holdcount++;
2270				FILEDESC_XUNLOCK(fdp);
2271				lf.l_whence = SEEK_SET;
2272				lf.l_start = 0;
2273				lf.l_len = 0;
2274				lf.l_type = F_UNLCK;
2275				vp = fp->f_vnode;
2276				(void) VOP_ADVLOCK(vp,
2277				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
2278				    F_POSIX);
2279				FILEDESC_XLOCK(fdp);
2280				fdtol->fdl_holdcount--;
2281				if (fdtol->fdl_holdcount == 0 &&
2282				    fdtol->fdl_wakeup != 0) {
2283					fdtol->fdl_wakeup = 0;
2284					wakeup(fdtol);
2285				}
2286			}
2287			FILEDESC_XUNLOCK(fdp);
2288		}
2289	}
2290	return (fdrop(fp, td));
2291}
2292
2293/*
2294 * Initialize the file pointer with the specified properties.
2295 *
2296 * The ops are set with release semantics to be certain that the flags, type,
2297 * and data are visible when ops is.  This is to prevent ops methods from being
2298 * called with bad data.
2299 */
2300void
2301finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2302{
2303	fp->f_data = data;
2304	fp->f_flag = flag;
2305	fp->f_type = type;
2306	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2307}
2308
/*
 * Lockless lookup of file descriptor "fd" in table "fdp".
 *
 * On success returns 0 with a new reference on *fpp; the caller must
 * fdrop() it.  Under CAPABILITIES, the entry's rights are checked against
 * *needrightsp (and needfcntl when CAP_FCNTL is requested) and the rights
 * actually held are optionally copied out through haverightsp.
 */
int
fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
{
#ifdef CAPABILITIES
	struct filedescent fde;
#endif
	struct fdescenttbl *fdt;
	struct file *fp;
	u_int count;
#ifdef CAPABILITIES
	seq_t seq;
	cap_rights_t haverights;
	int error;
#endif

	fdt = fdp->fd_files;
	if (fd < 0 || fd >= fdt->fdt_nfiles)
		return (EBADF);
	/*
	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
	 * never raising a refcount above 0.  To accomplish this we have
	 * to use a cmpset loop rather than an atomic_add.  The descriptor
	 * must be re-verified once we acquire a reference to be certain
	 * that the identity is still correct and we did not lose a race
	 * due to preemption.
	 */
	for (;;) {
#ifdef CAPABILITIES
		/* Read the whole entry under a seqlock to get it untorn. */
		seq = seq_read(fd_seq(fdt, fd));
		fde = fdt->fdt_ofiles[fd];
		if (!seq_consistent(fd_seq(fdt, fd), seq)) {
			cpu_spinwait();
			continue;
		}
		fp = fde.fde_file;
#else
		fp = fdt->fdt_ofiles[fd].fde_file;
#endif
		if (fp == NULL)
			return (EBADF);
#ifdef CAPABILITIES
		haverights = *cap_rights_fde(&fde);
		if (needrightsp != NULL) {
			error = cap_check(&haverights, needrightsp);
			if (error != 0)
				return (error);
			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
				error = cap_fcntl_check_fde(&fde, needfcntl);
				if (error != 0)
					return (error);
			}
		}
#endif
	retry:
		count = fp->f_count;
		if (count == 0) {
			/*
			 * Force a reload. Other thread could reallocate the
			 * table before this fd was closed, so it possible that
			 * there is a stale fp pointer in cached version.
			 */
			fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files);
			continue;
		}
		/*
		 * Use an acquire barrier to force re-reading of fdt so it is
		 * refreshed for verification.
		 */
		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0)
			goto retry;
		fdt = fdp->fd_files;
#ifdef	CAPABILITIES
		if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
#else
		if (fp == fdt->fdt_ofiles[fd].fde_file)
#endif
			break;
		/* Entry changed while we were acquiring; drop and retry. */
		fdrop(fp, curthread);
	}
	*fpp = fp;
	if (haverightsp != NULL) {
#ifdef CAPABILITIES
		*haverightsp = haverights;
#else
		/* Without CAPABILITIES every descriptor has full rights. */
		CAP_ALL(haverightsp);
#endif
	}
	return (0);
}
2399
2400/*
2401 * Extract the file pointer associated with the specified descriptor for the
2402 * current user process.
2403 *
2404 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2405 * returned.
2406 *
2407 * File's rights will be checked against the capability rights mask.
2408 *
 * If an error occurred the non-zero error is returned and *fpp is set to
2410 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2411 * responsible for fdrop().
2412 */
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags,
    cap_rights_t *needrightsp, u_char *maxprotp)
{
	struct filedesc *fdp;
	struct file *fp;
	cap_rights_t haverights, needrights;
	int error;

	*fpp = NULL;
	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
		return (EBADF);
	/* Start from the caller's required rights, or an empty set. */
	if (needrightsp != NULL)
		needrights = *needrightsp;
	else
		cap_rights_init(&needrights);
	/* Asking for mmap protections additionally requires CAP_MMAP. */
	if (maxprotp != NULL)
		cap_rights_set(&needrights, CAP_MMAP);
	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
	if (error != 0)
		return (error);
	if (fp->f_ops == &badfileops) {
		/* The file was torn down while we looked it up. */
		fdrop(fp, td);
		return (EBADF);
	}

#ifdef CAPABILITIES
	/*
	 * If requested, convert capability rights to access flags.
	 */
	if (maxprotp != NULL)
		*maxprotp = cap_rights_to_vmprot(&haverights);
#else /* !CAPABILITIES */
	if (maxprotp != NULL)
		*maxprotp = VM_PROT_ALL;
#endif /* CAPABILITIES */

	/*
	 * FREAD and FWRITE failure return EBADF as per POSIX.
	 */
	error = 0;
	switch (flags) {
	case FREAD:
	case FWRITE:
		if ((fp->f_flag & flags) == 0)
			error = EBADF;
		break;
	case FEXEC:
		/* Must be readable or exec-only, and not open for writing. */
	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
		    ((fp->f_flag & FWRITE) != 0))
			error = EBADF;
		break;
	case 0:
		break;
	default:
		KASSERT(0, ("wrong flags"));
	}

	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}

	*fpp = fp;
	return (0);
}
2479
2480int
2481fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2482{
2483
2484	return(_fget(td, fd, fpp, 0, rightsp, NULL));
2485}
2486
2487int
2488fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
2489    struct file **fpp)
2490{
2491
2492	return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
2493}
2494
2495int
2496fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2497{
2498
2499	return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
2500}
2501
2502int
2503fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2504{
2505
2506	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
2507}
2508
2509/*
2510 * Like fget() but loads the underlying vnode, or returns an error if the
2511 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2512 * never have VM objects.  The returned vnode will be vref()'d.
2513 *
2514 * XXX: what about the unused flags ?
2515 */
2516static __inline int
2517_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
2518    struct vnode **vpp)
2519{
2520	struct file *fp;
2521	int error;
2522
2523	*vpp = NULL;
2524	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
2525	if (error != 0)
2526		return (error);
2527	if (fp->f_vnode == NULL) {
2528		error = EINVAL;
2529	} else {
2530		*vpp = fp->f_vnode;
2531		vref(*vpp);
2532	}
2533	fdrop(fp, td);
2534
2535	return (error);
2536}
2537
2538int
2539fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2540{
2541
2542	return (_fgetvp(td, fd, 0, rightsp, vpp));
2543}
2544
/*
 * Like fgetvp(), but also copies out the descriptor's filecaps into
 * "havecaps".  Returns EBADF for a bad descriptor, EINVAL for a
 * non-vnode file, or a capability error.
 */
int
fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
    struct filecaps *havecaps, struct vnode **vpp)
{
	struct filedesc *fdp;
	struct file *fp;
#ifdef CAPABILITIES
	int error;
#endif

	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
		return (EBADF);

	/*
	 * NOTE(review): elsewhere in this file fget_locked() is called with
	 * the filedesc lock held (see dupfdopen()), but no FILEDESC lock is
	 * taken here and no reference is held on "fp" while it is examined
	 * below — confirm callers provide the needed synchronization.
	 */
	fp = fget_locked(fdp, fd);
	if (fp == NULL || fp->f_ops == &badfileops)
		return (EBADF);

#ifdef CAPABILITIES
	if (needrightsp != NULL) {
		error = cap_check(cap_rights(fdp, fd), needrightsp);
		if (error != 0)
			return (error);
	}
#endif

	/* Only vnode-backed files can be returned. */
	if (fp->f_vnode == NULL)
		return (EINVAL);

	*vpp = fp->f_vnode;
	vref(*vpp);
	/* Hand the caller a copy of the descriptor's capabilities. */
	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);

	return (0);
}
2579
2580int
2581fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2582{
2583
2584	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
2585}
2586
2587int
2588fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2589{
2590
2591	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
2592}
2593
#ifdef notyet
/* Write-mode variant of fgetvp(); compiled out until a consumer appears. */
int
fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
    struct vnode **vpp)
{

	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
}
#endif
2603
2604/*
2605 * Like fget() but loads the underlying socket, or returns an error if the
2606 * descriptor does not represent a socket.
2607 *
2608 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2609 * in the future.
2610 *
2611 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2612 * on their file descriptor reference to prevent the socket from being free'd
2613 * during use.
2614 */
int
fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
    u_int *fflagp)
{
	struct file *fp;
	int error;

	/* Initialize outputs so failure paths leave them in a known state. */
	*spp = NULL;
	if (fflagp != NULL)
		*fflagp = 0;
	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		error = ENOTSOCK;
	} else {
		*spp = fp->f_data;
		if (fflagp)
			*fflagp = fp->f_flag;
		/* Take a socket reference under the socket lock. */
		SOCK_LOCK(*spp);
		soref(*spp);
		SOCK_UNLOCK(*spp);
	}
	/* The file reference is no longer needed once the socket is held. */
	fdrop(fp, td);

	return (error);
}
2641
2642/*
2643 * Drop the reference count on the socket and XXX release the SX lock in the
2644 * future.  The last reference closes the socket.
2645 *
2646 * Note: fputsock() is deprecated, see comment for fgetsock().
2647 */
void
fputsock(struct socket *so)
{

	/*
	 * Drop the reference taken by fgetsock().  NOTE(review): sorele()
	 * is expected to consume the locks taken here — confirm against the
	 * socket locking protocol before changing this sequence.
	 */
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	CURVNET_SET(so->so_vnet);
	sorele(so);
	CURVNET_RESTORE();
}
2658
2659/*
2660 * Handle the last reference to a file being closed.
2661 */
int
_fdrop(struct file *fp, struct thread *td)
{
	int error;

	error = 0;
	/* We must only get here once the reference count has hit zero. */
	if (fp->f_count != 0)
		panic("fdrop: count %d", fp->f_count);
	/* Files still using badfileops have nothing to close. */
	if (fp->f_ops != &badfileops)
		error = fo_close(fp, td);
	atomic_subtract_int(&openfiles, 1);
	crfree(fp->f_cred);
	free(fp->f_advice, M_FADVISE);
	uma_zfree(file_zone, fp);

	return (error);
}
2679
2680/*
2681 * Apply an advisory lock on a file descriptor.
2682 *
2683 * Just attempt to get a record lock of the requested type on the entire file
2684 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2685 */
2686#ifndef _SYS_SYSPROTO_H_
2687struct flock_args {
2688	int	fd;
2689	int	how;
2690};
2691#endif
2692/* ARGSUSED */
int
sys_flock(struct thread *td, struct flock_args *uap)
{
	struct file *fp;
	struct vnode *vp;
	struct flock lf;
	cap_rights_t rights;
	int error;

	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
	if (error != 0)
		return (error);
	/* flock(2) only operates on vnode-backed descriptors. */
	if (fp->f_type != DTYPE_VNODE) {
		fdrop(fp, td);
		return (EOPNOTSUPP);
	}

	vp = fp->f_vnode;
	/* The advisory lock always covers the entire file. */
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (uap->how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		atomic_clear_int(&fp->f_flag, FHASLOCK);
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
		goto done2;
	}
	if (uap->how & LOCK_EX)
		lf.l_type = F_WRLCK;
	else if (uap->how & LOCK_SH)
		lf.l_type = F_RDLCK;
	else {
		/* Neither exclusive nor shared was requested. */
		error = EBADF;
		goto done2;
	}
	atomic_set_int(&fp->f_flag, FHASLOCK);
	/* LOCK_NB requests a non-blocking attempt. */
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
	fdrop(fp, td);
	return (error);
}
2735/*
2736 * Duplicate the specified descriptor to a free descriptor.
2737 */
int
dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
    int openerror, int *indxp)
{
	struct filedescent *newfde, *oldfde;
	struct file *fp;
	int error, indx;

	KASSERT(openerror == ENODEV || openerror == ENXIO,
	    ("unexpected error %d in %s", openerror, __func__));

	/*
	 * If the to-be-dup'd fd number is greater than the allowed number
	 * of file descriptors, or the fd to be dup'd has already been
	 * closed, then reject.
	 */
	FILEDESC_XLOCK(fdp);
	if ((fp = fget_locked(fdp, dfd)) == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}

	/* Reserve a free descriptor slot for the copy. */
	error = fdalloc(td, 0, &indx);
	if (error != 0) {
		FILEDESC_XUNLOCK(fdp);
		return (error);
	}

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
	 *
	 * For ENXIO steal away the file structure from (dfd) and store it in
	 * (indx).  (dfd) is effectively closed by this operation.
	 */
	switch (openerror) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
			fdunused(fdp, indx);
			FILEDESC_XUNLOCK(fdp);
			return (EACCES);
		}
		fhold(fp);
		newfde = &fdp->fd_ofiles[indx];
		oldfde = &fdp->fd_ofiles[dfd];
#ifdef CAPABILITIES
		/* Publish the new entry atomically w.r.t. lockless readers. */
		seq_write_begin(&newfde->fde_seq);
#endif
		memcpy(newfde, oldfde, fde_change_size);
		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
#ifdef CAPABILITIES
		seq_write_end(&newfde->fde_seq);
#endif
		break;
	case ENXIO:
		/*
		 * Steal away the file pointer from dfd and stuff it into indx.
		 */
		newfde = &fdp->fd_ofiles[indx];
		oldfde = &fdp->fd_ofiles[dfd];
#ifdef CAPABILITIES
		seq_write_begin(&newfde->fde_seq);
#endif
		memcpy(newfde, oldfde, fde_change_size);
		bzero(oldfde, fde_change_size);
		fdunused(fdp, dfd);
#ifdef CAPABILITIES
		seq_write_end(&newfde->fde_seq);
#endif
		break;
	}
	FILEDESC_XUNLOCK(fdp);
	*indxp = indx;
	return (0);
}
2818
2819/*
2820 * Scan all active processes and prisons to see if any of them have a current
2821 * or root directory of `olddp'. If so, replace them with the new mount point.
2822 */
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
	struct filedesc *fdp;
	struct prison *pr;
	struct proc *p;
	int nrele;

	/* A single reference means nobody else can be pointing at olddp. */
	if (vrefcnt(olddp) == 1)
		return;
	/*
	 * Count replaced references and release them in one batch at the
	 * end, after all locks have been dropped.
	 */
	nrele = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		fdp = fdhold(p);
		if (fdp == NULL)
			continue;
		FILEDESC_XLOCK(fdp);
		if (fdp->fd_cdir == olddp) {
			vref(newdp);
			fdp->fd_cdir = newdp;
			nrele++;
		}
		if (fdp->fd_rdir == olddp) {
			vref(newdp);
			fdp->fd_rdir = newdp;
			nrele++;
		}
		if (fdp->fd_jdir == olddp) {
			vref(newdp);
			fdp->fd_jdir = newdp;
			nrele++;
		}
		FILEDESC_XUNLOCK(fdp);
		fddrop(fdp);
	}
	sx_sunlock(&allproc_lock);
	if (rootvnode == olddp) {
		vref(newdp);
		rootvnode = newdp;
		nrele++;
	}
	/* Update prison0 and all dynamically created prisons as well. */
	mtx_lock(&prison0.pr_mtx);
	if (prison0.pr_root == olddp) {
		vref(newdp);
		prison0.pr_root = newdp;
		nrele++;
	}
	mtx_unlock(&prison0.pr_mtx);
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list) {
		mtx_lock(&pr->pr_mtx);
		if (pr->pr_root == olddp) {
			vref(newdp);
			pr->pr_root = newdp;
			nrele++;
		}
		mtx_unlock(&pr->pr_mtx);
	}
	sx_sunlock(&allprison_lock);
	while (nrele--)
		vrele(olddp);
}
2885
/*
 * Allocate a filedesc-to-leader structure for "leader", inserting it into
 * the circular list that "old" belongs to (under the filedesc lock), or
 * starting a new singleton list when "old" is NULL.
 */
struct filedesc_to_leader *
filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
{
	struct filedesc_to_leader *fdtol;

	fdtol = malloc(sizeof(struct filedesc_to_leader),
	       M_FILEDESC_TO_LEADER,
	       M_WAITOK);
	fdtol->fdl_refcount = 1;
	fdtol->fdl_holdcount = 0;
	fdtol->fdl_wakeup = 0;
	fdtol->fdl_leader = leader;
	if (old != NULL) {
		/* Splice into the circular list right after "old". */
		FILEDESC_XLOCK(fdp);
		fdtol->fdl_next = old->fdl_next;
		fdtol->fdl_prev = old;
		old->fdl_next = fdtol;
		fdtol->fdl_next->fdl_prev = fdtol;
		FILEDESC_XUNLOCK(fdp);
	} else {
		/* First entry: a circular list of one. */
		fdtol->fdl_next = fdtol;
		fdtol->fdl_prev = fdtol;
	}
	return (fdtol);
}
2911
2912/*
2913 * Get file structures globally.
2914 */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	/*
	 * Size-probe pass: when the caller supplies no buffer, report an
	 * (over)estimate of the space required instead of the data itself.
	 */
	if (req->oldptr == NULL) {
		n = 0;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			if (p->p_state == PRS_NEW)
				continue;
			fdp = fdhold(p);
			if (fdp == NULL)
				continue;
			/* overestimates sparse tables. */
			if (fdp->fd_lastfile > 0)
				n += fdp->fd_lastfile;
			fddrop(fdp);
		}
		sx_sunlock(&allproc_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	/* Export pass: one struct xfile per open descriptor. */
	error = 0;
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		/* Skip processes the requesting thread may not inspect. */
		if (p_cansee(req->td, p) != 0) {
			PROC_UNLOCK(p);
			continue;
		}
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		PROC_UNLOCK(p);
		fdp = fdhold(p);
		if (fdp == NULL)
			continue;
		FILEDESC_SLOCK(fdp);
		/* fd_refcnt check guards against the table being torn down. */
		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			xf.xf_fd = n;
			xf.xf_file = fp;
			xf.xf_data = fp->f_data;
			xf.xf_vnode = fp->f_vnode;
			xf.xf_type = fp->f_type;
			xf.xf_count = fp->f_count;
			xf.xf_msgcount = 0;
			xf.xf_offset = foffset_get(fp);
			xf.xf_flag = fp->f_flag;
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_SUNLOCK(fdp);
		fddrop(fdp);
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}
2989
2990SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
2991    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2992
2993#ifdef KINFO_FILE_SIZE
2994CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
2995#endif
2996
2997static int
2998xlate_fflags(int fflags)
2999{
3000	static const struct {
3001		int	fflag;
3002		int	kf_fflag;
3003	} fflags_table[] = {
3004		{ FAPPEND, KF_FLAG_APPEND },
3005		{ FASYNC, KF_FLAG_ASYNC },
3006		{ FFSYNC, KF_FLAG_FSYNC },
3007		{ FHASLOCK, KF_FLAG_HASLOCK },
3008		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3009		{ FREAD, KF_FLAG_READ },
3010		{ FWRITE, KF_FLAG_WRITE },
3011		{ O_CREAT, KF_FLAG_CREAT },
3012		{ O_DIRECT, KF_FLAG_DIRECT },
3013		{ O_EXCL, KF_FLAG_EXCL },
3014		{ O_EXEC, KF_FLAG_EXEC },
3015		{ O_EXLOCK, KF_FLAG_EXLOCK },
3016		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3017		{ O_SHLOCK, KF_FLAG_SHLOCK },
3018		{ O_TRUNC, KF_FLAG_TRUNC }
3019	};
3020	unsigned int i;
3021	int kflags;
3022
3023	kflags = 0;
3024	for (i = 0; i < nitems(fflags_table); i++)
3025		if (fflags & fflags_table[i].fflag)
3026			kflags |=  fflags_table[i].kf_fflag;
3027	return (kflags);
3028}
3029
3030/* Trim unused data from kf_path by truncating the structure size. */
static void
pack_kinfo(struct kinfo_file *kif)
{

	/*
	 * Shrink kf_structsize to cover only the used portion of kf_path,
	 * keeping 64-bit alignment so the variable-size records that
	 * follow in the output stream stay aligned.
	 */
	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
	    strlen(kif->kf_path) + 1;
	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
}
3039
/*
 * Fill "kif" from file "fp" at descriptor index "fd".  "rightsp", when
 * non-NULL, supplies the capability rights to record.  May temporarily
 * drop the filedesc lock via fo_fill_kinfo().
 */
static void
export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
    struct kinfo_file *kif, struct filedesc *fdp)
{
	int error;

	bzero(kif, sizeof(*kif));

	/* Set a default type to allow for empty fill_kinfo() methods. */
	kif->kf_type = KF_TYPE_UNKNOWN;
	kif->kf_flags = xlate_fflags(fp->f_flag);
	if (rightsp != NULL)
		kif->kf_cap_rights = *rightsp;
	else
		cap_rights_init(&kif->kf_cap_rights);
	kif->kf_fd = fd;
	kif->kf_ref_count = fp->f_count;
	kif->kf_offset = foffset_get(fp);

	/*
	 * This may drop the filedesc lock, so the 'fp' cannot be
	 * accessed after this call.
	 */
	error = fo_fill_kinfo(fp, kif, fdp);
	if (error == 0)
		kif->kf_status |= KF_ATTR_VALID;
	pack_kinfo(kif);
}
3068
/*
 * Fill "kif" for a vnode not occupying a descriptor slot (cwd, root,
 * jail, trace, text or ctty; "fd" is then a KF_FD_TYPE_* constant).
 * Consumes the caller's vnode reference via vrele().
 */
static void
export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
    struct kinfo_file *kif)
{
	int error;

	bzero(kif, sizeof(*kif));

	kif->kf_type = KF_TYPE_VNODE;
	error = vn_fill_kinfo_vnode(vp, kif);
	if (error == 0)
		kif->kf_status |= KF_ATTR_VALID;
	kif->kf_flags = xlate_fflags(fflags);
	kif->kf_fd = fd;
	/* No descriptor backs this entry, so refcount/offset do not apply. */
	kif->kf_ref_count = -1;
	kif->kf_offset = -1;
	pack_kinfo(kif);
	vrele(vp);
}
3088
/* State carried through one kern_proc_filedesc_out() export run. */
struct export_fd_buf {
	struct filedesc		*fdp;		/* table being exported; may be NULL */
	struct sbuf 		*sb;		/* destination buffer */
	ssize_t			remainder;	/* bytes left to emit; -1 = unlimited */
	struct kinfo_file	kif;		/* scratch record */
};
3095
3096static int
3097export_kinfo_to_sb(struct export_fd_buf *efbuf)
3098{
3099	struct kinfo_file *kif;
3100
3101	kif = &efbuf->kif;
3102	if (efbuf->remainder != -1) {
3103		if (efbuf->remainder < kif->kf_structsize) {
3104			/* Terminate export. */
3105			efbuf->remainder = 0;
3106			return (0);
3107		}
3108		efbuf->remainder -= kif->kf_structsize;
3109	}
3110	return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
3111}
3112
/*
 * Export one descriptor-backed file.  Called with the filedesc lock
 * held; the lock is dropped around the copy-out and re-taken, so the
 * caller must revalidate its iteration state afterwards.
 */
static int
export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
    struct export_fd_buf *efbuf)
{
	int error;

	/* A zero remainder means a previous record exhausted the budget. */
	if (efbuf->remainder == 0)
		return (0);
	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp);
	FILEDESC_SUNLOCK(efbuf->fdp);
	error = export_kinfo_to_sb(efbuf);
	FILEDESC_SLOCK(efbuf->fdp);
	return (error);
}
3127
/*
 * Export one special vnode (cwd, root, jail, trace, text, ctty).  The
 * filedesc lock, when a table is attached, is dropped around the
 * copy-out just as in export_file_to_sb().
 */
static int
export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
    struct export_fd_buf *efbuf)
{
	int error;

	if (efbuf->remainder == 0)
		return (0);
	if (efbuf->fdp != NULL)
		FILEDESC_SUNLOCK(efbuf->fdp);
	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif);
	error = export_kinfo_to_sb(efbuf);
	if (efbuf->fdp != NULL)
		FILEDESC_SLOCK(efbuf->fdp);
	return (error);
}
3144
3145/*
3146 * Store a process file descriptor information to sbuf.
3147 *
3148 * Takes a locked proc as argument, and returns with the proc unlocked.
3149 */
int
kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
{
	struct file *fp;
	struct filedesc *fdp;
	struct export_fd_buf *efbuf;
	struct vnode *cttyvp, *textvp, *tracevp;
	int error, i;
	cap_rights_t rights;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * Grab references to the special vnodes while the proc is still
	 * locked; export_vnode_to_sb() consumes each reference later.
	 */
	/* ktrace vnode */
	tracevp = p->p_tracevp;
	if (tracevp != NULL)
		vref(tracevp);
	/* text vnode */
	textvp = p->p_textvp;
	if (textvp != NULL)
		vref(textvp);
	/* Controlling tty. */
	cttyvp = NULL;
	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
		if (cttyvp != NULL)
			vref(cttyvp);
	}
	fdp = fdhold(p);
	PROC_UNLOCK(p);
	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
	efbuf->fdp = NULL;
	efbuf->sb = sb;
	efbuf->remainder = maxlen;
	if (tracevp != NULL)
		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
		    efbuf);
	if (textvp != NULL)
		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
	if (cttyvp != NULL)
		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
		    efbuf);
	error = 0;
	if (fdp == NULL)
		goto fail;
	efbuf->fdp = fdp;
	FILEDESC_SLOCK(fdp);
	/* working directory */
	if (fdp->fd_cdir != NULL) {
		vref(fdp->fd_cdir);
		export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
	}
	/* root directory */
	if (fdp->fd_rdir != NULL) {
		vref(fdp->fd_rdir);
		export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
	}
	/* jail directory */
	if (fdp->fd_jdir != NULL) {
		vref(fdp->fd_jdir);
		export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
	}
	/* fd_refcnt check guards against the table being torn down. */
	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
			continue;
#ifdef CAPABILITIES
		rights = *cap_rights(fdp, i);
#else /* !CAPABILITIES */
		cap_rights_init(&rights);
#endif
		/*
		 * Create sysctl entry.  It is OK to drop the filedesc
		 * lock inside of export_file_to_sb() as we will
		 * re-validate and re-evaluate its properties when the
		 * loop continues.
		 */
		error = export_file_to_sb(fp, i, &rights, efbuf);
		if (error != 0 || efbuf->remainder == 0)
			break;
	}
	FILEDESC_SUNLOCK(fdp);
	fddrop(fdp);
fail:
	free(efbuf, M_TEMP);
	return (error);
}
3235
3236#define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
3237
3238/*
3239 * Get per-process file descriptors for use by procstat(1), et al.
3240 */
static int
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct proc *p;
	ssize_t maxlen;
	int error, error2, *name;

	/* name[0] is the pid being queried. */
	name = (int *)arg1;

	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
	if (error != 0) {
		sbuf_delete(&sb);
		return (error);
	}
	/* Size probe (oldptr == NULL) means export without a byte limit. */
	maxlen = req->oldptr != NULL ? req->oldlen : -1;
	/* kern_proc_filedesc_out() unlocks the proc returned by pget(). */
	error = kern_proc_filedesc_out(p, &sb, maxlen);
	error2 = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error != 0 ? error : error2);
}
3263
3264#ifdef KINFO_OFILE_SIZE
3265CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
3266#endif
3267
3268#ifdef COMPAT_FREEBSD7
/*
 * Convert a kinfo_file record into the FreeBSD 7 kinfo_ofile layout,
 * keeping only the flag bits the old structure defined.
 */
static void
kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
{

	okif->kf_structsize = sizeof(*okif);
	okif->kf_type = kif->kf_type;
	okif->kf_fd = kif->kf_fd;
	okif->kf_ref_count = kif->kf_ref_count;
	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
	okif->kf_offset = kif->kf_offset;
	okif->kf_vnode_type = kif->kf_vnode_type;
	okif->kf_sock_domain = kif->kf_sock_domain;
	okif->kf_sock_type = kif->kf_sock_type;
	okif->kf_sock_protocol = kif->kf_sock_protocol;
	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
	okif->kf_sa_local = kif->kf_sa_local;
	okif->kf_sa_peer = kif->kf_sa_peer;
}
3289
/*
 * Export one special vnode in the old (FreeBSD 7) format.  The filedesc
 * lock is dropped around export and copy-out, then re-taken.
 */
static int
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
    struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
{
	int error;

	/* export_vnode_to_kinfo() consumes this reference via vrele(). */
	vref(vp);
	FILEDESC_SUNLOCK(fdp);
	export_vnode_to_kinfo(vp, type, 0, kif);
	kinfo_to_okinfo(kif, okif);
	error = SYSCTL_OUT(req, okif, sizeof(*okif));
	FILEDESC_SLOCK(fdp);
	return (error);
}
3304
3305/*
3306 * Get per-process file descriptors for use by procstat(1), et al.
3307 */
3308static int
3309sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
3310{
3311	struct kinfo_ofile *okif;
3312	struct kinfo_file *kif;
3313	struct filedesc *fdp;
3314	int error, i, *name;
3315	struct file *fp;
3316	struct proc *p;
3317
3318	name = (int *)arg1;
3319	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3320	if (error != 0)
3321		return (error);
3322	fdp = fdhold(p);
3323	PROC_UNLOCK(p);
3324	if (fdp == NULL)
3325		return (ENOENT);
3326	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3327	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
3328	FILEDESC_SLOCK(fdp);
3329	if (fdp->fd_cdir != NULL)
3330		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
3331		    okif, fdp, req);
3332	if (fdp->fd_rdir != NULL)
3333		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
3334		    okif, fdp, req);
3335	if (fdp->fd_jdir != NULL)
3336		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
3337		    okif, fdp, req);
3338	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3339		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3340			continue;
3341		export_file_to_kinfo(fp, i, NULL, kif, fdp);
3342		FILEDESC_SUNLOCK(fdp);
3343		kinfo_to_okinfo(kif, okif);
3344		error = SYSCTL_OUT(req, okif, sizeof(*okif));
3345		FILEDESC_SLOCK(fdp);
3346		if (error)
3347			break;
3348	}
3349	FILEDESC_SUNLOCK(fdp);
3350	fddrop(fdp);
3351	free(kif, M_TEMP);
3352	free(okif, M_TEMP);
3353	return (0);
3354}
3355
3356static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
3357    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
3358    "Process ofiledesc entries");
3359#endif	/* COMPAT_FREEBSD7 */
3360
3361int
3362vntype_to_kinfo(int vtype)
3363{
3364	struct {
3365		int	vtype;
3366		int	kf_vtype;
3367	} vtypes_table[] = {
3368		{ VBAD, KF_VTYPE_VBAD },
3369		{ VBLK, KF_VTYPE_VBLK },
3370		{ VCHR, KF_VTYPE_VCHR },
3371		{ VDIR, KF_VTYPE_VDIR },
3372		{ VFIFO, KF_VTYPE_VFIFO },
3373		{ VLNK, KF_VTYPE_VLNK },
3374		{ VNON, KF_VTYPE_VNON },
3375		{ VREG, KF_VTYPE_VREG },
3376		{ VSOCK, KF_VTYPE_VSOCK }
3377	};
3378	unsigned int i;
3379
3380	/*
3381	 * Perform vtype translation.
3382	 */
3383	for (i = 0; i < nitems(vtypes_table); i++)
3384		if (vtypes_table[i].vtype == vtype)
3385			return (vtypes_table[i].kf_vtype);
3386
3387	return (KF_VTYPE_UNKNOWN);
3388}
3389
3390static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
3391    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
3392    "Process filedesc entries");
3393
3394#ifdef DDB
3395/*
3396 * For the purposes of debugging, generate a human-readable string for the
3397 * file type.
3398 */
3399static const char *
3400file_type_to_name(short type)
3401{
3402
3403	switch (type) {
3404	case 0:
3405		return ("zero");
3406	case DTYPE_VNODE:
3407		return ("vnod");
3408	case DTYPE_SOCKET:
3409		return ("sock");
3410	case DTYPE_PIPE:
3411		return ("pipe");
3412	case DTYPE_FIFO:
3413		return ("fifo");
3414	case DTYPE_KQUEUE:
3415		return ("kque");
3416	case DTYPE_CRYPTO:
3417		return ("crpt");
3418	case DTYPE_MQUEUE:
3419		return ("mque");
3420	case DTYPE_SHM:
3421		return ("shm");
3422	case DTYPE_SEM:
3423		return ("ksem");
3424	default:
3425		return ("unkn");
3426	}
3427}
3428
3429/*
3430 * For the purposes of debugging, identify a process (if any, perhaps one of
3431 * many) that references the passed file in its file descriptor array. Return
3432 * NULL if none.
3433 */
static struct proc *
file_to_first_proc(struct file *fp)
{
	struct filedesc *fdp;
	struct proc *p;
	int n;

	/* DDB context: the tables are scanned without any locking. */
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		fdp = p->p_fd;
		if (fdp == NULL)
			continue;
		for (n = 0; n <= fdp->fd_lastfile; n++) {
			if (fp == fdp->fd_ofiles[n].fde_file)
				return (p);
		}
	}
	return (NULL);
}
3454
/*
 * Print one struct file for DDB, optionally preceded by a column header.
 * The GCFl and MCount columns are always zero in this kernel version.
 */
static void
db_print_file(struct file *fp, int header)
{
	struct proc *p;

	if (header)
		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
		    "File", "Type", "Data", "Flag", "GCFl", "Count",
		    "MCount", "Vnode", "FPID", "FCmd");
	p = file_to_first_proc(fp);
	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
	    0, fp->f_count, 0, fp->f_vnode,
	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
}
3470
/* DDB "show file <addr>": dump the struct file at the given address. */
DB_SHOW_COMMAND(file, db_show_file)
{
	struct file *fp;

	if (!have_addr) {
		db_printf("usage: show file <addr>\n");
		return;
	}
	fp = (struct file *)addr;
	db_print_file(fp, 1);
}
3482
/* DDB "show files": dump every open file referenced by any process. */
DB_SHOW_COMMAND(files, db_show_files)
{
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int header;
	int n;

	/* Print the column header only once, before the first file. */
	header = 1;
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_state == PRS_NEW)
			continue;
		if ((fdp = p->p_fd) == NULL)
			continue;
		for (n = 0; n <= fdp->fd_lastfile; ++n) {
			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
				continue;
			db_print_file(fp, header);
			header = 0;
		}
	}
}
3505#endif
3506
3507SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3508    &maxfilesperproc, 0, "Maximum files allowed open per process");
3509
3510SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3511    &maxfiles, 0, "Maximum number of files");
3512
3513SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3514    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3515
3516/* ARGSUSED*/
static void
filelistinit(void *dummy)
{

	/*
	 * NOTE(review): UMA_ZONE_NOFREE keeps struct file memory from being
	 * returned to the VM; presumably this supports the lockless
	 * fget_unlocked() lookups — confirm before changing.
	 */
	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
}
3526SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3527
3528/*-------------------------------------------------------------------*/
3529
/* badfileops stub: reads and writes fail with EBADF. */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EBADF);
}
3537
/* badfileops stub: truncate fails with EINVAL. */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
3545
/* badfileops stub: ioctl fails with EBADF. */
static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3553
/* badfileops stub: poll reports no events ready. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
3561
/* fo_kqfilter stub for badfileops: kevent attach always fails with EBADF. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}
3568
/* fo_stat stub for badfileops: always fails with EBADF. */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3576
/* fo_close stub for badfileops: nothing to release; reports EBADF. */
static int
badfo_close(struct file *fp, struct thread *td)
{

	return (EBADF);
}
3583
/* fo_chmod stub for badfileops: always fails with EBADF. */
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3591
/* fo_chown stub for badfileops: always fails with EBADF. */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3599
/* fo_sendfile stub for badfileops: always fails with EBADF. */
static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct sendfile_sync *sfs, struct thread *td)
{

	return (EBADF);
}
3608
/*
 * fo_fill_kinfo stub for badfileops: succeeds without filling anything in,
 * so fd-table export does not fail on a bad entry.
 */
static int
badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	return (0);
}
3615
/*
 * Operations vector made entirely of the badfo_* stubs above: every
 * operation fails (EBADF mostly, EINVAL for truncate) except poll and
 * fill_kinfo, which report nothing and succeed.
 */
struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
	.fo_fill_kinfo = badfo_fill_kinfo,
};
3630
/*
 * Shared fo_read/fo_write implementation for file types that do not
 * support reading or writing: fails with EOPNOTSUPP.
 */
int
invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}
3638
/*
 * Shared fo_truncate implementation for file types that cannot be
 * truncated: fails with EINVAL.
 */
int
invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
3646
/*
 * Shared fo_ioctl implementation for file types with no ioctls:
 * fails with ENOTTY ("inappropriate ioctl for device").
 */
int
invfo_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (ENOTTY);
}
3654
/*
 * Shared fo_poll implementation for non-pollable file types; defers to
 * poll_no_poll() to compute the conventional event mask.
 */
int
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (poll_no_poll(events));
}
3662
/*
 * Shared fo_kqfilter implementation for file types that do not support
 * kevent attachment: fails with EINVAL.
 */
int
invfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EINVAL);
}
3669
/*
 * Shared fo_chmod implementation for file types without mode bits:
 * fails with EINVAL.
 */
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
3677
/*
 * Shared fo_chown implementation for file types without ownership:
 * fails with EINVAL.
 */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
3685
/*
 * Shared fo_sendfile implementation for file types that cannot be a
 * sendfile(2) source: fails with EINVAL.
 */
int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct sendfile_sync *sfs, struct thread *td)
{

	return (EINVAL);
}
3694
3695/*-------------------------------------------------------------------*/
3696
3697/*
3698 * File Descriptor pseudo-device driver (/dev/fd/).
3699 *
3700 * Opening minor device N dup()s the file (if any) connected to file
3701 * descriptor N belonging to the calling process.  Note that this driver
3702 * consists of only the ``open()'' routine, because all subsequent
3703 * references to this file will be direct to the other driver.
3704 *
3705 * XXX: we could give this one a cloning event handler if necessary.
3706 */
3707
/* ARGSUSED */
/*
 * d_open for /dev/fd/N: never actually opens anything; stashes N in the
 * calling thread and returns a sentinel error for open(2) to act on.
 */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * file descriptor being sought for duplication. The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open. Open will detect this special error and take the
	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}
3724
/*
 * Character-device switch for /dev/fd/N; only d_open is provided, since
 * a successful open is redirected to the underlying descriptor's driver.
 */
static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};
3730
/*
 * Create /dev/fd/0, /dev/fd/1 and /dev/fd/2 (world read/write), with the
 * conventional /dev/stdin, /dev/stdout and /dev/stderr aliases.
 */
static void
fildesc_drvinit(void *unused)
{
	struct cdev *dev;

	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}

SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
3748