kern_descrip.c revision 247736
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 247736 2013-03-03 23:23:35Z pjd $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_ddb.h"
43#include "opt_ktrace.h"
44#include "opt_procdesc.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48
49#include <sys/capability.h>
50#include <sys/conf.h>
51#include <sys/domain.h>
52#include <sys/fcntl.h>
53#include <sys/file.h>
54#include <sys/filedesc.h>
55#include <sys/filio.h>
56#include <sys/jail.h>
57#include <sys/kernel.h>
58#include <sys/limits.h>
59#include <sys/lock.h>
60#include <sys/malloc.h>
61#include <sys/mman.h>
62#include <sys/mount.h>
63#include <sys/mqueue.h>
64#include <sys/mutex.h>
65#include <sys/namei.h>
66#include <sys/selinfo.h>
67#include <sys/pipe.h>
68#include <sys/priv.h>
69#include <sys/proc.h>
70#include <sys/procdesc.h>
71#include <sys/protosw.h>
72#include <sys/racct.h>
73#include <sys/resourcevar.h>
74#include <sys/signalvar.h>
75#include <sys/socketvar.h>
76#include <sys/stat.h>
77#include <sys/sx.h>
78#include <sys/syscallsubr.h>
79#include <sys/sysctl.h>
80#include <sys/sysproto.h>
81#include <sys/tty.h>
82#include <sys/unistd.h>
83#include <sys/un.h>
84#include <sys/unpcb.h>
85#include <sys/user.h>
86#include <sys/vnode.h>
87#ifdef KTRACE
88#include <sys/ktrace.h>
89#endif
90
91#include <net/vnet.h>
92
93#include <netinet/in.h>
94#include <netinet/in_pcb.h>
95
96#include <security/audit/audit.h>
97
98#include <vm/uma.h>
99#include <vm/vm.h>
100
101#include <ddb/ddb.h>
102
103static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
104static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
105    "file desc to leader structures");
106static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
107
108MALLOC_DECLARE(M_FADVISE);
109
110static uma_zone_t file_zone;
111
112
113static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
114		    struct thread *td, int holdleaders);
115static int	fd_first_free(struct filedesc *fdp, int low, int size);
116static int	fd_last_used(struct filedesc *fdp, int size);
117static void	fdgrowtable(struct filedesc *fdp, int nfd);
118static void	fdunused(struct filedesc *fdp, int fd);
119static void	fdused(struct filedesc *fdp, int fd);
120static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
121static int	fill_procdesc_info(struct procdesc *pdp,
122		    struct kinfo_file *kif);
123static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
124static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
125static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
126static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
127
128/*
129 * Each process has:
130 *
131 * - An array of open file descriptors (fd_ofiles)
132 * - An array of file flags (fd_ofileflags)
133 * - A bitmap recording which descriptors are in use (fd_map)
134 *
135 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
137 * assumption that the majority of processes, especially short-lived
138 * processes like shells, will never need more.
139 *
140 * If this initial allocation is exhausted, a larger descriptor table and
141 * map are allocated dynamically, and the pointers in the process's struct
142 * filedesc are updated to point to those.  This is repeated every time
143 * the process runs out of file descriptors (provided it hasn't hit its
144 * resource limit).
145 *
146 * Since threads may hold references to individual descriptor table
147 * entries, the tables are never freed.  Instead, they are placed on a
148 * linked list and freed only when the struct filedesc is released.
149 */
/* Number of descriptor slots embedded in the initial allocation. */
#define	NDFILE		20
/* Size in bytes of one bitmap word. */
#define	NDSLOTSIZE	(sizeof(NDSLOTTYPE))
/* Number of descriptors tracked by one bitmap word. */
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
/* Index of the bitmap word that holds descriptor x. */
#define	NDSLOT(x)	((x) / NDENTRIES)
/* Single-bit mask for descriptor x within its bitmap word. */
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
/* Number of bitmap words needed to track x descriptors (rounded up). */
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
156
157/*
158 * SLIST entry used to keep track of ofiles which must be reclaimed when
159 * the process exits.
160 */
161struct freetable {
162	struct filedescent *ft_table;
163	SLIST_ENTRY(freetable) ft_next;
164};
165
166/*
167 * Initial allocation: a filedesc structure + the head of SLIST used to
168 * keep track of old ofiles + enough space for NDFILE descriptors.
169 */
170struct filedesc0 {
171	struct filedesc fd_fd;
172	SLIST_HEAD(, freetable) fd_free;
173	struct	filedescent fd_dfiles[NDFILE];
174	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
175};
176
177/*
178 * Descriptor management.
179 */
180volatile int openfiles;			/* actual number of open files */
181struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
182void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
183
184/* A mutex to protect the association between a proc and filedesc. */
185static struct mtx fdesc_mtx;
186
187/*
188 * If low >= size, just return low. Otherwise find the first zero bit in the
189 * given bitmap, starting at low and not exceeding size - 1. Return size if
190 * not found.
191 */
192static int
193fd_first_free(struct filedesc *fdp, int low, int size)
194{
195	NDSLOTTYPE *map = fdp->fd_map;
196	NDSLOTTYPE mask;
197	int off, maxoff;
198
199	if (low >= size)
200		return (low);
201
202	off = NDSLOT(low);
203	if (low % NDENTRIES) {
204		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
205		if ((mask &= ~map[off]) != 0UL)
206			return (off * NDENTRIES + ffsl(mask) - 1);
207		++off;
208	}
209	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
210		if (map[off] != ~0UL)
211			return (off * NDENTRIES + ffsl(~map[off]) - 1);
212	return (size);
213}
214
215/*
216 * Find the highest non-zero bit in the given bitmap, starting at 0 and
217 * not exceeding size - 1. Return -1 if not found.
218 */
219static int
220fd_last_used(struct filedesc *fdp, int size)
221{
222	NDSLOTTYPE *map = fdp->fd_map;
223	NDSLOTTYPE mask;
224	int off, minoff;
225
226	off = NDSLOT(size);
227	if (size % NDENTRIES) {
228		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
229		if ((mask &= map[off]) != 0)
230			return (off * NDENTRIES + flsl(mask) - 1);
231		--off;
232	}
233	for (minoff = NDSLOT(0); off >= minoff; --off)
234		if (map[off] != 0)
235			return (off * NDENTRIES + flsl(map[off]) - 1);
236	return (-1);
237}
238
239static int
240fdisused(struct filedesc *fdp, int fd)
241{
242
243	FILEDESC_LOCK_ASSERT(fdp);
244
245	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
246	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
247
248	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
249}
250
251/*
252 * Mark a file descriptor as used.
253 */
254static void
255fdused(struct filedesc *fdp, int fd)
256{
257
258	FILEDESC_XLOCK_ASSERT(fdp);
259
260	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
261
262	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
263	if (fd > fdp->fd_lastfile)
264		fdp->fd_lastfile = fd;
265	if (fd == fdp->fd_freefile)
266		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
267}
268
269/*
270 * Mark a file descriptor as unused.
271 */
272static void
273fdunused(struct filedesc *fdp, int fd)
274{
275
276	FILEDESC_XLOCK_ASSERT(fdp);
277
278	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
279	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
280	    ("fd=%d is still in use", fd));
281
282	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
283	if (fd < fdp->fd_freefile)
284		fdp->fd_freefile = fd;
285	if (fd == fdp->fd_lastfile)
286		fdp->fd_lastfile = fd_last_used(fdp, fd);
287}
288
289/*
290 * Free a file descriptor.
291 */
292static inline void
293fdfree(struct filedesc *fdp, int fd)
294{
295	struct filedescent *fde;
296
297	fde = &fdp->fd_ofiles[fd];
298	filecaps_free(&fde->fde_caps);
299	bzero(fde, sizeof(*fde));
300	fdunused(fdp, fd);
301}
302
303/*
304 * System calls on descriptors.
305 */
306#ifndef _SYS_SYSPROTO_H_
307struct getdtablesize_args {
308	int	dummy;
309};
310#endif
311/* ARGSUSED */
312int
313sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
314{
315	struct proc *p = td->td_proc;
316	uint64_t lim;
317
318	PROC_LOCK(p);
319	td->td_retval[0] =
320	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
321	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
322	PROC_UNLOCK(p);
323	if (lim < td->td_retval[0])
324		td->td_retval[0] = lim;
325	return (0);
326}
327
328/*
329 * Duplicate a file descriptor to a particular value.
330 *
331 * Note: keep in mind that a potential race condition exists when closing
332 * descriptors from a shared descriptor table (via rfork).
333 */
334#ifndef _SYS_SYSPROTO_H_
335struct dup2_args {
336	u_int	from;
337	u_int	to;
338};
339#endif
340/* ARGSUSED */
341int
342sys_dup2(struct thread *td, struct dup2_args *uap)
343{
344
345	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
346		    td->td_retval));
347}
348
349/*
350 * Duplicate a file descriptor.
351 */
352#ifndef _SYS_SYSPROTO_H_
353struct dup_args {
354	u_int	fd;
355};
356#endif
357/* ARGSUSED */
358int
359sys_dup(struct thread *td, struct dup_args *uap)
360{
361
362	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
363}
364
365/*
366 * The file control system call.
367 */
368#ifndef _SYS_SYSPROTO_H_
369struct fcntl_args {
370	int	fd;
371	int	cmd;
372	long	arg;
373};
374#endif
375/* ARGSUSED */
376int
377sys_fcntl(struct thread *td, struct fcntl_args *uap)
378{
379	struct flock fl;
380	struct __oflock ofl;
381	intptr_t arg;
382	int error;
383	int cmd;
384
385	error = 0;
386	cmd = uap->cmd;
387	switch (uap->cmd) {
388	case F_OGETLK:
389	case F_OSETLK:
390	case F_OSETLKW:
391		/*
392		 * Convert old flock structure to new.
393		 */
394		error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
395		fl.l_start = ofl.l_start;
396		fl.l_len = ofl.l_len;
397		fl.l_pid = ofl.l_pid;
398		fl.l_type = ofl.l_type;
399		fl.l_whence = ofl.l_whence;
400		fl.l_sysid = 0;
401
402		switch (uap->cmd) {
403		case F_OGETLK:
404		    cmd = F_GETLK;
405		    break;
406		case F_OSETLK:
407		    cmd = F_SETLK;
408		    break;
409		case F_OSETLKW:
410		    cmd = F_SETLKW;
411		    break;
412		}
413		arg = (intptr_t)&fl;
414		break;
415        case F_GETLK:
416        case F_SETLK:
417        case F_SETLKW:
418	case F_SETLK_REMOTE:
419                error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
420                arg = (intptr_t)&fl;
421                break;
422	default:
423		arg = uap->arg;
424		break;
425	}
426	if (error)
427		return (error);
428	error = kern_fcntl(td, uap->fd, cmd, arg);
429	if (error)
430		return (error);
431	if (uap->cmd == F_OGETLK) {
432		ofl.l_start = fl.l_start;
433		ofl.l_len = fl.l_len;
434		ofl.l_pid = fl.l_pid;
435		ofl.l_type = fl.l_type;
436		ofl.l_whence = fl.l_whence;
437		error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
438	} else if (uap->cmd == F_GETLK) {
439		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
440	}
441	return (error);
442}
443
444int
445kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
446{
447	struct filedesc *fdp;
448	struct flock *flp;
449	struct file *fp, *fp2;
450	struct filedescent *fde;
451	struct proc *p;
452	struct vnode *vp;
453	int error, flg, tmp;
454	u_int old, new;
455	uint64_t bsize;
456	off_t foffset;
457
458	error = 0;
459	flg = F_POSIX;
460	p = td->td_proc;
461	fdp = p->p_fd;
462
463	switch (cmd) {
464	case F_DUPFD:
465		tmp = arg;
466		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
467		break;
468
469	case F_DUPFD_CLOEXEC:
470		tmp = arg;
471		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
472		    td->td_retval);
473		break;
474
475	case F_DUP2FD:
476		tmp = arg;
477		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
478		break;
479
480	case F_DUP2FD_CLOEXEC:
481		tmp = arg;
482		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
483		    td->td_retval);
484		break;
485
486	case F_GETFD:
487		FILEDESC_SLOCK(fdp);
488		if ((fp = fget_locked(fdp, fd)) == NULL) {
489			FILEDESC_SUNLOCK(fdp);
490			error = EBADF;
491			break;
492		}
493		fde = &fdp->fd_ofiles[fd];
494		td->td_retval[0] =
495		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
496		FILEDESC_SUNLOCK(fdp);
497		break;
498
499	case F_SETFD:
500		FILEDESC_XLOCK(fdp);
501		if ((fp = fget_locked(fdp, fd)) == NULL) {
502			FILEDESC_XUNLOCK(fdp);
503			error = EBADF;
504			break;
505		}
506		fde = &fdp->fd_ofiles[fd];
507		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
508		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
509		FILEDESC_XUNLOCK(fdp);
510		break;
511
512	case F_GETFL:
513		error = fget_unlocked(fdp, fd, CAP_FCNTL, F_GETFL, &fp, NULL);
514		if (error != 0)
515			break;
516		td->td_retval[0] = OFLAGS(fp->f_flag);
517		fdrop(fp, td);
518		break;
519
520	case F_SETFL:
521		error = fget_unlocked(fdp, fd, CAP_FCNTL, F_SETFL, &fp, NULL);
522		if (error != 0)
523			break;
524		do {
525			tmp = flg = fp->f_flag;
526			tmp &= ~FCNTLFLAGS;
527			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
528		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
529		tmp = fp->f_flag & FNONBLOCK;
530		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
531		if (error != 0) {
532			fdrop(fp, td);
533			break;
534		}
535		tmp = fp->f_flag & FASYNC;
536		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
537		if (error == 0) {
538			fdrop(fp, td);
539			break;
540		}
541		atomic_clear_int(&fp->f_flag, FNONBLOCK);
542		tmp = 0;
543		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
544		fdrop(fp, td);
545		break;
546
547	case F_GETOWN:
548		error = fget_unlocked(fdp, fd, CAP_FCNTL, F_GETOWN, &fp, NULL);
549		if (error != 0)
550			break;
551		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
552		if (error == 0)
553			td->td_retval[0] = tmp;
554		fdrop(fp, td);
555		break;
556
557	case F_SETOWN:
558		error = fget_unlocked(fdp, fd, CAP_FCNTL, F_SETOWN, &fp, NULL);
559		if (error != 0)
560			break;
561		tmp = arg;
562		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
563		fdrop(fp, td);
564		break;
565
566	case F_SETLK_REMOTE:
567		error = priv_check(td, PRIV_NFS_LOCKD);
568		if (error)
569			return (error);
570		flg = F_REMOTE;
571		goto do_setlk;
572
573	case F_SETLKW:
574		flg |= F_WAIT;
575		/* FALLTHROUGH F_SETLK */
576
577	case F_SETLK:
578	do_setlk:
579		error = fget_unlocked(fdp, fd, CAP_FLOCK, 0, &fp, NULL);
580		if (error != 0)
581			break;
582		if (fp->f_type != DTYPE_VNODE) {
583			error = EBADF;
584			fdrop(fp, td);
585			break;
586		}
587
588		flp = (struct flock *)arg;
589		if (flp->l_whence == SEEK_CUR) {
590			foffset = foffset_get(fp);
591			if (foffset < 0 ||
592			    (flp->l_start > 0 &&
593			     foffset > OFF_MAX - flp->l_start)) {
594				FILEDESC_SUNLOCK(fdp);
595				error = EOVERFLOW;
596				fdrop(fp, td);
597				break;
598			}
599			flp->l_start += foffset;
600		}
601
602		vp = fp->f_vnode;
603		switch (flp->l_type) {
604		case F_RDLCK:
605			if ((fp->f_flag & FREAD) == 0) {
606				error = EBADF;
607				break;
608			}
609			PROC_LOCK(p->p_leader);
610			p->p_leader->p_flag |= P_ADVLOCK;
611			PROC_UNLOCK(p->p_leader);
612			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
613			    flp, flg);
614			break;
615		case F_WRLCK:
616			if ((fp->f_flag & FWRITE) == 0) {
617				error = EBADF;
618				break;
619			}
620			PROC_LOCK(p->p_leader);
621			p->p_leader->p_flag |= P_ADVLOCK;
622			PROC_UNLOCK(p->p_leader);
623			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
624			    flp, flg);
625			break;
626		case F_UNLCK:
627			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
628			    flp, flg);
629			break;
630		case F_UNLCKSYS:
631			/*
632			 * Temporary api for testing remote lock
633			 * infrastructure.
634			 */
635			if (flg != F_REMOTE) {
636				error = EINVAL;
637				break;
638			}
639			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
640			    F_UNLCKSYS, flp, flg);
641			break;
642		default:
643			error = EINVAL;
644			break;
645		}
646		if (error != 0 || flp->l_type == F_UNLCK ||
647		    flp->l_type == F_UNLCKSYS) {
648			fdrop(fp, td);
649			break;
650		}
651
652		/*
653		 * Check for a race with close.
654		 *
655		 * The vnode is now advisory locked (or unlocked, but this case
656		 * is not really important) as the caller requested.
657		 * We had to drop the filedesc lock, so we need to recheck if
658		 * the descriptor is still valid, because if it was closed
659		 * in the meantime we need to remove advisory lock from the
660		 * vnode - close on any descriptor leading to an advisory
661		 * locked vnode, removes that lock.
662		 * We will return 0 on purpose in that case, as the result of
663		 * successful advisory lock might have been externally visible
664		 * already. This is fine - effectively we pretend to the caller
665		 * that the closing thread was a bit slower and that the
666		 * advisory lock succeeded before the close.
667		 */
668		error = fget_unlocked(fdp, fd, 0, 0, &fp2, NULL);
669		if (error != 0) {
670			fdrop(fp, td);
671			break;
672		}
673		if (fp != fp2) {
674			flp->l_whence = SEEK_SET;
675			flp->l_start = 0;
676			flp->l_len = 0;
677			flp->l_type = F_UNLCK;
678			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
679			    F_UNLCK, flp, F_POSIX);
680		}
681		fdrop(fp, td);
682		fdrop(fp2, td);
683		break;
684
685	case F_GETLK:
686		error = fget_unlocked(fdp, fd, CAP_FLOCK, 0, &fp, NULL);
687		if (error != 0)
688			break;
689		if (fp->f_type != DTYPE_VNODE) {
690			error = EBADF;
691			fdrop(fp, td);
692			break;
693		}
694		flp = (struct flock *)arg;
695		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
696		    flp->l_type != F_UNLCK) {
697			error = EINVAL;
698			fdrop(fp, td);
699			break;
700		}
701		if (flp->l_whence == SEEK_CUR) {
702			foffset = foffset_get(fp);
703			if ((flp->l_start > 0 &&
704			    foffset > OFF_MAX - flp->l_start) ||
705			    (flp->l_start < 0 &&
706			     foffset < OFF_MIN - flp->l_start)) {
707				FILEDESC_SUNLOCK(fdp);
708				error = EOVERFLOW;
709				fdrop(fp, td);
710				break;
711			}
712			flp->l_start += foffset;
713		}
714		vp = fp->f_vnode;
715		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
716		    F_POSIX);
717		fdrop(fp, td);
718		break;
719
720	case F_RDAHEAD:
721		arg = arg ? 128 * 1024: 0;
722		/* FALLTHROUGH */
723	case F_READAHEAD:
724		error = fget_unlocked(fdp, fd, 0, 0, &fp, NULL);
725		if (error != 0)
726			break;
727		if (fp->f_type != DTYPE_VNODE) {
728			fdrop(fp, td);
729			error = EBADF;
730			break;
731		}
732		if (arg >= 0) {
733			vp = fp->f_vnode;
734			error = vn_lock(vp, LK_SHARED);
735			if (error != 0) {
736				fdrop(fp, td);
737				break;
738			}
739			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
740			VOP_UNLOCK(vp, 0);
741			fp->f_seqcount = (arg + bsize - 1) / bsize;
742			do {
743				new = old = fp->f_flag;
744				new |= FRDAHEAD;
745			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
746		} else {
747			do {
748				new = old = fp->f_flag;
749				new &= ~FRDAHEAD;
750			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
751		}
752		fdrop(fp, td);
753		break;
754
755	default:
756		error = EINVAL;
757		break;
758	}
759	return (error);
760}
761
762/*
763 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
764 */
765int
766do_dup(struct thread *td, int flags, int old, int new,
767    register_t *retval)
768{
769	struct filedesc *fdp;
770	struct filedescent *oldfde, *newfde;
771	struct proc *p;
772	struct file *fp;
773	struct file *delfp;
774	int error, maxfd;
775
776	p = td->td_proc;
777	fdp = p->p_fd;
778
779	/*
780	 * Verify we have a valid descriptor to dup from and possibly to
781	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
782	 * return EINVAL when the new descriptor is out of bounds.
783	 */
784	if (old < 0)
785		return (EBADF);
786	if (new < 0)
787		return (flags & DUP_FCNTL ? EINVAL : EBADF);
788	PROC_LOCK(p);
789	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
790	PROC_UNLOCK(p);
791	if (new >= maxfd)
792		return (flags & DUP_FCNTL ? EINVAL : EBADF);
793
794	FILEDESC_XLOCK(fdp);
795	if (fget_locked(fdp, old) == NULL) {
796		FILEDESC_XUNLOCK(fdp);
797		return (EBADF);
798	}
799	oldfde = &fdp->fd_ofiles[old];
800	if (flags & DUP_FIXED && old == new) {
801		*retval = new;
802		if (flags & DUP_CLOEXEC)
803			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
804		FILEDESC_XUNLOCK(fdp);
805		return (0);
806	}
807	fp = oldfde->fde_file;
808	fhold(fp);
809
810	/*
811	 * If the caller specified a file descriptor, make sure the file
812	 * table is large enough to hold it, and grab it.  Otherwise, just
813	 * allocate a new descriptor the usual way.
814	 */
815	if (flags & DUP_FIXED) {
816		if (new >= fdp->fd_nfiles) {
817			/*
818			 * The resource limits are here instead of e.g.
819			 * fdalloc(), because the file descriptor table may be
820			 * shared between processes, so we can't really use
821			 * racct_add()/racct_sub().  Instead of counting the
822			 * number of actually allocated descriptors, just put
823			 * the limit on the size of the file descriptor table.
824			 */
825#ifdef RACCT
826			PROC_LOCK(p);
827			error = racct_set(p, RACCT_NOFILE, new + 1);
828			PROC_UNLOCK(p);
829			if (error != 0) {
830				FILEDESC_XUNLOCK(fdp);
831				fdrop(fp, td);
832				return (EMFILE);
833			}
834#endif
835			fdgrowtable(fdp, new + 1);
836			oldfde = &fdp->fd_ofiles[old];
837		}
838		newfde = &fdp->fd_ofiles[new];
839		if (newfde->fde_file == NULL)
840			fdused(fdp, new);
841	} else {
842		if ((error = fdalloc(td, new, &new)) != 0) {
843			FILEDESC_XUNLOCK(fdp);
844			fdrop(fp, td);
845			return (error);
846		}
847		newfde = &fdp->fd_ofiles[new];
848	}
849
850	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
851	KASSERT(old != new, ("new fd is same as old"));
852
853	delfp = newfde->fde_file;
854
855	/*
856	 * Duplicate the source descriptor.
857	 */
858	*newfde = *oldfde;
859	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
860	if ((flags & DUP_CLOEXEC) != 0)
861		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
862	else
863		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
864	if (new > fdp->fd_lastfile)
865		fdp->fd_lastfile = new;
866	*retval = new;
867
868	if (delfp != NULL) {
869		(void) closefp(fdp, new, delfp, td, 1);
870		/* closefp() drops the FILEDESC lock for us. */
871	} else {
872		FILEDESC_XUNLOCK(fdp);
873	}
874
875	return (0);
876}
877
878/*
879 * If sigio is on the list associated with a process or process group,
880 * disable signalling from the device, remove sigio from the list and
881 * free sigio.
882 */
883void
884funsetown(struct sigio **sigiop)
885{
886	struct sigio *sigio;
887
888	SIGIO_LOCK();
889	sigio = *sigiop;
890	if (sigio == NULL) {
891		SIGIO_UNLOCK();
892		return;
893	}
894	*(sigio->sio_myref) = NULL;
895	if ((sigio)->sio_pgid < 0) {
896		struct pgrp *pg = (sigio)->sio_pgrp;
897		PGRP_LOCK(pg);
898		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
899			     sigio, sio_pgsigio);
900		PGRP_UNLOCK(pg);
901	} else {
902		struct proc *p = (sigio)->sio_proc;
903		PROC_LOCK(p);
904		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
905			     sigio, sio_pgsigio);
906		PROC_UNLOCK(p);
907	}
908	SIGIO_UNLOCK();
909	crfree(sigio->sio_ucred);
910	free(sigio, M_SIGIO);
911}
912
913/*
914 * Free a list of sigio structures.
915 * We only need to lock the SIGIO_LOCK because we have made ourselves
916 * inaccessible to callers of fsetown and therefore do not need to lock
917 * the proc or pgrp struct for the list manipulation.
918 */
919void
920funsetownlst(struct sigiolst *sigiolst)
921{
922	struct proc *p;
923	struct pgrp *pg;
924	struct sigio *sigio;
925
926	sigio = SLIST_FIRST(sigiolst);
927	if (sigio == NULL)
928		return;
929	p = NULL;
930	pg = NULL;
931
932	/*
933	 * Every entry of the list should belong
934	 * to a single proc or pgrp.
935	 */
936	if (sigio->sio_pgid < 0) {
937		pg = sigio->sio_pgrp;
938		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
939	} else /* if (sigio->sio_pgid > 0) */ {
940		p = sigio->sio_proc;
941		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
942	}
943
944	SIGIO_LOCK();
945	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
946		*(sigio->sio_myref) = NULL;
947		if (pg != NULL) {
948			KASSERT(sigio->sio_pgid < 0,
949			    ("Proc sigio in pgrp sigio list"));
950			KASSERT(sigio->sio_pgrp == pg,
951			    ("Bogus pgrp in sigio list"));
952			PGRP_LOCK(pg);
953			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
954			    sio_pgsigio);
955			PGRP_UNLOCK(pg);
956		} else /* if (p != NULL) */ {
957			KASSERT(sigio->sio_pgid > 0,
958			    ("Pgrp sigio in proc sigio list"));
959			KASSERT(sigio->sio_proc == p,
960			    ("Bogus proc in sigio list"));
961			PROC_LOCK(p);
962			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
963			    sio_pgsigio);
964			PROC_UNLOCK(p);
965		}
966		SIGIO_UNLOCK();
967		crfree(sigio->sio_ucred);
968		free(sigio, M_SIGIO);
969		SIGIO_LOCK();
970	}
971	SIGIO_UNLOCK();
972}
973
974/*
975 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
976 *
977 * After permission checking, add a sigio structure to the sigio list for
978 * the process or process group.
979 */
980int
981fsetown(pid_t pgid, struct sigio **sigiop)
982{
983	struct proc *proc;
984	struct pgrp *pgrp;
985	struct sigio *sigio;
986	int ret;
987
988	if (pgid == 0) {
989		funsetown(sigiop);
990		return (0);
991	}
992
993	ret = 0;
994
995	/* Allocate and fill in the new sigio out of locks. */
996	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
997	sigio->sio_pgid = pgid;
998	sigio->sio_ucred = crhold(curthread->td_ucred);
999	sigio->sio_myref = sigiop;
1000
1001	sx_slock(&proctree_lock);
1002	if (pgid > 0) {
1003		proc = pfind(pgid);
1004		if (proc == NULL) {
1005			ret = ESRCH;
1006			goto fail;
1007		}
1008
1009		/*
1010		 * Policy - Don't allow a process to FSETOWN a process
1011		 * in another session.
1012		 *
1013		 * Remove this test to allow maximum flexibility or
1014		 * restrict FSETOWN to the current process or process
1015		 * group for maximum safety.
1016		 */
1017		PROC_UNLOCK(proc);
1018		if (proc->p_session != curthread->td_proc->p_session) {
1019			ret = EPERM;
1020			goto fail;
1021		}
1022
1023		pgrp = NULL;
1024	} else /* if (pgid < 0) */ {
1025		pgrp = pgfind(-pgid);
1026		if (pgrp == NULL) {
1027			ret = ESRCH;
1028			goto fail;
1029		}
1030		PGRP_UNLOCK(pgrp);
1031
1032		/*
1033		 * Policy - Don't allow a process to FSETOWN a process
1034		 * in another session.
1035		 *
1036		 * Remove this test to allow maximum flexibility or
1037		 * restrict FSETOWN to the current process or process
1038		 * group for maximum safety.
1039		 */
1040		if (pgrp->pg_session != curthread->td_proc->p_session) {
1041			ret = EPERM;
1042			goto fail;
1043		}
1044
1045		proc = NULL;
1046	}
1047	funsetown(sigiop);
1048	if (pgid > 0) {
1049		PROC_LOCK(proc);
1050		/*
1051		 * Since funsetownlst() is called without the proctree
1052		 * locked, we need to check for P_WEXIT.
1053		 * XXX: is ESRCH correct?
1054		 */
1055		if ((proc->p_flag & P_WEXIT) != 0) {
1056			PROC_UNLOCK(proc);
1057			ret = ESRCH;
1058			goto fail;
1059		}
1060		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
1061		sigio->sio_proc = proc;
1062		PROC_UNLOCK(proc);
1063	} else {
1064		PGRP_LOCK(pgrp);
1065		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
1066		sigio->sio_pgrp = pgrp;
1067		PGRP_UNLOCK(pgrp);
1068	}
1069	sx_sunlock(&proctree_lock);
1070	SIGIO_LOCK();
1071	*sigiop = sigio;
1072	SIGIO_UNLOCK();
1073	return (0);
1074
1075fail:
1076	sx_sunlock(&proctree_lock);
1077	crfree(sigio->sio_ucred);
1078	free(sigio, M_SIGIO);
1079	return (ret);
1080}
1081
1082/*
1083 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1084 */
1085pid_t
1086fgetown(sigiop)
1087	struct sigio **sigiop;
1088{
1089	pid_t pgid;
1090
1091	SIGIO_LOCK();
1092	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1093	SIGIO_UNLOCK();
1094	return (pgid);
1095}
1096
1097/*
1098 * Function drops the filedesc lock on return.
1099 */
1100static int
1101closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1102    int holdleaders)
1103{
1104	int error;
1105
1106	FILEDESC_XLOCK_ASSERT(fdp);
1107
1108	if (holdleaders) {
1109		if (td->td_proc->p_fdtol != NULL) {
1110			/*
1111			 * Ask fdfree() to sleep to ensure that all relevant
1112			 * process leaders can be traversed in closef().
1113			 */
1114			fdp->fd_holdleaderscount++;
1115		} else {
1116			holdleaders = 0;
1117		}
1118	}
1119
1120	/*
1121	 * We now hold the fp reference that used to be owned by the
1122	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1123	 * knote_fdclose to prevent a race of the fd getting opened, a knote
1124	 * added, and deleteing a knote for the new fd.
1125	 */
1126	knote_fdclose(td, fd);
1127
1128	/*
1129	 * We need to notify mqueue if the object is of type mqueue.
1130	 */
1131	if (fp->f_type == DTYPE_MQUEUE)
1132		mq_fdclose(td, fd, fp);
1133	FILEDESC_XUNLOCK(fdp);
1134
1135	error = closef(fp, td);
1136	if (holdleaders) {
1137		FILEDESC_XLOCK(fdp);
1138		fdp->fd_holdleaderscount--;
1139		if (fdp->fd_holdleaderscount == 0 &&
1140		    fdp->fd_holdleaderswakeup != 0) {
1141			fdp->fd_holdleaderswakeup = 0;
1142			wakeup(&fdp->fd_holdleaderscount);
1143		}
1144		FILEDESC_XUNLOCK(fdp);
1145	}
1146	return (error);
1147}
1148
/*
 * close system call: close a file descriptor.  All work is done by
 * kern_close(), whose result is returned.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int     fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}
1166
1167int
1168kern_close(td, fd)
1169	struct thread *td;
1170	int fd;
1171{
1172	struct filedesc *fdp;
1173	struct file *fp;
1174
1175	fdp = td->td_proc->p_fd;
1176
1177	AUDIT_SYSCLOSE(td, fd);
1178
1179	FILEDESC_XLOCK(fdp);
1180	if ((fp = fget_locked(fdp, fd)) == NULL) {
1181		FILEDESC_XUNLOCK(fdp);
1182		return (EBADF);
1183	}
1184	fdfree(fdp, fd);
1185
1186	/* closefp() drops the FILEDESC lock for us. */
1187	return (closefp(fdp, fd, fp, td, 1));
1188}
1189
1190/*
1191 * Close open file descriptors.
1192 */
1193#ifndef _SYS_SYSPROTO_H_
1194struct closefrom_args {
1195	int	lowfd;
1196};
1197#endif
1198/* ARGSUSED */
1199int
1200sys_closefrom(struct thread *td, struct closefrom_args *uap)
1201{
1202	struct filedesc *fdp;
1203	int fd;
1204
1205	fdp = td->td_proc->p_fd;
1206	AUDIT_ARG_FD(uap->lowfd);
1207
1208	/*
1209	 * Treat negative starting file descriptor values identical to
1210	 * closefrom(0) which closes all files.
1211	 */
1212	if (uap->lowfd < 0)
1213		uap->lowfd = 0;
1214	FILEDESC_SLOCK(fdp);
1215	for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
1216		if (fdp->fd_ofiles[fd].fde_file != NULL) {
1217			FILEDESC_SUNLOCK(fdp);
1218			(void)kern_close(td, fd);
1219			FILEDESC_SLOCK(fdp);
1220		}
1221	}
1222	FILEDESC_SUNLOCK(fdp);
1223	return (0);
1224}
1225
1226#if defined(COMPAT_43)
1227/*
1228 * Return status information about a file descriptor.
1229 */
1230#ifndef _SYS_SYSPROTO_H_
1231struct ofstat_args {
1232	int	fd;
1233	struct	ostat *sb;
1234};
1235#endif
1236/* ARGSUSED */
1237int
1238ofstat(struct thread *td, struct ofstat_args *uap)
1239{
1240	struct ostat oub;
1241	struct stat ub;
1242	int error;
1243
1244	error = kern_fstat(td, uap->fd, &ub);
1245	if (error == 0) {
1246		cvtstat(&ub, &oub);
1247		error = copyout(&oub, uap->sb, sizeof(oub));
1248	}
1249	return (error);
1250}
1251#endif /* COMPAT_43 */
1252
1253/*
1254 * Return status information about a file descriptor.
1255 */
1256#ifndef _SYS_SYSPROTO_H_
1257struct fstat_args {
1258	int	fd;
1259	struct	stat *sb;
1260};
1261#endif
1262/* ARGSUSED */
1263int
1264sys_fstat(struct thread *td, struct fstat_args *uap)
1265{
1266	struct stat ub;
1267	int error;
1268
1269	error = kern_fstat(td, uap->fd, &ub);
1270	if (error == 0)
1271		error = copyout(&ub, uap->sb, sizeof(ub));
1272	return (error);
1273}
1274
1275int
1276kern_fstat(struct thread *td, int fd, struct stat *sbp)
1277{
1278	struct file *fp;
1279	int error;
1280
1281	AUDIT_ARG_FD(fd);
1282
1283	if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0)
1284		return (error);
1285
1286	AUDIT_ARG_FILE(td->td_proc, fp);
1287
1288	error = fo_stat(fp, sbp, td->td_ucred, td);
1289	fdrop(fp, td);
1290#ifdef KTRACE
1291	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1292		ktrstat(sbp);
1293#endif
1294	return (error);
1295}
1296
1297/*
1298 * Return status information about a file descriptor.
1299 */
1300#ifndef _SYS_SYSPROTO_H_
1301struct nfstat_args {
1302	int	fd;
1303	struct	nstat *sb;
1304};
1305#endif
1306/* ARGSUSED */
1307int
1308sys_nfstat(struct thread *td, struct nfstat_args *uap)
1309{
1310	struct nstat nub;
1311	struct stat ub;
1312	int error;
1313
1314	error = kern_fstat(td, uap->fd, &ub);
1315	if (error == 0) {
1316		cvtnstat(&ub, &nub);
1317		error = copyout(&nub, uap->sb, sizeof(nub));
1318	}
1319	return (error);
1320}
1321
1322/*
1323 * Return pathconf information about a file descriptor.
1324 */
1325#ifndef _SYS_SYSPROTO_H_
1326struct fpathconf_args {
1327	int	fd;
1328	int	name;
1329};
1330#endif
1331/* ARGSUSED */
1332int
1333sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1334{
1335	struct file *fp;
1336	struct vnode *vp;
1337	int error;
1338
1339	if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0)
1340		return (error);
1341
1342	/* If asynchronous I/O is available, it works for all descriptors. */
1343	if (uap->name == _PC_ASYNC_IO) {
1344		td->td_retval[0] = async_io_version;
1345		goto out;
1346	}
1347	vp = fp->f_vnode;
1348	if (vp != NULL) {
1349		vn_lock(vp, LK_SHARED | LK_RETRY);
1350		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1351		VOP_UNLOCK(vp, 0);
1352	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1353		if (uap->name != _PC_PIPE_BUF) {
1354			error = EINVAL;
1355		} else {
1356			td->td_retval[0] = PIPE_BUF;
1357			error = 0;
1358		}
1359	} else {
1360		error = EOPNOTSUPP;
1361	}
1362out:
1363	fdrop(fp, td);
1364	return (error);
1365}
1366
1367/*
1368 * Initialize filecaps structure.
1369 */
1370void
1371filecaps_init(struct filecaps *fcaps)
1372{
1373
1374	bzero(fcaps, sizeof(*fcaps));
1375	fcaps->fc_nioctls = -1;
1376}
1377
1378/*
1379 * Copy filecaps structure allocating memory for ioctls array if needed.
1380 */
1381void
1382filecaps_copy(const struct filecaps *src, struct filecaps *dst)
1383{
1384	size_t size;
1385
1386	*dst = *src;
1387	if (src->fc_ioctls != NULL) {
1388		KASSERT(src->fc_nioctls > 0,
1389		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1390
1391		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1392		dst->fc_ioctls = malloc(size, M_TEMP, M_WAITOK);
1393		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1394	}
1395}
1396
1397/*
1398 * Move filecaps structure to the new place and clear the old place.
1399 */
1400void
1401filecaps_move(struct filecaps *src, struct filecaps *dst)
1402{
1403
1404	*dst = *src;
1405	bzero(src, sizeof(*src));
1406}
1407
1408/*
1409 * Fill the given filecaps structure with full rights.
1410 */
1411static void
1412filecaps_fill(struct filecaps *fcaps)
1413{
1414
1415	fcaps->fc_rights = CAP_ALL;
1416	fcaps->fc_ioctls = NULL;
1417	fcaps->fc_nioctls = -1;
1418	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1419}
1420
1421/*
1422 * Free memory allocated within filecaps structure.
1423 */
1424void
1425filecaps_free(struct filecaps *fcaps)
1426{
1427
1428	free(fcaps->fc_ioctls, M_TEMP);
1429	bzero(fcaps, sizeof(*fcaps));
1430}
1431
1432/*
1433 * Validate the given filecaps structure.
1434 */
1435static void
1436filecaps_validate(const struct filecaps *fcaps, const char *func)
1437{
1438
1439	KASSERT((fcaps->fc_rights & ~CAP_MASK_VALID) == 0,
1440	    ("%s: invalid rights", func));
1441	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1442	    ("%s: invalid fcntls", func));
1443	KASSERT(fcaps->fc_fcntls == 0 || (fcaps->fc_rights & CAP_FCNTL) != 0,
1444	    ("%s: fcntls without CAP_FCNTL", func));
1445	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1446	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1447	    ("%s: invalid ioctls", func));
1448	KASSERT(fcaps->fc_nioctls == 0 || (fcaps->fc_rights & CAP_IOCTL) != 0,
1449	    ("%s: ioctls without CAP_IOCTL", func));
1450}
1451
1452/*
1453 * Grow the file table to accomodate (at least) nfd descriptors.
1454 */
1455static void
1456fdgrowtable(struct filedesc *fdp, int nfd)
1457{
1458	struct filedesc0 *fdp0;
1459	struct freetable *ft;
1460	struct filedescent *ntable;
1461	struct filedescent *otable;
1462	int nnfiles, onfiles;
1463	NDSLOTTYPE *nmap, *omap;
1464
1465	FILEDESC_XLOCK_ASSERT(fdp);
1466
1467	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
1468
1469	/* save old values */
1470	onfiles = fdp->fd_nfiles;
1471	otable = fdp->fd_ofiles;
1472	omap = fdp->fd_map;
1473
1474	/* compute the size of the new table */
1475	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1476	if (nnfiles <= onfiles)
1477		/* the table is already large enough */
1478		return;
1479
1480	/*
1481	 * Allocate a new table and map.  We need enough space for the
1482	 * file entries themselves and the struct freetable we will use
1483	 * when we decommission the table and place it on the freelist.
1484	 * We place the struct freetable in the middle so we don't have
1485	 * to worry about padding.
1486	 */
1487	ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
1488	    M_FILEDESC, M_ZERO | M_WAITOK);
1489	nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
1490	    M_ZERO | M_WAITOK);
1491
1492	/* copy the old data over and point at the new tables */
1493	memcpy(ntable, otable, onfiles * sizeof(*otable));
1494	memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
1495
1496	/* update the pointers and counters */
1497	fdp->fd_nfiles = nnfiles;
1498	memcpy(ntable, otable, onfiles * sizeof(ntable[0]));
1499	fdp->fd_ofiles = ntable;
1500	fdp->fd_map = nmap;
1501
1502	/*
1503	 * Do not free the old file table, as some threads may still
1504	 * reference entries within it.  Instead, place it on a freelist
1505	 * which will be processed when the struct filedesc is released.
1506	 *
1507	 * Do, however, free the old map.
1508	 *
1509	 * Note that if onfiles == NDFILE, we're dealing with the original
1510	 * static allocation contained within (struct filedesc0 *)fdp,
1511	 * which must not be freed.
1512	 */
1513	if (onfiles > NDFILE) {
1514		ft = (struct freetable *)&otable[onfiles];
1515		fdp0 = (struct filedesc0 *)fdp;
1516		ft->ft_table = otable;
1517		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
1518		free(omap, M_FILEDESC);
1519	}
1520}
1521
1522/*
1523 * Allocate a file descriptor for the process.
1524 */
1525int
1526fdalloc(struct thread *td, int minfd, int *result)
1527{
1528	struct proc *p = td->td_proc;
1529	struct filedesc *fdp = p->p_fd;
1530	int fd = -1, maxfd, allocfd;
1531#ifdef RACCT
1532	int error;
1533#endif
1534
1535	FILEDESC_XLOCK_ASSERT(fdp);
1536
1537	if (fdp->fd_freefile > minfd)
1538		minfd = fdp->fd_freefile;
1539
1540	PROC_LOCK(p);
1541	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1542	PROC_UNLOCK(p);
1543
1544	/*
1545	 * Search the bitmap for a free descriptor starting at minfd.
1546	 * If none is found, grow the file table.
1547	 */
1548	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1549	if (fd >= maxfd)
1550		return (EMFILE);
1551	if (fd >= fdp->fd_nfiles) {
1552		allocfd = min(fd * 2, maxfd);
1553#ifdef RACCT
1554		PROC_LOCK(p);
1555		error = racct_set(p, RACCT_NOFILE, allocfd);
1556		PROC_UNLOCK(p);
1557		if (error != 0)
1558			return (EMFILE);
1559#endif
1560		/*
1561		 * fd is already equal to first free descriptor >= minfd, so
1562		 * we only need to grow the table and we are done.
1563		 */
1564		fdgrowtable(fdp, allocfd);
1565	}
1566
1567	/*
1568	 * Perform some sanity checks, then mark the file descriptor as
1569	 * used and return it to the caller.
1570	 */
1571	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
1572	    ("invalid descriptor %d", fd));
1573	KASSERT(!fdisused(fdp, fd),
1574	    ("fd_first_free() returned non-free descriptor"));
1575	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
1576	    ("file descriptor isn't free"));
1577	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
1578	fdused(fdp, fd);
1579	*result = fd;
1580	return (0);
1581}
1582
1583/*
1584 * Check to see whether n user file descriptors are available to the process
1585 * p.
1586 */
1587int
1588fdavail(struct thread *td, int n)
1589{
1590	struct proc *p = td->td_proc;
1591	struct filedesc *fdp = td->td_proc->p_fd;
1592	int i, lim, last;
1593
1594	FILEDESC_LOCK_ASSERT(fdp);
1595
1596	/*
1597	 * XXX: This is only called from uipc_usrreq.c:unp_externalize();
1598	 *      call racct_add() from there instead of dealing with containers
1599	 *      here.
1600	 */
1601	PROC_LOCK(p);
1602	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1603	PROC_UNLOCK(p);
1604	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1605		return (1);
1606	last = min(fdp->fd_nfiles, lim);
1607	for (i = fdp->fd_freefile; i < last; i++) {
1608		if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
1609			return (1);
1610	}
1611	return (0);
1612}
1613
1614/*
1615 * Create a new open file structure and allocate a file decriptor for the
1616 * process that refers to it.  We add one reference to the file for the
1617 * descriptor table and one reference for resultfp. This is to prevent us
1618 * being preempted and the entry in the descriptor table closed after we
1619 * release the FILEDESC lock.
1620 */
1621int
1622falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1623{
1624	struct file *fp;
1625	int error, fd;
1626
1627	error = falloc_noinstall(td, &fp);
1628	if (error)
1629		return (error);		/* no reference held on error */
1630
1631	error = finstall(td, fp, &fd, flags, NULL);
1632	if (error) {
1633		fdrop(fp, td);		/* one reference (fp only) */
1634		return (error);
1635	}
1636
1637	if (resultfp != NULL)
1638		*resultfp = fp;		/* copy out result */
1639	else
1640		fdrop(fp, td);		/* release local reference */
1641
1642	if (resultfd != NULL)
1643		*resultfd = fd;
1644
1645	return (0);
1646}
1647
1648/*
1649 * Create a new open file structure without allocating a file descriptor.
1650 */
1651int
1652falloc_noinstall(struct thread *td, struct file **resultfp)
1653{
1654	struct file *fp;
1655	int maxuserfiles = maxfiles - (maxfiles / 20);
1656	static struct timeval lastfail;
1657	static int curfail;
1658
1659	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1660
1661	if ((openfiles >= maxuserfiles &&
1662	    priv_check(td, PRIV_MAXFILES) != 0) ||
1663	    openfiles >= maxfiles) {
1664		if (ppsratecheck(&lastfail, &curfail, 1)) {
1665			printf("kern.maxfiles limit exceeded by uid %i, "
1666			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1667		}
1668		return (ENFILE);
1669	}
1670	atomic_add_int(&openfiles, 1);
1671	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1672	refcount_init(&fp->f_count, 1);
1673	fp->f_cred = crhold(td->td_ucred);
1674	fp->f_ops = &badfileops;
1675	fp->f_data = NULL;
1676	fp->f_vnode = NULL;
1677	*resultfp = fp;
1678	return (0);
1679}
1680
1681/*
1682 * Install a file in a file descriptor table.
1683 */
1684int
1685finstall(struct thread *td, struct file *fp, int *fd, int flags,
1686    struct filecaps *fcaps)
1687{
1688	struct filedesc *fdp = td->td_proc->p_fd;
1689	struct filedescent *fde;
1690	int error;
1691
1692	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1693	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1694	if (fcaps != NULL)
1695		filecaps_validate(fcaps, __func__);
1696
1697	FILEDESC_XLOCK(fdp);
1698	if ((error = fdalloc(td, 0, fd))) {
1699		FILEDESC_XUNLOCK(fdp);
1700		return (error);
1701	}
1702	fhold(fp);
1703	fde = &fdp->fd_ofiles[*fd];
1704	fde->fde_file = fp;
1705	if ((flags & O_CLOEXEC) != 0)
1706		fde->fde_flags |= UF_EXCLOSE;
1707	if (fcaps != NULL)
1708		filecaps_move(fcaps, &fde->fde_caps);
1709	else
1710		filecaps_fill(&fde->fde_caps);
1711	FILEDESC_XUNLOCK(fdp);
1712	return (0);
1713}
1714
1715/*
1716 * Build a new filedesc structure from another.
1717 * Copy the current, root, and jail root vnode references.
1718 */
1719struct filedesc *
1720fdinit(struct filedesc *fdp)
1721{
1722	struct filedesc0 *newfdp;
1723
1724	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1725	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1726	if (fdp != NULL) {
1727		FILEDESC_XLOCK(fdp);
1728		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1729		if (newfdp->fd_fd.fd_cdir)
1730			VREF(newfdp->fd_fd.fd_cdir);
1731		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1732		if (newfdp->fd_fd.fd_rdir)
1733			VREF(newfdp->fd_fd.fd_rdir);
1734		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1735		if (newfdp->fd_fd.fd_jdir)
1736			VREF(newfdp->fd_fd.fd_jdir);
1737		FILEDESC_XUNLOCK(fdp);
1738	}
1739
1740	/* Create the file descriptor table. */
1741	newfdp->fd_fd.fd_refcnt = 1;
1742	newfdp->fd_fd.fd_holdcnt = 1;
1743	newfdp->fd_fd.fd_cmask = CMASK;
1744	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1745	newfdp->fd_fd.fd_nfiles = NDFILE;
1746	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1747	newfdp->fd_fd.fd_lastfile = -1;
1748	return (&newfdp->fd_fd);
1749}
1750
1751static struct filedesc *
1752fdhold(struct proc *p)
1753{
1754	struct filedesc *fdp;
1755
1756	mtx_lock(&fdesc_mtx);
1757	fdp = p->p_fd;
1758	if (fdp != NULL)
1759		fdp->fd_holdcnt++;
1760	mtx_unlock(&fdesc_mtx);
1761	return (fdp);
1762}
1763
1764static void
1765fddrop(struct filedesc *fdp)
1766{
1767	struct filedesc0 *fdp0;
1768	struct freetable *ft;
1769	int i;
1770
1771	mtx_lock(&fdesc_mtx);
1772	i = --fdp->fd_holdcnt;
1773	mtx_unlock(&fdesc_mtx);
1774	if (i > 0)
1775		return;
1776
1777	FILEDESC_LOCK_DESTROY(fdp);
1778	fdp0 = (struct filedesc0 *)fdp;
1779	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1780		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1781		free(ft->ft_table, M_FILEDESC);
1782	}
1783	free(fdp, M_FILEDESC);
1784}
1785
1786/*
1787 * Share a filedesc structure.
1788 */
1789struct filedesc *
1790fdshare(struct filedesc *fdp)
1791{
1792
1793	FILEDESC_XLOCK(fdp);
1794	fdp->fd_refcnt++;
1795	FILEDESC_XUNLOCK(fdp);
1796	return (fdp);
1797}
1798
1799/*
1800 * Unshare a filedesc structure, if necessary by making a copy
1801 */
1802void
1803fdunshare(struct proc *p, struct thread *td)
1804{
1805
1806	FILEDESC_XLOCK(p->p_fd);
1807	if (p->p_fd->fd_refcnt > 1) {
1808		struct filedesc *tmp;
1809
1810		FILEDESC_XUNLOCK(p->p_fd);
1811		tmp = fdcopy(p->p_fd);
1812		fdescfree(td);
1813		p->p_fd = tmp;
1814	} else
1815		FILEDESC_XUNLOCK(p->p_fd);
1816}
1817
1818/*
1819 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
1820 * this is to ease callers, not catch errors.
1821 */
1822struct filedesc *
1823fdcopy(struct filedesc *fdp)
1824{
1825	struct filedesc *newfdp;
1826	struct filedescent *nfde, *ofde;
1827	int i;
1828
1829	/* Certain daemons might not have file descriptors. */
1830	if (fdp == NULL)
1831		return (NULL);
1832
1833	newfdp = fdinit(fdp);
1834	FILEDESC_SLOCK(fdp);
1835	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1836		FILEDESC_SUNLOCK(fdp);
1837		FILEDESC_XLOCK(newfdp);
1838		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1839		FILEDESC_XUNLOCK(newfdp);
1840		FILEDESC_SLOCK(fdp);
1841	}
1842	/* copy all passable descriptors (i.e. not kqueue) */
1843	newfdp->fd_freefile = -1;
1844	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1845		ofde = &fdp->fd_ofiles[i];
1846		if (fdisused(fdp, i) &&
1847		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
1848		    ofde->fde_file->f_ops != &badfileops) {
1849			nfde = &newfdp->fd_ofiles[i];
1850			*nfde = *ofde;
1851			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
1852			fhold(nfde->fde_file);
1853			newfdp->fd_lastfile = i;
1854		} else {
1855			if (newfdp->fd_freefile == -1)
1856				newfdp->fd_freefile = i;
1857		}
1858	}
1859	newfdp->fd_cmask = fdp->fd_cmask;
1860	FILEDESC_SUNLOCK(fdp);
1861	FILEDESC_XLOCK(newfdp);
1862	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
1863		if (newfdp->fd_ofiles[i].fde_file != NULL)
1864			fdused(newfdp, i);
1865	}
1866	if (newfdp->fd_freefile == -1)
1867		newfdp->fd_freefile = i;
1868	FILEDESC_XUNLOCK(newfdp);
1869	return (newfdp);
1870}
1871
1872/*
1873 * Release a filedesc structure.
1874 */
1875void
1876fdescfree(struct thread *td)
1877{
1878	struct filedesc *fdp;
1879	int i;
1880	struct filedesc_to_leader *fdtol;
1881	struct file *fp;
1882	struct vnode *cdir, *jdir, *rdir, *vp;
1883	struct flock lf;
1884
1885	/* Certain daemons might not have file descriptors. */
1886	fdp = td->td_proc->p_fd;
1887	if (fdp == NULL)
1888		return;
1889
1890#ifdef RACCT
1891	PROC_LOCK(td->td_proc);
1892	racct_set(td->td_proc, RACCT_NOFILE, 0);
1893	PROC_UNLOCK(td->td_proc);
1894#endif
1895
1896	/* Check for special need to clear POSIX style locks */
1897	fdtol = td->td_proc->p_fdtol;
1898	if (fdtol != NULL) {
1899		FILEDESC_XLOCK(fdp);
1900		KASSERT(fdtol->fdl_refcount > 0,
1901		    ("filedesc_to_refcount botch: fdl_refcount=%d",
1902		    fdtol->fdl_refcount));
1903		if (fdtol->fdl_refcount == 1 &&
1904		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1905			for (i = 0; i <= fdp->fd_lastfile; i++) {
1906				fp = fdp->fd_ofiles[i].fde_file;
1907				if (fp == NULL || fp->f_type != DTYPE_VNODE)
1908					continue;
1909				fhold(fp);
1910				FILEDESC_XUNLOCK(fdp);
1911				lf.l_whence = SEEK_SET;
1912				lf.l_start = 0;
1913				lf.l_len = 0;
1914				lf.l_type = F_UNLCK;
1915				vp = fp->f_vnode;
1916				(void) VOP_ADVLOCK(vp,
1917				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
1918				    &lf, F_POSIX);
1919				FILEDESC_XLOCK(fdp);
1920				fdrop(fp, td);
1921			}
1922		}
1923	retry:
1924		if (fdtol->fdl_refcount == 1) {
1925			if (fdp->fd_holdleaderscount > 0 &&
1926			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1927				/*
1928				 * close() or do_dup() has cleared a reference
1929				 * in a shared file descriptor table.
1930				 */
1931				fdp->fd_holdleaderswakeup = 1;
1932				sx_sleep(&fdp->fd_holdleaderscount,
1933				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
1934				goto retry;
1935			}
1936			if (fdtol->fdl_holdcount > 0) {
1937				/*
1938				 * Ensure that fdtol->fdl_leader remains
1939				 * valid in closef().
1940				 */
1941				fdtol->fdl_wakeup = 1;
1942				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
1943				    "fdlhold", 0);
1944				goto retry;
1945			}
1946		}
1947		fdtol->fdl_refcount--;
1948		if (fdtol->fdl_refcount == 0 &&
1949		    fdtol->fdl_holdcount == 0) {
1950			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
1951			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
1952		} else
1953			fdtol = NULL;
1954		td->td_proc->p_fdtol = NULL;
1955		FILEDESC_XUNLOCK(fdp);
1956		if (fdtol != NULL)
1957			free(fdtol, M_FILEDESC_TO_LEADER);
1958	}
1959	FILEDESC_XLOCK(fdp);
1960	i = --fdp->fd_refcnt;
1961	FILEDESC_XUNLOCK(fdp);
1962	if (i > 0)
1963		return;
1964
1965	for (i = 0; i <= fdp->fd_lastfile; i++) {
1966		fp = fdp->fd_ofiles[i].fde_file;
1967		if (fp != NULL) {
1968			FILEDESC_XLOCK(fdp);
1969			fdfree(fdp, i);
1970			FILEDESC_XUNLOCK(fdp);
1971			(void) closef(fp, td);
1972		}
1973	}
1974	FILEDESC_XLOCK(fdp);
1975
1976	/* XXX This should happen earlier. */
1977	mtx_lock(&fdesc_mtx);
1978	td->td_proc->p_fd = NULL;
1979	mtx_unlock(&fdesc_mtx);
1980
1981	if (fdp->fd_nfiles > NDFILE)
1982		free(fdp->fd_ofiles, M_FILEDESC);
1983	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
1984		free(fdp->fd_map, M_FILEDESC);
1985
1986	fdp->fd_nfiles = 0;
1987
1988	cdir = fdp->fd_cdir;
1989	fdp->fd_cdir = NULL;
1990	rdir = fdp->fd_rdir;
1991	fdp->fd_rdir = NULL;
1992	jdir = fdp->fd_jdir;
1993	fdp->fd_jdir = NULL;
1994	FILEDESC_XUNLOCK(fdp);
1995
1996	if (cdir != NULL)
1997		vrele(cdir);
1998	if (rdir != NULL)
1999		vrele(rdir);
2000	if (jdir != NULL)
2001		vrele(jdir);
2002
2003	fddrop(fdp);
2004}
2005
2006/*
2007 * For setugid programs, we don't want to people to use that setugidness
2008 * to generate error messages which write to a file which otherwise would
2009 * otherwise be off-limits to the process.  We check for filesystems where
2010 * the vnode can change out from under us after execve (like [lin]procfs).
2011 *
2012 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
2013 * sufficient.  We also don't check for setugidness since we know we are.
2014 */
2015static int
2016is_unsafe(struct file *fp)
2017{
2018	if (fp->f_type == DTYPE_VNODE) {
2019		struct vnode *vp = fp->f_vnode;
2020
2021		if ((vp->v_vflag & VV_PROCDEP) != 0)
2022			return (1);
2023	}
2024	return (0);
2025}
2026
2027/*
2028 * Make this setguid thing safe, if at all possible.
2029 */
2030void
2031setugidsafety(struct thread *td)
2032{
2033	struct filedesc *fdp;
2034	struct file *fp;
2035	int i;
2036
2037	/* Certain daemons might not have file descriptors. */
2038	fdp = td->td_proc->p_fd;
2039	if (fdp == NULL)
2040		return;
2041
2042	/*
2043	 * Note: fdp->fd_ofiles may be reallocated out from under us while
2044	 * we are blocked in a close.  Be careful!
2045	 */
2046	FILEDESC_XLOCK(fdp);
2047	for (i = 0; i <= fdp->fd_lastfile; i++) {
2048		if (i > 2)
2049			break;
2050		fp = fdp->fd_ofiles[i].fde_file;
2051		if (fp != NULL && is_unsafe(fp)) {
2052			knote_fdclose(td, i);
2053			/*
2054			 * NULL-out descriptor prior to close to avoid
2055			 * a race while close blocks.
2056			 */
2057			fdfree(fdp, i);
2058			FILEDESC_XUNLOCK(fdp);
2059			(void) closef(fp, td);
2060			FILEDESC_XLOCK(fdp);
2061		}
2062	}
2063	FILEDESC_XUNLOCK(fdp);
2064}
2065
2066/*
2067 * If a specific file object occupies a specific file descriptor, close the
2068 * file descriptor entry and drop a reference on the file object.  This is a
2069 * convenience function to handle a subsequent error in a function that calls
2070 * falloc() that handles the race that another thread might have closed the
2071 * file descriptor out from under the thread creating the file object.
2072 */
2073void
2074fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2075{
2076
2077	FILEDESC_XLOCK(fdp);
2078	if (fdp->fd_ofiles[idx].fde_file == fp) {
2079		fdfree(fdp, idx);
2080		FILEDESC_XUNLOCK(fdp);
2081		fdrop(fp, td);
2082	} else
2083		FILEDESC_XUNLOCK(fdp);
2084}
2085
2086/*
2087 * Close any files on exec?
2088 */
2089void
2090fdcloseexec(struct thread *td)
2091{
2092	struct filedesc *fdp;
2093	struct filedescent *fde;
2094	struct file *fp;
2095	int i;
2096
2097	/* Certain daemons might not have file descriptors. */
2098	fdp = td->td_proc->p_fd;
2099	if (fdp == NULL)
2100		return;
2101
2102	/*
2103	 * We cannot cache fd_ofiles since operations
2104	 * may block and rip them out from under us.
2105	 */
2106	FILEDESC_XLOCK(fdp);
2107	for (i = 0; i <= fdp->fd_lastfile; i++) {
2108		fde = &fdp->fd_ofiles[i];
2109		fp = fde->fde_file;
2110		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2111		    (fde->fde_flags & UF_EXCLOSE))) {
2112			fdfree(fdp, i);
2113			(void) closefp(fdp, i, fp, td, 0);
2114			/* closefp() drops the FILEDESC lock. */
2115			FILEDESC_XLOCK(fdp);
2116		}
2117	}
2118	FILEDESC_XUNLOCK(fdp);
2119}
2120
2121/*
2122 * It is unsafe for set[ug]id processes to be started with file
2123 * descriptors 0..2 closed, as these descriptors are given implicit
2124 * significance in the Standard C library.  fdcheckstd() will create a
2125 * descriptor referencing /dev/null for each of stdin, stdout, and
2126 * stderr that is not already open.
2127 */
2128int
2129fdcheckstd(struct thread *td)
2130{
2131	struct filedesc *fdp;
2132	register_t retval, save;
2133	int i, error, devnull;
2134
2135	fdp = td->td_proc->p_fd;
2136	if (fdp == NULL)
2137		return (0);
2138	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2139	devnull = -1;
2140	error = 0;
2141	for (i = 0; i < 3; i++) {
2142		if (fdp->fd_ofiles[i].fde_file != NULL)
2143			continue;
2144		if (devnull < 0) {
2145			save = td->td_retval[0];
2146			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2147			    O_RDWR, 0);
2148			devnull = td->td_retval[0];
2149			td->td_retval[0] = save;
2150			if (error)
2151				break;
2152			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2153		} else {
2154			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2155			if (error != 0)
2156				break;
2157		}
2158	}
2159	return (error);
2160}
2161
2162/*
2163 * Internal form of close.  Decrement reference count on file structure.
2164 * Note: td may be NULL when closing a file that was being passed in a
2165 * message.
2166 *
2167 * XXXRW: Giant is not required for the caller, but often will be held; this
2168 * makes it moderately likely the Giant will be recursed in the VFS case.
2169 */
2170int
2171closef(struct file *fp, struct thread *td)
2172{
2173	struct vnode *vp;
2174	struct flock lf;
2175	struct filedesc_to_leader *fdtol;
2176	struct filedesc *fdp;
2177
2178	/*
2179	 * POSIX record locking dictates that any close releases ALL
2180	 * locks owned by this process.  This is handled by setting
2181	 * a flag in the unlock to free ONLY locks obeying POSIX
2182	 * semantics, and not to free BSD-style file locks.
2183	 * If the descriptor was in a message, POSIX-style locks
2184	 * aren't passed with the descriptor, and the thread pointer
2185	 * will be NULL.  Callers should be careful only to pass a
2186	 * NULL thread pointer when there really is no owning
2187	 * context that might have locks, or the locks will be
2188	 * leaked.
2189	 */
2190	if (fp->f_type == DTYPE_VNODE && td != NULL) {
2191		vp = fp->f_vnode;
2192		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2193			lf.l_whence = SEEK_SET;
2194			lf.l_start = 0;
2195			lf.l_len = 0;
2196			lf.l_type = F_UNLCK;
2197			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2198			    F_UNLCK, &lf, F_POSIX);
2199		}
2200		fdtol = td->td_proc->p_fdtol;
2201		if (fdtol != NULL) {
2202			/*
2203			 * Handle special case where file descriptor table is
2204			 * shared between multiple process leaders.
2205			 */
2206			fdp = td->td_proc->p_fd;
2207			FILEDESC_XLOCK(fdp);
2208			for (fdtol = fdtol->fdl_next;
2209			     fdtol != td->td_proc->p_fdtol;
2210			     fdtol = fdtol->fdl_next) {
2211				if ((fdtol->fdl_leader->p_flag &
2212				     P_ADVLOCK) == 0)
2213					continue;
2214				fdtol->fdl_holdcount++;
2215				FILEDESC_XUNLOCK(fdp);
2216				lf.l_whence = SEEK_SET;
2217				lf.l_start = 0;
2218				lf.l_len = 0;
2219				lf.l_type = F_UNLCK;
2220				vp = fp->f_vnode;
2221				(void) VOP_ADVLOCK(vp,
2222				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
2223				    F_POSIX);
2224				FILEDESC_XLOCK(fdp);
2225				fdtol->fdl_holdcount--;
2226				if (fdtol->fdl_holdcount == 0 &&
2227				    fdtol->fdl_wakeup != 0) {
2228					fdtol->fdl_wakeup = 0;
2229					wakeup(fdtol);
2230				}
2231			}
2232			FILEDESC_XUNLOCK(fdp);
2233		}
2234	}
2235	return (fdrop(fp, td));
2236}
2237
2238/*
2239 * Initialize the file pointer with the specified properties.
2240 *
2241 * The ops are set with release semantics to be certain that the flags, type,
2242 * and data are visible when ops is.  This is to prevent ops methods from being
2243 * called with bad data.
2244 */
2245void
2246finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2247{
2248	fp->f_data = data;
2249	fp->f_flag = flag;
2250	fp->f_type = type;
2251	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2252}
2253
/*
 * Look up descriptor 'fd' in 'fdp' without taking the filedesc lock and
 * return a referenced file in *fpp.
 *
 * Returns EBADF if 'fd' is out of range or the slot is empty.  With
 * CAPABILITIES, the descriptor's rights are checked against 'needrights'
 * (and, when CAP_FCNTL is requested, 'needfcntl' against the allowed
 * fcntls); a failed check returns that error.  On success 0 is returned,
 * the caller owns one reference on *fpp and, if 'haverightsp' is not
 * NULL, it receives the descriptor's rights (CAP_ALL without
 * CAPABILITIES).
 */
int
fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t needrights,
    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
{
	struct file *fp;
	u_int count;
#ifdef CAPABILITIES
	cap_rights_t haverights;
	int error;
#endif

	if (fd < 0 || fd >= fdp->fd_nfiles)
		return (EBADF);
	/*
	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
	 * never raising a refcount above 0.  To accomplish this we have
	 * to use a cmpset loop rather than an atomic_add.  The descriptor
	 * must be re-verified once we acquire a reference to be certain
	 * that the identity is still correct and we did not lose a race
	 * due to preemption.
	 */
	for (;;) {
		fp = fdp->fd_ofiles[fd].fde_file;
		if (fp == NULL)
			return (EBADF);
#ifdef CAPABILITIES
		haverights = cap_rights(fdp, fd);
		error = cap_check(haverights, needrights);
		if (error != 0)
			return (error);
		if ((needrights & CAP_FCNTL) != 0) {
			error = cap_fcntl_check(fdp, fd, needfcntl);
			if (error != 0)
				return (error);
		}
#endif
		/* A zero count is never raised; re-read the slot instead. */
		count = fp->f_count;
		if (count == 0)
			continue;
		/*
		 * Use an acquire barrier to prevent caching of fd_ofiles
		 * so it is refreshed for verification.
		 */
		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
			continue;
		/* Reference taken: make sure the slot still holds this file. */
		if (fp == fdp->fd_ofiles[fd].fde_file)
			break;
		/* The slot changed underneath us; undo and start over. */
		fdrop(fp, curthread);
	}
	*fpp = fp;
	if (haverightsp != NULL) {
#ifdef CAPABILITIES
		*haverightsp = haverights;
#else
		*haverightsp = CAP_ALL;
#endif
	}
	return (0);
}
2313
2314/*
2315 * Extract the file pointer associated with the specified descriptor for the
2316 * current user process.
2317 *
2318 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2319 * returned.
2320 *
2321 * File's rights will be checked against the capability rights mask.
2322 *
2323 * If an error occured the non-zero error is returned and *fpp is set to
2324 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2325 * responsible for fdrop().
2326 */
2327static __inline int
2328_fget(struct thread *td, int fd, struct file **fpp, int flags,
2329    cap_rights_t needrights, u_char *maxprotp)
2330{
2331	struct filedesc *fdp;
2332	struct file *fp;
2333	cap_rights_t haverights;
2334	int error;
2335
2336	*fpp = NULL;
2337	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2338		return (EBADF);
2339	if (maxprotp != NULL)
2340		needrights |= CAP_MMAP;
2341	error = fget_unlocked(fdp, fd, needrights, 0, &fp, &haverights);
2342	if (error != 0)
2343		return (error);
2344	if (fp->f_ops == &badfileops) {
2345		fdrop(fp, td);
2346		return (EBADF);
2347	}
2348
2349#ifdef CAPABILITIES
2350	/*
2351	 * If requested, convert capability rights to access flags.
2352	 */
2353	if (maxprotp != NULL)
2354		*maxprotp = cap_rights_to_vmprot(haverights);
2355#else /* !CAPABILITIES */
2356	if (maxprotp != NULL)
2357		*maxprotp = VM_PROT_ALL;
2358#endif /* CAPABILITIES */
2359
2360	/*
2361	 * FREAD and FWRITE failure return EBADF as per POSIX.
2362	 */
2363	error = 0;
2364	switch (flags) {
2365	case FREAD:
2366	case FWRITE:
2367		if ((fp->f_flag & flags) == 0)
2368			error = EBADF;
2369		break;
2370	case FEXEC:
2371	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
2372		    ((fp->f_flag & FWRITE) != 0))
2373			error = EBADF;
2374		break;
2375	case 0:
2376		break;
2377	default:
2378		KASSERT(0, ("wrong flags"));
2379	}
2380
2381	if (error != 0) {
2382		fdrop(fp, td);
2383		return (error);
2384	}
2385
2386	*fpp = fp;
2387	return (0);
2388}
2389
2390int
2391fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
2392{
2393
2394	return(_fget(td, fd, fpp, 0, rights, NULL));
2395}
2396
2397int
2398fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp,
2399    struct file **fpp)
2400{
2401
2402	return (_fget(td, fd, fpp, 0, rights, maxprotp));
2403}
2404
2405int
2406fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
2407{
2408
2409	return(_fget(td, fd, fpp, FREAD, rights, NULL));
2410}
2411
2412int
2413fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
2414{
2415
2416	return (_fget(td, fd, fpp, FWRITE, rights, NULL));
2417}
2418
2419/*
2420 * Like fget() but loads the underlying vnode, or returns an error if the
2421 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2422 * never have VM objects.  The returned vnode will be vref()'d.
2423 *
2424 * XXX: what about the unused flags ?
2425 */
2426static __inline int
2427_fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights,
2428    struct vnode **vpp)
2429{
2430	struct file *fp;
2431	int error;
2432
2433	*vpp = NULL;
2434	error = _fget(td, fd, &fp, flags, needrights, NULL);
2435	if (error)
2436		return (error);
2437	if (fp->f_vnode == NULL) {
2438		error = EINVAL;
2439	} else {
2440		*vpp = fp->f_vnode;
2441		vref(*vpp);
2442	}
2443	fdrop(fp, td);
2444
2445	return (error);
2446}
2447
2448int
2449fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
2450{
2451
2452	return (_fgetvp(td, fd, 0, rights, vpp));
2453}
2454
2455int
2456fgetvp_rights(struct thread *td, int fd, cap_rights_t need,
2457    struct filecaps *havecaps, struct vnode **vpp)
2458{
2459	struct filedesc *fdp;
2460	struct file *fp;
2461#ifdef CAPABILITIES
2462	int error;
2463#endif
2464
2465	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2466		return (EBADF);
2467
2468	fp = fget_locked(fdp, fd);
2469	if (fp == NULL || fp->f_ops == &badfileops)
2470		return (EBADF);
2471
2472#ifdef CAPABILITIES
2473	error = cap_check(cap_rights(fdp, fd), need);
2474	if (error != 0)
2475		return (error);
2476#endif
2477
2478	if (fp->f_vnode == NULL)
2479		return (EINVAL);
2480
2481	*vpp = fp->f_vnode;
2482	vref(*vpp);
2483	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
2484
2485	return (0);
2486}
2487
2488int
2489fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
2490{
2491
2492	return (_fgetvp(td, fd, FREAD, rights, vpp));
2493}
2494
2495int
2496fgetvp_exec(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
2497{
2498
2499	return (_fgetvp(td, fd, FEXEC, rights, vpp));
2500}
2501
#ifdef notyet
/*
 * Return a vref()'d vnode for 'fd'; the file must be open for writing
 * (FWRITE).  Only compiled when 'notyet' is defined.
 */
int
fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
    struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, FWRITE, rights, vpp);
	return (error);
}
#endif
2511
2512/*
2513 * Like fget() but loads the underlying socket, or returns an error if the
2514 * descriptor does not represent a socket.
2515 *
2516 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2517 * in the future.
2518 *
2519 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2520 * on their file descriptor reference to prevent the socket from being free'd
2521 * during use.
2522 */
2523int
2524fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp,
2525    u_int *fflagp)
2526{
2527	struct file *fp;
2528	int error;
2529
2530	*spp = NULL;
2531	if (fflagp != NULL)
2532		*fflagp = 0;
2533	if ((error = _fget(td, fd, &fp, 0, rights, NULL)) != 0)
2534		return (error);
2535	if (fp->f_type != DTYPE_SOCKET) {
2536		error = ENOTSOCK;
2537	} else {
2538		*spp = fp->f_data;
2539		if (fflagp)
2540			*fflagp = fp->f_flag;
2541		SOCK_LOCK(*spp);
2542		soref(*spp);
2543		SOCK_UNLOCK(*spp);
2544	}
2545	fdrop(fp, td);
2546
2547	return (error);
2548}
2549
/*
 * Drop the reference count on the socket and XXX release the SX lock in the
 * future.  The last reference closes the socket.
 *
 * The accept lock and the socket lock are both acquired here and the
 * socket's vnet is made current around the sorele() call.  No matching
 * unlock appears in this function.
 * NOTE(review): presumably sorele() releases both locks -- confirm.
 *
 * Note: fputsock() is deprecated, see comment for fgetsock().
 */
void
fputsock(struct socket *so)
{

	ACCEPT_LOCK();
	SOCK_LOCK(so);
	/* Release the reference with the socket's own vnet as context. */
	CURVNET_SET(so->so_vnet);
	sorele(so);
	CURVNET_RESTORE();
}
2566
2567/*
2568 * Handle the last reference to a file being closed.
2569 */
2570int
2571_fdrop(struct file *fp, struct thread *td)
2572{
2573	int error;
2574
2575	error = 0;
2576	if (fp->f_count != 0)
2577		panic("fdrop: count %d", fp->f_count);
2578	if (fp->f_ops != &badfileops)
2579		error = fo_close(fp, td);
2580	atomic_subtract_int(&openfiles, 1);
2581	crfree(fp->f_cred);
2582	free(fp->f_advice, M_FADVISE);
2583	uma_zfree(file_zone, fp);
2584
2585	return (error);
2586}
2587
2588/*
2589 * Apply an advisory lock on a file descriptor.
2590 *
2591 * Just attempt to get a record lock of the requested type on the entire file
2592 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2593 */
2594#ifndef _SYS_SYSPROTO_H_
2595struct flock_args {
2596	int	fd;
2597	int	how;
2598};
2599#endif
2600/* ARGSUSED */
2601int
2602sys_flock(struct thread *td, struct flock_args *uap)
2603{
2604	struct file *fp;
2605	struct vnode *vp;
2606	struct flock lf;
2607	int error;
2608
2609	if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0)
2610		return (error);
2611	if (fp->f_type != DTYPE_VNODE) {
2612		fdrop(fp, td);
2613		return (EOPNOTSUPP);
2614	}
2615
2616	vp = fp->f_vnode;
2617	lf.l_whence = SEEK_SET;
2618	lf.l_start = 0;
2619	lf.l_len = 0;
2620	if (uap->how & LOCK_UN) {
2621		lf.l_type = F_UNLCK;
2622		atomic_clear_int(&fp->f_flag, FHASLOCK);
2623		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2624		goto done2;
2625	}
2626	if (uap->how & LOCK_EX)
2627		lf.l_type = F_WRLCK;
2628	else if (uap->how & LOCK_SH)
2629		lf.l_type = F_RDLCK;
2630	else {
2631		error = EBADF;
2632		goto done2;
2633	}
2634	atomic_set_int(&fp->f_flag, FHASLOCK);
2635	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2636	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2637done2:
2638	fdrop(fp, td);
2639	return (error);
2640}
2641/*
2642 * Duplicate the specified descriptor to a free descriptor.
2643 */
2644int
2645dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
2646    int openerror, int *indxp)
2647{
2648	struct file *fp;
2649	int error, indx;
2650
2651	KASSERT(openerror == ENODEV || openerror == ENXIO,
2652	    ("unexpected error %d in %s", openerror, __func__));
2653
2654	/*
2655	 * If the to-be-dup'd fd number is greater than the allowed number
2656	 * of file descriptors, or the fd to be dup'd has already been
2657	 * closed, then reject.
2658	 */
2659	FILEDESC_XLOCK(fdp);
2660	if ((fp = fget_locked(fdp, dfd)) == NULL) {
2661		FILEDESC_XUNLOCK(fdp);
2662		return (EBADF);
2663	}
2664
2665	error = fdalloc(td, 0, &indx);
2666	if (error != 0) {
2667		FILEDESC_XUNLOCK(fdp);
2668		return (error);
2669	}
2670
2671	/*
2672	 * There are two cases of interest here.
2673	 *
2674	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2675	 *
2676	 * For ENXIO steal away the file structure from (dfd) and store it in
2677	 * (indx).  (dfd) is effectively closed by this operation.
2678	 */
2679	switch (openerror) {
2680	case ENODEV:
2681		/*
2682		 * Check that the mode the file is being opened for is a
2683		 * subset of the mode of the existing descriptor.
2684		 */
2685		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
2686			fdunused(fdp, indx);
2687			FILEDESC_XUNLOCK(fdp);
2688			return (EACCES);
2689		}
2690		fhold(fp);
2691		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2692		filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps,
2693		    &fdp->fd_ofiles[indx].fde_caps);
2694		break;
2695	case ENXIO:
2696		/*
2697		 * Steal away the file pointer from dfd and stuff it into indx.
2698		 */
2699		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2700		bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd]));
2701		fdunused(fdp, dfd);
2702		break;
2703	}
2704	FILEDESC_XUNLOCK(fdp);
2705	*indxp = indx;
2706	return (0);
2707}
2708
2709/*
2710 * Scan all active processes and prisons to see if any of them have a current
2711 * or root directory of `olddp'. If so, replace them with the new mount point.
2712 */
2713void
2714mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2715{
2716	struct filedesc *fdp;
2717	struct prison *pr;
2718	struct proc *p;
2719	int nrele;
2720
2721	if (vrefcnt(olddp) == 1)
2722		return;
2723	nrele = 0;
2724	sx_slock(&allproc_lock);
2725	FOREACH_PROC_IN_SYSTEM(p) {
2726		fdp = fdhold(p);
2727		if (fdp == NULL)
2728			continue;
2729		FILEDESC_XLOCK(fdp);
2730		if (fdp->fd_cdir == olddp) {
2731			vref(newdp);
2732			fdp->fd_cdir = newdp;
2733			nrele++;
2734		}
2735		if (fdp->fd_rdir == olddp) {
2736			vref(newdp);
2737			fdp->fd_rdir = newdp;
2738			nrele++;
2739		}
2740		if (fdp->fd_jdir == olddp) {
2741			vref(newdp);
2742			fdp->fd_jdir = newdp;
2743			nrele++;
2744		}
2745		FILEDESC_XUNLOCK(fdp);
2746		fddrop(fdp);
2747	}
2748	sx_sunlock(&allproc_lock);
2749	if (rootvnode == olddp) {
2750		vref(newdp);
2751		rootvnode = newdp;
2752		nrele++;
2753	}
2754	mtx_lock(&prison0.pr_mtx);
2755	if (prison0.pr_root == olddp) {
2756		vref(newdp);
2757		prison0.pr_root = newdp;
2758		nrele++;
2759	}
2760	mtx_unlock(&prison0.pr_mtx);
2761	sx_slock(&allprison_lock);
2762	TAILQ_FOREACH(pr, &allprison, pr_list) {
2763		mtx_lock(&pr->pr_mtx);
2764		if (pr->pr_root == olddp) {
2765			vref(newdp);
2766			pr->pr_root = newdp;
2767			nrele++;
2768		}
2769		mtx_unlock(&pr->pr_mtx);
2770	}
2771	sx_sunlock(&allprison_lock);
2772	while (nrele--)
2773		vrele(olddp);
2774}
2775
2776struct filedesc_to_leader *
2777filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2778{
2779	struct filedesc_to_leader *fdtol;
2780
2781	fdtol = malloc(sizeof(struct filedesc_to_leader),
2782	       M_FILEDESC_TO_LEADER,
2783	       M_WAITOK);
2784	fdtol->fdl_refcount = 1;
2785	fdtol->fdl_holdcount = 0;
2786	fdtol->fdl_wakeup = 0;
2787	fdtol->fdl_leader = leader;
2788	if (old != NULL) {
2789		FILEDESC_XLOCK(fdp);
2790		fdtol->fdl_next = old->fdl_next;
2791		fdtol->fdl_prev = old;
2792		old->fdl_next = fdtol;
2793		fdtol->fdl_next->fdl_prev = fdtol;
2794		FILEDESC_XUNLOCK(fdp);
2795	} else {
2796		fdtol->fdl_next = fdtol;
2797		fdtol->fdl_prev = fdtol;
2798	}
2799	return (fdtol);
2800}
2801
2802/*
2803 * Get file structures globally.
2804 */
2805static int
2806sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2807{
2808	struct xfile xf;
2809	struct filedesc *fdp;
2810	struct file *fp;
2811	struct proc *p;
2812	int error, n;
2813
2814	error = sysctl_wire_old_buffer(req, 0);
2815	if (error != 0)
2816		return (error);
2817	if (req->oldptr == NULL) {
2818		n = 0;
2819		sx_slock(&allproc_lock);
2820		FOREACH_PROC_IN_SYSTEM(p) {
2821			if (p->p_state == PRS_NEW)
2822				continue;
2823			fdp = fdhold(p);
2824			if (fdp == NULL)
2825				continue;
2826			/* overestimates sparse tables. */
2827			if (fdp->fd_lastfile > 0)
2828				n += fdp->fd_lastfile;
2829			fddrop(fdp);
2830		}
2831		sx_sunlock(&allproc_lock);
2832		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2833	}
2834	error = 0;
2835	bzero(&xf, sizeof(xf));
2836	xf.xf_size = sizeof(xf);
2837	sx_slock(&allproc_lock);
2838	FOREACH_PROC_IN_SYSTEM(p) {
2839		PROC_LOCK(p);
2840		if (p->p_state == PRS_NEW) {
2841			PROC_UNLOCK(p);
2842			continue;
2843		}
2844		if (p_cansee(req->td, p) != 0) {
2845			PROC_UNLOCK(p);
2846			continue;
2847		}
2848		xf.xf_pid = p->p_pid;
2849		xf.xf_uid = p->p_ucred->cr_uid;
2850		PROC_UNLOCK(p);
2851		fdp = fdhold(p);
2852		if (fdp == NULL)
2853			continue;
2854		FILEDESC_SLOCK(fdp);
2855		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
2856			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
2857				continue;
2858			xf.xf_fd = n;
2859			xf.xf_file = fp;
2860			xf.xf_data = fp->f_data;
2861			xf.xf_vnode = fp->f_vnode;
2862			xf.xf_type = fp->f_type;
2863			xf.xf_count = fp->f_count;
2864			xf.xf_msgcount = 0;
2865			xf.xf_offset = foffset_get(fp);
2866			xf.xf_flag = fp->f_flag;
2867			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2868			if (error)
2869				break;
2870		}
2871		FILEDESC_SUNLOCK(fdp);
2872		fddrop(fdp);
2873		if (error)
2874			break;
2875	}
2876	sx_sunlock(&allproc_lock);
2877	return (error);
2878}
2879
2880SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
2881    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2882
2883#ifdef KINFO_OFILE_SIZE
2884CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
2885#endif
2886
2887#ifdef COMPAT_FREEBSD7
2888static int
2889export_vnode_for_osysctl(struct vnode *vp, int type,
2890    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
2891{
2892	int error;
2893	char *fullpath, *freepath;
2894
2895	bzero(kif, sizeof(*kif));
2896	kif->kf_structsize = sizeof(*kif);
2897
2898	vref(vp);
2899	kif->kf_fd = type;
2900	kif->kf_type = KF_TYPE_VNODE;
2901	/* This function only handles directories. */
2902	if (vp->v_type != VDIR) {
2903		vrele(vp);
2904		return (ENOTDIR);
2905	}
2906	kif->kf_vnode_type = KF_VTYPE_VDIR;
2907
2908	/*
2909	 * This is not a true file descriptor, so we set a bogus refcount
2910	 * and offset to indicate these fields should be ignored.
2911	 */
2912	kif->kf_ref_count = -1;
2913	kif->kf_offset = -1;
2914
2915	freepath = NULL;
2916	fullpath = "-";
2917	FILEDESC_SUNLOCK(fdp);
2918	vn_fullpath(curthread, vp, &fullpath, &freepath);
2919	vrele(vp);
2920	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2921	if (freepath != NULL)
2922		free(freepath, M_TEMP);
2923	error = SYSCTL_OUT(req, kif, sizeof(*kif));
2924	FILEDESC_SLOCK(fdp);
2925	return (error);
2926}
2927
2928/*
2929 * Get per-process file descriptors for use by procstat(1), et al.
2930 */
2931static int
2932sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
2933{
2934	char *fullpath, *freepath;
2935	struct kinfo_ofile *kif;
2936	struct filedesc *fdp;
2937	int error, i, *name;
2938	struct shmfd *shmfd;
2939	struct socket *so;
2940	struct vnode *vp;
2941	struct file *fp;
2942	struct proc *p;
2943	struct tty *tp;
2944
2945	name = (int *)arg1;
2946	if ((p = pfind((pid_t)name[0])) == NULL)
2947		return (ESRCH);
2948	if ((error = p_candebug(curthread, p))) {
2949		PROC_UNLOCK(p);
2950		return (error);
2951	}
2952	fdp = fdhold(p);
2953	PROC_UNLOCK(p);
2954	if (fdp == NULL)
2955		return (ENOENT);
2956	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
2957	FILEDESC_SLOCK(fdp);
2958	if (fdp->fd_cdir != NULL)
2959		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
2960				fdp, req);
2961	if (fdp->fd_rdir != NULL)
2962		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
2963				fdp, req);
2964	if (fdp->fd_jdir != NULL)
2965		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
2966				fdp, req);
2967	for (i = 0; i < fdp->fd_nfiles; i++) {
2968		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
2969			continue;
2970		bzero(kif, sizeof(*kif));
2971		kif->kf_structsize = sizeof(*kif);
2972		vp = NULL;
2973		so = NULL;
2974		tp = NULL;
2975		shmfd = NULL;
2976		kif->kf_fd = i;
2977
2978		switch (fp->f_type) {
2979		case DTYPE_VNODE:
2980			kif->kf_type = KF_TYPE_VNODE;
2981			vp = fp->f_vnode;
2982			break;
2983
2984		case DTYPE_SOCKET:
2985			kif->kf_type = KF_TYPE_SOCKET;
2986			so = fp->f_data;
2987			break;
2988
2989		case DTYPE_PIPE:
2990			kif->kf_type = KF_TYPE_PIPE;
2991			break;
2992
2993		case DTYPE_FIFO:
2994			kif->kf_type = KF_TYPE_FIFO;
2995			vp = fp->f_vnode;
2996			break;
2997
2998		case DTYPE_KQUEUE:
2999			kif->kf_type = KF_TYPE_KQUEUE;
3000			break;
3001
3002		case DTYPE_CRYPTO:
3003			kif->kf_type = KF_TYPE_CRYPTO;
3004			break;
3005
3006		case DTYPE_MQUEUE:
3007			kif->kf_type = KF_TYPE_MQUEUE;
3008			break;
3009
3010		case DTYPE_SHM:
3011			kif->kf_type = KF_TYPE_SHM;
3012			shmfd = fp->f_data;
3013			break;
3014
3015		case DTYPE_SEM:
3016			kif->kf_type = KF_TYPE_SEM;
3017			break;
3018
3019		case DTYPE_PTS:
3020			kif->kf_type = KF_TYPE_PTS;
3021			tp = fp->f_data;
3022			break;
3023
3024#ifdef PROCDESC
3025		case DTYPE_PROCDESC:
3026			kif->kf_type = KF_TYPE_PROCDESC;
3027			break;
3028#endif
3029
3030		default:
3031			kif->kf_type = KF_TYPE_UNKNOWN;
3032			break;
3033		}
3034		kif->kf_ref_count = fp->f_count;
3035		if (fp->f_flag & FREAD)
3036			kif->kf_flags |= KF_FLAG_READ;
3037		if (fp->f_flag & FWRITE)
3038			kif->kf_flags |= KF_FLAG_WRITE;
3039		if (fp->f_flag & FAPPEND)
3040			kif->kf_flags |= KF_FLAG_APPEND;
3041		if (fp->f_flag & FASYNC)
3042			kif->kf_flags |= KF_FLAG_ASYNC;
3043		if (fp->f_flag & FFSYNC)
3044			kif->kf_flags |= KF_FLAG_FSYNC;
3045		if (fp->f_flag & FNONBLOCK)
3046			kif->kf_flags |= KF_FLAG_NONBLOCK;
3047		if (fp->f_flag & O_DIRECT)
3048			kif->kf_flags |= KF_FLAG_DIRECT;
3049		if (fp->f_flag & FHASLOCK)
3050			kif->kf_flags |= KF_FLAG_HASLOCK;
3051		kif->kf_offset = foffset_get(fp);
3052		if (vp != NULL) {
3053			vref(vp);
3054			switch (vp->v_type) {
3055			case VNON:
3056				kif->kf_vnode_type = KF_VTYPE_VNON;
3057				break;
3058			case VREG:
3059				kif->kf_vnode_type = KF_VTYPE_VREG;
3060				break;
3061			case VDIR:
3062				kif->kf_vnode_type = KF_VTYPE_VDIR;
3063				break;
3064			case VBLK:
3065				kif->kf_vnode_type = KF_VTYPE_VBLK;
3066				break;
3067			case VCHR:
3068				kif->kf_vnode_type = KF_VTYPE_VCHR;
3069				break;
3070			case VLNK:
3071				kif->kf_vnode_type = KF_VTYPE_VLNK;
3072				break;
3073			case VSOCK:
3074				kif->kf_vnode_type = KF_VTYPE_VSOCK;
3075				break;
3076			case VFIFO:
3077				kif->kf_vnode_type = KF_VTYPE_VFIFO;
3078				break;
3079			case VBAD:
3080				kif->kf_vnode_type = KF_VTYPE_VBAD;
3081				break;
3082			default:
3083				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
3084				break;
3085			}
3086			/*
3087			 * It is OK to drop the filedesc lock here as we will
3088			 * re-validate and re-evaluate its properties when
3089			 * the loop continues.
3090			 */
3091			freepath = NULL;
3092			fullpath = "-";
3093			FILEDESC_SUNLOCK(fdp);
3094			vn_fullpath(curthread, vp, &fullpath, &freepath);
3095			vrele(vp);
3096			strlcpy(kif->kf_path, fullpath,
3097			    sizeof(kif->kf_path));
3098			if (freepath != NULL)
3099				free(freepath, M_TEMP);
3100			FILEDESC_SLOCK(fdp);
3101		}
3102		if (so != NULL) {
3103			struct sockaddr *sa;
3104
3105			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
3106			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3107				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3108				free(sa, M_SONAME);
3109			}
3110			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
3111			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3112				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3113				free(sa, M_SONAME);
3114			}
3115			kif->kf_sock_domain =
3116			    so->so_proto->pr_domain->dom_family;
3117			kif->kf_sock_type = so->so_type;
3118			kif->kf_sock_protocol = so->so_proto->pr_protocol;
3119		}
3120		if (tp != NULL) {
3121			strlcpy(kif->kf_path, tty_devname(tp),
3122			    sizeof(kif->kf_path));
3123		}
3124		if (shmfd != NULL)
3125			shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
3126		error = SYSCTL_OUT(req, kif, sizeof(*kif));
3127		if (error)
3128			break;
3129	}
3130	FILEDESC_SUNLOCK(fdp);
3131	fddrop(fdp);
3132	free(kif, M_TEMP);
3133	return (0);
3134}
3135
3136static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
3137    sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
3138#endif	/* COMPAT_FREEBSD7 */
3139
/* When a fixed size is defined for struct kinfo_file, enforce it. */
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
3143
3144static int
3145export_fd_for_sysctl(void *data, int type, int fd, int fflags, int refcnt,
3146    int64_t offset, cap_rights_t fd_cap_rights, struct kinfo_file *kif,
3147    struct sysctl_req *req)
3148{
3149	struct {
3150		int	fflag;
3151		int	kf_fflag;
3152	} fflags_table[] = {
3153		{ FAPPEND, KF_FLAG_APPEND },
3154		{ FASYNC, KF_FLAG_ASYNC },
3155		{ FFSYNC, KF_FLAG_FSYNC },
3156		{ FHASLOCK, KF_FLAG_HASLOCK },
3157		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3158		{ FREAD, KF_FLAG_READ },
3159		{ FWRITE, KF_FLAG_WRITE },
3160		{ O_CREAT, KF_FLAG_CREAT },
3161		{ O_DIRECT, KF_FLAG_DIRECT },
3162		{ O_EXCL, KF_FLAG_EXCL },
3163		{ O_EXEC, KF_FLAG_EXEC },
3164		{ O_EXLOCK, KF_FLAG_EXLOCK },
3165		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3166		{ O_SHLOCK, KF_FLAG_SHLOCK },
3167		{ O_TRUNC, KF_FLAG_TRUNC }
3168	};
3169#define	NFFLAGS	(sizeof(fflags_table) / sizeof(*fflags_table))
3170	struct vnode *vp;
3171	int error;
3172	unsigned int i;
3173
3174	bzero(kif, sizeof(*kif));
3175	switch (type) {
3176	case KF_TYPE_FIFO:
3177	case KF_TYPE_VNODE:
3178		vp = (struct vnode *)data;
3179		error = fill_vnode_info(vp, kif);
3180		vrele(vp);
3181		break;
3182	case KF_TYPE_SOCKET:
3183		error = fill_socket_info((struct socket *)data, kif);
3184		break;
3185	case KF_TYPE_PIPE:
3186		error = fill_pipe_info((struct pipe *)data, kif);
3187		break;
3188	case KF_TYPE_PTS:
3189		error = fill_pts_info((struct tty *)data, kif);
3190		break;
3191	case KF_TYPE_PROCDESC:
3192		error = fill_procdesc_info((struct procdesc *)data, kif);
3193		break;
3194	case KF_TYPE_SHM:
3195		error = fill_shm_info((struct file *)data, kif);
3196		break;
3197	default:
3198		error = 0;
3199	}
3200	if (error == 0)
3201		kif->kf_status |= KF_ATTR_VALID;
3202
3203	/*
3204	 * Translate file access flags.
3205	 */
3206	for (i = 0; i < NFFLAGS; i++)
3207		if (fflags & fflags_table[i].fflag)
3208			kif->kf_flags |=  fflags_table[i].kf_fflag;
3209	kif->kf_cap_rights = fd_cap_rights;
3210	kif->kf_fd = fd;
3211	kif->kf_type = type;
3212	kif->kf_ref_count = refcnt;
3213	kif->kf_offset = offset;
3214	/* Pack record size down */
3215	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3216	    strlen(kif->kf_path) + 1;
3217	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3218	error = SYSCTL_OUT(req, kif, kif->kf_structsize);
3219	return (error);
3220}
3221
3222/*
3223 * Get per-process file descriptors for use by procstat(1), et al.
3224 */
3225static int
3226sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
3227{
3228	struct file *fp;
3229	struct filedesc *fdp;
3230	struct kinfo_file *kif;
3231	struct proc *p;
3232	struct vnode *cttyvp, *textvp, *tracevp;
3233	size_t oldidx;
3234	int64_t offset;
3235	void *data;
3236	int error, i, *name;
3237	int type, refcnt, fflags;
3238	cap_rights_t fd_cap_rights;
3239
3240	name = (int *)arg1;
3241	if ((p = pfind((pid_t)name[0])) == NULL)
3242		return (ESRCH);
3243	if ((error = p_candebug(curthread, p))) {
3244		PROC_UNLOCK(p);
3245		return (error);
3246	}
3247	/* ktrace vnode */
3248	tracevp = p->p_tracevp;
3249	if (tracevp != NULL)
3250		vref(tracevp);
3251	/* text vnode */
3252	textvp = p->p_textvp;
3253	if (textvp != NULL)
3254		vref(textvp);
3255	/* Controlling tty. */
3256	cttyvp = NULL;
3257	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
3258		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
3259		if (cttyvp != NULL)
3260			vref(cttyvp);
3261	}
3262	fdp = fdhold(p);
3263	PROC_UNLOCK(p);
3264	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3265	if (tracevp != NULL)
3266		export_fd_for_sysctl(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
3267		    FREAD | FWRITE, -1, -1, 0, kif, req);
3268	if (textvp != NULL)
3269		export_fd_for_sysctl(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
3270		    FREAD, -1, -1, 0, kif, req);
3271	if (cttyvp != NULL)
3272		export_fd_for_sysctl(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
3273		    FREAD | FWRITE, -1, -1, 0, kif, req);
3274	if (fdp == NULL)
3275		goto fail;
3276	FILEDESC_SLOCK(fdp);
3277	/* working directory */
3278	if (fdp->fd_cdir != NULL) {
3279		vref(fdp->fd_cdir);
3280		data = fdp->fd_cdir;
3281		FILEDESC_SUNLOCK(fdp);
3282		export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
3283		    FREAD, -1, -1, 0, kif, req);
3284		FILEDESC_SLOCK(fdp);
3285	}
3286	/* root directory */
3287	if (fdp->fd_rdir != NULL) {
3288		vref(fdp->fd_rdir);
3289		data = fdp->fd_rdir;
3290		FILEDESC_SUNLOCK(fdp);
3291		export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
3292		    FREAD, -1, -1, 0, kif, req);
3293		FILEDESC_SLOCK(fdp);
3294	}
3295	/* jail directory */
3296	if (fdp->fd_jdir != NULL) {
3297		vref(fdp->fd_jdir);
3298		data = fdp->fd_jdir;
3299		FILEDESC_SUNLOCK(fdp);
3300		export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
3301		    FREAD, -1, -1, 0, kif, req);
3302		FILEDESC_SLOCK(fdp);
3303	}
3304	for (i = 0; i < fdp->fd_nfiles; i++) {
3305		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3306			continue;
3307		data = NULL;
3308#ifdef CAPABILITIES
3309		fd_cap_rights = cap_rights(fdp, i);
3310#else /* !CAPABILITIES */
3311		fd_cap_rights = 0;
3312#endif
3313		switch (fp->f_type) {
3314		case DTYPE_VNODE:
3315			type = KF_TYPE_VNODE;
3316			vref(fp->f_vnode);
3317			data = fp->f_vnode;
3318			break;
3319
3320		case DTYPE_SOCKET:
3321			type = KF_TYPE_SOCKET;
3322			data = fp->f_data;
3323			break;
3324
3325		case DTYPE_PIPE:
3326			type = KF_TYPE_PIPE;
3327			data = fp->f_data;
3328			break;
3329
3330		case DTYPE_FIFO:
3331			type = KF_TYPE_FIFO;
3332			vref(fp->f_vnode);
3333			data = fp->f_vnode;
3334			break;
3335
3336		case DTYPE_KQUEUE:
3337			type = KF_TYPE_KQUEUE;
3338			break;
3339
3340		case DTYPE_CRYPTO:
3341			type = KF_TYPE_CRYPTO;
3342			break;
3343
3344		case DTYPE_MQUEUE:
3345			type = KF_TYPE_MQUEUE;
3346			break;
3347
3348		case DTYPE_SHM:
3349			type = KF_TYPE_SHM;
3350			data = fp;
3351			break;
3352
3353		case DTYPE_SEM:
3354			type = KF_TYPE_SEM;
3355			break;
3356
3357		case DTYPE_PTS:
3358			type = KF_TYPE_PTS;
3359			data = fp->f_data;
3360			break;
3361
3362#ifdef PROCDESC
3363		case DTYPE_PROCDESC:
3364			type = KF_TYPE_PROCDESC;
3365			data = fp->f_data;
3366			break;
3367#endif
3368
3369		default:
3370			type = KF_TYPE_UNKNOWN;
3371			break;
3372		}
3373		refcnt = fp->f_count;
3374		fflags = fp->f_flag;
3375		offset = foffset_get(fp);
3376
3377		/*
3378		 * Create sysctl entry.
3379		 * It is OK to drop the filedesc lock here as we will
3380		 * re-validate and re-evaluate its properties when
3381		 * the loop continues.
3382		 */
3383		oldidx = req->oldidx;
3384		if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
3385			FILEDESC_SUNLOCK(fdp);
3386		error = export_fd_for_sysctl(data, type, i, fflags, refcnt,
3387		    offset, fd_cap_rights, kif, req);
3388		if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
3389			FILEDESC_SLOCK(fdp);
3390		if (error) {
3391			if (error == ENOMEM) {
3392				/*
3393				 * The hack to keep the ABI of sysctl
3394				 * kern.proc.filedesc intact, but not
3395				 * to account a partially copied
3396				 * kinfo_file into the oldidx.
3397				 */
3398				req->oldidx = oldidx;
3399				error = 0;
3400			}
3401			break;
3402		}
3403	}
3404	FILEDESC_SUNLOCK(fdp);
3405fail:
3406	if (fdp != NULL)
3407		fddrop(fdp);
3408	free(kif, M_TEMP);
3409	return (error);
3410}
3411
3412int
3413vntype_to_kinfo(int vtype)
3414{
3415	struct {
3416		int	vtype;
3417		int	kf_vtype;
3418	} vtypes_table[] = {
3419		{ VBAD, KF_VTYPE_VBAD },
3420		{ VBLK, KF_VTYPE_VBLK },
3421		{ VCHR, KF_VTYPE_VCHR },
3422		{ VDIR, KF_VTYPE_VDIR },
3423		{ VFIFO, KF_VTYPE_VFIFO },
3424		{ VLNK, KF_VTYPE_VLNK },
3425		{ VNON, KF_VTYPE_VNON },
3426		{ VREG, KF_VTYPE_VREG },
3427		{ VSOCK, KF_VTYPE_VSOCK }
3428	};
3429#define	NVTYPES	(sizeof(vtypes_table) / sizeof(*vtypes_table))
3430	unsigned int i;
3431
3432	/*
3433	 * Perform vtype translation.
3434	 */
3435	for (i = 0; i < NVTYPES; i++)
3436		if (vtypes_table[i].vtype == vtype)
3437			break;
3438	if (i < NVTYPES)
3439		return (vtypes_table[i].kf_vtype);
3440
3441	return (KF_VTYPE_UNKNOWN);
3442}
3443
3444static int
3445fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
3446{
3447	struct vattr va;
3448	char *fullpath, *freepath;
3449	int error;
3450
3451	if (vp == NULL)
3452		return (1);
3453	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
3454	freepath = NULL;
3455	fullpath = "-";
3456	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
3457	if (error == 0) {
3458		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3459	}
3460	if (freepath != NULL)
3461		free(freepath, M_TEMP);
3462
3463	/*
3464	 * Retrieve vnode attributes.
3465	 */
3466	va.va_fsid = VNOVAL;
3467	va.va_rdev = NODEV;
3468	vn_lock(vp, LK_SHARED | LK_RETRY);
3469	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
3470	VOP_UNLOCK(vp, 0);
3471	if (error != 0)
3472		return (error);
3473	if (va.va_fsid != VNOVAL)
3474		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
3475	else
3476		kif->kf_un.kf_file.kf_file_fsid =
3477		    vp->v_mount->mnt_stat.f_fsid.val[0];
3478	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
3479	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
3480	kif->kf_un.kf_file.kf_file_size = va.va_size;
3481	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
3482	return (0);
3483}
3484
3485static int
3486fill_socket_info(struct socket *so, struct kinfo_file *kif)
3487{
3488	struct sockaddr *sa;
3489	struct inpcb *inpcb;
3490	struct unpcb *unpcb;
3491	int error;
3492
3493	if (so == NULL)
3494		return (1);
3495	kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
3496	kif->kf_sock_type = so->so_type;
3497	kif->kf_sock_protocol = so->so_proto->pr_protocol;
3498	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
3499	switch(kif->kf_sock_domain) {
3500	case AF_INET:
3501	case AF_INET6:
3502		if (kif->kf_sock_protocol == IPPROTO_TCP) {
3503			if (so->so_pcb != NULL) {
3504				inpcb = (struct inpcb *)(so->so_pcb);
3505				kif->kf_un.kf_sock.kf_sock_inpcb =
3506				    (uintptr_t)inpcb->inp_ppcb;
3507			}
3508		}
3509		break;
3510	case AF_UNIX:
3511		if (so->so_pcb != NULL) {
3512			unpcb = (struct unpcb *)(so->so_pcb);
3513			if (unpcb->unp_conn) {
3514				kif->kf_un.kf_sock.kf_sock_unpconn =
3515				    (uintptr_t)unpcb->unp_conn;
3516				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
3517				    so->so_rcv.sb_state;
3518				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
3519				    so->so_snd.sb_state;
3520			}
3521		}
3522		break;
3523	}
3524	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
3525	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3526		bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3527		free(sa, M_SONAME);
3528	}
3529	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
3530	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3531		bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3532		free(sa, M_SONAME);
3533	}
3534	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
3535	    sizeof(kif->kf_path));
3536	return (0);
3537}
3538
3539static int
3540fill_pts_info(struct tty *tp, struct kinfo_file *kif)
3541{
3542
3543	if (tp == NULL)
3544		return (1);
3545	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
3546	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
3547	return (0);
3548}
3549
3550static int
3551fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
3552{
3553
3554	if (pi == NULL)
3555		return (1);
3556	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
3557	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
3558	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
3559	return (0);
3560}
3561
3562static int
3563fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
3564{
3565
3566	if (pdp == NULL)
3567		return (1);
3568	kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
3569	return (0);
3570}
3571
3572static int
3573fill_shm_info(struct file *fp, struct kinfo_file *kif)
3574{
3575	struct thread *td;
3576	struct stat sb;
3577
3578	td = curthread;
3579	if (fp->f_data == NULL)
3580		return (1);
3581	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3582		return (1);
3583	shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
3584	kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
3585	kif->kf_un.kf_file.kf_file_size = sb.st_size;
3586	return (0);
3587}
3588
3589static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
3590    sysctl_kern_proc_filedesc, "Process filedesc entries");
3591
3592#ifdef DDB
3593/*
3594 * For the purposes of debugging, generate a human-readable string for the
3595 * file type.
3596 */
3597static const char *
3598file_type_to_name(short type)
3599{
3600
3601	switch (type) {
3602	case 0:
3603		return ("zero");
3604	case DTYPE_VNODE:
3605		return ("vnod");
3606	case DTYPE_SOCKET:
3607		return ("sock");
3608	case DTYPE_PIPE:
3609		return ("pipe");
3610	case DTYPE_FIFO:
3611		return ("fifo");
3612	case DTYPE_KQUEUE:
3613		return ("kque");
3614	case DTYPE_CRYPTO:
3615		return ("crpt");
3616	case DTYPE_MQUEUE:
3617		return ("mque");
3618	case DTYPE_SHM:
3619		return ("shm");
3620	case DTYPE_SEM:
3621		return ("ksem");
3622	default:
3623		return ("unkn");
3624	}
3625}
3626
3627/*
3628 * For the purposes of debugging, identify a process (if any, perhaps one of
3629 * many) that references the passed file in its file descriptor array. Return
3630 * NULL if none.
3631 */
3632static struct proc *
3633file_to_first_proc(struct file *fp)
3634{
3635	struct filedesc *fdp;
3636	struct proc *p;
3637	int n;
3638
3639	FOREACH_PROC_IN_SYSTEM(p) {
3640		if (p->p_state == PRS_NEW)
3641			continue;
3642		fdp = p->p_fd;
3643		if (fdp == NULL)
3644			continue;
3645		for (n = 0; n < fdp->fd_nfiles; n++) {
3646			if (fp == fdp->fd_ofiles[n].fde_file)
3647				return (p);
3648		}
3649	}
3650	return (NULL);
3651}
3652
3653static void
3654db_print_file(struct file *fp, int header)
3655{
3656	struct proc *p;
3657
3658	if (header)
3659		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3660		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3661		    "MCount", "Vnode", "FPID", "FCmd");
3662	p = file_to_first_proc(fp);
3663	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3664	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3665	    0, fp->f_count, 0, fp->f_vnode,
3666	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3667}
3668
3669DB_SHOW_COMMAND(file, db_show_file)
3670{
3671	struct file *fp;
3672
3673	if (!have_addr) {
3674		db_printf("usage: show file <addr>\n");
3675		return;
3676	}
3677	fp = (struct file *)addr;
3678	db_print_file(fp, 1);
3679}
3680
3681DB_SHOW_COMMAND(files, db_show_files)
3682{
3683	struct filedesc *fdp;
3684	struct file *fp;
3685	struct proc *p;
3686	int header;
3687	int n;
3688
3689	header = 1;
3690	FOREACH_PROC_IN_SYSTEM(p) {
3691		if (p->p_state == PRS_NEW)
3692			continue;
3693		if ((fdp = p->p_fd) == NULL)
3694			continue;
3695		for (n = 0; n < fdp->fd_nfiles; ++n) {
3696			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
3697				continue;
3698			db_print_file(fp, header);
3699			header = 0;
3700		}
3701	}
3702}
3703#endif
3704
3705SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3706    &maxfilesperproc, 0, "Maximum files allowed open per process");
3707
3708SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3709    &maxfiles, 0, "Maximum number of files");
3710
3711SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3712    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3713
3714/* ARGSUSED*/
3715static void
3716filelistinit(void *dummy)
3717{
3718
3719	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
3720	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3721	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
3722	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
3723}
3724SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3725
3726/*-------------------------------------------------------------------*/
3727
/*
 * Read/write handler for badfileops: ignores its arguments and always
 * fails with EBADF.
 */
static int
badfo_readwrite(struct file *fp, struct uio *uio,
    struct ucred *active_cred, int flags, struct thread *td)
{
	return EBADF;
}
3735
/*
 * Truncate handler for badfileops: ignores its arguments and always
 * fails with EINVAL.
 */
static int
badfo_truncate(struct file *fp, off_t length,
    struct ucred *active_cred, struct thread *td)
{
	return EINVAL;
}
3743
3744static int
3745badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
3746    struct thread *td)
3747{
3748
3749	return (EBADF);
3750}
3751
/*
 * Poll handler for badfileops: ignores its arguments and always returns
 * 0.
 */
static int
badfo_poll(struct file *fp, int events,
    struct ucred *active_cred, struct thread *td)
{
	return 0;
}
3759
/*
 * Kqueue filter handler for badfileops: ignores its arguments and always
 * fails with EBADF.
 */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
	return EBADF;
}
3766
/*
 * Stat handler for badfileops: ignores its arguments, leaves `sb'
 * untouched and always fails with EBADF.
 */
static int
badfo_stat(struct file *fp, struct stat *sb,
    struct ucred *active_cred, struct thread *td)
{
	return EBADF;
}
3774
/*
 * Close handler for badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_close(struct file *fp, struct thread *td)
{
	return EBADF;
}
3781
/*
 * Chmod handler for badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_chmod(struct file *fp, mode_t mode,
    struct ucred *active_cred, struct thread *td)
{
	return EBADF;
}
3789
/*
 * Chown handler for badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid,
    struct ucred *active_cred, struct thread *td)
{
	return EBADF;
}
3797
3798struct fileops badfileops = {
3799	.fo_read = badfo_readwrite,
3800	.fo_write = badfo_readwrite,
3801	.fo_truncate = badfo_truncate,
3802	.fo_ioctl = badfo_ioctl,
3803	.fo_poll = badfo_poll,
3804	.fo_kqfilter = badfo_kqfilter,
3805	.fo_stat = badfo_stat,
3806	.fo_close = badfo_close,
3807	.fo_chmod = badfo_chmod,
3808	.fo_chown = badfo_chown,
3809};
3810
/*
 * Chmod handler that ignores its arguments and always fails with EINVAL.
 */
int
invfo_chmod(struct file *fp, mode_t mode,
    struct ucred *active_cred, struct thread *td)
{
	return EINVAL;
}
3818
/*
 * Chown handler that ignores its arguments and always fails with EINVAL.
 */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid,
    struct ucred *active_cred, struct thread *td)
{
	return EINVAL;
}
3826
3827/*-------------------------------------------------------------------*/
3828
3829/*
3830 * File Descriptor pseudo-device driver (/dev/fd/).
3831 *
3832 * Opening minor device N dup()s the file (if any) connected to file
3833 * descriptor N belonging to the calling process.  Note that this driver
3834 * consists of only the ``open()'' routine, because all subsequent
3835 * references to this file will be direct to the other driver.
3836 *
3837 * XXX: we could give this one a cloning event handler if necessary.
3838 */
3839
3840/* ARGSUSED */
3841static int
3842fdopen(struct cdev *dev, int mode, int type, struct thread *td)
3843{
3844
3845	/*
3846	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
3847	 * the file descriptor being sought for duplication. The error
3848	 * return ensures that the vnode for this device will be released
3849	 * by vn_open. Open will detect this special error and take the
3850	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
3851	 * will simply report the error.
3852	 */
3853	td->td_dupfd = dev2unit(dev);
3854	return (ENODEV);
3855}
3856
3857static struct cdevsw fildesc_cdevsw = {
3858	.d_version =	D_VERSION,
3859	.d_open =	fdopen,
3860	.d_name =	"FD",
3861};
3862
3863static void
3864fildesc_drvinit(void *unused)
3865{
3866	struct cdev *dev;
3867
3868	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
3869	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
3870	make_dev_alias(dev, "stdin");
3871	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
3872	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
3873	make_dev_alias(dev, "stdout");
3874	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
3875	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
3876	make_dev_alias(dev, "stderr");
3877}
3878
3879SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
3880