kern_descrip.c revision 236910
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 236910 2012-06-11 19:48:55Z pjd $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_ddb.h"
43#include "opt_ktrace.h"
44#include "opt_procdesc.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48
49#include <sys/capability.h>
50#include <sys/conf.h>
51#include <sys/domain.h>
52#include <sys/fcntl.h>
53#include <sys/file.h>
54#include <sys/filedesc.h>
55#include <sys/filio.h>
56#include <sys/jail.h>
57#include <sys/kernel.h>
58#include <sys/limits.h>
59#include <sys/lock.h>
60#include <sys/malloc.h>
61#include <sys/mman.h>
62#include <sys/mount.h>
63#include <sys/mqueue.h>
64#include <sys/mutex.h>
65#include <sys/namei.h>
66#include <sys/selinfo.h>
67#include <sys/pipe.h>
68#include <sys/priv.h>
69#include <sys/proc.h>
70#include <sys/procdesc.h>
71#include <sys/protosw.h>
72#include <sys/racct.h>
73#include <sys/resourcevar.h>
74#include <sys/signalvar.h>
75#include <sys/socketvar.h>
76#include <sys/stat.h>
77#include <sys/sx.h>
78#include <sys/syscallsubr.h>
79#include <sys/sysctl.h>
80#include <sys/sysproto.h>
81#include <sys/tty.h>
82#include <sys/unistd.h>
83#include <sys/un.h>
84#include <sys/unpcb.h>
85#include <sys/user.h>
86#include <sys/vnode.h>
87#ifdef KTRACE
88#include <sys/ktrace.h>
89#endif
90
91#include <net/vnet.h>
92
93#include <netinet/in.h>
94#include <netinet/in_pcb.h>
95
96#include <security/audit/audit.h>
97
98#include <vm/uma.h>
99#include <vm/vm.h>
100
101#include <ddb/ddb.h>
102
/*
 * Malloc types used by this file.  M_FILEDESC and M_FILEDESC_TO_LEADER
 * are defined here but not used in this part of the file; M_SIGIO tags
 * the struct sigio allocations made by fsetown() and released by
 * funsetown()/funsetownlst().
 */
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
		     "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

/* Only declared here; the matching definition is not in this file section. */
MALLOC_DECLARE(M_FADVISE);

/* UMA zone handle.  NOTE(review): presumably backs struct file - confirm. */
static uma_zone_t file_zone;
111
112
/*
 * Flags for do_dup().
 *
 * DUP_FIXED: install the duplicate at exactly the requested descriptor,
 *	      closing whatever file was there (dup2() semantics).
 * DUP_FCNTL: report an out-of-range target as EINVAL rather than EBADF
 *	      (fcntl(F_DUPFD) semantics).
 */
#define DUP_FIXED	0x1	/* Force fixed allocation */
#define DUP_FCNTL	0x2	/* fcntl()-style errors */

/*
 * Forward declarations of file-local helpers.  Several of these
 * (fdgrowtable() and the fill_*_info() family) are defined outside the
 * part of the file that contains these prototypes.
 */
static int do_dup(struct thread *td, int flags, int old, int new,
    register_t *retval);
static int	fd_first_free(struct filedesc *, int, int);
static int	fd_last_used(struct filedesc *, int, int);
static void	fdgrowtable(struct filedesc *, int);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
static int	fill_procdesc_info(struct procdesc *pdp,
    struct kinfo_file *kif);
static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
131
/*
 * A process is initially started out with NDFILE descriptors stored within
 * this structure, selected to be enough for typical applications based on
 * the historical limit of 20 open files (and the usage of descriptors by
 * shells).  If these descriptors are exhausted, a larger descriptor table
 * may be allocated, up to a process' resource limit; the internal arrays
 * are then unused.
 *
 * The remaining macros describe the in-use bitmap (fd_map), which is an
 * array of NDSLOTTYPE words with one bit per descriptor:
 *
 * NDSLOTSIZE	size of one bitmap word, in bytes.
 * NDENTRIES	number of descriptors (bits) tracked by one bitmap word.
 * NDSLOT(x)	index of the bitmap word that holds descriptor x.
 * NDBIT(x)	mask selecting descriptor x's bit within its word.
 * NDSLOTS(x)	number of bitmap words needed to track x descriptors
 *		(x / NDENTRIES, rounded up).
 */
#define NDFILE		20
#define NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
#define NDSLOT(x)	((x) / NDENTRIES)
#define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)

/*
 * Storage required per open file descriptor: one struct file pointer
 * plus one byte of per-descriptor flags.
 */
#define OFILESIZE (sizeof(struct file *) + sizeof(char))
151
/*
 * Storage to hold unused ofiles that need to be reclaimed.
 *
 * Entries are chained on a singly-linked list (see fd_free in
 * struct filedesc0); each one records a descriptor-array pointer in
 * ft_table.
 */
struct freetable {
	struct file	**ft_table;	/* array awaiting reclamation */
	SLIST_ENTRY(freetable) ft_next;	/* linkage on the fd_free list */
};
159
/*
 * Basic allocation of descriptors:
 * one of the above, plus arrays for NDFILE descriptors.
 *
 * fd_fd is the first member, followed by the list of tables to reclaim
 * and the built-in arrays sized for NDFILE descriptors.
 */
struct filedesc0 {
	struct	filedesc fd_fd;
	/*
	 * ofiles which need to be reclaimed on free.
	 */
	SLIST_HEAD(,freetable) fd_free;
	/*
	 * These arrays are used when the number of open files is
	 * <= NDFILE, and are then pointed to by the pointers above.
	 */
	struct	file *fd_dfiles[NDFILE];	/* built-in file pointers */
	char	fd_dfileflags[NDFILE];		/* built-in per-fd flags */
	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];	/* built-in in-use bitmap */
};
178
/*
 * Descriptor management.
 *
 * File-scope state shared by the descriptor code.
 */
volatile int openfiles;			/* actual number of open files */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
/*
 * Hook invoked by do_dup() and kern_close() when the object behind a
 * descriptor being closed has type DTYPE_MQUEUE.
 * NOTE(review): presumably installed by the mqueue code - confirm.
 */
void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/* A mutex to protect the association between a proc and filedesc. */
static struct mtx	fdesc_mtx;
188
189/*
190 * Find the first zero bit in the given bitmap, starting at low and not
191 * exceeding size - 1.
192 */
193static int
194fd_first_free(struct filedesc *fdp, int low, int size)
195{
196	NDSLOTTYPE *map = fdp->fd_map;
197	NDSLOTTYPE mask;
198	int off, maxoff;
199
200	if (low >= size)
201		return (low);
202
203	off = NDSLOT(low);
204	if (low % NDENTRIES) {
205		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
206		if ((mask &= ~map[off]) != 0UL)
207			return (off * NDENTRIES + ffsl(mask) - 1);
208		++off;
209	}
210	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
211		if (map[off] != ~0UL)
212			return (off * NDENTRIES + ffsl(~map[off]) - 1);
213	return (size);
214}
215
216/*
217 * Find the highest non-zero bit in the given bitmap, starting at low and
218 * not exceeding size - 1.
219 */
220static int
221fd_last_used(struct filedesc *fdp, int low, int size)
222{
223	NDSLOTTYPE *map = fdp->fd_map;
224	NDSLOTTYPE mask;
225	int off, minoff;
226
227	if (low >= size)
228		return (-1);
229
230	off = NDSLOT(size);
231	if (size % NDENTRIES) {
232		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
233		if ((mask &= map[off]) != 0)
234			return (off * NDENTRIES + flsl(mask) - 1);
235		--off;
236	}
237	for (minoff = NDSLOT(low); off >= minoff; --off)
238		if (map[off] != 0)
239			return (off * NDENTRIES + flsl(map[off]) - 1);
240	return (low - 1);
241}
242
243static int
244fdisused(struct filedesc *fdp, int fd)
245{
246        KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
247            ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
248	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
249}
250
251/*
252 * Mark a file descriptor as used.
253 */
254static void
255fdused(struct filedesc *fdp, int fd)
256{
257
258	FILEDESC_XLOCK_ASSERT(fdp);
259	KASSERT(!fdisused(fdp, fd),
260	    ("fd already used"));
261
262	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
263	if (fd > fdp->fd_lastfile)
264		fdp->fd_lastfile = fd;
265	if (fd == fdp->fd_freefile)
266		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
267}
268
269/*
270 * Mark a file descriptor as unused.
271 */
272static void
273fdunused(struct filedesc *fdp, int fd)
274{
275
276	FILEDESC_XLOCK_ASSERT(fdp);
277	KASSERT(fdisused(fdp, fd),
278	    ("fd is already unused"));
279	KASSERT(fdp->fd_ofiles[fd] == NULL,
280	    ("fd is still in use"));
281
282	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
283	if (fd < fdp->fd_freefile)
284		fdp->fd_freefile = fd;
285	if (fd == fdp->fd_lastfile)
286		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
287}
288
289/*
290 * System calls on descriptors.
291 */
292#ifndef _SYS_SYSPROTO_H_
293struct getdtablesize_args {
294	int	dummy;
295};
296#endif
297/* ARGSUSED */
298int
299sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
300{
301	struct proc *p = td->td_proc;
302	uint64_t lim;
303
304	PROC_LOCK(p);
305	td->td_retval[0] =
306	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
307	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
308	PROC_UNLOCK(p);
309	if (lim < td->td_retval[0])
310		td->td_retval[0] = lim;
311	return (0);
312}
313
314/*
315 * Duplicate a file descriptor to a particular value.
316 *
317 * Note: keep in mind that a potential race condition exists when closing
318 * descriptors from a shared descriptor table (via rfork).
319 */
320#ifndef _SYS_SYSPROTO_H_
321struct dup2_args {
322	u_int	from;
323	u_int	to;
324};
325#endif
326/* ARGSUSED */
327int
328sys_dup2(struct thread *td, struct dup2_args *uap)
329{
330
331	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
332		    td->td_retval));
333}
334
335/*
336 * Duplicate a file descriptor.
337 */
338#ifndef _SYS_SYSPROTO_H_
339struct dup_args {
340	u_int	fd;
341};
342#endif
343/* ARGSUSED */
344int
345sys_dup(struct thread *td, struct dup_args *uap)
346{
347
348	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
349}
350
351/*
352 * The file control system call.
353 */
354#ifndef _SYS_SYSPROTO_H_
355struct fcntl_args {
356	int	fd;
357	int	cmd;
358	long	arg;
359};
360#endif
361/* ARGSUSED */
362int
363sys_fcntl(struct thread *td, struct fcntl_args *uap)
364{
365	struct flock fl;
366	struct oflock ofl;
367	intptr_t arg;
368	int error;
369	int cmd;
370
371	error = 0;
372	cmd = uap->cmd;
373	switch (uap->cmd) {
374	case F_OGETLK:
375	case F_OSETLK:
376	case F_OSETLKW:
377		/*
378		 * Convert old flock structure to new.
379		 */
380		error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
381		fl.l_start = ofl.l_start;
382		fl.l_len = ofl.l_len;
383		fl.l_pid = ofl.l_pid;
384		fl.l_type = ofl.l_type;
385		fl.l_whence = ofl.l_whence;
386		fl.l_sysid = 0;
387
388		switch (uap->cmd) {
389		case F_OGETLK:
390		    cmd = F_GETLK;
391		    break;
392		case F_OSETLK:
393		    cmd = F_SETLK;
394		    break;
395		case F_OSETLKW:
396		    cmd = F_SETLKW;
397		    break;
398		}
399		arg = (intptr_t)&fl;
400		break;
401        case F_GETLK:
402        case F_SETLK:
403        case F_SETLKW:
404	case F_SETLK_REMOTE:
405                error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
406                arg = (intptr_t)&fl;
407                break;
408	default:
409		arg = uap->arg;
410		break;
411	}
412	if (error)
413		return (error);
414	error = kern_fcntl(td, uap->fd, cmd, arg);
415	if (error)
416		return (error);
417	if (uap->cmd == F_OGETLK) {
418		ofl.l_start = fl.l_start;
419		ofl.l_len = fl.l_len;
420		ofl.l_pid = fl.l_pid;
421		ofl.l_type = fl.l_type;
422		ofl.l_whence = fl.l_whence;
423		error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
424	} else if (uap->cmd == F_GETLK) {
425		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
426	}
427	return (error);
428}
429
430static inline struct file *
431fdtofp(int fd, struct filedesc *fdp)
432{
433
434	FILEDESC_LOCK_ASSERT(fdp);
435
436	if ((unsigned)fd >= fdp->fd_nfiles)
437		return (NULL);
438
439	return (fdp->fd_ofiles[fd]);
440}
441
442static inline int
443fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp)
444{
445
446	*fpp = fdtofp(fd, fdp);
447	if (*fpp == NULL)
448		return (EBADF);
449
450#ifdef CAPABILITIES
451	if ((*fpp)->f_type == DTYPE_CAPABILITY) {
452		int err = cap_funwrap(*fpp, rights, fpp);
453		if (err != 0) {
454			*fpp = NULL;
455			return (err);
456		}
457	}
458#endif /* CAPABILITIES */
459	return (0);
460}
461
/*
 * Kernel implementation of the fcntl commands for descriptor fd of the
 * process that owns td.
 *
 * td	calling thread; integer results are stored in td->td_retval[0].
 * fd	descriptor to operate on.
 * cmd	fcntl command (F_DUPFD, F_GETFD, F_SETLK, ...).
 * arg	command argument.  For F_GETLK, F_SETLK, F_SETLKW and
 *	F_SETLK_REMOTE it is cast to a struct flock pointer and
 *	dereferenced directly (sys_fcntl() passes the address of its own
 *	kernel copy); the structure may be modified in place.
 *
 * Returns 0 on success or an errno value; unknown commands yield EINVAL.
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp;
	struct proc *p;
	char *pop;
	struct vnode *vp;
	int error, flg, tmp;
	int vfslocked;
	u_int old, new;
	uint64_t bsize;

	vfslocked = 0;
	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;

	switch (cmd) {
	case F_DUPFD:
		/* Lowest free descriptor >= arg, fcntl-style error codes. */
		tmp = arg;
		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
		break;

	case F_DUP2FD:
		/* Duplicate onto exactly descriptor arg. */
		tmp = arg;
		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
		break;

	case F_GETFD:
		/* Report the close-on-exec flag of the descriptor. */
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		pop = &fdp->fd_ofileflags[fd];
		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFD:
		/* Set or clear close-on-exec; needs the exclusive lock. */
		FILEDESC_XLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_XUNLOCK(fdp);
			error = EBADF;
			break;
		}
		pop = &fdp->fd_ofileflags[fd];
		*pop = (*pop &~ UF_EXCLOSE) |
		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
		FILEDESC_XUNLOCK(fdp);
		break;

	case F_GETFL:
		/* Report the file status flags in open(2) form. */
		FILEDESC_SLOCK(fdp);
		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
		if (error != 0) {
			FILEDESC_SUNLOCK(fdp);
			break;
		}
		td->td_retval[0] = OFLAGS(fp->f_flag);
		FILEDESC_SUNLOCK(fdp);
		break;

	case F_SETFL:
		FILEDESC_SLOCK(fdp);
		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
		if (error != 0) {
			FILEDESC_SUNLOCK(fdp);
			break;
		}
		/* Keep the file alive while the filedesc lock is dropped. */
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		/*
		 * Replace the FCNTLFLAGS bits of f_flag with those derived
		 * from arg, retrying if f_flag changed underneath us.
		 */
		do {
			tmp = flg = fp->f_flag;
			tmp &= ~FCNTLFLAGS;
			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
		/* Push the new non-blocking and async state to the object. */
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error) {
			fdrop(fp, td);
			break;
		}
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		/*
		 * FIOASYNC failed: clear FNONBLOCK again and tell the
		 * object so; the FIOASYNC error is what gets returned.
		 */
		atomic_clear_int(&fp->f_flag, FNONBLOCK);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		/* Ask the object for its owner via FIOGETOWN. */
		FILEDESC_SLOCK(fdp);
		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
		if (error != 0) {
			FILEDESC_SUNLOCK(fdp);
			break;
		}
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		/* Hand the new owner to the object via FIOSETOWN. */
		FILEDESC_SLOCK(fdp);
		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
		if (error != 0) {
			FILEDESC_SUNLOCK(fdp);
			break;
		}
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLK_REMOTE:
		/* Requires PRIV_NFS_LOCKD; uses F_REMOTE instead of F_POSIX. */
		error = priv_check(td, PRIV_NFS_LOCKD);
		if (error)
			return (error);
		flg = F_REMOTE;
		goto do_setlk;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
	do_setlk:
		FILEDESC_SLOCK(fdp);
		error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
		if (error != 0) {
			FILEDESC_SUNLOCK(fdp);
			break;
		}
		/* Advisory locks are only supported on vnodes. */
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_whence == SEEK_CUR) {
			/*
			 * Make l_start relative to the current file offset,
			 * refusing if the addition would overflow.
			 */
			if (fp->f_offset < 0 ||
			    (flp->l_start > 0 &&
			     fp->f_offset > OFF_MAX - flp->l_start)) {
				FILEDESC_SUNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}

		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		vp = fp->f_vnode;
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		switch (flp->l_type) {
		case F_RDLCK:
			/* A read lock needs the file open for reading. */
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			/* Mark the leader as holding advisory locks. */
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			/* A write lock needs the file open for writing. */
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, flg);
			break;
		case F_UNLCKSYS:
			/*
			 * Temporary api for testing remote lock
			 * infrastructure.
			 */
			if (flg != F_REMOTE) {
				error = EINVAL;
				break;
			}
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
			    F_UNLCKSYS, flp, flg);
			break;
		default:
			error = EINVAL;
			break;
		}
		VFS_UNLOCK_GIANT(vfslocked);
		vfslocked = 0;
		/*
		 * Check for race with close: if the descriptor no longer
		 * refers to fp, drop every lock this process leader holds
		 * on the vnode (whole-file F_UNLCK).
		 *
		 * NOTE(review): with CAPABILITIES, fdunwrap() stores the
		 * result of cap_funwrap() in fp while fd_ofiles[fd] keeps
		 * the original table entry, so for a capability descriptor
		 * this comparison looks like it would always differ -
		 * confirm.
		 */
		FILEDESC_SLOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			FILEDESC_SUNLOCK(fdp);
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
			VFS_UNLOCK_GIANT(vfslocked);
			vfslocked = 0;
		} else
			FILEDESC_SUNLOCK(fdp);
		fdrop(fp, td);
		break;

	case F_GETLK:
		FILEDESC_SLOCK(fdp);
		error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
		if (error != 0) {
			FILEDESC_SUNLOCK(fdp);
			break;
		}
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		/* Only the three standard lock types may be queried. */
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			FILEDESC_SUNLOCK(fdp);
			error = EINVAL;
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			/* Guard the offset addition in both directions. */
			if ((flp->l_start > 0 &&
			    fp->f_offset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     fp->f_offset < OFF_MIN - flp->l_start)) {
				FILEDESC_SUNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}
		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		vp = fp->f_vnode;
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		VFS_UNLOCK_GIANT(vfslocked);
		vfslocked = 0;
		fdrop(fp, td);
		break;

	case F_RDAHEAD:
		/* Boolean form: any non-zero arg means 128 KB of read-ahead. */
		arg = arg ? 128 * 1024: 0;
		/* FALLTHROUGH */
	case F_READAHEAD:
		FILEDESC_SLOCK(fdp);
		if ((fp = fdtofp(fd, fdp)) == NULL) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_SUNLOCK(fdp);
			error = EBADF;
			break;
		}
		fhold(fp);
		FILEDESC_SUNLOCK(fdp);
		if (arg != 0) {
			/*
			 * Enable: read the mount's I/O size with the vnode
			 * share-locked, store arg rounded up to whole blocks
			 * in f_seqcount, then set FRDAHEAD.
			 */
			vp = fp->f_vnode;
			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
			error = vn_lock(vp, LK_SHARED);
			if (error != 0)
				goto readahead_vnlock_fail;
			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
			VOP_UNLOCK(vp, 0);
			fp->f_seqcount = (arg + bsize - 1) / bsize;
			do {
				new = old = fp->f_flag;
				new |= FRDAHEAD;
			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
readahead_vnlock_fail:
			VFS_UNLOCK_GIANT(vfslocked);
			vfslocked = 0;
		} else {
			/* Disable: just clear FRDAHEAD. */
			do {
				new = old = fp->f_flag;
				new &= ~FRDAHEAD;
			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
		}
		fdrop(fp, td);
		break;

	default:
		error = EINVAL;
		break;
	}
	/* Every path above resets vfslocked to 0 after unlocking. */
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
790
/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 *
 * td		calling thread; the table of td->td_proc is modified.
 * flags	DUP_FIXED and/or DUP_FCNTL (0 for plain dup()).
 * old		descriptor to duplicate.
 * new		with DUP_FIXED the exact target descriptor, otherwise the
 *		lowest acceptable descriptor number passed to fdalloc().
 * retval	receives the descriptor that now refers to the file.
 *
 * The duplicate never inherits the close-on-exec flag.  With DUP_FIXED
 * a file already open at `new' is closed; with old == new nothing is
 * done beyond validating old.
 *
 * Returns 0 on success; EBADF for a bad `old', EBADF or (DUP_FCNTL)
 * EINVAL for an out-of-range `new', EMFILE when the RACCT limit refuses
 * the table growth, or the error from fdalloc().
 */
static int
do_dup(struct thread *td, int flags, int old, int new,
    register_t *retval)
{
	struct filedesc *fdp;
	struct proc *p;
	struct file *fp;
	struct file *delfp;
	int error, holdleaders, maxfd;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
	 * return EINVAL when the new descriptor is out of bounds.
	 */
	if (old < 0)
		return (EBADF);
	if (new < 0)
		return (flags & DUP_FCNTL ? EINVAL : EBADF);
	PROC_LOCK(p);
	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
	PROC_UNLOCK(p);
	if (new >= maxfd)
		return (flags & DUP_FCNTL ? EINVAL : EBADF);

	FILEDESC_XLOCK(fdp);
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
		FILEDESC_XUNLOCK(fdp);
		return (EBADF);
	}
	/* dup2(fd, fd) on a valid descriptor is a successful no-op. */
	if (flags & DUP_FIXED && old == new) {
		*retval = new;
		FILEDESC_XUNLOCK(fdp);
		return (0);
	}
	/* This reference becomes the one owned by the new table slot. */
	fp = fdp->fd_ofiles[old];
	fhold(fp);

	/*
	 * If the caller specified a file descriptor, make sure the file
	 * table is large enough to hold it, and grab it.  Otherwise, just
	 * allocate a new descriptor the usual way.  Since the filedesc
	 * lock may be temporarily dropped in the process, we have to look
	 * out for a race.
	 */
	if (flags & DUP_FIXED) {
		if (new >= fdp->fd_nfiles) {
			/*
			 * The resource limits are here instead of e.g.
			 * fdalloc(), because the file descriptor table may be
			 * shared between processes, so we can't really use
			 * racct_add()/racct_sub().  Instead of counting the
			 * number of actually allocated descriptors, just put
			 * the limit on the size of the file descriptor table.
			 */
#ifdef RACCT
			PROC_LOCK(p);
			error = racct_set(p, RACCT_NOFILE, new + 1);
			PROC_UNLOCK(p);
			if (error != 0) {
				FILEDESC_XUNLOCK(fdp);
				fdrop(fp, td);
				return (EMFILE);
			}
#endif
			fdgrowtable(fdp, new + 1);
		}
		/* A previously free target slot must be marked in-use. */
		if (fdp->fd_ofiles[new] == NULL)
			fdused(fdp, new);
	} else {
		if ((error = fdalloc(td, new, &new)) != 0) {
			FILEDESC_XUNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
	}

	KASSERT(fp == fdp->fd_ofiles[old], ("old fd has been modified"));
	KASSERT(old != new, ("new fd is same as old"));

	/*
	 * Save info on the descriptor being overwritten.  We cannot close
	 * it without introducing an ownership race for the slot, since we
	 * need to drop the filedesc lock to call closef().
	 *
	 * XXX this duplicates parts of close().
	 */
	delfp = fdp->fd_ofiles[new];
	holdleaders = 0;
	if (delfp != NULL && td->td_proc->p_fdtol != NULL) {
		/*
		 * Ask fdfree() to sleep to ensure that all relevant
		 * process leaders can be traversed in closef().
		 */
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	}

	/*
	 * Duplicate the source descriptor.
	 */
	fdp->fd_ofiles[new] = fp;
	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	*retval = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 *
	 * XXX this duplicates parts of close().
	 */
	if (delfp != NULL) {
		knote_fdclose(td, new);
		/*
		 * When we're closing an fd with a capability, we need to
		 * notify mqueue if the underlying object is of type mqueue.
		 */
		/* fp is reused as scratch here; it is not needed afterwards. */
		(void)cap_funwrap(delfp, 0, &fp);
		if (fp->f_type == DTYPE_MQUEUE)
			mq_fdclose(td, new, fp);
		FILEDESC_XUNLOCK(fdp);
		(void) closef(delfp, td);
		if (holdleaders) {
			/* Undo the hold taken above and wake any sleeper. */
			FILEDESC_XLOCK(fdp);
			fdp->fd_holdleaderscount--;
			if (fdp->fd_holdleaderscount == 0 &&
			    fdp->fd_holdleaderswakeup != 0) {
				fdp->fd_holdleaderswakeup = 0;
				wakeup(&fdp->fd_holdleaderscount);
			}
			FILEDESC_XUNLOCK(fdp);
		}
	} else {
		FILEDESC_XUNLOCK(fdp);
	}
	return (0);
}
937
938/*
939 * If sigio is on the list associated with a process or process group,
940 * disable signalling from the device, remove sigio from the list and
941 * free sigio.
942 */
943void
944funsetown(struct sigio **sigiop)
945{
946	struct sigio *sigio;
947
948	SIGIO_LOCK();
949	sigio = *sigiop;
950	if (sigio == NULL) {
951		SIGIO_UNLOCK();
952		return;
953	}
954	*(sigio->sio_myref) = NULL;
955	if ((sigio)->sio_pgid < 0) {
956		struct pgrp *pg = (sigio)->sio_pgrp;
957		PGRP_LOCK(pg);
958		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
959			     sigio, sio_pgsigio);
960		PGRP_UNLOCK(pg);
961	} else {
962		struct proc *p = (sigio)->sio_proc;
963		PROC_LOCK(p);
964		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
965			     sigio, sio_pgsigio);
966		PROC_UNLOCK(p);
967	}
968	SIGIO_UNLOCK();
969	crfree(sigio->sio_ucred);
970	free(sigio, M_SIGIO);
971}
972
973/*
974 * Free a list of sigio structures.
975 * We only need to lock the SIGIO_LOCK because we have made ourselves
976 * inaccessible to callers of fsetown and therefore do not need to lock
977 * the proc or pgrp struct for the list manipulation.
978 */
979void
980funsetownlst(struct sigiolst *sigiolst)
981{
982	struct proc *p;
983	struct pgrp *pg;
984	struct sigio *sigio;
985
986	sigio = SLIST_FIRST(sigiolst);
987	if (sigio == NULL)
988		return;
989	p = NULL;
990	pg = NULL;
991
992	/*
993	 * Every entry of the list should belong
994	 * to a single proc or pgrp.
995	 */
996	if (sigio->sio_pgid < 0) {
997		pg = sigio->sio_pgrp;
998		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
999	} else /* if (sigio->sio_pgid > 0) */ {
1000		p = sigio->sio_proc;
1001		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1002	}
1003
1004	SIGIO_LOCK();
1005	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
1006		*(sigio->sio_myref) = NULL;
1007		if (pg != NULL) {
1008			KASSERT(sigio->sio_pgid < 0,
1009			    ("Proc sigio in pgrp sigio list"));
1010			KASSERT(sigio->sio_pgrp == pg,
1011			    ("Bogus pgrp in sigio list"));
1012			PGRP_LOCK(pg);
1013			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
1014			    sio_pgsigio);
1015			PGRP_UNLOCK(pg);
1016		} else /* if (p != NULL) */ {
1017			KASSERT(sigio->sio_pgid > 0,
1018			    ("Pgrp sigio in proc sigio list"));
1019			KASSERT(sigio->sio_proc == p,
1020			    ("Bogus proc in sigio list"));
1021			PROC_LOCK(p);
1022			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
1023			    sio_pgsigio);
1024			PROC_UNLOCK(p);
1025		}
1026		SIGIO_UNLOCK();
1027		crfree(sigio->sio_ucred);
1028		free(sigio, M_SIGIO);
1029		SIGIO_LOCK();
1030	}
1031	SIGIO_UNLOCK();
1032}
1033
/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 *
 * pgid		> 0 names a process, < 0 names process group -pgid, and
 *		0 simply removes the current owner.
 * sigiop	location of the owner pointer to update; the new sigio
 *		remembers it in sio_myref.
 *
 * Returns 0 on success, ESRCH if the target does not exist (or the
 * target process is exiting), or EPERM if the target is in a different
 * session than the caller.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	/* proctree_lock stays held (shared) until the new owner is linked. */
	sx_slock(&proctree_lock);
	if (pgid > 0) {
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		/* NOTE(review): pfind() evidently returns proc locked - confirm. */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		/* NOTE(review): pgfind() evidently returns pgrp locked - confirm. */
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Checks passed: drop the previous owner before linking the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new owner through the caller's pointer. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	/* Reached with proctree_lock held and sigio not linked anywhere. */
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	free(sigio, M_SIGIO);
	return (ret);
}
1141
1142/*
1143 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1144 */
1145pid_t
1146fgetown(sigiop)
1147	struct sigio **sigiop;
1148{
1149	pid_t pgid;
1150
1151	SIGIO_LOCK();
1152	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1153	SIGIO_UNLOCK();
1154	return (pgid);
1155}
1156
/*
 * Close a file descriptor.
 *
 * close system call entry point: hands uap->fd to kern_close() and
 * returns its result.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int	fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}
1174
1175int
1176kern_close(td, fd)
1177	struct thread *td;
1178	int fd;
1179{
1180	struct filedesc *fdp;
1181	struct file *fp, *fp_object;
1182	int error;
1183	int holdleaders;
1184
1185	error = 0;
1186	holdleaders = 0;
1187	fdp = td->td_proc->p_fd;
1188
1189	AUDIT_SYSCLOSE(td, fd);
1190
1191	FILEDESC_XLOCK(fdp);
1192	if ((unsigned)fd >= fdp->fd_nfiles ||
1193	    (fp = fdp->fd_ofiles[fd]) == NULL) {
1194		FILEDESC_XUNLOCK(fdp);
1195		return (EBADF);
1196	}
1197	fdp->fd_ofiles[fd] = NULL;
1198	fdp->fd_ofileflags[fd] = 0;
1199	fdunused(fdp, fd);
1200	if (td->td_proc->p_fdtol != NULL) {
1201		/*
1202		 * Ask fdfree() to sleep to ensure that all relevant
1203		 * process leaders can be traversed in closef().
1204		 */
1205		fdp->fd_holdleaderscount++;
1206		holdleaders = 1;
1207	}
1208
1209	/*
1210	 * We now hold the fp reference that used to be owned by the
1211	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1212	 * knote_fdclose to prevent a race of the fd getting opened, a knote
1213	 * added, and deleteing a knote for the new fd.
1214	 */
1215	knote_fdclose(td, fd);
1216
1217	/*
1218	 * When we're closing an fd with a capability, we need to notify
1219	 * mqueue if the underlying object is of type mqueue.
1220	 */
1221	(void)cap_funwrap(fp, 0, &fp_object);
1222	if (fp_object->f_type == DTYPE_MQUEUE)
1223		mq_fdclose(td, fd, fp_object);
1224	FILEDESC_XUNLOCK(fdp);
1225
1226	error = closef(fp, td);
1227	if (holdleaders) {
1228		FILEDESC_XLOCK(fdp);
1229		fdp->fd_holdleaderscount--;
1230		if (fdp->fd_holdleaderscount == 0 &&
1231		    fdp->fd_holdleaderswakeup != 0) {
1232			fdp->fd_holdleaderswakeup = 0;
1233			wakeup(&fdp->fd_holdleaderscount);
1234		}
1235		FILEDESC_XUNLOCK(fdp);
1236	}
1237	return (error);
1238}
1239
1240/*
1241 * Close open file descriptors.
1242 */
1243#ifndef _SYS_SYSPROTO_H_
1244struct closefrom_args {
1245	int	lowfd;
1246};
1247#endif
1248/* ARGSUSED */
1249int
1250sys_closefrom(struct thread *td, struct closefrom_args *uap)
1251{
1252	struct filedesc *fdp;
1253	int fd;
1254
1255	fdp = td->td_proc->p_fd;
1256	AUDIT_ARG_FD(uap->lowfd);
1257
1258	/*
1259	 * Treat negative starting file descriptor values identical to
1260	 * closefrom(0) which closes all files.
1261	 */
1262	if (uap->lowfd < 0)
1263		uap->lowfd = 0;
1264	FILEDESC_SLOCK(fdp);
1265	for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
1266		if (fdp->fd_ofiles[fd] != NULL) {
1267			FILEDESC_SUNLOCK(fdp);
1268			(void)kern_close(td, fd);
1269			FILEDESC_SLOCK(fdp);
1270		}
1271	}
1272	FILEDESC_SUNLOCK(fdp);
1273	return (0);
1274}
1275
#if defined(COMPAT_43)
/*
 * COMPAT_43 fstat: obtain the status of descriptor uap->fd with
 * kern_fstat(), convert it with cvtstat() and copy the resulting
 * struct ostat out to uap->sb.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct stat st;
	struct ostat ost;
	int error;

	error = kern_fstat(td, uap->fd, &st);
	if (error != 0)
		return (error);
	cvtstat(&st, &ost);
	return (copyout(&ost, uap->sb, sizeof(ost)));
}
#endif /* COMPAT_43 */
1302
1303/*
1304 * Return status information about a file descriptor.
1305 */
1306#ifndef _SYS_SYSPROTO_H_
1307struct fstat_args {
1308	int	fd;
1309	struct	stat *sb;
1310};
1311#endif
1312/* ARGSUSED */
1313int
1314sys_fstat(struct thread *td, struct fstat_args *uap)
1315{
1316	struct stat ub;
1317	int error;
1318
1319	error = kern_fstat(td, uap->fd, &ub);
1320	if (error == 0)
1321		error = copyout(&ub, uap->sb, sizeof(ub));
1322	return (error);
1323}
1324
1325int
1326kern_fstat(struct thread *td, int fd, struct stat *sbp)
1327{
1328	struct file *fp;
1329	int error;
1330
1331	AUDIT_ARG_FD(fd);
1332
1333	if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0)
1334		return (error);
1335
1336	AUDIT_ARG_FILE(td->td_proc, fp);
1337
1338	error = fo_stat(fp, sbp, td->td_ucred, td);
1339	fdrop(fp, td);
1340#ifdef KTRACE
1341	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1342		ktrstat(sbp);
1343#endif
1344	return (error);
1345}
1346
1347/*
1348 * Return status information about a file descriptor.
1349 */
1350#ifndef _SYS_SYSPROTO_H_
1351struct nfstat_args {
1352	int	fd;
1353	struct	nstat *sb;
1354};
1355#endif
1356/* ARGSUSED */
1357int
1358sys_nfstat(struct thread *td, struct nfstat_args *uap)
1359{
1360	struct nstat nub;
1361	struct stat ub;
1362	int error;
1363
1364	error = kern_fstat(td, uap->fd, &ub);
1365	if (error == 0) {
1366		cvtnstat(&ub, &nub);
1367		error = copyout(&nub, uap->sb, sizeof(nub));
1368	}
1369	return (error);
1370}
1371
1372/*
1373 * Return pathconf information about a file descriptor.
1374 */
1375#ifndef _SYS_SYSPROTO_H_
1376struct fpathconf_args {
1377	int	fd;
1378	int	name;
1379};
1380#endif
1381/* ARGSUSED */
1382int
1383sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1384{
1385	struct file *fp;
1386	struct vnode *vp;
1387	int error;
1388
1389	if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0)
1390		return (error);
1391
1392	/* If asynchronous I/O is available, it works for all descriptors. */
1393	if (uap->name == _PC_ASYNC_IO) {
1394		td->td_retval[0] = async_io_version;
1395		goto out;
1396	}
1397	vp = fp->f_vnode;
1398	if (vp != NULL) {
1399		int vfslocked;
1400		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1401		vn_lock(vp, LK_SHARED | LK_RETRY);
1402		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1403		VOP_UNLOCK(vp, 0);
1404		VFS_UNLOCK_GIANT(vfslocked);
1405	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1406		if (uap->name != _PC_PIPE_BUF) {
1407			error = EINVAL;
1408		} else {
1409			td->td_retval[0] = PIPE_BUF;
1410		error = 0;
1411		}
1412	} else {
1413		error = EOPNOTSUPP;
1414	}
1415out:
1416	fdrop(fp, td);
1417	return (error);
1418}
1419
1420/*
1421 * Grow the file table to accomodate (at least) nfd descriptors.
1422 */
1423static void
1424fdgrowtable(struct filedesc *fdp, int nfd)
1425{
1426	struct filedesc0 *fdp0;
1427	struct freetable *fo;
1428	struct file **ntable;
1429	struct file **otable;
1430	char *nfileflags;
1431	int nnfiles, onfiles;
1432	NDSLOTTYPE *nmap;
1433
1434	FILEDESC_XLOCK_ASSERT(fdp);
1435
1436	KASSERT(fdp->fd_nfiles > 0,
1437	    ("zero-length file table"));
1438
1439	/* compute the size of the new table */
1440	onfiles = fdp->fd_nfiles;
1441	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1442	if (nnfiles <= onfiles)
1443		/* the table is already large enough */
1444		return;
1445
1446	/* allocate a new table and (if required) new bitmaps */
1447	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
1448	    M_FILEDESC, M_ZERO | M_WAITOK);
1449	nfileflags = (char *)&ntable[nnfiles];
1450	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
1451		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE,
1452		    M_FILEDESC, M_ZERO | M_WAITOK);
1453	else
1454		nmap = NULL;
1455
1456	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
1457	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
1458	otable = fdp->fd_ofiles;
1459	fdp->fd_ofileflags = nfileflags;
1460	fdp->fd_ofiles = ntable;
1461	/*
1462	 * We must preserve ofiles until the process exits because we can't
1463	 * be certain that no threads have references to the old table via
1464	 * _fget().
1465	 */
1466	if (onfiles > NDFILE) {
1467		fo = (struct freetable *)&otable[onfiles];
1468		fdp0 = (struct filedesc0 *)fdp;
1469		fo->ft_table = otable;
1470		SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
1471	}
1472	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
1473		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
1474		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
1475			free(fdp->fd_map, M_FILEDESC);
1476		fdp->fd_map = nmap;
1477	}
1478	fdp->fd_nfiles = nnfiles;
1479}
1480
1481/*
1482 * Allocate a file descriptor for the process.
1483 */
1484int
1485fdalloc(struct thread *td, int minfd, int *result)
1486{
1487	struct proc *p = td->td_proc;
1488	struct filedesc *fdp = p->p_fd;
1489	int fd = -1, maxfd;
1490#ifdef RACCT
1491	int error;
1492#endif
1493
1494	FILEDESC_XLOCK_ASSERT(fdp);
1495
1496	if (fdp->fd_freefile > minfd)
1497		minfd = fdp->fd_freefile;
1498
1499	PROC_LOCK(p);
1500	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1501	PROC_UNLOCK(p);
1502
1503	/*
1504	 * Search the bitmap for a free descriptor.  If none is found, try
1505	 * to grow the file table.  Keep at it until we either get a file
1506	 * descriptor or run into process or system limits; fdgrowtable()
1507	 * may drop the filedesc lock, so we're in a race.
1508	 */
1509	for (;;) {
1510		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1511		if (fd >= maxfd)
1512			return (EMFILE);
1513		if (fd < fdp->fd_nfiles)
1514			break;
1515#ifdef RACCT
1516		PROC_LOCK(p);
1517		error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
1518		PROC_UNLOCK(p);
1519		if (error != 0)
1520			return (EMFILE);
1521#endif
1522		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
1523	}
1524
1525	/*
1526	 * Perform some sanity checks, then mark the file descriptor as
1527	 * used and return it to the caller.
1528	 */
1529	KASSERT(!fdisused(fdp, fd),
1530	    ("fd_first_free() returned non-free descriptor"));
1531	KASSERT(fdp->fd_ofiles[fd] == NULL, ("file descriptor isn't free"));
1532	KASSERT(fdp->fd_ofileflags[fd] == 0, ("file flags are set"));
1533	fdused(fdp, fd);
1534	*result = fd;
1535	return (0);
1536}
1537
1538/*
1539 * Check to see whether n user file descriptors are available to the process
1540 * p.
1541 */
1542int
1543fdavail(struct thread *td, int n)
1544{
1545	struct proc *p = td->td_proc;
1546	struct filedesc *fdp = td->td_proc->p_fd;
1547	int i, lim, last;
1548
1549	FILEDESC_LOCK_ASSERT(fdp);
1550
1551	/*
1552	 * XXX: This is only called from uipc_usrreq.c:unp_externalize();
1553	 *      call racct_add() from there instead of dealing with containers
1554	 *      here.
1555	 */
1556	PROC_LOCK(p);
1557	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1558	PROC_UNLOCK(p);
1559	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1560		return (1);
1561	last = min(fdp->fd_nfiles, lim);
1562	for (i = fdp->fd_freefile; i < last; i++) {
1563		if (fdp->fd_ofiles[i] == NULL && --n <= 0)
1564			return (1);
1565	}
1566	return (0);
1567}
1568
1569/*
1570 * Create a new open file structure and allocate a file decriptor for the
1571 * process that refers to it.  We add one reference to the file for the
1572 * descriptor table and one reference for resultfp. This is to prevent us
1573 * being preempted and the entry in the descriptor table closed after we
1574 * release the FILEDESC lock.
1575 */
1576int
1577falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1578{
1579	struct file *fp;
1580	int error, fd;
1581
1582	error = falloc_noinstall(td, &fp);
1583	if (error)
1584		return (error);		/* no reference held on error */
1585
1586	error = finstall(td, fp, &fd, flags);
1587	if (error) {
1588		fdrop(fp, td);		/* one reference (fp only) */
1589		return (error);
1590	}
1591
1592	if (resultfp != NULL)
1593		*resultfp = fp;		/* copy out result */
1594	else
1595		fdrop(fp, td);		/* release local reference */
1596
1597	if (resultfd != NULL)
1598		*resultfd = fd;
1599
1600	return (0);
1601}
1602
1603/*
1604 * Create a new open file structure without allocating a file descriptor.
1605 */
1606int
1607falloc_noinstall(struct thread *td, struct file **resultfp)
1608{
1609	struct file *fp;
1610	int maxuserfiles = maxfiles - (maxfiles / 20);
1611	static struct timeval lastfail;
1612	static int curfail;
1613
1614	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1615
1616	if ((openfiles >= maxuserfiles &&
1617	    priv_check(td, PRIV_MAXFILES) != 0) ||
1618	    openfiles >= maxfiles) {
1619		if (ppsratecheck(&lastfail, &curfail, 1)) {
1620			printf("kern.maxfiles limit exceeded by uid %i, "
1621			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1622		}
1623		return (ENFILE);
1624	}
1625	atomic_add_int(&openfiles, 1);
1626	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1627	refcount_init(&fp->f_count, 1);
1628	fp->f_cred = crhold(td->td_ucred);
1629	fp->f_ops = &badfileops;
1630	fp->f_data = NULL;
1631	fp->f_vnode = NULL;
1632	*resultfp = fp;
1633	return (0);
1634}
1635
1636/*
1637 * Install a file in a file descriptor table.
1638 */
1639int
1640finstall(struct thread *td, struct file *fp, int *fd, int flags)
1641{
1642	struct filedesc *fdp = td->td_proc->p_fd;
1643	int error;
1644
1645	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1646	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1647
1648	FILEDESC_XLOCK(fdp);
1649	if ((error = fdalloc(td, 0, fd))) {
1650		FILEDESC_XUNLOCK(fdp);
1651		return (error);
1652	}
1653	fhold(fp);
1654	fdp->fd_ofiles[*fd] = fp;
1655	if ((flags & O_CLOEXEC) != 0)
1656		fdp->fd_ofileflags[*fd] |= UF_EXCLOSE;
1657	FILEDESC_XUNLOCK(fdp);
1658	return (0);
1659}
1660
1661/*
1662 * Build a new filedesc structure from another.
1663 * Copy the current, root, and jail root vnode references.
1664 */
1665struct filedesc *
1666fdinit(struct filedesc *fdp)
1667{
1668	struct filedesc0 *newfdp;
1669
1670	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1671	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1672	if (fdp != NULL) {
1673		FILEDESC_XLOCK(fdp);
1674		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1675		if (newfdp->fd_fd.fd_cdir)
1676			VREF(newfdp->fd_fd.fd_cdir);
1677		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1678		if (newfdp->fd_fd.fd_rdir)
1679			VREF(newfdp->fd_fd.fd_rdir);
1680		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1681		if (newfdp->fd_fd.fd_jdir)
1682			VREF(newfdp->fd_fd.fd_jdir);
1683		FILEDESC_XUNLOCK(fdp);
1684	}
1685
1686	/* Create the file descriptor table. */
1687	newfdp->fd_fd.fd_refcnt = 1;
1688	newfdp->fd_fd.fd_holdcnt = 1;
1689	newfdp->fd_fd.fd_cmask = CMASK;
1690	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1691	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
1692	newfdp->fd_fd.fd_nfiles = NDFILE;
1693	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1694	newfdp->fd_fd.fd_lastfile = -1;
1695	return (&newfdp->fd_fd);
1696}
1697
1698static struct filedesc *
1699fdhold(struct proc *p)
1700{
1701	struct filedesc *fdp;
1702
1703	mtx_lock(&fdesc_mtx);
1704	fdp = p->p_fd;
1705	if (fdp != NULL)
1706		fdp->fd_holdcnt++;
1707	mtx_unlock(&fdesc_mtx);
1708	return (fdp);
1709}
1710
1711static void
1712fddrop(struct filedesc *fdp)
1713{
1714	struct filedesc0 *fdp0;
1715	struct freetable *ft;
1716	int i;
1717
1718	mtx_lock(&fdesc_mtx);
1719	i = --fdp->fd_holdcnt;
1720	mtx_unlock(&fdesc_mtx);
1721	if (i > 0)
1722		return;
1723
1724	FILEDESC_LOCK_DESTROY(fdp);
1725	fdp0 = (struct filedesc0 *)fdp;
1726	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1727		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1728		free(ft->ft_table, M_FILEDESC);
1729	}
1730	free(fdp, M_FILEDESC);
1731}
1732
1733/*
1734 * Share a filedesc structure.
1735 */
1736struct filedesc *
1737fdshare(struct filedesc *fdp)
1738{
1739
1740	FILEDESC_XLOCK(fdp);
1741	fdp->fd_refcnt++;
1742	FILEDESC_XUNLOCK(fdp);
1743	return (fdp);
1744}
1745
1746/*
1747 * Unshare a filedesc structure, if necessary by making a copy
1748 */
1749void
1750fdunshare(struct proc *p, struct thread *td)
1751{
1752
1753	FILEDESC_XLOCK(p->p_fd);
1754	if (p->p_fd->fd_refcnt > 1) {
1755		struct filedesc *tmp;
1756
1757		FILEDESC_XUNLOCK(p->p_fd);
1758		tmp = fdcopy(p->p_fd);
1759		fdfree(td);
1760		p->p_fd = tmp;
1761	} else
1762		FILEDESC_XUNLOCK(p->p_fd);
1763}
1764
1765/*
1766 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
1767 * this is to ease callers, not catch errors.
1768 */
1769struct filedesc *
1770fdcopy(struct filedesc *fdp)
1771{
1772	struct filedesc *newfdp;
1773	int i;
1774
1775	/* Certain daemons might not have file descriptors. */
1776	if (fdp == NULL)
1777		return (NULL);
1778
1779	newfdp = fdinit(fdp);
1780	FILEDESC_SLOCK(fdp);
1781	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1782		FILEDESC_SUNLOCK(fdp);
1783		FILEDESC_XLOCK(newfdp);
1784		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1785		FILEDESC_XUNLOCK(newfdp);
1786		FILEDESC_SLOCK(fdp);
1787	}
1788	/* copy all passable descriptors (i.e. not kqueue) */
1789	newfdp->fd_freefile = -1;
1790	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1791		if (fdisused(fdp, i) &&
1792		    (fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) &&
1793		    fdp->fd_ofiles[i]->f_ops != &badfileops) {
1794			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
1795			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
1796			fhold(newfdp->fd_ofiles[i]);
1797			newfdp->fd_lastfile = i;
1798		} else {
1799			if (newfdp->fd_freefile == -1)
1800				newfdp->fd_freefile = i;
1801		}
1802	}
1803	newfdp->fd_cmask = fdp->fd_cmask;
1804	FILEDESC_SUNLOCK(fdp);
1805	FILEDESC_XLOCK(newfdp);
1806	for (i = 0; i <= newfdp->fd_lastfile; ++i)
1807		if (newfdp->fd_ofiles[i] != NULL)
1808			fdused(newfdp, i);
1809	if (newfdp->fd_freefile == -1)
1810		newfdp->fd_freefile = i;
1811	FILEDESC_XUNLOCK(newfdp);
1812	return (newfdp);
1813}
1814
1815/*
1816 * Release a filedesc structure.
 *
 * Drops the calling process's reference (fd_refcnt) on its descriptor
 * table.  If the process has a filedesc_to_leader entry (p_fdtol), that
 * entry's reference is dropped first: when it is the last one, POSIX
 * advisory locks owned by the leader are released on every vnode-backed
 * descriptor, and the function sleeps until fd_holdleaderscount and
 * fdl_holdcount have drained.  When fd_refcnt reaches zero every open
 * file is closed with closef(), table and bitmap storage that was
 * allocated (rather than embedded) is freed, the directory vnodes are
 * released, and the hold on the structure is dropped with fddrop().
1817 */
1818void
1819fdfree(struct thread *td)
1820{
1821	struct filedesc *fdp;
1822	int i, locked;
1823	struct filedesc_to_leader *fdtol;
1824	struct file *fp;
1825	struct vnode *cdir, *jdir, *rdir, *vp;
1826	struct flock lf;
1827
1828	/* Certain daemons might not have file descriptors. */
1829	fdp = td->td_proc->p_fd;
1830	if (fdp == NULL)
1831		return;
1832
1833#ifdef RACCT
1834	PROC_LOCK(td->td_proc);
1835	racct_set(td->td_proc, RACCT_NOFILE, 0);
1836	PROC_UNLOCK(td->td_proc);
1837#endif
1838
1839	/* Check for special need to clear POSIX style locks */
1840	fdtol = td->td_proc->p_fdtol;
1841	if (fdtol != NULL) {
1842		FILEDESC_XLOCK(fdp);
1843		KASSERT(fdtol->fdl_refcount > 0,
1844			("filedesc_to_refcount botch: fdl_refcount=%d",
1845			 fdtol->fdl_refcount));
1846		if (fdtol->fdl_refcount == 1 &&
1847		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1848			for (i = 0; i <= fdp->fd_lastfile; i++) {
1849				fp = fdp->fd_ofiles[i];
1850				if (fp == NULL || fp->f_type != DTYPE_VNODE)
1851					continue;
				/*
				 * Hold fp so it stays valid while the
				 * filedesc lock is dropped around
				 * VOP_ADVLOCK(); the hold is released once
				 * the lock has been retaken.
				 */
1852				fhold(fp);
1853				FILEDESC_XUNLOCK(fdp);
				/* l_start 0 and l_len 0: unlock the whole file. */
1854				lf.l_whence = SEEK_SET;
1855				lf.l_start = 0;
1856				lf.l_len = 0;
1857				lf.l_type = F_UNLCK;
1858				vp = fp->f_vnode;
1859				locked = VFS_LOCK_GIANT(vp->v_mount);
1860				(void) VOP_ADVLOCK(vp,
1861						   (caddr_t)td->td_proc->
1862						   p_leader,
1863						   F_UNLCK,
1864						   &lf,
1865						   F_POSIX);
1866				VFS_UNLOCK_GIANT(locked);
1867				FILEDESC_XLOCK(fdp);
1868				fdrop(fp, td);
1869			}
1870		}
1871	retry:
		/*
		 * Every sleep below releases and retakes the filedesc lock,
		 * so both conditions are re-evaluated from the top after
		 * each wakeup.
		 */
1872		if (fdtol->fdl_refcount == 1) {
1873			if (fdp->fd_holdleaderscount > 0 &&
1874			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1875				/*
1876				 * close() or do_dup() has cleared a reference
1877				 * in a shared file descriptor table.
1878				 */
1879				fdp->fd_holdleaderswakeup = 1;
1880				sx_sleep(&fdp->fd_holdleaderscount,
1881				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
1882				goto retry;
1883			}
1884			if (fdtol->fdl_holdcount > 0) {
1885				/*
1886				 * Ensure that fdtol->fdl_leader remains
1887				 * valid in closef().
1888				 */
1889				fdtol->fdl_wakeup = 1;
1890				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
1891				    "fdlhold", 0);
1892				goto retry;
1893			}
1894		}
		/*
		 * Drop this process's reference on the fdtol entry.  When
		 * nothing references or holds it any more, unlink it from
		 * its doubly linked list and free it after unlocking;
		 * otherwise fdtol is set to NULL so that it is not freed.
		 */
1895		fdtol->fdl_refcount--;
1896		if (fdtol->fdl_refcount == 0 &&
1897		    fdtol->fdl_holdcount == 0) {
1898			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
1899			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
1900		} else
1901			fdtol = NULL;
1902		td->td_proc->p_fdtol = NULL;
1903		FILEDESC_XUNLOCK(fdp);
1904		if (fdtol != NULL)
1905			free(fdtol, M_FILEDESC_TO_LEADER);
1906	}
	/* Drop our reference; if others still share the table we are done. */
1907	FILEDESC_XLOCK(fdp);
1908	i = --fdp->fd_refcnt;
1909	FILEDESC_XUNLOCK(fdp);
1910	if (i > 0)
1911		return;
1912
	/*
	 * Last reference: close every open file.  Each slot is cleared
	 * under the lock, and closef() is called with the lock released.
	 */
1913	for (i = 0; i <= fdp->fd_lastfile; i++) {
1914		fp = fdp->fd_ofiles[i];
1915		if (fp != NULL) {
1916			FILEDESC_XLOCK(fdp);
1917			fdp->fd_ofiles[i] = NULL;
1918			FILEDESC_XUNLOCK(fdp);
1919			(void) closef(fp, td);
1920		}
1921	}
1922	FILEDESC_XLOCK(fdp);
1923
1924	/* XXX This should happen earlier. */
1925	mtx_lock(&fdesc_mtx);
1926	td->td_proc->p_fd = NULL;
1927	mtx_unlock(&fdesc_mtx);
1928
	/*
	 * Free the current table and bitmap only if they were allocated,
	 * i.e. are larger than the initial NDFILE-sized ones.  Older tables
	 * queued on fd_free by fdgrowtable() are freed in fddrop().
	 */
1929	if (fdp->fd_nfiles > NDFILE)
1930		free(fdp->fd_ofiles, M_FILEDESC);
1931	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
1932		free(fdp->fd_map, M_FILEDESC);
1933
1934	fdp->fd_nfiles = 0;
1935
	/*
	 * Detach the directory vnodes under the lock; the references are
	 * released below, after the lock has been dropped.
	 */
1936	cdir = fdp->fd_cdir;
1937	fdp->fd_cdir = NULL;
1938	rdir = fdp->fd_rdir;
1939	fdp->fd_rdir = NULL;
1940	jdir = fdp->fd_jdir;
1941	fdp->fd_jdir = NULL;
1942	FILEDESC_XUNLOCK(fdp);
1943
1944	if (cdir) {
1945		locked = VFS_LOCK_GIANT(cdir->v_mount);
1946		vrele(cdir);
1947		VFS_UNLOCK_GIANT(locked);
1948	}
1949	if (rdir) {
1950		locked = VFS_LOCK_GIANT(rdir->v_mount);
1951		vrele(rdir);
1952		VFS_UNLOCK_GIANT(locked);
1953	}
1954	if (jdir) {
1955		locked = VFS_LOCK_GIANT(jdir->v_mount);
1956		vrele(jdir);
1957		VFS_UNLOCK_GIANT(locked);
1958	}
1959
	/* Release the hold on the structure itself. */
1960	fddrop(fdp);
1961}
1962
1963/*
1964 * For setugid programs, we don't want to people to use that setugidness
1965 * to generate error messages which write to a file which otherwise would
1966 * otherwise be off-limits to the process.  We check for filesystems where
1967 * the vnode can change out from under us after execve (like [lin]procfs).
1968 *
1969 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1970 * sufficient.  We also don't check for setugidness since we know we are.
1971 */
1972static int
1973is_unsafe(struct file *fp)
1974{
1975	if (fp->f_type == DTYPE_VNODE) {
1976		struct vnode *vp = fp->f_vnode;
1977
1978		if ((vp->v_vflag & VV_PROCDEP) != 0)
1979			return (1);
1980	}
1981	return (0);
1982}
1983
1984/*
1985 * Make this setguid thing safe, if at all possible.
1986 */
1987void
1988setugidsafety(struct thread *td)
1989{
1990	struct filedesc *fdp;
1991	int i;
1992
1993	/* Certain daemons might not have file descriptors. */
1994	fdp = td->td_proc->p_fd;
1995	if (fdp == NULL)
1996		return;
1997
1998	/*
1999	 * Note: fdp->fd_ofiles may be reallocated out from under us while
2000	 * we are blocked in a close.  Be careful!
2001	 */
2002	FILEDESC_XLOCK(fdp);
2003	for (i = 0; i <= fdp->fd_lastfile; i++) {
2004		if (i > 2)
2005			break;
2006		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
2007			struct file *fp;
2008
2009			knote_fdclose(td, i);
2010			/*
2011			 * NULL-out descriptor prior to close to avoid
2012			 * a race while close blocks.
2013			 */
2014			fp = fdp->fd_ofiles[i];
2015			fdp->fd_ofiles[i] = NULL;
2016			fdp->fd_ofileflags[i] = 0;
2017			fdunused(fdp, i);
2018			FILEDESC_XUNLOCK(fdp);
2019			(void) closef(fp, td);
2020			FILEDESC_XLOCK(fdp);
2021		}
2022	}
2023	FILEDESC_XUNLOCK(fdp);
2024}
2025
2026/*
2027 * If a specific file object occupies a specific file descriptor, close the
2028 * file descriptor entry and drop a reference on the file object.  This is a
2029 * convenience function to handle a subsequent error in a function that calls
2030 * falloc() that handles the race that another thread might have closed the
2031 * file descriptor out from under the thread creating the file object.
2032 */
2033void
2034fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2035{
2036
2037	FILEDESC_XLOCK(fdp);
2038	if (fdp->fd_ofiles[idx] == fp) {
2039		fdp->fd_ofiles[idx] = NULL;
2040		fdunused(fdp, idx);
2041		FILEDESC_XUNLOCK(fdp);
2042		fdrop(fp, td);
2043	} else
2044		FILEDESC_XUNLOCK(fdp);
2045}
2046
2047/*
2048 * Close any files on exec?
2049 */
2050void
2051fdcloseexec(struct thread *td)
2052{
2053	struct filedesc *fdp;
2054	int i;
2055
2056	/* Certain daemons might not have file descriptors. */
2057	fdp = td->td_proc->p_fd;
2058	if (fdp == NULL)
2059		return;
2060
2061	FILEDESC_XLOCK(fdp);
2062
2063	/*
2064	 * We cannot cache fd_ofiles or fd_ofileflags since operations
2065	 * may block and rip them out from under us.
2066	 */
2067	for (i = 0; i <= fdp->fd_lastfile; i++) {
2068		if (fdp->fd_ofiles[i] != NULL &&
2069		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
2070		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
2071			struct file *fp;
2072
2073			knote_fdclose(td, i);
2074			/*
2075			 * NULL-out descriptor prior to close to avoid
2076			 * a race while close blocks.
2077			 */
2078			fp = fdp->fd_ofiles[i];
2079			fdp->fd_ofiles[i] = NULL;
2080			fdp->fd_ofileflags[i] = 0;
2081			fdunused(fdp, i);
2082			if (fp->f_type == DTYPE_MQUEUE)
2083				mq_fdclose(td, i, fp);
2084			FILEDESC_XUNLOCK(fdp);
2085			(void) closef(fp, td);
2086			FILEDESC_XLOCK(fdp);
2087		}
2088	}
2089	FILEDESC_XUNLOCK(fdp);
2090}
2091
2092/*
2093 * It is unsafe for set[ug]id processes to be started with file
2094 * descriptors 0..2 closed, as these descriptors are given implicit
2095 * significance in the Standard C library.  fdcheckstd() will create a
2096 * descriptor referencing /dev/null for each of stdin, stdout, and
2097 * stderr that is not already open.
2098 */
2099int
2100fdcheckstd(struct thread *td)
2101{
2102	struct filedesc *fdp;
2103	register_t retval, save;
2104	int i, error, devnull;
2105
2106	fdp = td->td_proc->p_fd;
2107	if (fdp == NULL)
2108		return (0);
2109	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2110	devnull = -1;
2111	error = 0;
2112	for (i = 0; i < 3; i++) {
2113		if (fdp->fd_ofiles[i] != NULL)
2114			continue;
2115		if (devnull < 0) {
2116			save = td->td_retval[0];
2117			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2118			    O_RDWR, 0);
2119			devnull = td->td_retval[0];
2120			td->td_retval[0] = save;
2121			if (error)
2122				break;
2123			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2124		} else {
2125			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2126			if (error != 0)
2127				break;
2128		}
2129	}
2130	return (error);
2131}
2132
2133/*
2134 * Internal form of close.  Decrement reference count on file structure.
2135 * Note: td may be NULL when closing a file that was being passed in a
2136 * message.
2137 *
2138 * XXXRW: Giant is not required for the caller, but often will be held; this
2139 * makes it moderately likely the Giant will be recursed in the VFS case.
 *
 * Returns the value of fdrop() on fp.
2140 */
2141int
2142closef(struct file *fp, struct thread *td)
2143{
2144	struct vnode *vp;
2145	struct flock lf;
2146	struct filedesc_to_leader *fdtol;
2147	struct filedesc *fdp;
2148	struct file *fp_object;
2149
2150	/*
2151	 * POSIX record locking dictates that any close releases ALL
2152	 * locks owned by this process.  This is handled by setting
2153	 * a flag in the unlock to free ONLY locks obeying POSIX
2154	 * semantics, and not to free BSD-style file locks.
2155	 * If the descriptor was in a message, POSIX-style locks
2156	 * aren't passed with the descriptor, and the thread pointer
2157	 * will be NULL.  Callers should be careful only to pass a
2158	 * NULL thread pointer when there really is no owning
2159	 * context that might have locks, or the locks will be
2160	 * leaked.
2161	 *
2162	 * If this is a capability, we do lock processing under the underlying
2163	 * node, not the capability itself.
2164	 */
2165	(void)cap_funwrap(fp, 0, &fp_object);
2166	if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
2167		int vfslocked;
2168
2169		vp = fp_object->f_vnode;
2170		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		/*
		 * Release the POSIX locks owned by this process's leader,
		 * but only if the leader has P_ADVLOCK set.  l_start 0 and
		 * l_len 0 cover the whole file.
		 */
2171		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2172			lf.l_whence = SEEK_SET;
2173			lf.l_start = 0;
2174			lf.l_len = 0;
2175			lf.l_type = F_UNLCK;
2176			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2177					   F_UNLCK, &lf, F_POSIX);
2178		}
2179		fdtol = td->td_proc->p_fdtol;
2180		if (fdtol != NULL) {
2181			/*
2182			 * Handle special case where file descriptor table is
2183			 * shared between multiple process leaders.
2184			 */
2185			fdp = td->td_proc->p_fd;
2186			FILEDESC_XLOCK(fdp);
			/*
			 * Visit every other entry on the list, stopping when
			 * the walk comes back to this process's own entry.
			 */
2187			for (fdtol = fdtol->fdl_next;
2188			     fdtol != td->td_proc->p_fdtol;
2189			     fdtol = fdtol->fdl_next) {
2190				if ((fdtol->fdl_leader->p_flag &
2191				     P_ADVLOCK) == 0)
2192					continue;
				/*
				 * fdl_holdcount pins this entry while the
				 * filedesc lock is dropped for the
				 * VOP_ADVLOCK() call; fdfree() sleeps while
				 * the count is non-zero and is woken below
				 * when it returns to zero.
				 */
2193				fdtol->fdl_holdcount++;
2194				FILEDESC_XUNLOCK(fdp);
2195				lf.l_whence = SEEK_SET;
2196				lf.l_start = 0;
2197				lf.l_len = 0;
2198				lf.l_type = F_UNLCK;
2199				vp = fp_object->f_vnode;
2200				(void) VOP_ADVLOCK(vp,
2201						   (caddr_t)fdtol->fdl_leader,
2202						   F_UNLCK, &lf, F_POSIX);
2203				FILEDESC_XLOCK(fdp);
2204				fdtol->fdl_holdcount--;
2205				if (fdtol->fdl_holdcount == 0 &&
2206				    fdtol->fdl_wakeup != 0) {
2207					fdtol->fdl_wakeup = 0;
2208					wakeup(fdtol);
2209				}
2210			}
2211			FILEDESC_XUNLOCK(fdp);
2212		}
2213		VFS_UNLOCK_GIANT(vfslocked);
2214	}
	/* Drop the reference on fp itself, not on the unwrapped object. */
2215	return (fdrop(fp, td));
2216}
2217
2218/*
2219 * Initialize the file pointer with the specified properties.
2220 *
2221 * The ops are set with release semantics to be certain that the flags, type,
2222 * and data are visible when ops is.  This is to prevent ops methods from being
2223 * called with bad data.
2224 */
2225void
2226finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2227{
2228	fp->f_data = data;
2229	fp->f_flag = flag;
2230	fp->f_type = type;
2231	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2232}
2233
/*
 * Look up descriptor fd in table fdp without taking the filedesc lock.
 *
 * Returns the file with one additional reference, which the caller must
 * release with fdrop(), or NULL when fd is out of range or the slot is
 * empty.
 */
2234struct file *
2235fget_unlocked(struct filedesc *fdp, int fd)
2236{
2237	struct file *fp;
2238	u_int count;
2239
2240	if (fd < 0 || fd >= fdp->fd_nfiles)
2241		return (NULL);
2242	/*
2243	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
2244	 * never raising a refcount above 0.  To accomplish this we have
2245	 * to use a cmpset loop rather than an atomic_add.  The descriptor
2246	 * must be re-verified once we acquire a reference to be certain
2247	 * that the identity is still correct and we did not lose a race
2248	 * due to preemption.
2249	 */
2250	for (;;) {
2251		fp = fdp->fd_ofiles[fd];
2252		if (fp == NULL)
2253			break;
2254		count = fp->f_count;
		/*
		 * A count of zero is never raised (see above); re-read
		 * the slot instead.
		 */
2255		if (count == 0)
2256			continue;
2257		/*
2258		 * Use an acquire barrier to prevent caching of fd_ofiles
2259		 * so it is refreshed for verification.
2260		 */
		/* f_count changed since it was sampled: start over. */
2261		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
2262			continue;
		/* Reference taken; accept it only if the slot still holds fp. */
2263		if (fp == fdp->fd_ofiles[fd])
2264			break;
		/* The slot changed meanwhile: drop the reference and retry. */
2265		fdrop(fp, curthread);
2266	}
2267
2268	return (fp);
2269}
2270
2271/*
2272 * Extract the file pointer associated with the specified descriptor for the
2273 * current user process.
2274 *
2275 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2276 * returned.
2277 *
2278 * If the FGET_GETCAP flag is set, the capability itself will be returned.
2279 * Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL.
2280 * Otherwise, if the file is a capability, its rights will be checked against
2281 * the capability rights mask, and if successful, the object will be unwrapped.
2282 *
2283 * If an error occured the non-zero error is returned and *fpp is set to
2284 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2285 * responsible for fdrop().
2286 */
2287#define	FGET_GETCAP	0x00000001
2288static __inline int
2289_fget(struct thread *td, int fd, struct file **fpp, int flags,
2290    cap_rights_t needrights, cap_rights_t *haverightsp, u_char *maxprotp,
2291    int fget_flags)
2292{
2293	struct filedesc *fdp;
2294	struct file *fp;
2295#ifdef CAPABILITIES
2296	struct file *fp_fromcap;
2297	int error;
2298#endif
2299
2300	*fpp = NULL;
2301	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2302		return (EBADF);
2303	if ((fp = fget_unlocked(fdp, fd)) == NULL)
2304		return (EBADF);
2305	if (fp->f_ops == &badfileops) {
2306		fdrop(fp, td);
2307		return (EBADF);
2308	}
2309
2310#ifdef CAPABILITIES
2311	/*
2312	 * If this is a capability, what rights does it have?
2313	 */
2314	if (haverightsp != NULL) {
2315		if (fp->f_type == DTYPE_CAPABILITY)
2316			*haverightsp = cap_rights(fp);
2317		else
2318			*haverightsp = CAP_MASK_VALID;
2319	}
2320
2321	/*
2322	 * If a capability has been requested, return the capability directly.
2323	 * Otherwise, check capability rights, extract the underlying object,
2324	 * and check its access flags.
2325	 */
2326	if (fget_flags & FGET_GETCAP) {
2327		if (fp->f_type != DTYPE_CAPABILITY) {
2328			fdrop(fp, td);
2329			return (EINVAL);
2330		}
2331	} else {
2332		if (maxprotp == NULL)
2333			error = cap_funwrap(fp, needrights, &fp_fromcap);
2334		else
2335			error = cap_funwrap_mmap(fp, needrights, maxprotp,
2336			    &fp_fromcap);
2337		if (error) {
2338			fdrop(fp, td);
2339			return (error);
2340		}
2341
2342		/*
2343		 * If we've unwrapped a file, drop the original capability
2344		 * and hold the new descriptor.  fp after this point refers to
2345		 * the actual (unwrapped) object, not the capability.
2346		 */
2347		if (fp != fp_fromcap) {
2348			fhold(fp_fromcap);
2349			fdrop(fp, td);
2350			fp = fp_fromcap;
2351		}
2352	}
2353#else /* !CAPABILITIES */
2354	KASSERT(fp->f_type != DTYPE_CAPABILITY,
2355	    ("%s: saw capability", __func__));
2356	if (maxprotp != NULL)
2357		*maxprotp = VM_PROT_ALL;
2358#endif /* CAPABILITIES */
2359
2360	/*
2361	 * FREAD and FWRITE failure return EBADF as per POSIX.
2362	 *
2363	 * Only one flag, or 0, may be specified.
2364	 */
2365	if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
2366	    (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
2367		fdrop(fp, td);
2368		return (EBADF);
2369	}
2370	*fpp = fp;
2371	return (0);
2372}
2373
2374int
2375fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
2376{
2377
2378	return(_fget(td, fd, fpp, 0, rights, NULL, NULL, 0));
2379}
2380
2381int
2382fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp,
2383    struct file **fpp)
2384{
2385
2386	return (_fget(td, fd, fpp, 0, rights, NULL, maxprotp, 0));
2387}
2388
2389int
2390fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
2391{
2392
2393	return(_fget(td, fd, fpp, FREAD, rights, NULL, NULL, 0));
2394}
2395
2396int
2397fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
2398{
2399
2400	return (_fget(td, fd, fpp, FWRITE, rights, NULL, NULL, 0));
2401}
2402
2403/*
2404 * Unlike the other fget() calls, which accept and check capability rights
2405 * but never return capabilities, fgetcap() returns the capability but doesn't
2406 * check capability rights.
2407 */
2408int
2409fgetcap(struct thread *td, int fd, struct file **fpp)
2410{
2411
2412	return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP));
2413}
2414
2415
2416/*
2417 * Like fget() but loads the underlying vnode, or returns an error if the
2418 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2419 * never have VM objects.  The returned vnode will be vref()'d.
2420 *
2421 * XXX: what about the unused flags ?
2422 */
2423static __inline int
2424_fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights,
2425    cap_rights_t *haverightsp, struct vnode **vpp)
2426{
2427	struct file *fp;
2428	int error;
2429
2430	*vpp = NULL;
2431	if ((error = _fget(td, fd, &fp, flags, needrights, haverightsp,
2432	    NULL, 0)) != 0)
2433		return (error);
2434	if (fp->f_vnode == NULL) {
2435		error = EINVAL;
2436	} else {
2437		*vpp = fp->f_vnode;
2438		vref(*vpp);
2439	}
2440	fdrop(fp, td);
2441
2442	return (error);
2443}
2444
2445int
2446fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
2447{
2448
2449	return (_fgetvp(td, fd, 0, rights, NULL, vpp));
2450}
2451
2452int
2453fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have,
2454    struct vnode **vpp)
2455{
2456	return (_fgetvp(td, fd, 0, need, have, vpp));
2457}
2458
2459int
2460fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
2461{
2462
2463	return (_fgetvp(td, fd, FREAD, rights, NULL, vpp));
2464}
2465
#ifdef notyet
/*
 * As fgetvp(), but the lookup is made with FWRITE so the file must be open
 * for writing.  Only compiled when 'notyet' is defined.
 */
int
fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
    struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, FWRITE, rights, NULL, vpp);
	return (error);
}
#endif
2475
2476/*
2477 * Like fget() but loads the underlying socket, or returns an error if the
2478 * descriptor does not represent a socket.
2479 *
2480 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2481 * in the future.
2482 *
2483 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2484 * on their file descriptor reference to prevent the socket from being free'd
2485 * during use.
2486 */
2487int
2488fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp,
2489    u_int *fflagp)
2490{
2491	struct file *fp;
2492	int error;
2493
2494	*spp = NULL;
2495	if (fflagp != NULL)
2496		*fflagp = 0;
2497	if ((error = _fget(td, fd, &fp, 0, rights, NULL, NULL, 0)) != 0)
2498		return (error);
2499	if (fp->f_type != DTYPE_SOCKET) {
2500		error = ENOTSOCK;
2501	} else {
2502		*spp = fp->f_data;
2503		if (fflagp)
2504			*fflagp = fp->f_flag;
2505		SOCK_LOCK(*spp);
2506		soref(*spp);
2507		SOCK_UNLOCK(*spp);
2508	}
2509	fdrop(fp, td);
2510
2511	return (error);
2512}
2513
2514/*
2515 * Drop the reference count on the socket and XXX release the SX lock in the
2516 * future.  The last reference closes the socket.
2517 *
2518 * Note: fputsock() is deprecated, see comment for fgetsock().
2519 */
2520void
2521fputsock(struct socket *so)
2522{
2523
2524	ACCEPT_LOCK();
2525	SOCK_LOCK(so);
2526	CURVNET_SET(so->so_vnet);
2527	sorele(so);
2528	CURVNET_RESTORE();
2529}
2530
2531/*
2532 * Handle the last reference to a file being closed.
2533 *
2534 * No special capability handling here, as the capability's fo_close will run
2535 * instead of the object here, and perform any necessary drop on the object.
2536 */
2537int
2538_fdrop(struct file *fp, struct thread *td)
2539{
2540	int error;
2541
2542	error = 0;
2543	if (fp->f_count != 0)
2544		panic("fdrop: count %d", fp->f_count);
2545	if (fp->f_ops != &badfileops)
2546		error = fo_close(fp, td);
2547	atomic_subtract_int(&openfiles, 1);
2548	crfree(fp->f_cred);
2549	free(fp->f_advice, M_FADVISE);
2550	uma_zfree(file_zone, fp);
2551
2552	return (error);
2553}
2554
2555/*
2556 * Apply an advisory lock on a file descriptor.
2557 *
2558 * Just attempt to get a record lock of the requested type on the entire file
2559 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2560 */
2561#ifndef _SYS_SYSPROTO_H_
2562struct flock_args {
2563	int	fd;
2564	int	how;
2565};
2566#endif
2567/* ARGSUSED */
2568int
2569sys_flock(struct thread *td, struct flock_args *uap)
2570{
2571	struct file *fp;
2572	struct vnode *vp;
2573	struct flock lf;
2574	int vfslocked;
2575	int error;
2576
2577	if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0)
2578		return (error);
2579	if (fp->f_type != DTYPE_VNODE) {
2580		fdrop(fp, td);
2581		return (EOPNOTSUPP);
2582	}
2583
2584	vp = fp->f_vnode;
2585	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2586	lf.l_whence = SEEK_SET;
2587	lf.l_start = 0;
2588	lf.l_len = 0;
2589	if (uap->how & LOCK_UN) {
2590		lf.l_type = F_UNLCK;
2591		atomic_clear_int(&fp->f_flag, FHASLOCK);
2592		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2593		goto done2;
2594	}
2595	if (uap->how & LOCK_EX)
2596		lf.l_type = F_WRLCK;
2597	else if (uap->how & LOCK_SH)
2598		lf.l_type = F_RDLCK;
2599	else {
2600		error = EBADF;
2601		goto done2;
2602	}
2603	atomic_set_int(&fp->f_flag, FHASLOCK);
2604	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2605	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2606done2:
2607	fdrop(fp, td);
2608	VFS_UNLOCK_GIANT(vfslocked);
2609	return (error);
2610}
2611/*
2612 * Duplicate the specified descriptor to a free descriptor.
2613 */
2614int
2615dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
2616{
2617	struct file *wfp;
2618	struct file *fp;
2619
2620	/*
2621	 * If the to-be-dup'd fd number is greater than the allowed number
2622	 * of file descriptors, or the fd to be dup'd has already been
2623	 * closed, then reject.
2624	 */
2625	FILEDESC_XLOCK(fdp);
2626	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
2627	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
2628		FILEDESC_XUNLOCK(fdp);
2629		return (EBADF);
2630	}
2631
2632	/*
2633	 * There are two cases of interest here.
2634	 *
2635	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2636	 *
2637	 * For ENXIO steal away the file structure from (dfd) and store it in
2638	 * (indx).  (dfd) is effectively closed by this operation.
2639	 *
2640	 * Any other error code is just returned.
2641	 */
2642	switch (error) {
2643	case ENODEV:
2644		/*
2645		 * Check that the mode the file is being opened for is a
2646		 * subset of the mode of the existing descriptor.
2647		 */
2648		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
2649			FILEDESC_XUNLOCK(fdp);
2650			return (EACCES);
2651		}
2652		fp = fdp->fd_ofiles[indx];
2653		fdp->fd_ofiles[indx] = wfp;
2654		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2655		if (fp == NULL)
2656			fdused(fdp, indx);
2657		fhold(wfp);
2658		FILEDESC_XUNLOCK(fdp);
2659		if (fp != NULL)
2660			/*
2661			 * We now own the reference to fp that the ofiles[]
2662			 * array used to own.  Release it.
2663			 */
2664			fdrop(fp, td);
2665		return (0);
2666
2667	case ENXIO:
2668		/*
2669		 * Steal away the file pointer from dfd and stuff it into indx.
2670		 */
2671		fp = fdp->fd_ofiles[indx];
2672		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2673		fdp->fd_ofiles[dfd] = NULL;
2674		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2675		fdp->fd_ofileflags[dfd] = 0;
2676		fdunused(fdp, dfd);
2677		if (fp == NULL)
2678			fdused(fdp, indx);
2679		FILEDESC_XUNLOCK(fdp);
2680
2681		/*
2682		 * We now own the reference to fp that the ofiles[] array
2683		 * used to own.  Release it.
2684		 */
2685		if (fp != NULL)
2686			fdrop(fp, td);
2687		return (0);
2688
2689	default:
2690		FILEDESC_XUNLOCK(fdp);
2691		return (error);
2692	}
2693	/* NOTREACHED */
2694}
2695
2696/*
2697 * Scan all active processes and prisons to see if any of them have a current
2698 * or root directory of `olddp'. If so, replace them with the new mount point.
2699 */
2700void
2701mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2702{
2703	struct filedesc *fdp;
2704	struct prison *pr;
2705	struct proc *p;
2706	int nrele;
2707
2708	if (vrefcnt(olddp) == 1)
2709		return;
2710	nrele = 0;
2711	sx_slock(&allproc_lock);
2712	FOREACH_PROC_IN_SYSTEM(p) {
2713		fdp = fdhold(p);
2714		if (fdp == NULL)
2715			continue;
2716		FILEDESC_XLOCK(fdp);
2717		if (fdp->fd_cdir == olddp) {
2718			vref(newdp);
2719			fdp->fd_cdir = newdp;
2720			nrele++;
2721		}
2722		if (fdp->fd_rdir == olddp) {
2723			vref(newdp);
2724			fdp->fd_rdir = newdp;
2725			nrele++;
2726		}
2727		if (fdp->fd_jdir == olddp) {
2728			vref(newdp);
2729			fdp->fd_jdir = newdp;
2730			nrele++;
2731		}
2732		FILEDESC_XUNLOCK(fdp);
2733		fddrop(fdp);
2734	}
2735	sx_sunlock(&allproc_lock);
2736	if (rootvnode == olddp) {
2737		vref(newdp);
2738		rootvnode = newdp;
2739		nrele++;
2740	}
2741	mtx_lock(&prison0.pr_mtx);
2742	if (prison0.pr_root == olddp) {
2743		vref(newdp);
2744		prison0.pr_root = newdp;
2745		nrele++;
2746	}
2747	mtx_unlock(&prison0.pr_mtx);
2748	sx_slock(&allprison_lock);
2749	TAILQ_FOREACH(pr, &allprison, pr_list) {
2750		mtx_lock(&pr->pr_mtx);
2751		if (pr->pr_root == olddp) {
2752			vref(newdp);
2753			pr->pr_root = newdp;
2754			nrele++;
2755		}
2756		mtx_unlock(&pr->pr_mtx);
2757	}
2758	sx_sunlock(&allprison_lock);
2759	while (nrele--)
2760		vrele(olddp);
2761}
2762
2763struct filedesc_to_leader *
2764filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2765{
2766	struct filedesc_to_leader *fdtol;
2767
2768	fdtol = malloc(sizeof(struct filedesc_to_leader),
2769	       M_FILEDESC_TO_LEADER,
2770	       M_WAITOK);
2771	fdtol->fdl_refcount = 1;
2772	fdtol->fdl_holdcount = 0;
2773	fdtol->fdl_wakeup = 0;
2774	fdtol->fdl_leader = leader;
2775	if (old != NULL) {
2776		FILEDESC_XLOCK(fdp);
2777		fdtol->fdl_next = old->fdl_next;
2778		fdtol->fdl_prev = old;
2779		old->fdl_next = fdtol;
2780		fdtol->fdl_next->fdl_prev = fdtol;
2781		FILEDESC_XUNLOCK(fdp);
2782	} else {
2783		fdtol->fdl_next = fdtol;
2784		fdtol->fdl_prev = fdtol;
2785	}
2786	return (fdtol);
2787}
2788
2789/*
2790 * Get file structures globally.
2791 */
2792static int
2793sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2794{
2795	struct xfile xf;
2796	struct filedesc *fdp;
2797	struct file *fp;
2798	struct proc *p;
2799	int error, n;
2800
2801	error = sysctl_wire_old_buffer(req, 0);
2802	if (error != 0)
2803		return (error);
2804	if (req->oldptr == NULL) {
2805		n = 0;
2806		sx_slock(&allproc_lock);
2807		FOREACH_PROC_IN_SYSTEM(p) {
2808			if (p->p_state == PRS_NEW)
2809				continue;
2810			fdp = fdhold(p);
2811			if (fdp == NULL)
2812				continue;
2813			/* overestimates sparse tables. */
2814			if (fdp->fd_lastfile > 0)
2815				n += fdp->fd_lastfile;
2816			fddrop(fdp);
2817		}
2818		sx_sunlock(&allproc_lock);
2819		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2820	}
2821	error = 0;
2822	bzero(&xf, sizeof(xf));
2823	xf.xf_size = sizeof(xf);
2824	sx_slock(&allproc_lock);
2825	FOREACH_PROC_IN_SYSTEM(p) {
2826		PROC_LOCK(p);
2827		if (p->p_state == PRS_NEW) {
2828			PROC_UNLOCK(p);
2829			continue;
2830		}
2831		if (p_cansee(req->td, p) != 0) {
2832			PROC_UNLOCK(p);
2833			continue;
2834		}
2835		xf.xf_pid = p->p_pid;
2836		xf.xf_uid = p->p_ucred->cr_uid;
2837		PROC_UNLOCK(p);
2838		fdp = fdhold(p);
2839		if (fdp == NULL)
2840			continue;
2841		FILEDESC_SLOCK(fdp);
2842		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
2843			if ((fp = fdp->fd_ofiles[n]) == NULL)
2844				continue;
2845			xf.xf_fd = n;
2846			xf.xf_file = fp;
2847			xf.xf_data = fp->f_data;
2848			xf.xf_vnode = fp->f_vnode;
2849			xf.xf_type = fp->f_type;
2850			xf.xf_count = fp->f_count;
2851			xf.xf_msgcount = 0;
2852			xf.xf_offset = fp->f_offset;
2853			xf.xf_flag = fp->f_flag;
2854			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2855			if (error)
2856				break;
2857		}
2858		FILEDESC_SUNLOCK(fdp);
2859		fddrop(fdp);
2860		if (error)
2861			break;
2862	}
2863	sx_sunlock(&allproc_lock);
2864	return (error);
2865}
2866
2867SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
2868    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2869
2870#ifdef KINFO_OFILE_SIZE
2871CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
2872#endif
2873
2874#ifdef COMPAT_FREEBSD7
2875static int
2876export_vnode_for_osysctl(struct vnode *vp, int type,
2877    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
2878{
2879	int error;
2880	char *fullpath, *freepath;
2881	int vfslocked;
2882
2883	bzero(kif, sizeof(*kif));
2884	kif->kf_structsize = sizeof(*kif);
2885
2886	vref(vp);
2887	kif->kf_fd = type;
2888	kif->kf_type = KF_TYPE_VNODE;
2889	/* This function only handles directories. */
2890	if (vp->v_type != VDIR) {
2891		vrele(vp);
2892		return (ENOTDIR);
2893	}
2894	kif->kf_vnode_type = KF_VTYPE_VDIR;
2895
2896	/*
2897	 * This is not a true file descriptor, so we set a bogus refcount
2898	 * and offset to indicate these fields should be ignored.
2899	 */
2900	kif->kf_ref_count = -1;
2901	kif->kf_offset = -1;
2902
2903	freepath = NULL;
2904	fullpath = "-";
2905	FILEDESC_SUNLOCK(fdp);
2906	vn_fullpath(curthread, vp, &fullpath, &freepath);
2907	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2908	vrele(vp);
2909	VFS_UNLOCK_GIANT(vfslocked);
2910	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2911	if (freepath != NULL)
2912		free(freepath, M_TEMP);
2913	error = SYSCTL_OUT(req, kif, sizeof(*kif));
2914	FILEDESC_SLOCK(fdp);
2915	return (error);
2916}
2917
2918/*
2919 * Get per-process file descriptors for use by procstat(1), et al.
2920 */
2921static int
2922sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
2923{
2924	char *fullpath, *freepath;
2925	struct kinfo_ofile *kif;
2926	struct filedesc *fdp;
2927	int error, i, *name;
2928	struct shmfd *shmfd;
2929	struct socket *so;
2930	struct vnode *vp;
2931	struct file *fp;
2932	struct proc *p;
2933	struct tty *tp;
2934	int vfslocked;
2935
2936	name = (int *)arg1;
2937	if ((p = pfind((pid_t)name[0])) == NULL)
2938		return (ESRCH);
2939	if ((error = p_candebug(curthread, p))) {
2940		PROC_UNLOCK(p);
2941		return (error);
2942	}
2943	fdp = fdhold(p);
2944	PROC_UNLOCK(p);
2945	if (fdp == NULL)
2946		return (ENOENT);
2947	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
2948	FILEDESC_SLOCK(fdp);
2949	if (fdp->fd_cdir != NULL)
2950		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
2951				fdp, req);
2952	if (fdp->fd_rdir != NULL)
2953		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
2954				fdp, req);
2955	if (fdp->fd_jdir != NULL)
2956		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
2957				fdp, req);
2958	for (i = 0; i < fdp->fd_nfiles; i++) {
2959		if ((fp = fdp->fd_ofiles[i]) == NULL)
2960			continue;
2961		bzero(kif, sizeof(*kif));
2962		kif->kf_structsize = sizeof(*kif);
2963		vp = NULL;
2964		so = NULL;
2965		tp = NULL;
2966		shmfd = NULL;
2967		kif->kf_fd = i;
2968
2969#ifdef CAPABILITIES
2970		/*
2971		 * When reporting a capability, most fields will be from the
2972		 * underlying object, but do mark as a capability. With
2973		 * ofiledesc, we don't have a field to export the cap_rights_t,
2974		 * but we do with the new filedesc.
2975		 */
2976		if (fp->f_type == DTYPE_CAPABILITY) {
2977			kif->kf_flags |= KF_FLAG_CAPABILITY;
2978			(void)cap_funwrap(fp, 0, &fp);
2979		}
2980#else
2981		KASSERT(fp->f_type != DTYPE_CAPABILITY,
2982		    ("sysctl_kern_proc_ofiledesc: saw capability"));
2983#endif
2984		switch (fp->f_type) {
2985		case DTYPE_VNODE:
2986			kif->kf_type = KF_TYPE_VNODE;
2987			vp = fp->f_vnode;
2988			break;
2989
2990		case DTYPE_SOCKET:
2991			kif->kf_type = KF_TYPE_SOCKET;
2992			so = fp->f_data;
2993			break;
2994
2995		case DTYPE_PIPE:
2996			kif->kf_type = KF_TYPE_PIPE;
2997			break;
2998
2999		case DTYPE_FIFO:
3000			kif->kf_type = KF_TYPE_FIFO;
3001			vp = fp->f_vnode;
3002			break;
3003
3004		case DTYPE_KQUEUE:
3005			kif->kf_type = KF_TYPE_KQUEUE;
3006			break;
3007
3008		case DTYPE_CRYPTO:
3009			kif->kf_type = KF_TYPE_CRYPTO;
3010			break;
3011
3012		case DTYPE_MQUEUE:
3013			kif->kf_type = KF_TYPE_MQUEUE;
3014			break;
3015
3016		case DTYPE_SHM:
3017			kif->kf_type = KF_TYPE_SHM;
3018			shmfd = fp->f_data;
3019			break;
3020
3021		case DTYPE_SEM:
3022			kif->kf_type = KF_TYPE_SEM;
3023			break;
3024
3025		case DTYPE_PTS:
3026			kif->kf_type = KF_TYPE_PTS;
3027			tp = fp->f_data;
3028			break;
3029
3030#ifdef PROCDESC
3031		case DTYPE_PROCDESC:
3032			kif->kf_type = KF_TYPE_PROCDESC;
3033			break;
3034#endif
3035
3036		default:
3037			kif->kf_type = KF_TYPE_UNKNOWN;
3038			break;
3039		}
3040		kif->kf_ref_count = fp->f_count;
3041		if (fp->f_flag & FREAD)
3042			kif->kf_flags |= KF_FLAG_READ;
3043		if (fp->f_flag & FWRITE)
3044			kif->kf_flags |= KF_FLAG_WRITE;
3045		if (fp->f_flag & FAPPEND)
3046			kif->kf_flags |= KF_FLAG_APPEND;
3047		if (fp->f_flag & FASYNC)
3048			kif->kf_flags |= KF_FLAG_ASYNC;
3049		if (fp->f_flag & FFSYNC)
3050			kif->kf_flags |= KF_FLAG_FSYNC;
3051		if (fp->f_flag & FNONBLOCK)
3052			kif->kf_flags |= KF_FLAG_NONBLOCK;
3053		if (fp->f_flag & O_DIRECT)
3054			kif->kf_flags |= KF_FLAG_DIRECT;
3055		if (fp->f_flag & FHASLOCK)
3056			kif->kf_flags |= KF_FLAG_HASLOCK;
3057		kif->kf_offset = fp->f_offset;
3058		if (vp != NULL) {
3059			vref(vp);
3060			switch (vp->v_type) {
3061			case VNON:
3062				kif->kf_vnode_type = KF_VTYPE_VNON;
3063				break;
3064			case VREG:
3065				kif->kf_vnode_type = KF_VTYPE_VREG;
3066				break;
3067			case VDIR:
3068				kif->kf_vnode_type = KF_VTYPE_VDIR;
3069				break;
3070			case VBLK:
3071				kif->kf_vnode_type = KF_VTYPE_VBLK;
3072				break;
3073			case VCHR:
3074				kif->kf_vnode_type = KF_VTYPE_VCHR;
3075				break;
3076			case VLNK:
3077				kif->kf_vnode_type = KF_VTYPE_VLNK;
3078				break;
3079			case VSOCK:
3080				kif->kf_vnode_type = KF_VTYPE_VSOCK;
3081				break;
3082			case VFIFO:
3083				kif->kf_vnode_type = KF_VTYPE_VFIFO;
3084				break;
3085			case VBAD:
3086				kif->kf_vnode_type = KF_VTYPE_VBAD;
3087				break;
3088			default:
3089				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
3090				break;
3091			}
3092			/*
3093			 * It is OK to drop the filedesc lock here as we will
3094			 * re-validate and re-evaluate its properties when
3095			 * the loop continues.
3096			 */
3097			freepath = NULL;
3098			fullpath = "-";
3099			FILEDESC_SUNLOCK(fdp);
3100			vn_fullpath(curthread, vp, &fullpath, &freepath);
3101			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3102			vrele(vp);
3103			VFS_UNLOCK_GIANT(vfslocked);
3104			strlcpy(kif->kf_path, fullpath,
3105			    sizeof(kif->kf_path));
3106			if (freepath != NULL)
3107				free(freepath, M_TEMP);
3108			FILEDESC_SLOCK(fdp);
3109		}
3110		if (so != NULL) {
3111			struct sockaddr *sa;
3112
3113			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
3114			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3115				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3116				free(sa, M_SONAME);
3117			}
3118			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
3119			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3120				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3121				free(sa, M_SONAME);
3122			}
3123			kif->kf_sock_domain =
3124			    so->so_proto->pr_domain->dom_family;
3125			kif->kf_sock_type = so->so_type;
3126			kif->kf_sock_protocol = so->so_proto->pr_protocol;
3127		}
3128		if (tp != NULL) {
3129			strlcpy(kif->kf_path, tty_devname(tp),
3130			    sizeof(kif->kf_path));
3131		}
3132		if (shmfd != NULL)
3133			shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
3134		error = SYSCTL_OUT(req, kif, sizeof(*kif));
3135		if (error)
3136			break;
3137	}
3138	FILEDESC_SUNLOCK(fdp);
3139	fddrop(fdp);
3140	free(kif, M_TEMP);
3141	return (0);
3142}
3143
3144static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
3145    sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
3146#endif	/* COMPAT_FREEBSD7 */
3147
3148#ifdef KINFO_FILE_SIZE
3149CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
3150#endif
3151
3152static int
3153export_fd_for_sysctl(void *data, int type, int fd, int fflags, int refcnt,
3154    int64_t offset, int fd_is_cap, cap_rights_t fd_cap_rights,
3155    struct kinfo_file *kif, struct sysctl_req *req)
3156{
3157	struct {
3158		int	fflag;
3159		int	kf_fflag;
3160	} fflags_table[] = {
3161		{ FAPPEND, KF_FLAG_APPEND },
3162		{ FASYNC, KF_FLAG_ASYNC },
3163		{ FFSYNC, KF_FLAG_FSYNC },
3164		{ FHASLOCK, KF_FLAG_HASLOCK },
3165		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3166		{ FREAD, KF_FLAG_READ },
3167		{ FWRITE, KF_FLAG_WRITE },
3168		{ O_CREAT, KF_FLAG_CREAT },
3169		{ O_DIRECT, KF_FLAG_DIRECT },
3170		{ O_EXCL, KF_FLAG_EXCL },
3171		{ O_EXEC, KF_FLAG_EXEC },
3172		{ O_EXLOCK, KF_FLAG_EXLOCK },
3173		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3174		{ O_SHLOCK, KF_FLAG_SHLOCK },
3175		{ O_TRUNC, KF_FLAG_TRUNC }
3176	};
3177#define	NFFLAGS	(sizeof(fflags_table) / sizeof(*fflags_table))
3178	struct vnode *vp;
3179	int error, vfslocked;
3180	unsigned int i;
3181
3182	bzero(kif, sizeof(*kif));
3183	switch (type) {
3184	case KF_TYPE_FIFO:
3185	case KF_TYPE_VNODE:
3186		vp = (struct vnode *)data;
3187		error = fill_vnode_info(vp, kif);
3188		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3189		vrele(vp);
3190		VFS_UNLOCK_GIANT(vfslocked);
3191		break;
3192	case KF_TYPE_SOCKET:
3193		error = fill_socket_info((struct socket *)data, kif);
3194		break;
3195	case KF_TYPE_PIPE:
3196		error = fill_pipe_info((struct pipe *)data, kif);
3197		break;
3198	case KF_TYPE_PTS:
3199		error = fill_pts_info((struct tty *)data, kif);
3200		break;
3201	case KF_TYPE_PROCDESC:
3202		error = fill_procdesc_info((struct procdesc *)data, kif);
3203		break;
3204	case KF_TYPE_SHM:
3205		error = fill_shm_info((struct file *)data, kif);
3206		break;
3207	default:
3208		error = 0;
3209	}
3210	if (error == 0)
3211		kif->kf_status |= KF_ATTR_VALID;
3212
3213	/*
3214	 * Translate file access flags.
3215	 */
3216	for (i = 0; i < NFFLAGS; i++)
3217		if (fflags & fflags_table[i].fflag)
3218			kif->kf_flags |=  fflags_table[i].kf_fflag;
3219	if (fd_is_cap)
3220		kif->kf_flags |= KF_FLAG_CAPABILITY;
3221	if (fd_is_cap)
3222		kif->kf_cap_rights = fd_cap_rights;
3223	kif->kf_fd = fd;
3224	kif->kf_type = type;
3225	kif->kf_ref_count = refcnt;
3226	kif->kf_offset = offset;
3227	/* Pack record size down */
3228	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3229	    strlen(kif->kf_path) + 1;
3230	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3231	error = SYSCTL_OUT(req, kif, kif->kf_structsize);
3232	return (error);
3233}
3234
3235/*
3236 * Get per-process file descriptors for use by procstat(1), et al.
3237 */
3238static int
3239sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
3240{
3241	struct file *fp;
3242	struct filedesc *fdp;
3243	struct kinfo_file *kif;
3244	struct proc *p;
3245	struct vnode *cttyvp, *textvp, *tracevp;
3246	size_t oldidx;
3247	int64_t offset;
3248	void *data;
3249	int error, i, *name;
3250	int fd_is_cap, type, refcnt, fflags;
3251	cap_rights_t fd_cap_rights;
3252
3253	name = (int *)arg1;
3254	if ((p = pfind((pid_t)name[0])) == NULL)
3255		return (ESRCH);
3256	if ((error = p_candebug(curthread, p))) {
3257		PROC_UNLOCK(p);
3258		return (error);
3259	}
3260	/* ktrace vnode */
3261	tracevp = p->p_tracevp;
3262	if (tracevp != NULL)
3263		vref(tracevp);
3264	/* text vnode */
3265	textvp = p->p_textvp;
3266	if (textvp != NULL)
3267		vref(textvp);
3268	/* Controlling tty. */
3269	cttyvp = NULL;
3270	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
3271		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
3272		if (cttyvp != NULL)
3273			vref(cttyvp);
3274	}
3275	fdp = fdhold(p);
3276	PROC_UNLOCK(p);
3277	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3278	if (tracevp != NULL)
3279		export_fd_for_sysctl(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
3280		    FREAD | FWRITE, -1, -1, 0, 0, kif, req);
3281	if (textvp != NULL)
3282		export_fd_for_sysctl(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
3283		    FREAD, -1, -1, 0, 0, kif, req);
3284	if (cttyvp != NULL)
3285		export_fd_for_sysctl(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
3286		    FREAD | FWRITE, -1, -1, 0, 0, kif, req);
3287	if (fdp == NULL)
3288		goto fail;
3289	FILEDESC_SLOCK(fdp);
3290	/* working directory */
3291	if (fdp->fd_cdir != NULL) {
3292		vref(fdp->fd_cdir);
3293		data = fdp->fd_cdir;
3294		FILEDESC_SUNLOCK(fdp);
3295		export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
3296		    FREAD, -1, -1, 0, 0, kif, req);
3297		FILEDESC_SLOCK(fdp);
3298	}
3299	/* root directory */
3300	if (fdp->fd_rdir != NULL) {
3301		vref(fdp->fd_rdir);
3302		data = fdp->fd_rdir;
3303		FILEDESC_SUNLOCK(fdp);
3304		export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
3305		    FREAD, -1, -1, 0, 0, kif, req);
3306		FILEDESC_SLOCK(fdp);
3307	}
3308	/* jail directory */
3309	if (fdp->fd_jdir != NULL) {
3310		vref(fdp->fd_jdir);
3311		data = fdp->fd_jdir;
3312		FILEDESC_SUNLOCK(fdp);
3313		export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
3314		    FREAD, -1, -1, 0, 0, kif, req);
3315		FILEDESC_SLOCK(fdp);
3316	}
3317	for (i = 0; i < fdp->fd_nfiles; i++) {
3318		if ((fp = fdp->fd_ofiles[i]) == NULL)
3319			continue;
3320		data = NULL;
3321		fd_is_cap = 0;
3322		fd_cap_rights = 0;
3323
3324#ifdef CAPABILITIES
3325		/*
3326		 * When reporting a capability, most fields will be from the
3327		 * underlying object, but do mark as a capability and export
3328		 * the capability rights mask.
3329		 */
3330		if (fp->f_type == DTYPE_CAPABILITY) {
3331			fd_is_cap = 1;
3332			fd_cap_rights = cap_rights(fp);
3333			(void)cap_funwrap(fp, 0, &fp);
3334		}
3335#else /* !CAPABILITIES */
3336		KASSERT(fp->f_type != DTYPE_CAPABILITY,
3337		    ("sysctl_kern_proc_filedesc: saw capability"));
3338#endif
3339		switch (fp->f_type) {
3340		case DTYPE_VNODE:
3341			type = KF_TYPE_VNODE;
3342			vref(fp->f_vnode);
3343			data = fp->f_vnode;
3344			break;
3345
3346		case DTYPE_SOCKET:
3347			type = KF_TYPE_SOCKET;
3348			data = fp->f_data;
3349			break;
3350
3351		case DTYPE_PIPE:
3352			type = KF_TYPE_PIPE;
3353			data = fp->f_data;
3354			break;
3355
3356		case DTYPE_FIFO:
3357			type = KF_TYPE_FIFO;
3358			vref(fp->f_vnode);
3359			data = fp->f_vnode;
3360			break;
3361
3362		case DTYPE_KQUEUE:
3363			type = KF_TYPE_KQUEUE;
3364			break;
3365
3366		case DTYPE_CRYPTO:
3367			type = KF_TYPE_CRYPTO;
3368			break;
3369
3370		case DTYPE_MQUEUE:
3371			type = KF_TYPE_MQUEUE;
3372			break;
3373
3374		case DTYPE_SHM:
3375			type = KF_TYPE_SHM;
3376			data = fp;
3377			break;
3378
3379		case DTYPE_SEM:
3380			type = KF_TYPE_SEM;
3381			break;
3382
3383		case DTYPE_PTS:
3384			type = KF_TYPE_PTS;
3385			data = fp->f_data;
3386			break;
3387
3388#ifdef PROCDESC
3389		case DTYPE_PROCDESC:
3390			type = KF_TYPE_PROCDESC;
3391			data = fp->f_data;
3392			break;
3393#endif
3394
3395		default:
3396			type = KF_TYPE_UNKNOWN;
3397			break;
3398		}
3399		refcnt = fp->f_count;
3400		fflags = fp->f_flag;
3401		offset = fp->f_offset;
3402
3403		/*
3404		 * Create sysctl entry.
3405		 * It is OK to drop the filedesc lock here as we will
3406		 * re-validate and re-evaluate its properties when
3407		 * the loop continues.
3408		 */
3409		oldidx = req->oldidx;
3410		if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
3411			FILEDESC_SUNLOCK(fdp);
3412		error = export_fd_for_sysctl(data, type, i, fflags, refcnt,
3413		    offset, fd_is_cap, fd_cap_rights, kif, req);
3414		if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
3415			FILEDESC_SLOCK(fdp);
3416		if (error) {
3417			if (error == ENOMEM) {
3418				/*
3419				 * The hack to keep the ABI of sysctl
3420				 * kern.proc.filedesc intact, but not
3421				 * to account a partially copied
3422				 * kinfo_file into the oldidx.
3423				 */
3424				req->oldidx = oldidx;
3425				error = 0;
3426			}
3427			break;
3428		}
3429	}
3430	FILEDESC_SUNLOCK(fdp);
3431fail:
3432	if (fdp != NULL)
3433		fddrop(fdp);
3434	free(kif, M_TEMP);
3435	return (error);
3436}
3437
3438int
3439vntype_to_kinfo(int vtype)
3440{
3441	struct {
3442		int	vtype;
3443		int	kf_vtype;
3444	} vtypes_table[] = {
3445		{ VBAD, KF_VTYPE_VBAD },
3446		{ VBLK, KF_VTYPE_VBLK },
3447		{ VCHR, KF_VTYPE_VCHR },
3448		{ VDIR, KF_VTYPE_VDIR },
3449		{ VFIFO, KF_VTYPE_VFIFO },
3450		{ VLNK, KF_VTYPE_VLNK },
3451		{ VNON, KF_VTYPE_VNON },
3452		{ VREG, KF_VTYPE_VREG },
3453		{ VSOCK, KF_VTYPE_VSOCK }
3454	};
3455#define	NVTYPES	(sizeof(vtypes_table) / sizeof(*vtypes_table))
3456	unsigned int i;
3457
3458	/*
3459	 * Perform vtype translation.
3460	 */
3461	for (i = 0; i < NVTYPES; i++)
3462		if (vtypes_table[i].vtype == vtype)
3463			break;
3464	if (i < NVTYPES)
3465		return (vtypes_table[i].kf_vtype);
3466
3467	return (KF_VTYPE_UNKNOWN);
3468}
3469
3470static int
3471fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
3472{
3473	struct vattr va;
3474	char *fullpath, *freepath;
3475	int error, vfslocked;
3476
3477	if (vp == NULL)
3478		return (1);
3479	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
3480	freepath = NULL;
3481	fullpath = "-";
3482	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
3483	if (error == 0) {
3484		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3485	}
3486	if (freepath != NULL)
3487		free(freepath, M_TEMP);
3488
3489	/*
3490	 * Retrieve vnode attributes.
3491	 */
3492	va.va_fsid = VNOVAL;
3493	va.va_rdev = NODEV;
3494	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
3495	vn_lock(vp, LK_SHARED | LK_RETRY);
3496	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
3497	VOP_UNLOCK(vp, 0);
3498	VFS_UNLOCK_GIANT(vfslocked);
3499	if (error != 0)
3500		return (error);
3501	if (va.va_fsid != VNOVAL)
3502		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
3503	else
3504		kif->kf_un.kf_file.kf_file_fsid =
3505		    vp->v_mount->mnt_stat.f_fsid.val[0];
3506	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
3507	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
3508	kif->kf_un.kf_file.kf_file_size = va.va_size;
3509	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
3510	return (0);
3511}
3512
3513static int
3514fill_socket_info(struct socket *so, struct kinfo_file *kif)
3515{
3516	struct sockaddr *sa;
3517	struct inpcb *inpcb;
3518	struct unpcb *unpcb;
3519	int error;
3520
3521	if (so == NULL)
3522		return (1);
3523	kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
3524	kif->kf_sock_type = so->so_type;
3525	kif->kf_sock_protocol = so->so_proto->pr_protocol;
3526	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
3527	switch(kif->kf_sock_domain) {
3528	case AF_INET:
3529	case AF_INET6:
3530		if (kif->kf_sock_protocol == IPPROTO_TCP) {
3531			if (so->so_pcb != NULL) {
3532				inpcb = (struct inpcb *)(so->so_pcb);
3533				kif->kf_un.kf_sock.kf_sock_inpcb =
3534				    (uintptr_t)inpcb->inp_ppcb;
3535			}
3536		}
3537		break;
3538	case AF_UNIX:
3539		if (so->so_pcb != NULL) {
3540			unpcb = (struct unpcb *)(so->so_pcb);
3541			if (unpcb->unp_conn) {
3542				kif->kf_un.kf_sock.kf_sock_unpconn =
3543				    (uintptr_t)unpcb->unp_conn;
3544				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
3545				    so->so_rcv.sb_state;
3546				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
3547				    so->so_snd.sb_state;
3548			}
3549		}
3550		break;
3551	}
3552	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
3553	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3554		bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3555		free(sa, M_SONAME);
3556	}
3557	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
3558	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3559		bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3560		free(sa, M_SONAME);
3561	}
3562	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
3563	    sizeof(kif->kf_path));
3564	return (0);
3565}
3566
3567static int
3568fill_pts_info(struct tty *tp, struct kinfo_file *kif)
3569{
3570
3571	if (tp == NULL)
3572		return (1);
3573	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
3574	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
3575	return (0);
3576}
3577
3578static int
3579fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
3580{
3581
3582	if (pi == NULL)
3583		return (1);
3584	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
3585	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
3586	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
3587	return (0);
3588}
3589
3590static int
3591fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
3592{
3593
3594	if (pdp == NULL)
3595		return (1);
3596	kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
3597	return (0);
3598}
3599
3600static int
3601fill_shm_info(struct file *fp, struct kinfo_file *kif)
3602{
3603	struct thread *td;
3604	struct stat sb;
3605
3606	td = curthread;
3607	if (fp->f_data == NULL)
3608		return (1);
3609	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3610		return (1);
3611	shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
3612	kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
3613	kif->kf_un.kf_file.kf_file_size = sb.st_size;
3614	return (0);
3615}
3616
3617static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
3618    sysctl_kern_proc_filedesc, "Process filedesc entries");
3619
3620#ifdef DDB
3621/*
3622 * For the purposes of debugging, generate a human-readable string for the
3623 * file type.
3624 */
3625static const char *
3626file_type_to_name(short type)
3627{
3628
3629	switch (type) {
3630	case 0:
3631		return ("zero");
3632	case DTYPE_VNODE:
3633		return ("vnod");
3634	case DTYPE_SOCKET:
3635		return ("sock");
3636	case DTYPE_PIPE:
3637		return ("pipe");
3638	case DTYPE_FIFO:
3639		return ("fifo");
3640	case DTYPE_KQUEUE:
3641		return ("kque");
3642	case DTYPE_CRYPTO:
3643		return ("crpt");
3644	case DTYPE_MQUEUE:
3645		return ("mque");
3646	case DTYPE_SHM:
3647		return ("shm");
3648	case DTYPE_SEM:
3649		return ("ksem");
3650	default:
3651		return ("unkn");
3652	}
3653}
3654
3655/*
3656 * For the purposes of debugging, identify a process (if any, perhaps one of
3657 * many) that references the passed file in its file descriptor array. Return
3658 * NULL if none.
3659 */
3660static struct proc *
3661file_to_first_proc(struct file *fp)
3662{
3663	struct filedesc *fdp;
3664	struct proc *p;
3665	int n;
3666
3667	FOREACH_PROC_IN_SYSTEM(p) {
3668		if (p->p_state == PRS_NEW)
3669			continue;
3670		fdp = p->p_fd;
3671		if (fdp == NULL)
3672			continue;
3673		for (n = 0; n < fdp->fd_nfiles; n++) {
3674			if (fp == fdp->fd_ofiles[n])
3675				return (p);
3676		}
3677	}
3678	return (NULL);
3679}
3680
3681static void
3682db_print_file(struct file *fp, int header)
3683{
3684	struct proc *p;
3685
3686	if (header)
3687		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3688		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3689		    "MCount", "Vnode", "FPID", "FCmd");
3690	p = file_to_first_proc(fp);
3691	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3692	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3693	    0, fp->f_count, 0, fp->f_vnode,
3694	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3695}
3696
3697DB_SHOW_COMMAND(file, db_show_file)
3698{
3699	struct file *fp;
3700
3701	if (!have_addr) {
3702		db_printf("usage: show file <addr>\n");
3703		return;
3704	}
3705	fp = (struct file *)addr;
3706	db_print_file(fp, 1);
3707}
3708
3709DB_SHOW_COMMAND(files, db_show_files)
3710{
3711	struct filedesc *fdp;
3712	struct file *fp;
3713	struct proc *p;
3714	int header;
3715	int n;
3716
3717	header = 1;
3718	FOREACH_PROC_IN_SYSTEM(p) {
3719		if (p->p_state == PRS_NEW)
3720			continue;
3721		if ((fdp = p->p_fd) == NULL)
3722			continue;
3723		for (n = 0; n < fdp->fd_nfiles; ++n) {
3724			if ((fp = fdp->fd_ofiles[n]) == NULL)
3725				continue;
3726			db_print_file(fp, header);
3727			header = 0;
3728		}
3729	}
3730}
3731#endif
3732
3733SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3734    &maxfilesperproc, 0, "Maximum files allowed open per process");
3735
3736SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3737    &maxfiles, 0, "Maximum number of files");
3738
3739SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3740    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3741
3742/* ARGSUSED*/
3743static void
3744filelistinit(void *dummy)
3745{
3746
3747	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
3748	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3749	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
3750	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
3751}
3752SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3753
3754/*-------------------------------------------------------------------*/
3755
/*
 * Read/write method of badfileops: ignores its arguments and always
 * fails with EBADF.
 */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	int error;

	error = EBADF;
	return (error);
}
3763
/*
 * Truncate method of badfileops: ignores its arguments and always fails
 * with EINVAL.
 */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	int error;

	error = EINVAL;
	return (error);
}
3771
3772static int
3773badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
3774    struct thread *td)
3775{
3776
3777	return (EBADF);
3778}
3779
/*
 * Poll method of badfileops: ignores its arguments and always returns 0.
 */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	int result;

	result = 0;
	return (result);
}
3787
/*
 * Kqueue filter method of badfileops: ignores its arguments and always
 * fails with EBADF.
 */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
	int error;

	error = EBADF;
	return (error);
}
3794
/*
 * Stat method of badfileops: ignores its arguments, leaves *sb untouched
 * and always fails with EBADF.
 */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	int error;

	error = EBADF;
	return (error);
}
3802
/*
 * Close method of badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_close(struct file *fp, struct thread *td)
{
	int error;

	error = EBADF;
	return (error);
}
3809
/*
 * Chmod method of badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	int error;

	error = EBADF;
	return (error);
}
3817
/*
 * Chown method of badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	int error;

	error = EBADF;
	return (error);
}
3825
3826struct fileops badfileops = {
3827	.fo_read = badfo_readwrite,
3828	.fo_write = badfo_readwrite,
3829	.fo_truncate = badfo_truncate,
3830	.fo_ioctl = badfo_ioctl,
3831	.fo_poll = badfo_poll,
3832	.fo_kqfilter = badfo_kqfilter,
3833	.fo_stat = badfo_stat,
3834	.fo_close = badfo_close,
3835	.fo_chmod = badfo_chmod,
3836	.fo_chown = badfo_chown,
3837};
3838
/*
 * Chmod method that ignores its arguments and always fails with EINVAL.
 */
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	int error;

	error = EINVAL;
	return (error);
}
3846
/*
 * Chown method that ignores its arguments and always fails with EINVAL.
 */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	int error;

	error = EINVAL;
	return (error);
}
3854
3855/*-------------------------------------------------------------------*/
3856
3857/*
3858 * File Descriptor pseudo-device driver (/dev/fd/).
3859 *
3860 * Opening minor device N dup()s the file (if any) connected to file
3861 * descriptor N belonging to the calling process.  Note that this driver
3862 * consists of only the ``open()'' routine, because all subsequent
3863 * references to this file will be direct to the other driver.
3864 *
3865 * XXX: we could give this one a cloning event handler if necessary.
3866 */
3867
3868/* ARGSUSED */
3869static int
3870fdopen(struct cdev *dev, int mode, int type, struct thread *td)
3871{
3872
3873	/*
3874	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
3875	 * the file descriptor being sought for duplication. The error
3876	 * return ensures that the vnode for this device will be released
3877	 * by vn_open. Open will detect this special error and take the
3878	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
3879	 * will simply report the error.
3880	 */
3881	td->td_dupfd = dev2unit(dev);
3882	return (ENODEV);
3883}
3884
3885static struct cdevsw fildesc_cdevsw = {
3886	.d_version =	D_VERSION,
3887	.d_open =	fdopen,
3888	.d_name =	"FD",
3889};
3890
3891static void
3892fildesc_drvinit(void *unused)
3893{
3894	struct cdev *dev;
3895
3896	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
3897	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
3898	make_dev_alias(dev, "stdin");
3899	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
3900	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
3901	make_dev_alias(dev, "stdout");
3902	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
3903	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
3904	make_dev_alias(dev, "stderr");
3905}
3906
3907SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
3908