kern_descrip.c revision 111119
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39 * $FreeBSD: head/sys/kern/kern_descrip.c 111119 2003-02-19 05:47:46Z imp $
40 */
41
42#include "opt_compat.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/syscallsubr.h>
47#include <sys/sysproto.h>
48#include <sys/conf.h>
49#include <sys/filedesc.h>
50#include <sys/lock.h>
51#include <sys/kernel.h>
52#include <sys/malloc.h>
53#include <sys/mutex.h>
54#include <sys/sysctl.h>
55#include <sys/vnode.h>
56#include <sys/mount.h>
57#include <sys/proc.h>
58#include <sys/namei.h>
59#include <sys/file.h>
60#include <sys/stat.h>
61#include <sys/filio.h>
62#include <sys/fcntl.h>
63#include <sys/unistd.h>
64#include <sys/resourcevar.h>
65#include <sys/event.h>
66#include <sys/sx.h>
67#include <sys/socketvar.h>
68#include <sys/signalvar.h>
69
70#include <machine/limits.h>
71
72#include <vm/vm.h>
73#include <vm/vm_extern.h>
74#include <vm/uma.h>
75
76static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
77static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
78
79uma_zone_t file_zone;
80
static	 d_open_t  fdopen;
/* Number of minors for the fd device — presumably one per mappable fd; confirm against fdopen(). */
#define	NUMFDESC 64

#define	CDEV_MAJOR 22
/* Character device switch for the "FD" device; only open is implemented. */
static struct cdevsw fildesc_cdevsw = {
	/* open */	fdopen,
	/* close */	noclose,
	/* read */	noread,
	/* write */	nowrite,
	/* ioctl */	noioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	nostrategy,
	/* name */	"FD",
	/* maj */	CDEV_MAJOR,
	/* dump */	nodump,
	/* psize */	nopsize,
	/* flags */	0,
};

/* How to treat 'new' parameter when allocating a fd for do_dup(). */
enum dup_type { DUP_VARIABLE, DUP_FIXED };

static int do_dup(struct thread *td, enum dup_type type, int old, int new,
    register_t *retval);

/*
 * Descriptor management.
 */
struct filelist filehead;	/* head of list of open files */
int nfiles;			/* actual number of open files */
extern int cmask;		/* initial fd_cmask for new tables (see fdinit()) */
struct sx filelist_lock;	/* sx to protect filelist */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
115
116/*
117 * System calls on descriptors.
118 */
119#ifndef _SYS_SYSPROTO_H_
120struct getdtablesize_args {
121	int	dummy;
122};
123#endif
124/*
125 * MPSAFE
126 */
127/* ARGSUSED */
128int
129getdtablesize(td, uap)
130	struct thread *td;
131	struct getdtablesize_args *uap;
132{
133	struct proc *p = td->td_proc;
134
135	mtx_lock(&Giant);
136	td->td_retval[0] =
137	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
138	mtx_unlock(&Giant);
139	return (0);
140}
141
142/*
143 * Duplicate a file descriptor to a particular value.
144 *
145 * note: keep in mind that a potential race condition exists when closing
146 * descriptors from a shared descriptor table (via rfork).
147 */
148#ifndef _SYS_SYSPROTO_H_
149struct dup2_args {
150	u_int	from;
151	u_int	to;
152};
153#endif
154/*
155 * MPSAFE
156 */
157/* ARGSUSED */
158int
159dup2(td, uap)
160	struct thread *td;
161	struct dup2_args *uap;
162{
163
164	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
165		    td->td_retval));
166}
167
168/*
169 * Duplicate a file descriptor.
170 */
171#ifndef _SYS_SYSPROTO_H_
172struct dup_args {
173	u_int	fd;
174};
175#endif
176/*
177 * MPSAFE
178 */
179/* ARGSUSED */
180int
181dup(td, uap)
182	struct thread *td;
183	struct dup_args *uap;
184{
185
186	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
187}
188
189/*
190 * The file control system call.
191 */
192#ifndef _SYS_SYSPROTO_H_
193struct fcntl_args {
194	int	fd;
195	int	cmd;
196	long	arg;
197};
198#endif
199/*
200 * MPSAFE
201 */
202/* ARGSUSED */
203int
204fcntl(td, uap)
205	struct thread *td;
206	struct fcntl_args *uap;
207{
208	struct flock fl;
209	intptr_t arg;
210	int error;
211
212	error = 0;
213	switch (uap->cmd) {
214	case F_GETLK:
215	case F_SETLK:
216	case F_SETLKW:
217		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
218		arg = (intptr_t)&fl;
219		break;
220	default:
221		arg = uap->arg;
222		break;
223	}
224	if (error)
225		return (error);
226	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
227	if (error)
228		return (error);
229	if (uap->cmd == F_GETLK)
230		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
231	return (error);
232}
233
/*
 * Back end for fcntl(2).  'arg' is either the raw integer argument or,
 * for the lock commands, a kernel pointer to a struct flock supplied by
 * the caller.  Returns 0 or an errno; values for the caller go in
 * td->td_retval[0].
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp;
	struct proc *p;
	char *pop;
	struct vnode *vp;
	u_int newmin;
	int error, flg, tmp;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	/* Translate fd to a struct file; EBADF if out of range or closed. */
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
	/* Per-descriptor flag byte; UF_EXCLOSE (close-on-exec) lives here. */
	pop = &fdp->fd_ofileflags[fd];

	switch (cmd) {
	case F_DUPFD:
		FILEDESC_UNLOCK(fdp);
		newmin = arg;
		/* Reject a minimum at or above the per-process fd limit. */
		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
		    newmin >= maxfilesperproc) {
			error = EINVAL;
			break;
		}
		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
		break;

	case F_GETFD:
		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		FILEDESC_UNLOCK(fdp);
		break;

	case F_SETFD:
		*pop = (*pop &~ UF_EXCLOSE) |
		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
		FILEDESC_UNLOCK(fdp);
		break;

	case F_GETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/* Convert kernel F* flags back to userland O_* form. */
		td->td_retval[0] = OFLAGS(fp->f_flag);
		FILE_UNLOCK(fp);
		break;

	case F_SETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		fhold_locked(fp);
		/* Only the FCNTLFLAGS subset of f_flag may be changed here. */
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		FILE_UNLOCK(fp);
		/* Push the new non-blocking mode down to the file object. */
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error) {
			fdrop(fp, td);
			break;
		}
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		/* FIOASYNC failed: back out the FIONBIO change made above. */
		FILE_LOCK(fp);
		fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		/* Hold fp across fo_ioctl(), which may sleep. */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
		/* Advisory record locks only apply to vnodes. */
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}

		flp = (struct flock *)arg;
		if (flp->l_whence == SEEK_CUR) {
			/* Guard l_start + f_offset against off_t overflow. */
			if (fp->f_offset < 0 ||
			    (flp->l_start > 0 &&
			     fp->f_offset > OFF_MAX - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}

		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_data;

		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			/* POSIX locks are owned by the process group leader. */
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, F_POSIX);
			break;
		default:
			error = EINVAL;
			break;
		}
		/* Check for race with close */
		FILEDESC_LOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			FILEDESC_UNLOCK(fdp);
			/*
			 * The descriptor was closed or replaced while we
			 * slept in VOP_ADVLOCK(); release any lock we may
			 * have just acquired on the whole range.
			 */
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
		} else
			FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		break;

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			FILEDESC_UNLOCK(fdp);
			error = EINVAL;
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			/* Guard l_start + f_offset against over/underflow. */
			if ((flp->l_start > 0 &&
			    fp->f_offset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     fp->f_offset < OFF_MIN - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}
		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_data;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		/* Check for race with close */
		FILEDESC_LOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			FILEDESC_UNLOCK(fdp);
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
		} else
			FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		break;
	default:
		FILEDESC_UNLOCK(fdp);
		error = EINVAL;
		break;
	}
done2:
	mtx_unlock(&Giant);
	return (error);
}
467
468/*
469 * Common code for dup, dup2, and fcntl(F_DUPFD).
 * The filedesc lock is acquired and released internally; callers must
 * not hold it on entry.
471 */
static int
do_dup(td, type, old, new, retval)
	enum dup_type type;
	int old, new;
	register_t *retval;
	struct thread *td;
{
	struct filedesc *fdp;
	struct proc *p;
	struct file *fp;
	struct file *delfp;
	int error, newfd;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to.
	 */
	if (old < 0 || new < 0 || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
	    new >= maxfilesperproc)
		return (EBADF);
	FILEDESC_LOCK(fdp);
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}
	/* dup2() of a descriptor onto itself is a no-op. */
	if (type == DUP_FIXED && old == new) {
		*retval = new;
		FILEDESC_UNLOCK(fdp);
		return (0);
	}
	fp = fdp->fd_ofiles[old];
	/* Hold fp so it survives the possible sleep in fdalloc() below. */
	fhold(fp);

	/*
	 * Expand the table for the new descriptor if needed.  This may
	 * block and drop and reacquire the filedesc lock.
	 */
	if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) {
		error = fdalloc(td, new, &newfd);
		if (error) {
			FILEDESC_UNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
	}
	if (type == DUP_VARIABLE)
		new = newfd;

	/*
	 * If the old file changed out from under us then treat it as a
	 * bad file descriptor.  Userland should do its own locking to
	 * avoid this case.
	 */
	if (fdp->fd_ofiles[old] != fp) {
		/* Give back the slot fdalloc() may have reserved for us. */
		if (fdp->fd_ofiles[new] == NULL) {
			if (new < fdp->fd_freefile)
				fdp->fd_freefile = new;
			while (fdp->fd_lastfile > 0 &&
			    fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
				fdp->fd_lastfile--;
		}
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (EBADF);
	}
	KASSERT(old != new, ("new fd is same as old"));

	/*
	 * Save info on the descriptor being overwritten.  We have
	 * to do the unmap now, but we cannot close it without
	 * introducing an ownership race for the slot.
	 */
	delfp = fdp->fd_ofiles[new];
	KASSERT(delfp == NULL || type == DUP_FIXED,
	    ("dup() picked an open file"));
#if 0
	if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED))
		(void) munmapfd(td, new);
#endif

	/*
	 * Duplicate the source descriptor, update lastfile
	 */
	fdp->fd_ofiles[new] = fp;
 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	FILEDESC_UNLOCK(fdp);
	*retval = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 */
	if (delfp) {
		mtx_lock(&Giant);
		(void) closef(delfp, td);
		mtx_unlock(&Giant);
	}
	return (0);
}
577
578/*
579 * If sigio is on the list associated with a process or process group,
580 * disable signalling from the device, remove sigio from the list and
581 * free sigio.
582 */
void
funsetown(sigiop)
	struct sigio **sigiop;
{
	struct sigio *sigio;

	SIGIO_LOCK();
	sigio = *sigiop;
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}
	/*
	 * Clear the owner's back-pointer first so no new SIGIO delivery
	 * finds this structure while it is being torn down.
	 */
	*(sigio->sio_myref) = NULL;
	if ((sigio)->sio_pgid < 0) {
		/* Negative sio_pgid: the owner is a process group. */
		struct pgrp *pg = (sigio)->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		/* Positive sio_pgid: the owner is a single process. */
		struct proc *p = (sigio)->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	SIGIO_UNLOCK();
	/* sigio is unlinked and unreachable; safe to free unlocked. */
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
}
613
614/*
615 * Free a list of sigio structures.
616 * We only need to lock the SIGIO_LOCK because we have made ourselves
617 * inaccessable to callers of fsetown and therefore do not need to lock
618 * the proc or pgrp struct for the list manipulation.
619 */
void
funsetownlst(sigiolst)
	struct sigiolst *sigiolst;
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio;

	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;
	p = NULL;
	pg = NULL;

	/*
	 * Every entry of the list should belong
	 * to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
	}

	SIGIO_LOCK();
	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
		/* Detach the owner's back-pointer before unlinking. */
		*(sigio->sio_myref) = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
			PGRP_LOCK(pg);
			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PGRP_UNLOCK(pg);
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
			PROC_LOCK(p);
			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PROC_UNLOCK(p);
		}
		/* Drop the SIGIO lock while freeing; entry is unlinked. */
		SIGIO_UNLOCK();
		crfree(sigio->sio_ucred);
		FREE(sigio, M_SIGIO);
		SIGIO_LOCK();
	}
	SIGIO_UNLOCK();
}
675
676/*
677 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
678 *
679 * After permission checking, add a sigio structure to the sigio list for
680 * the process or process group.
681 */
int
fsetown(pgid, sigiop)
	pid_t pgid;
	struct sigio **sigiop;
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	/* A pgid of 0 simply clears any existing ownership. */
	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	sx_slock(&proctree_lock);
	if (pgid > 0) {
		/* Positive pgid names a single process. */
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		/*
		 * NOTE(review): pfind() apparently returns the proc locked;
		 * p_session is read after the unlock, presumably relying on
		 * the shared proctree_lock held above — confirm intent.
		 */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		/* Negative pgid names a process group (by -pgid). */
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Remove any previous owner before installing the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new sigio under the SIGIO lock. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
	return (ret);
}
785
786/*
787 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
788 */
789pid_t
790fgetown(sigiop)
791	struct sigio **sigiop;
792{
793	pid_t pgid;
794
795	SIGIO_LOCK();
796	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
797	SIGIO_UNLOCK();
798	return (pgid);
799}
800
801/*
802 * Close a file descriptor.
803 */
804#ifndef _SYS_SYSPROTO_H_
805struct close_args {
806        int     fd;
807};
808#endif
809/*
810 * MPSAFE
811 */
812/* ARGSUSED */
int
close(td, uap)
	struct thread *td;
	struct close_args *uap;
{
	struct filedesc *fdp;
	struct file *fp;
	int fd, error;

	fd = uap->fd;
	error = 0;
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	/* Validate the descriptor while holding the table lock. */
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
#if 0
	if (fdp->fd_ofileflags[fd] & UF_MAPPED)
		(void) munmapfd(td, fd);
#endif
	/* Clear the slot; the descriptor is gone from the table now. */
	fdp->fd_ofiles[fd] = NULL;
	fdp->fd_ofileflags[fd] = 0;

	/*
	 * we now hold the fp reference that used to be owned by the descriptor
	 * array.
	 */
	/* Trim fd_lastfile and update the free-slot hint. */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	/* Detach any knotes attached to this descriptor (unlocked). */
	if (fd < fdp->fd_knlistsize) {
		FILEDESC_UNLOCK(fdp);
		knote_fdclose(td, fd);
	} else
		FILEDESC_UNLOCK(fdp);

	/* closef() consumes the reference taken from the table. */
	error = closef(fp, td);
done2:
	mtx_unlock(&Giant);
	return (error);
}
859
860#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
861/*
862 * Return status information about a file descriptor.
863 */
864#ifndef _SYS_SYSPROTO_H_
865struct ofstat_args {
866	int	fd;
867	struct	ostat *sb;
868};
869#endif
870/*
871 * MPSAFE
872 */
873/* ARGSUSED */
874int
875ofstat(td, uap)
876	struct thread *td;
877	struct ofstat_args *uap;
878{
879	struct file *fp;
880	struct stat ub;
881	struct ostat oub;
882	int error;
883
884	mtx_lock(&Giant);
885	if ((error = fget(td, uap->fd, &fp)) != 0)
886		goto done2;
887	error = fo_stat(fp, &ub, td->td_ucred, td);
888	if (error == 0) {
889		cvtstat(&ub, &oub);
890		error = copyout(&oub, uap->sb, sizeof(oub));
891	}
892	fdrop(fp, td);
893done2:
894	mtx_unlock(&Giant);
895	return (error);
896}
897#endif /* COMPAT_43 || COMPAT_SUNOS */
898
899/*
900 * Return status information about a file descriptor.
901 */
902#ifndef _SYS_SYSPROTO_H_
903struct fstat_args {
904	int	fd;
905	struct	stat *sb;
906};
907#endif
908/*
909 * MPSAFE
910 */
911/* ARGSUSED */
912int
913fstat(td, uap)
914	struct thread *td;
915	struct fstat_args *uap;
916{
917	struct file *fp;
918	struct stat ub;
919	int error;
920
921	mtx_lock(&Giant);
922	if ((error = fget(td, uap->fd, &fp)) != 0)
923		goto done2;
924	error = fo_stat(fp, &ub, td->td_ucred, td);
925	if (error == 0)
926		error = copyout(&ub, uap->sb, sizeof(ub));
927	fdrop(fp, td);
928done2:
929	mtx_unlock(&Giant);
930	return (error);
931}
932
933/*
934 * Return status information about a file descriptor.
935 */
936#ifndef _SYS_SYSPROTO_H_
937struct nfstat_args {
938	int	fd;
939	struct	nstat *sb;
940};
941#endif
942/*
943 * MPSAFE
944 */
945/* ARGSUSED */
946int
947nfstat(td, uap)
948	struct thread *td;
949	struct nfstat_args *uap;
950{
951	struct file *fp;
952	struct stat ub;
953	struct nstat nub;
954	int error;
955
956	mtx_lock(&Giant);
957	if ((error = fget(td, uap->fd, &fp)) != 0)
958		goto done2;
959	error = fo_stat(fp, &ub, td->td_ucred, td);
960	if (error == 0) {
961		cvtnstat(&ub, &nub);
962		error = copyout(&nub, uap->sb, sizeof(nub));
963	}
964	fdrop(fp, td);
965done2:
966	mtx_unlock(&Giant);
967	return (error);
968}
969
970/*
971 * Return pathconf information about a file descriptor.
972 */
973#ifndef _SYS_SYSPROTO_H_
974struct fpathconf_args {
975	int	fd;
976	int	name;
977};
978#endif
979/*
980 * MPSAFE
981 */
982/* ARGSUSED */
983int
984fpathconf(td, uap)
985	struct thread *td;
986	struct fpathconf_args *uap;
987{
988	struct file *fp;
989	struct vnode *vp;
990	int error;
991
992	if ((error = fget(td, uap->fd, &fp)) != 0)
993		return (error);
994
995	/* If asynchronous I/O is available, it works for all descriptors. */
996	if (uap->name == _PC_ASYNC_IO) {
997		td->td_retval[0] = async_io_version;
998		goto out;
999	}
1000	switch (fp->f_type) {
1001	case DTYPE_PIPE:
1002	case DTYPE_SOCKET:
1003		if (uap->name != _PC_PIPE_BUF) {
1004			error = EINVAL;
1005		} else {
1006			td->td_retval[0] = PIPE_BUF;
1007			error = 0;
1008		}
1009		break;
1010	case DTYPE_FIFO:
1011	case DTYPE_VNODE:
1012		vp = fp->f_data;
1013		mtx_lock(&Giant);
1014		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1015		mtx_unlock(&Giant);
1016		break;
1017	default:
1018		error = EOPNOTSUPP;
1019		break;
1020	}
1021out:
1022	fdrop(fp, td);
1023	return (error);
1024}
1025
1026/*
1027 * Allocate a file descriptor for the process.
1028 */
1029static int fdexpand;
1030SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
1031
int
fdalloc(td, want, result)
	struct thread *td;
	int want;
	int *result;
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	int lim, last, nfiles;
	struct file **newofile, **oldofile;
	char *newofileflags;

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	for (;;) {
		last = min(fdp->fd_nfiles, lim);
		i = max(want, fdp->fd_freefile);
		for (; i < last; i++) {
			if (fdp->fd_ofiles[i] == NULL) {
				/* Found a free slot: claim it and record it. */
				fdp->fd_ofileflags[i] = 0;
				if (i > fdp->fd_lastfile)
					fdp->fd_lastfile = i;
				if (want <= fdp->fd_freefile)
					fdp->fd_freefile = i;
				*result = i;
				return (0);
			}
		}

		/*
		 * No space in current array.  Expand?
		 */
		if (i >= lim)
			return (EMFILE);
		/* Grow to at least NDEXTENT, doubling until 'want' fits. */
		if (fdp->fd_nfiles < NDEXTENT)
			nfiles = NDEXTENT;
		else
			nfiles = 2 * fdp->fd_nfiles;
		while (nfiles < want)
			nfiles <<= 1;
		FILEDESC_UNLOCK(fdp);
		/*
		 * XXX malloc() calls uma_large_malloc() for sizes larger
		 * than KMEM_ZMAX bytes. uma_large_malloc() requires Giant.
		 */
		mtx_lock(&Giant);
		newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);
		mtx_unlock(&Giant);

		/*
		 * Deal with file-table extend race that might have
		 * occurred while filedesc was unlocked.
		 */
		FILEDESC_LOCK(fdp);
		if (fdp->fd_nfiles >= nfiles) {
			/* Someone else grew the table; discard ours. */
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(newofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
			continue;
		}
		/* The flag bytes live immediately after the pointer array. */
		newofileflags = (char *) &newofile[nfiles];
		/*
		 * Copy the existing ofile and ofileflags arrays
		 * and zero the new portion of each array.
		 */
		i = fdp->fd_nfiles * sizeof(struct file *);
		bcopy(fdp->fd_ofiles, newofile,	i);
		bzero((char *)newofile + i,
		    nfiles * sizeof(struct file *) - i);
		i = fdp->fd_nfiles * sizeof(char);
		bcopy(fdp->fd_ofileflags, newofileflags, i);
		bzero(newofileflags + i, nfiles * sizeof(char) - i);
		/*
		 * The initial NDFILE-sized table is embedded in the
		 * filedesc0 (see fdinit()); only free a malloc'd one.
		 */
		if (fdp->fd_nfiles > NDFILE)
			oldofile = fdp->fd_ofiles;
		else
			oldofile = NULL;
		fdp->fd_ofiles = newofile;
		fdp->fd_ofileflags = newofileflags;
		fdp->fd_nfiles = nfiles;
		fdexpand++;
		if (oldofile != NULL) {
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(oldofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
		}
	}
	/* NOTREACHED: the loop above only exits via return. */
	return (0);
}
1133
1134/*
1135 * Check to see whether n user file descriptors
1136 * are available to the process p.
1137 */
1138int
1139fdavail(td, n)
1140	struct thread *td;
1141	int n;
1142{
1143	struct proc *p = td->td_proc;
1144	struct filedesc *fdp = td->td_proc->p_fd;
1145	struct file **fpp;
1146	int i, lim, last;
1147
1148	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1149
1150	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
1151	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1152		return (1);
1153	last = min(fdp->fd_nfiles, lim);
1154	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
1155	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
1156		if (*fpp == NULL && --n <= 0)
1157			return (1);
1158	}
1159	return (0);
1160}
1161
1162/*
1163 * Create a new open file structure and allocate
1164 * a file decriptor for the process that refers to it.
1165 */
int
falloc(td, resultfp, resultfd)
	struct thread *td;
	struct file **resultfp;
	int *resultfd;
{
	struct proc *p = td->td_proc;
	struct file *fp, *fq;
	int error, i;

	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
	sx_xlock(&filelist_lock);
	/* Enforce the system-wide open file limit. */
	if (nfiles >= maxfiles) {
		sx_xunlock(&filelist_lock);
		uma_zfree(file_zone, fp);
		tablefull("file");
		return (ENFILE);
	}
	nfiles++;

	/*
	 * If the process has file descriptor zero open, add the new file
	 * descriptor to the list of open files at that point, otherwise
	 * put it at the front of the list of open files.
	 */
	fp->f_mtxp = mtx_pool_alloc();
	fp->f_gcflag = 0;
	fp->f_count = 1;	/* the reference returned to the caller */
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;	/* caller installs real fileops later */
	fp->f_seqcount = 1;
	FILEDESC_LOCK(p->p_fd);
	if ((fq = p->p_fd->fd_ofiles[0])) {
		LIST_INSERT_AFTER(fq, fp, f_list);
	} else {
		LIST_INSERT_HEAD(&filehead, fp, f_list);
	}
	sx_xunlock(&filelist_lock);
	if ((error = fdalloc(td, 0, &i))) {
		FILEDESC_UNLOCK(p->p_fd);
		fdrop(fp, td);
		return (error);
	}
	p->p_fd->fd_ofiles[i] = fp;
	FILEDESC_UNLOCK(p->p_fd);
	if (resultfp)
		*resultfp = fp;
	if (resultfd)
		*resultfd = i;
	return (0);
}
1217
1218/*
1219 * Free a file descriptor.
1220 */
void
ffree(fp)
	struct file *fp;
{

	/* Only legal once the last reference has been dropped. */
	KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!"));
	sx_xlock(&filelist_lock);
	LIST_REMOVE(fp, f_list);
	nfiles--;
	sx_xunlock(&filelist_lock);
	crfree(fp->f_cred);
	uma_zfree(file_zone, fp);
}
1234
1235/*
1236 * Build a new filedesc structure from another.
1237 * Copy the current, root, and jail root vnode references.
1238 */
struct filedesc *
fdinit(fdp)
	struct filedesc *fdp;
{
	struct filedesc0 *newfdp;

	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
	    M_FILEDESC, M_WAITOK | M_ZERO);
	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
	/* Reference the template's current, root and jail directories. */
	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
	if (newfdp->fd_fd.fd_cdir)
		VREF(newfdp->fd_fd.fd_cdir);
	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
	if (newfdp->fd_fd.fd_rdir)
		VREF(newfdp->fd_fd.fd_rdir);
	newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
	if (newfdp->fd_fd.fd_jdir)
		VREF(newfdp->fd_fd.fd_jdir);

	/* Create the file descriptor table. */
	newfdp->fd_fd.fd_refcnt = 1;
	newfdp->fd_fd.fd_cmask = cmask;
	/* Start with the embedded NDFILE-entry arrays; fdalloc() grows them. */
	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
	newfdp->fd_fd.fd_nfiles = NDFILE;
	newfdp->fd_fd.fd_knlistsize = -1;	/* no knotes yet */
	return (&newfdp->fd_fd);
}
1267
1268/*
1269 * Share a filedesc structure.
1270 */
struct filedesc *
fdshare(fdp)
	struct filedesc *fdp;
{
	/* Sharing is just another reference on the same table. */
	FILEDESC_LOCK(fdp);
	fdp->fd_refcnt++;
	FILEDESC_UNLOCK(fdp);
	return (fdp);
}
1280
1281/*
1282 * Copy a filedesc structure.
1283 * A NULL pointer in returns a NULL reference, this is to ease callers,
1284 * not catch errors.
1285 */
struct filedesc *
fdcopy(fdp)
	struct filedesc *fdp;
{
	struct filedesc *newfdp;
	struct file **fpp;
	int i, j;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return (NULL);

	/* fdp must be locked on entry; it is also returned locked. */
	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * Drop the lock across the sleeping allocation, then snapshot
	 * the whole source structure into the copy with one bcopy.
	 */
	FILEDESC_UNLOCK(fdp);
	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
	    M_FILEDESC, M_WAITOK);
	FILEDESC_LOCK(fdp);
	bcopy(fdp, newfdp, sizeof(struct filedesc));
	FILEDESC_UNLOCK(fdp);
	/* The bcopy above duplicated the mutex too; re-create it fresh. */
	bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx));
	mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
	/* Take our own references on the inherited directory vnodes. */
	if (newfdp->fd_cdir)
		VREF(newfdp->fd_cdir);
	if (newfdp->fd_rdir)
		VREF(newfdp->fd_rdir);
	if (newfdp->fd_jdir)
		VREF(newfdp->fd_jdir);
	newfdp->fd_refcnt = 1;

	/*
	 * If the number of open files fits in the internal arrays
	 * of the open file structure, use them, otherwise allocate
	 * additional memory for the number of descriptors currently
	 * in use.
	 */
	FILEDESC_LOCK(fdp);
	newfdp->fd_lastfile = fdp->fd_lastfile;
	newfdp->fd_nfiles = fdp->fd_nfiles;
	if (newfdp->fd_lastfile < NDFILE) {
		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
		newfdp->fd_ofileflags =
		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
		i = NDFILE;
	} else {
		/*
		 * Compute the smallest multiple of NDEXTENT needed
		 * for the file descriptors currently in use,
		 * allowing the table to shrink.
		 */
retry:
		i = newfdp->fd_nfiles;
		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
			i /= 2;
		/* Unlock again: M_WAITOK allocation may sleep. */
		FILEDESC_UNLOCK(fdp);
		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
		    M_FILEDESC, M_WAITOK);
		FILEDESC_LOCK(fdp);
		/*
		 * The source table may have changed while unlocked;
		 * re-read the sizes and recompute the target size.
		 */
		newfdp->fd_lastfile = fdp->fd_lastfile;
		newfdp->fd_nfiles = fdp->fd_nfiles;
		j = newfdp->fd_nfiles;
		while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2)
			j /= 2;
		if (i != j) {
			/*
			 * The size of the original table has changed.
			 * Go over once again.
			 */
			FILEDESC_UNLOCK(fdp);
			FREE(newfdp->fd_ofiles, M_FILEDESC);
			FILEDESC_LOCK(fdp);
			newfdp->fd_lastfile = fdp->fd_lastfile;
			newfdp->fd_nfiles = fdp->fd_nfiles;
			goto retry;
		}
		/* The flag bytes live directly after the i file pointers. */
		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
	}
	newfdp->fd_nfiles = i;
	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));

	/*
	 * kq descriptors cannot be copied.
	 */
	if (newfdp->fd_knlistsize != -1) {
		/* Walk top-down so fd_lastfile can be trimmed as we go. */
		fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile];
		for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) {
			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) {
				*fpp = NULL;
				if (i < newfdp->fd_freefile)
					newfdp->fd_freefile = i;
			}
			if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0)
				newfdp->fd_lastfile--;
		}
		newfdp->fd_knlist = NULL;
		newfdp->fd_knlistsize = -1;
		newfdp->fd_knhash = NULL;
		newfdp->fd_knhashmask = 0;
	}

	/* Each copied descriptor slot needs its own file reference. */
	fpp = newfdp->fd_ofiles;
	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) {
		if (*fpp != NULL)
			fhold(*fpp);
	}
	return (newfdp);
}
1394
/*
 * A mutex to protect the association between a proc and filedesc.
 * It is held while p_fd is cleared in fdfree() and while p_fd of
 * another process is read (see sysctl_kern_file()).
 */
struct mtx	fdesc_mtx;
MTX_SYSINIT(fdesc, &fdesc_mtx, "fdesc", MTX_DEF);
1398
1399/*
1400 * Release a filedesc structure.
1401 */
void
fdfree(td)
	struct thread *td;
{
	struct filedesc *fdp;
	struct file **fpp;
	int i;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

	/* Break the proc -> filedesc association under fdesc_mtx. */
	mtx_lock(&fdesc_mtx);
	td->td_proc->p_fd	= NULL;
	mtx_unlock(&fdesc_mtx);

	FILEDESC_LOCK(fdp);
	if (--fdp->fd_refcnt > 0) {
		FILEDESC_UNLOCK(fdp);
		return;
	}

	/*
	 * We are the last reference to the structure, so we can
	 * safely assume it will not change out from under us.
	 */
	FILEDESC_UNLOCK(fdp);
	/* Close every open descriptor from 0 through fd_lastfile. */
	fpp = fdp->fd_ofiles;
	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
		if (*fpp)
			(void) closef(*fpp, td);
	}
	/* The table was malloc'd only if it outgrew the embedded one. */
	if (fdp->fd_nfiles > NDFILE)
		FREE(fdp->fd_ofiles, M_FILEDESC);
	/* Release directory vnode and kqueue-related resources. */
	if (fdp->fd_cdir)
		vrele(fdp->fd_cdir);
	if (fdp->fd_rdir)
		vrele(fdp->fd_rdir);
	if (fdp->fd_jdir)
		vrele(fdp->fd_jdir);
	if (fdp->fd_knlist)
		FREE(fdp->fd_knlist, M_KQUEUE);
	if (fdp->fd_knhash)
		FREE(fdp->fd_knhash, M_KQUEUE);
	mtx_destroy(&fdp->fd_mtx);
	FREE(fdp, M_FILEDESC);
}
1450
1451/*
1452 * For setugid programs, we don't want to people to use that setugidness
1453 * to generate error messages which write to a file which otherwise would
1454 * otherwise be off-limits to the process.  We check for filesystems where
1455 * the vnode can change out from under us after execve (like [lin]procfs).
1456 *
1457 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1458 * sufficient.  We also don't for check setugidness since we know we are.
1459 */
1460static int
1461is_unsafe(struct file *fp)
1462{
1463	if (fp->f_type == DTYPE_VNODE) {
1464		struct vnode *vp = fp->f_data;
1465
1466		if ((vp->v_vflag & VV_PROCDEP) != 0)
1467			return (1);
1468	}
1469	return (0);
1470}
1471
1472/*
1473 * Make this setguid thing safe, if at all possible.
1474 */
void
setugidsafety(td)
	struct thread *td;
{
	struct filedesc *fdp;
	int i;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

	/*
	 * Note: fdp->fd_ofiles may be reallocated out from under us while
	 * we are blocked in a close.  Be careful!
	 */
	FILEDESC_LOCK(fdp);
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		/* Only descriptors 0, 1 and 2 are examined. */
		if (i > 2)
			break;
		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
			struct file *fp;

#if 0
			if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0)
				(void) munmapfd(td, i);
#endif
			/* Detach knotes; knote_fdclose() needs the lock dropped. */
			if (i < fdp->fd_knlistsize) {
				FILEDESC_UNLOCK(fdp);
				knote_fdclose(td, i);
				FILEDESC_LOCK(fdp);
			}
			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			fp = fdp->fd_ofiles[i];
			fdp->fd_ofiles[i] = NULL;
			fdp->fd_ofileflags[i] = 0;
			if (i < fdp->fd_freefile)
				fdp->fd_freefile = i;
			/* closef() may sleep, so drop the filedesc lock. */
			FILEDESC_UNLOCK(fdp);
			(void) closef(fp, td);
			FILEDESC_LOCK(fdp);
		}
	}
	/* Re-derive the highest-open-fd hint after the closes. */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	FILEDESC_UNLOCK(fdp);
}
1525
1526/*
1527 * Close any files on exec?
1528 */
void
fdcloseexec(td)
	struct thread *td;
{
	struct filedesc *fdp;
	int i;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

	FILEDESC_LOCK(fdp);

	/*
	 * We cannot cache fd_ofiles or fd_ofileflags since operations
	 * may block and rip them out from under us.
	 */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		/* Close only descriptors marked close-on-exec. */
		if (fdp->fd_ofiles[i] != NULL &&
		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
			struct file *fp;

#if 0
			if (fdp->fd_ofileflags[i] & UF_MAPPED)
				(void) munmapfd(td, i);
#endif
			/* Detach knotes; knote_fdclose() needs the lock dropped. */
			if (i < fdp->fd_knlistsize) {
				FILEDESC_UNLOCK(fdp);
				knote_fdclose(td, i);
				FILEDESC_LOCK(fdp);
			}
			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			fp = fdp->fd_ofiles[i];
			fdp->fd_ofiles[i] = NULL;
			fdp->fd_ofileflags[i] = 0;
			if (i < fdp->fd_freefile)
				fdp->fd_freefile = i;
			/* closef() may sleep; drop the filedesc lock around it. */
			FILEDESC_UNLOCK(fdp);
			(void) closef(fp, td);
			FILEDESC_LOCK(fdp);
		}
	}
	/* Recompute the highest-open-fd hint after the closes. */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	FILEDESC_UNLOCK(fdp);
}
1579
1580/*
1581 * It is unsafe for set[ug]id processes to be started with file
1582 * descriptors 0..2 closed, as these descriptors are given implicit
1583 * significance in the Standard C library.  fdcheckstd() will create a
1584 * descriptor referencing /dev/null for each of stdin, stdout, and
1585 * stderr that is not already open.
1586 */
int
fdcheckstd(td)
	struct thread *td;
{
	struct nameidata nd;
	struct filedesc *fdp;
	struct file *fp;
	register_t retval;
	int fd, i, error, flags, devnull;

	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return (0);
	devnull = -1;	/* fd of the first /dev/null opened, if any */
	error = 0;
	for (i = 0; i < 3; i++) {
		if (fdp->fd_ofiles[i] != NULL)
			continue;
		if (devnull < 0) {
			/*
			 * First free slot: open /dev/null.  falloc()
			 * hands out the lowest free descriptor, which
			 * must be i since 0..i-1 are known to be open.
			 */
			error = falloc(td, &fp, &fd);
			if (error != 0)
				break;
			KASSERT(fd == i, ("oof, we didn't get our fd"));
			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
			    td);
			flags = FREAD | FWRITE;
			error = vn_open(&nd, &flags, 0);
			if (error != 0) {
				/* Undo falloc(): clear the slot, drop the ref. */
				FILEDESC_LOCK(fdp);
				fdp->fd_ofiles[fd] = NULL;
				FILEDESC_UNLOCK(fdp);
				fdrop(fp, td);
				break;
			}
			NDFREE(&nd, NDF_ONLY_PNBUF);
			/* Finish turning fp into a vnode-backed file. */
			fp->f_data = nd.ni_vp;
			fp->f_flag = flags;
			fp->f_ops = &vnops;
			fp->f_type = DTYPE_VNODE;
			VOP_UNLOCK(nd.ni_vp, 0, td);
			devnull = fd;
		} else {
			/* Subsequent holes just dup the /dev/null fd. */
			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
			if (error != 0)
				break;
		}
	}
	return (error);
}
1636
1637/*
1638 * Internal form of close.
1639 * Decrement reference count on file structure.
1640 * Note: td may be NULL when closing a file
1641 * that was being passed in a message.
1642 */
int
closef(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;

	if (fp == NULL)
		return (0);
	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if (td != NULL && fp->f_type == DTYPE_VNODE) {
		struct proc *p = td->td_proc;
		int pflagcopy;
		/* Consult the leader's flags when in a leader/peer group. */
		if (p->p_leader != p ||
		    p->p_peers != NULL) {
			PROC_LOCK(p->p_leader);
			pflagcopy = p->p_leader->p_flag;
			PROC_UNLOCK(p->p_leader);
		} else
			pflagcopy = p->p_flag;
		if ((pflagcopy & P_ADVLOCK) != 0) {
			/* Release all POSIX locks on the whole file. */
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			vp = fp->f_data;
			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
					   F_UNLCK, &lf, F_POSIX);
		}
	}
	return (fdrop(fp, td));
}
1683
1684/*
1685 * Drop reference on struct file passed in, may call closef if the
1686 * reference hits zero.
1687 */
int
fdrop(fp, td)
	struct file *fp;
	struct thread *td;
{

	/* fdrop_locked() consumes (and releases) the file lock taken here. */
	FILE_LOCK(fp);
	return (fdrop_locked(fp, td));
}
1697
1698/*
1699 * Extract the file pointer associated with the specified descriptor for
1700 * the current user process.
1701 *
1702 * If the descriptor doesn't exist, EBADF is returned.
1703 *
1704 * If the descriptor exists but doesn't match 'flags' then
1705 * return EBADF for read attempts and EINVAL for write attempts.
1706 *
1707 * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
1708 * It should be droped with fdrop().
1709 * If it is not set, then the refcount will not be bumped however the
1710 * thread's filedesc struct will be returned locked (for fgetsock).
1711 *
1712 * If an error occured the non-zero error is returned and *fpp is set to NULL.
1713 * Otherwise *fpp is set and zero is returned.
1714 */
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
{
	struct filedesc *fdp;
	struct file *fp;

	*fpp = NULL;
	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
		return (EBADF);
	FILEDESC_LOCK(fdp);
	/* Descriptors using badfileops are treated as closed. */
	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}

	/*
	 * Note: FREAD failures returns EBADF to maintain backwards
	 * compatibility with what routines returned before.
	 *
	 * Only one flag, or 0, may be specified.
	 */
	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}
	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
		FILEDESC_UNLOCK(fdp);
		return (EINVAL);
	}
	/*
	 * hold != 0: bump the file refcount and unlock.  Otherwise the
	 * filedesc is returned locked to the caller (see function comment).
	 */
	if (hold) {
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
	}
	*fpp = fp;
	return (0);
}
1751
int
fget(struct thread *td, int fd, struct file **fpp)
{
	int error;

	/* No access-mode restriction; take a held reference on the file. */
	error = _fget(td, fd, fpp, 0, 1);
	return (error);
}
1758
1759int
1760fget_read(struct thread *td, int fd, struct file **fpp)
1761{
1762
1763	return(_fget(td, fd, fpp, FREAD, 1));
1764}
1765
1766int
1767fget_write(struct thread *td, int fd, struct file **fpp)
1768{
1769
1770	return(_fget(td, fd, fpp, FWRITE, 1));
1771}
1772
1773/*
1774 * Like fget() but loads the underlying vnode, or returns an error if
1775 * the descriptor does not represent a vnode.  Note that pipes use vnodes
1776 * but never have VM objects (so VOP_GETVOBJECT() calls will return an
1777 * error).  The returned vnode will be vref()d.
1778 */
1779static __inline int
1780_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
1781{
1782	struct file *fp;
1783	int error;
1784
1785	*vpp = NULL;
1786	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1787		return (error);
1788	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
1789		error = EINVAL;
1790	} else {
1791		*vpp = fp->f_data;
1792		vref(*vpp);
1793	}
1794	FILEDESC_UNLOCK(td->td_proc->p_fd);
1795	return (error);
1796}
1797
int
fgetvp(struct thread *td, int fd, struct vnode **vpp)
{
	int error;

	/* Any access mode will do. */
	error = _fgetvp(td, fd, vpp, 0);
	return (error);
}
1804
1805int
1806fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
1807{
1808
1809	return (_fgetvp(td, fd, vpp, FREAD));
1810}
1811
1812int
1813fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
1814{
1815
1816	return (_fgetvp(td, fd, vpp, FWRITE));
1817}
1818
1819/*
1820 * Like fget() but loads the underlying socket, or returns an error if
1821 * the descriptor does not represent a socket.
1822 *
1823 * We bump the ref count on the returned socket.  XXX Also obtain the SX
1824 * lock in the future.
1825 */
int
fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
{
	struct file *fp;
	int error;

	*spp = NULL;
	if (fflagp != NULL)
		*fflagp = 0;
	/* hold == 0: on success the filedesc is returned locked. */
	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		error = ENOTSOCK;
	} else {
		/* Return the socket with its own reference; see fputsock(). */
		*spp = fp->f_data;
		if (fflagp)
			*fflagp = fp->f_flag;
		soref(*spp);
	}
	/* Release the filedesc lock _fget() left held. */
	FILEDESC_UNLOCK(td->td_proc->p_fd);
	return (error);
}
1848
1849/*
1850 * Drop the reference count on the the socket and XXX release the SX lock in
1851 * the future.  The last reference closes the socket.
1852 */
void
fputsock(struct socket *so)
{

	/* Releases the reference taken by fgetsock(). */
	sorele(so);
}
1859
1860/*
1861 * Drop reference on struct file passed in, may call closef if the
1862 * reference hits zero.
1863 * Expects struct file locked, and will unlock it.
1864 */
int
fdrop_locked(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct flock lf;
	struct vnode *vp;
	int error;

	FILE_LOCK_ASSERT(fp, MA_OWNED);

	if (--fp->f_count > 0) {
		FILE_UNLOCK(fp);
		return (0);
	}
	/* Last reference: tear the file down.  Giant covers the VFS calls. */
	mtx_lock(&Giant);
	if (fp->f_count < 0)
		panic("fdrop: count < 0");
	/* Release any flock()-style lock recorded via FHASLOCK. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		vp = fp->f_data;
		FILE_UNLOCK(fp);
		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
	} else
		FILE_UNLOCK(fp);
	/* badfileops has no useful close routine. */
	if (fp->f_ops != &badfileops)
		error = fo_close(fp, td);
	else
		error = 0;
	ffree(fp);
	mtx_unlock(&Giant);
	return (error);
}
1901
1902/*
1903 * Apply an advisory lock on a file descriptor.
1904 *
1905 * Just attempt to get a record lock of the requested type on
1906 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
1907 */
1908#ifndef _SYS_SYSPROTO_H_
1909struct flock_args {
1910	int	fd;
1911	int	how;
1912};
1913#endif
1914/*
1915 * MPSAFE
1916 */
1917/* ARGSUSED */
int
flock(td, uap)
	struct thread *td;
	struct flock_args *uap;
{
	struct file *fp;
	struct vnode *vp;
	struct flock lf;
	int error;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* flock(2) works only on vnode-backed descriptors. */
	if (fp->f_type != DTYPE_VNODE) {
		fdrop(fp, td);
		return (EOPNOTSUPP);
	}

	mtx_lock(&Giant);
	vp = fp->f_data;
	/* Lock the whole file: start 0, length 0, from SEEK_SET. */
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (uap->how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		FILE_LOCK(fp);
		fp->f_flag &= ~FHASLOCK;
		FILE_UNLOCK(fp);
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
		goto done2;
	}
	if (uap->how & LOCK_EX)
		lf.l_type = F_WRLCK;
	else if (uap->how & LOCK_SH)
		lf.l_type = F_RDLCK;
	else {
		error = EBADF;
		goto done2;
	}
	/* Record the lock on the file so fdrop_locked() can undo it. */
	FILE_LOCK(fp);
	fp->f_flag |= FHASLOCK;
	FILE_UNLOCK(fp);
	/* LOCK_NB selects a non-blocking attempt. */
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
	fdrop(fp, td);
	mtx_unlock(&Giant);
	return (error);
}
1966
1967/*
1968 * File Descriptor pseudo-device driver (/dev/fd/).
1969 *
1970 * Opening minor device N dup()s the file (if any) connected to file
1971 * descriptor N belonging to the calling process.  Note that this driver
1972 * consists of only the ``open()'' routine, because all subsequent
1973 * references to this file will be direct to the other driver.
1974 */
1975/* ARGSUSED */
static int
fdopen(dev, mode, type, td)
	dev_t dev;
	int mode, type;
	struct thread *td;
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * the file descriptor being sought for duplication. The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open. Open will detect this special error and take the
	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);	/* the N of /dev/fd/N */
	return (ENODEV);
}
1994
1995/*
1996 * Duplicate the specified descriptor to a free descriptor.
1997 */
int
dupfdopen(td, fdp, indx, dfd, mode, error)
	struct thread *td;
	struct filedesc *fdp;
	int indx, dfd;
	int mode;
	int error;
{
	struct file *wfp;
	struct file *fp;

	/*
	 * If the to-be-dup'd fd number is greater than the allowed number
	 * of file descriptors, or the fd to be dup'd has already been
	 * closed, then reject.
	 */
	FILEDESC_LOCK(fdp);
	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup (dfd) to file descriptor
	 * (indx) and return.
	 *
	 * For ENXIO steal away the file structure from (dfd) and
	 * store it in (indx).  (dfd) is effectively closed by
	 * this operation.
	 *
	 * Any other error code is just returned.
	 */
	switch (error) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		FILE_LOCK(wfp);
		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
			FILE_UNLOCK(wfp);
			FILEDESC_UNLOCK(fdp);
			return (EACCES);
		}
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		/* Install wfp at indx, taking an extra reference on it. */
		fdp->fd_ofiles[indx] = wfp;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fhold_locked(wfp);
		FILE_UNLOCK(wfp);
		if (indx > fdp->fd_lastfile)
			fdp->fd_lastfile = indx;
		/* fdrop_locked() below expects fp locked. */
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/*
		 * We now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	case ENXIO:
		/*
		 * Steal away the file pointer from dfd and stuff it into indx.
		 */
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
		fdp->fd_ofiles[dfd] = NULL;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fdp->fd_ofileflags[dfd] = 0;

		/*
		 * Complete the clean up of the filedesc structure by
		 * recomputing the various hints.
		 */
		if (indx > fdp->fd_lastfile) {
			fdp->fd_lastfile = indx;
		} else {
			while (fdp->fd_lastfile > 0 &&
			   fdp->fd_ofiles[fdp->fd_lastfile] == NULL) {
				fdp->fd_lastfile--;
			}
			if (dfd < fdp->fd_freefile)
				fdp->fd_freefile = dfd;
		}
		/* fdrop_locked() below expects fp locked. */
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);

		/*
		 * we now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	default:
		FILEDESC_UNLOCK(fdp);
		return (error);
	}
	/* NOTREACHED */
}
2113
2114/*
2115 * Get file structures.
2116 */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n;

	sysctl_wire_old_buffer(req, 0);
	if (req->oldptr == NULL) {
		/* Size probe: report an estimate of the space needed. */
		n = 16;		/* A slight overestimate. */
		sx_slock(&filelist_lock);
		LIST_FOREACH(fp, &filehead, f_list) {
			/*
			 * We should grab the lock, but this is an
			 * estimate, so does it really matter?
			 */
			/* mtx_lock(fp->f_mtxp); */
			n += fp->f_count;
			/* mtx_unlock(f->f_mtxp); */
		}
		sx_sunlock(&filelist_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	error = 0;
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		PROC_LOCK(p);
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		PROC_UNLOCK(p);
		/* fdesc_mtx keeps p_fd from being torn down while we look. */
		mtx_lock(&fdesc_mtx);
		if ((fdp = p->p_fd) == NULL) {
			mtx_unlock(&fdesc_mtx);
			continue;
		}
		FILEDESC_LOCK(fdp);
		/* Emit one xfile record per open descriptor. */
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n]) == NULL)
				continue;
			xf.xf_fd = n;
			xf.xf_file = fp;
			xf.xf_data = fp->f_data;
			xf.xf_type = fp->f_type;
			xf.xf_count = fp->f_count;
			xf.xf_msgcount = fp->f_msgcount;
			xf.xf_offset = fp->f_offset;
			xf.xf_flag = fp->f_flag;
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_UNLOCK(fdp);
		mtx_unlock(&fdesc_mtx);
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}
2180
2181SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
2182    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2183
2184SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
2185    &maxfilesperproc, 0, "Maximum files allowed open per process");
2186
2187SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
2188    &maxfiles, 0, "Maximum number of files");
2189
2190SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
2191    &nfiles, 0, "System-wide number of open files");
2192
2193static void
2194fildesc_drvinit(void *unused)
2195{
2196	dev_t dev;
2197
2198	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
2199	make_dev_alias(dev, "stdin");
2200	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
2201	make_dev_alias(dev, "stdout");
2202	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
2203	make_dev_alias(dev, "stderr");
2204}
2205
static fo_rdwr_t	badfo_readwrite;
static fo_ioctl_t	badfo_ioctl;
static fo_poll_t	badfo_poll;
static fo_kqfilter_t	badfo_kqfilter;
static fo_stat_t	badfo_stat;
static fo_close_t	badfo_close;

/*
 * Operations vector for invalid descriptors; files using it are
 * treated as closed by _fget(), which returns EBADF for them.
 */
struct fileops badfileops = {
	badfo_readwrite,
	badfo_readwrite,
	badfo_ioctl,
	badfo_poll,
	badfo_kqfilter,
	badfo_stat,
	badfo_close,
	0
};
2223
2224static int
2225badfo_readwrite(fp, uio, active_cred, flags, td)
2226	struct file *fp;
2227	struct uio *uio;
2228	struct ucred *active_cred;
2229	struct thread *td;
2230	int flags;
2231{
2232
2233	return (EBADF);
2234}
2235
2236static int
2237badfo_ioctl(fp, com, data, active_cred, td)
2238	struct file *fp;
2239	u_long com;
2240	void *data;
2241	struct ucred *active_cred;
2242	struct thread *td;
2243{
2244
2245	return (EBADF);
2246}
2247
/* Polling a bad descriptor reports no events ready. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
2258
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (0);
}
2267
2268static int
2269badfo_stat(fp, sb, active_cred, td)
2270	struct file *fp;
2271	struct stat *sb;
2272	struct ucred *active_cred;
2273	struct thread *td;
2274{
2275
2276	return (EBADF);
2277}
2278
2279static int
2280badfo_close(fp, td)
2281	struct file *fp;
2282	struct thread *td;
2283{
2284
2285	return (EBADF);
2286}
2287
2288SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
2289					fildesc_drvinit,NULL)
2290
2291static void filelistinit(void *);
2292SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
2293
2294/* ARGSUSED*/
static void
filelistinit(dummy)
	void *dummy;
{

	/* Zone from which all struct file allocations are made. */
	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	sx_init(&filelist_lock, "filelist lock");
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
2305