kern_descrip.c revision 51418
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39 * $FreeBSD: head/sys/kern/kern_descrip.c 51418 1999-09-19 17:00:25Z green $
40 */
41
42#include "opt_compat.h"
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/sysproto.h>
46#include <sys/conf.h>
47#include <sys/filedesc.h>
48#include <sys/kernel.h>
49#include <sys/sysctl.h>
50#include <sys/vnode.h>
51#include <sys/proc.h>
52#include <sys/file.h>
53#include <sys/socketvar.h>
54#include <sys/stat.h>
55#include <sys/filio.h>
56#include <sys/ttycom.h>
57#include <sys/fcntl.h>
58#include <sys/malloc.h>
59#include <sys/unistd.h>
60#include <sys/resourcevar.h>
61#include <sys/pipe.h>
62
63#include <vm/vm.h>
64#include <vm/vm_extern.h>
65
/*
 * Kernel malloc types used in this file:
 *   M_FILEDESC - descriptor tables (struct filedesc0 and grown ofile arrays),
 *   M_FILE     - open file structures (not static, so visible outside this file),
 *   M_SIGIO    - sigio structures created by fsetown().
 */
static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
MALLOC_DEFINE(M_FILE, "file", "Open file structure");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
69
70
static	 d_open_t  fdopen;
/* Number of "fd/N" device nodes created by fildesc_drvinit(). */
#define NUMFDESC 64

/* Character device major number registered for the descriptor driver. */
#define CDEV_MAJOR 22
/*
 * Device switch for the file descriptor pseudo-device.  Only the open
 * entry is implemented (fdopen); every other entry is a "no*" stub.
 */
static struct cdevsw fildesc_cdevsw = {
	/* open */	fdopen,
	/* close */	noclose,
	/* read */	noread,
	/* write */	nowrite,
	/* ioctl */	noioctl,
	/* stop */	nostop,
	/* reset */	noreset,
	/* devtotty */	nodevtotty,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	nostrategy,
	/* name */	"FD",
	/* parms */	noparms,
	/* maj */	CDEV_MAJOR,
	/* dump */	nodump,
	/* psize */	nopsize,
	/* flags */	0,
	/* maxio */	0,
	/* bmaj */	-1
};
96
/*
 * Forward declarations: the shared dup helper, and the methods of the
 * placeholder "badfileops" vector installed by falloc() on new files.
 */
static int finishdup __P((struct filedesc *fdp, int old, int new, register_t *retval));
static int badfo_readwrite __P((struct file *fp, struct uio *uio,
    struct ucred *cred, int flags, struct proc *p));
static int badfo_ioctl __P((struct file *fp, u_long com, caddr_t data,
    struct proc *p));
static int badfo_poll __P((struct file *fp, int events,
    struct ucred *cred, struct proc *p));
static int badfo_close __P((struct file *fp, struct proc *p));
/*
 * Descriptor management.
 *
 * filehead and nfiles are maintained by falloc() and ffree(); nfiles is
 * compared against maxfiles before a new file structure is created.
 */
struct filelist filehead;	/* head of list of open files */
int nfiles;			/* actual number of open files */
extern int cmask;		/* copied into fd_cmask of new tables by fdinit() */
111
112/*
113 * System calls on descriptors.
114 */
115#ifndef _SYS_SYSPROTO_H_
116struct getdtablesize_args {
117	int	dummy;
118};
119#endif
120/* ARGSUSED */
121int
122getdtablesize(p, uap)
123	struct proc *p;
124	struct getdtablesize_args *uap;
125{
126
127	p->p_retval[0] =
128	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
129	return (0);
130}
131
132/*
133 * Duplicate a file descriptor to a particular value.
134 */
135#ifndef _SYS_SYSPROTO_H_
136struct dup2_args {
137	u_int	from;
138	u_int	to;
139};
140#endif
141/* ARGSUSED */
142int
143dup2(p, uap)
144	struct proc *p;
145	struct dup2_args *uap;
146{
147	register struct filedesc *fdp = p->p_fd;
148	register u_int old = uap->from, new = uap->to;
149	int i, error;
150
151	if (old >= fdp->fd_nfiles ||
152	    fdp->fd_ofiles[old] == NULL ||
153	    new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
154	    new >= maxfilesperproc)
155		return (EBADF);
156	if (old == new) {
157		p->p_retval[0] = new;
158		return (0);
159	}
160	if (new >= fdp->fd_nfiles) {
161		if ((error = fdalloc(p, new, &i)))
162			return (error);
163		if (new != i)
164			panic("dup2: fdalloc");
165	} else if (fdp->fd_ofiles[new]) {
166		if (fdp->fd_ofileflags[new] & UF_MAPPED)
167			(void) munmapfd(p, new);
168		/*
169		 * dup2() must succeed even if the close has an error.
170		 */
171		(void) closef(fdp->fd_ofiles[new], p);
172	}
173	return (finishdup(fdp, (int)old, (int)new, p->p_retval));
174}
175
176/*
177 * Duplicate a file descriptor.
178 */
179#ifndef _SYS_SYSPROTO_H_
180struct dup_args {
181	u_int	fd;
182};
183#endif
184/* ARGSUSED */
185int
186dup(p, uap)
187	struct proc *p;
188	struct dup_args *uap;
189{
190	register struct filedesc *fdp;
191	u_int old;
192	int new, error;
193
194	old = uap->fd;
195
196#if 0
197	/*
198	 * XXX Compatibility
199	 */
200	if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, p->p_retval)); }
201#endif
202
203	fdp = p->p_fd;
204	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL)
205		return (EBADF);
206	if ((error = fdalloc(p, 0, &new)))
207		return (error);
208	return (finishdup(fdp, (int)old, new, p->p_retval));
209}
210
/*
 * The file control system call.
 *
 * Dispatches on uap->cmd for the open descriptor uap->fd.  Returns
 * EBADF if the descriptor is not open and EINVAL for an unknown
 * command; results are delivered through p->p_retval[0].
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
fcntl(p, uap)
	struct proc *p;
	register struct fcntl_args *uap;
{
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	register char *pop;
	struct vnode *vp;
	int i, tmp, error, flg = F_POSIX;
	struct flock fl;
	u_int newmin;

	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
		return (EBADF);
	/* pop addresses the per-descriptor flag byte for this descriptor. */
	pop = &fdp->fd_ofileflags[uap->fd];
	switch (uap->cmd) {

	case F_DUPFD:
		/* Duplicate onto the lowest free descriptor >= arg. */
		newmin = uap->arg;
		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
		    newmin >= maxfilesperproc)
			return (EINVAL);
		if ((error = fdalloc(p, newmin, &i)))
			return (error);
		return (finishdup(fdp, uap->fd, i, p->p_retval));

	case F_GETFD:
		/* Report bit 0 of the descriptor flags (presumably UF_EXCLOSE). */
		p->p_retval[0] = *pop & 1;
		return (0);

	case F_SETFD:
		/* Replace bit 0 of the descriptor flags; other bits are kept. */
		*pop = (*pop &~ 1) | (uap->arg & 1);
		return (0);

	case F_GETFL:
		p->p_retval[0] = OFLAGS(fp->f_flag);
		return (0);

	case F_SETFL:
		/*
		 * Install the new status flags, then tell the object about
		 * the non-blocking and async settings via ioctl.
		 */
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS;
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
		if (error)
			return (error);
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
		if (!error)
			return (0);
		/*
		 * FIOASYNC failed: turn non-blocking mode back off (in the
		 * flags and in the object) and report the FIOASYNC error.
		 */
		fp->f_flag &= ~FNONBLOCK;
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
		return (error);

	case F_GETOWN:
		return (fo_ioctl(fp, FIOGETOWN, (caddr_t)p->p_retval, p));

	case F_SETOWN:
		return (fo_ioctl(fp, FIOSETOWN, (caddr_t)&uap->arg, p));

	case F_SETLKW:
		/* Same as F_SETLK, but ask the lock code to wait. */
		flg |= F_WAIT;
		/* Fall into F_SETLK */

	case F_SETLK:
		/* Record locks are only supported on vnodes. */
		if (fp->f_type != DTYPE_VNODE)
			return (EBADF);
		vp = (struct vnode *)fp->f_data;
		/* Copy in the lock structure */
		error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
		    sizeof(fl));
		if (error)
			return (error);
		/* Make a SEEK_CUR start relative to the current file offset. */
		if (fl.l_whence == SEEK_CUR)
			fl.l_start += fp->f_offset;
		switch (fl.l_type) {

		case F_RDLCK:
			/* A read lock requires the file to be open for reading. */
			if ((fp->f_flag & FREAD) == 0)
				return (EBADF);
			/* P_ADVLOCK makes closef() release POSIX locks on close. */
			p->p_flag |= P_ADVLOCK;
			return (VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, &fl, flg));

		case F_WRLCK:
			/* A write lock requires the file to be open for writing. */
			if ((fp->f_flag & FWRITE) == 0)
				return (EBADF);
			p->p_flag |= P_ADVLOCK;
			return (VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, &fl, flg));

		case F_UNLCK:
			return (VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &fl,
				F_POSIX));

		default:
			return (EINVAL);
		}

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE)
			return (EBADF);
		vp = (struct vnode *)fp->f_data;
		/* Copy in the lock structure */
		error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
		    sizeof(fl));
		if (error)
			return (error);
		if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK &&
		    fl.l_type != F_UNLCK)
			return (EINVAL);
		if (fl.l_whence == SEEK_CUR)
			fl.l_start += fp->f_offset;
		if ((error = VOP_ADVLOCK(vp,(caddr_t)p->p_leader,F_GETLK,&fl,F_POSIX)))
			return (error);
		/* Hand the (possibly updated) lock description back to the caller. */
		return (copyout((caddr_t)&fl, (caddr_t)(intptr_t)uap->arg,
		    sizeof(fl)));

	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}
345
346/*
347 * Common code for dup, dup2, and fcntl(F_DUPFD).
348 */
349static int
350finishdup(fdp, old, new, retval)
351	register struct filedesc *fdp;
352	register int old, new;
353	register_t *retval;
354{
355	register struct file *fp;
356
357	fp = fdp->fd_ofiles[old];
358	fdp->fd_ofiles[new] = fp;
359	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
360	fhold(fp);
361	if (new > fdp->fd_lastfile)
362		fdp->fd_lastfile = new;
363	*retval = new;
364	return (0);
365}
366
367/*
368 * If sigio is on the list associated with a process or process group,
369 * disable signalling from the device, remove sigio from the list and
370 * free sigio.
371 */
372void
373funsetown(sigio)
374	struct sigio *sigio;
375{
376	int s;
377
378	if (sigio == NULL)
379		return;
380	s = splhigh();
381	*(sigio->sio_myref) = NULL;
382	splx(s);
383	if (sigio->sio_pgid < 0) {
384		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
385			     sigio, sio_pgsigio);
386	} else /* if ((*sigiop)->sio_pgid > 0) */ {
387		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
388			     sigio, sio_pgsigio);
389	}
390	crfree(sigio->sio_ucred);
391	FREE(sigio, M_SIGIO);
392}
393
394/* Free a list of sigio structures. */
395void
396funsetownlst(sigiolst)
397	struct sigiolst *sigiolst;
398{
399	struct sigio *sigio;
400
401	while ((sigio = sigiolst->slh_first) != NULL)
402		funsetown(sigio);
403}
404
405/*
406 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
407 *
408 * After permission checking, add a sigio structure to the sigio list for
409 * the process or process group.
410 */
411int
412fsetown(pgid, sigiop)
413	pid_t pgid;
414	struct sigio **sigiop;
415{
416	struct proc *proc;
417	struct pgrp *pgrp;
418	struct sigio *sigio;
419	int s;
420
421	if (pgid == 0) {
422		funsetown(*sigiop);
423		return (0);
424	}
425	if (pgid > 0) {
426		proc = pfind(pgid);
427		if (proc == NULL)
428			return (ESRCH);
429		/*
430		 * Policy - Don't allow a process to FSETOWN a process
431		 * in another session.
432		 *
433		 * Remove this test to allow maximum flexibility or
434		 * restrict FSETOWN to the current process or process
435		 * group for maximum safety.
436		 */
437		else if (proc->p_session != curproc->p_session)
438			return (EPERM);
439		pgrp = NULL;
440	} else /* if (pgid < 0) */ {
441		pgrp = pgfind(-pgid);
442		if (pgrp == NULL)
443			return (ESRCH);
444		/*
445		 * Policy - Don't allow a process to FSETOWN a process
446		 * in another session.
447		 *
448		 * Remove this test to allow maximum flexibility or
449		 * restrict FSETOWN to the current process or process
450		 * group for maximum safety.
451		 */
452		else if (pgrp->pg_session != curproc->p_session)
453			return (EPERM);
454		proc = NULL;
455	}
456	funsetown(*sigiop);
457	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO,
458	       M_WAITOK);
459	if (pgid > 0) {
460		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
461		sigio->sio_proc = proc;
462	} else {
463		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
464		sigio->sio_pgrp = pgrp;
465	}
466	sigio->sio_pgid = pgid;
467	crhold(curproc->p_ucred);
468	sigio->sio_ucred = curproc->p_ucred;
469	/* It would be convenient if p_ruid was in ucred. */
470	sigio->sio_ruid = curproc->p_cred->p_ruid;
471	sigio->sio_myref = sigiop;
472	s = splhigh();
473	*sigiop = sigio;
474	splx(s);
475	return (0);
476}
477
478/*
479 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
480 */
481pid_t
482fgetown(sigio)
483	struct sigio *sigio;
484{
485	return (sigio != NULL ? sigio->sio_pgid : 0);
486}
487
488/*
489 * Close a file descriptor.
490 */
491#ifndef _SYS_SYSPROTO_H_
492struct close_args {
493        int     fd;
494};
495#endif
496/* ARGSUSED */
497int
498close(p, uap)
499	struct proc *p;
500	struct close_args *uap;
501{
502	register struct filedesc *fdp = p->p_fd;
503	register struct file *fp;
504	register int fd = uap->fd;
505	register u_char *pf;
506
507	if ((unsigned)fd >= fdp->fd_nfiles ||
508	    (fp = fdp->fd_ofiles[fd]) == NULL)
509		return (EBADF);
510	pf = (u_char *)&fdp->fd_ofileflags[fd];
511	if (*pf & UF_MAPPED)
512		(void) munmapfd(p, fd);
513	fdp->fd_ofiles[fd] = NULL;
514	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
515		fdp->fd_lastfile--;
516	if (fd < fdp->fd_freefile)
517		fdp->fd_freefile = fd;
518	*pf = 0;
519	return (closef(fp, p));
520}
521
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Return status information about a file descriptor, in the old
 * (struct ostat) layout.
 *
 * Returns EBADF if the descriptor is not open, otherwise the error
 * from the type-specific stat routine or from copyout().  Panics on
 * an unknown file type.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(p, uap)
	struct proc *p;
	register struct ofstat_args *uap;
{
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	struct stat ub;
	struct ostat oub;
	int error;

	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
		return (EBADF);
	switch (fp->f_type) {

	case DTYPE_FIFO:
	case DTYPE_VNODE:
		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
		break;

	case DTYPE_SOCKET:
		error = soo_stat((struct socket *)fp->f_data, &ub);
		break;

	case DTYPE_PIPE:
		error = pipe_stat((struct pipe *)fp->f_data, &ub);
		break;

	default:
		panic("ofstat");
		/*NOTREACHED*/
	}
	/*
	 * Convert only after a successful stat: on failure "ub" may not
	 * have been filled in, so there is nothing valid to convert.
	 */
	if (error == 0) {
		cvtstat(&ub, &oub);
		error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub));
	}
	return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
572
573/*
574 * Return status information about a file descriptor.
575 */
576#ifndef _SYS_SYSPROTO_H_
577struct fstat_args {
578	int	fd;
579	struct	stat *sb;
580};
581#endif
582/* ARGSUSED */
583int
584fstat(p, uap)
585	struct proc *p;
586	register struct fstat_args *uap;
587{
588	register struct filedesc *fdp = p->p_fd;
589	register struct file *fp;
590	struct stat ub;
591	int error;
592
593	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
594	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
595		return (EBADF);
596	switch (fp->f_type) {
597
598	case DTYPE_FIFO:
599	case DTYPE_VNODE:
600		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
601		break;
602
603	case DTYPE_SOCKET:
604		error = soo_stat((struct socket *)fp->f_data, &ub);
605		break;
606
607	case DTYPE_PIPE:
608		error = pipe_stat((struct pipe *)fp->f_data, &ub);
609		break;
610
611	default:
612		panic("fstat");
613		/*NOTREACHED*/
614	}
615	if (error == 0)
616		error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub));
617	return (error);
618}
619
620/*
621 * Return status information about a file descriptor.
622 */
623#ifndef _SYS_SYSPROTO_H_
624struct nfstat_args {
625	int	fd;
626	struct	nstat *sb;
627};
628#endif
629/* ARGSUSED */
630int
631nfstat(p, uap)
632	struct proc *p;
633	register struct nfstat_args *uap;
634{
635	register struct filedesc *fdp = p->p_fd;
636	register struct file *fp;
637	struct stat ub;
638	struct nstat nub;
639	int error;
640
641	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
642	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
643		return (EBADF);
644	switch (fp->f_type) {
645
646	case DTYPE_FIFO:
647	case DTYPE_VNODE:
648		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
649		break;
650
651	case DTYPE_SOCKET:
652		error = soo_stat((struct socket *)fp->f_data, &ub);
653		break;
654
655	case DTYPE_PIPE:
656		error = pipe_stat((struct pipe *)fp->f_data, &ub);
657		break;
658
659	default:
660		panic("fstat");
661		/*NOTREACHED*/
662	}
663	if (error == 0) {
664		cvtnstat(&ub, &nub);
665		error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub));
666	}
667	return (error);
668}
669
670/*
671 * Return pathconf information about a file descriptor.
672 */
673#ifndef _SYS_SYSPROTO_H_
674struct fpathconf_args {
675	int	fd;
676	int	name;
677};
678#endif
679/* ARGSUSED */
680int
681fpathconf(p, uap)
682	struct proc *p;
683	register struct fpathconf_args *uap;
684{
685	struct filedesc *fdp = p->p_fd;
686	struct file *fp;
687	struct vnode *vp;
688
689	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
690	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
691		return (EBADF);
692	switch (fp->f_type) {
693
694	case DTYPE_PIPE:
695	case DTYPE_SOCKET:
696		if (uap->name != _PC_PIPE_BUF)
697			return (EINVAL);
698		p->p_retval[0] = PIPE_BUF;
699		return (0);
700
701	case DTYPE_FIFO:
702	case DTYPE_VNODE:
703		vp = (struct vnode *)fp->f_data;
704		return (VOP_PATHCONF(vp, uap->name, p->p_retval));
705
706	default:
707		panic("fpathconf");
708	}
709	/*NOTREACHED*/
710}
711
/*
 * Allocate a file descriptor for the process.
 *
 * Finds the lowest free descriptor that is >= "want" (and >= the
 * fd_freefile hint) and returns it through *result.  The slot itself
 * is left NULL; only its flag byte and the hints are updated.  When
 * the current table has no such slot it is grown and the search is
 * repeated.  Returns EMFILE once the table has reached the smaller of
 * the soft RLIMIT_NOFILE limit and maxfilesperproc, 0 on success.
 */
/* Counts how many times a descriptor table has been expanded (debug.fdexpand). */
static int fdexpand;
SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");

int
fdalloc(p, want, result)
	struct proc *p;
	int want;
	int *result;
{
	register struct filedesc *fdp = p->p_fd;
	register int i;
	/* NB: this local "nfiles" shadows the file-scope open-file counter. */
	int lim, last, nfiles;
	struct file **newofile;
	char *newofileflags;

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	for (;;) {
		last = min(fdp->fd_nfiles, lim);
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		for (; i < last; i++) {
			if (fdp->fd_ofiles[i] == NULL) {
				fdp->fd_ofileflags[i] = 0;
				if (i > fdp->fd_lastfile)
					fdp->fd_lastfile = i;
				/*
				 * Only move the free hint when the search
				 * started at or below it.
				 */
				if (want <= fdp->fd_freefile)
					fdp->fd_freefile = i;
				*result = i;
				return (0);
			}
		}

		/*
		 * No space in current array.  Expand?
		 */
		if (fdp->fd_nfiles >= lim)
			return (EMFILE);
		/* Grow to NDEXTENT entries first, then double each time. */
		if (fdp->fd_nfiles < NDEXTENT)
			nfiles = NDEXTENT;
		else
			nfiles = 2 * fdp->fd_nfiles;
		/*
		 * One allocation holds both arrays: the file pointers,
		 * followed directly by the per-descriptor flag bytes.
		 */
		MALLOC(newofile, struct file **, nfiles * OFILESIZE,
		    M_FILEDESC, M_WAITOK);
		newofileflags = (char *) &newofile[nfiles];
		/*
		 * Copy the existing ofile and ofileflags arrays
		 * and zero the new portion of each array.
		 */
		bcopy(fdp->fd_ofiles, newofile,
			(i = sizeof(struct file *) * fdp->fd_nfiles));
		bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i);
		bcopy(fdp->fd_ofileflags, newofileflags,
			(i = sizeof(char) * fdp->fd_nfiles));
		bzero(newofileflags + i, nfiles * sizeof(char) - i);
		/*
		 * A table of NDFILE entries is the one embedded in
		 * struct filedesc0 (see fdinit), so only larger tables
		 * are freed.
		 */
		if (fdp->fd_nfiles > NDFILE)
			FREE(fdp->fd_ofiles, M_FILEDESC);
		fdp->fd_ofiles = newofile;
		fdp->fd_ofileflags = newofileflags;
		fdp->fd_nfiles = nfiles;
		fdexpand++;
	}
	/* Not reached: the loop above only exits through a return. */
	return (0);
}
783
784/*
785 * Check to see whether n user file descriptors
786 * are available to the process p.
787 */
788int
789fdavail(p, n)
790	struct proc *p;
791	register int n;
792{
793	register struct filedesc *fdp = p->p_fd;
794	register struct file **fpp;
795	register int i, lim, last;
796
797	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
798	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
799		return (1);
800
801	last = min(fdp->fd_nfiles, lim);
802	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
803	for (i = last - fdp->fd_freefile; --i >= 0; fpp++)
804		if (*fpp == NULL && --n <= 0)
805			return (1);
806	return (0);
807}
808
809/*
810 * Create a new open file structure and allocate
811 * a file decriptor for the process that refers to it.
812 */
813int
814falloc(p, resultfp, resultfd)
815	register struct proc *p;
816	struct file **resultfp;
817	int *resultfd;
818{
819	register struct file *fp, *fq;
820	int error, i;
821
822	if ((error = fdalloc(p, 0, &i)))
823		return (error);
824	if (nfiles >= maxfiles) {
825		tablefull("file");
826		return (ENFILE);
827	}
828	/*
829	 * Allocate a new file descriptor.
830	 * If the process has file descriptor zero open, add to the list
831	 * of open files at that point, otherwise put it at the front of
832	 * the list of open files.
833	 */
834	nfiles++;
835	MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK);
836	bzero(fp, sizeof(struct file));
837	fp->f_count = 1;
838	fp->f_cred = p->p_ucred;
839	fp->f_ops = &badfileops;
840	fp->f_seqcount = 1;
841	crhold(fp->f_cred);
842	if ((fq = p->p_fd->fd_ofiles[0])) {
843		LIST_INSERT_AFTER(fq, fp, f_list);
844	} else {
845		LIST_INSERT_HEAD(&filehead, fp, f_list);
846	}
847	p->p_fd->fd_ofiles[i] = fp;
848	if (resultfp)
849		*resultfp = fp;
850	if (resultfd)
851		*resultfd = i;
852	return (0);
853}
854
855/*
856 * Free a file descriptor.
857 */
858void
859ffree(fp)
860	register struct file *fp;
861{
862	LIST_REMOVE(fp, f_list);
863	crfree(fp->f_cred);
864#if defined(DIAGNOSTIC) || defined(INVARIANTS)
865	fp->f_count = 0;
866#endif
867	nfiles--;
868	FREE(fp, M_FILE);
869}
870
871/*
872 * Build a new filedesc structure.
873 */
874struct filedesc *
875fdinit(p)
876	struct proc *p;
877{
878	register struct filedesc0 *newfdp;
879	register struct filedesc *fdp = p->p_fd;
880
881	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
882	    M_FILEDESC, M_WAITOK);
883	bzero(newfdp, sizeof(struct filedesc0));
884	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
885	VREF(newfdp->fd_fd.fd_cdir);
886	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
887	VREF(newfdp->fd_fd.fd_rdir);
888
889	/* Create the file descriptor table. */
890	newfdp->fd_fd.fd_refcnt = 1;
891	newfdp->fd_fd.fd_cmask = cmask;
892	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
893	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
894	newfdp->fd_fd.fd_nfiles = NDFILE;
895
896	newfdp->fd_fd.fd_freefile = 0;
897	newfdp->fd_fd.fd_lastfile = 0;
898
899	return (&newfdp->fd_fd);
900}
901
902/*
903 * Share a filedesc structure.
904 */
905struct filedesc *
906fdshare(p)
907	struct proc *p;
908{
909	p->p_fd->fd_refcnt++;
910	return (p->p_fd);
911}
912
913/*
914 * Copy a filedesc structure.
915 */
916struct filedesc *
917fdcopy(p)
918	struct proc *p;
919{
920	register struct filedesc *newfdp, *fdp = p->p_fd;
921	register struct file **fpp;
922	register int i;
923
924/*
925 * Certain daemons might not have file descriptors
926 */
927	if (fdp == NULL)
928		return NULL;
929
930	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
931	    M_FILEDESC, M_WAITOK);
932	bcopy(fdp, newfdp, sizeof(struct filedesc));
933	VREF(newfdp->fd_cdir);
934	VREF(newfdp->fd_rdir);
935	newfdp->fd_refcnt = 1;
936
937	/*
938	 * If the number of open files fits in the internal arrays
939	 * of the open file structure, use them, otherwise allocate
940	 * additional memory for the number of descriptors currently
941	 * in use.
942	 */
943	if (newfdp->fd_lastfile < NDFILE) {
944		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
945		newfdp->fd_ofileflags =
946		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
947		i = NDFILE;
948	} else {
949		/*
950		 * Compute the smallest multiple of NDEXTENT needed
951		 * for the file descriptors currently in use,
952		 * allowing the table to shrink.
953		 */
954		i = newfdp->fd_nfiles;
955		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
956			i /= 2;
957		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
958		    M_FILEDESC, M_WAITOK);
959		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
960	}
961	newfdp->fd_nfiles = i;
962	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
963	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
964	fpp = newfdp->fd_ofiles;
965	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++)
966		if (*fpp != NULL)
967			fhold(*fpp);
968	return (newfdp);
969}
970
971/*
972 * Release a filedesc structure.
973 */
974void
975fdfree(p)
976	struct proc *p;
977{
978	register struct filedesc *fdp = p->p_fd;
979	struct file **fpp;
980	register int i;
981
982/*
983 * Certain daemons might not have file descriptors
984 */
985	if (fdp == NULL)
986		return;
987
988	if (--fdp->fd_refcnt > 0)
989		return;
990	fpp = fdp->fd_ofiles;
991	for (i = fdp->fd_lastfile; i-- >= 0; fpp++)
992		if (*fpp)
993			(void) closef(*fpp, p);
994	if (fdp->fd_nfiles > NDFILE)
995		FREE(fdp->fd_ofiles, M_FILEDESC);
996	vrele(fdp->fd_cdir);
997	vrele(fdp->fd_rdir);
998	FREE(fdp, M_FILEDESC);
999}
1000
1001/*
1002 * Close any files on exec?
1003 */
1004void
1005fdcloseexec(p)
1006	struct proc *p;
1007{
1008	struct filedesc *fdp = p->p_fd;
1009	struct file **fpp;
1010	char *fdfp;
1011	register int i;
1012
1013/*
1014 * Certain daemons might not have file descriptors
1015 */
1016	if (fdp == NULL)
1017		return;
1018
1019	fpp = fdp->fd_ofiles;
1020	fdfp = fdp->fd_ofileflags;
1021	for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++)
1022		if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) {
1023			if (*fdfp & UF_MAPPED)
1024				(void) munmapfd(p, i);
1025			(void) closef(*fpp, p);
1026			*fpp = NULL;
1027			*fdfp = 0;
1028			if (i < fdp->fd_freefile)
1029				fdp->fd_freefile = i;
1030		}
1031	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1032		fdp->fd_lastfile--;
1033}
1034
1035/*
1036 * Internal form of close.
1037 * Decrement reference count on file structure.
1038 * Note: p may be NULL when closing a file
1039 * that was being passed in a message.
1040 */
1041int
1042closef(fp, p)
1043	register struct file *fp;
1044	register struct proc *p;
1045{
1046	struct vnode *vp;
1047	struct flock lf;
1048
1049	if (fp == NULL)
1050		return (0);
1051	/*
1052	 * POSIX record locking dictates that any close releases ALL
1053	 * locks owned by this process.  This is handled by setting
1054	 * a flag in the unlock to free ONLY locks obeying POSIX
1055	 * semantics, and not to free BSD-style file locks.
1056	 * If the descriptor was in a message, POSIX-style locks
1057	 * aren't passed with the descriptor.
1058	 */
1059	if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) {
1060		lf.l_whence = SEEK_SET;
1061		lf.l_start = 0;
1062		lf.l_len = 0;
1063		lf.l_type = F_UNLCK;
1064		vp = (struct vnode *)fp->f_data;
1065		(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX);
1066	}
1067	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
1068		lf.l_whence = SEEK_SET;
1069		lf.l_start = 0;
1070		lf.l_len = 0;
1071		lf.l_type = F_UNLCK;
1072		vp = (struct vnode *)fp->f_data;
1073		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
1074	}
1075	return (fdrop(fp, p));
1076}
1077
1078int
1079fdrop(fp, p)
1080	struct file *fp;
1081	struct proc *p;
1082{
1083	int error;
1084
1085	if (--fp->f_count > 0)
1086		return (0);
1087	if (fp->f_count < 0)
1088		panic("fdrop: count < 0");
1089	if (fp->f_ops != &badfileops)
1090		error = fo_close(fp, p);
1091	else
1092		error = 0;
1093	ffree(fp);
1094	return (error);
1095}
1096
1097/*
1098 * Apply an advisory lock on a file descriptor.
1099 *
1100 * Just attempt to get a record lock of the requested type on
1101 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
1102 */
1103#ifndef _SYS_SYSPROTO_H_
1104struct flock_args {
1105	int	fd;
1106	int	how;
1107};
1108#endif
1109/* ARGSUSED */
1110int
1111flock(p, uap)
1112	struct proc *p;
1113	register struct flock_args *uap;
1114{
1115	register struct filedesc *fdp = p->p_fd;
1116	register struct file *fp;
1117	struct vnode *vp;
1118	struct flock lf;
1119
1120	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
1121	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1122		return (EBADF);
1123	if (fp->f_type != DTYPE_VNODE)
1124		return (EOPNOTSUPP);
1125	vp = (struct vnode *)fp->f_data;
1126	lf.l_whence = SEEK_SET;
1127	lf.l_start = 0;
1128	lf.l_len = 0;
1129	if (uap->how & LOCK_UN) {
1130		lf.l_type = F_UNLCK;
1131		fp->f_flag &= ~FHASLOCK;
1132		return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK));
1133	}
1134	if (uap->how & LOCK_EX)
1135		lf.l_type = F_WRLCK;
1136	else if (uap->how & LOCK_SH)
1137		lf.l_type = F_RDLCK;
1138	else
1139		return (EBADF);
1140	fp->f_flag |= FHASLOCK;
1141	if (uap->how & LOCK_NB)
1142		return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK));
1143	return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT));
1144}
1145
1146/*
1147 * File Descriptor pseudo-device driver (/dev/fd/).
1148 *
1149 * Opening minor device N dup()s the file (if any) connected to file
1150 * descriptor N belonging to the calling process.  Note that this driver
1151 * consists of only the ``open()'' routine, because all subsequent
1152 * references to this file will be direct to the other driver.
1153 */
1154/* ARGSUSED */
1155static int
1156fdopen(dev, mode, type, p)
1157	dev_t dev;
1158	int mode, type;
1159	struct proc *p;
1160{
1161
1162	/*
1163	 * XXX Kludge: set curproc->p_dupfd to contain the value of the
1164	 * the file descriptor being sought for duplication. The error
1165	 * return ensures that the vnode for this device will be released
1166	 * by vn_open. Open will detect this special error and take the
1167	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1168	 * will simply report the error.
1169	 */
1170	p->p_dupfd = minor(dev);
1171	return (ENODEV);
1172}
1173
1174/*
1175 * Duplicate the specified descriptor to a free descriptor.
1176 */
1177int
1178dupfdopen(fdp, indx, dfd, mode, error)
1179	register struct filedesc *fdp;
1180	register int indx, dfd;
1181	int mode;
1182	int error;
1183{
1184	register struct file *wfp;
1185	struct file *fp;
1186
1187	/*
1188	 * If the to-be-dup'd fd number is greater than the allowed number
1189	 * of file descriptors, or the fd to be dup'd has already been
1190	 * closed, reject.  Note, check for new == old is necessary as
1191	 * falloc could allocate an already closed to-be-dup'd descriptor
1192	 * as the new descriptor.
1193	 */
1194	fp = fdp->fd_ofiles[indx];
1195	if ((u_int)dfd >= fdp->fd_nfiles ||
1196	    (wfp = fdp->fd_ofiles[dfd]) == NULL || fp == wfp)
1197		return (EBADF);
1198
1199	/*
1200	 * There are two cases of interest here.
1201	 *
1202	 * For ENODEV simply dup (dfd) to file descriptor
1203	 * (indx) and return.
1204	 *
1205	 * For ENXIO steal away the file structure from (dfd) and
1206	 * store it in (indx).  (dfd) is effectively closed by
1207	 * this operation.
1208	 *
1209	 * Any other error code is just returned.
1210	 */
1211	switch (error) {
1212	case ENODEV:
1213		/*
1214		 * Check that the mode the file is being opened for is a
1215		 * subset of the mode of the existing descriptor.
1216		 */
1217		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag)
1218			return (EACCES);
1219		fdp->fd_ofiles[indx] = wfp;
1220		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
1221		fhold(wfp);
1222		if (indx > fdp->fd_lastfile)
1223			fdp->fd_lastfile = indx;
1224		return (0);
1225
1226	case ENXIO:
1227		/*
1228		 * Steal away the file pointer from dfd, and stuff it into indx.
1229		 */
1230		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
1231		fdp->fd_ofiles[dfd] = NULL;
1232		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
1233		fdp->fd_ofileflags[dfd] = 0;
1234		/*
1235		 * Complete the clean up of the filedesc structure by
1236		 * recomputing the various hints.
1237		 */
1238		if (indx > fdp->fd_lastfile)
1239			fdp->fd_lastfile = indx;
1240		else
1241			while (fdp->fd_lastfile > 0 &&
1242			       fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1243				fdp->fd_lastfile--;
1244			if (dfd < fdp->fd_freefile)
1245				fdp->fd_freefile = dfd;
1246		return (0);
1247
1248	default:
1249		return (error);
1250	}
1251	/* NOTREACHED */
1252}
1253
1254/*
1255 * Get file structures.
1256 */
1257static int
1258sysctl_kern_file SYSCTL_HANDLER_ARGS
1259{
1260	int error;
1261	struct file *fp;
1262
1263	if (!req->oldptr) {
1264		/*
1265		 * overestimate by 10 files
1266		 */
1267		return (SYSCTL_OUT(req, 0, sizeof(filehead) +
1268				(nfiles + 10) * sizeof(struct file)));
1269	}
1270
1271	error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead));
1272	if (error)
1273		return (error);
1274
1275	/*
1276	 * followed by an array of file structures
1277	 */
1278	for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) {
1279		error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file));
1280		if (error)
1281			return (error);
1282	}
1283	return (0);
1284}
1285
/* kern.file: read-only opaque dump of the file table via sysctl_kern_file(). */
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_kern_file, "S,file", "Entire file table");

/* kern.maxfilesperproc: read-write per-process descriptor limit. */
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

/* kern.maxfiles: read-write system-wide limit checked by falloc(). */
SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");
1294
1295static void
1296fildesc_drvinit(void *unused)
1297{
1298	int fd;
1299
1300	cdevsw_add(&fildesc_cdevsw);
1301	for (fd = 0; fd < NUMFDESC; fd++)
1302		make_dev(&fildesc_cdevsw, fd,
1303		    UID_BIN, GID_BIN, 0666, "fd/%d", fd);
1304	make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "stdin");
1305	make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "stdout");
1306	make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "stderr");
1307}
1308
/*
 * Placeholder operations vector that falloc() installs in every new
 * file.  fdrop() skips fo_close() while a file still points at this
 * vector.  The same handler serves the first two (read/write) slots.
 */
struct fileops badfileops = {
	badfo_readwrite,
	badfo_readwrite,
	badfo_ioctl,
	badfo_poll,
	badfo_close
};
1316
1317static int
1318badfo_readwrite(fp, uio, cred, flags, p)
1319	struct file *fp;
1320	struct uio *uio;
1321	struct ucred *cred;
1322	struct proc *p;
1323	int flags;
1324{
1325
1326	return (EBADF);
1327}
1328
1329static int
1330badfo_ioctl(fp, com, data, p)
1331	struct file *fp;
1332	u_long com;
1333	caddr_t data;
1334	struct proc *p;
1335{
1336
1337	return (EBADF);
1338}
1339
/*
 * Poll method of badfileops: always returns 0.
 */
static int
badfo_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{

	return (0);
}
1350
1351static int
1352badfo_close(fp, p)
1353	struct file *fp;
1354	struct proc *p;
1355{
1356
1357	return (EBADF);
1358}
1359
/* Run fildesc_drvinit() at boot, in the SI_SUB_DRIVERS stage. */
SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
					fildesc_drvinit,NULL)
1362
1363
1364