kern_descrip.c revision 50477
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39 * $FreeBSD: head/sys/kern/kern_descrip.c 50477 1999-08-28 01:08:13Z peter $
40 */
41
42#include "opt_compat.h"
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/sysproto.h>
46#include <sys/conf.h>
47#include <sys/filedesc.h>
48#include <sys/kernel.h>
49#include <sys/sysctl.h>
50#include <sys/vnode.h>
51#include <sys/proc.h>
52#include <sys/file.h>
53#include <sys/socketvar.h>
54#include <sys/stat.h>
55#include <sys/filio.h>
56#include <sys/ttycom.h>
57#include <sys/fcntl.h>
58#include <sys/malloc.h>
59#include <sys/unistd.h>
60#include <sys/resourcevar.h>
61#include <sys/pipe.h>
62
63#include <vm/vm.h>
64#include <vm/vm_extern.h>
65
66static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
67MALLOC_DEFINE(M_FILE, "file", "Open file structure");
68static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
69
70
/* Open entry point of the descriptor pseudo-device; defined later in this file. */
71static	 d_open_t  fdopen;
/* Number of "fd/N" device nodes that fildesc_drvinit() creates. */
72#define NUMFDESC 64
73
/* Major number stored in the "maj" slot of the switch table below. */
74#define CDEV_MAJOR 22
/*
 * Character-device switch for the descriptor pseudo-device.  Only the
 * open slot refers to a routine from this file (fdopen); every other
 * slot is filled with one of the no* placeholders.
 */
75static struct cdevsw fildesc_cdevsw = {
76	/* open */	fdopen,
77	/* close */	noclose,
78	/* read */	noread,
79	/* write */	nowrite,
80	/* ioctl */	noioctl,
81	/* stop */	nostop,
82	/* reset */	noreset,
83	/* devtotty */	nodevtotty,
84	/* poll */	nopoll,
85	/* mmap */	nommap,
86	/* strategy */	nostrategy,
87	/* name */	"FD",
88	/* parms */	noparms,
89	/* maj */	CDEV_MAJOR,
90	/* dump */	nodump,
91	/* psize */	nopsize,
92	/* flags */	0,
93	/* maxio */	0,
94	/* bmaj */	-1
95};
96
/*
 * Forward declarations: the helper shared by dup(), dup2() and
 * fcntl(F_DUPFD), and the handlers that populate badfileops.
 */
97static int finishdup __P((struct filedesc *fdp, int old, int new, register_t *retval));
98static int badfo_readwrite __P((struct file *fp, struct uio *uio,
99    struct ucred *cred, int flags));
100static int badfo_ioctl __P((struct file *fp, u_long com, caddr_t data,
101    struct proc *p));
102static int badfo_poll __P((struct file *fp, int events,
103    struct ucred *cred, struct proc *p));
104static int badfo_close __P((struct file *fp, struct proc *p));
105/*
106 * Descriptor management.
107 */
108struct filelist filehead;	/* head of list of open files */
109int nfiles;			/* actual number of open files */
/* Defined elsewhere; fdinit() copies it into a new table's fd_cmask. */
110extern int cmask;
111
112/*
113 * System calls on descriptors.
114 */
115#ifndef _SYS_SYSPROTO_H_
116struct getdtablesize_args {
117	int	dummy;
118};
119#endif
120/* ARGSUSED */
121int
122getdtablesize(p, uap)
123	struct proc *p;
124	struct getdtablesize_args *uap;
125{
126
127	p->p_retval[0] =
128	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
129	return (0);
130}
131
132/*
133 * Duplicate a file descriptor to a particular value.
134 */
135#ifndef _SYS_SYSPROTO_H_
136struct dup2_args {
137	u_int	from;
138	u_int	to;
139};
140#endif
141/* ARGSUSED */
142int
143dup2(p, uap)
144	struct proc *p;
145	struct dup2_args *uap;
146{
147	register struct filedesc *fdp = p->p_fd;
148	register u_int old = uap->from, new = uap->to;
149	int i, error;
150
151	if (old >= fdp->fd_nfiles ||
152	    fdp->fd_ofiles[old] == NULL ||
153	    new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
154	    new >= maxfilesperproc)
155		return (EBADF);
156	if (old == new) {
157		p->p_retval[0] = new;
158		return (0);
159	}
160	if (new >= fdp->fd_nfiles) {
161		if ((error = fdalloc(p, new, &i)))
162			return (error);
163		if (new != i)
164			panic("dup2: fdalloc");
165	} else if (fdp->fd_ofiles[new]) {
166		if (fdp->fd_ofileflags[new] & UF_MAPPED)
167			(void) munmapfd(p, new);
168		/*
169		 * dup2() must succeed even if the close has an error.
170		 */
171		(void) closef(fdp->fd_ofiles[new], p);
172	}
173	return (finishdup(fdp, (int)old, (int)new, p->p_retval));
174}
175
176/*
177 * Duplicate a file descriptor.
178 */
179#ifndef _SYS_SYSPROTO_H_
180struct dup_args {
181	u_int	fd;
182};
183#endif
184/* ARGSUSED */
185int
186dup(p, uap)
187	struct proc *p;
188	struct dup_args *uap;
189{
190	register struct filedesc *fdp;
191	u_int old;
192	int new, error;
193
194	old = uap->fd;
195
196#if 0
197	/*
198	 * XXX Compatibility
199	 */
200	if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, p->p_retval)); }
201#endif
202
203	fdp = p->p_fd;
204	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL)
205		return (EBADF);
206	if ((error = fdalloc(p, 0, &new)))
207		return (error);
208	return (finishdup(fdp, (int)old, new, p->p_retval));
209}
210
211/*
212 * The file control system call.
213 */
214#ifndef _SYS_SYSPROTO_H_
215struct fcntl_args {
216	int	fd;
217	int	cmd;
218	long	arg;
219};
220#endif
221/* ARGSUSED */
222int
223fcntl(p, uap)
224	struct proc *p;
225	register struct fcntl_args *uap;
226{
227	register struct filedesc *fdp = p->p_fd;
228	register struct file *fp;
229	register char *pop;
230	struct vnode *vp;
231	int i, tmp, error, flg = F_POSIX;
232	struct flock fl;
233	u_int newmin;
234
235	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
236	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
237		return (EBADF);
238	pop = &fdp->fd_ofileflags[uap->fd];
239	switch (uap->cmd) {
240
241	case F_DUPFD:
242		newmin = uap->arg;
243		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
244		    newmin >= maxfilesperproc)
245			return (EINVAL);
246		if ((error = fdalloc(p, newmin, &i)))
247			return (error);
248		return (finishdup(fdp, uap->fd, i, p->p_retval));
249
250	case F_GETFD:
251		p->p_retval[0] = *pop & 1;
252		return (0);
253
254	case F_SETFD:
255		*pop = (*pop &~ 1) | (uap->arg & 1);
256		return (0);
257
258	case F_GETFL:
259		p->p_retval[0] = OFLAGS(fp->f_flag);
260		return (0);
261
262	case F_SETFL:
263		fp->f_flag &= ~FCNTLFLAGS;
264		fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS;
265		tmp = fp->f_flag & FNONBLOCK;
266		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
267		if (error)
268			return (error);
269		tmp = fp->f_flag & FASYNC;
270		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
271		if (!error)
272			return (0);
273		fp->f_flag &= ~FNONBLOCK;
274		tmp = 0;
275		(void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
276		return (error);
277
278	case F_GETOWN:
279		error = (*fp->f_ops->fo_ioctl)
280			(fp, FIOGETOWN, (caddr_t)p->p_retval, p);
281		return (error);
282
283	case F_SETOWN:
284		return ((*fp->f_ops->fo_ioctl)
285			(fp, FIOSETOWN, (caddr_t)&uap->arg, p));
286
287	case F_SETLKW:
288		flg |= F_WAIT;
289		/* Fall into F_SETLK */
290
291	case F_SETLK:
292		if (fp->f_type != DTYPE_VNODE)
293			return (EBADF);
294		vp = (struct vnode *)fp->f_data;
295		/* Copy in the lock structure */
296		error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
297		    sizeof(fl));
298		if (error)
299			return (error);
300		if (fl.l_whence == SEEK_CUR)
301			fl.l_start += fp->f_offset;
302		switch (fl.l_type) {
303
304		case F_RDLCK:
305			if ((fp->f_flag & FREAD) == 0)
306				return (EBADF);
307			p->p_flag |= P_ADVLOCK;
308			return (VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, &fl, flg));
309
310		case F_WRLCK:
311			if ((fp->f_flag & FWRITE) == 0)
312				return (EBADF);
313			p->p_flag |= P_ADVLOCK;
314			return (VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, &fl, flg));
315
316		case F_UNLCK:
317			return (VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &fl,
318				F_POSIX));
319
320		default:
321			return (EINVAL);
322		}
323
324	case F_GETLK:
325		if (fp->f_type != DTYPE_VNODE)
326			return (EBADF);
327		vp = (struct vnode *)fp->f_data;
328		/* Copy in the lock structure */
329		error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl,
330		    sizeof(fl));
331		if (error)
332			return (error);
333		if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK &&
334		    fl.l_type != F_UNLCK)
335			return (EINVAL);
336		if (fl.l_whence == SEEK_CUR)
337			fl.l_start += fp->f_offset;
338		if ((error = VOP_ADVLOCK(vp,(caddr_t)p->p_leader,F_GETLK,&fl,F_POSIX)))
339			return (error);
340		return (copyout((caddr_t)&fl, (caddr_t)(intptr_t)uap->arg,
341		    sizeof(fl)));
342
343	default:
344		return (EINVAL);
345	}
346	/* NOTREACHED */
347}
348
349/*
350 * Common code for dup, dup2, and fcntl(F_DUPFD).
351 */
352static int
353finishdup(fdp, old, new, retval)
354	register struct filedesc *fdp;
355	register int old, new;
356	register_t *retval;
357{
358	register struct file *fp;
359
360	fp = fdp->fd_ofiles[old];
361	fdp->fd_ofiles[new] = fp;
362	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
363	fp->f_count++;
364	if (new > fdp->fd_lastfile)
365		fdp->fd_lastfile = new;
366	*retval = new;
367	return (0);
368}
369
370/*
371 * If sigio is on the list associated with a process or process group,
372 * disable signalling from the device, remove sigio from the list and
373 * free sigio.
374 */
375void
376funsetown(sigio)
377	struct sigio *sigio;
378{
379	int s;
380
381	if (sigio == NULL)
382		return;
383	s = splhigh();
384	*(sigio->sio_myref) = NULL;
385	splx(s);
386	if (sigio->sio_pgid < 0) {
387		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
388			     sigio, sio_pgsigio);
389	} else /* if ((*sigiop)->sio_pgid > 0) */ {
390		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
391			     sigio, sio_pgsigio);
392	}
393	crfree(sigio->sio_ucred);
394	FREE(sigio, M_SIGIO);
395}
396
397/* Free a list of sigio structures. */
398void
399funsetownlst(sigiolst)
400	struct sigiolst *sigiolst;
401{
402	struct sigio *sigio;
403
404	while ((sigio = sigiolst->slh_first) != NULL)
405		funsetown(sigio);
406}
407
408/*
409 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
410 *
411 * After permission checking, add a sigio structure to the sigio list for
412 * the process or process group.
413 */
414int
415fsetown(pgid, sigiop)
416	pid_t pgid;
417	struct sigio **sigiop;
418{
419	struct proc *proc;
420	struct pgrp *pgrp;
421	struct sigio *sigio;
422	int s;
423
424	if (pgid == 0) {
425		funsetown(*sigiop);
426		return (0);
427	}
428	if (pgid > 0) {
429		proc = pfind(pgid);
430		if (proc == NULL)
431			return (ESRCH);
432		/*
433		 * Policy - Don't allow a process to FSETOWN a process
434		 * in another session.
435		 *
436		 * Remove this test to allow maximum flexibility or
437		 * restrict FSETOWN to the current process or process
438		 * group for maximum safety.
439		 */
440		else if (proc->p_session != curproc->p_session)
441			return (EPERM);
442		pgrp = NULL;
443	} else /* if (pgid < 0) */ {
444		pgrp = pgfind(-pgid);
445		if (pgrp == NULL)
446			return (ESRCH);
447		/*
448		 * Policy - Don't allow a process to FSETOWN a process
449		 * in another session.
450		 *
451		 * Remove this test to allow maximum flexibility or
452		 * restrict FSETOWN to the current process or process
453		 * group for maximum safety.
454		 */
455		else if (pgrp->pg_session != curproc->p_session)
456			return (EPERM);
457		proc = NULL;
458	}
459	funsetown(*sigiop);
460	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO,
461	       M_WAITOK);
462	if (pgid > 0) {
463		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
464		sigio->sio_proc = proc;
465	} else {
466		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
467		sigio->sio_pgrp = pgrp;
468	}
469	sigio->sio_pgid = pgid;
470	crhold(curproc->p_ucred);
471	sigio->sio_ucred = curproc->p_ucred;
472	/* It would be convenient if p_ruid was in ucred. */
473	sigio->sio_ruid = curproc->p_cred->p_ruid;
474	sigio->sio_myref = sigiop;
475	s = splhigh();
476	*sigiop = sigio;
477	splx(s);
478	return (0);
479}
480
481/*
482 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
483 */
484pid_t
485fgetown(sigio)
486	struct sigio *sigio;
487{
488	return (sigio != NULL ? sigio->sio_pgid : 0);
489}
490
491/*
492 * Close a file descriptor.
493 */
494#ifndef _SYS_SYSPROTO_H_
495struct close_args {
496        int     fd;
497};
498#endif
499/* ARGSUSED */
500int
501close(p, uap)
502	struct proc *p;
503	struct close_args *uap;
504{
505	register struct filedesc *fdp = p->p_fd;
506	register struct file *fp;
507	register int fd = uap->fd;
508	register u_char *pf;
509
510	if ((unsigned)fd >= fdp->fd_nfiles ||
511	    (fp = fdp->fd_ofiles[fd]) == NULL)
512		return (EBADF);
513	pf = (u_char *)&fdp->fd_ofileflags[fd];
514	if (*pf & UF_MAPPED)
515		(void) munmapfd(p, fd);
516	fdp->fd_ofiles[fd] = NULL;
517	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
518		fdp->fd_lastfile--;
519	if (fd < fdp->fd_freefile)
520		fdp->fd_freefile = fd;
521	*pf = 0;
522	return (closef(fp, p));
523}
524
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Return status information about a file descriptor.
 *
 * Compatibility variant: the status is gathered into a struct stat,
 * converted to the old struct ostat layout with cvtstat() and copied out
 * to uap->sb.  Returns EBADF for a descriptor that is not open, or the
 * error from the stat routine or from copyout().
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(p, uap)
	struct proc *p;
	register struct ofstat_args *uap;
{
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	struct stat ub;
	struct ostat oub;
	int error;

	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
		return (EBADF);
	switch (fp->f_type) {

	case DTYPE_FIFO:
	case DTYPE_VNODE:
		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
		break;

	case DTYPE_SOCKET:
		error = soo_stat((struct socket *)fp->f_data, &ub);
		break;

	case DTYPE_PIPE:
		error = pipe_stat((struct pipe *)fp->f_data, &ub);
		break;

	default:
		panic("ofstat");
		/*NOTREACHED*/
	}
	/*
	 * Convert only when the stat succeeded: on failure `ub' has not
	 * been filled in and must not be read.
	 */
	if (error == 0) {
		cvtstat(&ub, &oub);
		error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub));
	}
	return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
575
576/*
577 * Return status information about a file descriptor.
578 */
579#ifndef _SYS_SYSPROTO_H_
580struct fstat_args {
581	int	fd;
582	struct	stat *sb;
583};
584#endif
585/* ARGSUSED */
586int
587fstat(p, uap)
588	struct proc *p;
589	register struct fstat_args *uap;
590{
591	register struct filedesc *fdp = p->p_fd;
592	register struct file *fp;
593	struct stat ub;
594	int error;
595
596	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
597	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
598		return (EBADF);
599	switch (fp->f_type) {
600
601	case DTYPE_FIFO:
602	case DTYPE_VNODE:
603		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
604		break;
605
606	case DTYPE_SOCKET:
607		error = soo_stat((struct socket *)fp->f_data, &ub);
608		break;
609
610	case DTYPE_PIPE:
611		error = pipe_stat((struct pipe *)fp->f_data, &ub);
612		break;
613
614	default:
615		panic("fstat");
616		/*NOTREACHED*/
617	}
618	if (error == 0)
619		error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub));
620	return (error);
621}
622
623/*
624 * Return status information about a file descriptor.
625 */
626#ifndef _SYS_SYSPROTO_H_
627struct nfstat_args {
628	int	fd;
629	struct	nstat *sb;
630};
631#endif
632/* ARGSUSED */
633int
634nfstat(p, uap)
635	struct proc *p;
636	register struct nfstat_args *uap;
637{
638	register struct filedesc *fdp = p->p_fd;
639	register struct file *fp;
640	struct stat ub;
641	struct nstat nub;
642	int error;
643
644	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
645	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
646		return (EBADF);
647	switch (fp->f_type) {
648
649	case DTYPE_FIFO:
650	case DTYPE_VNODE:
651		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
652		break;
653
654	case DTYPE_SOCKET:
655		error = soo_stat((struct socket *)fp->f_data, &ub);
656		break;
657
658	case DTYPE_PIPE:
659		error = pipe_stat((struct pipe *)fp->f_data, &ub);
660		break;
661
662	default:
663		panic("fstat");
664		/*NOTREACHED*/
665	}
666	if (error == 0) {
667		cvtnstat(&ub, &nub);
668		error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub));
669	}
670	return (error);
671}
672
673/*
674 * Return pathconf information about a file descriptor.
675 */
676#ifndef _SYS_SYSPROTO_H_
677struct fpathconf_args {
678	int	fd;
679	int	name;
680};
681#endif
682/* ARGSUSED */
683int
684fpathconf(p, uap)
685	struct proc *p;
686	register struct fpathconf_args *uap;
687{
688	struct filedesc *fdp = p->p_fd;
689	struct file *fp;
690	struct vnode *vp;
691
692	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
693	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
694		return (EBADF);
695	switch (fp->f_type) {
696
697	case DTYPE_PIPE:
698	case DTYPE_SOCKET:
699		if (uap->name != _PC_PIPE_BUF)
700			return (EINVAL);
701		p->p_retval[0] = PIPE_BUF;
702		return (0);
703
704	case DTYPE_FIFO:
705	case DTYPE_VNODE:
706		vp = (struct vnode *)fp->f_data;
707		return (VOP_PATHCONF(vp, uap->name, p->p_retval));
708
709	default:
710		panic("fpathconf");
711	}
712	/*NOTREACHED*/
713}
714
715/*
716 * Allocate a file descriptor for the process.
717 */
718static int fdexpand;
719SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
720
721int
722fdalloc(p, want, result)
723	struct proc *p;
724	int want;
725	int *result;
726{
727	register struct filedesc *fdp = p->p_fd;
728	register int i;
729	int lim, last, nfiles;
730	struct file **newofile;
731	char *newofileflags;
732
733	/*
734	 * Search for a free descriptor starting at the higher
735	 * of want or fd_freefile.  If that fails, consider
736	 * expanding the ofile array.
737	 */
738	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
739	for (;;) {
740		last = min(fdp->fd_nfiles, lim);
741		if ((i = want) < fdp->fd_freefile)
742			i = fdp->fd_freefile;
743		for (; i < last; i++) {
744			if (fdp->fd_ofiles[i] == NULL) {
745				fdp->fd_ofileflags[i] = 0;
746				if (i > fdp->fd_lastfile)
747					fdp->fd_lastfile = i;
748				if (want <= fdp->fd_freefile)
749					fdp->fd_freefile = i;
750				*result = i;
751				return (0);
752			}
753		}
754
755		/*
756		 * No space in current array.  Expand?
757		 */
758		if (fdp->fd_nfiles >= lim)
759			return (EMFILE);
760		if (fdp->fd_nfiles < NDEXTENT)
761			nfiles = NDEXTENT;
762		else
763			nfiles = 2 * fdp->fd_nfiles;
764		MALLOC(newofile, struct file **, nfiles * OFILESIZE,
765		    M_FILEDESC, M_WAITOK);
766		newofileflags = (char *) &newofile[nfiles];
767		/*
768		 * Copy the existing ofile and ofileflags arrays
769		 * and zero the new portion of each array.
770		 */
771		bcopy(fdp->fd_ofiles, newofile,
772			(i = sizeof(struct file *) * fdp->fd_nfiles));
773		bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i);
774		bcopy(fdp->fd_ofileflags, newofileflags,
775			(i = sizeof(char) * fdp->fd_nfiles));
776		bzero(newofileflags + i, nfiles * sizeof(char) - i);
777		if (fdp->fd_nfiles > NDFILE)
778			FREE(fdp->fd_ofiles, M_FILEDESC);
779		fdp->fd_ofiles = newofile;
780		fdp->fd_ofileflags = newofileflags;
781		fdp->fd_nfiles = nfiles;
782		fdexpand++;
783	}
784	return (0);
785}
786
787/*
788 * Check to see whether n user file descriptors
789 * are available to the process p.
790 */
791int
792fdavail(p, n)
793	struct proc *p;
794	register int n;
795{
796	register struct filedesc *fdp = p->p_fd;
797	register struct file **fpp;
798	register int i, lim, last;
799
800	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
801	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
802		return (1);
803
804	last = min(fdp->fd_nfiles, lim);
805	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
806	for (i = last - fdp->fd_freefile; --i >= 0; fpp++)
807		if (*fpp == NULL && --n <= 0)
808			return (1);
809	return (0);
810}
811
812/*
813 * Create a new open file structure and allocate
814 * a file decriptor for the process that refers to it.
815 */
816int
817falloc(p, resultfp, resultfd)
818	register struct proc *p;
819	struct file **resultfp;
820	int *resultfd;
821{
822	register struct file *fp, *fq;
823	int error, i;
824
825	if ((error = fdalloc(p, 0, &i)))
826		return (error);
827	if (nfiles >= maxfiles) {
828		tablefull("file");
829		return (ENFILE);
830	}
831	/*
832	 * Allocate a new file descriptor.
833	 * If the process has file descriptor zero open, add to the list
834	 * of open files at that point, otherwise put it at the front of
835	 * the list of open files.
836	 */
837	nfiles++;
838	MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK);
839	bzero(fp, sizeof(struct file));
840	fp->f_count = 1;
841	fp->f_cred = p->p_ucred;
842	fp->f_ops = &badfileops;
843	fp->f_seqcount = 1;
844	crhold(fp->f_cred);
845	if ((fq = p->p_fd->fd_ofiles[0])) {
846		LIST_INSERT_AFTER(fq, fp, f_list);
847	} else {
848		LIST_INSERT_HEAD(&filehead, fp, f_list);
849	}
850	p->p_fd->fd_ofiles[i] = fp;
851	if (resultfp)
852		*resultfp = fp;
853	if (resultfd)
854		*resultfd = i;
855	return (0);
856}
857
858/*
859 * Free a file descriptor.
860 */
861void
862ffree(fp)
863	register struct file *fp;
864{
865	LIST_REMOVE(fp, f_list);
866	crfree(fp->f_cred);
867#if defined(DIAGNOSTIC) || defined(INVARIANTS)
868	fp->f_count = 0;
869#endif
870	nfiles--;
871	FREE(fp, M_FILE);
872}
873
874/*
875 * Build a new filedesc structure.
876 */
877struct filedesc *
878fdinit(p)
879	struct proc *p;
880{
881	register struct filedesc0 *newfdp;
882	register struct filedesc *fdp = p->p_fd;
883
884	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
885	    M_FILEDESC, M_WAITOK);
886	bzero(newfdp, sizeof(struct filedesc0));
887	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
888	VREF(newfdp->fd_fd.fd_cdir);
889	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
890	VREF(newfdp->fd_fd.fd_rdir);
891
892	/* Create the file descriptor table. */
893	newfdp->fd_fd.fd_refcnt = 1;
894	newfdp->fd_fd.fd_cmask = cmask;
895	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
896	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
897	newfdp->fd_fd.fd_nfiles = NDFILE;
898
899	newfdp->fd_fd.fd_freefile = 0;
900	newfdp->fd_fd.fd_lastfile = 0;
901
902	return (&newfdp->fd_fd);
903}
904
905/*
906 * Share a filedesc structure.
907 */
908struct filedesc *
909fdshare(p)
910	struct proc *p;
911{
912	p->p_fd->fd_refcnt++;
913	return (p->p_fd);
914}
915
916/*
917 * Copy a filedesc structure.
918 */
919struct filedesc *
920fdcopy(p)
921	struct proc *p;
922{
923	register struct filedesc *newfdp, *fdp = p->p_fd;
924	register struct file **fpp;
925	register int i;
926
927/*
928 * Certain daemons might not have file descriptors
929 */
930	if (fdp == NULL)
931		return NULL;
932
933	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
934	    M_FILEDESC, M_WAITOK);
935	bcopy(fdp, newfdp, sizeof(struct filedesc));
936	VREF(newfdp->fd_cdir);
937	VREF(newfdp->fd_rdir);
938	newfdp->fd_refcnt = 1;
939
940	/*
941	 * If the number of open files fits in the internal arrays
942	 * of the open file structure, use them, otherwise allocate
943	 * additional memory for the number of descriptors currently
944	 * in use.
945	 */
946	if (newfdp->fd_lastfile < NDFILE) {
947		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
948		newfdp->fd_ofileflags =
949		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
950		i = NDFILE;
951	} else {
952		/*
953		 * Compute the smallest multiple of NDEXTENT needed
954		 * for the file descriptors currently in use,
955		 * allowing the table to shrink.
956		 */
957		i = newfdp->fd_nfiles;
958		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
959			i /= 2;
960		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
961		    M_FILEDESC, M_WAITOK);
962		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
963	}
964	newfdp->fd_nfiles = i;
965	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
966	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
967	fpp = newfdp->fd_ofiles;
968	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++)
969		if (*fpp != NULL)
970			(*fpp)->f_count++;
971	return (newfdp);
972}
973
974/*
975 * Release a filedesc structure.
976 */
977void
978fdfree(p)
979	struct proc *p;
980{
981	register struct filedesc *fdp = p->p_fd;
982	struct file **fpp;
983	register int i;
984
985/*
986 * Certain daemons might not have file descriptors
987 */
988	if (fdp == NULL)
989		return;
990
991	if (--fdp->fd_refcnt > 0)
992		return;
993	fpp = fdp->fd_ofiles;
994	for (i = fdp->fd_lastfile; i-- >= 0; fpp++)
995		if (*fpp)
996			(void) closef(*fpp, p);
997	if (fdp->fd_nfiles > NDFILE)
998		FREE(fdp->fd_ofiles, M_FILEDESC);
999	vrele(fdp->fd_cdir);
1000	vrele(fdp->fd_rdir);
1001	FREE(fdp, M_FILEDESC);
1002}
1003
1004/*
1005 * Close any files on exec?
1006 */
1007void
1008fdcloseexec(p)
1009	struct proc *p;
1010{
1011	struct filedesc *fdp = p->p_fd;
1012	struct file **fpp;
1013	char *fdfp;
1014	register int i;
1015
1016/*
1017 * Certain daemons might not have file descriptors
1018 */
1019	if (fdp == NULL)
1020		return;
1021
1022	fpp = fdp->fd_ofiles;
1023	fdfp = fdp->fd_ofileflags;
1024	for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++)
1025		if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) {
1026			if (*fdfp & UF_MAPPED)
1027				(void) munmapfd(p, i);
1028			(void) closef(*fpp, p);
1029			*fpp = NULL;
1030			*fdfp = 0;
1031			if (i < fdp->fd_freefile)
1032				fdp->fd_freefile = i;
1033		}
1034	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1035		fdp->fd_lastfile--;
1036}
1037
1038/*
1039 * Internal form of close.
1040 * Decrement reference count on file structure.
1041 * Note: p may be NULL when closing a file
1042 * that was being passed in a message.
1043 */
1044int
1045closef(fp, p)
1046	register struct file *fp;
1047	register struct proc *p;
1048{
1049	struct vnode *vp;
1050	struct flock lf;
1051	int error;
1052
1053	if (fp == NULL)
1054		return (0);
1055	/*
1056	 * POSIX record locking dictates that any close releases ALL
1057	 * locks owned by this process.  This is handled by setting
1058	 * a flag in the unlock to free ONLY locks obeying POSIX
1059	 * semantics, and not to free BSD-style file locks.
1060	 * If the descriptor was in a message, POSIX-style locks
1061	 * aren't passed with the descriptor.
1062	 */
1063	if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) {
1064		lf.l_whence = SEEK_SET;
1065		lf.l_start = 0;
1066		lf.l_len = 0;
1067		lf.l_type = F_UNLCK;
1068		vp = (struct vnode *)fp->f_data;
1069		(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX);
1070	}
1071	if (--fp->f_count > 0)
1072		return (0);
1073	if (fp->f_count < 0)
1074		panic("closef: count < 0");
1075	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
1076		lf.l_whence = SEEK_SET;
1077		lf.l_start = 0;
1078		lf.l_len = 0;
1079		lf.l_type = F_UNLCK;
1080		vp = (struct vnode *)fp->f_data;
1081		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
1082	}
1083	if (fp->f_ops != &badfileops)
1084		error = (*fp->f_ops->fo_close)(fp, p);
1085	else
1086		error = 0;
1087	ffree(fp);
1088	return (error);
1089}
1090
1091/*
1092 * Apply an advisory lock on a file descriptor.
1093 *
1094 * Just attempt to get a record lock of the requested type on
1095 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
1096 */
1097#ifndef _SYS_SYSPROTO_H_
1098struct flock_args {
1099	int	fd;
1100	int	how;
1101};
1102#endif
1103/* ARGSUSED */
1104int
1105flock(p, uap)
1106	struct proc *p;
1107	register struct flock_args *uap;
1108{
1109	register struct filedesc *fdp = p->p_fd;
1110	register struct file *fp;
1111	struct vnode *vp;
1112	struct flock lf;
1113
1114	if ((unsigned)uap->fd >= fdp->fd_nfiles ||
1115	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1116		return (EBADF);
1117	if (fp->f_type != DTYPE_VNODE)
1118		return (EOPNOTSUPP);
1119	vp = (struct vnode *)fp->f_data;
1120	lf.l_whence = SEEK_SET;
1121	lf.l_start = 0;
1122	lf.l_len = 0;
1123	if (uap->how & LOCK_UN) {
1124		lf.l_type = F_UNLCK;
1125		fp->f_flag &= ~FHASLOCK;
1126		return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK));
1127	}
1128	if (uap->how & LOCK_EX)
1129		lf.l_type = F_WRLCK;
1130	else if (uap->how & LOCK_SH)
1131		lf.l_type = F_RDLCK;
1132	else
1133		return (EBADF);
1134	fp->f_flag |= FHASLOCK;
1135	if (uap->how & LOCK_NB)
1136		return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK));
1137	return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT));
1138}
1139
1140/*
1141 * File Descriptor pseudo-device driver (/dev/fd/).
1142 *
1143 * Opening minor device N dup()s the file (if any) connected to file
1144 * descriptor N belonging to the calling process.  Note that this driver
1145 * consists of only the ``open()'' routine, because all subsequent
1146 * references to this file will be direct to the other driver.
1147 */
1148/* ARGSUSED */
1149static int
1150fdopen(dev, mode, type, p)
1151	dev_t dev;
1152	int mode, type;
1153	struct proc *p;
1154{
1155
1156	/*
1157	 * XXX Kludge: set curproc->p_dupfd to contain the value of the
1158	 * the file descriptor being sought for duplication. The error
1159	 * return ensures that the vnode for this device will be released
1160	 * by vn_open. Open will detect this special error and take the
1161	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1162	 * will simply report the error.
1163	 */
1164	p->p_dupfd = minor(dev);
1165	return (ENODEV);
1166}
1167
1168/*
1169 * Duplicate the specified descriptor to a free descriptor.
1170 */
1171int
1172dupfdopen(fdp, indx, dfd, mode, error)
1173	register struct filedesc *fdp;
1174	register int indx, dfd;
1175	int mode;
1176	int error;
1177{
1178	register struct file *wfp;
1179	struct file *fp;
1180
1181	/*
1182	 * If the to-be-dup'd fd number is greater than the allowed number
1183	 * of file descriptors, or the fd to be dup'd has already been
1184	 * closed, reject.  Note, check for new == old is necessary as
1185	 * falloc could allocate an already closed to-be-dup'd descriptor
1186	 * as the new descriptor.
1187	 */
1188	fp = fdp->fd_ofiles[indx];
1189	if ((u_int)dfd >= fdp->fd_nfiles ||
1190	    (wfp = fdp->fd_ofiles[dfd]) == NULL || fp == wfp)
1191		return (EBADF);
1192
1193	/*
1194	 * There are two cases of interest here.
1195	 *
1196	 * For ENODEV simply dup (dfd) to file descriptor
1197	 * (indx) and return.
1198	 *
1199	 * For ENXIO steal away the file structure from (dfd) and
1200	 * store it in (indx).  (dfd) is effectively closed by
1201	 * this operation.
1202	 *
1203	 * Any other error code is just returned.
1204	 */
1205	switch (error) {
1206	case ENODEV:
1207		/*
1208		 * Check that the mode the file is being opened for is a
1209		 * subset of the mode of the existing descriptor.
1210		 */
1211		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag)
1212			return (EACCES);
1213		fdp->fd_ofiles[indx] = wfp;
1214		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
1215		wfp->f_count++;
1216		if (indx > fdp->fd_lastfile)
1217			fdp->fd_lastfile = indx;
1218		return (0);
1219
1220	case ENXIO:
1221		/*
1222		 * Steal away the file pointer from dfd, and stuff it into indx.
1223		 */
1224		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
1225		fdp->fd_ofiles[dfd] = NULL;
1226		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
1227		fdp->fd_ofileflags[dfd] = 0;
1228		/*
1229		 * Complete the clean up of the filedesc structure by
1230		 * recomputing the various hints.
1231		 */
1232		if (indx > fdp->fd_lastfile)
1233			fdp->fd_lastfile = indx;
1234		else
1235			while (fdp->fd_lastfile > 0 &&
1236			       fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1237				fdp->fd_lastfile--;
1238			if (dfd < fdp->fd_freefile)
1239				fdp->fd_freefile = dfd;
1240		return (0);
1241
1242	default:
1243		return (error);
1244	}
1245	/* NOTREACHED */
1246}
1247
1248/*
1249 * Get file structures.
1250 */
1251static int
1252sysctl_kern_file SYSCTL_HANDLER_ARGS
1253{
1254	int error;
1255	struct file *fp;
1256
1257	if (!req->oldptr) {
1258		/*
1259		 * overestimate by 10 files
1260		 */
1261		return (SYSCTL_OUT(req, 0, sizeof(filehead) +
1262				(nfiles + 10) * sizeof(struct file)));
1263	}
1264
1265	error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead));
1266	if (error)
1267		return (error);
1268
1269	/*
1270	 * followed by an array of file structures
1271	 */
1272	for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) {
1273		error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file));
1274		if (error)
1275			return (error);
1276	}
1277	return (0);
1278}
1279
/* Read-only opaque node whose contents are produced by sysctl_kern_file(). */
1280SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
1281    0, 0, sysctl_kern_file, "S,file", "Entire file table");
1282
/* Read/write knob for maxfilesperproc, the per-process cap used by fdalloc(). */
1283SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
1284    &maxfilesperproc, 0, "Maximum files allowed open per process");
1285
/* Read/write knob for maxfiles, the global cap that falloc() checks nfiles against. */
1286SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
1287    &maxfiles, 0, "Maximum number of files");
1288
1289static void
1290fildesc_drvinit(void *unused)
1291{
1292	int fd;
1293
1294	cdevsw_add(&fildesc_cdevsw);
1295	for (fd = 0; fd < NUMFDESC; fd++)
1296		make_dev(&fildesc_cdevsw, fd,
1297		    UID_BIN, GID_BIN, 0666, "fd/%d", fd);
1298	make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "stdin");
1299	make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "stdout");
1300	make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "stderr");
1301}
1302
/*
 * Operations vector installed by falloc() on every freshly allocated
 * file.  The handlers are listed positionally; badfo_readwrite appears
 * twice, presumably for the read and the write slot (confirm against
 * the struct fileops definition).
 */
1303struct fileops badfileops = {
1304	badfo_readwrite,
1305	badfo_readwrite,
1306	badfo_ioctl,
1307	badfo_poll,
1308	badfo_close
1309};
1310
/*
 * Read/write handler of badfileops: performs no I/O and always fails
 * with EBADF.  All arguments are ignored.
 */
static int
badfo_readwrite(fp, uio, cred, flags)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
{
	int rc = EBADF;

	return (rc);
}
1321
1322static int
1323badfo_ioctl(fp, com, data, p)
1324	struct file *fp;
1325	u_long com;
1326	caddr_t data;
1327	struct proc *p;
1328{
1329
1330	return (EBADF);
1331}
1332
/*
 * poll handler of badfileops: always returns 0, i.e. none of the
 * requested events.  All arguments are ignored.
 */
static int
badfo_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{
	int revents = 0;

	return (revents);
}
1343
/*
 * close handler of badfileops: always fails with EBADF.  closef() does
 * not call it, because it skips the close hook for files that still
 * carry badfileops.
 */
static int
badfo_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	int rc = EBADF;

	return (rc);
}
1352
/*
 * Hook fildesc_drvinit() into system initialisation under the
 * SI_SUB_DRIVERS subsystem, ordered at SI_ORDER_MIDDLE + CDEV_MAJOR.
 */
1353SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
1354					fildesc_drvinit,NULL)
1355
1356
1357