kern_descrip.c revision 117494
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 117494 2003-07-13 01:22:21Z truckman $");
43
44#include "opt_compat.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/syscallsubr.h>
49#include <sys/sysproto.h>
50#include <sys/conf.h>
51#include <sys/filedesc.h>
52#include <sys/lock.h>
53#include <sys/kernel.h>
54#include <sys/limits.h>
55#include <sys/malloc.h>
56#include <sys/mutex.h>
57#include <sys/sysctl.h>
58#include <sys/vnode.h>
59#include <sys/mount.h>
60#include <sys/proc.h>
61#include <sys/namei.h>
62#include <sys/file.h>
63#include <sys/stat.h>
64#include <sys/filio.h>
65#include <sys/fcntl.h>
66#include <sys/unistd.h>
67#include <sys/resourcevar.h>
68#include <sys/event.h>
69#include <sys/sx.h>
70#include <sys/socketvar.h>
71#include <sys/signalvar.h>
72
73#include <vm/vm.h>
74#include <vm/vm_extern.h>
75#include <vm/uma.h>
76
static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
		     "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

/* UMA zone from which all struct file allocations are made (see falloc()). */
static uma_zone_t file_zone;

static	 d_open_t  fdopen;
/* NOTE(review): presumably the number of /dev/fd/NN nodes -- confirm at use site. */
#define	NUMFDESC 64

#define	CDEV_MAJOR 22
/* Character device switch for the file descriptor pseudo-device. */
static struct cdevsw fildesc_cdevsw = {
	.d_open =	fdopen,
	.d_name =	"FD",
	.d_maj =	CDEV_MAJOR,
};

/* How to treat 'new' parameter when allocating a fd for do_dup(). */
enum dup_type { DUP_VARIABLE, DUP_FIXED };

static int do_dup(struct thread *td, enum dup_type type, int old, int new,
    register_t *retval);

/*
 * Descriptor management.
 */
struct filelist filehead;	/* head of list of open files */
int nfiles;			/* actual number of open files */
extern int cmask;		/* default file creation mask; defined elsewhere */
struct sx filelist_lock;	/* sx to protect filelist */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
108
109/*
110 * System calls on descriptors.
111 */
112#ifndef _SYS_SYSPROTO_H_
113struct getdtablesize_args {
114	int	dummy;
115};
116#endif
117/*
118 * MPSAFE
119 */
120/* ARGSUSED */
121int
122getdtablesize(td, uap)
123	struct thread *td;
124	struct getdtablesize_args *uap;
125{
126	struct proc *p = td->td_proc;
127
128	mtx_lock(&Giant);
129	td->td_retval[0] =
130	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
131	mtx_unlock(&Giant);
132	return (0);
133}
134
135/*
136 * Duplicate a file descriptor to a particular value.
137 *
138 * note: keep in mind that a potential race condition exists when closing
139 * descriptors from a shared descriptor table (via rfork).
140 */
141#ifndef _SYS_SYSPROTO_H_
142struct dup2_args {
143	u_int	from;
144	u_int	to;
145};
146#endif
147/*
148 * MPSAFE
149 */
150/* ARGSUSED */
151int
152dup2(td, uap)
153	struct thread *td;
154	struct dup2_args *uap;
155{
156
157	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
158		    td->td_retval));
159}
160
161/*
162 * Duplicate a file descriptor.
163 */
164#ifndef _SYS_SYSPROTO_H_
165struct dup_args {
166	u_int	fd;
167};
168#endif
169/*
170 * MPSAFE
171 */
172/* ARGSUSED */
173int
174dup(td, uap)
175	struct thread *td;
176	struct dup_args *uap;
177{
178
179	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
180}
181
182/*
183 * The file control system call.
184 */
185#ifndef _SYS_SYSPROTO_H_
186struct fcntl_args {
187	int	fd;
188	int	cmd;
189	long	arg;
190};
191#endif
192/*
193 * MPSAFE
194 */
195/* ARGSUSED */
196int
197fcntl(td, uap)
198	struct thread *td;
199	struct fcntl_args *uap;
200{
201	struct flock fl;
202	intptr_t arg;
203	int error;
204
205	error = 0;
206	switch (uap->cmd) {
207	case F_GETLK:
208	case F_SETLK:
209	case F_SETLKW:
210		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
211		arg = (intptr_t)&fl;
212		break;
213	default:
214		arg = uap->arg;
215		break;
216	}
217	if (error)
218		return (error);
219	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
220	if (error)
221		return (error);
222	if (uap->cmd == F_GETLK)
223		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
224	return (error);
225}
226
/*
 * Back end for fcntl(2).  'arg' is either the integer argument itself
 * or, for the locking subcommands, a pointer to an in-kernel copy of
 * the caller's struct flock (see fcntl() above).  Runs under Giant;
 * each case is responsible for releasing the filedesc lock.
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp;
	struct proc *p;
	char *pop;
	struct vnode *vp;
	u_int newmin;
	int error, flg, tmp;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	/* Translate fd to its struct file; EBADF if not open. */
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
	/* Per-descriptor flag byte (UF_EXCLOSE lives here). */
	pop = &fdp->fd_ofileflags[fd];

	switch (cmd) {
	case F_DUPFD:
		FILEDESC_UNLOCK(fdp);
		newmin = arg;
		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
		    newmin >= maxfilesperproc) {
			error = EINVAL;
			break;
		}
		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
		break;

	case F_GETFD:
		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		FILEDESC_UNLOCK(fdp);
		break;

	case F_SETFD:
		*pop = (*pop &~ UF_EXCLOSE) |
		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
		FILEDESC_UNLOCK(fdp);
		break;

	case F_GETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		td->td_retval[0] = OFLAGS(fp->f_flag);
		FILE_UNLOCK(fp);
		break;

	case F_SETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/* Hold a reference across the fo_ioctl() calls below. */
		fhold_locked(fp);
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		FILE_UNLOCK(fp);
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error) {
			fdrop(fp, td);
			break;
		}
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		/* FIOASYNC failed: back out the FNONBLOCK change. */
		FILE_LOCK(fp);
		fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLKW:
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}

		flp = (struct flock *)arg;
		if (flp->l_whence == SEEK_CUR) {
			/* Guard l_start += f_offset against off_t overflow. */
			if (fp->f_offset < 0 ||
			    (flp->l_start > 0 &&
			     fp->f_offset > OFF_MAX - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}

		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_vnode;

		switch (flp->l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, F_POSIX);
			break;
		default:
			error = EINVAL;
			break;
		}
		/*
		 * Check for race with close: if the descriptor was closed
		 * (or replaced) while we slept in VOP_ADVLOCK(), undo any
		 * lock we may just have acquired so it is not leaked.
		 */
		FILEDESC_LOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			FILEDESC_UNLOCK(fdp);
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
		} else
			FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		break;

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			FILEDESC_UNLOCK(fdp);
			error = EINVAL;
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			/* Guard l_start += f_offset against overflow in
			   either direction. */
			if ((flp->l_start > 0 &&
			    fp->f_offset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     fp->f_offset < OFF_MIN - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}
		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_vnode;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;
	default:
		FILEDESC_UNLOCK(fdp);
		error = EINVAL;
		break;
	}
done2:
	mtx_unlock(&Giant);
	return (error);
}
447
448/*
449 * Common code for dup, dup2, and fcntl(F_DUPFD).
450 */
static int
do_dup(td, type, old, new, retval)
	enum dup_type type;
	int old, new;
	register_t *retval;
	struct thread *td;
{
	struct filedesc *fdp;
	struct proc *p;
	struct file *fp;
	struct file *delfp;	/* file displaced from the target slot, if any */
	int error, newfd;
	int holdleaders;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to.
	 */
	if (old < 0 || new < 0 || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
	    new >= maxfilesperproc)
		return (EBADF);
	FILEDESC_LOCK(fdp);
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}
	/* dup2(fd, fd) is a no-op: return fd without touching the table. */
	if (type == DUP_FIXED && old == new) {
		*retval = new;
		FILEDESC_UNLOCK(fdp);
		return (0);
	}
	fp = fdp->fd_ofiles[old];
	fhold(fp);

	/*
	 * Expand the table for the new descriptor if needed.  This may
	 * block and drop and reacquire the filedesc lock.
	 */
	if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) {
		error = fdalloc(td, new, &newfd);
		if (error) {
			FILEDESC_UNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
	}
	if (type == DUP_VARIABLE)
		new = newfd;

	/*
	 * If the old file changed out from under us then treat it as a
	 * bad file descriptor.  Userland should do its own locking to
	 * avoid this case.
	 */
	if (fdp->fd_ofiles[old] != fp) {
		/* Release the slot fdalloc() reserved for us. */
		if (fdp->fd_ofiles[new] == NULL) {
			if (new < fdp->fd_freefile)
				fdp->fd_freefile = new;
			while (fdp->fd_lastfile > 0 &&
			    fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
				fdp->fd_lastfile--;
		}
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (EBADF);
	}
	KASSERT(old != new, ("new fd is same as old"));

	/*
	 * Save info on the descriptor being overwritten.  We have
	 * to do the unmap now, but we cannot close it without
	 * introducing an ownership race for the slot.
	 */
	delfp = fdp->fd_ofiles[new];
	if (delfp != NULL && p->p_fdtol != NULL) {
		/*
		 * Ask fdfree() to sleep to ensure that all relevant
		 * process leaders can be traversed in closef().
		 */
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	} else
		holdleaders = 0;
	KASSERT(delfp == NULL || type == DUP_FIXED,
	    ("dup() picked an open file"));
#if 0
	if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED))
		(void) munmapfd(td, new);
#endif

	/*
	 * Duplicate the source descriptor, update lastfile
	 */
	fdp->fd_ofiles[new] = fp;
 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	FILEDESC_UNLOCK(fdp);
	*retval = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 */
	if (delfp) {
		mtx_lock(&Giant);
		(void) closef(delfp, td);
		mtx_unlock(&Giant);
		if (holdleaders) {
			/* Undo the hold taken above; wake fdfree() waiters. */
			FILEDESC_LOCK(fdp);
			fdp->fd_holdleaderscount--;
			if (fdp->fd_holdleaderscount == 0 &&
			    fdp->fd_holdleaderswakeup != 0) {
				fdp->fd_holdleaderswakeup = 0;
				wakeup(&fdp->fd_holdleaderscount);
			}
			FILEDESC_UNLOCK(fdp);
		}
	}
	return (0);
}
576
577/*
578 * If sigio is on the list associated with a process or process group,
579 * disable signalling from the device, remove sigio from the list and
580 * free sigio.
581 */
void
funsetown(sigiop)
	struct sigio **sigiop;
{
	struct sigio *sigio;

	SIGIO_LOCK();
	sigio = *sigiop;
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}
	/* Clear the owner's back-pointer so no one else can reach us. */
	*(sigio->sio_myref) = NULL;
	/* Negative sio_pgid means a process group owner, positive a process. */
	if ((sigio)->sio_pgid < 0) {
		struct pgrp *pg = (sigio)->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		struct proc *p = (sigio)->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	SIGIO_UNLOCK();
	/* Now unreachable; safe to release the credential and free. */
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
}
612
613/*
614 * Free a list of sigio structures.
615 * We only need to lock the SIGIO_LOCK because we have made ourselves
616 * inaccessable to callers of fsetown and therefore do not need to lock
617 * the proc or pgrp struct for the list manipulation.
618 */
void
funsetownlst(sigiolst)
	struct sigiolst *sigiolst;
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio;

	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;
	p = NULL;
	pg = NULL;

	/*
	 * Every entry of the list should belong
	 * to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
	}

	SIGIO_LOCK();
	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
		/* Detach from the owner's pointer before unlinking. */
		*(sigio->sio_myref) = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
			PGRP_LOCK(pg);
			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PGRP_UNLOCK(pg);
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
			PROC_LOCK(p);
			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PROC_UNLOCK(p);
		}
		/* Drop SIGIO_LOCK across crfree()/FREE(), then retake it. */
		SIGIO_UNLOCK();
		crfree(sigio->sio_ucred);
		FREE(sigio, M_SIGIO);
		SIGIO_LOCK();
	}
	SIGIO_UNLOCK();
}
674
675/*
676 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
677 *
678 * After permission checking, add a sigio structure to the sigio list for
679 * the process or process group.
680 */
int
fsetown(pgid, sigiop)
	pid_t pgid;
	struct sigio **sigiop;
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	/* pgid 0 means "clear ownership". */
	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	/* proctree_lock keeps p_session/pg_session stable below. */
	sx_slock(&proctree_lock);
	if (pgid > 0) {
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		PROC_UNLOCK(proc);
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Drop any previous owner before installing the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new sigio through the caller's pointer. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
	return (ret);
}
784
785/*
786 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
787 */
788pid_t
789fgetown(sigiop)
790	struct sigio **sigiop;
791{
792	pid_t pgid;
793
794	SIGIO_LOCK();
795	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
796	SIGIO_UNLOCK();
797	return (pgid);
798}
799
800/*
801 * Close a file descriptor.
802 */
803#ifndef _SYS_SYSPROTO_H_
804struct close_args {
805        int     fd;
806};
807#endif
808/*
809 * MPSAFE
810 */
811/* ARGSUSED */
int
close(td, uap)
	struct thread *td;
	struct close_args *uap;
{
	struct filedesc *fdp;
	struct file *fp;
	int fd, error;
	int holdleaders;

	fd = uap->fd;
	error = 0;
	holdleaders = 0;
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
#if 0
	if (fdp->fd_ofileflags[fd] & UF_MAPPED)
		(void) munmapfd(td, fd);
#endif
	/* Detach the file from the table before doing the actual close. */
	fdp->fd_ofiles[fd] = NULL;
	fdp->fd_ofileflags[fd] = 0;
	if (td->td_proc->p_fdtol != NULL) {
		/*
		 * Ask fdfree() to sleep to ensure that all relevant
		 * process leaders can be traversed in closef().
		 */
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	}

	/*
	 * we now hold the fp reference that used to be owned by the descriptor
	 * array.
	 */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	/* Detach any knotes registered on this descriptor. */
	if (fd < fdp->fd_knlistsize) {
		FILEDESC_UNLOCK(fdp);
		knote_fdclose(td, fd);
	} else
		FILEDESC_UNLOCK(fdp);

	error = closef(fp, td);
done2:
	mtx_unlock(&Giant);
	if (holdleaders) {
		/* Undo the hold taken above; wake fdfree() waiters. */
		FILEDESC_LOCK(fdp);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_UNLOCK(fdp);
	}
	return (error);
}
878
879#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
880/*
881 * Return status information about a file descriptor.
882 */
883#ifndef _SYS_SYSPROTO_H_
884struct ofstat_args {
885	int	fd;
886	struct	ostat *sb;
887};
888#endif
889/*
890 * MPSAFE
891 */
892/* ARGSUSED */
893int
894ofstat(td, uap)
895	struct thread *td;
896	struct ofstat_args *uap;
897{
898	struct file *fp;
899	struct stat ub;
900	struct ostat oub;
901	int error;
902
903	mtx_lock(&Giant);
904	if ((error = fget(td, uap->fd, &fp)) != 0)
905		goto done2;
906	error = fo_stat(fp, &ub, td->td_ucred, td);
907	if (error == 0) {
908		cvtstat(&ub, &oub);
909		error = copyout(&oub, uap->sb, sizeof(oub));
910	}
911	fdrop(fp, td);
912done2:
913	mtx_unlock(&Giant);
914	return (error);
915}
916#endif /* COMPAT_43 || COMPAT_SUNOS */
917
918/*
919 * Return status information about a file descriptor.
920 */
921#ifndef _SYS_SYSPROTO_H_
922struct fstat_args {
923	int	fd;
924	struct	stat *sb;
925};
926#endif
927/*
928 * MPSAFE
929 */
930/* ARGSUSED */
931int
932fstat(td, uap)
933	struct thread *td;
934	struct fstat_args *uap;
935{
936	struct file *fp;
937	struct stat ub;
938	int error;
939
940	mtx_lock(&Giant);
941	if ((error = fget(td, uap->fd, &fp)) != 0)
942		goto done2;
943	error = fo_stat(fp, &ub, td->td_ucred, td);
944	if (error == 0)
945		error = copyout(&ub, uap->sb, sizeof(ub));
946	fdrop(fp, td);
947done2:
948	mtx_unlock(&Giant);
949	return (error);
950}
951
952/*
953 * Return status information about a file descriptor.
954 */
955#ifndef _SYS_SYSPROTO_H_
956struct nfstat_args {
957	int	fd;
958	struct	nstat *sb;
959};
960#endif
961/*
962 * MPSAFE
963 */
964/* ARGSUSED */
965int
966nfstat(td, uap)
967	struct thread *td;
968	struct nfstat_args *uap;
969{
970	struct file *fp;
971	struct stat ub;
972	struct nstat nub;
973	int error;
974
975	mtx_lock(&Giant);
976	if ((error = fget(td, uap->fd, &fp)) != 0)
977		goto done2;
978	error = fo_stat(fp, &ub, td->td_ucred, td);
979	if (error == 0) {
980		cvtnstat(&ub, &nub);
981		error = copyout(&nub, uap->sb, sizeof(nub));
982	}
983	fdrop(fp, td);
984done2:
985	mtx_unlock(&Giant);
986	return (error);
987}
988
989/*
990 * Return pathconf information about a file descriptor.
991 */
992#ifndef _SYS_SYSPROTO_H_
993struct fpathconf_args {
994	int	fd;
995	int	name;
996};
997#endif
998/*
999 * MPSAFE
1000 */
1001/* ARGSUSED */
1002int
1003fpathconf(td, uap)
1004	struct thread *td;
1005	struct fpathconf_args *uap;
1006{
1007	struct file *fp;
1008	struct vnode *vp;
1009	int error;
1010
1011	if ((error = fget(td, uap->fd, &fp)) != 0)
1012		return (error);
1013
1014	/* If asynchronous I/O is available, it works for all descriptors. */
1015	if (uap->name == _PC_ASYNC_IO) {
1016		td->td_retval[0] = async_io_version;
1017		goto out;
1018	}
1019	vp = fp->f_vnode;
1020	if (vp != NULL) {
1021		mtx_lock(&Giant);
1022		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1023		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1024		VOP_UNLOCK(vp, 0, td);
1025		mtx_unlock(&Giant);
1026	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1027		if (uap->name != _PC_PIPE_BUF) {
1028			error = EINVAL;
1029		} else {
1030			td->td_retval[0] = PIPE_BUF;
1031		error = 0;
1032		}
1033	} else {
1034		error = EOPNOTSUPP;
1035	}
1036out:
1037	fdrop(fp, td);
1038	return (error);
1039}
1040
1041/*
1042 * Allocate a file descriptor for the process.
1043 */
/* Count of fd table expansions (bumped in fdalloc()), read-only sysctl. */
static int fdexpand;
SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
1046
int
fdalloc(td, want, result)
	struct thread *td;
	int want;
	int *result;
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	int lim, last, nfiles;
	struct file **newofile, **oldofile;
	char *newofileflags;

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	for (;;) {
		last = min(fdp->fd_nfiles, lim);
		i = max(want, fdp->fd_freefile);
		for (; i < last; i++) {
			if (fdp->fd_ofiles[i] == NULL) {
				/* Found a free slot: claim it. */
				fdp->fd_ofileflags[i] = 0;
				if (i > fdp->fd_lastfile)
					fdp->fd_lastfile = i;
				if (want <= fdp->fd_freefile)
					fdp->fd_freefile = i;
				*result = i;
				return (0);
			}
		}

		/*
		 * No space in current array.  Expand?
		 */
		if (i >= lim)
			return (EMFILE);
		if (fdp->fd_nfiles < NDEXTENT)
			nfiles = NDEXTENT;
		else
			nfiles = 2 * fdp->fd_nfiles;
		while (nfiles < want)
			nfiles <<= 1;
		FILEDESC_UNLOCK(fdp);
		/*
		 * XXX malloc() calls uma_large_malloc() for sizes larger
		 * than KMEM_ZMAX bytes. uma_large_malloc() requires Giant.
		 */
		mtx_lock(&Giant);
		newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);
		mtx_unlock(&Giant);

		/*
		 * Deal with file-table extend race that might have
		 * occurred while filedesc was unlocked.
		 */
		FILEDESC_LOCK(fdp);
		if (fdp->fd_nfiles >= nfiles) {
			/* Someone else grew the table; discard ours. */
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(newofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
			continue;
		}
		/* The flag bytes live directly after the pointer array. */
		newofileflags = (char *) &newofile[nfiles];
		/*
		 * Copy the existing ofile and ofileflags arrays
		 * and zero the new portion of each array.
		 */
		i = fdp->fd_nfiles * sizeof(struct file *);
		bcopy(fdp->fd_ofiles, newofile,	i);
		bzero((char *)newofile + i,
		    nfiles * sizeof(struct file *) - i);
		i = fdp->fd_nfiles * sizeof(char);
		bcopy(fdp->fd_ofileflags, newofileflags, i);
		bzero(newofileflags + i, nfiles * sizeof(char) - i);
		/* Tables of NDFILE or fewer entries are embedded, not
		   malloc'd, so only free larger old tables. */
		if (fdp->fd_nfiles > NDFILE)
			oldofile = fdp->fd_ofiles;
		else
			oldofile = NULL;
		fdp->fd_ofiles = newofile;
		fdp->fd_ofileflags = newofileflags;
		fdp->fd_nfiles = nfiles;
		fdexpand++;
		if (oldofile != NULL) {
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(oldofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
		}
	}
}
1147
1148/*
1149 * Check to see whether n user file descriptors
1150 * are available to the process p.
1151 */
1152int
1153fdavail(td, n)
1154	struct thread *td;
1155	int n;
1156{
1157	struct proc *p = td->td_proc;
1158	struct filedesc *fdp = td->td_proc->p_fd;
1159	struct file **fpp;
1160	int i, lim, last;
1161
1162	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1163
1164	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
1165	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1166		return (1);
1167	last = min(fdp->fd_nfiles, lim);
1168	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
1169	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
1170		if (*fpp == NULL && --n <= 0)
1171			return (1);
1172	}
1173	return (0);
1174}
1175
1176/*
1177 * Create a new open file structure and allocate
1178 * a file decriptor for the process that refers to it.
1179 */
int
falloc(td, resultfp, resultfd)
	struct thread *td;
	struct file **resultfp;
	int *resultfd;
{
	struct proc *p = td->td_proc;
	struct file *fp, *fq;
	int error, i;
	int maxuserfiles = maxfiles - (maxfiles / 20);	/* reserve 5% for root */
	static struct timeval lastfail;
	static int curfail;

	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
	sx_xlock(&filelist_lock);
	if ((nfiles >= maxuserfiles && td->td_ucred->cr_ruid != 0)
	   || nfiles >= maxfiles) {
		/* Rate-limit the complaint to once per second. */
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
				td->td_ucred->cr_ruid);
		}
		sx_xunlock(&filelist_lock);
		uma_zfree(file_zone, fp);
		return (ENFILE);
	}
	nfiles++;

	/*
	 * If the process has file descriptor zero open, add the new file
	 * descriptor to the list of open files at that point, otherwise
	 * put it at the front of the list of open files.
	 */
	fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep);
	fp->f_count = 1;
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;
	FILEDESC_LOCK(p->p_fd);
	if ((fq = p->p_fd->fd_ofiles[0])) {
		LIST_INSERT_AFTER(fq, fp, f_list);
	} else {
		LIST_INSERT_HEAD(&filehead, fp, f_list);
	}
	sx_xunlock(&filelist_lock);
	if ((error = fdalloc(td, 0, &i))) {
		FILEDESC_UNLOCK(p->p_fd);
		/* Drops our reference; ffree() will unlink from filehead. */
		fdrop(fp, td);
		return (error);
	}
	p->p_fd->fd_ofiles[i] = fp;
	FILEDESC_UNLOCK(p->p_fd);
	if (resultfp)
		*resultfp = fp;
	if (resultfd)
		*resultfd = i;
	return (0);
}
1236
1237/*
1238 * Free a file descriptor.
1239 */
void
ffree(fp)
	struct file *fp;
{

	KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!"));
	/* Unlink from the global open-file list under filelist_lock. */
	sx_xlock(&filelist_lock);
	LIST_REMOVE(fp, f_list);
	nfiles--;
	sx_xunlock(&filelist_lock);
	/* Release the credential reference taken in falloc(). */
	crfree(fp->f_cred);
	uma_zfree(file_zone, fp);
}
1253
1254/*
1255 * Build a new filedesc structure from another.
1256 * Copy the current, root, and jail root vnode references.
1257 */
1258struct filedesc *
1259fdinit(fdp)
1260	struct filedesc *fdp;
1261{
1262	struct filedesc0 *newfdp;
1263
1264	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
1265	    M_FILEDESC, M_WAITOK | M_ZERO);
1266	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
1267	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1268	if (newfdp->fd_fd.fd_cdir)
1269		VREF(newfdp->fd_fd.fd_cdir);
1270	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1271	if (newfdp->fd_fd.fd_rdir)
1272		VREF(newfdp->fd_fd.fd_rdir);
1273	newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1274	if (newfdp->fd_fd.fd_jdir)
1275		VREF(newfdp->fd_fd.fd_jdir);
1276
1277	/* Create the file descriptor table. */
1278	newfdp->fd_fd.fd_refcnt = 1;
1279	newfdp->fd_fd.fd_cmask = cmask;
1280	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1281	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
1282	newfdp->fd_fd.fd_nfiles = NDFILE;
1283	newfdp->fd_fd.fd_knlistsize = -1;
1284	return (&newfdp->fd_fd);
1285}
1286
1287/*
1288 * Share a filedesc structure.
1289 */
1290struct filedesc *
1291fdshare(fdp)
1292	struct filedesc *fdp;
1293{
1294	FILEDESC_LOCK(fdp);
1295	fdp->fd_refcnt++;
1296	FILEDESC_UNLOCK(fdp);
1297	return (fdp);
1298}
1299
1300/*
1301 * Copy a filedesc structure.
1302 * A NULL pointer in returns a NULL reference, this is to ease callers,
1303 * not catch errors.
1304 */
1305struct filedesc *
1306fdcopy(fdp)
1307	struct filedesc *fdp;
1308{
1309	struct filedesc *newfdp;
1310	struct file **fpp;
1311	int i, j;
1312
1313	/* Certain daemons might not have file descriptors. */
1314	if (fdp == NULL)
1315		return (NULL);
1316
1317	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1318
1319	FILEDESC_UNLOCK(fdp);
1320	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
1321	    M_FILEDESC, M_WAITOK);
1322	FILEDESC_LOCK(fdp);
1323	bcopy(fdp, newfdp, sizeof(struct filedesc));
1324	FILEDESC_UNLOCK(fdp);
1325	bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx));
1326	mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
1327	if (newfdp->fd_cdir)
1328		VREF(newfdp->fd_cdir);
1329	if (newfdp->fd_rdir)
1330		VREF(newfdp->fd_rdir);
1331	if (newfdp->fd_jdir)
1332		VREF(newfdp->fd_jdir);
1333	newfdp->fd_refcnt = 1;
1334
1335	/*
1336	 * If the number of open files fits in the internal arrays
1337	 * of the open file structure, use them, otherwise allocate
1338	 * additional memory for the number of descriptors currently
1339	 * in use.
1340	 */
1341	FILEDESC_LOCK(fdp);
1342	newfdp->fd_lastfile = fdp->fd_lastfile;
1343	newfdp->fd_nfiles = fdp->fd_nfiles;
1344	if (newfdp->fd_lastfile < NDFILE) {
1345		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
1346		newfdp->fd_ofileflags =
1347		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
1348		i = NDFILE;
1349	} else {
1350		/*
1351		 * Compute the smallest multiple of NDEXTENT needed
1352		 * for the file descriptors currently in use,
1353		 * allowing the table to shrink.
1354		 */
1355retry:
1356		i = newfdp->fd_nfiles;
1357		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
1358			i /= 2;
1359		FILEDESC_UNLOCK(fdp);
1360		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
1361		    M_FILEDESC, M_WAITOK);
1362		FILEDESC_LOCK(fdp);
1363		newfdp->fd_lastfile = fdp->fd_lastfile;
1364		newfdp->fd_nfiles = fdp->fd_nfiles;
1365		j = newfdp->fd_nfiles;
1366		while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2)
1367			j /= 2;
1368		if (i != j) {
1369			/*
1370			 * The size of the original table has changed.
1371			 * Go over once again.
1372			 */
1373			FILEDESC_UNLOCK(fdp);
1374			FREE(newfdp->fd_ofiles, M_FILEDESC);
1375			FILEDESC_LOCK(fdp);
1376			newfdp->fd_lastfile = fdp->fd_lastfile;
1377			newfdp->fd_nfiles = fdp->fd_nfiles;
1378			goto retry;
1379		}
1380		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
1381	}
1382	newfdp->fd_nfiles = i;
1383	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
1384	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
1385
1386	/*
1387	 * kq descriptors cannot be copied.
1388	 */
1389	if (newfdp->fd_knlistsize != -1) {
1390		fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile];
1391		for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) {
1392			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) {
1393				*fpp = NULL;
1394				if (i < newfdp->fd_freefile)
1395					newfdp->fd_freefile = i;
1396			}
1397			if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0)
1398				newfdp->fd_lastfile--;
1399		}
1400		newfdp->fd_knlist = NULL;
1401		newfdp->fd_knlistsize = -1;
1402		newfdp->fd_knhash = NULL;
1403		newfdp->fd_knhashmask = 0;
1404	}
1405
1406	fpp = newfdp->fd_ofiles;
1407	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) {
1408		if (*fpp != NULL)
1409			fhold(*fpp);
1410	}
1411	return (newfdp);
1412}
1413
/*
 * A mutex to protect the association between a proc and filedesc.
 * Held around reads/clears of p->p_fd done outside the descriptor
 * table lock (see fdfree() and sysctl_kern_file()).
 */
struct mtx	fdesc_mtx;
MTX_SYSINIT(fdesc, &fdesc_mtx, "fdesc", MTX_DEF);
1417
1418/*
1419 * Release a filedesc structure.
1420 */
1421void
1422fdfree(td)
1423	struct thread *td;
1424{
1425	struct filedesc *fdp;
1426	struct file **fpp;
1427	int i;
1428	struct filedesc_to_leader *fdtol;
1429	struct file *fp;
1430	struct vnode *vp;
1431	struct flock lf;
1432
1433	/* Certain daemons might not have file descriptors. */
1434	fdp = td->td_proc->p_fd;
1435	if (fdp == NULL)
1436		return;
1437
1438	/* Check for special need to clear POSIX style locks */
1439	fdtol = td->td_proc->p_fdtol;
1440	if (fdtol != NULL) {
1441		FILEDESC_LOCK(fdp);
1442		KASSERT(fdtol->fdl_refcount > 0,
1443			("filedesc_to_refcount botch: fdl_refcount=%d",
1444			 fdtol->fdl_refcount));
1445		if (fdtol->fdl_refcount == 1 &&
1446		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1447			i = 0;
1448			fpp = fdp->fd_ofiles;
1449			for (i = 0, fpp = fdp->fd_ofiles;
1450			     i < fdp->fd_lastfile;
1451			     i++, fpp++) {
1452				if (*fpp == NULL ||
1453				    (*fpp)->f_type != DTYPE_VNODE)
1454					continue;
1455				fp = *fpp;
1456				fhold(fp);
1457				FILEDESC_UNLOCK(fdp);
1458				lf.l_whence = SEEK_SET;
1459				lf.l_start = 0;
1460				lf.l_len = 0;
1461				lf.l_type = F_UNLCK;
1462				vp = fp->f_vnode;
1463				(void) VOP_ADVLOCK(vp,
1464						   (caddr_t)td->td_proc->
1465						   p_leader,
1466						   F_UNLCK,
1467						   &lf,
1468						   F_POSIX);
1469				FILEDESC_LOCK(fdp);
1470				fdrop(fp, td);
1471				fpp = fdp->fd_ofiles + i;
1472			}
1473		}
1474	retry:
1475		if (fdtol->fdl_refcount == 1) {
1476			if (fdp->fd_holdleaderscount > 0 &&
1477			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1478				/*
1479				 * close() or do_dup() has cleared a reference
1480				 * in a shared file descriptor table.
1481				 */
1482				fdp->fd_holdleaderswakeup = 1;
1483				msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx,
1484				       PLOCK, "fdlhold", 0);
1485				goto retry;
1486			}
1487			if (fdtol->fdl_holdcount > 0) {
1488				/*
1489				 * Ensure that fdtol->fdl_leader
1490				 * remains valid in closef().
1491				 */
1492				fdtol->fdl_wakeup = 1;
1493				msleep(fdtol, &fdp->fd_mtx,
1494				       PLOCK, "fdlhold", 0);
1495				goto retry;
1496			}
1497		}
1498		fdtol->fdl_refcount--;
1499		if (fdtol->fdl_refcount == 0 &&
1500		    fdtol->fdl_holdcount == 0) {
1501			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
1502			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
1503		} else
1504			fdtol = NULL;
1505		td->td_proc->p_fdtol = NULL;
1506		FILEDESC_UNLOCK(fdp);
1507		if (fdtol != NULL)
1508			FREE(fdtol, M_FILEDESC_TO_LEADER);
1509	}
1510	FILEDESC_LOCK(fdp);
1511	if (--fdp->fd_refcnt > 0) {
1512		FILEDESC_UNLOCK(fdp);
1513		return;
1514	}
1515
1516	/*
1517	 * We are the last reference to the structure, so we can
1518	 * safely assume it will not change out from under us.
1519	 */
1520	FILEDESC_UNLOCK(fdp);
1521	fpp = fdp->fd_ofiles;
1522	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
1523		if (*fpp)
1524			(void) closef(*fpp, td);
1525	}
1526
1527	/* XXX This should happen earlier. */
1528	mtx_lock(&fdesc_mtx);
1529	td->td_proc->p_fd = NULL;
1530	mtx_unlock(&fdesc_mtx);
1531
1532	if (fdp->fd_nfiles > NDFILE)
1533		FREE(fdp->fd_ofiles, M_FILEDESC);
1534	if (fdp->fd_cdir)
1535		vrele(fdp->fd_cdir);
1536	if (fdp->fd_rdir)
1537		vrele(fdp->fd_rdir);
1538	if (fdp->fd_jdir)
1539		vrele(fdp->fd_jdir);
1540	if (fdp->fd_knlist)
1541		FREE(fdp->fd_knlist, M_KQUEUE);
1542	if (fdp->fd_knhash)
1543		FREE(fdp->fd_knhash, M_KQUEUE);
1544	mtx_destroy(&fdp->fd_mtx);
1545	FREE(fdp, M_FILEDESC);
1546}
1547
1548/*
1549 * For setugid programs, we don't want to people to use that setugidness
1550 * to generate error messages which write to a file which otherwise would
1551 * otherwise be off-limits to the process.  We check for filesystems where
1552 * the vnode can change out from under us after execve (like [lin]procfs).
1553 *
1554 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1555 * sufficient.  We also don't for check setugidness since we know we are.
1556 */
1557static int
1558is_unsafe(struct file *fp)
1559{
1560	if (fp->f_type == DTYPE_VNODE) {
1561		struct vnode *vp = fp->f_vnode;
1562
1563		if ((vp->v_vflag & VV_PROCDEP) != 0)
1564			return (1);
1565	}
1566	return (0);
1567}
1568
1569/*
1570 * Make this setguid thing safe, if at all possible.
1571 */
1572void
1573setugidsafety(td)
1574	struct thread *td;
1575{
1576	struct filedesc *fdp;
1577	int i;
1578
1579	/* Certain daemons might not have file descriptors. */
1580	fdp = td->td_proc->p_fd;
1581	if (fdp == NULL)
1582		return;
1583
1584	/*
1585	 * Note: fdp->fd_ofiles may be reallocated out from under us while
1586	 * we are blocked in a close.  Be careful!
1587	 */
1588	FILEDESC_LOCK(fdp);
1589	for (i = 0; i <= fdp->fd_lastfile; i++) {
1590		if (i > 2)
1591			break;
1592		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
1593			struct file *fp;
1594
1595#if 0
1596			if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0)
1597				(void) munmapfd(td, i);
1598#endif
1599			if (i < fdp->fd_knlistsize) {
1600				FILEDESC_UNLOCK(fdp);
1601				knote_fdclose(td, i);
1602				FILEDESC_LOCK(fdp);
1603			}
1604			/*
1605			 * NULL-out descriptor prior to close to avoid
1606			 * a race while close blocks.
1607			 */
1608			fp = fdp->fd_ofiles[i];
1609			fdp->fd_ofiles[i] = NULL;
1610			fdp->fd_ofileflags[i] = 0;
1611			if (i < fdp->fd_freefile)
1612				fdp->fd_freefile = i;
1613			FILEDESC_UNLOCK(fdp);
1614			(void) closef(fp, td);
1615			FILEDESC_LOCK(fdp);
1616		}
1617	}
1618	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1619		fdp->fd_lastfile--;
1620	FILEDESC_UNLOCK(fdp);
1621}
1622
1623/*
1624 * Close any files on exec?
1625 */
1626void
1627fdcloseexec(td)
1628	struct thread *td;
1629{
1630	struct filedesc *fdp;
1631	int i;
1632
1633	/* Certain daemons might not have file descriptors. */
1634	fdp = td->td_proc->p_fd;
1635	if (fdp == NULL)
1636		return;
1637
1638	FILEDESC_LOCK(fdp);
1639
1640	/*
1641	 * We cannot cache fd_ofiles or fd_ofileflags since operations
1642	 * may block and rip them out from under us.
1643	 */
1644	for (i = 0; i <= fdp->fd_lastfile; i++) {
1645		if (fdp->fd_ofiles[i] != NULL &&
1646		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
1647			struct file *fp;
1648
1649#if 0
1650			if (fdp->fd_ofileflags[i] & UF_MAPPED)
1651				(void) munmapfd(td, i);
1652#endif
1653			if (i < fdp->fd_knlistsize) {
1654				FILEDESC_UNLOCK(fdp);
1655				knote_fdclose(td, i);
1656				FILEDESC_LOCK(fdp);
1657			}
1658			/*
1659			 * NULL-out descriptor prior to close to avoid
1660			 * a race while close blocks.
1661			 */
1662			fp = fdp->fd_ofiles[i];
1663			fdp->fd_ofiles[i] = NULL;
1664			fdp->fd_ofileflags[i] = 0;
1665			if (i < fdp->fd_freefile)
1666				fdp->fd_freefile = i;
1667			FILEDESC_UNLOCK(fdp);
1668			(void) closef(fp, td);
1669			FILEDESC_LOCK(fdp);
1670		}
1671	}
1672	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
1673		fdp->fd_lastfile--;
1674	FILEDESC_UNLOCK(fdp);
1675}
1676
1677/*
1678 * It is unsafe for set[ug]id processes to be started with file
1679 * descriptors 0..2 closed, as these descriptors are given implicit
1680 * significance in the Standard C library.  fdcheckstd() will create a
1681 * descriptor referencing /dev/null for each of stdin, stdout, and
1682 * stderr that is not already open.
1683 */
1684int
1685fdcheckstd(td)
1686	struct thread *td;
1687{
1688	struct nameidata nd;
1689	struct filedesc *fdp;
1690	struct file *fp;
1691	register_t retval;
1692	int fd, i, error, flags, devnull;
1693
1694	fdp = td->td_proc->p_fd;
1695	if (fdp == NULL)
1696		return (0);
1697	devnull = -1;
1698	error = 0;
1699	for (i = 0; i < 3; i++) {
1700		if (fdp->fd_ofiles[i] != NULL)
1701			continue;
1702		if (devnull < 0) {
1703			error = falloc(td, &fp, &fd);
1704			if (error != 0)
1705				break;
1706			KASSERT(fd == i, ("oof, we didn't get our fd"));
1707			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
1708			    td);
1709			flags = FREAD | FWRITE;
1710			error = vn_open(&nd, &flags, 0);
1711			if (error != 0) {
1712				FILEDESC_LOCK(fdp);
1713				fdp->fd_ofiles[fd] = NULL;
1714				FILEDESC_UNLOCK(fdp);
1715				fdrop(fp, td);
1716				break;
1717			}
1718			NDFREE(&nd, NDF_ONLY_PNBUF);
1719			fp->f_vnode = nd.ni_vp;
1720			fp->f_data = nd.ni_vp;
1721			fp->f_flag = flags;
1722			fp->f_ops = &vnops;
1723			fp->f_type = DTYPE_VNODE;
1724			VOP_UNLOCK(nd.ni_vp, 0, td);
1725			devnull = fd;
1726		} else {
1727			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
1728			if (error != 0)
1729				break;
1730		}
1731	}
1732	return (error);
1733}
1734
1735/*
1736 * Internal form of close.
1737 * Decrement reference count on file structure.
1738 * Note: td may be NULL when closing a file
1739 * that was being passed in a message.
1740 */
1741int
1742closef(fp, td)
1743	struct file *fp;
1744	struct thread *td;
1745{
1746	struct vnode *vp;
1747	struct flock lf;
1748	struct filedesc_to_leader *fdtol;
1749	struct filedesc *fdp;
1750
1751	if (fp == NULL)
1752		return (0);
1753	/*
1754	 * POSIX record locking dictates that any close releases ALL
1755	 * locks owned by this process.  This is handled by setting
1756	 * a flag in the unlock to free ONLY locks obeying POSIX
1757	 * semantics, and not to free BSD-style file locks.
1758	 * If the descriptor was in a message, POSIX-style locks
1759	 * aren't passed with the descriptor.
1760	 */
1761	if (td != NULL &&
1762	    fp->f_type == DTYPE_VNODE) {
1763		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1764			lf.l_whence = SEEK_SET;
1765			lf.l_start = 0;
1766			lf.l_len = 0;
1767			lf.l_type = F_UNLCK;
1768			vp = fp->f_vnode;
1769			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
1770					   F_UNLCK, &lf, F_POSIX);
1771		}
1772		fdtol = td->td_proc->p_fdtol;
1773		if (fdtol != NULL) {
1774			/*
1775			 * Handle special case where file descriptor table
1776			 * is shared between multiple process leaders.
1777			 */
1778			fdp = td->td_proc->p_fd;
1779			FILEDESC_LOCK(fdp);
1780			for (fdtol = fdtol->fdl_next;
1781			     fdtol != td->td_proc->p_fdtol;
1782			     fdtol = fdtol->fdl_next) {
1783				if ((fdtol->fdl_leader->p_flag &
1784				     P_ADVLOCK) == 0)
1785					continue;
1786				fdtol->fdl_holdcount++;
1787				FILEDESC_UNLOCK(fdp);
1788				lf.l_whence = SEEK_SET;
1789				lf.l_start = 0;
1790				lf.l_len = 0;
1791				lf.l_type = F_UNLCK;
1792				vp = fp->f_vnode;
1793				(void) VOP_ADVLOCK(vp,
1794						   (caddr_t)fdtol->fdl_leader,
1795						   F_UNLCK, &lf, F_POSIX);
1796				FILEDESC_LOCK(fdp);
1797				fdtol->fdl_holdcount--;
1798				if (fdtol->fdl_holdcount == 0 &&
1799				    fdtol->fdl_wakeup != 0) {
1800					fdtol->fdl_wakeup = 0;
1801					wakeup(fdtol);
1802				}
1803			}
1804			FILEDESC_UNLOCK(fdp);
1805		}
1806	}
1807	return (fdrop(fp, td));
1808}
1809
1810/*
1811 * Drop reference on struct file passed in, may call closef if the
1812 * reference hits zero.
1813 */
1814int
1815fdrop(fp, td)
1816	struct file *fp;
1817	struct thread *td;
1818{
1819
1820	FILE_LOCK(fp);
1821	return (fdrop_locked(fp, td));
1822}
1823
1824/*
1825 * Extract the file pointer associated with the specified descriptor for
1826 * the current user process.
1827 *
1828 * If the descriptor doesn't exist, EBADF is returned.
1829 *
1830 * If the descriptor exists but doesn't match 'flags' then
1831 * return EBADF for read attempts and EINVAL for write attempts.
1832 *
1833 * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
1834 * It should be droped with fdrop().
1835 * If it is not set, then the refcount will not be bumped however the
1836 * thread's filedesc struct will be returned locked (for fgetsock).
1837 *
1838 * If an error occured the non-zero error is returned and *fpp is set to NULL.
1839 * Otherwise *fpp is set and zero is returned.
1840 */
1841static __inline int
1842_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
1843{
1844	struct filedesc *fdp;
1845	struct file *fp;
1846
1847	*fpp = NULL;
1848	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
1849		return (EBADF);
1850	FILEDESC_LOCK(fdp);
1851	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
1852		FILEDESC_UNLOCK(fdp);
1853		return (EBADF);
1854	}
1855
1856	/*
1857	 * Note: FREAD failures returns EBADF to maintain backwards
1858	 * compatibility with what routines returned before.
1859	 *
1860	 * Only one flag, or 0, may be specified.
1861	 */
1862	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
1863		FILEDESC_UNLOCK(fdp);
1864		return (EBADF);
1865	}
1866	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
1867		FILEDESC_UNLOCK(fdp);
1868		return (EINVAL);
1869	}
1870	if (hold) {
1871		fhold(fp);
1872		FILEDESC_UNLOCK(fdp);
1873	}
1874	*fpp = fp;
1875	return (0);
1876}
1877
/* Get the file for fd, bumping its refcount; no access-mode check. */
int
fget(struct thread *td, int fd, struct file **fpp)
{

	return(_fget(td, fd, fpp, 0, 1));
}
1884
/* Like fget(), additionally requiring FREAD (EBADF if not readable). */
int
fget_read(struct thread *td, int fd, struct file **fpp)
{

	return(_fget(td, fd, fpp, FREAD, 1));
}
1891
/* Like fget(), additionally requiring FWRITE (EINVAL if not writable). */
int
fget_write(struct thread *td, int fd, struct file **fpp)
{

	return(_fget(td, fd, fpp, FWRITE, 1));
}
1898
1899/*
1900 * Like fget() but loads the underlying vnode, or returns an error if
1901 * the descriptor does not represent a vnode.  Note that pipes use vnodes
1902 * but never have VM objects (so VOP_GETVOBJECT() calls will return an
1903 * error).  The returned vnode will be vref()d.
1904 */
1905static __inline int
1906_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
1907{
1908	struct file *fp;
1909	int error;
1910
1911	*vpp = NULL;
1912	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1913		return (error);
1914	if (fp->f_vnode == NULL) {
1915		error = EINVAL;
1916	} else {
1917		*vpp = fp->f_vnode;
1918		vref(*vpp);
1919	}
1920	FILEDESC_UNLOCK(td->td_proc->p_fd);
1921	return (error);
1922}
1923
/* Get the vnode for fd with no access-mode flag. */
int
fgetvp(struct thread *td, int fd, struct vnode **vpp)
{

	return (_fgetvp(td, fd, vpp, 0));
}
1930
/* Like fgetvp(), passing FREAD as the access-mode flag. */
int
fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
{

	return (_fgetvp(td, fd, vpp, FREAD));
}
1937
/* Like fgetvp(), passing FWRITE as the access-mode flag. */
int
fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
{

	return (_fgetvp(td, fd, vpp, FWRITE));
}
1944
1945/*
1946 * Like fget() but loads the underlying socket, or returns an error if
1947 * the descriptor does not represent a socket.
1948 *
1949 * We bump the ref count on the returned socket.  XXX Also obtain the SX
1950 * lock in the future.
1951 */
1952int
1953fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
1954{
1955	struct file *fp;
1956	int error;
1957
1958	*spp = NULL;
1959	if (fflagp != NULL)
1960		*fflagp = 0;
1961	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1962		return (error);
1963	if (fp->f_type != DTYPE_SOCKET) {
1964		error = ENOTSOCK;
1965	} else {
1966		*spp = fp->f_data;
1967		if (fflagp)
1968			*fflagp = fp->f_flag;
1969		soref(*spp);
1970	}
1971	FILEDESC_UNLOCK(td->td_proc->p_fd);
1972	return (error);
1973}
1974
1975/*
1976 * Drop the reference count on the the socket and XXX release the SX lock in
1977 * the future.  The last reference closes the socket.
1978 */
1979void
1980fputsock(struct socket *so)
1981{
1982
1983	sorele(so);
1984}
1985
1986/*
1987 * Drop reference on struct file passed in, may call closef if the
1988 * reference hits zero.
1989 * Expects struct file locked, and will unlock it.
1990 */
1991int
1992fdrop_locked(fp, td)
1993	struct file *fp;
1994	struct thread *td;
1995{
1996	struct flock lf;
1997	struct vnode *vp;
1998	int error;
1999
2000	FILE_LOCK_ASSERT(fp, MA_OWNED);
2001
2002	if (--fp->f_count > 0) {
2003		FILE_UNLOCK(fp);
2004		return (0);
2005	}
2006	/* We have the last ref so we can proceed without the file lock. */
2007	FILE_UNLOCK(fp);
2008	mtx_lock(&Giant);
2009	if (fp->f_count < 0)
2010		panic("fdrop: count < 0");
2011	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
2012		lf.l_whence = SEEK_SET;
2013		lf.l_start = 0;
2014		lf.l_len = 0;
2015		lf.l_type = F_UNLCK;
2016		vp = fp->f_vnode;
2017		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2018	}
2019	if (fp->f_ops != &badfileops)
2020		error = fo_close(fp, td);
2021	else
2022		error = 0;
2023	ffree(fp);
2024	mtx_unlock(&Giant);
2025	return (error);
2026}
2027
2028/*
2029 * Apply an advisory lock on a file descriptor.
2030 *
2031 * Just attempt to get a record lock of the requested type on
2032 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2033 */
2034#ifndef _SYS_SYSPROTO_H_
2035struct flock_args {
2036	int	fd;
2037	int	how;
2038};
2039#endif
2040/*
2041 * MPSAFE
2042 */
2043/* ARGSUSED */
2044int
2045flock(td, uap)
2046	struct thread *td;
2047	struct flock_args *uap;
2048{
2049	struct file *fp;
2050	struct vnode *vp;
2051	struct flock lf;
2052	int error;
2053
2054	if ((error = fget(td, uap->fd, &fp)) != 0)
2055		return (error);
2056	if (fp->f_type != DTYPE_VNODE) {
2057		fdrop(fp, td);
2058		return (EOPNOTSUPP);
2059	}
2060
2061	mtx_lock(&Giant);
2062	vp = fp->f_vnode;
2063	lf.l_whence = SEEK_SET;
2064	lf.l_start = 0;
2065	lf.l_len = 0;
2066	if (uap->how & LOCK_UN) {
2067		lf.l_type = F_UNLCK;
2068		FILE_LOCK(fp);
2069		fp->f_flag &= ~FHASLOCK;
2070		FILE_UNLOCK(fp);
2071		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2072		goto done2;
2073	}
2074	if (uap->how & LOCK_EX)
2075		lf.l_type = F_WRLCK;
2076	else if (uap->how & LOCK_SH)
2077		lf.l_type = F_RDLCK;
2078	else {
2079		error = EBADF;
2080		goto done2;
2081	}
2082	FILE_LOCK(fp);
2083	fp->f_flag |= FHASLOCK;
2084	FILE_UNLOCK(fp);
2085	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2086	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2087done2:
2088	fdrop(fp, td);
2089	mtx_unlock(&Giant);
2090	return (error);
2091}
2092
2093/*
2094 * File Descriptor pseudo-device driver (/dev/fd/).
2095 *
2096 * Opening minor device N dup()s the file (if any) connected to file
2097 * descriptor N belonging to the calling process.  Note that this driver
2098 * consists of only the ``open()'' routine, because all subsequent
2099 * references to this file will be direct to the other driver.
2100 */
2101/* ARGSUSED */
2102static int
2103fdopen(dev, mode, type, td)
2104	dev_t dev;
2105	int mode, type;
2106	struct thread *td;
2107{
2108
2109	/*
2110	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
2111	 * the file descriptor being sought for duplication. The error
2112	 * return ensures that the vnode for this device will be released
2113	 * by vn_open. Open will detect this special error and take the
2114	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
2115	 * will simply report the error.
2116	 */
2117	td->td_dupfd = dev2unit(dev);
2118	return (ENODEV);
2119}
2120
2121/*
2122 * Duplicate the specified descriptor to a free descriptor.
2123 */
2124int
2125dupfdopen(td, fdp, indx, dfd, mode, error)
2126	struct thread *td;
2127	struct filedesc *fdp;
2128	int indx, dfd;
2129	int mode;
2130	int error;
2131{
2132	struct file *wfp;
2133	struct file *fp;
2134
2135	/*
2136	 * If the to-be-dup'd fd number is greater than the allowed number
2137	 * of file descriptors, or the fd to be dup'd has already been
2138	 * closed, then reject.
2139	 */
2140	FILEDESC_LOCK(fdp);
2141	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
2142	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
2143		FILEDESC_UNLOCK(fdp);
2144		return (EBADF);
2145	}
2146
2147	/*
2148	 * There are two cases of interest here.
2149	 *
2150	 * For ENODEV simply dup (dfd) to file descriptor
2151	 * (indx) and return.
2152	 *
2153	 * For ENXIO steal away the file structure from (dfd) and
2154	 * store it in (indx).  (dfd) is effectively closed by
2155	 * this operation.
2156	 *
2157	 * Any other error code is just returned.
2158	 */
2159	switch (error) {
2160	case ENODEV:
2161		/*
2162		 * Check that the mode the file is being opened for is a
2163		 * subset of the mode of the existing descriptor.
2164		 */
2165		FILE_LOCK(wfp);
2166		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
2167			FILE_UNLOCK(wfp);
2168			FILEDESC_UNLOCK(fdp);
2169			return (EACCES);
2170		}
2171		fp = fdp->fd_ofiles[indx];
2172#if 0
2173		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
2174			(void) munmapfd(td, indx);
2175#endif
2176		fdp->fd_ofiles[indx] = wfp;
2177		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2178		fhold_locked(wfp);
2179		FILE_UNLOCK(wfp);
2180		if (indx > fdp->fd_lastfile)
2181			fdp->fd_lastfile = indx;
2182		if (fp != NULL)
2183			FILE_LOCK(fp);
2184		FILEDESC_UNLOCK(fdp);
2185		/*
2186		 * We now own the reference to fp that the ofiles[] array
2187		 * used to own.  Release it.
2188		 */
2189		if (fp != NULL)
2190			fdrop_locked(fp, td);
2191		return (0);
2192
2193	case ENXIO:
2194		/*
2195		 * Steal away the file pointer from dfd and stuff it into indx.
2196		 */
2197		fp = fdp->fd_ofiles[indx];
2198#if 0
2199		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
2200			(void) munmapfd(td, indx);
2201#endif
2202		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2203		fdp->fd_ofiles[dfd] = NULL;
2204		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2205		fdp->fd_ofileflags[dfd] = 0;
2206
2207		/*
2208		 * Complete the clean up of the filedesc structure by
2209		 * recomputing the various hints.
2210		 */
2211		if (indx > fdp->fd_lastfile) {
2212			fdp->fd_lastfile = indx;
2213		} else {
2214			while (fdp->fd_lastfile > 0 &&
2215			   fdp->fd_ofiles[fdp->fd_lastfile] == NULL) {
2216				fdp->fd_lastfile--;
2217			}
2218			if (dfd < fdp->fd_freefile)
2219				fdp->fd_freefile = dfd;
2220		}
2221		if (fp != NULL)
2222			FILE_LOCK(fp);
2223		FILEDESC_UNLOCK(fdp);
2224
2225		/*
2226		 * we now own the reference to fp that the ofiles[] array
2227		 * used to own.  Release it.
2228		 */
2229		if (fp != NULL)
2230			fdrop_locked(fp, td);
2231		return (0);
2232
2233	default:
2234		FILEDESC_UNLOCK(fdp);
2235		return (error);
2236	}
2237	/* NOTREACHED */
2238}
2239
2240
2241struct filedesc_to_leader *
2242filedesc_to_leader_alloc(struct filedesc_to_leader *old,
2243			 struct filedesc *fdp,
2244			 struct proc *leader)
2245{
2246	struct filedesc_to_leader *fdtol;
2247
2248	MALLOC(fdtol, struct filedesc_to_leader *,
2249	       sizeof(struct filedesc_to_leader),
2250	       M_FILEDESC_TO_LEADER,
2251	       M_WAITOK);
2252	fdtol->fdl_refcount = 1;
2253	fdtol->fdl_holdcount = 0;
2254	fdtol->fdl_wakeup = 0;
2255	fdtol->fdl_leader = leader;
2256	if (old != NULL) {
2257		FILEDESC_LOCK(fdp);
2258		fdtol->fdl_next = old->fdl_next;
2259		fdtol->fdl_prev = old;
2260		old->fdl_next = fdtol;
2261		fdtol->fdl_next->fdl_prev = fdtol;
2262		FILEDESC_UNLOCK(fdp);
2263	} else {
2264		fdtol->fdl_next = fdtol;
2265		fdtol->fdl_prev = fdtol;
2266	}
2267	return fdtol;
2268}
2269
2270/*
2271 * Get file structures.
2272 */
2273static int
2274sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2275{
2276	struct xfile xf;
2277	struct filedesc *fdp;
2278	struct file *fp;
2279	struct proc *p;
2280	int error, n;
2281
2282	sysctl_wire_old_buffer(req, 0);
2283	if (req->oldptr == NULL) {
2284		n = 16;		/* A slight overestimate. */
2285		sx_slock(&filelist_lock);
2286		LIST_FOREACH(fp, &filehead, f_list) {
2287			/*
2288			 * We should grab the lock, but this is an
2289			 * estimate, so does it really matter?
2290			 */
2291			/* mtx_lock(fp->f_mtxp); */
2292			n += fp->f_count;
2293			/* mtx_unlock(f->f_mtxp); */
2294		}
2295		sx_sunlock(&filelist_lock);
2296		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2297	}
2298	error = 0;
2299	bzero(&xf, sizeof(xf));
2300	xf.xf_size = sizeof(xf);
2301	sx_slock(&allproc_lock);
2302	LIST_FOREACH(p, &allproc, p_list) {
2303		PROC_LOCK(p);
2304		xf.xf_pid = p->p_pid;
2305		xf.xf_uid = p->p_ucred->cr_uid;
2306		PROC_UNLOCK(p);
2307		mtx_lock(&fdesc_mtx);
2308		if ((fdp = p->p_fd) == NULL) {
2309			mtx_unlock(&fdesc_mtx);
2310			continue;
2311		}
2312		FILEDESC_LOCK(fdp);
2313		for (n = 0; n < fdp->fd_nfiles; ++n) {
2314			if ((fp = fdp->fd_ofiles[n]) == NULL)
2315				continue;
2316			xf.xf_fd = n;
2317			xf.xf_file = fp;
2318			xf.xf_data = fp->f_data;
2319			xf.xf_type = fp->f_type;
2320			xf.xf_count = fp->f_count;
2321			xf.xf_msgcount = fp->f_msgcount;
2322			xf.xf_offset = fp->f_offset;
2323			xf.xf_flag = fp->f_flag;
2324			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2325			if (error)
2326				break;
2327		}
2328		FILEDESC_UNLOCK(fdp);
2329		mtx_unlock(&fdesc_mtx);
2330		if (error)
2331			break;
2332	}
2333	sx_sunlock(&allproc_lock);
2334	return (error);
2335}
2336
/* sysctl exports: the file table dump, limits, and open-file count. */
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");

SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &nfiles, 0, "System-wide number of open files");
2348
/*
 * Create the /dev/fd/{0,1,2} device nodes and the /dev/stdin,
 * /dev/stdout and /dev/stderr aliases (see fdopen() above).
 */
static void
fildesc_drvinit(void *unused)
{
	dev_t dev;

	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}
2361
static fo_rdwr_t	badfo_readwrite;
static fo_ioctl_t	badfo_ioctl;
static fo_poll_t	badfo_poll;
static fo_kqfilter_t	badfo_kqfilter;
static fo_stat_t	badfo_stat;
static fo_close_t	badfo_close;

/*
 * Placeholder fileops installed on a freshly allocated struct file by
 * falloc() until the real ops are assigned; each method fails (EBADF)
 * or no-ops.
 */
struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
};
2378
2379static int
2380badfo_readwrite(fp, uio, active_cred, flags, td)
2381	struct file *fp;
2382	struct uio *uio;
2383	struct ucred *active_cred;
2384	struct thread *td;
2385	int flags;
2386{
2387
2388	return (EBADF);
2389}
2390
2391static int
2392badfo_ioctl(fp, com, data, active_cred, td)
2393	struct file *fp;
2394	u_long com;
2395	void *data;
2396	struct ucred *active_cred;
2397	struct thread *td;
2398{
2399
2400	return (EBADF);
2401}
2402
/* Polling an uninitialized file reports no events. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
2413
/* kqueue attach on an uninitialized file is a no-op. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (0);
}
2422
2423static int
2424badfo_stat(fp, sb, active_cred, td)
2425	struct file *fp;
2426	struct stat *sb;
2427	struct ucred *active_cred;
2428	struct thread *td;
2429{
2430
2431	return (EBADF);
2432}
2433
2434static int
2435badfo_close(fp, td)
2436	struct file *fp;
2437	struct thread *td;
2438{
2439
2440	return (EBADF);
2441}
2442
/* Create the /dev/fd device nodes once drivers are up. */
SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
					fildesc_drvinit,NULL)

static void filelistinit(void *);
/* Set up the file zone and global locks during lock initialization. */
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
2448
/* ARGSUSED*/
static void
filelistinit(dummy)
	void *dummy;
{

	/* UMA zone backing all struct file allocations (see falloc()). */
	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	sx_init(&filelist_lock, "filelist lock");
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
2460