/* kern_descrip.c, FreeBSD revision 118126 */
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 118126 2003-07-28 16:03:53Z rwatson $");
43
44#include "opt_compat.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/syscallsubr.h>
49#include <sys/sysproto.h>
50#include <sys/conf.h>
51#include <sys/filedesc.h>
52#include <sys/lock.h>
53#include <sys/kernel.h>
54#include <sys/limits.h>
55#include <sys/malloc.h>
56#include <sys/mutex.h>
57#include <sys/sysctl.h>
58#include <sys/vnode.h>
59#include <sys/mount.h>
60#include <sys/proc.h>
61#include <sys/namei.h>
62#include <sys/file.h>
63#include <sys/stat.h>
64#include <sys/filio.h>
65#include <sys/fcntl.h>
66#include <sys/unistd.h>
67#include <sys/resourcevar.h>
68#include <sys/event.h>
69#include <sys/sx.h>
70#include <sys/socketvar.h>
71#include <sys/signalvar.h>
72
73#include <vm/vm.h>
74#include <vm/vm_extern.h>
75#include <vm/uma.h>
76
/* Malloc types: descriptor tables, fd-to-leader tracking, and sigio records. */
static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
		     "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

/* UMA zone from which all struct file allocations are drawn. */
static uma_zone_t file_zone;

static	 d_open_t  fdopen;
#define	NUMFDESC 64		/* number of minors served by the fd device */

#define	CDEV_MAJOR 22		/* character device major for /dev/fd */
static struct cdevsw fildesc_cdevsw = {
	.d_open =	fdopen,
	.d_name =	"FD",
	.d_maj =	CDEV_MAJOR,
};

/* How to treat 'new' parameter when allocating a fd for do_dup(). */
enum dup_type { DUP_VARIABLE, DUP_FIXED };

static int do_dup(struct thread *td, enum dup_type type, int old, int new,
    register_t *retval);
99
/*
 * Descriptor management.
 */
struct filelist filehead;	/* head of list of open files */
int nfiles;			/* actual number of open files */
extern int cmask;		/* default file creation mask (defined elsewhere) */
struct sx filelist_lock;	/* sx to protect filelist */
struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
108
109/*
110 * System calls on descriptors.
111 */
112#ifndef _SYS_SYSPROTO_H_
113struct getdtablesize_args {
114	int	dummy;
115};
116#endif
117/*
118 * MPSAFE
119 */
120/* ARGSUSED */
121int
122getdtablesize(td, uap)
123	struct thread *td;
124	struct getdtablesize_args *uap;
125{
126	struct proc *p = td->td_proc;
127
128	mtx_lock(&Giant);
129	td->td_retval[0] =
130	    min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
131	mtx_unlock(&Giant);
132	return (0);
133}
134
135/*
136 * Duplicate a file descriptor to a particular value.
137 *
138 * note: keep in mind that a potential race condition exists when closing
139 * descriptors from a shared descriptor table (via rfork).
140 */
141#ifndef _SYS_SYSPROTO_H_
142struct dup2_args {
143	u_int	from;
144	u_int	to;
145};
146#endif
147/*
148 * MPSAFE
149 */
150/* ARGSUSED */
151int
152dup2(td, uap)
153	struct thread *td;
154	struct dup2_args *uap;
155{
156
157	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
158		    td->td_retval));
159}
160
161/*
162 * Duplicate a file descriptor.
163 */
164#ifndef _SYS_SYSPROTO_H_
165struct dup_args {
166	u_int	fd;
167};
168#endif
169/*
170 * MPSAFE
171 */
172/* ARGSUSED */
173int
174dup(td, uap)
175	struct thread *td;
176	struct dup_args *uap;
177{
178
179	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
180}
181
182/*
183 * The file control system call.
184 */
185#ifndef _SYS_SYSPROTO_H_
186struct fcntl_args {
187	int	fd;
188	int	cmd;
189	long	arg;
190};
191#endif
192/*
193 * MPSAFE
194 */
195/* ARGSUSED */
196int
197fcntl(td, uap)
198	struct thread *td;
199	struct fcntl_args *uap;
200{
201	struct flock fl;
202	intptr_t arg;
203	int error;
204
205	error = 0;
206	switch (uap->cmd) {
207	case F_GETLK:
208	case F_SETLK:
209	case F_SETLKW:
210		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
211		arg = (intptr_t)&fl;
212		break;
213	default:
214		arg = uap->arg;
215		break;
216	}
217	if (error)
218		return (error);
219	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
220	if (error)
221		return (error);
222	if (uap->cmd == F_GETLK)
223		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
224	return (error);
225}
226
/*
 * In-kernel implementation of fcntl(2).
 *
 * 'arg' is either an integer value or, for the F_GETLK/F_SETLK/F_SETLKW
 * commands, a kernel pointer to a struct flock already copied in by the
 * caller (see fcntl() above).  Returns 0 or an errno; integer results
 * are passed back via td->td_retval[0].
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
	struct filedesc *fdp;
	struct flock *flp;
	struct file *fp;
	struct proc *p;
	char *pop;
	struct vnode *vp;
	u_int newmin;
	int error, flg, tmp;

	error = 0;
	flg = F_POSIX;
	p = td->td_proc;
	fdp = p->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	/* Validate the descriptor and fetch its file; EBADF otherwise. */
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
	/* Per-descriptor flag byte (UF_EXCLOSE etc.) for this slot. */
	pop = &fdp->fd_ofileflags[fd];

	switch (cmd) {
	case F_DUPFD:
		FILEDESC_UNLOCK(fdp);
		newmin = arg;
		/* Reject minimums at or above the per-process fd limit. */
		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
		    newmin >= maxfilesperproc) {
			error = EINVAL;
			break;
		}
		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
		break;

	case F_GETFD:
		/* Report the close-on-exec flag for this descriptor. */
		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		FILEDESC_UNLOCK(fdp);
		break;

	case F_SETFD:
		/* Set or clear close-on-exec according to FD_CLOEXEC. */
		*pop = (*pop &~ UF_EXCLOSE) |
		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
		FILEDESC_UNLOCK(fdp);
		break;

	case F_GETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/* Convert kernel f_flag bits back to open(2)-style flags. */
		td->td_retval[0] = OFLAGS(fp->f_flag);
		FILE_UNLOCK(fp);
		break;

	case F_SETFL:
		FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		fhold_locked(fp);
		/* Replace only the fcntl-settable subset of f_flag. */
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
		FILE_UNLOCK(fp);
		/* Push the new non-blocking mode down to the file object. */
		tmp = fp->f_flag & FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		if (error) {
			fdrop(fp, td);
			break;
		}
		/* Then the async mode; success here means we are done. */
		tmp = fp->f_flag & FASYNC;
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		if (error == 0) {
			fdrop(fp, td);
			break;
		}
		/* FIOASYNC failed: back out the FNONBLOCK change. */
		FILE_LOCK(fp);
		fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		tmp = 0;
		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_GETOWN:
		/* Hold fp across the ioctl; it may sleep. */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
		if (error == 0)
			td->td_retval[0] = tmp;
		fdrop(fp, td);
		break;

	case F_SETOWN:
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		tmp = arg;
		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
		fdrop(fp, td);
		break;

	case F_SETLKW:
		/* Identical to F_SETLK, but wait for a conflicting lock. */
		flg |= F_WAIT;
		/* FALLTHROUGH F_SETLK */

	case F_SETLK:
		/* Advisory locks only work on vnode-backed descriptors. */
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}

		flp = (struct flock *)arg;
		if (flp->l_whence == SEEK_CUR) {
			/* Convert to SEEK_SET, guarding against overflow. */
			if (fp->f_offset < 0 ||
			    (flp->l_start > 0 &&
			     fp->f_offset > OFF_MAX - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}

		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_vnode;

		switch (flp->l_type) {
		case F_RDLCK:
			/* Read locks require a readable descriptor. */
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_WRLCK:
			/* Write locks require a writable descriptor. */
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			PROC_LOCK(p->p_leader);
			p->p_leader->p_flag |= P_ADVLOCK;
			PROC_UNLOCK(p->p_leader);
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    flp, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
			    flp, F_POSIX);
			break;
		default:
			error = EINVAL;
			break;
		}
		/* Check for race with close */
		FILEDESC_LOCK(fdp);
		if ((unsigned) fd >= fdp->fd_nfiles ||
		    fp != fdp->fd_ofiles[fd]) {
			FILEDESC_UNLOCK(fdp);
			/*
			 * The descriptor was closed (or replaced) while we
			 * slept in VOP_ADVLOCK(); undo the lock we may have
			 * just acquired so it does not leak.
			 */
			flp->l_whence = SEEK_SET;
			flp->l_start = 0;
			flp->l_len = 0;
			flp->l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, flp, F_POSIX);
		} else
			FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		break;

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE) {
			FILEDESC_UNLOCK(fdp);
			error = EBADF;
			break;
		}
		flp = (struct flock *)arg;
		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
		    flp->l_type != F_UNLCK) {
			FILEDESC_UNLOCK(fdp);
			error = EINVAL;
			break;
		}
		if (flp->l_whence == SEEK_CUR) {
			/* Convert to SEEK_SET, guarding against overflow. */
			if ((flp->l_start > 0 &&
			    fp->f_offset > OFF_MAX - flp->l_start) ||
			    (flp->l_start < 0 &&
			     fp->f_offset < OFF_MIN - flp->l_start)) {
				FILEDESC_UNLOCK(fdp);
				error = EOVERFLOW;
				break;
			}
			flp->l_start += fp->f_offset;
		}
		/*
		 * VOP_ADVLOCK() may block.
		 */
		fhold(fp);
		FILEDESC_UNLOCK(fdp);
		vp = fp->f_vnode;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
		    F_POSIX);
		fdrop(fp, td);
		break;
	default:
		FILEDESC_UNLOCK(fdp);
		error = EINVAL;
		break;
	}
done2:
	mtx_unlock(&Giant);
	return (error);
}
447
/*
 * Common code for dup, dup2, and fcntl(F_DUPFD).
 *
 * For DUP_FIXED, 'new' is the exact slot to dup onto (dup2 semantics);
 * for DUP_VARIABLE it is the minimum acceptable slot (dup/F_DUPFD
 * semantics).  The resulting descriptor number is stored in *retval.
 */
static int
do_dup(td, type, old, new, retval)
	enum dup_type type;
	int old, new;
	register_t *retval;
	struct thread *td;
{
	struct filedesc *fdp;
	struct proc *p;
	struct file *fp;
	struct file *delfp;
	int error, newfd;
	int holdleaders;

	p = td->td_proc;
	fdp = p->p_fd;

	/*
	 * Verify we have a valid descriptor to dup from and possibly to
	 * dup to.
	 */
	if (old < 0 || new < 0 || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
	    new >= maxfilesperproc)
		return (EBADF);
	FILEDESC_LOCK(fdp);
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}
	/* dup2(fd, fd) is a no-op that simply returns fd. */
	if (type == DUP_FIXED && old == new) {
		*retval = new;
		FILEDESC_UNLOCK(fdp);
		return (0);
	}
	fp = fdp->fd_ofiles[old];
	fhold(fp);

	/*
	 * Expand the table for the new descriptor if needed.  This may
	 * block and drop and reacquire the filedesc lock.
	 */
	if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) {
		error = fdalloc(td, new, &newfd);
		if (error) {
			FILEDESC_UNLOCK(fdp);
			fdrop(fp, td);
			return (error);
		}
	}
	if (type == DUP_VARIABLE)
		new = newfd;

	/*
	 * If the old file changed out from under us then treat it as a
	 * bad file descriptor.  Userland should do its own locking to
	 * avoid this case.
	 */
	if (fdp->fd_ofiles[old] != fp) {
		/* Release the slot fdalloc() may have reserved for us. */
		if (fdp->fd_ofiles[new] == NULL) {
			if (new < fdp->fd_freefile)
				fdp->fd_freefile = new;
			while (fdp->fd_lastfile > 0 &&
			    fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
				fdp->fd_lastfile--;
		}
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (EBADF);
	}
	KASSERT(old != new, ("new fd is same as old"));

	/*
	 * Save info on the descriptor being overwritten.  We have
	 * to do the unmap now, but we cannot close it without
	 * introducing an ownership race for the slot.
	 */
	delfp = fdp->fd_ofiles[new];
	if (delfp != NULL && p->p_fdtol != NULL) {
		/*
		 * Ask fdfree() to sleep to ensure that all relevant
		 * process leaders can be traversed in closef().
		 */
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	} else
		holdleaders = 0;
	KASSERT(delfp == NULL || type == DUP_FIXED,
	    ("dup() picked an open file"));
#if 0
	if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED))
		(void) munmapfd(td, new);
#endif

	/*
	 * Duplicate the source descriptor, update lastfile.
	 * Per POSIX, the duplicate does not inherit close-on-exec.
	 */
	fdp->fd_ofiles[new] = fp;
 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	FILEDESC_UNLOCK(fdp);
	*retval = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 */
	if (delfp) {
		mtx_lock(&Giant);
		(void) closef(delfp, td);
		mtx_unlock(&Giant);
		if (holdleaders) {
			FILEDESC_LOCK(fdp);
			fdp->fd_holdleaderscount--;
			if (fdp->fd_holdleaderscount == 0 &&
			    fdp->fd_holdleaderswakeup != 0) {
				fdp->fd_holdleaderswakeup = 0;
				wakeup(&fdp->fd_holdleaderscount);
			}
			FILEDESC_UNLOCK(fdp);
		}
	}
	return (0);
}
576
/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 */
void
funsetown(sigiop)
	struct sigio **sigiop;
{
	struct sigio *sigio;

	SIGIO_LOCK();
	sigio = *sigiop;
	if (sigio == NULL) {
		SIGIO_UNLOCK();
		return;
	}
	/* Clear the owner's back-pointer so the record can't be found again. */
	*(sigio->sio_myref) = NULL;
	if ((sigio)->sio_pgid < 0) {
		/* Negative pgid: owned by a process group. */
		struct pgrp *pg = (sigio)->sio_pgrp;
		PGRP_LOCK(pg);
		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PGRP_UNLOCK(pg);
	} else {
		/* Non-negative pgid: owned by a single process. */
		struct proc *p = (sigio)->sio_proc;
		PROC_LOCK(p);
		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
			     sigio, sio_pgsigio);
		PROC_UNLOCK(p);
	}
	SIGIO_UNLOCK();
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
}
612
/*
 * Free a list of sigio structures.
 * We only need to lock the SIGIO_LOCK because we have made ourselves
 * inaccessable to callers of fsetown and therefore do not need to lock
 * the proc or pgrp struct for the list manipulation.
 */
void
funsetownlst(sigiolst)
	struct sigiolst *sigiolst;
{
	struct proc *p;
	struct pgrp *pg;
	struct sigio *sigio;

	sigio = SLIST_FIRST(sigiolst);
	if (sigio == NULL)
		return;
	p = NULL;
	pg = NULL;

	/*
	 * Every entry of the list should belong
	 * to a single proc or pgrp.
	 */
	if (sigio->sio_pgid < 0) {
		pg = sigio->sio_pgrp;
		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
	} else /* if (sigio->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
	}

	SIGIO_LOCK();
	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
		/* Detach the owner's back-pointer before freeing. */
		*(sigio->sio_myref) = NULL;
		if (pg != NULL) {
			KASSERT(sigio->sio_pgid < 0,
			    ("Proc sigio in pgrp sigio list"));
			KASSERT(sigio->sio_pgrp == pg,
			    ("Bogus pgrp in sigio list"));
			PGRP_LOCK(pg);
			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PGRP_UNLOCK(pg);
		} else /* if (p != NULL) */ {
			KASSERT(sigio->sio_pgid > 0,
			    ("Pgrp sigio in proc sigio list"));
			KASSERT(sigio->sio_proc == p,
			    ("Bogus proc in sigio list"));
			PROC_LOCK(p);
			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
			    sio_pgsigio);
			PROC_UNLOCK(p);
		}
		/* Drop the sigio lock around crfree/FREE, which may sleep. */
		SIGIO_UNLOCK();
		crfree(sigio->sio_ucred);
		FREE(sigio, M_SIGIO);
		SIGIO_LOCK();
	}
	SIGIO_UNLOCK();
}
674
/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 */
int
fsetown(pgid, sigiop)
	pid_t pgid;
	struct sigio **sigiop;
{
	struct proc *proc;
	struct pgrp *pgrp;
	struct sigio *sigio;
	int ret;

	/* pgid 0 means "clear ownership". */
	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	ret = 0;

	/* Allocate and fill in the new sigio out of locks. */
	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	sigio->sio_myref = sigiop;

	sx_slock(&proctree_lock);
	if (pgid > 0) {
		/* Positive pgid: target is a single process. */
		proc = pfind(pgid);
		if (proc == NULL) {
			ret = ESRCH;
			goto fail;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		PROC_UNLOCK(proc);
		/*
		 * NOTE(review): p_session is read after PROC_UNLOCK;
		 * presumably the shared proctree_lock held above keeps
		 * the session pointer stable here -- confirm.
		 */
		if (proc->p_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		pgrp = NULL;
	} else /* if (pgid < 0) */ {
		/* Negative pgid: target is a process group. */
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			ret = ESRCH;
			goto fail;
		}
		PGRP_UNLOCK(pgrp);

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curthread->td_proc->p_session) {
			ret = EPERM;
			goto fail;
		}

		proc = NULL;
	}
	/* Discard any previous owner before installing the new one. */
	funsetown(sigiop);
	if (pgid > 0) {
		PROC_LOCK(proc);
		/*
		 * Since funsetownlst() is called without the proctree
		 * locked, we need to check for P_WEXIT.
		 * XXX: is ESRCH correct?
		 */
		if ((proc->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(proc);
			ret = ESRCH;
			goto fail;
		}
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		PROC_UNLOCK(proc);
	} else {
		PGRP_LOCK(pgrp);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		PGRP_UNLOCK(pgrp);
	}
	sx_sunlock(&proctree_lock);
	/* Publish the new sigio record under the sigio lock. */
	SIGIO_LOCK();
	*sigiop = sigio;
	SIGIO_UNLOCK();
	return (0);

fail:
	sx_sunlock(&proctree_lock);
	crfree(sigio->sio_ucred);
	FREE(sigio, M_SIGIO);
	return (ret);
}
784
785/*
786 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
787 */
788pid_t
789fgetown(sigiop)
790	struct sigio **sigiop;
791{
792	pid_t pgid;
793
794	SIGIO_LOCK();
795	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
796	SIGIO_UNLOCK();
797	return (pgid);
798}
799
/*
 * Close a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
        int     fd;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
close(td, uap)
	struct thread *td;
	struct close_args *uap;
{
	struct filedesc *fdp;
	struct file *fp;
	int fd, error;
	int holdleaders;

	fd = uap->fd;
	error = 0;
	holdleaders = 0;
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);
	if ((unsigned)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		error = EBADF;
		goto done2;
	}
#if 0
	if (fdp->fd_ofileflags[fd] & UF_MAPPED)
		(void) munmapfd(td, fd);
#endif
	/* Detach the file from the descriptor slot before closing it. */
	fdp->fd_ofiles[fd] = NULL;
	fdp->fd_ofileflags[fd] = 0;
	if (td->td_proc->p_fdtol != NULL) {
		/*
		 * Ask fdfree() to sleep to ensure that all relevant
		 * process leaders can be traversed in closef().
		 */
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	}

	/*
	 * we now hold the fp reference that used to be owned by the descriptor
	 * array.
	 */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	/* Drop any kqueue knotes attached to this descriptor. */
	if (fd < fdp->fd_knlistsize) {
		FILEDESC_UNLOCK(fdp);
		knote_fdclose(td, fd);
	} else
		FILEDESC_UNLOCK(fdp);

	error = closef(fp, td);
done2:
	mtx_unlock(&Giant);
	if (holdleaders) {
		/* Undo the hold taken above and wake any waiting fdfree(). */
		FILEDESC_LOCK(fdp);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			wakeup(&fdp->fd_holdleaderscount);
		}
		FILEDESC_UNLOCK(fdp);
	}
	return (error);
}
878
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Return status information about a file descriptor.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
/*
 * Old-style fstat: stat the descriptor, convert the result to the
 * historic struct ostat layout, and copy it out to userland.
 */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct file *fp;
	struct stat sb;
	struct ostat osb;
	int error;

	mtx_lock(&Giant);
	error = fget(td, uap->fd, &fp);
	if (error != 0)
		goto out;
	error = fo_stat(fp, &sb, td->td_ucred, td);
	if (error == 0) {
		cvtstat(&sb, &osb);
		error = copyout(&osb, uap->sb, sizeof(osb));
	}
	fdrop(fp, td);
out:
	mtx_unlock(&Giant);
	return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
917
918/*
919 * Return status information about a file descriptor.
920 */
921#ifndef _SYS_SYSPROTO_H_
922struct fstat_args {
923	int	fd;
924	struct	stat *sb;
925};
926#endif
927/*
928 * MPSAFE
929 */
930/* ARGSUSED */
931int
932fstat(td, uap)
933	struct thread *td;
934	struct fstat_args *uap;
935{
936	struct file *fp;
937	struct stat ub;
938	int error;
939
940	mtx_lock(&Giant);
941	if ((error = fget(td, uap->fd, &fp)) != 0)
942		goto done2;
943	error = fo_stat(fp, &ub, td->td_ucred, td);
944	if (error == 0)
945		error = copyout(&ub, uap->sb, sizeof(ub));
946	fdrop(fp, td);
947done2:
948	mtx_unlock(&Giant);
949	return (error);
950}
951
952/*
953 * Return status information about a file descriptor.
954 */
955#ifndef _SYS_SYSPROTO_H_
956struct nfstat_args {
957	int	fd;
958	struct	nstat *sb;
959};
960#endif
961/*
962 * MPSAFE
963 */
964/* ARGSUSED */
965int
966nfstat(td, uap)
967	struct thread *td;
968	struct nfstat_args *uap;
969{
970	struct file *fp;
971	struct stat ub;
972	struct nstat nub;
973	int error;
974
975	mtx_lock(&Giant);
976	if ((error = fget(td, uap->fd, &fp)) != 0)
977		goto done2;
978	error = fo_stat(fp, &ub, td->td_ucred, td);
979	if (error == 0) {
980		cvtnstat(&ub, &nub);
981		error = copyout(&nub, uap->sb, sizeof(nub));
982	}
983	fdrop(fp, td);
984done2:
985	mtx_unlock(&Giant);
986	return (error);
987}
988
989/*
990 * Return pathconf information about a file descriptor.
991 */
992#ifndef _SYS_SYSPROTO_H_
993struct fpathconf_args {
994	int	fd;
995	int	name;
996};
997#endif
998/*
999 * MPSAFE
1000 */
1001/* ARGSUSED */
1002int
1003fpathconf(td, uap)
1004	struct thread *td;
1005	struct fpathconf_args *uap;
1006{
1007	struct file *fp;
1008	struct vnode *vp;
1009	int error;
1010
1011	if ((error = fget(td, uap->fd, &fp)) != 0)
1012		return (error);
1013
1014	/* If asynchronous I/O is available, it works for all descriptors. */
1015	if (uap->name == _PC_ASYNC_IO) {
1016		td->td_retval[0] = async_io_version;
1017		goto out;
1018	}
1019	vp = fp->f_vnode;
1020	if (vp != NULL) {
1021		mtx_lock(&Giant);
1022		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1023		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1024		VOP_UNLOCK(vp, 0, td);
1025		mtx_unlock(&Giant);
1026	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1027		if (uap->name != _PC_PIPE_BUF) {
1028			error = EINVAL;
1029		} else {
1030			td->td_retval[0] = PIPE_BUF;
1031		error = 0;
1032		}
1033	} else {
1034		error = EOPNOTSUPP;
1035	}
1036out:
1037	fdrop(fp, td);
1038	return (error);
1039}
1040
/*
 * Allocate a file descriptor for the process.
 */
static int fdexpand;		/* statistics: table expansion count */
SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");

/*
 * Allocate a descriptor slot >= 'want' for the current process, storing
 * the chosen index in *result.  Called with the filedesc lock held; the
 * lock may be dropped and reacquired while the table is being grown.
 * Returns 0 on success or EMFILE when the per-process limit is reached.
 */
int
fdalloc(td, want, result)
	struct thread *td;
	int want;
	int *result;
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	int lim, last, nfiles;
	struct file **newofile, **oldofile;
	char *newofileflags;

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	for (;;) {
		last = min(fdp->fd_nfiles, lim);
		i = max(want, fdp->fd_freefile);
		for (; i < last; i++) {
			if (fdp->fd_ofiles[i] == NULL) {
				/* Found a free slot: claim it. */
				fdp->fd_ofileflags[i] = 0;
				if (i > fdp->fd_lastfile)
					fdp->fd_lastfile = i;
				if (want <= fdp->fd_freefile)
					fdp->fd_freefile = i;
				*result = i;
				return (0);
			}
		}

		/*
		 * No space in current array.  Expand?
		 */
		if (i >= lim)
			return (EMFILE);
		if (fdp->fd_nfiles < NDEXTENT)
			nfiles = NDEXTENT;
		else
			nfiles = 2 * fdp->fd_nfiles;
		while (nfiles < want)
			nfiles <<= 1;
		/* malloc() may sleep, so drop the lock while allocating. */
		FILEDESC_UNLOCK(fdp);
		newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);

		/*
		 * Deal with file-table extend race that might have
		 * occurred while filedesc was unlocked.
		 */
		FILEDESC_LOCK(fdp);
		if (fdp->fd_nfiles >= nfiles) {
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(newofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
			continue;
		}
		/* Flags array lives immediately after the pointer array. */
		newofileflags = (char *) &newofile[nfiles];
		/*
		 * Copy the existing ofile and ofileflags arrays
		 * and zero the new portion of each array.
		 */
		i = fdp->fd_nfiles * sizeof(struct file *);
		bcopy(fdp->fd_ofiles, newofile,	i);
		bzero((char *)newofile + i,
		    nfiles * sizeof(struct file *) - i);
		i = fdp->fd_nfiles * sizeof(char);
		bcopy(fdp->fd_ofileflags, newofileflags, i);
		bzero(newofileflags + i, nfiles * sizeof(char) - i);
		/* The initial NDFILE-sized table is embedded; don't free it. */
		if (fdp->fd_nfiles > NDFILE)
			oldofile = fdp->fd_ofiles;
		else
			oldofile = NULL;
		fdp->fd_ofiles = newofile;
		fdp->fd_ofileflags = newofileflags;
		fdp->fd_nfiles = nfiles;
		fdexpand++;
		if (oldofile != NULL) {
			/* XXX uma_large_free() needs Giant. */
			FILEDESC_UNLOCK(fdp);
			mtx_lock(&Giant);
			free(oldofile, M_FILEDESC);
			mtx_unlock(&Giant);
			FILEDESC_LOCK(fdp);
		}
	}
}
1141
1142/*
1143 * Check to see whether n user file descriptors
1144 * are available to the process p.
1145 */
1146int
1147fdavail(td, n)
1148	struct thread *td;
1149	int n;
1150{
1151	struct proc *p = td->td_proc;
1152	struct filedesc *fdp = td->td_proc->p_fd;
1153	struct file **fpp;
1154	int i, lim, last;
1155
1156	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1157
1158	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
1159	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1160		return (1);
1161	last = min(fdp->fd_nfiles, lim);
1162	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
1163	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
1164		if (*fpp == NULL && --n <= 0)
1165			return (1);
1166	}
1167	return (0);
1168}
1169
/*
 * Create a new open file structure and allocate
 * a file decriptor for the process that refers to it.
 *
 * On success the new file (f_count == 1, badfileops) is returned via
 * *resultfp and its descriptor index via *resultfd, when non-NULL.
 */
int
falloc(td, resultfp, resultfd)
	struct thread *td;
	struct file **resultfp;
	int *resultfd;
{
	struct proc *p = td->td_proc;
	struct file *fp, *fq;
	int error, i;
	int maxuserfiles = maxfiles - (maxfiles / 20);	/* reserve 5% for root */
	static struct timeval lastfail;
	static int curfail;

	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
	sx_xlock(&filelist_lock);
	/* Enforce the global open-file limit (root may use the reserve). */
	if ((nfiles >= maxuserfiles && td->td_ucred->cr_ruid != 0)
	   || nfiles >= maxfiles) {
		/* Rate-limit the console warning to once per second. */
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
				td->td_ucred->cr_ruid);
		}
		sx_xunlock(&filelist_lock);
		uma_zfree(file_zone, fp);
		return (ENFILE);
	}
	nfiles++;

	/*
	 * If the process has file descriptor zero open, add the new file
	 * descriptor to the list of open files at that point, otherwise
	 * put it at the front of the list of open files.
	 */
	fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep);
	fp->f_count = 1;
	fp->f_cred = crhold(td->td_ucred);
	fp->f_ops = &badfileops;
	FILEDESC_LOCK(p->p_fd);
	if ((fq = p->p_fd->fd_ofiles[0])) {
		LIST_INSERT_AFTER(fq, fp, f_list);
	} else {
		LIST_INSERT_HEAD(&filehead, fp, f_list);
	}
	sx_xunlock(&filelist_lock);
	/* fdalloc() may drop and reacquire the filedesc lock. */
	if ((error = fdalloc(td, 0, &i))) {
		FILEDESC_UNLOCK(p->p_fd);
		fdrop(fp, td);
		return (error);
	}
	p->p_fd->fd_ofiles[i] = fp;
	FILEDESC_UNLOCK(p->p_fd);
	if (resultfp)
		*resultfp = fp;
	if (resultfd)
		*resultfd = i;
	return (0);
}
1230
1231/*
1232 * Free a file descriptor.
1233 */
1234void
1235ffree(fp)
1236	struct file *fp;
1237{
1238
1239	KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!"));
1240	sx_xlock(&filelist_lock);
1241	LIST_REMOVE(fp, f_list);
1242	nfiles--;
1243	sx_xunlock(&filelist_lock);
1244	crfree(fp->f_cred);
1245	uma_zfree(file_zone, fp);
1246}
1247
1248/*
1249 * Build a new filedesc structure from another.
1250 * Copy the current, root, and jail root vnode references.
1251 */
1252struct filedesc *
1253fdinit(fdp)
1254	struct filedesc *fdp;
1255{
1256	struct filedesc0 *newfdp;
1257
1258	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
1259	    M_FILEDESC, M_WAITOK | M_ZERO);
1260	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
1261	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1262	if (newfdp->fd_fd.fd_cdir)
1263		VREF(newfdp->fd_fd.fd_cdir);
1264	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1265	if (newfdp->fd_fd.fd_rdir)
1266		VREF(newfdp->fd_fd.fd_rdir);
1267	newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1268	if (newfdp->fd_fd.fd_jdir)
1269		VREF(newfdp->fd_fd.fd_jdir);
1270
1271	/* Create the file descriptor table. */
1272	newfdp->fd_fd.fd_refcnt = 1;
1273	newfdp->fd_fd.fd_cmask = cmask;
1274	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1275	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
1276	newfdp->fd_fd.fd_nfiles = NDFILE;
1277	newfdp->fd_fd.fd_knlistsize = -1;
1278	return (&newfdp->fd_fd);
1279}
1280
/*
 * Share a filedesc structure.
 *
 * Takes an additional reference on 'fdp' under its lock and returns
 * the same table.
 */
struct filedesc *
fdshare(fdp)
	struct filedesc *fdp;
{
	FILEDESC_LOCK(fdp);
	fdp->fd_refcnt++;
	FILEDESC_UNLOCK(fdp);
	return (fdp);
}
1293
/*
 * Copy a filedesc structure.
 * A NULL pointer in returns a NULL reference, this is to ease callers,
 * not catch errors.
 *
 * Entered with 'fdp' locked (asserted below); the lock is dropped and
 * reacquired around sleeping allocations and is held again on return.
 */
struct filedesc *
fdcopy(fdp)
	struct filedesc *fdp;
{
	struct filedesc *newfdp;
	struct file **fpp;
	int i, j;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return (NULL);

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/* MALLOC may sleep, so drop the table lock around it. */
	FILEDESC_UNLOCK(fdp);
	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
	    M_FILEDESC, M_WAITOK);
	FILEDESC_LOCK(fdp);
	bcopy(fdp, newfdp, sizeof(struct filedesc));
	FILEDESC_UNLOCK(fdp);
	/* The copy must have its own, unheld mutex. */
	bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx));
	mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
	/* bcopy() duplicated the directory pointers; add our references. */
	if (newfdp->fd_cdir)
		VREF(newfdp->fd_cdir);
	if (newfdp->fd_rdir)
		VREF(newfdp->fd_rdir);
	if (newfdp->fd_jdir)
		VREF(newfdp->fd_jdir);
	newfdp->fd_refcnt = 1;

	/*
	 * If the number of open files fits in the internal arrays
	 * of the open file structure, use them, otherwise allocate
	 * additional memory for the number of descriptors currently
	 * in use.
	 */
	FILEDESC_LOCK(fdp);
	newfdp->fd_lastfile = fdp->fd_lastfile;
	newfdp->fd_nfiles = fdp->fd_nfiles;
	if (newfdp->fd_lastfile < NDFILE) {
		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
		newfdp->fd_ofileflags =
		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
		i = NDFILE;
	} else {
		/*
		 * Compute the smallest multiple of NDEXTENT needed
		 * for the file descriptors currently in use,
		 * allowing the table to shrink.
		 */
retry:
		i = newfdp->fd_nfiles;
		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
			i /= 2;
		/*
		 * MALLOC may sleep with the lock dropped; afterwards
		 * recompute the size and retry if the source table
		 * changed underneath us.
		 */
		FILEDESC_UNLOCK(fdp);
		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
		    M_FILEDESC, M_WAITOK);
		FILEDESC_LOCK(fdp);
		newfdp->fd_lastfile = fdp->fd_lastfile;
		newfdp->fd_nfiles = fdp->fd_nfiles;
		j = newfdp->fd_nfiles;
		while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2)
			j /= 2;
		if (i != j) {
			/*
			 * The size of the original table has changed.
			 * Go over once again.
			 */
			FILEDESC_UNLOCK(fdp);
			FREE(newfdp->fd_ofiles, M_FILEDESC);
			FILEDESC_LOCK(fdp);
			newfdp->fd_lastfile = fdp->fd_lastfile;
			newfdp->fd_nfiles = fdp->fd_nfiles;
			goto retry;
		}
		/* The flag bytes live directly after the pointer array. */
		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
	}
	newfdp->fd_nfiles = i;
	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));

	/*
	 * kq descriptors cannot be copied.
	 */
	if (newfdp->fd_knlistsize != -1) {
		/* Walk downward, dropping kqueues and trimming fd_lastfile. */
		fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile];
		for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) {
			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) {
				*fpp = NULL;
				if (i < newfdp->fd_freefile)
					newfdp->fd_freefile = i;
			}
			if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0)
				newfdp->fd_lastfile--;
		}
		newfdp->fd_knlist = NULL;
		newfdp->fd_knlistsize = -1;
		newfdp->fd_knhash = NULL;
		newfdp->fd_knhashmask = 0;
	}

	/* Take a reference on every file we copied; fdp is still locked. */
	fpp = newfdp->fd_ofiles;
	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) {
		if (*fpp != NULL)
			fhold(*fpp);
	}
	return (newfdp);
}
1407
/*
 * A mutex to protect the association between a proc and filedesc
 * (p->p_fd); fdfree() clears the pointer under it and
 * sysctl_kern_file() reads it under it.
 */
struct mtx	fdesc_mtx;
MTX_SYSINIT(fdesc, &fdesc_mtx, "fdesc", MTX_DEF);
1411
1412/*
1413 * Release a filedesc structure.
1414 */
1415void
1416fdfree(td)
1417	struct thread *td;
1418{
1419	struct filedesc *fdp;
1420	struct file **fpp;
1421	int i;
1422	struct filedesc_to_leader *fdtol;
1423	struct file *fp;
1424	struct vnode *vp;
1425	struct flock lf;
1426
1427	/* Certain daemons might not have file descriptors. */
1428	fdp = td->td_proc->p_fd;
1429	if (fdp == NULL)
1430		return;
1431
1432	/* Check for special need to clear POSIX style locks */
1433	fdtol = td->td_proc->p_fdtol;
1434	if (fdtol != NULL) {
1435		FILEDESC_LOCK(fdp);
1436		KASSERT(fdtol->fdl_refcount > 0,
1437			("filedesc_to_refcount botch: fdl_refcount=%d",
1438			 fdtol->fdl_refcount));
1439		if (fdtol->fdl_refcount == 1 &&
1440		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1441			i = 0;
1442			fpp = fdp->fd_ofiles;
1443			for (i = 0, fpp = fdp->fd_ofiles;
1444			     i < fdp->fd_lastfile;
1445			     i++, fpp++) {
1446				if (*fpp == NULL ||
1447				    (*fpp)->f_type != DTYPE_VNODE)
1448					continue;
1449				fp = *fpp;
1450				fhold(fp);
1451				FILEDESC_UNLOCK(fdp);
1452				lf.l_whence = SEEK_SET;
1453				lf.l_start = 0;
1454				lf.l_len = 0;
1455				lf.l_type = F_UNLCK;
1456				vp = fp->f_vnode;
1457				(void) VOP_ADVLOCK(vp,
1458						   (caddr_t)td->td_proc->
1459						   p_leader,
1460						   F_UNLCK,
1461						   &lf,
1462						   F_POSIX);
1463				FILEDESC_LOCK(fdp);
1464				fdrop(fp, td);
1465				fpp = fdp->fd_ofiles + i;
1466			}
1467		}
1468	retry:
1469		if (fdtol->fdl_refcount == 1) {
1470			if (fdp->fd_holdleaderscount > 0 &&
1471			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1472				/*
1473				 * close() or do_dup() has cleared a reference
1474				 * in a shared file descriptor table.
1475				 */
1476				fdp->fd_holdleaderswakeup = 1;
1477				msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx,
1478				       PLOCK, "fdlhold", 0);
1479				goto retry;
1480			}
1481			if (fdtol->fdl_holdcount > 0) {
1482				/*
1483				 * Ensure that fdtol->fdl_leader
1484				 * remains valid in closef().
1485				 */
1486				fdtol->fdl_wakeup = 1;
1487				msleep(fdtol, &fdp->fd_mtx,
1488				       PLOCK, "fdlhold", 0);
1489				goto retry;
1490			}
1491		}
1492		fdtol->fdl_refcount--;
1493		if (fdtol->fdl_refcount == 0 &&
1494		    fdtol->fdl_holdcount == 0) {
1495			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
1496			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
1497		} else
1498			fdtol = NULL;
1499		td->td_proc->p_fdtol = NULL;
1500		FILEDESC_UNLOCK(fdp);
1501		if (fdtol != NULL)
1502			FREE(fdtol, M_FILEDESC_TO_LEADER);
1503	}
1504	FILEDESC_LOCK(fdp);
1505	if (--fdp->fd_refcnt > 0) {
1506		FILEDESC_UNLOCK(fdp);
1507		return;
1508	}
1509
1510	/*
1511	 * We are the last reference to the structure, so we can
1512	 * safely assume it will not change out from under us.
1513	 */
1514	FILEDESC_UNLOCK(fdp);
1515	fpp = fdp->fd_ofiles;
1516	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
1517		if (*fpp)
1518			(void) closef(*fpp, td);
1519	}
1520
1521	/* XXX This should happen earlier. */
1522	mtx_lock(&fdesc_mtx);
1523	td->td_proc->p_fd = NULL;
1524	mtx_unlock(&fdesc_mtx);
1525
1526	if (fdp->fd_nfiles > NDFILE)
1527		FREE(fdp->fd_ofiles, M_FILEDESC);
1528	if (fdp->fd_cdir)
1529		vrele(fdp->fd_cdir);
1530	if (fdp->fd_rdir)
1531		vrele(fdp->fd_rdir);
1532	if (fdp->fd_jdir)
1533		vrele(fdp->fd_jdir);
1534	if (fdp->fd_knlist)
1535		FREE(fdp->fd_knlist, M_KQUEUE);
1536	if (fdp->fd_knhash)
1537		FREE(fdp->fd_knhash, M_KQUEUE);
1538	mtx_destroy(&fdp->fd_mtx);
1539	FREE(fdp, M_FILEDESC);
1540}
1541
1542/*
1543 * For setugid programs, we don't want to people to use that setugidness
1544 * to generate error messages which write to a file which otherwise would
1545 * otherwise be off-limits to the process.  We check for filesystems where
1546 * the vnode can change out from under us after execve (like [lin]procfs).
1547 *
1548 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1549 * sufficient.  We also don't for check setugidness since we know we are.
1550 */
1551static int
1552is_unsafe(struct file *fp)
1553{
1554	if (fp->f_type == DTYPE_VNODE) {
1555		struct vnode *vp = fp->f_vnode;
1556
1557		if ((vp->v_vflag & VV_PROCDEP) != 0)
1558			return (1);
1559	}
1560	return (0);
1561}
1562
/*
 * Make this setguid thing safe, if at all possible.
 *
 * Closes any of descriptors 0..2 in the current process that reference
 * an "unsafe" vnode (see is_unsafe()).
 */
void
setugidsafety(td)
	struct thread *td;
{
	struct filedesc *fdp;
	int i;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

	/*
	 * Note: fdp->fd_ofiles may be reallocated out from under us while
	 * we are blocked in a close.  Be careful!
	 */
	FILEDESC_LOCK(fdp);
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		if (i > 2)
			break;
		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
			struct file *fp;

#if 0
			if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0)
				(void) munmapfd(td, i);
#endif
			/* Detach knotes before the descriptor goes away. */
			if (i < fdp->fd_knlistsize) {
				FILEDESC_UNLOCK(fdp);
				knote_fdclose(td, i);
				FILEDESC_LOCK(fdp);
			}
			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			fp = fdp->fd_ofiles[i];
			fdp->fd_ofiles[i] = NULL;
			fdp->fd_ofileflags[i] = 0;
			if (i < fdp->fd_freefile)
				fdp->fd_freefile = i;
			FILEDESC_UNLOCK(fdp);
			(void) closef(fp, td);
			FILEDESC_LOCK(fdp);
		}
	}
	/* Trim fd_lastfile back past any slots we just cleared. */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	FILEDESC_UNLOCK(fdp);
}
1616
/*
 * Close any files on exec?
 *
 * Closes every descriptor in the current process's table whose
 * UF_EXCLOSE (close-on-exec) flag is set.
 */
void
fdcloseexec(td)
	struct thread *td;
{
	struct filedesc *fdp;
	int i;

	/* Certain daemons might not have file descriptors. */
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

	FILEDESC_LOCK(fdp);

	/*
	 * We cannot cache fd_ofiles or fd_ofileflags since operations
	 * may block and rip them out from under us.
	 */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		if (fdp->fd_ofiles[i] != NULL &&
		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
			struct file *fp;

#if 0
			if (fdp->fd_ofileflags[i] & UF_MAPPED)
				(void) munmapfd(td, i);
#endif
			/* Detach knotes before the descriptor goes away. */
			if (i < fdp->fd_knlistsize) {
				FILEDESC_UNLOCK(fdp);
				knote_fdclose(td, i);
				FILEDESC_LOCK(fdp);
			}
			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			fp = fdp->fd_ofiles[i];
			fdp->fd_ofiles[i] = NULL;
			fdp->fd_ofileflags[i] = 0;
			if (i < fdp->fd_freefile)
				fdp->fd_freefile = i;
			FILEDESC_UNLOCK(fdp);
			(void) closef(fp, td);
			FILEDESC_LOCK(fdp);
		}
	}
	/* Trim fd_lastfile back past any slots we just cleared. */
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	FILEDESC_UNLOCK(fdp);
}
1670
/*
 * It is unsafe for set[ug]id processes to be started with file
 * descriptors 0..2 closed, as these descriptors are given implicit
 * significance in the Standard C library.  fdcheckstd() will create a
 * descriptor referencing /dev/null for each of stdin, stdout, and
 * stderr that is not already open.
 *
 * Returns 0 on success or the first error from falloc()/vn_open()/
 * do_dup().
 */
int
fdcheckstd(td)
	struct thread *td;
{
	struct nameidata nd;
	struct filedesc *fdp;
	struct file *fp;
	register_t retval;
	int fd, i, error, flags, devnull;

	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return (0);
	devnull = -1;	/* fd of /dev/null once it has been opened */
	error = 0;
	for (i = 0; i < 3; i++) {
		if (fdp->fd_ofiles[i] != NULL)
			continue;
		if (devnull < 0) {
			/* First hole: open /dev/null into this slot. */
			error = falloc(td, &fp, &fd);
			if (error != 0)
				break;
			/* falloc() hands out the lowest free fd, i.e. i. */
			KASSERT(fd == i, ("oof, we didn't get our fd"));
			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
			    td);
			flags = FREAD | FWRITE;
			error = vn_open(&nd, &flags, 0, -1);
			if (error != 0) {
				/* Undo falloc(): clear the slot, drop fp. */
				FILEDESC_LOCK(fdp);
				fdp->fd_ofiles[fd] = NULL;
				FILEDESC_UNLOCK(fdp);
				fdrop(fp, td);
				break;
			}
			NDFREE(&nd, NDF_ONLY_PNBUF);
			fp->f_vnode = nd.ni_vp;
			fp->f_data = nd.ni_vp;
			fp->f_flag = flags;
			fp->f_ops = &vnops;
			fp->f_type = DTYPE_VNODE;
			VOP_UNLOCK(nd.ni_vp, 0, td);
			devnull = fd;
		} else {
			/* Later holes: just dup the already-open fd. */
			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
			if (error != 0)
				break;
		}
	}
	return (error);
}
1728
/*
 * Internal form of close.
 * Decrement reference count on file structure.
 * Note: td may be NULL when closing a file
 * that was being passed in a message.
 */
int
closef(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;
	struct filedesc *fdp;

	if (fp == NULL)
		return (0);
	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if (td != NULL &&
	    fp->f_type == DTYPE_VNODE) {
		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
			/* Release our own leader's POSIX locks on the vnode. */
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			vp = fp->f_vnode;
			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
					   F_UNLCK, &lf, F_POSIX);
		}
		fdtol = td->td_proc->p_fdtol;
		if (fdtol != NULL) {
			/*
			 * Handle special case where file descriptor table
			 * is shared between multiple process leaders.
			 */
			fdp = td->td_proc->p_fd;
			FILEDESC_LOCK(fdp);
			for (fdtol = fdtol->fdl_next;
			     fdtol != td->td_proc->p_fdtol;
			     fdtol = fdtol->fdl_next) {
				if ((fdtol->fdl_leader->p_flag &
				     P_ADVLOCK) == 0)
					continue;
				/*
				 * Hold the record so fdl_leader stays
				 * valid while the filedesc lock is
				 * dropped for the (possibly sleeping)
				 * unlock; fdfree() waits on fdl_wakeup.
				 */
				fdtol->fdl_holdcount++;
				FILEDESC_UNLOCK(fdp);
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = fp->f_vnode;
				(void) VOP_ADVLOCK(vp,
						   (caddr_t)fdtol->fdl_leader,
						   F_UNLCK, &lf, F_POSIX);
				FILEDESC_LOCK(fdp);
				fdtol->fdl_holdcount--;
				if (fdtol->fdl_holdcount == 0 &&
				    fdtol->fdl_wakeup != 0) {
					/* fdfree() is sleeping on fdtol. */
					fdtol->fdl_wakeup = 0;
					wakeup(fdtol);
				}
			}
			FILEDESC_UNLOCK(fdp);
		}
	}
	return (fdrop(fp, td));
}
1803
/*
 * Drop reference on struct file passed in, may call closef if the
 * reference hits zero.
 */
int
fdrop(fp, td)
	struct file *fp;
	struct thread *td;
{

	/* fdrop_locked() expects the file lock held and will unlock it. */
	FILE_LOCK(fp);
	return (fdrop_locked(fp, td));
}
1817
1818/*
1819 * Extract the file pointer associated with the specified descriptor for
1820 * the current user process.
1821 *
1822 * If the descriptor doesn't exist, EBADF is returned.
1823 *
1824 * If the descriptor exists but doesn't match 'flags' then
1825 * return EBADF for read attempts and EINVAL for write attempts.
1826 *
1827 * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
1828 * It should be droped with fdrop().
1829 * If it is not set, then the refcount will not be bumped however the
1830 * thread's filedesc struct will be returned locked (for fgetsock).
1831 *
1832 * If an error occured the non-zero error is returned and *fpp is set to NULL.
1833 * Otherwise *fpp is set and zero is returned.
1834 */
1835static __inline int
1836_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
1837{
1838	struct filedesc *fdp;
1839	struct file *fp;
1840
1841	*fpp = NULL;
1842	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
1843		return (EBADF);
1844	FILEDESC_LOCK(fdp);
1845	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
1846		FILEDESC_UNLOCK(fdp);
1847		return (EBADF);
1848	}
1849
1850	/*
1851	 * Note: FREAD failures returns EBADF to maintain backwards
1852	 * compatibility with what routines returned before.
1853	 *
1854	 * Only one flag, or 0, may be specified.
1855	 */
1856	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
1857		FILEDESC_UNLOCK(fdp);
1858		return (EBADF);
1859	}
1860	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
1861		FILEDESC_UNLOCK(fdp);
1862		return (EINVAL);
1863	}
1864	if (hold) {
1865		fhold(fp);
1866		FILEDESC_UNLOCK(fdp);
1867	}
1868	*fpp = fp;
1869	return (0);
1870}
1871
/* Hold and return the file for 'fd'; no access-mode check. */
int
fget(struct thread *td, int fd, struct file **fpp)
{

	return(_fget(td, fd, fpp, 0, 1));
}
1878
/* Like fget(), but fails with EBADF if the file lacks FREAD. */
int
fget_read(struct thread *td, int fd, struct file **fpp)
{

	return(_fget(td, fd, fpp, FREAD, 1));
}
1885
/* Like fget(), but fails with EINVAL if the file lacks FWRITE. */
int
fget_write(struct thread *td, int fd, struct file **fpp)
{

	return(_fget(td, fd, fpp, FWRITE, 1));
}
1892
1893/*
1894 * Like fget() but loads the underlying vnode, or returns an error if
1895 * the descriptor does not represent a vnode.  Note that pipes use vnodes
1896 * but never have VM objects (so VOP_GETVOBJECT() calls will return an
1897 * error).  The returned vnode will be vref()d.
1898 */
1899static __inline int
1900_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
1901{
1902	struct file *fp;
1903	int error;
1904
1905	*vpp = NULL;
1906	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1907		return (error);
1908	if (fp->f_vnode == NULL) {
1909		error = EINVAL;
1910	} else {
1911		*vpp = fp->f_vnode;
1912		vref(*vpp);
1913	}
1914	FILEDESC_UNLOCK(td->td_proc->p_fd);
1915	return (error);
1916}
1917
/* Return the vref()d vnode behind 'fd'; no access-mode check. */
int
fgetvp(struct thread *td, int fd, struct vnode **vpp)
{

	return (_fgetvp(td, fd, vpp, 0));
}
1924
/* Like fgetvp(), passing FREAD to _fgetvp(). */
int
fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
{

	return (_fgetvp(td, fd, vpp, FREAD));
}
1931
/* Like fgetvp(), passing FWRITE to _fgetvp(). */
int
fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
{

	return (_fgetvp(td, fd, vpp, FWRITE));
}
1938
1939/*
1940 * Like fget() but loads the underlying socket, or returns an error if
1941 * the descriptor does not represent a socket.
1942 *
1943 * We bump the ref count on the returned socket.  XXX Also obtain the SX
1944 * lock in the future.
1945 */
1946int
1947fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
1948{
1949	struct file *fp;
1950	int error;
1951
1952	*spp = NULL;
1953	if (fflagp != NULL)
1954		*fflagp = 0;
1955	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1956		return (error);
1957	if (fp->f_type != DTYPE_SOCKET) {
1958		error = ENOTSOCK;
1959	} else {
1960		*spp = fp->f_data;
1961		if (fflagp)
1962			*fflagp = fp->f_flag;
1963		soref(*spp);
1964	}
1965	FILEDESC_UNLOCK(td->td_proc->p_fd);
1966	return (error);
1967}
1968
/*
 * Drop the reference count on the socket and XXX release the SX lock in
 * the future.  The last reference closes the socket.
 */
void
fputsock(struct socket *so)
{

	sorele(so);
}
1979
/*
 * Drop reference on struct file passed in, may call closef if the
 * reference hits zero.
 * Expects struct file locked, and will unlock it.
 */
int
fdrop_locked(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct flock lf;
	struct vnode *vp;
	int error;

	FILE_LOCK_ASSERT(fp, MA_OWNED);

	if (--fp->f_count > 0) {
		FILE_UNLOCK(fp);
		return (0);
	}
	/* We have the last ref so we can proceed without the file lock. */
	FILE_UNLOCK(fp);
	mtx_lock(&Giant);
	if (fp->f_count < 0)
		panic("fdrop: count < 0");
	/* Release any flock()-style lock before tearing the file down. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		vp = fp->f_vnode;
		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
	}
	/* A file still wearing badfileops was never fully opened. */
	if (fp->f_ops != &badfileops)
		error = fo_close(fp, td);
	else
		error = 0;
	ffree(fp);
	mtx_unlock(&Giant);
	return (error);
}
2021
2022/*
2023 * Apply an advisory lock on a file descriptor.
2024 *
2025 * Just attempt to get a record lock of the requested type on
2026 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2027 */
2028#ifndef _SYS_SYSPROTO_H_
2029struct flock_args {
2030	int	fd;
2031	int	how;
2032};
2033#endif
2034/*
2035 * MPSAFE
2036 */
2037/* ARGSUSED */
int
flock(td, uap)
	struct thread *td;
	struct flock_args *uap;
{
	struct file *fp;
	struct vnode *vp;
	struct flock lf;
	int error;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* flock() only operates on vnodes. */
	if (fp->f_type != DTYPE_VNODE) {
		fdrop(fp, td);
		return (EOPNOTSUPP);
	}

	mtx_lock(&Giant);
	vp = fp->f_vnode;
	/* Cover the whole file: SEEK_SET / start 0 / len 0. */
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (uap->how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		FILE_LOCK(fp);
		fp->f_flag &= ~FHASLOCK;
		FILE_UNLOCK(fp);
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
		goto done2;
	}
	if (uap->how & LOCK_EX)
		lf.l_type = F_WRLCK;
	else if (uap->how & LOCK_SH)
		lf.l_type = F_RDLCK;
	else {
		error = EBADF;
		goto done2;
	}
	/* Mark the lock on the file so fdrop_locked() releases it. */
	FILE_LOCK(fp);
	fp->f_flag |= FHASLOCK;
	FILE_UNLOCK(fp);
	/* LOCK_NB requests a non-blocking attempt. */
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
	fdrop(fp, td);
	mtx_unlock(&Giant);
	return (error);
}
2086
2087/*
2088 * File Descriptor pseudo-device driver (/dev/fd/).
2089 *
2090 * Opening minor device N dup()s the file (if any) connected to file
2091 * descriptor N belonging to the calling process.  Note that this driver
2092 * consists of only the ``open()'' routine, because all subsequent
2093 * references to this file will be direct to the other driver.
2094 */
2095/* ARGSUSED */
/*
 * Open handler for /dev/fd/N: never actually opens anything; see the
 * comment below for how the real work happens in dupfdopen().
 */
static int
fdopen(dev, mode, type, td)
	dev_t dev;
	int mode, type;
	struct thread *td;
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * file descriptor being sought for duplication. The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open. Open will detect this special error and take the
	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}
2114
/*
 * Duplicate the specified descriptor to a free descriptor.
 *
 * Called from the open path after fdopen() has flagged the request by
 * way of the 'error' argument (ENODEV or ENXIO); 'dfd' is the fd to
 * duplicate and 'indx' the slot already reserved for the open.
 */
int
dupfdopen(td, fdp, indx, dfd, mode, error)
	struct thread *td;
	struct filedesc *fdp;
	int indx, dfd;
	int mode;
	int error;
{
	struct file *wfp;
	struct file *fp;

	/*
	 * If the to-be-dup'd fd number is greater than the allowed number
	 * of file descriptors, or the fd to be dup'd has already been
	 * closed, then reject.
	 */
	FILEDESC_LOCK(fdp);
	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (EBADF);
	}

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup (dfd) to file descriptor
	 * (indx) and return.
	 *
	 * For ENXIO steal away the file structure from (dfd) and
	 * store it in (indx).  (dfd) is effectively closed by
	 * this operation.
	 *
	 * Any other error code is just returned.
	 */
	switch (error) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		FILE_LOCK(wfp);
		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
			FILE_UNLOCK(wfp);
			FILEDESC_UNLOCK(fdp);
			return (EACCES);
		}
		/* Remember whatever was sitting in slot indx. */
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		fdp->fd_ofiles[indx] = wfp;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fhold_locked(wfp);
		FILE_UNLOCK(wfp);
		if (indx > fdp->fd_lastfile)
			fdp->fd_lastfile = indx;
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);
		/*
		 * We now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	case ENXIO:
		/*
		 * Steal away the file pointer from dfd and stuff it into indx.
		 */
		fp = fdp->fd_ofiles[indx];
#if 0
		if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED)
			(void) munmapfd(td, indx);
#endif
		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
		fdp->fd_ofiles[dfd] = NULL;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fdp->fd_ofileflags[dfd] = 0;

		/*
		 * Complete the clean up of the filedesc structure by
		 * recomputing the various hints.
		 */
		if (indx > fdp->fd_lastfile) {
			fdp->fd_lastfile = indx;
		} else {
			while (fdp->fd_lastfile > 0 &&
			   fdp->fd_ofiles[fdp->fd_lastfile] == NULL) {
				fdp->fd_lastfile--;
			}
			if (dfd < fdp->fd_freefile)
				fdp->fd_freefile = dfd;
		}
		if (fp != NULL)
			FILE_LOCK(fp);
		FILEDESC_UNLOCK(fdp);

		/*
		 * we now own the reference to fp that the ofiles[] array
		 * used to own.  Release it.
		 */
		if (fp != NULL)
			fdrop_locked(fp, td);
		return (0);

	default:
		FILEDESC_UNLOCK(fdp);
		return (error);
	}
	/* NOTREACHED */
}
2233
2234
2235struct filedesc_to_leader *
2236filedesc_to_leader_alloc(struct filedesc_to_leader *old,
2237			 struct filedesc *fdp,
2238			 struct proc *leader)
2239{
2240	struct filedesc_to_leader *fdtol;
2241
2242	MALLOC(fdtol, struct filedesc_to_leader *,
2243	       sizeof(struct filedesc_to_leader),
2244	       M_FILEDESC_TO_LEADER,
2245	       M_WAITOK);
2246	fdtol->fdl_refcount = 1;
2247	fdtol->fdl_holdcount = 0;
2248	fdtol->fdl_wakeup = 0;
2249	fdtol->fdl_leader = leader;
2250	if (old != NULL) {
2251		FILEDESC_LOCK(fdp);
2252		fdtol->fdl_next = old->fdl_next;
2253		fdtol->fdl_prev = old;
2254		old->fdl_next = fdtol;
2255		fdtol->fdl_next->fdl_prev = fdtol;
2256		FILEDESC_UNLOCK(fdp);
2257	} else {
2258		fdtol->fdl_next = fdtol;
2259		fdtol->fdl_prev = fdtol;
2260	}
2261	return fdtol;
2262}
2263
/*
 * Get file structures.
 *
 * Sysctl handler for kern.file: exports one struct xfile per open
 * descriptor of every process visible to the caller.
 */
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct xfile xf;
	struct filedesc *fdp;
	struct file *fp;
	struct proc *p;
	int error, n;

	/*
	 * Note: because the number of file descriptors is calculated
	 * in different ways for sizing vs returning the data,
	 * there is information leakage from the first loop.  However,
	 * it is of a similar order of magnitude to the leakage from
	 * global system statistics such as kern.openfiles.
	 */
	sysctl_wire_old_buffer(req, 0);
	if (req->oldptr == NULL) {
		/* Sizing pass: sum per-file reference counts. */
		n = 16;		/* A slight overestimate. */
		sx_slock(&filelist_lock);
		LIST_FOREACH(fp, &filehead, f_list) {
			/*
			 * We should grab the lock, but this is an
			 * estimate, so does it really matter?
			 */
			/* mtx_lock(fp->f_mtxp); */
			n += fp->f_count;
			/* mtx_unlock(f->f_mtxp); */
		}
		sx_sunlock(&filelist_lock);
		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
	}
	error = 0;
	bzero(&xf, sizeof(xf));
	xf.xf_size = sizeof(xf);
	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		PROC_LOCK(p);
		/* Skip processes the requesting thread may not see. */
		if (p_cansee(req->td, p) != 0) {
			PROC_UNLOCK(p);
			continue;
		}
		xf.xf_pid = p->p_pid;
		xf.xf_uid = p->p_ucred->cr_uid;
		PROC_UNLOCK(p);
		/* fdesc_mtx keeps p->p_fd from being torn down under us. */
		mtx_lock(&fdesc_mtx);
		if ((fdp = p->p_fd) == NULL) {
			mtx_unlock(&fdesc_mtx);
			continue;
		}
		FILEDESC_LOCK(fdp);
		for (n = 0; n < fdp->fd_nfiles; ++n) {
			if ((fp = fdp->fd_ofiles[n]) == NULL)
				continue;
			xf.xf_fd = n;
			xf.xf_file = fp;
			xf.xf_data = fp->f_data;
			xf.xf_type = fp->f_type;
			xf.xf_count = fp->f_count;
			xf.xf_msgcount = fp->f_msgcount;
			xf.xf_offset = fp->f_offset;
			xf.xf_flag = fp->f_flag;
			error = SYSCTL_OUT(req, &xf, sizeof(xf));
			if (error)
				break;
		}
		FILEDESC_UNLOCK(fdp);
		mtx_unlock(&fdesc_mtx);
		if (error)
			break;
	}
	sx_sunlock(&allproc_lock);
	return (error);
}
2341
/* Sysctl knobs: the file table itself plus its tunable limits. */
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");

SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    &nfiles, 0, "System-wide number of open files");
2353
/*
 * Create the /dev/fd/0..2 device nodes plus the conventional
 * stdin/stdout/stderr aliases.
 */
static void
fildesc_drvinit(void *unused)
{
	dev_t dev;

	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
	make_dev_alias(dev, "stdin");
	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
	make_dev_alias(dev, "stdout");
	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
	make_dev_alias(dev, "stderr");
}
2366
static fo_rdwr_t	badfo_readwrite;
static fo_ioctl_t	badfo_ioctl;
static fo_poll_t	badfo_poll;
static fo_kqfilter_t	badfo_kqfilter;
static fo_stat_t	badfo_stat;
static fo_close_t	badfo_close;

/*
 * Fileops installed on a freshly allocated struct file before a real
 * backing object is attached (see falloc() and fdcheckstd()); every
 * operation fails or is a no-op.  fdrop_locked() also tests against
 * this table to avoid fo_close() on a never-opened file.
 */
struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
};
2383
2384static int
2385badfo_readwrite(fp, uio, active_cred, flags, td)
2386	struct file *fp;
2387	struct uio *uio;
2388	struct ucred *active_cred;
2389	struct thread *td;
2390	int flags;
2391{
2392
2393	return (EBADF);
2394}
2395
2396static int
2397badfo_ioctl(fp, com, data, active_cred, td)
2398	struct file *fp;
2399	u_long com;
2400	void *data;
2401	struct ucred *active_cred;
2402	struct thread *td;
2403{
2404
2405	return (EBADF);
2406}
2407
/* Polling an uninitialized descriptor reports no events. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
2418
/* kqfilter on an uninitialized descriptor: nothing to attach. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (0);
}
2427
2428static int
2429badfo_stat(fp, sb, active_cred, td)
2430	struct file *fp;
2431	struct stat *sb;
2432	struct ucred *active_cred;
2433	struct thread *td;
2434{
2435
2436	return (EBADF);
2437}
2438
2439static int
2440badfo_close(fp, td)
2441	struct file *fp;
2442	struct thread *td;
2443{
2444
2445	return (EBADF);
2446}
2447
/* Create the /dev/fd nodes once device drivers are up. */
SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
					fildesc_drvinit,NULL)

static void filelistinit(void *);
/* Set up the file zone and global locks during lock initialization. */
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
2453
2454/* ARGSUSED*/
2455static void
2456filelistinit(dummy)
2457	void *dummy;
2458{
2459
2460	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
2461	    NULL, NULL, UMA_ALIGN_PTR, 0);
2462	sx_init(&filelist_lock, "filelist lock");
2463	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
2464}
2465