kern_descrip.c revision 126080
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 126080 2004-02-21 21:10:55Z phk $");
43
44#include "opt_compat.h"
45
46#include <sys/param.h>
47#include <sys/limits.h>
48#include <sys/systm.h>
49#include <sys/syscallsubr.h>
50#include <sys/sysproto.h>
51#include <sys/conf.h>
52#include <sys/filedesc.h>
53#include <sys/lock.h>
54#include <sys/kernel.h>
55#include <sys/limits.h>
56#include <sys/malloc.h>
57#include <sys/mutex.h>
58#include <sys/sysctl.h>
59#include <sys/vnode.h>
60#include <sys/mount.h>
61#include <sys/proc.h>
62#include <sys/namei.h>
63#include <sys/file.h>
64#include <sys/stat.h>
65#include <sys/filio.h>
66#include <sys/fcntl.h>
67#include <sys/unistd.h>
68#include <sys/resourcevar.h>
69#include <sys/event.h>
70#include <sys/sx.h>
71#include <sys/socketvar.h>
72#include <sys/signalvar.h>
73
74#include <vm/vm.h>
75#include <vm/vm_extern.h>
76#include <vm/uma.h>
77
78static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
79static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
80		     "file desc to leader structures");
81static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
82
83static uma_zone_t file_zone;
84
85static	 d_open_t  fdopen;
86#define	NUMFDESC 64
87
88#define	CDEV_MAJOR 22
89static struct cdevsw fildesc_cdevsw = {
90	.d_version =	D_VERSION,
91	.d_flags =	D_NEEDGIANT,
92	.d_open =	fdopen,
93	.d_name =	"FD",
94	.d_maj =	CDEV_MAJOR,
95};
96
97/* How to treat 'new' parameter when allocating a fd for do_dup(). */
98enum dup_type { DUP_VARIABLE, DUP_FIXED };
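
/*
 * Illustrative mapping from the userland interfaces onto do_dup() below
 * (a sketch of intent, not additional kernel code):
 *
 *	dup(fd)                  -> do_dup(td, DUP_VARIABLE, fd, 0, ...)
 *	dup2(from, to)           -> do_dup(td, DUP_FIXED, from, to, ...)
 *	fcntl(fd, F_DUPFD, min)  -> do_dup(td, DUP_VARIABLE, fd, min, ...)
 *
 * DUP_FIXED means 'new' is the exact descriptor to return; DUP_VARIABLE
 * means 'new' is only a lower bound on the descriptor to allocate.
 */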
99
100static int do_dup(struct thread *td, enum dup_type type, int old, int new,
101    register_t *retval);
102static int	fd_first_free(struct filedesc *, int, int);
103static int	fd_last_used(struct filedesc *, int, int);
104static void	fdgrowtable(struct filedesc *, int);
105
106/*
107 * Descriptor management.
108 */
109struct filelist filehead;	/* head of list of open files */
110int nfiles;			/* actual number of open files */
111struct sx filelist_lock;	/* sx to protect filelist */
112struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
113
114/*
115 * Find the first zero bit in the given bitmap, starting at low and not
116 * exceeding size - 1.
117 */
118static int
119fd_first_free(struct filedesc *fdp, int low, int size)
120{
121	NDSLOTTYPE *map = fdp->fd_map;
122	NDSLOTTYPE mask;
123	int off, maxoff;
124
125	if (low >= size)
126		return (low);
127
128	off = NDSLOT(low);
129	if (low % NDENTRIES) {
130		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
131		if ((mask &= ~map[off]) != 0UL)
132			return (off * NDENTRIES + ffsl(mask) - 1);
133		++off;
134	}
135	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
136		if (map[off] != ~0UL)
137			return (off * NDENTRIES + ffsl(~map[off]) - 1);
138	return (size);
139}
140
141/*
142 * Find the highest non-zero bit in the given bitmap, starting at low and
143 * not exceeding size - 1.
144 */
145static int
146fd_last_used(struct filedesc *fdp, int low, int size)
147{
148	NDSLOTTYPE *map = fdp->fd_map;
149	NDSLOTTYPE mask;
150	int off, minoff;
151
152	if (low >= size)
153		return (-1);
154
155	off = NDSLOT(size);
156	if (size % NDENTRIES) {
157		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
158		if ((mask &= map[off]) != 0)
159			return (off * NDENTRIES + flsl(mask) - 1);
160		--off;
161	}
162	for (minoff = NDSLOT(low); off >= minoff; --off)
163		if (map[off] != 0)
164			return (off * NDENTRIES + flsl(map[off]) - 1);
165	return (size - 1);
166}
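
/*
 * Worked example of the bitmap arithmetic used by the descriptor bitmap
 * helpers above and below, assuming the usual <sys/filedesc.h> definitions
 * where NDENTRIES is the number of bits in one NDSLOTTYPE word (64 on LP64
 * machines):
 *
 *	fd = 70, NDENTRIES = 64
 *	NDSLOT(70) = 70 / 64 = 1	word index into fd_map[]
 *	NDBIT(70)  = 1UL << (70 % 64)	bit 6 within fd_map[1]
 *
 * fd_first_free() skips whole words that are all ones (== ~0UL) and applies
 * ffsl() to the complement of a partially used word to find the first clear
 * bit; fd_last_used() does the mirror-image scan with flsl().
 */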
167
168static int
169fdisused(struct filedesc *fdp, int fd)
170{
171	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
172	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
173	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
174}
175
176/*
177 * Mark a file descriptor as used.
178 */
179void
180fdused(struct filedesc *fdp, int fd)
181{
182	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
183	KASSERT(!fdisused(fdp, fd),
184	    ("fd already used"));
185	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
186	if (fd > fdp->fd_lastfile)
187		fdp->fd_lastfile = fd;
188	if (fd == fdp->fd_freefile)
189		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
190}
191
192/*
193 * Mark a file descriptor as unused.
194 */
195void
196fdunused(struct filedesc *fdp, int fd)
197{
198	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
199	KASSERT(fdisused(fdp, fd),
200	    ("fd is already unused"));
201	KASSERT(fdp->fd_ofiles[fd] == NULL,
202	    ("fd is still in use"));
203	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
204	if (fd < fdp->fd_freefile)
205		fdp->fd_freefile = fd;
206	if (fd == fdp->fd_lastfile)
207		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
208}
209
210/*
211 * System calls on descriptors.
212 */
213#ifndef _SYS_SYSPROTO_H_
214struct getdtablesize_args {
215	int	dummy;
216};
217#endif
218/*
219 * MPSAFE
220 */
221/* ARGSUSED */
222int
223getdtablesize(td, uap)
224	struct thread *td;
225	struct getdtablesize_args *uap;
226{
227	struct proc *p = td->td_proc;
228
229	PROC_LOCK(p);
230	td->td_retval[0] =
231	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
232	PROC_UNLOCK(p);
233	return (0);
234}
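
/*
 * Illustrative userland view (not part of this file): the value returned
 * above is the effective per-process descriptor limit, i.e. the smaller of
 * the RLIMIT_NOFILE resource limit and the global maxfilesperproc limit.
 *
 *	#include <unistd.h>
 *
 *	int limit = getdtablesize();	upper bound on usable fd numbers
 */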
235
236/*
237 * Duplicate a file descriptor to a particular value.
238 *
239 * note: keep in mind that a potential race condition exists when closing
240 * descriptors from a shared descriptor table (via rfork).
241 */
242#ifndef _SYS_SYSPROTO_H_
243struct dup2_args {
244	u_int	from;
245	u_int	to;
246};
247#endif
248/*
249 * MPSAFE
250 */
251/* ARGSUSED */
252int
253dup2(td, uap)
254	struct thread *td;
255	struct dup2_args *uap;
256{
257
258	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
259		    td->td_retval));
260}
261
262/*
263 * Duplicate a file descriptor.
264 */
265#ifndef _SYS_SYSPROTO_H_
266struct dup_args {
267	u_int	fd;
268};
269#endif
270/*
271 * MPSAFE
272 */
273/* ARGSUSED */
274int
275dup(td, uap)
276	struct thread *td;
277	struct dup_args *uap;
278{
279
280	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
281}
282
283/*
284 * The file control system call.
285 */
286#ifndef _SYS_SYSPROTO_H_
287struct fcntl_args {
288	int	fd;
289	int	cmd;
290	long	arg;
291};
292#endif
293/*
294 * MPSAFE
295 */
296/* ARGSUSED */
297int
298fcntl(td, uap)
299	struct thread *td;
300	struct fcntl_args *uap;
301{
302	struct flock fl;
303	intptr_t arg;
304	int error;
305
306	error = 0;
307	switch (uap->cmd) {
308	case F_GETLK:
309	case F_SETLK:
310	case F_SETLKW:
311		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
312		arg = (intptr_t)&fl;
313		break;
314	default:
315		arg = uap->arg;
316		break;
317	}
318	if (error)
319		return (error);
320	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
321	if (error)
322		return (error);
323	if (uap->cmd == F_GETLK)
324		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
325	return (error);
326}
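
/*
 * Illustrative userland sketch (not part of this file) of the record-lock
 * path handled above: for F_GETLK/F_SETLK/F_SETLKW the struct flock is
 * copied in, kern_fcntl() hands it to VOP_ADVLOCK(), and for F_GETLK the
 * result is copied back out.
 *
 *	#include <err.h>
 *	#include <fcntl.h>
 *
 *	struct flock fl = {
 *		.l_type = F_WRLCK,	exclusive (write) lock
 *		.l_whence = SEEK_SET,
 *		.l_start = 0,
 *		.l_len = 0,		length 0 covers the whole file
 *	};
 *	if (fcntl(fd, F_SETLK, &fl) == -1)
 *		warn("lock is held by another process");
 */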
327
328int
329kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
330{
331	struct filedesc *fdp;
332	struct flock *flp;
333	struct file *fp;
334	struct proc *p;
335	char *pop;
336	struct vnode *vp;
337	u_int newmin;
338	int error, flg, tmp;
339
340	error = 0;
341	flg = F_POSIX;
342	p = td->td_proc;
343	fdp = p->p_fd;
344	mtx_lock(&Giant);
345	FILEDESC_LOCK(fdp);
346	if ((unsigned)fd >= fdp->fd_nfiles ||
347	    (fp = fdp->fd_ofiles[fd]) == NULL) {
348		FILEDESC_UNLOCK(fdp);
349		error = EBADF;
350		goto done2;
351	}
352	pop = &fdp->fd_ofileflags[fd];
353
354	switch (cmd) {
355	case F_DUPFD:
356		FILEDESC_UNLOCK(fdp);
357		newmin = arg;
358		PROC_LOCK(p);
359		if (newmin >= lim_cur(p, RLIMIT_NOFILE) ||
360		    newmin >= maxfilesperproc) {
361			PROC_UNLOCK(p);
362			error = EINVAL;
363			break;
364		}
365		PROC_UNLOCK(p);
366		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
367		break;
368
369	case F_GETFD:
370		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
371		FILEDESC_UNLOCK(fdp);
372		break;
373
374	case F_SETFD:
375		*pop = (*pop &~ UF_EXCLOSE) |
376		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
377		FILEDESC_UNLOCK(fdp);
378		break;
379
380	case F_GETFL:
381		FILE_LOCK(fp);
382		FILEDESC_UNLOCK(fdp);
383		td->td_retval[0] = OFLAGS(fp->f_flag);
384		FILE_UNLOCK(fp);
385		break;
386
387	case F_SETFL:
388		FILE_LOCK(fp);
389		FILEDESC_UNLOCK(fdp);
390		fhold_locked(fp);
391		fp->f_flag &= ~FCNTLFLAGS;
392		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
393		FILE_UNLOCK(fp);
394		tmp = fp->f_flag & FNONBLOCK;
395		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
396		if (error) {
397			fdrop(fp, td);
398			break;
399		}
400		tmp = fp->f_flag & FASYNC;
401		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
402		if (error == 0) {
403			fdrop(fp, td);
404			break;
405		}
406		FILE_LOCK(fp);
407		fp->f_flag &= ~FNONBLOCK;
408		FILE_UNLOCK(fp);
409		tmp = 0;
410		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
411		fdrop(fp, td);
412		break;
413
414	case F_GETOWN:
415		fhold(fp);
416		FILEDESC_UNLOCK(fdp);
417		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
418		if (error == 0)
419			td->td_retval[0] = tmp;
420		fdrop(fp, td);
421		break;
422
423	case F_SETOWN:
424		fhold(fp);
425		FILEDESC_UNLOCK(fdp);
426		tmp = arg;
427		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
428		fdrop(fp, td);
429		break;
430
431	case F_SETLKW:
432		flg |= F_WAIT;
433		/* FALLTHROUGH F_SETLK */
434
435	case F_SETLK:
436		if (fp->f_type != DTYPE_VNODE) {
437			FILEDESC_UNLOCK(fdp);
438			error = EBADF;
439			break;
440		}
441
442		flp = (struct flock *)arg;
443		if (flp->l_whence == SEEK_CUR) {
444			if (fp->f_offset < 0 ||
445			    (flp->l_start > 0 &&
446			     fp->f_offset > OFF_MAX - flp->l_start)) {
447				FILEDESC_UNLOCK(fdp);
448				error = EOVERFLOW;
449				break;
450			}
451			flp->l_start += fp->f_offset;
452		}
453
454		/*
455		 * VOP_ADVLOCK() may block.
456		 */
457		fhold(fp);
458		FILEDESC_UNLOCK(fdp);
459		vp = fp->f_vnode;
460
461		switch (flp->l_type) {
462		case F_RDLCK:
463			if ((fp->f_flag & FREAD) == 0) {
464				error = EBADF;
465				break;
466			}
467			PROC_LOCK(p->p_leader);
468			p->p_leader->p_flag |= P_ADVLOCK;
469			PROC_UNLOCK(p->p_leader);
470			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
471			    flp, flg);
472			break;
473		case F_WRLCK:
474			if ((fp->f_flag & FWRITE) == 0) {
475				error = EBADF;
476				break;
477			}
478			PROC_LOCK(p->p_leader);
479			p->p_leader->p_flag |= P_ADVLOCK;
480			PROC_UNLOCK(p->p_leader);
481			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
482			    flp, flg);
483			break;
484		case F_UNLCK:
485			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
486			    flp, F_POSIX);
487			break;
488		default:
489			error = EINVAL;
490			break;
491		}
492		/* Check for race with close */
493		FILEDESC_LOCK(fdp);
494		if ((unsigned) fd >= fdp->fd_nfiles ||
495		    fp != fdp->fd_ofiles[fd]) {
496			FILEDESC_UNLOCK(fdp);
497			flp->l_whence = SEEK_SET;
498			flp->l_start = 0;
499			flp->l_len = 0;
500			flp->l_type = F_UNLCK;
501			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
502					   F_UNLCK, flp, F_POSIX);
503		} else
504			FILEDESC_UNLOCK(fdp);
505		fdrop(fp, td);
506		break;
507
508	case F_GETLK:
509		if (fp->f_type != DTYPE_VNODE) {
510			FILEDESC_UNLOCK(fdp);
511			error = EBADF;
512			break;
513		}
514		flp = (struct flock *)arg;
515		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
516		    flp->l_type != F_UNLCK) {
517			FILEDESC_UNLOCK(fdp);
518			error = EINVAL;
519			break;
520		}
521		if (flp->l_whence == SEEK_CUR) {
522			if ((flp->l_start > 0 &&
523			    fp->f_offset > OFF_MAX - flp->l_start) ||
524			    (flp->l_start < 0 &&
525			     fp->f_offset < OFF_MIN - flp->l_start)) {
526				FILEDESC_UNLOCK(fdp);
527				error = EOVERFLOW;
528				break;
529			}
530			flp->l_start += fp->f_offset;
531		}
532		/*
533		 * VOP_ADVLOCK() may block.
534		 */
535		fhold(fp);
536		FILEDESC_UNLOCK(fdp);
537		vp = fp->f_vnode;
538		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
539		    F_POSIX);
540		fdrop(fp, td);
541		break;
542	default:
543		FILEDESC_UNLOCK(fdp);
544		error = EINVAL;
545		break;
546	}
547done2:
548	mtx_unlock(&Giant);
549	return (error);
550}
551
552/*
553 * Common code for dup, dup2, and fcntl(F_DUPFD).
554 */
555static int
556do_dup(td, type, old, new, retval)
557	enum dup_type type;
558	int old, new;
559	register_t *retval;
560	struct thread *td;
561{
562	struct filedesc *fdp;
563	struct proc *p;
564	struct file *fp;
565	struct file *delfp;
566	int error, holdleaders, maxfd;
567
568	KASSERT((type == DUP_VARIABLE || type == DUP_FIXED),
569	    ("invalid dup type %d", type));
570
571	p = td->td_proc;
572	fdp = p->p_fd;
573
574	/*
575	 * Verify we have a valid descriptor to dup from and possibly to
576	 * dup to.
577	 */
578	if (old < 0 || new < 0)
579		return (EBADF);
580	PROC_LOCK(p);
581	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
582	PROC_UNLOCK(p);
583	if (new >= maxfd)
584		return (EMFILE);
585
586	FILEDESC_LOCK(fdp);
587	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
588		FILEDESC_UNLOCK(fdp);
589		return (EBADF);
590	}
591	if (type == DUP_FIXED && old == new) {
592		*retval = new;
593		FILEDESC_UNLOCK(fdp);
594		return (0);
595	}
596	fp = fdp->fd_ofiles[old];
597	fhold(fp);
598
599	/*
600	 * If the caller specified a file descriptor, make sure the file
601	 * table is large enough to hold it, and grab it.  Otherwise, just
602	 * allocate a new descriptor the usual way.  Since the filedesc
603	 * lock may be temporarily dropped in the process, we have to look
604	 * out for a race.
605	 */
606	if (type == DUP_FIXED) {
607		if (new >= fdp->fd_nfiles)
608			fdgrowtable(fdp, new + 1);
609		if (fdp->fd_ofiles[new] == NULL)
610			fdused(fdp, new);
611	} else {
612		if ((error = fdalloc(td, new, &new)) != 0) {
613			FILEDESC_UNLOCK(fdp);
614			fdrop(fp, td);
615			return (error);
616		}
617	}
618
619	/*
620	 * If the old file changed out from under us then treat it as a
621	 * bad file descriptor.  Userland should do its own locking to
622	 * avoid this case.
623	 */
624	if (fdp->fd_ofiles[old] != fp) {
625		/* we've allocated a descriptor which we won't use */
626		if (fdp->fd_ofiles[new] == NULL)
627			fdunused(fdp, new);
628		FILEDESC_UNLOCK(fdp);
629		fdrop(fp, td);
630		return (EBADF);
631	}
632	KASSERT(old != new,
633	    ("new fd is same as old"));
634
635	/*
636	 * Save info on the descriptor being overwritten.  We cannot close
637	 * it without introducing an ownership race for the slot, since we
638	 * need to drop the filedesc lock to call closef().
639	 *
640	 * XXX this duplicates parts of close().
641	 */
642	delfp = fdp->fd_ofiles[new];
643	holdleaders = 0;
644	if (delfp != NULL) {
645		if (td->td_proc->p_fdtol != NULL) {
646			/*
647			 * Ask fdfree() to sleep to ensure that all relevant
648			 * process leaders can be traversed in closef().
649			 */
650			fdp->fd_holdleaderscount++;
651			holdleaders = 1;
652		}
653	}
654
655	/*
656	 * Duplicate the source descriptor
657	 */
658	fdp->fd_ofiles[new] = fp;
659	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
660	if (new > fdp->fd_lastfile)
661		fdp->fd_lastfile = new;
662	FILEDESC_UNLOCK(fdp);
663	*retval = new;
664
665	/*
666	 * If we dup'd over a valid file, we now own the reference to it
667	 * and must dispose of it using closef() semantics (as if a
668	 * close() were performed on it).
669	 *
670	 * XXX this duplicates parts of close().
671	 */
672	if (delfp != NULL) {
673		/* XXX need to call knote_fdclose() */
674		mtx_lock(&Giant);
675		(void) closef(delfp, td);
676		mtx_unlock(&Giant);
677		if (holdleaders) {
678			FILEDESC_LOCK(fdp);
679			fdp->fd_holdleaderscount--;
680			if (fdp->fd_holdleaderscount == 0 &&
681			    fdp->fd_holdleaderswakeup != 0) {
682				fdp->fd_holdleaderswakeup = 0;
683				wakeup(&fdp->fd_holdleaderscount);
684			}
685			FILEDESC_UNLOCK(fdp);
686		}
687	}
688	return (0);
689}
690
691/*
692 * If sigio is on the list associated with a process or process group,
693 * disable signalling from the device, remove sigio from the list and
694 * free sigio.
695 */
696void
697funsetown(sigiop)
698	struct sigio **sigiop;
699{
700	struct sigio *sigio;
701
702	SIGIO_LOCK();
703	sigio = *sigiop;
704	if (sigio == NULL) {
705		SIGIO_UNLOCK();
706		return;
707	}
708	*(sigio->sio_myref) = NULL;
709	if ((sigio)->sio_pgid < 0) {
710		struct pgrp *pg = (sigio)->sio_pgrp;
711		PGRP_LOCK(pg);
712		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
713			     sigio, sio_pgsigio);
714		PGRP_UNLOCK(pg);
715	} else {
716		struct proc *p = (sigio)->sio_proc;
717		PROC_LOCK(p);
718		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
719			     sigio, sio_pgsigio);
720		PROC_UNLOCK(p);
721	}
722	SIGIO_UNLOCK();
723	crfree(sigio->sio_ucred);
724	FREE(sigio, M_SIGIO);
725}
726
727/*
728 * Free a list of sigio structures.
729 * We only need to lock the SIGIO_LOCK because we have made ourselves
730 * inaccessible to callers of fsetown and therefore do not need to lock
731 * the proc or pgrp struct for the list manipulation.
732 */
733void
734funsetownlst(sigiolst)
735	struct sigiolst *sigiolst;
736{
737	struct proc *p;
738	struct pgrp *pg;
739	struct sigio *sigio;
740
741	sigio = SLIST_FIRST(sigiolst);
742	if (sigio == NULL)
743		return;
744	p = NULL;
745	pg = NULL;
746
747	/*
748	 * Every entry of the list should belong
749	 * to a single proc or pgrp.
750	 */
751	if (sigio->sio_pgid < 0) {
752		pg = sigio->sio_pgrp;
753		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
754	} else /* if (sigio->sio_pgid > 0) */ {
755		p = sigio->sio_proc;
756		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
757	}
758
759	SIGIO_LOCK();
760	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
761		*(sigio->sio_myref) = NULL;
762		if (pg != NULL) {
763			KASSERT(sigio->sio_pgid < 0,
764			    ("Proc sigio in pgrp sigio list"));
765			KASSERT(sigio->sio_pgrp == pg,
766			    ("Bogus pgrp in sigio list"));
767			PGRP_LOCK(pg);
768			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
769			    sio_pgsigio);
770			PGRP_UNLOCK(pg);
771		} else /* if (p != NULL) */ {
772			KASSERT(sigio->sio_pgid > 0,
773			    ("Pgrp sigio in proc sigio list"));
774			KASSERT(sigio->sio_proc == p,
775			    ("Bogus proc in sigio list"));
776			PROC_LOCK(p);
777			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
778			    sio_pgsigio);
779			PROC_UNLOCK(p);
780		}
781		SIGIO_UNLOCK();
782		crfree(sigio->sio_ucred);
783		FREE(sigio, M_SIGIO);
784		SIGIO_LOCK();
785	}
786	SIGIO_UNLOCK();
787}
788
789/*
790 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
791 *
792 * After permission checking, add a sigio structure to the sigio list for
793 * the process or process group.
794 */
795int
796fsetown(pgid, sigiop)
797	pid_t pgid;
798	struct sigio **sigiop;
799{
800	struct proc *proc;
801	struct pgrp *pgrp;
802	struct sigio *sigio;
803	int ret;
804
805	if (pgid == 0) {
806		funsetown(sigiop);
807		return (0);
808	}
809
810	ret = 0;
811
812	/* Allocate and fill in the new sigio out of locks. */
813	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
814	sigio->sio_pgid = pgid;
815	sigio->sio_ucred = crhold(curthread->td_ucred);
816	sigio->sio_myref = sigiop;
817
818	sx_slock(&proctree_lock);
819	if (pgid > 0) {
820		proc = pfind(pgid);
821		if (proc == NULL) {
822			ret = ESRCH;
823			goto fail;
824		}
825
826		/*
827		 * Policy - Don't allow a process to FSETOWN a process
828		 * in another session.
829		 *
830		 * Remove this test to allow maximum flexibility or
831		 * restrict FSETOWN to the current process or process
832		 * group for maximum safety.
833		 */
834		PROC_UNLOCK(proc);
835		if (proc->p_session != curthread->td_proc->p_session) {
836			ret = EPERM;
837			goto fail;
838		}
839
840		pgrp = NULL;
841	} else /* if (pgid < 0) */ {
842		pgrp = pgfind(-pgid);
843		if (pgrp == NULL) {
844			ret = ESRCH;
845			goto fail;
846		}
847		PGRP_UNLOCK(pgrp);
848
849		/*
850		 * Policy - Don't allow a process to FSETOWN a process
851		 * in another session.
852		 *
853		 * Remove this test to allow maximum flexibility or
854		 * restrict FSETOWN to the current process or process
855		 * group for maximum safety.
856		 */
857		if (pgrp->pg_session != curthread->td_proc->p_session) {
858			ret = EPERM;
859			goto fail;
860		}
861
862		proc = NULL;
863	}
864	funsetown(sigiop);
865	if (pgid > 0) {
866		PROC_LOCK(proc);
867		/*
868		 * Since funsetownlst() is called without the proctree
869		 * locked, we need to check for P_WEXIT.
870		 * XXX: is ESRCH correct?
871		 */
872		if ((proc->p_flag & P_WEXIT) != 0) {
873			PROC_UNLOCK(proc);
874			ret = ESRCH;
875			goto fail;
876		}
877		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
878		sigio->sio_proc = proc;
879		PROC_UNLOCK(proc);
880	} else {
881		PGRP_LOCK(pgrp);
882		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
883		sigio->sio_pgrp = pgrp;
884		PGRP_UNLOCK(pgrp);
885	}
886	sx_sunlock(&proctree_lock);
887	SIGIO_LOCK();
888	*sigiop = sigio;
889	SIGIO_UNLOCK();
890	return (0);
891
892fail:
893	sx_sunlock(&proctree_lock);
894	crfree(sigio->sio_ucred);
895	FREE(sigio, M_SIGIO);
896	return (ret);
897}
898
899/*
900 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
901 */
902pid_t
903fgetown(sigiop)
904	struct sigio **sigiop;
905{
906	pid_t pgid;
907
908	SIGIO_LOCK();
909	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
910	SIGIO_UNLOCK();
911	return (pgid);
912}
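
/*
 * Illustrative userland sketch (not part of this file) of the FIOSETOWN /
 * FIOGETOWN machinery above: a process claims ownership of a descriptor so
 * that SIGIO is delivered to it once async notification is enabled.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	fcntl(fd, F_SETOWN, getpid());		deliver SIGIO to this process
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_ASYNC);
 *	pid_t owner = fcntl(fd, F_GETOWN, 0);	pid, or negative pgid
 *
 * A negative owner names a process group, matching the sio_pgid < 0 case
 * handled by fsetown() and funsetown() above.
 */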
913
914/*
915 * Close a file descriptor.
916 */
917#ifndef _SYS_SYSPROTO_H_
918struct close_args {
919	int     fd;
920};
921#endif
922/*
923 * MPSAFE
924 */
925/* ARGSUSED */
926int
927close(td, uap)
928	struct thread *td;
929	struct close_args *uap;
930{
931	struct filedesc *fdp;
932	struct file *fp;
933	int fd, error;
934	int holdleaders;
935
936	fd = uap->fd;
937	error = 0;
938	holdleaders = 0;
939	fdp = td->td_proc->p_fd;
940	mtx_lock(&Giant);
941	FILEDESC_LOCK(fdp);
942	if ((unsigned)fd >= fdp->fd_nfiles ||
943	    (fp = fdp->fd_ofiles[fd]) == NULL) {
944		FILEDESC_UNLOCK(fdp);
945		mtx_unlock(&Giant);
946		return (EBADF);
947	}
948	fdp->fd_ofiles[fd] = NULL;
949	fdp->fd_ofileflags[fd] = 0;
950	fdunused(fdp, fd);
951	if (td->td_proc->p_fdtol != NULL) {
952		/*
953		 * Ask fdfree() to sleep to ensure that all relevant
954		 * process leaders can be traversed in closef().
955		 */
956		fdp->fd_holdleaderscount++;
957		holdleaders = 1;
958	}
959
960	/*
961	 * We now hold the fp reference that used to be owned by the descriptor
962	 * array.
963	 */
964	if (fd < fdp->fd_knlistsize) {
965		FILEDESC_UNLOCK(fdp);
966		knote_fdclose(td, fd);
967	} else
968		FILEDESC_UNLOCK(fdp);
969
970	error = closef(fp, td);
971	mtx_unlock(&Giant);
972	if (holdleaders) {
973		FILEDESC_LOCK(fdp);
974		fdp->fd_holdleaderscount--;
975		if (fdp->fd_holdleaderscount == 0 &&
976		    fdp->fd_holdleaderswakeup != 0) {
977			fdp->fd_holdleaderswakeup = 0;
978			wakeup(&fdp->fd_holdleaderscount);
979		}
980		FILEDESC_UNLOCK(fdp);
981	}
982	return (error);
983}
984
985#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
986/*
987 * Return status information about a file descriptor.
988 */
989#ifndef _SYS_SYSPROTO_H_
990struct ofstat_args {
991	int	fd;
992	struct	ostat *sb;
993};
994#endif
995/*
996 * MPSAFE
997 */
998/* ARGSUSED */
999int
1000ofstat(td, uap)
1001	struct thread *td;
1002	struct ofstat_args *uap;
1003{
1004	struct file *fp;
1005	struct stat ub;
1006	struct ostat oub;
1007	int error;
1008
1009	if ((error = fget(td, uap->fd, &fp)) != 0)
1010		goto done2;
1011	mtx_lock(&Giant);
1012	error = fo_stat(fp, &ub, td->td_ucred, td);
1013	mtx_unlock(&Giant);
1014	if (error == 0) {
1015		cvtstat(&ub, &oub);
1016		error = copyout(&oub, uap->sb, sizeof(oub));
1017	}
1018	fdrop(fp, td);
1019done2:
1020	return (error);
1021}
1022#endif /* COMPAT_43 || COMPAT_SUNOS */
1023
1024/*
1025 * Return status information about a file descriptor.
1026 */
1027#ifndef _SYS_SYSPROTO_H_
1028struct fstat_args {
1029	int	fd;
1030	struct	stat *sb;
1031};
1032#endif
1033/*
1034 * MPSAFE
1035 */
1036/* ARGSUSED */
1037int
1038fstat(td, uap)
1039	struct thread *td;
1040	struct fstat_args *uap;
1041{
1042	struct file *fp;
1043	struct stat ub;
1044	int error;
1045
1046	if ((error = fget(td, uap->fd, &fp)) != 0)
1047		goto done2;
1048	mtx_lock(&Giant);
1049	error = fo_stat(fp, &ub, td->td_ucred, td);
1050	mtx_unlock(&Giant);
1051	if (error == 0)
1052		error = copyout(&ub, uap->sb, sizeof(ub));
1053	fdrop(fp, td);
1054done2:
1055	return (error);
1056}
1057
1058/*
1059 * Return status information about a file descriptor.
1060 */
1061#ifndef _SYS_SYSPROTO_H_
1062struct nfstat_args {
1063	int	fd;
1064	struct	nstat *sb;
1065};
1066#endif
1067/*
1068 * MPSAFE
1069 */
1070/* ARGSUSED */
1071int
1072nfstat(td, uap)
1073	struct thread *td;
1074	struct nfstat_args *uap;
1075{
1076	struct file *fp;
1077	struct stat ub;
1078	struct nstat nub;
1079	int error;
1080
1081	if ((error = fget(td, uap->fd, &fp)) != 0)
1082		goto done2;
1083	mtx_lock(&Giant);
1084	error = fo_stat(fp, &ub, td->td_ucred, td);
1085	mtx_unlock(&Giant);
1086	if (error == 0) {
1087		cvtnstat(&ub, &nub);
1088		error = copyout(&nub, uap->sb, sizeof(nub));
1089	}
1090	fdrop(fp, td);
1091done2:
1092	return (error);
1093}
1094
1095/*
1096 * Return pathconf information about a file descriptor.
1097 */
1098#ifndef _SYS_SYSPROTO_H_
1099struct fpathconf_args {
1100	int	fd;
1101	int	name;
1102};
1103#endif
1104/*
1105 * MPSAFE
1106 */
1107/* ARGSUSED */
1108int
1109fpathconf(td, uap)
1110	struct thread *td;
1111	struct fpathconf_args *uap;
1112{
1113	struct file *fp;
1114	struct vnode *vp;
1115	int error;
1116
1117	if ((error = fget(td, uap->fd, &fp)) != 0)
1118		return (error);
1119
1120	/* If asynchronous I/O is available, it works for all descriptors. */
1121	if (uap->name == _PC_ASYNC_IO) {
1122		td->td_retval[0] = async_io_version;
1123		goto out;
1124	}
1125	vp = fp->f_vnode;
1126	if (vp != NULL) {
1127		mtx_lock(&Giant);
1128		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1129		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1130		VOP_UNLOCK(vp, 0, td);
1131		mtx_unlock(&Giant);
1132	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1133		if (uap->name != _PC_PIPE_BUF) {
1134			error = EINVAL;
1135		} else {
1136			td->td_retval[0] = PIPE_BUF;
1137			error = 0;
1138		}
1139	} else {
1140		error = EOPNOTSUPP;
1141	}
1142out:
1143	fdrop(fp, td);
1144	return (error);
1145}
1146
1147/*
1148 * Grow the file table to accommodate (at least) nfd descriptors.  This may
1149 * block and drop the filedesc lock, but it will reacquire it before
1150 * returning.
1151 */
1152static void
1153fdgrowtable(struct filedesc *fdp, int nfd)
1154{
1155	struct file **ntable;
1156	char *nfileflags;
1157	int nnfiles, onfiles;
1158	NDSLOTTYPE *nmap;
1159
1160	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1161
1162	KASSERT(fdp->fd_nfiles > 0,
1163	    ("zero-length file table"));
1164
1165	/* compute the size of the new table */
1166	onfiles = fdp->fd_nfiles;
1167	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1168	if (nnfiles <= onfiles)
1169		/* the table is already large enough */
1170		return;
1171
1172	/* allocate a new table and (if required) new bitmaps */
1173	FILEDESC_UNLOCK(fdp);
1174	MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
1175	    M_FILEDESC, M_ZERO | M_WAITOK);
1176	nfileflags = (char *)&ntable[nnfiles];
1177	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
1178		MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE,
1179		    M_FILEDESC, M_ZERO | M_WAITOK);
1180	else
1181		nmap = NULL;
1182	FILEDESC_LOCK(fdp);
1183
1184	/*
1185	 * We now have new tables ready to go.  Since we dropped the
1186	 * filedesc lock to call malloc(), watch out for a race.
1187	 */
1188	onfiles = fdp->fd_nfiles;
1189	if (onfiles >= nnfiles) {
1190		/* we lost the race, but that's OK */
1191		free(ntable, M_FILEDESC);
1192		if (nmap != NULL)
1193			free(nmap, M_FILEDESC);
1194		return;
1195	}
1196	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
1197	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
1198	if (onfiles > NDFILE)
1199		free(fdp->fd_ofiles, M_FILEDESC);
1200	fdp->fd_ofiles = ntable;
1201	fdp->fd_ofileflags = nfileflags;
1202	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
1203		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
1204		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
1205			free(fdp->fd_map, M_FILEDESC);
1206		fdp->fd_map = nmap;
1207	}
1208	fdp->fd_nfiles = nnfiles;
1209}
1210
1211/*
1212 * Allocate a file descriptor for the process.
1213 */
1214int
1215fdalloc(struct thread *td, int minfd, int *result)
1216{
1217	struct proc *p = td->td_proc;
1218	struct filedesc *fdp = p->p_fd;
1219	int fd = -1, maxfd;
1220
1221	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1222
1223	PROC_LOCK(p);
1224	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1225	PROC_UNLOCK(p);
1226
1227	/*
1228	 * Search the bitmap for a free descriptor.  If none is found, try
1229	 * to grow the file table.  Keep at it until we either get a file
1230	 * descriptor or run into process or system limits; fdgrowtable()
1231	 * may drop the filedesc lock, so we're in a race.
1232	 */
1233	for (;;) {
1234		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1235		if (fd >= maxfd)
1236			return (EMFILE);
1237		if (fd < fdp->fd_nfiles)
1238			break;
1239		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
1240	}
1241
1242	/*
1243	 * Perform some sanity checks, then mark the file descriptor as
1244	 * used and return it to the caller.
1245	 */
1246	KASSERT(!fdisused(fdp, fd),
1247	    ("fd_first_free() returned non-free descriptor"));
1248	KASSERT(fdp->fd_ofiles[fd] == NULL,
1249	    ("free descriptor isn't"));
1250	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
1251	fdused(fdp, fd);
1252	fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
1253	*result = fd;
1254	return (0);
1255}
1256
1257/*
1258 * Check to see whether n user file descriptors
1259 * are available to the process p.
1260 */
1261int
1262fdavail(td, n)
1263	struct thread *td;
1264	int n;
1265{
1266	struct proc *p = td->td_proc;
1267	struct filedesc *fdp = td->td_proc->p_fd;
1268	struct file **fpp;
1269	int i, lim, last;
1270
1271	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1272
1273	PROC_LOCK(p);
1274	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
1275	PROC_UNLOCK(p);
1276	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1277		return (1);
1278	last = min(fdp->fd_nfiles, lim);
1279	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
1280	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
1281		if (*fpp == NULL && --n <= 0)
1282			return (1);
1283	}
1284	return (0);
1285}
1286
1287/*
1288 * Create a new open file structure and allocate
1289 * a file descriptor for the process that refers to it.
1290 * We add one reference to the file for the descriptor table
1291 * and one reference for resultfp.  This is to prevent us from being
1292 * preempted and the entry in the descriptor table closed after
1293 * we release the FILEDESC lock.
1294 */
1295int
1296falloc(td, resultfp, resultfd)
1297	struct thread *td;
1298	struct file **resultfp;
1299	int *resultfd;
1300{
1301	struct proc *p = td->td_proc;
1302	struct file *fp, *fq;
1303	int error, i;
1304	int maxuserfiles = maxfiles - (maxfiles / 20);
1305	static struct timeval lastfail;
1306	static int curfail;
1307
1308	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1309	sx_xlock(&filelist_lock);
1310	if ((nfiles >= maxuserfiles && td->td_ucred->cr_ruid != 0)
1311	   || nfiles >= maxfiles) {
1312		if (ppsratecheck(&lastfail, &curfail, 1)) {
1313			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
1314				td->td_ucred->cr_ruid);
1315		}
1316		sx_xunlock(&filelist_lock);
1317		uma_zfree(file_zone, fp);
1318		return (ENFILE);
1319	}
1320	nfiles++;
1321
1322	/*
1323	 * If the process has file descriptor zero open, add the new file
1324	 * descriptor to the list of open files at that point, otherwise
1325	 * put it at the front of the list of open files.
1326	 */
1327	fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep);
1328	fp->f_count = 1;
1329	if (resultfp)
1330		fp->f_count++;
1331	fp->f_cred = crhold(td->td_ucred);
1332	fp->f_ops = &badfileops;
1333	FILEDESC_LOCK(p->p_fd);
1334	if ((fq = p->p_fd->fd_ofiles[0])) {
1335		LIST_INSERT_AFTER(fq, fp, f_list);
1336	} else {
1337		LIST_INSERT_HEAD(&filehead, fp, f_list);
1338	}
1339	sx_xunlock(&filelist_lock);
1340	if ((error = fdalloc(td, 0, &i))) {
1341		FILEDESC_UNLOCK(p->p_fd);
1342		fdrop(fp, td);
1343		if (resultfp)
1344			fdrop(fp, td);
1345		return (error);
1346	}
1347	p->p_fd->fd_ofiles[i] = fp;
1348	FILEDESC_UNLOCK(p->p_fd);
1349	if (resultfp)
1350		*resultfp = fp;
1351	if (resultfd)
1352		*resultfd = i;
1353	return (0);
1354}
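
/*
 * Sketch of the intended falloc() calling pattern (compare fdcheckstd()
 * later in this file); the extra reference taken for resultfp is what lets
 * the caller keep using fp even if another thread closes the new slot:
 *
 *	struct file *fp;
 *	int fd, error;
 *
 *	error = falloc(td, &fp, &fd);
 *	if (error)
 *		return (error);
 *	... initialize fp->f_data, fp->f_ops, fp->f_type ...
 *	fdrop(fp, td);		release the reference held for the caller
 */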
1355
1356/*
1357 * Free a file descriptor.
1358 */
1359void
1360ffree(fp)
1361	struct file *fp;
1362{
1363
1364	KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!"));
1365	sx_xlock(&filelist_lock);
1366	LIST_REMOVE(fp, f_list);
1367	nfiles--;
1368	sx_xunlock(&filelist_lock);
1369	crfree(fp->f_cred);
1370	uma_zfree(file_zone, fp);
1371}
1372
1373/*
1374 * Build a new filedesc structure from another.
1375 * Copy the current, root, and jail root vnode references.
1376 */
1377struct filedesc *
1378fdinit(fdp)
1379	struct filedesc *fdp;
1380{
1381	struct filedesc0 *newfdp;
1382
1383	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1384
1385	FILEDESC_UNLOCK(fdp);
1386	MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
1387	    M_FILEDESC, M_WAITOK | M_ZERO);
1388	FILEDESC_LOCK(fdp);
1389	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
1390	newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1391	if (newfdp->fd_fd.fd_cdir)
1392		VREF(newfdp->fd_fd.fd_cdir);
1393	newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1394	if (newfdp->fd_fd.fd_rdir)
1395		VREF(newfdp->fd_fd.fd_rdir);
1396	newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1397	if (newfdp->fd_fd.fd_jdir)
1398		VREF(newfdp->fd_fd.fd_jdir);
1399
1400	/* Create the file descriptor table. */
1401	newfdp->fd_fd.fd_refcnt = 1;
1402	newfdp->fd_fd.fd_cmask = CMASK;
1403	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1404	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
1405	newfdp->fd_fd.fd_nfiles = NDFILE;
1406	newfdp->fd_fd.fd_knlistsize = -1;
1407	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1408	return (&newfdp->fd_fd);
1409}
1410
1411/*
1412 * Share a filedesc structure.
1413 */
1414struct filedesc *
1415fdshare(fdp)
1416	struct filedesc *fdp;
1417{
1418	FILEDESC_LOCK(fdp);
1419	fdp->fd_refcnt++;
1420	FILEDESC_UNLOCK(fdp);
1421	return (fdp);
1422}
1423
1424/*
1425 * Copy a filedesc structure.
1426 * A NULL pointer as input returns a NULL reference; this is to ease
1427 * callers, not to catch errors.
1428 */
1429struct filedesc *
1430fdcopy(fdp)
1431	struct filedesc *fdp;
1432{
1433	struct filedesc *newfdp;
1434	int i;
1435
1436	/* Certain daemons might not have file descriptors. */
1437	if (fdp == NULL)
1438		return (NULL);
1439
1440	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1441	newfdp = fdinit(fdp);
1442	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1443		FILEDESC_UNLOCK(fdp);
1444		FILEDESC_LOCK(newfdp);
1445		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1446		FILEDESC_UNLOCK(newfdp);
1447		FILEDESC_LOCK(fdp);
1448	}
1449	/* copy everything except kqueue descriptors */
1450	newfdp->fd_freefile = -1;
1451	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1452		if (fdisused(fdp, i) &&
1453		    fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) {
1454			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
1455			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
1456			fhold(newfdp->fd_ofiles[i]);
1457			newfdp->fd_lastfile = i;
1458		} else {
1459			if (newfdp->fd_freefile == -1)
1460				newfdp->fd_freefile = i;
1461		}
1462	}
1463	FILEDESC_UNLOCK(fdp);
1464	FILEDESC_LOCK(newfdp);
1465	for (i = 0; i <= newfdp->fd_lastfile; ++i)
1466		if (newfdp->fd_ofiles[i] != NULL)
1467			fdused(newfdp, i);
1468	FILEDESC_UNLOCK(newfdp);
1469	FILEDESC_LOCK(fdp);
1470	if (newfdp->fd_freefile == -1)
1471		newfdp->fd_freefile = i;
1472	newfdp->fd_cmask = fdp->fd_cmask;
1473	return (newfdp);
1474}
1475
1476/* A mutex to protect the association between a proc and filedesc. */
1477struct mtx	fdesc_mtx;
1478MTX_SYSINIT(fdesc, &fdesc_mtx, "fdesc", MTX_DEF);
1479
1480/*
1481 * Release a filedesc structure.
1482 */
1483void
1484fdfree(td)
1485	struct thread *td;
1486{
1487	struct filedesc *fdp;
1488	struct file **fpp;
1489	int i;
1490	struct filedesc_to_leader *fdtol;
1491	struct file *fp;
1492	struct vnode *vp;
1493	struct flock lf;
1494
1495	/* Certain daemons might not have file descriptors. */
1496	fdp = td->td_proc->p_fd;
1497	if (fdp == NULL)
1498		return;
1499
1500	/* Check for special need to clear POSIX style locks */
1501	fdtol = td->td_proc->p_fdtol;
1502	if (fdtol != NULL) {
1503		FILEDESC_LOCK(fdp);
1504		KASSERT(fdtol->fdl_refcount > 0,
1505			("filedesc_to_refcount botch: fdl_refcount=%d",
1506			 fdtol->fdl_refcount));
1507		if (fdtol->fdl_refcount == 1 &&
1508		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1509			i = 0;
1510			fpp = fdp->fd_ofiles;
1511			for (i = 0, fpp = fdp->fd_ofiles;
1512			     i <= fdp->fd_lastfile;
1513			     i++, fpp++) {
1514				if (*fpp == NULL ||
1515				    (*fpp)->f_type != DTYPE_VNODE)
1516					continue;
1517				fp = *fpp;
1518				fhold(fp);
1519				FILEDESC_UNLOCK(fdp);
1520				lf.l_whence = SEEK_SET;
1521				lf.l_start = 0;
1522				lf.l_len = 0;
1523				lf.l_type = F_UNLCK;
1524				vp = fp->f_vnode;
1525				(void) VOP_ADVLOCK(vp,
1526						   (caddr_t)td->td_proc->
1527						   p_leader,
1528						   F_UNLCK,
1529						   &lf,
1530						   F_POSIX);
1531				FILEDESC_LOCK(fdp);
1532				fdrop(fp, td);
1533				fpp = fdp->fd_ofiles + i;
1534			}
1535		}
1536	retry:
1537		if (fdtol->fdl_refcount == 1) {
1538			if (fdp->fd_holdleaderscount > 0 &&
1539			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1540				/*
1541				 * close() or do_dup() has cleared a reference
1542				 * in a shared file descriptor table.
1543				 */
1544				fdp->fd_holdleaderswakeup = 1;
1545				msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx,
1546				       PLOCK, "fdlhold", 0);
1547				goto retry;
1548			}
1549			if (fdtol->fdl_holdcount > 0) {
1550				/*
1551				 * Ensure that fdtol->fdl_leader
1552				 * remains valid in closef().
1553				 */
1554				fdtol->fdl_wakeup = 1;
1555				msleep(fdtol, &fdp->fd_mtx,
1556				       PLOCK, "fdlhold", 0);
1557				goto retry;
1558			}
1559		}
1560		fdtol->fdl_refcount--;
1561		if (fdtol->fdl_refcount == 0 &&
1562		    fdtol->fdl_holdcount == 0) {
1563			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
1564			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
1565		} else
1566			fdtol = NULL;
1567		td->td_proc->p_fdtol = NULL;
1568		FILEDESC_UNLOCK(fdp);
1569		if (fdtol != NULL)
1570			FREE(fdtol, M_FILEDESC_TO_LEADER);
1571	}
1572	FILEDESC_LOCK(fdp);
1573	if (--fdp->fd_refcnt > 0) {
1574		FILEDESC_UNLOCK(fdp);
1575		return;
1576	}
1577
1578	/*
1579	 * We are the last reference to the structure, so we can
1580	 * safely assume it will not change out from under us.
1581	 */
1582	FILEDESC_UNLOCK(fdp);
1583	fpp = fdp->fd_ofiles;
1584	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
1585		if (*fpp)
1586			(void) closef(*fpp, td);
1587	}
1588
1589	/* XXX This should happen earlier. */
1590	mtx_lock(&fdesc_mtx);
1591	td->td_proc->p_fd = NULL;
1592	mtx_unlock(&fdesc_mtx);
1593
1594	if (fdp->fd_nfiles > NDFILE)
1595		FREE(fdp->fd_ofiles, M_FILEDESC);
1596	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
1597		FREE(fdp->fd_map, M_FILEDESC);
1598	if (fdp->fd_cdir)
1599		vrele(fdp->fd_cdir);
1600	if (fdp->fd_rdir)
1601		vrele(fdp->fd_rdir);
1602	if (fdp->fd_jdir)
1603		vrele(fdp->fd_jdir);
1604	if (fdp->fd_knlist)
1605		FREE(fdp->fd_knlist, M_KQUEUE);
1606	if (fdp->fd_knhash)
1607		FREE(fdp->fd_knhash, M_KQUEUE);
1608	mtx_destroy(&fdp->fd_mtx);
1609	FREE(fdp, M_FILEDESC);
1610}
1611
1612/*
1613 * For setugid programs, we don't want people to use that setugidness
1614 * to generate error messages which write to a file which would
1615 * otherwise be off-limits to the process.  We check for filesystems where
1616 * the vnode can change out from under us after execve (like [lin]procfs).
1617 *
1618 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
1619 * sufficient.  We also don't check for setugidness since we know we are.
1620 */
1621static int
1622is_unsafe(struct file *fp)
1623{
1624	if (fp->f_type == DTYPE_VNODE) {
1625		struct vnode *vp = fp->f_vnode;
1626
1627		if ((vp->v_vflag & VV_PROCDEP) != 0)
1628			return (1);
1629	}
1630	return (0);
1631}
1632
1633/*
1634 * Make this setugid thing safe, if at all possible.
1635 */
1636void
1637setugidsafety(td)
1638	struct thread *td;
1639{
1640	struct filedesc *fdp;
1641	int i;
1642
1643	/* Certain daemons might not have file descriptors. */
1644	fdp = td->td_proc->p_fd;
1645	if (fdp == NULL)
1646		return;
1647
1648	/*
1649	 * Note: fdp->fd_ofiles may be reallocated out from under us while
1650	 * we are blocked in a close.  Be careful!
1651	 */
1652	FILEDESC_LOCK(fdp);
1653	for (i = 0; i <= fdp->fd_lastfile; i++) {
1654		if (i > 2)
1655			break;
1656		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
1657			struct file *fp;
1658
1659			if (i < fdp->fd_knlistsize) {
1660				FILEDESC_UNLOCK(fdp);
1661				knote_fdclose(td, i);
1662				FILEDESC_LOCK(fdp);
1663			}
1664			/*
1665			 * NULL-out descriptor prior to close to avoid
1666			 * a race while close blocks.
1667			 */
1668			fp = fdp->fd_ofiles[i];
1669			fdp->fd_ofiles[i] = NULL;
1670			fdp->fd_ofileflags[i] = 0;
1671			fdunused(fdp, i);
1672			FILEDESC_UNLOCK(fdp);
1673			(void) closef(fp, td);
1674			FILEDESC_LOCK(fdp);
1675		}
1676	}
1677	FILEDESC_UNLOCK(fdp);
1678}
1679
1680/*
1681 * Close any files on exec?
1682 */
1683void
1684fdcloseexec(td)
1685	struct thread *td;
1686{
1687	struct filedesc *fdp;
1688	int i;
1689
1690	/* Certain daemons might not have file descriptors. */
1691	fdp = td->td_proc->p_fd;
1692	if (fdp == NULL)
1693		return;
1694
1695	FILEDESC_LOCK(fdp);
1696
1697	/*
1698	 * We cannot cache fd_ofiles or fd_ofileflags since operations
1699	 * may block and rip them out from under us.
1700	 */
1701	for (i = 0; i <= fdp->fd_lastfile; i++) {
1702		if (fdp->fd_ofiles[i] != NULL &&
1703		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
1704			struct file *fp;
1705
1706			if (i < fdp->fd_knlistsize) {
1707				FILEDESC_UNLOCK(fdp);
1708				knote_fdclose(td, i);
1709				FILEDESC_LOCK(fdp);
1710			}
1711			/*
1712			 * NULL-out descriptor prior to close to avoid
1713			 * a race while close blocks.
1714			 */
1715			fp = fdp->fd_ofiles[i];
1716			fdp->fd_ofiles[i] = NULL;
1717			fdp->fd_ofileflags[i] = 0;
1718			fdunused(fdp, i);
1719			FILEDESC_UNLOCK(fdp);
1720			(void) closef(fp, td);
1721			FILEDESC_LOCK(fdp);
1722		}
1723	}
1724	FILEDESC_UNLOCK(fdp);
1725}
1726
1727/*
1728 * It is unsafe for set[ug]id processes to be started with file
1729 * descriptors 0..2 closed, as these descriptors are given implicit
1730 * significance in the Standard C library.  fdcheckstd() will create a
1731 * descriptor referencing /dev/null for each of stdin, stdout, and
1732 * stderr that is not already open.
1733 */
1734int
1735fdcheckstd(td)
1736	struct thread *td;
1737{
1738	struct nameidata nd;
1739	struct filedesc *fdp;
1740	struct file *fp;
1741	register_t retval;
1742	int fd, i, error, flags, devnull;
1743
1744	fdp = td->td_proc->p_fd;
1745	if (fdp == NULL)
1746		return (0);
1747	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
1748	devnull = -1;
1749	error = 0;
1750	for (i = 0; i < 3; i++) {
1751		if (fdp->fd_ofiles[i] != NULL)
1752			continue;
1753		if (devnull < 0) {
1754			error = falloc(td, &fp, &fd);
1755			if (error != 0)
1756				break;
1757			/* Note extra ref on `fp' held for us by falloc(). */
1758			KASSERT(fd == i, ("oof, we didn't get our fd"));
1759			NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null",
1760			    td);
1761			flags = FREAD | FWRITE;
1762			error = vn_open(&nd, &flags, 0, -1);
1763			if (error != 0) {
1764				/*
1765				 * Someone may have closed the entry in the
1766				 * file descriptor table, so check it hasn't
1767				 * changed before dropping the reference count.
1768				 */
1769				FILEDESC_LOCK(fdp);
1770				KASSERT(fdp->fd_ofiles[fd] == fp,
1771				    ("table not shared, how did it change?"));
1772				fdp->fd_ofiles[fd] = NULL;
1773				fdunused(fdp, fd);
1774				FILEDESC_UNLOCK(fdp);
1775				fdrop(fp, td);
1776				fdrop(fp, td);
1777				break;
1778			}
1779			NDFREE(&nd, NDF_ONLY_PNBUF);
1780			fp->f_vnode = nd.ni_vp;
1781			fp->f_data = nd.ni_vp;
1782			fp->f_flag = flags;
1783			fp->f_ops = &vnops;
1784			fp->f_type = DTYPE_VNODE;
1785			VOP_UNLOCK(nd.ni_vp, 0, td);
1786			devnull = fd;
1787			fdrop(fp, td);
1788		} else {
1789			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
1790			if (error != 0)
1791				break;
1792		}
1793	}
1794	return (error);
1795}
1796
1797/*
1798 * Internal form of close.
1799 * Decrement reference count on file structure.
1800 * Note: td may be NULL when closing a file
1801 * that was being passed in a message.
1802 */
1803int
1804closef(fp, td)
1805	struct file *fp;
1806	struct thread *td;
1807{
1808	struct vnode *vp;
1809	struct flock lf;
1810	struct filedesc_to_leader *fdtol;
1811	struct filedesc *fdp;
1812
1813	if (fp == NULL)
1814		return (0);
1815	/*
1816	 * POSIX record locking dictates that any close releases ALL
1817	 * locks owned by this process.  This is handled by setting
1818	 * a flag in the unlock to free ONLY locks obeying POSIX
1819	 * semantics, and not to free BSD-style file locks.
1820	 * If the descriptor was in a message, POSIX-style locks
1821	 * aren't passed with the descriptor.
1822	 */
1823	if (td != NULL &&
1824	    fp->f_type == DTYPE_VNODE) {
1825		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
1826			lf.l_whence = SEEK_SET;
1827			lf.l_start = 0;
1828			lf.l_len = 0;
1829			lf.l_type = F_UNLCK;
1830			vp = fp->f_vnode;
1831			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
1832					   F_UNLCK, &lf, F_POSIX);
1833		}
1834		fdtol = td->td_proc->p_fdtol;
1835		if (fdtol != NULL) {
1836			/*
1837			 * Handle special case where file descriptor table
1838			 * is shared between multiple process leaders.
1839			 */
1840			fdp = td->td_proc->p_fd;
1841			FILEDESC_LOCK(fdp);
1842			for (fdtol = fdtol->fdl_next;
1843			     fdtol != td->td_proc->p_fdtol;
1844			     fdtol = fdtol->fdl_next) {
1845				if ((fdtol->fdl_leader->p_flag &
1846				     P_ADVLOCK) == 0)
1847					continue;
1848				fdtol->fdl_holdcount++;
1849				FILEDESC_UNLOCK(fdp);
1850				lf.l_whence = SEEK_SET;
1851				lf.l_start = 0;
1852				lf.l_len = 0;
1853				lf.l_type = F_UNLCK;
1854				vp = fp->f_vnode;
1855				(void) VOP_ADVLOCK(vp,
1856						   (caddr_t)fdtol->fdl_leader,
1857						   F_UNLCK, &lf, F_POSIX);
1858				FILEDESC_LOCK(fdp);
1859				fdtol->fdl_holdcount--;
1860				if (fdtol->fdl_holdcount == 0 &&
1861				    fdtol->fdl_wakeup != 0) {
1862					fdtol->fdl_wakeup = 0;
1863					wakeup(fdtol);
1864				}
1865			}
1866			FILEDESC_UNLOCK(fdp);
1867		}
1868	}
1869	return (fdrop(fp, td));
1870}
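
/*
 * Illustrative userland consequence (not part of this file) of the POSIX
 * unlock-on-close rule implemented above: closing *any* descriptor that
 * refers to the file drops every POSIX lock the process holds on it.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
 *	int fd2 = dup(fd);
 *	fcntl(fd, F_SETLK, &fl);	acquire a POSIX lock via fd
 *	close(fd2);			releases that lock as well
 *
 * BSD-style flock() (F_FLOCK) locks are not affected by this path; they are
 * released in fdrop_locked() only when the last reference goes away.
 */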
1871
1872/*
1873 * Drop reference on struct file passed in, may call closef if the
1874 * reference hits zero.
1875 */
1876int
1877fdrop(fp, td)
1878	struct file *fp;
1879	struct thread *td;
1880{
1881
1882	FILE_LOCK(fp);
1883	return (fdrop_locked(fp, td));
1884}
1885
1886/*
1887 * Extract the file pointer associated with the specified descriptor for
1888 * the current user process.
1889 *
1890 * If the descriptor doesn't exist, EBADF is returned.
1891 *
1892 * If the descriptor exists but doesn't match 'flags' then
1893 * return EBADF for read attempts and EINVAL for write attempts.
1894 *
1895 * If 'hold' is set (non-zero), the file's refcount will be bumped on return.
1896 * It should be dropped with fdrop().
1897 * If it is not set, then the refcount will not be bumped; however, the
1898 * thread's filedesc struct will be returned locked (for fgetsock).
1899 *
1900 * If an error occurred, the non-zero error is returned and *fpp is set to NULL.
1901 * Otherwise *fpp is set and zero is returned.
1902 */
1903static __inline int
1904_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
1905{
1906	struct filedesc *fdp;
1907	struct file *fp;
1908
1909	*fpp = NULL;
1910	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
1911		return (EBADF);
1912	FILEDESC_LOCK(fdp);
1913	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
1914		FILEDESC_UNLOCK(fdp);
1915		return (EBADF);
1916	}
1917
1918	/*
1919	 * Note: FREAD failures return EBADF to maintain backwards
1920	 * compatibility with what routines returned before.
1921	 *
1922	 * Only one flag, or 0, may be specified.
1923	 */
1924	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
1925		FILEDESC_UNLOCK(fdp);
1926		return (EBADF);
1927	}
1928	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
1929		FILEDESC_UNLOCK(fdp);
1930		return (EINVAL);
1931	}
1932	if (hold) {
1933		fhold(fp);
1934		FILEDESC_UNLOCK(fdp);
1935	}
1936	*fpp = fp;
1937	return (0);
1938}
1939
1940int
1941fget(struct thread *td, int fd, struct file **fpp)
1942{
1943
1944	return(_fget(td, fd, fpp, 0, 1));
1945}
1946
1947int
1948fget_read(struct thread *td, int fd, struct file **fpp)
1949{
1950
1951	return(_fget(td, fd, fpp, FREAD, 1));
1952}
1953
1954int
1955fget_write(struct thread *td, int fd, struct file **fpp)
1956{
1957
1958	return(_fget(td, fd, fpp, FWRITE, 1));
1959}
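
/*
 * Typical in-kernel calling pattern for the fget() family above (compare
 * fstat() and flock() in this file): take a reference, use the file, then
 * release it with fdrop().
 *
 *	struct file *fp;
 *	int error;
 *
 *	if ((error = fget(td, fd, &fp)) != 0)
 *		return (error);
 *	... use fp (fo_read(), fo_stat(), ...) ...
 *	fdrop(fp, td);
 */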
1960
1961/*
1962 * Like fget() but loads the underlying vnode, or returns an error if
1963 * the descriptor does not represent a vnode.  Note that pipes use vnodes
1964 * but never have VM objects (so VOP_GETVOBJECT() calls will return an
1965 * error).  The returned vnode will be vref()d.
1966 */
1967static __inline int
1968_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
1969{
1970	struct file *fp;
1971	int error;
1972
1973	*vpp = NULL;
1974	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
1975		return (error);
1976	if (fp->f_vnode == NULL) {
1977		error = EINVAL;
1978	} else {
1979		*vpp = fp->f_vnode;
1980		vref(*vpp);
1981	}
1982	FILEDESC_UNLOCK(td->td_proc->p_fd);
1983	return (error);
1984}
1985
1986int
1987fgetvp(struct thread *td, int fd, struct vnode **vpp)
1988{
1989
1990	return (_fgetvp(td, fd, vpp, 0));
1991}
1992
1993int
1994fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
1995{
1996
1997	return (_fgetvp(td, fd, vpp, FREAD));
1998}
1999
2000int
2001fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
2002{
2003
2004	return (_fgetvp(td, fd, vpp, FWRITE));
2005}
2006
2007/*
2008 * Like fget() but loads the underlying socket, or returns an error if
2009 * the descriptor does not represent a socket.
2010 *
2011 * We bump the ref count on the returned socket.  XXX Also obtain the SX
2012 * lock in the future.
2013 */
2014int
2015fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
2016{
2017	struct file *fp;
2018	int error;
2019
2020	*spp = NULL;
2021	if (fflagp != NULL)
2022		*fflagp = 0;
2023	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
2024		return (error);
2025	if (fp->f_type != DTYPE_SOCKET) {
2026		error = ENOTSOCK;
2027	} else {
2028		*spp = fp->f_data;
2029		if (fflagp)
2030			*fflagp = fp->f_flag;
2031		soref(*spp);
2032	}
2033	FILEDESC_UNLOCK(td->td_proc->p_fd);
2034	return (error);
2035}
2036
2037/*
2038 * Drop the reference count on the socket and XXX release the SX lock in
2039 * the future.  The last reference closes the socket.
2040 */
2041void
2042fputsock(struct socket *so)
2043{
2044
2045	sorele(so);
2046}
2047
2048/*
2049 * Drop reference on struct file passed in, may call closef if the
2050 * reference hits zero.
2051 * Expects struct file locked, and will unlock it.
2052 */
2053int
2054fdrop_locked(fp, td)
2055	struct file *fp;
2056	struct thread *td;
2057{
2058	struct flock lf;
2059	struct vnode *vp;
2060	int error;
2061
2062	FILE_LOCK_ASSERT(fp, MA_OWNED);
2063
2064	if (--fp->f_count > 0) {
2065		FILE_UNLOCK(fp);
2066		return (0);
2067	}
2068	/* We have the last ref so we can proceed without the file lock. */
2069	FILE_UNLOCK(fp);
2070	mtx_lock(&Giant);
2071	if (fp->f_count < 0)
2072		panic("fdrop: count < 0");
2073	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
2074		lf.l_whence = SEEK_SET;
2075		lf.l_start = 0;
2076		lf.l_len = 0;
2077		lf.l_type = F_UNLCK;
2078		vp = fp->f_vnode;
2079		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2080	}
2081	if (fp->f_ops != &badfileops)
2082		error = fo_close(fp, td);
2083	else
2084		error = 0;
2085	ffree(fp);
2086	mtx_unlock(&Giant);
2087	return (error);
2088}
2089
2090/*
2091 * Apply an advisory lock on a file descriptor.
2092 *
2093 * Just attempt to get a record lock of the requested type on
2094 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2095 */
2096#ifndef _SYS_SYSPROTO_H_
2097struct flock_args {
2098	int	fd;
2099	int	how;
2100};
2101#endif
2102/*
2103 * MPSAFE
2104 */
2105/* ARGSUSED */
2106int
2107flock(td, uap)
2108	struct thread *td;
2109	struct flock_args *uap;
2110{
2111	struct file *fp;
2112	struct vnode *vp;
2113	struct flock lf;
2114	int error;
2115
2116	if ((error = fget(td, uap->fd, &fp)) != 0)
2117		return (error);
2118	if (fp->f_type != DTYPE_VNODE) {
2119		fdrop(fp, td);
2120		return (EOPNOTSUPP);
2121	}
2122
2123	mtx_lock(&Giant);
2124	vp = fp->f_vnode;
2125	lf.l_whence = SEEK_SET;
2126	lf.l_start = 0;
2127	lf.l_len = 0;
2128	if (uap->how & LOCK_UN) {
2129		lf.l_type = F_UNLCK;
2130		FILE_LOCK(fp);
2131		fp->f_flag &= ~FHASLOCK;
2132		FILE_UNLOCK(fp);
2133		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2134		goto done2;
2135	}
2136	if (uap->how & LOCK_EX)
2137		lf.l_type = F_WRLCK;
2138	else if (uap->how & LOCK_SH)
2139		lf.l_type = F_RDLCK;
2140	else {
2141		error = EBADF;
2142		goto done2;
2143	}
2144	FILE_LOCK(fp);
2145	fp->f_flag |= FHASLOCK;
2146	FILE_UNLOCK(fp);
2147	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2148	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2149done2:
2150	fdrop(fp, td);
2151	mtx_unlock(&Giant);
2152	return (error);
2153}
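
/*
 * Illustrative userland sketch (not part of this file) of the flock()
 * interface implemented above; these are whole-file, BSD-style (F_FLOCK)
 * locks, distinct from the POSIX record locks set via fcntl():
 *
 *	#include <sys/file.h>
 *
 *	if (flock(fd, LOCK_EX | LOCK_NB) == -1)
 *		... somebody else holds the lock ...
 *	... critical section ...
 *	flock(fd, LOCK_UN);
 */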
2154
2155/*
2156 * File Descriptor pseudo-device driver (/dev/fd/).
2157 *
2158 * Opening minor device N dup()s the file (if any) connected to file
2159 * descriptor N belonging to the calling process.  Note that this driver
2160 * consists of only the ``open()'' routine, because all subsequent
2161 * references to this file will be direct to the other driver.
2162 */
2163/* ARGSUSED */
2164static int
2165fdopen(dev, mode, type, td)
2166	dev_t dev;
2167	int mode, type;
2168	struct thread *td;
2169{
2170
2171	/*
2172	 * XXX Kludge: set curthread->td_dupfd to contain the value of
2173	 * the file descriptor being sought for duplication. The error
2174	 * return ensures that the vnode for this device will be released
2175	 * by vn_open. Open will detect this special error and take the
2176	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
2177	 * will simply report the error.
2178	 */
2179	td->td_dupfd = dev2unit(dev);
2180	return (ENODEV);
2181}
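
/*
 * Illustrative userland example (not part of this file): combined with
 * dupfdopen() below, opening /dev/fd/N behaves roughly like dup(N), as
 * long as the requested access mode is a subset of what descriptor N
 * already allows.
 *
 *	int nfd = open("/dev/fd/0", O_RDONLY);	(roughly dup(0))
 *
 * If descriptor 0 is open read-only, asking for O_RDWR instead fails
 * with EACCES; see the FREAD|FWRITE subset check in dupfdopen().
 */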
2182
2183/*
2184 * Duplicate the specified descriptor to a free descriptor.
2185 */
2186int
2187dupfdopen(td, fdp, indx, dfd, mode, error)
2188	struct thread *td;
2189	struct filedesc *fdp;
2190	int indx, dfd;
2191	int mode;
2192	int error;
2193{
2194	struct file *wfp;
2195	struct file *fp;
2196
2197	/*
2198	 * If the to-be-dup'd fd number is out of range for this process's
2199	 * descriptor table, or the fd to be dup'd has already been
2200	 * closed, then reject.
2201	 */
2202	FILEDESC_LOCK(fdp);
2203	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
2204	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
2205		FILEDESC_UNLOCK(fdp);
2206		return (EBADF);
2207	}
2208
2209	/*
2210	 * There are two cases of interest here.
2211	 *
2212	 * For ENODEV simply dup (dfd) to file descriptor
2213	 * (indx) and return.
2214	 *
2215	 * For ENXIO steal away the file structure from (dfd) and
2216	 * store it in (indx).  (dfd) is effectively closed by
2217	 * this operation.
2218	 *
2219	 * Any other error code is just returned.
2220	 */
2221	switch (error) {
2222	case ENODEV:
2223		/*
2224		 * Check that the mode the file is being opened for is a
2225		 * subset of the mode of the existing descriptor.
2226		 */
2227		FILE_LOCK(wfp);
2228		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
2229			FILE_UNLOCK(wfp);
2230			FILEDESC_UNLOCK(fdp);
2231			return (EACCES);
2232		}
2233		fp = fdp->fd_ofiles[indx];
2234		fdp->fd_ofiles[indx] = wfp;
2235		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2236		if (fp == NULL)
2237			fdused(fdp, indx);
2238		fhold_locked(wfp);
2239		FILE_UNLOCK(wfp);
2240		if (fp != NULL)
2241			FILE_LOCK(fp);
2242		FILEDESC_UNLOCK(fdp);
2243		/*
2244		 * We now own the reference to fp that the ofiles[] array
2245		 * used to own.  Release it.
2246		 */
2247		if (fp != NULL)
2248			fdrop_locked(fp, td);
2249		return (0);
2250
2251	case ENXIO:
2252		/*
2253		 * Steal away the file pointer from dfd and stuff it into indx.
2254		 */
2255		fp = fdp->fd_ofiles[indx];
2256		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2257		fdp->fd_ofiles[dfd] = NULL;
2258		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
2259		fdp->fd_ofileflags[dfd] = 0;
2260		fdunused(fdp, dfd);
2261		if (fp == NULL)
2262			fdused(fdp, indx);
2263		if (fp != NULL)
2264			FILE_LOCK(fp);
2265		FILEDESC_UNLOCK(fdp);
2266
2267		/*
2268		 * We now own the reference to fp that the ofiles[] array
2269		 * used to own.  Release it.
2270		 */
2271		if (fp != NULL)
2272			fdrop_locked(fp, td);
2273		return (0);
2274
2275	default:
2276		FILEDESC_UNLOCK(fdp);
2277		return (error);
2278	}
2279	/* NOTREACHED */
2280}
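
/*
 * Illustrative sketch (not compiled): the caller side of the kludge
 * described in fdopen() above sits in the open() path (kern_open());
 * roughly, when vn_open() fails with one of the two special errors and
 * td_dupfd has been set, the open is retried as a dup:
 *
 *	if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0)
 *		error = dupfdopen(td, fdp, indx, td->td_dupfd,
 *		    flags, error);
 */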
2281
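/*
 * Allocate a filedesc_to_leader structure tied to the given process
 * leader; if an existing structure is passed in, link the new one into
 * its circular list under the filedesc lock.
 */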
2282struct filedesc_to_leader *
2283filedesc_to_leader_alloc(struct filedesc_to_leader *old,
2284			 struct filedesc *fdp,
2285			 struct proc *leader)
2286{
2287	struct filedesc_to_leader *fdtol;
2288
2289	MALLOC(fdtol, struct filedesc_to_leader *,
2290	       sizeof(struct filedesc_to_leader),
2291	       M_FILEDESC_TO_LEADER,
2292	       M_WAITOK);
2293	fdtol->fdl_refcount = 1;
2294	fdtol->fdl_holdcount = 0;
2295	fdtol->fdl_wakeup = 0;
2296	fdtol->fdl_leader = leader;
2297	if (old != NULL) {
2298		FILEDESC_LOCK(fdp);
2299		fdtol->fdl_next = old->fdl_next;
2300		fdtol->fdl_prev = old;
2301		old->fdl_next = fdtol;
2302		fdtol->fdl_next->fdl_prev = fdtol;
2303		FILEDESC_UNLOCK(fdp);
2304	} else {
2305		fdtol->fdl_next = fdtol;
2306		fdtol->fdl_prev = fdtol;
2307	}
2308	return (fdtol);
2309}
2310
2311/*
2312 * Get file structures.
2313 */
2314static int
2315sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2316{
2317	struct xfile xf;
2318	struct filedesc *fdp;
2319	struct file *fp;
2320	struct proc *p;
2321	int error, n;
2322
2323	/*
2324	 * Note: because the number of file descriptors is calculated
2325	 * in different ways for sizing vs returning the data,
2326	 * there is information leakage from the first loop.  However,
2327	 * it is of a similar order of magnitude to the leakage from
2328	 * global system statistics such as kern.openfiles.
2329	 */
2330	sysctl_wire_old_buffer(req, 0);
2331	if (req->oldptr == NULL) {
2332		n = 16;		/* A slight overestimate. */
2333		sx_slock(&filelist_lock);
2334		LIST_FOREACH(fp, &filehead, f_list) {
2335			/*
2336			 * We should grab the file lock, but this is only
2337			 * an estimate, so the unlocked read is acceptable.
2338			 */
2339			/* mtx_lock(fp->f_mtxp); */
2340			n += fp->f_count;
2341			/* mtx_unlock(fp->f_mtxp); */
2342		}
2343		sx_sunlock(&filelist_lock);
2344		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2345	}
2346	error = 0;
2347	bzero(&xf, sizeof(xf));
2348	xf.xf_size = sizeof(xf);
2349	sx_slock(&allproc_lock);
2350	LIST_FOREACH(p, &allproc, p_list) {
2351		PROC_LOCK(p);
2352		if (p_cansee(req->td, p) != 0) {
2353			PROC_UNLOCK(p);
2354			continue;
2355		}
2356		xf.xf_pid = p->p_pid;
2357		xf.xf_uid = p->p_ucred->cr_uid;
2358		PROC_UNLOCK(p);
2359		mtx_lock(&fdesc_mtx);
2360		if ((fdp = p->p_fd) == NULL) {
2361			mtx_unlock(&fdesc_mtx);
2362			continue;
2363		}
2364		FILEDESC_LOCK(fdp);
2365		for (n = 0; n < fdp->fd_nfiles; ++n) {
2366			if ((fp = fdp->fd_ofiles[n]) == NULL)
2367				continue;
2368			xf.xf_fd = n;
2369			xf.xf_file = fp;
2370			xf.xf_data = fp->f_data;
2371			xf.xf_type = fp->f_type;
2372			xf.xf_count = fp->f_count;
2373			xf.xf_msgcount = fp->f_msgcount;
2374			xf.xf_offset = fp->f_offset;
2375			xf.xf_flag = fp->f_flag;
2376			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2377			if (error)
2378				break;
2379		}
2380		FILEDESC_UNLOCK(fdp);
2381		mtx_unlock(&fdesc_mtx);
2382		if (error)
2383			break;
2384	}
2385	sx_sunlock(&allproc_lock);
2386	return (error);
2387}
2388
2389SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
2390    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
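
/*
 * Illustrative userland example (not part of this file): kern.file
 * returns an array of struct xfile records; struct xfile is assumed to
 * be picked up from <sys/user.h>.
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/user.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct xfile *xf;
 *		size_t len, i;
 *
 *		if (sysctlbyname("kern.file", NULL, &len, NULL, 0) == -1)
 *			err(1, "kern.file size");
 *		if ((xf = malloc(len)) == NULL)
 *			err(1, "malloc");
 *		if (sysctlbyname("kern.file", xf, &len, NULL, 0) == -1)
 *			err(1, "kern.file data");
 *		for (i = 0; i < len / sizeof(*xf); i++)
 *			printf("pid %d fd %d\n", (int)xf[i].xf_pid,
 *			    xf[i].xf_fd);
 *		free(xf);
 *		return (0);
 *	}
 */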
2391
2392SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
2393    &maxfilesperproc, 0, "Maximum files allowed open per process");
2394
2395SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
2396    &maxfiles, 0, "Maximum number of files");
2397
2398SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
2399    &nfiles, 0, "System-wide number of open files");
2400
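/*
 * Create the /dev/fd/0, /dev/fd/1 and /dev/fd/2 nodes at driver
 * initialization time, along with the conventional /dev/stdin,
 * /dev/stdout and /dev/stderr aliases.
 */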
2401static void
2402fildesc_drvinit(void *unused)
2403{
2404	dev_t dev;
2405
2406	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
2407	make_dev_alias(dev, "stdin");
2408	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
2409	make_dev_alias(dev, "stdout");
2410	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
2411	make_dev_alias(dev, "stderr");
2412}
2413
2414static fo_rdwr_t	badfo_readwrite;
2415static fo_ioctl_t	badfo_ioctl;
2416static fo_poll_t	badfo_poll;
2417static fo_kqfilter_t	badfo_kqfilter;
2418static fo_stat_t	badfo_stat;
2419static fo_close_t	badfo_close;
2420
2421struct fileops badfileops = {
2422	.fo_read = badfo_readwrite,
2423	.fo_write = badfo_readwrite,
2424	.fo_ioctl = badfo_ioctl,
2425	.fo_poll = badfo_poll,
2426	.fo_kqfilter = badfo_kqfilter,
2427	.fo_stat = badfo_stat,
2428	.fo_close = badfo_close,
2429};
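
/*
 * Illustrative sketch (not compiled): badfileops is installed on a
 * struct file whose backing object is gone, so the fo_*() wrappers
 * (assumed to dispatch through f_ops, along the lines of the call
 * below) end up in the badfo_*() stubs instead of touching freed
 * state:
 *
 *	return ((*fp->f_ops->fo_close)(fp, td));
 */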
2430
2431static int
2432badfo_readwrite(fp, uio, active_cred, flags, td)
2433	struct file *fp;
2434	struct uio *uio;
2435	struct ucred *active_cred;
2436	int flags;
2437	struct thread *td;
2438{
2439
2440	return (EBADF);
2441}
2442
2443static int
2444badfo_ioctl(fp, com, data, active_cred, td)
2445	struct file *fp;
2446	u_long com;
2447	void *data;
2448	struct ucred *active_cred;
2449	struct thread *td;
2450{
2451
2452	return (EBADF);
2453}
2454
2455static int
2456badfo_poll(fp, events, active_cred, td)
2457	struct file *fp;
2458	int events;
2459	struct ucred *active_cred;
2460	struct thread *td;
2461{
2462
2463	return (0);
2464}
2465
2466static int
2467badfo_kqfilter(fp, kn)
2468	struct file *fp;
2469	struct knote *kn;
2470{
2471
2472	return (0);
2473}
2474
2475static int
2476badfo_stat(fp, sb, active_cred, td)
2477	struct file *fp;
2478	struct stat *sb;
2479	struct ucred *active_cred;
2480	struct thread *td;
2481{
2482
2483	return (EBADF);
2484}
2485
2486static int
2487badfo_close(fp, td)
2488	struct file *fp;
2489	struct thread *td;
2490{
2491
2492	return (EBADF);
2493}
2494
2495SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR,
2496    fildesc_drvinit, NULL)
2497
2498static void filelistinit(void *);
2499SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
2500
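/*
 * Set up the UMA zone used for struct file allocations and initialize
 * the global filelist and sigio locks.
 */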
2501/* ARGSUSED */
2502static void
2503filelistinit(dummy)
2504	void *dummy;
2505{
2506
2507	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
2508	    NULL, NULL, UMA_ALIGN_PTR, 0);
2509	sx_init(&filelist_lock, "filelist lock");
2510	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
2511}
2512