kern_descrip.c revision 236950
1/*- 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 35 */ 36 37#include <sys/cdefs.h> 38__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 236950 2012-06-12 10:25:11Z pjd $"); 39 40#include "opt_capsicum.h" 41#include "opt_compat.h" 42#include "opt_ddb.h" 43#include "opt_ktrace.h" 44#include "opt_procdesc.h" 45 46#include <sys/param.h> 47#include <sys/systm.h> 48 49#include <sys/capability.h> 50#include <sys/conf.h> 51#include <sys/domain.h> 52#include <sys/fcntl.h> 53#include <sys/file.h> 54#include <sys/filedesc.h> 55#include <sys/filio.h> 56#include <sys/jail.h> 57#include <sys/kernel.h> 58#include <sys/limits.h> 59#include <sys/lock.h> 60#include <sys/malloc.h> 61#include <sys/mman.h> 62#include <sys/mount.h> 63#include <sys/mqueue.h> 64#include <sys/mutex.h> 65#include <sys/namei.h> 66#include <sys/selinfo.h> 67#include <sys/pipe.h> 68#include <sys/priv.h> 69#include <sys/proc.h> 70#include <sys/procdesc.h> 71#include <sys/protosw.h> 72#include <sys/racct.h> 73#include <sys/resourcevar.h> 74#include <sys/signalvar.h> 75#include <sys/socketvar.h> 76#include <sys/stat.h> 77#include <sys/sx.h> 78#include <sys/syscallsubr.h> 79#include <sys/sysctl.h> 80#include <sys/sysproto.h> 81#include <sys/tty.h> 82#include <sys/unistd.h> 83#include <sys/un.h> 84#include <sys/unpcb.h> 85#include <sys/user.h> 86#include <sys/vnode.h> 87#ifdef KTRACE 88#include <sys/ktrace.h> 89#endif 90 91#include <net/vnet.h> 92 93#include <netinet/in.h> 94#include <netinet/in_pcb.h> 95 96#include <security/audit/audit.h> 97 98#include <vm/uma.h> 99#include <vm/vm.h> 100 101#include <ddb/ddb.h> 102 103static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); 104static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", 105 "file desc to leader structures"); 106static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); 107 108MALLOC_DECLARE(M_FADVISE); 109 110static uma_zone_t file_zone; 111 112 113/* Flags for do_dup() */ 114#define DUP_FIXED 0x1 /* Force fixed allocation */ 115#define DUP_FCNTL 0x2 /* fcntl()-style errors */ 116 117static int closefp(struct filedesc *fdp, int fd, struct file *fp, 118 struct thread *td); 119static int do_dup(struct thread *td, int flags, int old, int new, 120 register_t *retval); 121static int fd_first_free(struct filedesc *, int, int); 122static int fd_last_used(struct filedesc *, int, int); 123static void fdgrowtable(struct filedesc *, int); 124static void fdunused(struct filedesc *fdp, int fd); 125static void fdused(struct filedesc *fdp, int fd); 126static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif); 127static int fill_socket_info(struct socket *so, struct kinfo_file *kif); 128static int fill_pts_info(struct tty *tp, struct kinfo_file *kif); 129static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif); 130static int fill_procdesc_info(struct procdesc *pdp, 131 struct kinfo_file *kif); 132static int fill_shm_info(struct file *fp, struct kinfo_file *kif); 133 134/* 135 * A process is initially started out with NDFILE descriptors stored within 136 * this structure, selected to be enough for typical applications based on 137 * the historical limit of 20 open files (and the usage of descriptors by 138 * shells). If these descriptors are exhausted, a larger descriptor table 139 * may be allocated, up to a process' resource limit; the internal arrays 140 * are then unused. 141 */ 142#define NDFILE 20 143#define NDSLOTSIZE sizeof(NDSLOTTYPE) 144#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) 145#define NDSLOT(x) ((x) / NDENTRIES) 146#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) 147#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) 148 149/* 150 * Storage required per open file descriptor. 151 */ 152#define OFILESIZE (sizeof(struct file *) + sizeof(char)) 153 154/* 155 * Storage to hold unused ofiles that need to be reclaimed. 156 */ 157struct freetable { 158 struct file **ft_table; 159 SLIST_ENTRY(freetable) ft_next; 160}; 161 162/* 163 * Basic allocation of descriptors: 164 * one of the above, plus arrays for NDFILE descriptors. 165 */ 166struct filedesc0 { 167 struct filedesc fd_fd; 168 /* 169 * ofiles which need to be reclaimed on free. 170 */ 171 SLIST_HEAD(,freetable) fd_free; 172 /* 173 * These arrays are used when the number of open files is 174 * <= NDFILE, and are then pointed to by the pointers above. 175 */ 176 struct file *fd_dfiles[NDFILE]; 177 char fd_dfileflags[NDFILE]; 178 NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; 179}; 180 181/* 182 * Descriptor management. 183 */ 184volatile int openfiles; /* actual number of open files */ 185struct mtx sigio_lock; /* mtx to protect pointers to sigio */ 186void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); 187 188/* A mutex to protect the association between a proc and filedesc. */ 189static struct mtx fdesc_mtx; 190 191/* 192 * Find the first zero bit in the given bitmap, starting at low and not 193 * exceeding size - 1. 194 */ 195static int 196fd_first_free(struct filedesc *fdp, int low, int size) 197{ 198 NDSLOTTYPE *map = fdp->fd_map; 199 NDSLOTTYPE mask; 200 int off, maxoff; 201 202 if (low >= size) 203 return (low); 204 205 off = NDSLOT(low); 206 if (low % NDENTRIES) { 207 mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); 208 if ((mask &= ~map[off]) != 0UL) 209 return (off * NDENTRIES + ffsl(mask) - 1); 210 ++off; 211 } 212 for (maxoff = NDSLOTS(size); off < maxoff; ++off) 213 if (map[off] != ~0UL) 214 return (off * NDENTRIES + ffsl(~map[off]) - 1); 215 return (size); 216} 217 218/* 219 * Find the highest non-zero bit in the given bitmap, starting at low and 220 * not exceeding size - 1. 221 */ 222static int 223fd_last_used(struct filedesc *fdp, int low, int size) 224{ 225 NDSLOTTYPE *map = fdp->fd_map; 226 NDSLOTTYPE mask; 227 int off, minoff; 228 229 if (low >= size) 230 return (-1); 231 232 off = NDSLOT(size); 233 if (size % NDENTRIES) { 234 mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); 235 if ((mask &= map[off]) != 0) 236 return (off * NDENTRIES + flsl(mask) - 1); 237 --off; 238 } 239 for (minoff = NDSLOT(low); off >= minoff; --off) 240 if (map[off] != 0) 241 return (off * NDENTRIES + flsl(map[off]) - 1); 242 return (low - 1); 243} 244 245static int 246fdisused(struct filedesc *fdp, int fd) 247{ 248 KASSERT((unsigned int)fd < fdp->fd_nfiles, 249 ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); 250 return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); 251} 252 253/* 254 * Mark a file descriptor as used. 255 */ 256static void 257fdused(struct filedesc *fdp, int fd) 258{ 259 260 FILEDESC_XLOCK_ASSERT(fdp); 261 KASSERT(!fdisused(fdp, fd), 262 ("fd already used")); 263 264 fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); 265 if (fd > fdp->fd_lastfile) 266 fdp->fd_lastfile = fd; 267 if (fd == fdp->fd_freefile) 268 fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); 269} 270 271/* 272 * Mark a file descriptor as unused. 273 */ 274static void 275fdunused(struct filedesc *fdp, int fd) 276{ 277 278 FILEDESC_XLOCK_ASSERT(fdp); 279 KASSERT(fdisused(fdp, fd), 280 ("fd is already unused")); 281 KASSERT(fdp->fd_ofiles[fd] == NULL, 282 ("fd is still in use")); 283 284 fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); 285 if (fd < fdp->fd_freefile) 286 fdp->fd_freefile = fd; 287 if (fd == fdp->fd_lastfile) 288 fdp->fd_lastfile = fd_last_used(fdp, 0, fd); 289} 290 291/* 292 * System calls on descriptors. 293 */ 294#ifndef _SYS_SYSPROTO_H_ 295struct getdtablesize_args { 296 int dummy; 297}; 298#endif 299/* ARGSUSED */ 300int 301sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) 302{ 303 struct proc *p = td->td_proc; 304 uint64_t lim; 305 306 PROC_LOCK(p); 307 td->td_retval[0] = 308 min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 309 lim = racct_get_limit(td->td_proc, RACCT_NOFILE); 310 PROC_UNLOCK(p); 311 if (lim < td->td_retval[0]) 312 td->td_retval[0] = lim; 313 return (0); 314} 315 316/* 317 * Duplicate a file descriptor to a particular value. 318 * 319 * Note: keep in mind that a potential race condition exists when closing 320 * descriptors from a shared descriptor table (via rfork). 321 */ 322#ifndef _SYS_SYSPROTO_H_ 323struct dup2_args { 324 u_int from; 325 u_int to; 326}; 327#endif 328/* ARGSUSED */ 329int 330sys_dup2(struct thread *td, struct dup2_args *uap) 331{ 332 333 return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, 334 td->td_retval)); 335} 336 337/* 338 * Duplicate a file descriptor. 339 */ 340#ifndef _SYS_SYSPROTO_H_ 341struct dup_args { 342 u_int fd; 343}; 344#endif 345/* ARGSUSED */ 346int 347sys_dup(struct thread *td, struct dup_args *uap) 348{ 349 350 return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval)); 351} 352 353/* 354 * The file control system call. 355 */ 356#ifndef _SYS_SYSPROTO_H_ 357struct fcntl_args { 358 int fd; 359 int cmd; 360 long arg; 361}; 362#endif 363/* ARGSUSED */ 364int 365sys_fcntl(struct thread *td, struct fcntl_args *uap) 366{ 367 struct flock fl; 368 struct oflock ofl; 369 intptr_t arg; 370 int error; 371 int cmd; 372 373 error = 0; 374 cmd = uap->cmd; 375 switch (uap->cmd) { 376 case F_OGETLK: 377 case F_OSETLK: 378 case F_OSETLKW: 379 /* 380 * Convert old flock structure to new. 381 */ 382 error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl)); 383 fl.l_start = ofl.l_start; 384 fl.l_len = ofl.l_len; 385 fl.l_pid = ofl.l_pid; 386 fl.l_type = ofl.l_type; 387 fl.l_whence = ofl.l_whence; 388 fl.l_sysid = 0; 389 390 switch (uap->cmd) { 391 case F_OGETLK: 392 cmd = F_GETLK; 393 break; 394 case F_OSETLK: 395 cmd = F_SETLK; 396 break; 397 case F_OSETLKW: 398 cmd = F_SETLKW; 399 break; 400 } 401 arg = (intptr_t)&fl; 402 break; 403 case F_GETLK: 404 case F_SETLK: 405 case F_SETLKW: 406 case F_SETLK_REMOTE: 407 error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); 408 arg = (intptr_t)&fl; 409 break; 410 default: 411 arg = uap->arg; 412 break; 413 } 414 if (error) 415 return (error); 416 error = kern_fcntl(td, uap->fd, cmd, arg); 417 if (error) 418 return (error); 419 if (uap->cmd == F_OGETLK) { 420 ofl.l_start = fl.l_start; 421 ofl.l_len = fl.l_len; 422 ofl.l_pid = fl.l_pid; 423 ofl.l_type = fl.l_type; 424 ofl.l_whence = fl.l_whence; 425 error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl)); 426 } else if (uap->cmd == F_GETLK) { 427 error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); 428 } 429 return (error); 430} 431 432static inline struct file * 433fdtofp(int fd, struct filedesc *fdp) 434{ 435 436 FILEDESC_LOCK_ASSERT(fdp); 437 438 if ((unsigned)fd >= fdp->fd_nfiles) 439 return (NULL); 440 441 return (fdp->fd_ofiles[fd]); 442} 443 444static inline int 445fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp) 446{ 447 448 *fpp = fdtofp(fd, fdp); 449 if (*fpp == NULL) 450 return (EBADF); 451 452#ifdef CAPABILITIES 453 if ((*fpp)->f_type == DTYPE_CAPABILITY) { 454 int err = cap_funwrap(*fpp, rights, fpp); 455 if (err != 0) { 456 *fpp = NULL; 457 return (err); 458 } 459 } 460#endif /* CAPABILITIES */ 461 return (0); 462} 463 464int 465kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) 466{ 467 struct filedesc *fdp; 468 struct flock *flp; 469 struct file *fp; 470 struct proc *p; 471 char *pop; 472 struct vnode *vp; 473 int error, flg, tmp; 474 int vfslocked; 475 u_int old, new; 476 uint64_t bsize; 477 478 vfslocked = 0; 479 error = 0; 480 flg = F_POSIX; 481 p = td->td_proc; 482 fdp = p->p_fd; 483 484 switch (cmd) { 485 case F_DUPFD: 486 tmp = arg; 487 error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval); 488 break; 489 490 case F_DUP2FD: 491 tmp = arg; 492 error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval); 493 break; 494 495 case F_GETFD: 496 FILEDESC_SLOCK(fdp); 497 if ((fp = fdtofp(fd, fdp)) == NULL) { 498 FILEDESC_SUNLOCK(fdp); 499 error = EBADF; 500 break; 501 } 502 pop = &fdp->fd_ofileflags[fd]; 503 td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; 504 FILEDESC_SUNLOCK(fdp); 505 break; 506 507 case F_SETFD: 508 FILEDESC_XLOCK(fdp); 509 if ((fp = fdtofp(fd, fdp)) == NULL) { 510 FILEDESC_XUNLOCK(fdp); 511 error = EBADF; 512 break; 513 } 514 pop = &fdp->fd_ofileflags[fd]; 515 *pop = (*pop &~ UF_EXCLOSE) | 516 (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); 517 FILEDESC_XUNLOCK(fdp); 518 break; 519 520 case F_GETFL: 521 FILEDESC_SLOCK(fdp); 522 error = fdunwrap(fd, CAP_FCNTL, fdp, &fp); 523 if (error != 0) { 524 FILEDESC_SUNLOCK(fdp); 525 break; 526 } 527 td->td_retval[0] = OFLAGS(fp->f_flag); 528 FILEDESC_SUNLOCK(fdp); 529 break; 530 531 case F_SETFL: 532 FILEDESC_SLOCK(fdp); 533 error = fdunwrap(fd, CAP_FCNTL, fdp, &fp); 534 if (error != 0) { 535 FILEDESC_SUNLOCK(fdp); 536 break; 537 } 538 fhold(fp); 539 FILEDESC_SUNLOCK(fdp); 540 do { 541 tmp = flg = fp->f_flag; 542 tmp &= ~FCNTLFLAGS; 543 tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; 544 } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); 545 tmp = fp->f_flag & FNONBLOCK; 546 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 547 if (error) { 548 fdrop(fp, td); 549 break; 550 } 551 tmp = fp->f_flag & FASYNC; 552 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 553 if (error == 0) { 554 fdrop(fp, td); 555 break; 556 } 557 atomic_clear_int(&fp->f_flag, FNONBLOCK); 558 tmp = 0; 559 (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 560 fdrop(fp, td); 561 break; 562 563 case F_GETOWN: 564 FILEDESC_SLOCK(fdp); 565 error = fdunwrap(fd, CAP_FCNTL, fdp, &fp); 566 if (error != 0) { 567 FILEDESC_SUNLOCK(fdp); 568 break; 569 } 570 fhold(fp); 571 FILEDESC_SUNLOCK(fdp); 572 error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); 573 if (error == 0) 574 td->td_retval[0] = tmp; 575 fdrop(fp, td); 576 break; 577 578 case F_SETOWN: 579 FILEDESC_SLOCK(fdp); 580 error = fdunwrap(fd, CAP_FCNTL, fdp, &fp); 581 if (error != 0) { 582 FILEDESC_SUNLOCK(fdp); 583 break; 584 } 585 fhold(fp); 586 FILEDESC_SUNLOCK(fdp); 587 tmp = arg; 588 error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); 589 fdrop(fp, td); 590 break; 591 592 case F_SETLK_REMOTE: 593 error = priv_check(td, PRIV_NFS_LOCKD); 594 if (error) 595 return (error); 596 flg = F_REMOTE; 597 goto do_setlk; 598 599 case F_SETLKW: 600 flg |= F_WAIT; 601 /* FALLTHROUGH F_SETLK */ 602 603 case F_SETLK: 604 do_setlk: 605 FILEDESC_SLOCK(fdp); 606 error = fdunwrap(fd, CAP_FLOCK, fdp, &fp); 607 if (error != 0) { 608 FILEDESC_SUNLOCK(fdp); 609 break; 610 } 611 if (fp->f_type != DTYPE_VNODE) { 612 FILEDESC_SUNLOCK(fdp); 613 error = EBADF; 614 break; 615 } 616 flp = (struct flock *)arg; 617 if (flp->l_whence == SEEK_CUR) { 618 if (fp->f_offset < 0 || 619 (flp->l_start > 0 && 620 fp->f_offset > OFF_MAX - flp->l_start)) { 621 FILEDESC_SUNLOCK(fdp); 622 error = EOVERFLOW; 623 break; 624 } 625 flp->l_start += fp->f_offset; 626 } 627 628 /* 629 * VOP_ADVLOCK() may block. 630 */ 631 fhold(fp); 632 FILEDESC_SUNLOCK(fdp); 633 vp = fp->f_vnode; 634 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 635 switch (flp->l_type) { 636 case F_RDLCK: 637 if ((fp->f_flag & FREAD) == 0) { 638 error = EBADF; 639 break; 640 } 641 PROC_LOCK(p->p_leader); 642 p->p_leader->p_flag |= P_ADVLOCK; 643 PROC_UNLOCK(p->p_leader); 644 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 645 flp, flg); 646 break; 647 case F_WRLCK: 648 if ((fp->f_flag & FWRITE) == 0) { 649 error = EBADF; 650 break; 651 } 652 PROC_LOCK(p->p_leader); 653 p->p_leader->p_flag |= P_ADVLOCK; 654 PROC_UNLOCK(p->p_leader); 655 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 656 flp, flg); 657 break; 658 case F_UNLCK: 659 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, 660 flp, flg); 661 break; 662 case F_UNLCKSYS: 663 /* 664 * Temporary api for testing remote lock 665 * infrastructure. 666 */ 667 if (flg != F_REMOTE) { 668 error = EINVAL; 669 break; 670 } 671 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 672 F_UNLCKSYS, flp, flg); 673 break; 674 default: 675 error = EINVAL; 676 break; 677 } 678 VFS_UNLOCK_GIANT(vfslocked); 679 vfslocked = 0; 680 /* Check for race with close */ 681 FILEDESC_SLOCK(fdp); 682 if ((unsigned) fd >= fdp->fd_nfiles || 683 fp != fdp->fd_ofiles[fd]) { 684 FILEDESC_SUNLOCK(fdp); 685 flp->l_whence = SEEK_SET; 686 flp->l_start = 0; 687 flp->l_len = 0; 688 flp->l_type = F_UNLCK; 689 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 690 (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 691 F_UNLCK, flp, F_POSIX); 692 VFS_UNLOCK_GIANT(vfslocked); 693 vfslocked = 0; 694 } else 695 FILEDESC_SUNLOCK(fdp); 696 fdrop(fp, td); 697 break; 698 699 case F_GETLK: 700 FILEDESC_SLOCK(fdp); 701 error = fdunwrap(fd, CAP_FLOCK, fdp, &fp); 702 if (error != 0) { 703 FILEDESC_SUNLOCK(fdp); 704 break; 705 } 706 if (fp->f_type != DTYPE_VNODE) { 707 FILEDESC_SUNLOCK(fdp); 708 error = EBADF; 709 break; 710 } 711 flp = (struct flock *)arg; 712 if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && 713 flp->l_type != F_UNLCK) { 714 FILEDESC_SUNLOCK(fdp); 715 error = EINVAL; 716 break; 717 } 718 if (flp->l_whence == SEEK_CUR) { 719 if ((flp->l_start > 0 && 720 fp->f_offset > OFF_MAX - flp->l_start) || 721 (flp->l_start < 0 && 722 fp->f_offset < OFF_MIN - flp->l_start)) { 723 FILEDESC_SUNLOCK(fdp); 724 error = EOVERFLOW; 725 break; 726 } 727 flp->l_start += fp->f_offset; 728 } 729 /* 730 * VOP_ADVLOCK() may block. 731 */ 732 fhold(fp); 733 FILEDESC_SUNLOCK(fdp); 734 vp = fp->f_vnode; 735 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 736 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, 737 F_POSIX); 738 VFS_UNLOCK_GIANT(vfslocked); 739 vfslocked = 0; 740 fdrop(fp, td); 741 break; 742 743 case F_RDAHEAD: 744 arg = arg ? 128 * 1024: 0; 745 /* FALLTHROUGH */ 746 case F_READAHEAD: 747 FILEDESC_SLOCK(fdp); 748 if ((fp = fdtofp(fd, fdp)) == NULL) { 749 FILEDESC_SUNLOCK(fdp); 750 error = EBADF; 751 break; 752 } 753 if (fp->f_type != DTYPE_VNODE) { 754 FILEDESC_SUNLOCK(fdp); 755 error = EBADF; 756 break; 757 } 758 fhold(fp); 759 FILEDESC_SUNLOCK(fdp); 760 if (arg != 0) { 761 vp = fp->f_vnode; 762 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 763 error = vn_lock(vp, LK_SHARED); 764 if (error != 0) 765 goto readahead_vnlock_fail; 766 bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; 767 VOP_UNLOCK(vp, 0); 768 fp->f_seqcount = (arg + bsize - 1) / bsize; 769 do { 770 new = old = fp->f_flag; 771 new |= FRDAHEAD; 772 } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new)); 773readahead_vnlock_fail: 774 VFS_UNLOCK_GIANT(vfslocked); 775 vfslocked = 0; 776 } else { 777 do { 778 new = old = fp->f_flag; 779 new &= ~FRDAHEAD; 780 } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new)); 781 } 782 fdrop(fp, td); 783 break; 784 785 default: 786 error = EINVAL; 787 break; 788 } 789 VFS_UNLOCK_GIANT(vfslocked); 790 return (error); 791} 792 793/* 794 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). 795 */ 796static int 797do_dup(struct thread *td, int flags, int old, int new, 798 register_t *retval) 799{ 800 struct filedesc *fdp; 801 struct proc *p; 802 struct file *fp; 803 struct file *delfp; 804 int error, maxfd; 805 806 p = td->td_proc; 807 fdp = p->p_fd; 808 809 /* 810 * Verify we have a valid descriptor to dup from and possibly to 811 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should 812 * return EINVAL when the new descriptor is out of bounds. 813 */ 814 if (old < 0) 815 return (EBADF); 816 if (new < 0) 817 return (flags & DUP_FCNTL ? EINVAL : EBADF); 818 PROC_LOCK(p); 819 maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 820 PROC_UNLOCK(p); 821 if (new >= maxfd) 822 return (flags & DUP_FCNTL ? EINVAL : EBADF); 823 824 FILEDESC_XLOCK(fdp); 825 if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { 826 FILEDESC_XUNLOCK(fdp); 827 return (EBADF); 828 } 829 if (flags & DUP_FIXED && old == new) { 830 *retval = new; 831 FILEDESC_XUNLOCK(fdp); 832 return (0); 833 } 834 fp = fdp->fd_ofiles[old]; 835 fhold(fp); 836 837 /* 838 * If the caller specified a file descriptor, make sure the file 839 * table is large enough to hold it, and grab it. Otherwise, just 840 * allocate a new descriptor the usual way. Since the filedesc 841 * lock may be temporarily dropped in the process, we have to look 842 * out for a race. 843 */ 844 if (flags & DUP_FIXED) { 845 if (new >= fdp->fd_nfiles) { 846 /* 847 * The resource limits are here instead of e.g. 848 * fdalloc(), because the file descriptor table may be 849 * shared between processes, so we can't really use 850 * racct_add()/racct_sub(). Instead of counting the 851 * number of actually allocated descriptors, just put 852 * the limit on the size of the file descriptor table. 853 */ 854#ifdef RACCT 855 PROC_LOCK(p); 856 error = racct_set(p, RACCT_NOFILE, new + 1); 857 PROC_UNLOCK(p); 858 if (error != 0) { 859 FILEDESC_XUNLOCK(fdp); 860 fdrop(fp, td); 861 return (EMFILE); 862 } 863#endif 864 fdgrowtable(fdp, new + 1); 865 } 866 if (fdp->fd_ofiles[new] == NULL) 867 fdused(fdp, new); 868 } else { 869 if ((error = fdalloc(td, new, &new)) != 0) { 870 FILEDESC_XUNLOCK(fdp); 871 fdrop(fp, td); 872 return (error); 873 } 874 } 875 876 KASSERT(fp == fdp->fd_ofiles[old], ("old fd has been modified")); 877 KASSERT(old != new, ("new fd is same as old")); 878 879 delfp = fdp->fd_ofiles[new]; 880 /* 881 * Duplicate the source descriptor. 882 */ 883 fdp->fd_ofiles[new] = fp; 884 fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; 885 if (new > fdp->fd_lastfile) 886 fdp->fd_lastfile = new; 887 *retval = new; 888 889 if (delfp != NULL) { 890 (void) closefp(fdp, new, delfp, td); 891 /* closefp() drops the FILEDESC lock for us. */ 892 } else { 893 FILEDESC_XUNLOCK(fdp); 894 } 895 896 return (0); 897} 898 899/* 900 * If sigio is on the list associated with a process or process group, 901 * disable signalling from the device, remove sigio from the list and 902 * free sigio. 903 */ 904void 905funsetown(struct sigio **sigiop) 906{ 907 struct sigio *sigio; 908 909 SIGIO_LOCK(); 910 sigio = *sigiop; 911 if (sigio == NULL) { 912 SIGIO_UNLOCK(); 913 return; 914 } 915 *(sigio->sio_myref) = NULL; 916 if ((sigio)->sio_pgid < 0) { 917 struct pgrp *pg = (sigio)->sio_pgrp; 918 PGRP_LOCK(pg); 919 SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, 920 sigio, sio_pgsigio); 921 PGRP_UNLOCK(pg); 922 } else { 923 struct proc *p = (sigio)->sio_proc; 924 PROC_LOCK(p); 925 SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, 926 sigio, sio_pgsigio); 927 PROC_UNLOCK(p); 928 } 929 SIGIO_UNLOCK(); 930 crfree(sigio->sio_ucred); 931 free(sigio, M_SIGIO); 932} 933 934/* 935 * Free a list of sigio structures. 936 * We only need to lock the SIGIO_LOCK because we have made ourselves 937 * inaccessible to callers of fsetown and therefore do not need to lock 938 * the proc or pgrp struct for the list manipulation. 939 */ 940void 941funsetownlst(struct sigiolst *sigiolst) 942{ 943 struct proc *p; 944 struct pgrp *pg; 945 struct sigio *sigio; 946 947 sigio = SLIST_FIRST(sigiolst); 948 if (sigio == NULL) 949 return; 950 p = NULL; 951 pg = NULL; 952 953 /* 954 * Every entry of the list should belong 955 * to a single proc or pgrp. 956 */ 957 if (sigio->sio_pgid < 0) { 958 pg = sigio->sio_pgrp; 959 PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); 960 } else /* if (sigio->sio_pgid > 0) */ { 961 p = sigio->sio_proc; 962 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 963 } 964 965 SIGIO_LOCK(); 966 while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { 967 *(sigio->sio_myref) = NULL; 968 if (pg != NULL) { 969 KASSERT(sigio->sio_pgid < 0, 970 ("Proc sigio in pgrp sigio list")); 971 KASSERT(sigio->sio_pgrp == pg, 972 ("Bogus pgrp in sigio list")); 973 PGRP_LOCK(pg); 974 SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, 975 sio_pgsigio); 976 PGRP_UNLOCK(pg); 977 } else /* if (p != NULL) */ { 978 KASSERT(sigio->sio_pgid > 0, 979 ("Pgrp sigio in proc sigio list")); 980 KASSERT(sigio->sio_proc == p, 981 ("Bogus proc in sigio list")); 982 PROC_LOCK(p); 983 SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, 984 sio_pgsigio); 985 PROC_UNLOCK(p); 986 } 987 SIGIO_UNLOCK(); 988 crfree(sigio->sio_ucred); 989 free(sigio, M_SIGIO); 990 SIGIO_LOCK(); 991 } 992 SIGIO_UNLOCK(); 993} 994 995/* 996 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). 997 * 998 * After permission checking, add a sigio structure to the sigio list for 999 * the process or process group. 1000 */ 1001int 1002fsetown(pid_t pgid, struct sigio **sigiop) 1003{ 1004 struct proc *proc; 1005 struct pgrp *pgrp; 1006 struct sigio *sigio; 1007 int ret; 1008 1009 if (pgid == 0) { 1010 funsetown(sigiop); 1011 return (0); 1012 } 1013 1014 ret = 0; 1015 1016 /* Allocate and fill in the new sigio out of locks. */ 1017 sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); 1018 sigio->sio_pgid = pgid; 1019 sigio->sio_ucred = crhold(curthread->td_ucred); 1020 sigio->sio_myref = sigiop; 1021 1022 sx_slock(&proctree_lock); 1023 if (pgid > 0) { 1024 proc = pfind(pgid); 1025 if (proc == NULL) { 1026 ret = ESRCH; 1027 goto fail; 1028 } 1029 1030 /* 1031 * Policy - Don't allow a process to FSETOWN a process 1032 * in another session. 1033 * 1034 * Remove this test to allow maximum flexibility or 1035 * restrict FSETOWN to the current process or process 1036 * group for maximum safety. 1037 */ 1038 PROC_UNLOCK(proc); 1039 if (proc->p_session != curthread->td_proc->p_session) { 1040 ret = EPERM; 1041 goto fail; 1042 } 1043 1044 pgrp = NULL; 1045 } else /* if (pgid < 0) */ { 1046 pgrp = pgfind(-pgid); 1047 if (pgrp == NULL) { 1048 ret = ESRCH; 1049 goto fail; 1050 } 1051 PGRP_UNLOCK(pgrp); 1052 1053 /* 1054 * Policy - Don't allow a process to FSETOWN a process 1055 * in another session. 1056 * 1057 * Remove this test to allow maximum flexibility or 1058 * restrict FSETOWN to the current process or process 1059 * group for maximum safety. 1060 */ 1061 if (pgrp->pg_session != curthread->td_proc->p_session) { 1062 ret = EPERM; 1063 goto fail; 1064 } 1065 1066 proc = NULL; 1067 } 1068 funsetown(sigiop); 1069 if (pgid > 0) { 1070 PROC_LOCK(proc); 1071 /* 1072 * Since funsetownlst() is called without the proctree 1073 * locked, we need to check for P_WEXIT. 1074 * XXX: is ESRCH correct? 1075 */ 1076 if ((proc->p_flag & P_WEXIT) != 0) { 1077 PROC_UNLOCK(proc); 1078 ret = ESRCH; 1079 goto fail; 1080 } 1081 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); 1082 sigio->sio_proc = proc; 1083 PROC_UNLOCK(proc); 1084 } else { 1085 PGRP_LOCK(pgrp); 1086 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); 1087 sigio->sio_pgrp = pgrp; 1088 PGRP_UNLOCK(pgrp); 1089 } 1090 sx_sunlock(&proctree_lock); 1091 SIGIO_LOCK(); 1092 *sigiop = sigio; 1093 SIGIO_UNLOCK(); 1094 return (0); 1095 1096fail: 1097 sx_sunlock(&proctree_lock); 1098 crfree(sigio->sio_ucred); 1099 free(sigio, M_SIGIO); 1100 return (ret); 1101} 1102 1103/* 1104 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). 1105 */ 1106pid_t 1107fgetown(sigiop) 1108 struct sigio **sigiop; 1109{ 1110 pid_t pgid; 1111 1112 SIGIO_LOCK(); 1113 pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; 1114 SIGIO_UNLOCK(); 1115 return (pgid); 1116} 1117 1118/* 1119 * Function drops the filedesc lock on return. 1120 */ 1121static int 1122closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td) 1123{ 1124 struct file *fp_object; 1125 int error, holdleaders; 1126 1127 FILEDESC_XLOCK_ASSERT(fdp); 1128 1129 if (td->td_proc->p_fdtol != NULL) { 1130 /* 1131 * Ask fdfree() to sleep to ensure that all relevant 1132 * process leaders can be traversed in closef(). 1133 */ 1134 fdp->fd_holdleaderscount++; 1135 holdleaders = 1; 1136 } else { 1137 holdleaders = 0; 1138 } 1139 1140 /* 1141 * We now hold the fp reference that used to be owned by the 1142 * descriptor array. We have to unlock the FILEDESC *AFTER* 1143 * knote_fdclose to prevent a race of the fd getting opened, a knote 1144 * added, and deleteing a knote for the new fd. 1145 */ 1146 knote_fdclose(td, fd); 1147 1148 /* 1149 * When we're closing an fd with a capability, we need to notify 1150 * mqueue if the underlying object is of type mqueue. 1151 */ 1152 (void)cap_funwrap(fp, 0, &fp_object); 1153 if (fp_object->f_type == DTYPE_MQUEUE) 1154 mq_fdclose(td, fd, fp_object); 1155 FILEDESC_XUNLOCK(fdp); 1156 1157 error = closef(fp, td); 1158 if (holdleaders) { 1159 FILEDESC_XLOCK(fdp); 1160 fdp->fd_holdleaderscount--; 1161 if (fdp->fd_holdleaderscount == 0 && 1162 fdp->fd_holdleaderswakeup != 0) { 1163 fdp->fd_holdleaderswakeup = 0; 1164 wakeup(&fdp->fd_holdleaderscount); 1165 } 1166 FILEDESC_XUNLOCK(fdp); 1167 } 1168 return (error); 1169} 1170 1171/* 1172 * Close a file descriptor. 1173 */ 1174#ifndef _SYS_SYSPROTO_H_ 1175struct close_args { 1176 int fd; 1177}; 1178#endif 1179/* ARGSUSED */ 1180int 1181sys_close(td, uap) 1182 struct thread *td; 1183 struct close_args *uap; 1184{ 1185 1186 return (kern_close(td, uap->fd)); 1187} 1188 1189int 1190kern_close(td, fd) 1191 struct thread *td; 1192 int fd; 1193{ 1194 struct filedesc *fdp; 1195 struct file *fp; 1196 1197 fdp = td->td_proc->p_fd; 1198 1199 AUDIT_SYSCLOSE(td, fd); 1200 1201 FILEDESC_XLOCK(fdp); 1202 if ((unsigned)fd >= fdp->fd_nfiles || 1203 (fp = fdp->fd_ofiles[fd]) == NULL) { 1204 FILEDESC_XUNLOCK(fdp); 1205 return (EBADF); 1206 } 1207 fdp->fd_ofiles[fd] = NULL; 1208 fdp->fd_ofileflags[fd] = 0; 1209 fdunused(fdp, fd); 1210 1211 /* closefp() drops the FILEDESC lock for us. */ 1212 return (closefp(fdp, fd, fp, td)); 1213} 1214 1215/* 1216 * Close open file descriptors. 1217 */ 1218#ifndef _SYS_SYSPROTO_H_ 1219struct closefrom_args { 1220 int lowfd; 1221}; 1222#endif 1223/* ARGSUSED */ 1224int 1225sys_closefrom(struct thread *td, struct closefrom_args *uap) 1226{ 1227 struct filedesc *fdp; 1228 int fd; 1229 1230 fdp = td->td_proc->p_fd; 1231 AUDIT_ARG_FD(uap->lowfd); 1232 1233 /* 1234 * Treat negative starting file descriptor values identical to 1235 * closefrom(0) which closes all files. 1236 */ 1237 if (uap->lowfd < 0) 1238 uap->lowfd = 0; 1239 FILEDESC_SLOCK(fdp); 1240 for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) { 1241 if (fdp->fd_ofiles[fd] != NULL) { 1242 FILEDESC_SUNLOCK(fdp); 1243 (void)kern_close(td, fd); 1244 FILEDESC_SLOCK(fdp); 1245 } 1246 } 1247 FILEDESC_SUNLOCK(fdp); 1248 return (0); 1249} 1250 1251#if defined(COMPAT_43) 1252/* 1253 * Return status information about a file descriptor. 1254 */ 1255#ifndef _SYS_SYSPROTO_H_ 1256struct ofstat_args { 1257 int fd; 1258 struct ostat *sb; 1259}; 1260#endif 1261/* ARGSUSED */ 1262int 1263ofstat(struct thread *td, struct ofstat_args *uap) 1264{ 1265 struct ostat oub; 1266 struct stat ub; 1267 int error; 1268 1269 error = kern_fstat(td, uap->fd, &ub); 1270 if (error == 0) { 1271 cvtstat(&ub, &oub); 1272 error = copyout(&oub, uap->sb, sizeof(oub)); 1273 } 1274 return (error); 1275} 1276#endif /* COMPAT_43 */ 1277 1278/* 1279 * Return status information about a file descriptor. 1280 */ 1281#ifndef _SYS_SYSPROTO_H_ 1282struct fstat_args { 1283 int fd; 1284 struct stat *sb; 1285}; 1286#endif 1287/* ARGSUSED */ 1288int 1289sys_fstat(struct thread *td, struct fstat_args *uap) 1290{ 1291 struct stat ub; 1292 int error; 1293 1294 error = kern_fstat(td, uap->fd, &ub); 1295 if (error == 0) 1296 error = copyout(&ub, uap->sb, sizeof(ub)); 1297 return (error); 1298} 1299 1300int 1301kern_fstat(struct thread *td, int fd, struct stat *sbp) 1302{ 1303 struct file *fp; 1304 int error; 1305 1306 AUDIT_ARG_FD(fd); 1307 1308 if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0) 1309 return (error); 1310 1311 AUDIT_ARG_FILE(td->td_proc, fp); 1312 1313 error = fo_stat(fp, sbp, td->td_ucred, td); 1314 fdrop(fp, td); 1315#ifdef KTRACE 1316 if (error == 0 && KTRPOINT(td, KTR_STRUCT)) 1317 ktrstat(sbp); 1318#endif 1319 return (error); 1320} 1321 1322/* 1323 * Return status information about a file descriptor. 1324 */ 1325#ifndef _SYS_SYSPROTO_H_ 1326struct nfstat_args { 1327 int fd; 1328 struct nstat *sb; 1329}; 1330#endif 1331/* ARGSUSED */ 1332int 1333sys_nfstat(struct thread *td, struct nfstat_args *uap) 1334{ 1335 struct nstat nub; 1336 struct stat ub; 1337 int error; 1338 1339 error = kern_fstat(td, uap->fd, &ub); 1340 if (error == 0) { 1341 cvtnstat(&ub, &nub); 1342 error = copyout(&nub, uap->sb, sizeof(nub)); 1343 } 1344 return (error); 1345} 1346 1347/* 1348 * Return pathconf information about a file descriptor. 1349 */ 1350#ifndef _SYS_SYSPROTO_H_ 1351struct fpathconf_args { 1352 int fd; 1353 int name; 1354}; 1355#endif 1356/* ARGSUSED */ 1357int 1358sys_fpathconf(struct thread *td, struct fpathconf_args *uap) 1359{ 1360 struct file *fp; 1361 struct vnode *vp; 1362 int error; 1363 1364 if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0) 1365 return (error); 1366 1367 /* If asynchronous I/O is available, it works for all descriptors. */ 1368 if (uap->name == _PC_ASYNC_IO) { 1369 td->td_retval[0] = async_io_version; 1370 goto out; 1371 } 1372 vp = fp->f_vnode; 1373 if (vp != NULL) { 1374 int vfslocked; 1375 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 1376 vn_lock(vp, LK_SHARED | LK_RETRY); 1377 error = VOP_PATHCONF(vp, uap->name, td->td_retval); 1378 VOP_UNLOCK(vp, 0); 1379 VFS_UNLOCK_GIANT(vfslocked); 1380 } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { 1381 if (uap->name != _PC_PIPE_BUF) { 1382 error = EINVAL; 1383 } else { 1384 td->td_retval[0] = PIPE_BUF; 1385 error = 0; 1386 } 1387 } else { 1388 error = EOPNOTSUPP; 1389 } 1390out: 1391 fdrop(fp, td); 1392 return (error); 1393} 1394 1395/* 1396 * Grow the file table to accomodate (at least) nfd descriptors. 1397 */ 1398static void 1399fdgrowtable(struct filedesc *fdp, int nfd) 1400{ 1401 struct filedesc0 *fdp0; 1402 struct freetable *fo; 1403 struct file **ntable; 1404 struct file **otable; 1405 char *nfileflags; 1406 int nnfiles, onfiles; 1407 NDSLOTTYPE *nmap; 1408 1409 FILEDESC_XLOCK_ASSERT(fdp); 1410 1411 KASSERT(fdp->fd_nfiles > 0, 1412 ("zero-length file table")); 1413 1414 /* compute the size of the new table */ 1415 onfiles = fdp->fd_nfiles; 1416 nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ 1417 if (nnfiles <= onfiles) 1418 /* the table is already large enough */ 1419 return; 1420 1421 /* allocate a new table and (if required) new bitmaps */ 1422 ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable), 1423 M_FILEDESC, M_ZERO | M_WAITOK); 1424 nfileflags = (char *)&ntable[nnfiles]; 1425 if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) 1426 nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, 1427 M_FILEDESC, M_ZERO | M_WAITOK); 1428 else 1429 nmap = NULL; 1430 1431 bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); 1432 bcopy(fdp->fd_ofileflags, nfileflags, onfiles); 1433 otable = fdp->fd_ofiles; 1434 fdp->fd_ofileflags = nfileflags; 1435 fdp->fd_ofiles = ntable; 1436 /* 1437 * We must preserve ofiles until the process exits because we can't 1438 * be certain that no threads have references to the old table via 1439 * _fget(). 1440 */ 1441 if (onfiles > NDFILE) { 1442 fo = (struct freetable *)&otable[onfiles]; 1443 fdp0 = (struct filedesc0 *)fdp; 1444 fo->ft_table = otable; 1445 SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next); 1446 } 1447 if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { 1448 bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); 1449 if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) 1450 free(fdp->fd_map, M_FILEDESC); 1451 fdp->fd_map = nmap; 1452 } 1453 fdp->fd_nfiles = nnfiles; 1454} 1455 1456/* 1457 * Allocate a file descriptor for the process. 1458 */ 1459int 1460fdalloc(struct thread *td, int minfd, int *result) 1461{ 1462 struct proc *p = td->td_proc; 1463 struct filedesc *fdp = p->p_fd; 1464 int fd = -1, maxfd; 1465#ifdef RACCT 1466 int error; 1467#endif 1468 1469 FILEDESC_XLOCK_ASSERT(fdp); 1470 1471 if (fdp->fd_freefile > minfd) 1472 minfd = fdp->fd_freefile; 1473 1474 PROC_LOCK(p); 1475 maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 1476 PROC_UNLOCK(p); 1477 1478 /* 1479 * Search the bitmap for a free descriptor. If none is found, try 1480 * to grow the file table. Keep at it until we either get a file 1481 * descriptor or run into process or system limits. 1482 */ 1483 for (;;) { 1484 fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); 1485 if (fd >= maxfd) 1486 return (EMFILE); 1487 if (fd < fdp->fd_nfiles) 1488 break; 1489#ifdef RACCT 1490 PROC_LOCK(p); 1491 error = racct_set(p, RACCT_NOFILE, 1492 min(fdp->fd_nfiles * 2, maxfd)); 1493 PROC_UNLOCK(p); 1494 if (error != 0) 1495 return (EMFILE); 1496#endif 1497 fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); 1498 } 1499 1500 /* 1501 * Perform some sanity checks, then mark the file descriptor as 1502 * used and return it to the caller. 1503 */ 1504 KASSERT((unsigned int)fd < min(maxfd, fdp->fd_nfiles), 1505 ("invalid descriptor %d", fd)); 1506 KASSERT(!fdisused(fdp, fd), 1507 ("fd_first_free() returned non-free descriptor")); 1508 KASSERT(fdp->fd_ofiles[fd] == NULL, ("file descriptor isn't free")); 1509 KASSERT(fdp->fd_ofileflags[fd] == 0, ("file flags are set")); 1510 fdused(fdp, fd); 1511 *result = fd; 1512 return (0); 1513} 1514 1515/* 1516 * Check to see whether n user file descriptors are available to the process 1517 * p. 1518 */ 1519int 1520fdavail(struct thread *td, int n) 1521{ 1522 struct proc *p = td->td_proc; 1523 struct filedesc *fdp = td->td_proc->p_fd; 1524 int i, lim, last; 1525 1526 FILEDESC_LOCK_ASSERT(fdp); 1527 1528 /* 1529 * XXX: This is only called from uipc_usrreq.c:unp_externalize(); 1530 * call racct_add() from there instead of dealing with containers 1531 * here. 1532 */ 1533 PROC_LOCK(p); 1534 lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); 1535 PROC_UNLOCK(p); 1536 if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) 1537 return (1); 1538 last = min(fdp->fd_nfiles, lim); 1539 for (i = fdp->fd_freefile; i < last; i++) { 1540 if (fdp->fd_ofiles[i] == NULL && --n <= 0) 1541 return (1); 1542 } 1543 return (0); 1544} 1545 1546/* 1547 * Create a new open file structure and allocate a file decriptor for the 1548 * process that refers to it. We add one reference to the file for the 1549 * descriptor table and one reference for resultfp. This is to prevent us 1550 * being preempted and the entry in the descriptor table closed after we 1551 * release the FILEDESC lock. 1552 */ 1553int 1554falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags) 1555{ 1556 struct file *fp; 1557 int error, fd; 1558 1559 error = falloc_noinstall(td, &fp); 1560 if (error) 1561 return (error); /* no reference held on error */ 1562 1563 error = finstall(td, fp, &fd, flags); 1564 if (error) { 1565 fdrop(fp, td); /* one reference (fp only) */ 1566 return (error); 1567 } 1568 1569 if (resultfp != NULL) 1570 *resultfp = fp; /* copy out result */ 1571 else 1572 fdrop(fp, td); /* release local reference */ 1573 1574 if (resultfd != NULL) 1575 *resultfd = fd; 1576 1577 return (0); 1578} 1579 1580/* 1581 * Create a new open file structure without allocating a file descriptor. 1582 */ 1583int 1584falloc_noinstall(struct thread *td, struct file **resultfp) 1585{ 1586 struct file *fp; 1587 int maxuserfiles = maxfiles - (maxfiles / 20); 1588 static struct timeval lastfail; 1589 static int curfail; 1590 1591 KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); 1592 1593 if ((openfiles >= maxuserfiles && 1594 priv_check(td, PRIV_MAXFILES) != 0) || 1595 openfiles >= maxfiles) { 1596 if (ppsratecheck(&lastfail, &curfail, 1)) { 1597 printf("kern.maxfiles limit exceeded by uid %i, " 1598 "please see tuning(7).\n", td->td_ucred->cr_ruid); 1599 } 1600 return (ENFILE); 1601 } 1602 atomic_add_int(&openfiles, 1); 1603 fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); 1604 refcount_init(&fp->f_count, 1); 1605 fp->f_cred = crhold(td->td_ucred); 1606 fp->f_ops = &badfileops; 1607 fp->f_data = NULL; 1608 fp->f_vnode = NULL; 1609 *resultfp = fp; 1610 return (0); 1611} 1612 1613/* 1614 * Install a file in a file descriptor table. 1615 */ 1616int 1617finstall(struct thread *td, struct file *fp, int *fd, int flags) 1618{ 1619 struct filedesc *fdp = td->td_proc->p_fd; 1620 int error; 1621 1622 KASSERT(fd != NULL, ("%s: fd == NULL", __func__)); 1623 KASSERT(fp != NULL, ("%s: fp == NULL", __func__)); 1624 1625 FILEDESC_XLOCK(fdp); 1626 if ((error = fdalloc(td, 0, fd))) { 1627 FILEDESC_XUNLOCK(fdp); 1628 return (error); 1629 } 1630 fhold(fp); 1631 fdp->fd_ofiles[*fd] = fp; 1632 if ((flags & O_CLOEXEC) != 0) 1633 fdp->fd_ofileflags[*fd] |= UF_EXCLOSE; 1634 FILEDESC_XUNLOCK(fdp); 1635 return (0); 1636} 1637 1638/* 1639 * Build a new filedesc structure from another. 1640 * Copy the current, root, and jail root vnode references. 1641 */ 1642struct filedesc * 1643fdinit(struct filedesc *fdp) 1644{ 1645 struct filedesc0 *newfdp; 1646 1647 newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); 1648 FILEDESC_LOCK_INIT(&newfdp->fd_fd); 1649 if (fdp != NULL) { 1650 FILEDESC_XLOCK(fdp); 1651 newfdp->fd_fd.fd_cdir = fdp->fd_cdir; 1652 if (newfdp->fd_fd.fd_cdir) 1653 VREF(newfdp->fd_fd.fd_cdir); 1654 newfdp->fd_fd.fd_rdir = fdp->fd_rdir; 1655 if (newfdp->fd_fd.fd_rdir) 1656 VREF(newfdp->fd_fd.fd_rdir); 1657 newfdp->fd_fd.fd_jdir = fdp->fd_jdir; 1658 if (newfdp->fd_fd.fd_jdir) 1659 VREF(newfdp->fd_fd.fd_jdir); 1660 FILEDESC_XUNLOCK(fdp); 1661 } 1662 1663 /* Create the file descriptor table. */ 1664 newfdp->fd_fd.fd_refcnt = 1; 1665 newfdp->fd_fd.fd_holdcnt = 1; 1666 newfdp->fd_fd.fd_cmask = CMASK; 1667 newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; 1668 newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; 1669 newfdp->fd_fd.fd_nfiles = NDFILE; 1670 newfdp->fd_fd.fd_map = newfdp->fd_dmap; 1671 newfdp->fd_fd.fd_lastfile = -1; 1672 return (&newfdp->fd_fd); 1673} 1674 1675static struct filedesc * 1676fdhold(struct proc *p) 1677{ 1678 struct filedesc *fdp; 1679 1680 mtx_lock(&fdesc_mtx); 1681 fdp = p->p_fd; 1682 if (fdp != NULL) 1683 fdp->fd_holdcnt++; 1684 mtx_unlock(&fdesc_mtx); 1685 return (fdp); 1686} 1687 1688static void 1689fddrop(struct filedesc *fdp) 1690{ 1691 struct filedesc0 *fdp0; 1692 struct freetable *ft; 1693 int i; 1694 1695 mtx_lock(&fdesc_mtx); 1696 i = --fdp->fd_holdcnt; 1697 mtx_unlock(&fdesc_mtx); 1698 if (i > 0) 1699 return; 1700 1701 FILEDESC_LOCK_DESTROY(fdp); 1702 fdp0 = (struct filedesc0 *)fdp; 1703 while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) { 1704 SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next); 1705 free(ft->ft_table, M_FILEDESC); 1706 } 1707 free(fdp, M_FILEDESC); 1708} 1709 1710/* 1711 * Share a filedesc structure. 1712 */ 1713struct filedesc * 1714fdshare(struct filedesc *fdp) 1715{ 1716 1717 FILEDESC_XLOCK(fdp); 1718 fdp->fd_refcnt++; 1719 FILEDESC_XUNLOCK(fdp); 1720 return (fdp); 1721} 1722 1723/* 1724 * Unshare a filedesc structure, if necessary by making a copy 1725 */ 1726void 1727fdunshare(struct proc *p, struct thread *td) 1728{ 1729 1730 FILEDESC_XLOCK(p->p_fd); 1731 if (p->p_fd->fd_refcnt > 1) { 1732 struct filedesc *tmp; 1733 1734 FILEDESC_XUNLOCK(p->p_fd); 1735 tmp = fdcopy(p->p_fd); 1736 fdfree(td); 1737 p->p_fd = tmp; 1738 } else 1739 FILEDESC_XUNLOCK(p->p_fd); 1740} 1741 1742/* 1743 * Copy a filedesc structure. A NULL pointer in returns a NULL reference, 1744 * this is to ease callers, not catch errors. 1745 */ 1746struct filedesc * 1747fdcopy(struct filedesc *fdp) 1748{ 1749 struct filedesc *newfdp; 1750 int i; 1751 1752 /* Certain daemons might not have file descriptors. */ 1753 if (fdp == NULL) 1754 return (NULL); 1755 1756 newfdp = fdinit(fdp); 1757 FILEDESC_SLOCK(fdp); 1758 while (fdp->fd_lastfile >= newfdp->fd_nfiles) { 1759 FILEDESC_SUNLOCK(fdp); 1760 FILEDESC_XLOCK(newfdp); 1761 fdgrowtable(newfdp, fdp->fd_lastfile + 1); 1762 FILEDESC_XUNLOCK(newfdp); 1763 FILEDESC_SLOCK(fdp); 1764 } 1765 /* copy all passable descriptors (i.e. not kqueue) */ 1766 newfdp->fd_freefile = -1; 1767 for (i = 0; i <= fdp->fd_lastfile; ++i) { 1768 if (fdisused(fdp, i) && 1769 (fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) && 1770 fdp->fd_ofiles[i]->f_ops != &badfileops) { 1771 newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; 1772 newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; 1773 fhold(newfdp->fd_ofiles[i]); 1774 newfdp->fd_lastfile = i; 1775 } else { 1776 if (newfdp->fd_freefile == -1) 1777 newfdp->fd_freefile = i; 1778 } 1779 } 1780 newfdp->fd_cmask = fdp->fd_cmask; 1781 FILEDESC_SUNLOCK(fdp); 1782 FILEDESC_XLOCK(newfdp); 1783 for (i = 0; i <= newfdp->fd_lastfile; ++i) 1784 if (newfdp->fd_ofiles[i] != NULL) 1785 fdused(newfdp, i); 1786 if (newfdp->fd_freefile == -1) 1787 newfdp->fd_freefile = i; 1788 FILEDESC_XUNLOCK(newfdp); 1789 return (newfdp); 1790} 1791 1792/* 1793 * Release a filedesc structure. 1794 */ 1795void 1796fdfree(struct thread *td) 1797{ 1798 struct filedesc *fdp; 1799 int i, locked; 1800 struct filedesc_to_leader *fdtol; 1801 struct file *fp; 1802 struct vnode *cdir, *jdir, *rdir, *vp; 1803 struct flock lf; 1804 1805 /* Certain daemons might not have file descriptors. */ 1806 fdp = td->td_proc->p_fd; 1807 if (fdp == NULL) 1808 return; 1809 1810#ifdef RACCT 1811 PROC_LOCK(td->td_proc); 1812 racct_set(td->td_proc, RACCT_NOFILE, 0); 1813 PROC_UNLOCK(td->td_proc); 1814#endif 1815 1816 /* Check for special need to clear POSIX style locks */ 1817 fdtol = td->td_proc->p_fdtol; 1818 if (fdtol != NULL) { 1819 FILEDESC_XLOCK(fdp); 1820 KASSERT(fdtol->fdl_refcount > 0, 1821 ("filedesc_to_refcount botch: fdl_refcount=%d", 1822 fdtol->fdl_refcount)); 1823 if (fdtol->fdl_refcount == 1 && 1824 (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 1825 for (i = 0; i <= fdp->fd_lastfile; i++) { 1826 fp = fdp->fd_ofiles[i]; 1827 if (fp == NULL || fp->f_type != DTYPE_VNODE) 1828 continue; 1829 fhold(fp); 1830 FILEDESC_XUNLOCK(fdp); 1831 lf.l_whence = SEEK_SET; 1832 lf.l_start = 0; 1833 lf.l_len = 0; 1834 lf.l_type = F_UNLCK; 1835 vp = fp->f_vnode; 1836 locked = VFS_LOCK_GIANT(vp->v_mount); 1837 (void) VOP_ADVLOCK(vp, 1838 (caddr_t)td->td_proc-> 1839 p_leader, 1840 F_UNLCK, 1841 &lf, 1842 F_POSIX); 1843 VFS_UNLOCK_GIANT(locked); 1844 FILEDESC_XLOCK(fdp); 1845 fdrop(fp, td); 1846 } 1847 } 1848 retry: 1849 if (fdtol->fdl_refcount == 1) { 1850 if (fdp->fd_holdleaderscount > 0 && 1851 (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 1852 /* 1853 * close() or do_dup() has cleared a reference 1854 * in a shared file descriptor table. 1855 */ 1856 fdp->fd_holdleaderswakeup = 1; 1857 sx_sleep(&fdp->fd_holdleaderscount, 1858 FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); 1859 goto retry; 1860 } 1861 if (fdtol->fdl_holdcount > 0) { 1862 /* 1863 * Ensure that fdtol->fdl_leader remains 1864 * valid in closef(). 1865 */ 1866 fdtol->fdl_wakeup = 1; 1867 sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, 1868 "fdlhold", 0); 1869 goto retry; 1870 } 1871 } 1872 fdtol->fdl_refcount--; 1873 if (fdtol->fdl_refcount == 0 && 1874 fdtol->fdl_holdcount == 0) { 1875 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; 1876 fdtol->fdl_prev->fdl_next = fdtol->fdl_next; 1877 } else 1878 fdtol = NULL; 1879 td->td_proc->p_fdtol = NULL; 1880 FILEDESC_XUNLOCK(fdp); 1881 if (fdtol != NULL) 1882 free(fdtol, M_FILEDESC_TO_LEADER); 1883 } 1884 FILEDESC_XLOCK(fdp); 1885 i = --fdp->fd_refcnt; 1886 FILEDESC_XUNLOCK(fdp); 1887 if (i > 0) 1888 return; 1889 1890 for (i = 0; i <= fdp->fd_lastfile; i++) { 1891 fp = fdp->fd_ofiles[i]; 1892 if (fp != NULL) { 1893 FILEDESC_XLOCK(fdp); 1894 fdp->fd_ofiles[i] = NULL; 1895 FILEDESC_XUNLOCK(fdp); 1896 (void) closef(fp, td); 1897 } 1898 } 1899 FILEDESC_XLOCK(fdp); 1900 1901 /* XXX This should happen earlier. */ 1902 mtx_lock(&fdesc_mtx); 1903 td->td_proc->p_fd = NULL; 1904 mtx_unlock(&fdesc_mtx); 1905 1906 if (fdp->fd_nfiles > NDFILE) 1907 free(fdp->fd_ofiles, M_FILEDESC); 1908 if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) 1909 free(fdp->fd_map, M_FILEDESC); 1910 1911 fdp->fd_nfiles = 0; 1912 1913 cdir = fdp->fd_cdir; 1914 fdp->fd_cdir = NULL; 1915 rdir = fdp->fd_rdir; 1916 fdp->fd_rdir = NULL; 1917 jdir = fdp->fd_jdir; 1918 fdp->fd_jdir = NULL; 1919 FILEDESC_XUNLOCK(fdp); 1920 1921 if (cdir) { 1922 locked = VFS_LOCK_GIANT(cdir->v_mount); 1923 vrele(cdir); 1924 VFS_UNLOCK_GIANT(locked); 1925 } 1926 if (rdir) { 1927 locked = VFS_LOCK_GIANT(rdir->v_mount); 1928 vrele(rdir); 1929 VFS_UNLOCK_GIANT(locked); 1930 } 1931 if (jdir) { 1932 locked = VFS_LOCK_GIANT(jdir->v_mount); 1933 vrele(jdir); 1934 VFS_UNLOCK_GIANT(locked); 1935 } 1936 1937 fddrop(fdp); 1938} 1939 1940/* 1941 * For setugid programs, we don't want to people to use that setugidness 1942 * to generate error messages which write to a file which otherwise would 1943 * otherwise be off-limits to the process. We check for filesystems where 1944 * the vnode can change out from under us after execve (like [lin]procfs). 1945 * 1946 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is 1947 * sufficient. We also don't check for setugidness since we know we are. 1948 */ 1949static int 1950is_unsafe(struct file *fp) 1951{ 1952 if (fp->f_type == DTYPE_VNODE) { 1953 struct vnode *vp = fp->f_vnode; 1954 1955 if ((vp->v_vflag & VV_PROCDEP) != 0) 1956 return (1); 1957 } 1958 return (0); 1959} 1960 1961/* 1962 * Make this setguid thing safe, if at all possible. 1963 */ 1964void 1965setugidsafety(struct thread *td) 1966{ 1967 struct filedesc *fdp; 1968 int i; 1969 1970 /* Certain daemons might not have file descriptors. */ 1971 fdp = td->td_proc->p_fd; 1972 if (fdp == NULL) 1973 return; 1974 1975 /* 1976 * Note: fdp->fd_ofiles may be reallocated out from under us while 1977 * we are blocked in a close. Be careful! 1978 */ 1979 FILEDESC_XLOCK(fdp); 1980 for (i = 0; i <= fdp->fd_lastfile; i++) { 1981 if (i > 2) 1982 break; 1983 if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { 1984 struct file *fp; 1985 1986 knote_fdclose(td, i); 1987 /* 1988 * NULL-out descriptor prior to close to avoid 1989 * a race while close blocks. 1990 */ 1991 fp = fdp->fd_ofiles[i]; 1992 fdp->fd_ofiles[i] = NULL; 1993 fdp->fd_ofileflags[i] = 0; 1994 fdunused(fdp, i); 1995 FILEDESC_XUNLOCK(fdp); 1996 (void) closef(fp, td); 1997 FILEDESC_XLOCK(fdp); 1998 } 1999 } 2000 FILEDESC_XUNLOCK(fdp); 2001} 2002 2003/* 2004 * If a specific file object occupies a specific file descriptor, close the 2005 * file descriptor entry and drop a reference on the file object. This is a 2006 * convenience function to handle a subsequent error in a function that calls 2007 * falloc() that handles the race that another thread might have closed the 2008 * file descriptor out from under the thread creating the file object. 2009 */ 2010void 2011fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) 2012{ 2013 2014 FILEDESC_XLOCK(fdp); 2015 if (fdp->fd_ofiles[idx] == fp) { 2016 fdp->fd_ofiles[idx] = NULL; 2017 fdunused(fdp, idx); 2018 FILEDESC_XUNLOCK(fdp); 2019 fdrop(fp, td); 2020 } else 2021 FILEDESC_XUNLOCK(fdp); 2022} 2023 2024/* 2025 * Close any files on exec? 2026 */ 2027void 2028fdcloseexec(struct thread *td) 2029{ 2030 struct filedesc *fdp; 2031 int i; 2032 2033 /* Certain daemons might not have file descriptors. */ 2034 fdp = td->td_proc->p_fd; 2035 if (fdp == NULL) 2036 return; 2037 2038 FILEDESC_XLOCK(fdp); 2039 2040 /* 2041 * We cannot cache fd_ofiles or fd_ofileflags since operations 2042 * may block and rip them out from under us. 2043 */ 2044 for (i = 0; i <= fdp->fd_lastfile; i++) { 2045 if (fdp->fd_ofiles[i] != NULL && 2046 (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || 2047 (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { 2048 struct file *fp; 2049 2050 knote_fdclose(td, i); 2051 /* 2052 * NULL-out descriptor prior to close to avoid 2053 * a race while close blocks. 2054 */ 2055 fp = fdp->fd_ofiles[i]; 2056 fdp->fd_ofiles[i] = NULL; 2057 fdp->fd_ofileflags[i] = 0; 2058 fdunused(fdp, i); 2059 if (fp->f_type == DTYPE_MQUEUE) 2060 mq_fdclose(td, i, fp); 2061 FILEDESC_XUNLOCK(fdp); 2062 (void) closef(fp, td); 2063 FILEDESC_XLOCK(fdp); 2064 } 2065 } 2066 FILEDESC_XUNLOCK(fdp); 2067} 2068 2069/* 2070 * It is unsafe for set[ug]id processes to be started with file 2071 * descriptors 0..2 closed, as these descriptors are given implicit 2072 * significance in the Standard C library. fdcheckstd() will create a 2073 * descriptor referencing /dev/null for each of stdin, stdout, and 2074 * stderr that is not already open. 2075 */ 2076int 2077fdcheckstd(struct thread *td) 2078{ 2079 struct filedesc *fdp; 2080 register_t retval, save; 2081 int i, error, devnull; 2082 2083 fdp = td->td_proc->p_fd; 2084 if (fdp == NULL) 2085 return (0); 2086 KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); 2087 devnull = -1; 2088 error = 0; 2089 for (i = 0; i < 3; i++) { 2090 if (fdp->fd_ofiles[i] != NULL) 2091 continue; 2092 if (devnull < 0) { 2093 save = td->td_retval[0]; 2094 error = kern_open(td, "/dev/null", UIO_SYSSPACE, 2095 O_RDWR, 0); 2096 devnull = td->td_retval[0]; 2097 td->td_retval[0] = save; 2098 if (error) 2099 break; 2100 KASSERT(devnull == i, ("oof, we didn't get our fd")); 2101 } else { 2102 error = do_dup(td, DUP_FIXED, devnull, i, &retval); 2103 if (error != 0) 2104 break; 2105 } 2106 } 2107 return (error); 2108} 2109 2110/* 2111 * Internal form of close. Decrement reference count on file structure. 2112 * Note: td may be NULL when closing a file that was being passed in a 2113 * message. 2114 * 2115 * XXXRW: Giant is not required for the caller, but often will be held; this 2116 * makes it moderately likely the Giant will be recursed in the VFS case. 2117 */ 2118int 2119closef(struct file *fp, struct thread *td) 2120{ 2121 struct vnode *vp; 2122 struct flock lf; 2123 struct filedesc_to_leader *fdtol; 2124 struct filedesc *fdp; 2125 struct file *fp_object; 2126 2127 /* 2128 * POSIX record locking dictates that any close releases ALL 2129 * locks owned by this process. This is handled by setting 2130 * a flag in the unlock to free ONLY locks obeying POSIX 2131 * semantics, and not to free BSD-style file locks. 2132 * If the descriptor was in a message, POSIX-style locks 2133 * aren't passed with the descriptor, and the thread pointer 2134 * will be NULL. Callers should be careful only to pass a 2135 * NULL thread pointer when there really is no owning 2136 * context that might have locks, or the locks will be 2137 * leaked. 2138 * 2139 * If this is a capability, we do lock processing under the underlying 2140 * node, not the capability itself. 2141 */ 2142 (void)cap_funwrap(fp, 0, &fp_object); 2143 if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) { 2144 int vfslocked; 2145 2146 vp = fp_object->f_vnode; 2147 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 2148 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 2149 lf.l_whence = SEEK_SET; 2150 lf.l_start = 0; 2151 lf.l_len = 0; 2152 lf.l_type = F_UNLCK; 2153 (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, 2154 F_UNLCK, &lf, F_POSIX); 2155 } 2156 fdtol = td->td_proc->p_fdtol; 2157 if (fdtol != NULL) { 2158 /* 2159 * Handle special case where file descriptor table is 2160 * shared between multiple process leaders. 2161 */ 2162 fdp = td->td_proc->p_fd; 2163 FILEDESC_XLOCK(fdp); 2164 for (fdtol = fdtol->fdl_next; 2165 fdtol != td->td_proc->p_fdtol; 2166 fdtol = fdtol->fdl_next) { 2167 if ((fdtol->fdl_leader->p_flag & 2168 P_ADVLOCK) == 0) 2169 continue; 2170 fdtol->fdl_holdcount++; 2171 FILEDESC_XUNLOCK(fdp); 2172 lf.l_whence = SEEK_SET; 2173 lf.l_start = 0; 2174 lf.l_len = 0; 2175 lf.l_type = F_UNLCK; 2176 vp = fp_object->f_vnode; 2177 (void) VOP_ADVLOCK(vp, 2178 (caddr_t)fdtol->fdl_leader, 2179 F_UNLCK, &lf, F_POSIX); 2180 FILEDESC_XLOCK(fdp); 2181 fdtol->fdl_holdcount--; 2182 if (fdtol->fdl_holdcount == 0 && 2183 fdtol->fdl_wakeup != 0) { 2184 fdtol->fdl_wakeup = 0; 2185 wakeup(fdtol); 2186 } 2187 } 2188 FILEDESC_XUNLOCK(fdp); 2189 } 2190 VFS_UNLOCK_GIANT(vfslocked); 2191 } 2192 return (fdrop(fp, td)); 2193} 2194 2195/* 2196 * Initialize the file pointer with the specified properties. 2197 * 2198 * The ops are set with release semantics to be certain that the flags, type, 2199 * and data are visible when ops is. This is to prevent ops methods from being 2200 * called with bad data. 2201 */ 2202void 2203finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) 2204{ 2205 fp->f_data = data; 2206 fp->f_flag = flag; 2207 fp->f_type = type; 2208 atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); 2209} 2210 2211struct file * 2212fget_unlocked(struct filedesc *fdp, int fd) 2213{ 2214 struct file *fp; 2215 u_int count; 2216 2217 if ((unsigned int)fd >= fdp->fd_nfiles) 2218 return (NULL); 2219 /* 2220 * Fetch the descriptor locklessly. We avoid fdrop() races by 2221 * never raising a refcount above 0. To accomplish this we have 2222 * to use a cmpset loop rather than an atomic_add. The descriptor 2223 * must be re-verified once we acquire a reference to be certain 2224 * that the identity is still correct and we did not lose a race 2225 * due to preemption. 2226 */ 2227 for (;;) { 2228 fp = fdp->fd_ofiles[fd]; 2229 if (fp == NULL) 2230 break; 2231 count = fp->f_count; 2232 if (count == 0) 2233 continue; 2234 /* 2235 * Use an acquire barrier to prevent caching of fd_ofiles 2236 * so it is refreshed for verification. 2237 */ 2238 if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1) 2239 continue; 2240 if (fp == fdp->fd_ofiles[fd]) 2241 break; 2242 fdrop(fp, curthread); 2243 } 2244 2245 return (fp); 2246} 2247 2248/* 2249 * Extract the file pointer associated with the specified descriptor for the 2250 * current user process. 2251 * 2252 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is 2253 * returned. 2254 * 2255 * If the FGET_GETCAP flag is set, the capability itself will be returned. 2256 * Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL. 2257 * Otherwise, if the file is a capability, its rights will be checked against 2258 * the capability rights mask, and if successful, the object will be unwrapped. 2259 * 2260 * If an error occured the non-zero error is returned and *fpp is set to 2261 * NULL. Otherwise *fpp is held and set and zero is returned. Caller is 2262 * responsible for fdrop(). 2263 */ 2264#define FGET_GETCAP 0x00000001 2265static __inline int 2266_fget(struct thread *td, int fd, struct file **fpp, int flags, 2267 cap_rights_t needrights, cap_rights_t *haverightsp, u_char *maxprotp, 2268 int fget_flags) 2269{ 2270 struct filedesc *fdp; 2271 struct file *fp; 2272#ifdef CAPABILITIES 2273 struct file *fp_fromcap; 2274 int error; 2275#endif 2276 2277 *fpp = NULL; 2278 if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) 2279 return (EBADF); 2280 if ((fp = fget_unlocked(fdp, fd)) == NULL) 2281 return (EBADF); 2282 if (fp->f_ops == &badfileops) { 2283 fdrop(fp, td); 2284 return (EBADF); 2285 } 2286 2287#ifdef CAPABILITIES 2288 /* 2289 * If this is a capability, what rights does it have? 2290 */ 2291 if (haverightsp != NULL) { 2292 if (fp->f_type == DTYPE_CAPABILITY) 2293 *haverightsp = cap_rights(fp); 2294 else 2295 *haverightsp = CAP_MASK_VALID; 2296 } 2297 2298 /* 2299 * If a capability has been requested, return the capability directly. 2300 * Otherwise, check capability rights, extract the underlying object, 2301 * and check its access flags. 2302 */ 2303 if (fget_flags & FGET_GETCAP) { 2304 if (fp->f_type != DTYPE_CAPABILITY) { 2305 fdrop(fp, td); 2306 return (EINVAL); 2307 } 2308 } else { 2309 if (maxprotp == NULL) 2310 error = cap_funwrap(fp, needrights, &fp_fromcap); 2311 else 2312 error = cap_funwrap_mmap(fp, needrights, maxprotp, 2313 &fp_fromcap); 2314 if (error) { 2315 fdrop(fp, td); 2316 return (error); 2317 } 2318 2319 /* 2320 * If we've unwrapped a file, drop the original capability 2321 * and hold the new descriptor. fp after this point refers to 2322 * the actual (unwrapped) object, not the capability. 2323 */ 2324 if (fp != fp_fromcap) { 2325 fhold(fp_fromcap); 2326 fdrop(fp, td); 2327 fp = fp_fromcap; 2328 } 2329 } 2330#else /* !CAPABILITIES */ 2331 KASSERT(fp->f_type != DTYPE_CAPABILITY, 2332 ("%s: saw capability", __func__)); 2333 if (maxprotp != NULL) 2334 *maxprotp = VM_PROT_ALL; 2335#endif /* CAPABILITIES */ 2336 2337 /* 2338 * FREAD and FWRITE failure return EBADF as per POSIX. 2339 * 2340 * Only one flag, or 0, may be specified. 2341 */ 2342 if ((flags == FREAD && (fp->f_flag & FREAD) == 0) || 2343 (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) { 2344 fdrop(fp, td); 2345 return (EBADF); 2346 } 2347 *fpp = fp; 2348 return (0); 2349} 2350 2351int 2352fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp) 2353{ 2354 2355 return(_fget(td, fd, fpp, 0, rights, NULL, NULL, 0)); 2356} 2357 2358int 2359fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp, 2360 struct file **fpp) 2361{ 2362 2363 return (_fget(td, fd, fpp, 0, rights, NULL, maxprotp, 0)); 2364} 2365 2366int 2367fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp) 2368{ 2369 2370 return(_fget(td, fd, fpp, FREAD, rights, NULL, NULL, 0)); 2371} 2372 2373int 2374fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp) 2375{ 2376 2377 return (_fget(td, fd, fpp, FWRITE, rights, NULL, NULL, 0)); 2378} 2379 2380/* 2381 * Unlike the other fget() calls, which accept and check capability rights 2382 * but never return capabilities, fgetcap() returns the capability but doesn't 2383 * check capability rights. 2384 */ 2385int 2386fgetcap(struct thread *td, int fd, struct file **fpp) 2387{ 2388 2389 return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP)); 2390} 2391 2392 2393/* 2394 * Like fget() but loads the underlying vnode, or returns an error if the 2395 * descriptor does not represent a vnode. Note that pipes use vnodes but 2396 * never have VM objects. The returned vnode will be vref()'d. 2397 * 2398 * XXX: what about the unused flags ? 2399 */ 2400static __inline int 2401_fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights, 2402 cap_rights_t *haverightsp, struct vnode **vpp) 2403{ 2404 struct file *fp; 2405 int error; 2406 2407 *vpp = NULL; 2408 if ((error = _fget(td, fd, &fp, flags, needrights, haverightsp, 2409 NULL, 0)) != 0) 2410 return (error); 2411 if (fp->f_vnode == NULL) { 2412 error = EINVAL; 2413 } else { 2414 *vpp = fp->f_vnode; 2415 vref(*vpp); 2416 } 2417 fdrop(fp, td); 2418 2419 return (error); 2420} 2421 2422int 2423fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp) 2424{ 2425 2426 return (_fgetvp(td, fd, 0, rights, NULL, vpp)); 2427} 2428 2429int 2430fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have, 2431 struct vnode **vpp) 2432{ 2433 return (_fgetvp(td, fd, 0, need, have, vpp)); 2434} 2435 2436int 2437fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp) 2438{ 2439 2440 return (_fgetvp(td, fd, FREAD, rights, NULL, vpp)); 2441} 2442 2443#ifdef notyet 2444int 2445fgetvp_write(struct thread *td, int fd, cap_rights_t rights, 2446 struct vnode **vpp) 2447{ 2448 2449 return (_fgetvp(td, fd, FWRITE, rights, NULL, vpp)); 2450} 2451#endif 2452 2453/* 2454 * Like fget() but loads the underlying socket, or returns an error if the 2455 * descriptor does not represent a socket. 2456 * 2457 * We bump the ref count on the returned socket. XXX Also obtain the SX lock 2458 * in the future. 2459 * 2460 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely 2461 * on their file descriptor reference to prevent the socket from being free'd 2462 * during use. 2463 */ 2464int 2465fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp, 2466 u_int *fflagp) 2467{ 2468 struct file *fp; 2469 int error; 2470 2471 *spp = NULL; 2472 if (fflagp != NULL) 2473 *fflagp = 0; 2474 if ((error = _fget(td, fd, &fp, 0, rights, NULL, NULL, 0)) != 0) 2475 return (error); 2476 if (fp->f_type != DTYPE_SOCKET) { 2477 error = ENOTSOCK; 2478 } else { 2479 *spp = fp->f_data; 2480 if (fflagp) 2481 *fflagp = fp->f_flag; 2482 SOCK_LOCK(*spp); 2483 soref(*spp); 2484 SOCK_UNLOCK(*spp); 2485 } 2486 fdrop(fp, td); 2487 2488 return (error); 2489} 2490 2491/* 2492 * Drop the reference count on the socket and XXX release the SX lock in the 2493 * future. The last reference closes the socket. 2494 * 2495 * Note: fputsock() is deprecated, see comment for fgetsock(). 2496 */ 2497void 2498fputsock(struct socket *so) 2499{ 2500 2501 ACCEPT_LOCK(); 2502 SOCK_LOCK(so); 2503 CURVNET_SET(so->so_vnet); 2504 sorele(so); 2505 CURVNET_RESTORE(); 2506} 2507 2508/* 2509 * Handle the last reference to a file being closed. 2510 * 2511 * No special capability handling here, as the capability's fo_close will run 2512 * instead of the object here, and perform any necessary drop on the object. 2513 */ 2514int 2515_fdrop(struct file *fp, struct thread *td) 2516{ 2517 int error; 2518 2519 error = 0; 2520 if (fp->f_count != 0) 2521 panic("fdrop: count %d", fp->f_count); 2522 if (fp->f_ops != &badfileops) 2523 error = fo_close(fp, td); 2524 atomic_subtract_int(&openfiles, 1); 2525 crfree(fp->f_cred); 2526 free(fp->f_advice, M_FADVISE); 2527 uma_zfree(file_zone, fp); 2528 2529 return (error); 2530} 2531 2532/* 2533 * Apply an advisory lock on a file descriptor. 2534 * 2535 * Just attempt to get a record lock of the requested type on the entire file 2536 * (l_whence = SEEK_SET, l_start = 0, l_len = 0). 2537 */ 2538#ifndef _SYS_SYSPROTO_H_ 2539struct flock_args { 2540 int fd; 2541 int how; 2542}; 2543#endif 2544/* ARGSUSED */ 2545int 2546sys_flock(struct thread *td, struct flock_args *uap) 2547{ 2548 struct file *fp; 2549 struct vnode *vp; 2550 struct flock lf; 2551 int vfslocked; 2552 int error; 2553 2554 if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0) 2555 return (error); 2556 if (fp->f_type != DTYPE_VNODE) { 2557 fdrop(fp, td); 2558 return (EOPNOTSUPP); 2559 } 2560 2561 vp = fp->f_vnode; 2562 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 2563 lf.l_whence = SEEK_SET; 2564 lf.l_start = 0; 2565 lf.l_len = 0; 2566 if (uap->how & LOCK_UN) { 2567 lf.l_type = F_UNLCK; 2568 atomic_clear_int(&fp->f_flag, FHASLOCK); 2569 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 2570 goto done2; 2571 } 2572 if (uap->how & LOCK_EX) 2573 lf.l_type = F_WRLCK; 2574 else if (uap->how & LOCK_SH) 2575 lf.l_type = F_RDLCK; 2576 else { 2577 error = EBADF; 2578 goto done2; 2579 } 2580 atomic_set_int(&fp->f_flag, FHASLOCK); 2581 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 2582 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 2583done2: 2584 fdrop(fp, td); 2585 VFS_UNLOCK_GIANT(vfslocked); 2586 return (error); 2587} 2588/* 2589 * Duplicate the specified descriptor to a free descriptor. 2590 */ 2591int 2592dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error) 2593{ 2594 struct file *wfp; 2595 struct file *fp; 2596 2597 /* 2598 * If the to-be-dup'd fd number is greater than the allowed number 2599 * of file descriptors, or the fd to be dup'd has already been 2600 * closed, then reject. 2601 */ 2602 FILEDESC_XLOCK(fdp); 2603 if ((unsigned int)dfd >= fdp->fd_nfiles || 2604 (wfp = fdp->fd_ofiles[dfd]) == NULL) { 2605 FILEDESC_XUNLOCK(fdp); 2606 return (EBADF); 2607 } 2608 2609 /* 2610 * There are two cases of interest here. 2611 * 2612 * For ENODEV simply dup (dfd) to file descriptor (indx) and return. 2613 * 2614 * For ENXIO steal away the file structure from (dfd) and store it in 2615 * (indx). (dfd) is effectively closed by this operation. 2616 * 2617 * Any other error code is just returned. 2618 */ 2619 switch (error) { 2620 case ENODEV: 2621 /* 2622 * Check that the mode the file is being opened for is a 2623 * subset of the mode of the existing descriptor. 2624 */ 2625 if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { 2626 FILEDESC_XUNLOCK(fdp); 2627 return (EACCES); 2628 } 2629 fp = fdp->fd_ofiles[indx]; 2630 fdp->fd_ofiles[indx] = wfp; 2631 fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; 2632 if (fp == NULL) 2633 fdused(fdp, indx); 2634 fhold(wfp); 2635 FILEDESC_XUNLOCK(fdp); 2636 if (fp != NULL) 2637 /* 2638 * We now own the reference to fp that the ofiles[] 2639 * array used to own. Release it. 2640 */ 2641 fdrop(fp, td); 2642 return (0); 2643 2644 case ENXIO: 2645 /* 2646 * Steal away the file pointer from dfd and stuff it into indx. 2647 */ 2648 fp = fdp->fd_ofiles[indx]; 2649 fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; 2650 fdp->fd_ofiles[dfd] = NULL; 2651 fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; 2652 fdp->fd_ofileflags[dfd] = 0; 2653 fdunused(fdp, dfd); 2654 if (fp == NULL) 2655 fdused(fdp, indx); 2656 FILEDESC_XUNLOCK(fdp); 2657 2658 /* 2659 * We now own the reference to fp that the ofiles[] array 2660 * used to own. Release it. 2661 */ 2662 if (fp != NULL) 2663 fdrop(fp, td); 2664 return (0); 2665 2666 default: 2667 FILEDESC_XUNLOCK(fdp); 2668 return (error); 2669 } 2670 /* NOTREACHED */ 2671} 2672 2673/* 2674 * Scan all active processes and prisons to see if any of them have a current 2675 * or root directory of `olddp'. If so, replace them with the new mount point. 2676 */ 2677void 2678mountcheckdirs(struct vnode *olddp, struct vnode *newdp) 2679{ 2680 struct filedesc *fdp; 2681 struct prison *pr; 2682 struct proc *p; 2683 int nrele; 2684 2685 if (vrefcnt(olddp) == 1) 2686 return; 2687 nrele = 0; 2688 sx_slock(&allproc_lock); 2689 FOREACH_PROC_IN_SYSTEM(p) { 2690 fdp = fdhold(p); 2691 if (fdp == NULL) 2692 continue; 2693 FILEDESC_XLOCK(fdp); 2694 if (fdp->fd_cdir == olddp) { 2695 vref(newdp); 2696 fdp->fd_cdir = newdp; 2697 nrele++; 2698 } 2699 if (fdp->fd_rdir == olddp) { 2700 vref(newdp); 2701 fdp->fd_rdir = newdp; 2702 nrele++; 2703 } 2704 if (fdp->fd_jdir == olddp) { 2705 vref(newdp); 2706 fdp->fd_jdir = newdp; 2707 nrele++; 2708 } 2709 FILEDESC_XUNLOCK(fdp); 2710 fddrop(fdp); 2711 } 2712 sx_sunlock(&allproc_lock); 2713 if (rootvnode == olddp) { 2714 vref(newdp); 2715 rootvnode = newdp; 2716 nrele++; 2717 } 2718 mtx_lock(&prison0.pr_mtx); 2719 if (prison0.pr_root == olddp) { 2720 vref(newdp); 2721 prison0.pr_root = newdp; 2722 nrele++; 2723 } 2724 mtx_unlock(&prison0.pr_mtx); 2725 sx_slock(&allprison_lock); 2726 TAILQ_FOREACH(pr, &allprison, pr_list) { 2727 mtx_lock(&pr->pr_mtx); 2728 if (pr->pr_root == olddp) { 2729 vref(newdp); 2730 pr->pr_root = newdp; 2731 nrele++; 2732 } 2733 mtx_unlock(&pr->pr_mtx); 2734 } 2735 sx_sunlock(&allprison_lock); 2736 while (nrele--) 2737 vrele(olddp); 2738} 2739 2740struct filedesc_to_leader * 2741filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) 2742{ 2743 struct filedesc_to_leader *fdtol; 2744 2745 fdtol = malloc(sizeof(struct filedesc_to_leader), 2746 M_FILEDESC_TO_LEADER, 2747 M_WAITOK); 2748 fdtol->fdl_refcount = 1; 2749 fdtol->fdl_holdcount = 0; 2750 fdtol->fdl_wakeup = 0; 2751 fdtol->fdl_leader = leader; 2752 if (old != NULL) { 2753 FILEDESC_XLOCK(fdp); 2754 fdtol->fdl_next = old->fdl_next; 2755 fdtol->fdl_prev = old; 2756 old->fdl_next = fdtol; 2757 fdtol->fdl_next->fdl_prev = fdtol; 2758 FILEDESC_XUNLOCK(fdp); 2759 } else { 2760 fdtol->fdl_next = fdtol; 2761 fdtol->fdl_prev = fdtol; 2762 } 2763 return (fdtol); 2764} 2765 2766/* 2767 * Get file structures globally. 2768 */ 2769static int 2770sysctl_kern_file(SYSCTL_HANDLER_ARGS) 2771{ 2772 struct xfile xf; 2773 struct filedesc *fdp; 2774 struct file *fp; 2775 struct proc *p; 2776 int error, n; 2777 2778 error = sysctl_wire_old_buffer(req, 0); 2779 if (error != 0) 2780 return (error); 2781 if (req->oldptr == NULL) { 2782 n = 0; 2783 sx_slock(&allproc_lock); 2784 FOREACH_PROC_IN_SYSTEM(p) { 2785 if (p->p_state == PRS_NEW) 2786 continue; 2787 fdp = fdhold(p); 2788 if (fdp == NULL) 2789 continue; 2790 /* overestimates sparse tables. */ 2791 if (fdp->fd_lastfile > 0) 2792 n += fdp->fd_lastfile; 2793 fddrop(fdp); 2794 } 2795 sx_sunlock(&allproc_lock); 2796 return (SYSCTL_OUT(req, 0, n * sizeof(xf))); 2797 } 2798 error = 0; 2799 bzero(&xf, sizeof(xf)); 2800 xf.xf_size = sizeof(xf); 2801 sx_slock(&allproc_lock); 2802 FOREACH_PROC_IN_SYSTEM(p) { 2803 PROC_LOCK(p); 2804 if (p->p_state == PRS_NEW) { 2805 PROC_UNLOCK(p); 2806 continue; 2807 } 2808 if (p_cansee(req->td, p) != 0) { 2809 PROC_UNLOCK(p); 2810 continue; 2811 } 2812 xf.xf_pid = p->p_pid; 2813 xf.xf_uid = p->p_ucred->cr_uid; 2814 PROC_UNLOCK(p); 2815 fdp = fdhold(p); 2816 if (fdp == NULL) 2817 continue; 2818 FILEDESC_SLOCK(fdp); 2819 for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { 2820 if ((fp = fdp->fd_ofiles[n]) == NULL) 2821 continue; 2822 xf.xf_fd = n; 2823 xf.xf_file = fp; 2824 xf.xf_data = fp->f_data; 2825 xf.xf_vnode = fp->f_vnode; 2826 xf.xf_type = fp->f_type; 2827 xf.xf_count = fp->f_count; 2828 xf.xf_msgcount = 0; 2829 xf.xf_offset = fp->f_offset; 2830 xf.xf_flag = fp->f_flag; 2831 error = SYSCTL_OUT(req, &xf, sizeof(xf)); 2832 if (error) 2833 break; 2834 } 2835 FILEDESC_SUNLOCK(fdp); 2836 fddrop(fdp); 2837 if (error) 2838 break; 2839 } 2840 sx_sunlock(&allproc_lock); 2841 return (error); 2842} 2843 2844SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 2845 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); 2846 2847#ifdef KINFO_OFILE_SIZE 2848CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); 2849#endif 2850 2851#ifdef COMPAT_FREEBSD7 2852static int 2853export_vnode_for_osysctl(struct vnode *vp, int type, 2854 struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req) 2855{ 2856 int error; 2857 char *fullpath, *freepath; 2858 int vfslocked; 2859 2860 bzero(kif, sizeof(*kif)); 2861 kif->kf_structsize = sizeof(*kif); 2862 2863 vref(vp); 2864 kif->kf_fd = type; 2865 kif->kf_type = KF_TYPE_VNODE; 2866 /* This function only handles directories. */ 2867 if (vp->v_type != VDIR) { 2868 vrele(vp); 2869 return (ENOTDIR); 2870 } 2871 kif->kf_vnode_type = KF_VTYPE_VDIR; 2872 2873 /* 2874 * This is not a true file descriptor, so we set a bogus refcount 2875 * and offset to indicate these fields should be ignored. 2876 */ 2877 kif->kf_ref_count = -1; 2878 kif->kf_offset = -1; 2879 2880 freepath = NULL; 2881 fullpath = "-"; 2882 FILEDESC_SUNLOCK(fdp); 2883 vn_fullpath(curthread, vp, &fullpath, &freepath); 2884 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 2885 vrele(vp); 2886 VFS_UNLOCK_GIANT(vfslocked); 2887 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); 2888 if (freepath != NULL) 2889 free(freepath, M_TEMP); 2890 error = SYSCTL_OUT(req, kif, sizeof(*kif)); 2891 FILEDESC_SLOCK(fdp); 2892 return (error); 2893} 2894 2895/* 2896 * Get per-process file descriptors for use by procstat(1), et al. 2897 */ 2898static int 2899sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) 2900{ 2901 char *fullpath, *freepath; 2902 struct kinfo_ofile *kif; 2903 struct filedesc *fdp; 2904 int error, i, *name; 2905 struct shmfd *shmfd; 2906 struct socket *so; 2907 struct vnode *vp; 2908 struct file *fp; 2909 struct proc *p; 2910 struct tty *tp; 2911 int vfslocked; 2912 2913 name = (int *)arg1; 2914 if ((p = pfind((pid_t)name[0])) == NULL) 2915 return (ESRCH); 2916 if ((error = p_candebug(curthread, p))) { 2917 PROC_UNLOCK(p); 2918 return (error); 2919 } 2920 fdp = fdhold(p); 2921 PROC_UNLOCK(p); 2922 if (fdp == NULL) 2923 return (ENOENT); 2924 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); 2925 FILEDESC_SLOCK(fdp); 2926 if (fdp->fd_cdir != NULL) 2927 export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, 2928 fdp, req); 2929 if (fdp->fd_rdir != NULL) 2930 export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, 2931 fdp, req); 2932 if (fdp->fd_jdir != NULL) 2933 export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, 2934 fdp, req); 2935 for (i = 0; i < fdp->fd_nfiles; i++) { 2936 if ((fp = fdp->fd_ofiles[i]) == NULL) 2937 continue; 2938 bzero(kif, sizeof(*kif)); 2939 kif->kf_structsize = sizeof(*kif); 2940 vp = NULL; 2941 so = NULL; 2942 tp = NULL; 2943 shmfd = NULL; 2944 kif->kf_fd = i; 2945 2946#ifdef CAPABILITIES 2947 /* 2948 * When reporting a capability, most fields will be from the 2949 * underlying object, but do mark as a capability. With 2950 * ofiledesc, we don't have a field to export the cap_rights_t, 2951 * but we do with the new filedesc. 2952 */ 2953 if (fp->f_type == DTYPE_CAPABILITY) { 2954 kif->kf_flags |= KF_FLAG_CAPABILITY; 2955 (void)cap_funwrap(fp, 0, &fp); 2956 } 2957#else 2958 KASSERT(fp->f_type != DTYPE_CAPABILITY, 2959 ("sysctl_kern_proc_ofiledesc: saw capability")); 2960#endif 2961 switch (fp->f_type) { 2962 case DTYPE_VNODE: 2963 kif->kf_type = KF_TYPE_VNODE; 2964 vp = fp->f_vnode; 2965 break; 2966 2967 case DTYPE_SOCKET: 2968 kif->kf_type = KF_TYPE_SOCKET; 2969 so = fp->f_data; 2970 break; 2971 2972 case DTYPE_PIPE: 2973 kif->kf_type = KF_TYPE_PIPE; 2974 break; 2975 2976 case DTYPE_FIFO: 2977 kif->kf_type = KF_TYPE_FIFO; 2978 vp = fp->f_vnode; 2979 break; 2980 2981 case DTYPE_KQUEUE: 2982 kif->kf_type = KF_TYPE_KQUEUE; 2983 break; 2984 2985 case DTYPE_CRYPTO: 2986 kif->kf_type = KF_TYPE_CRYPTO; 2987 break; 2988 2989 case DTYPE_MQUEUE: 2990 kif->kf_type = KF_TYPE_MQUEUE; 2991 break; 2992 2993 case DTYPE_SHM: 2994 kif->kf_type = KF_TYPE_SHM; 2995 shmfd = fp->f_data; 2996 break; 2997 2998 case DTYPE_SEM: 2999 kif->kf_type = KF_TYPE_SEM; 3000 break; 3001 3002 case DTYPE_PTS: 3003 kif->kf_type = KF_TYPE_PTS; 3004 tp = fp->f_data; 3005 break; 3006 3007#ifdef PROCDESC 3008 case DTYPE_PROCDESC: 3009 kif->kf_type = KF_TYPE_PROCDESC; 3010 break; 3011#endif 3012 3013 default: 3014 kif->kf_type = KF_TYPE_UNKNOWN; 3015 break; 3016 } 3017 kif->kf_ref_count = fp->f_count; 3018 if (fp->f_flag & FREAD) 3019 kif->kf_flags |= KF_FLAG_READ; 3020 if (fp->f_flag & FWRITE) 3021 kif->kf_flags |= KF_FLAG_WRITE; 3022 if (fp->f_flag & FAPPEND) 3023 kif->kf_flags |= KF_FLAG_APPEND; 3024 if (fp->f_flag & FASYNC) 3025 kif->kf_flags |= KF_FLAG_ASYNC; 3026 if (fp->f_flag & FFSYNC) 3027 kif->kf_flags |= KF_FLAG_FSYNC; 3028 if (fp->f_flag & FNONBLOCK) 3029 kif->kf_flags |= KF_FLAG_NONBLOCK; 3030 if (fp->f_flag & O_DIRECT) 3031 kif->kf_flags |= KF_FLAG_DIRECT; 3032 if (fp->f_flag & FHASLOCK) 3033 kif->kf_flags |= KF_FLAG_HASLOCK; 3034 kif->kf_offset = fp->f_offset; 3035 if (vp != NULL) { 3036 vref(vp); 3037 switch (vp->v_type) { 3038 case VNON: 3039 kif->kf_vnode_type = KF_VTYPE_VNON; 3040 break; 3041 case VREG: 3042 kif->kf_vnode_type = KF_VTYPE_VREG; 3043 break; 3044 case VDIR: 3045 kif->kf_vnode_type = KF_VTYPE_VDIR; 3046 break; 3047 case VBLK: 3048 kif->kf_vnode_type = KF_VTYPE_VBLK; 3049 break; 3050 case VCHR: 3051 kif->kf_vnode_type = KF_VTYPE_VCHR; 3052 break; 3053 case VLNK: 3054 kif->kf_vnode_type = KF_VTYPE_VLNK; 3055 break; 3056 case VSOCK: 3057 kif->kf_vnode_type = KF_VTYPE_VSOCK; 3058 break; 3059 case VFIFO: 3060 kif->kf_vnode_type = KF_VTYPE_VFIFO; 3061 break; 3062 case VBAD: 3063 kif->kf_vnode_type = KF_VTYPE_VBAD; 3064 break; 3065 default: 3066 kif->kf_vnode_type = KF_VTYPE_UNKNOWN; 3067 break; 3068 } 3069 /* 3070 * It is OK to drop the filedesc lock here as we will 3071 * re-validate and re-evaluate its properties when 3072 * the loop continues. 3073 */ 3074 freepath = NULL; 3075 fullpath = "-"; 3076 FILEDESC_SUNLOCK(fdp); 3077 vn_fullpath(curthread, vp, &fullpath, &freepath); 3078 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 3079 vrele(vp); 3080 VFS_UNLOCK_GIANT(vfslocked); 3081 strlcpy(kif->kf_path, fullpath, 3082 sizeof(kif->kf_path)); 3083 if (freepath != NULL) 3084 free(freepath, M_TEMP); 3085 FILEDESC_SLOCK(fdp); 3086 } 3087 if (so != NULL) { 3088 struct sockaddr *sa; 3089 3090 if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) 3091 == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { 3092 bcopy(sa, &kif->kf_sa_local, sa->sa_len); 3093 free(sa, M_SONAME); 3094 } 3095 if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) 3096 == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { 3097 bcopy(sa, &kif->kf_sa_peer, sa->sa_len); 3098 free(sa, M_SONAME); 3099 } 3100 kif->kf_sock_domain = 3101 so->so_proto->pr_domain->dom_family; 3102 kif->kf_sock_type = so->so_type; 3103 kif->kf_sock_protocol = so->so_proto->pr_protocol; 3104 } 3105 if (tp != NULL) { 3106 strlcpy(kif->kf_path, tty_devname(tp), 3107 sizeof(kif->kf_path)); 3108 } 3109 if (shmfd != NULL) 3110 shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path)); 3111 error = SYSCTL_OUT(req, kif, sizeof(*kif)); 3112 if (error) 3113 break; 3114 } 3115 FILEDESC_SUNLOCK(fdp); 3116 fddrop(fdp); 3117 free(kif, M_TEMP); 3118 return (0); 3119} 3120 3121static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD, 3122 sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); 3123#endif /* COMPAT_FREEBSD7 */ 3124 3125#ifdef KINFO_FILE_SIZE 3126CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); 3127#endif 3128 3129static int 3130export_fd_for_sysctl(void *data, int type, int fd, int fflags, int refcnt, 3131 int64_t offset, int fd_is_cap, cap_rights_t fd_cap_rights, 3132 struct kinfo_file *kif, struct sysctl_req *req) 3133{ 3134 struct { 3135 int fflag; 3136 int kf_fflag; 3137 } fflags_table[] = { 3138 { FAPPEND, KF_FLAG_APPEND }, 3139 { FASYNC, KF_FLAG_ASYNC }, 3140 { FFSYNC, KF_FLAG_FSYNC }, 3141 { FHASLOCK, KF_FLAG_HASLOCK }, 3142 { FNONBLOCK, KF_FLAG_NONBLOCK }, 3143 { FREAD, KF_FLAG_READ }, 3144 { FWRITE, KF_FLAG_WRITE }, 3145 { O_CREAT, KF_FLAG_CREAT }, 3146 { O_DIRECT, KF_FLAG_DIRECT }, 3147 { O_EXCL, KF_FLAG_EXCL }, 3148 { O_EXEC, KF_FLAG_EXEC }, 3149 { O_EXLOCK, KF_FLAG_EXLOCK }, 3150 { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, 3151 { O_SHLOCK, KF_FLAG_SHLOCK }, 3152 { O_TRUNC, KF_FLAG_TRUNC } 3153 }; 3154#define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table)) 3155 struct vnode *vp; 3156 int error, vfslocked; 3157 unsigned int i; 3158 3159 bzero(kif, sizeof(*kif)); 3160 switch (type) { 3161 case KF_TYPE_FIFO: 3162 case KF_TYPE_VNODE: 3163 vp = (struct vnode *)data; 3164 error = fill_vnode_info(vp, kif); 3165 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 3166 vrele(vp); 3167 VFS_UNLOCK_GIANT(vfslocked); 3168 break; 3169 case KF_TYPE_SOCKET: 3170 error = fill_socket_info((struct socket *)data, kif); 3171 break; 3172 case KF_TYPE_PIPE: 3173 error = fill_pipe_info((struct pipe *)data, kif); 3174 break; 3175 case KF_TYPE_PTS: 3176 error = fill_pts_info((struct tty *)data, kif); 3177 break; 3178 case KF_TYPE_PROCDESC: 3179 error = fill_procdesc_info((struct procdesc *)data, kif); 3180 break; 3181 case KF_TYPE_SHM: 3182 error = fill_shm_info((struct file *)data, kif); 3183 break; 3184 default: 3185 error = 0; 3186 } 3187 if (error == 0) 3188 kif->kf_status |= KF_ATTR_VALID; 3189 3190 /* 3191 * Translate file access flags. 3192 */ 3193 for (i = 0; i < NFFLAGS; i++) 3194 if (fflags & fflags_table[i].fflag) 3195 kif->kf_flags |= fflags_table[i].kf_fflag; 3196 if (fd_is_cap) 3197 kif->kf_flags |= KF_FLAG_CAPABILITY; 3198 if (fd_is_cap) 3199 kif->kf_cap_rights = fd_cap_rights; 3200 kif->kf_fd = fd; 3201 kif->kf_type = type; 3202 kif->kf_ref_count = refcnt; 3203 kif->kf_offset = offset; 3204 /* Pack record size down */ 3205 kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + 3206 strlen(kif->kf_path) + 1; 3207 kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); 3208 error = SYSCTL_OUT(req, kif, kif->kf_structsize); 3209 return (error); 3210} 3211 3212/* 3213 * Get per-process file descriptors for use by procstat(1), et al. 3214 */ 3215static int 3216sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) 3217{ 3218 struct file *fp; 3219 struct filedesc *fdp; 3220 struct kinfo_file *kif; 3221 struct proc *p; 3222 struct vnode *cttyvp, *textvp, *tracevp; 3223 size_t oldidx; 3224 int64_t offset; 3225 void *data; 3226 int error, i, *name; 3227 int fd_is_cap, type, refcnt, fflags; 3228 cap_rights_t fd_cap_rights; 3229 3230 name = (int *)arg1; 3231 if ((p = pfind((pid_t)name[0])) == NULL) 3232 return (ESRCH); 3233 if ((error = p_candebug(curthread, p))) { 3234 PROC_UNLOCK(p); 3235 return (error); 3236 } 3237 /* ktrace vnode */ 3238 tracevp = p->p_tracevp; 3239 if (tracevp != NULL) 3240 vref(tracevp); 3241 /* text vnode */ 3242 textvp = p->p_textvp; 3243 if (textvp != NULL) 3244 vref(textvp); 3245 /* Controlling tty. */ 3246 cttyvp = NULL; 3247 if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { 3248 cttyvp = p->p_pgrp->pg_session->s_ttyvp; 3249 if (cttyvp != NULL) 3250 vref(cttyvp); 3251 } 3252 fdp = fdhold(p); 3253 PROC_UNLOCK(p); 3254 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); 3255 if (tracevp != NULL) 3256 export_fd_for_sysctl(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE, 3257 FREAD | FWRITE, -1, -1, 0, 0, kif, req); 3258 if (textvp != NULL) 3259 export_fd_for_sysctl(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT, 3260 FREAD, -1, -1, 0, 0, kif, req); 3261 if (cttyvp != NULL) 3262 export_fd_for_sysctl(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY, 3263 FREAD | FWRITE, -1, -1, 0, 0, kif, req); 3264 if (fdp == NULL) 3265 goto fail; 3266 FILEDESC_SLOCK(fdp); 3267 /* working directory */ 3268 if (fdp->fd_cdir != NULL) { 3269 vref(fdp->fd_cdir); 3270 data = fdp->fd_cdir; 3271 FILEDESC_SUNLOCK(fdp); 3272 export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD, 3273 FREAD, -1, -1, 0, 0, kif, req); 3274 FILEDESC_SLOCK(fdp); 3275 } 3276 /* root directory */ 3277 if (fdp->fd_rdir != NULL) { 3278 vref(fdp->fd_rdir); 3279 data = fdp->fd_rdir; 3280 FILEDESC_SUNLOCK(fdp); 3281 export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT, 3282 FREAD, -1, -1, 0, 0, kif, req); 3283 FILEDESC_SLOCK(fdp); 3284 } 3285 /* jail directory */ 3286 if (fdp->fd_jdir != NULL) { 3287 vref(fdp->fd_jdir); 3288 data = fdp->fd_jdir; 3289 FILEDESC_SUNLOCK(fdp); 3290 export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL, 3291 FREAD, -1, -1, 0, 0, kif, req); 3292 FILEDESC_SLOCK(fdp); 3293 } 3294 for (i = 0; i < fdp->fd_nfiles; i++) { 3295 if ((fp = fdp->fd_ofiles[i]) == NULL) 3296 continue; 3297 data = NULL; 3298 fd_is_cap = 0; 3299 fd_cap_rights = 0; 3300 3301#ifdef CAPABILITIES 3302 /* 3303 * When reporting a capability, most fields will be from the 3304 * underlying object, but do mark as a capability and export 3305 * the capability rights mask. 3306 */ 3307 if (fp->f_type == DTYPE_CAPABILITY) { 3308 fd_is_cap = 1; 3309 fd_cap_rights = cap_rights(fp); 3310 (void)cap_funwrap(fp, 0, &fp); 3311 } 3312#else /* !CAPABILITIES */ 3313 KASSERT(fp->f_type != DTYPE_CAPABILITY, 3314 ("sysctl_kern_proc_filedesc: saw capability")); 3315#endif 3316 switch (fp->f_type) { 3317 case DTYPE_VNODE: 3318 type = KF_TYPE_VNODE; 3319 vref(fp->f_vnode); 3320 data = fp->f_vnode; 3321 break; 3322 3323 case DTYPE_SOCKET: 3324 type = KF_TYPE_SOCKET; 3325 data = fp->f_data; 3326 break; 3327 3328 case DTYPE_PIPE: 3329 type = KF_TYPE_PIPE; 3330 data = fp->f_data; 3331 break; 3332 3333 case DTYPE_FIFO: 3334 type = KF_TYPE_FIFO; 3335 vref(fp->f_vnode); 3336 data = fp->f_vnode; 3337 break; 3338 3339 case DTYPE_KQUEUE: 3340 type = KF_TYPE_KQUEUE; 3341 break; 3342 3343 case DTYPE_CRYPTO: 3344 type = KF_TYPE_CRYPTO; 3345 break; 3346 3347 case DTYPE_MQUEUE: 3348 type = KF_TYPE_MQUEUE; 3349 break; 3350 3351 case DTYPE_SHM: 3352 type = KF_TYPE_SHM; 3353 data = fp; 3354 break; 3355 3356 case DTYPE_SEM: 3357 type = KF_TYPE_SEM; 3358 break; 3359 3360 case DTYPE_PTS: 3361 type = KF_TYPE_PTS; 3362 data = fp->f_data; 3363 break; 3364 3365#ifdef PROCDESC 3366 case DTYPE_PROCDESC: 3367 type = KF_TYPE_PROCDESC; 3368 data = fp->f_data; 3369 break; 3370#endif 3371 3372 default: 3373 type = KF_TYPE_UNKNOWN; 3374 break; 3375 } 3376 refcnt = fp->f_count; 3377 fflags = fp->f_flag; 3378 offset = fp->f_offset; 3379 3380 /* 3381 * Create sysctl entry. 3382 * It is OK to drop the filedesc lock here as we will 3383 * re-validate and re-evaluate its properties when 3384 * the loop continues. 3385 */ 3386 oldidx = req->oldidx; 3387 if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO) 3388 FILEDESC_SUNLOCK(fdp); 3389 error = export_fd_for_sysctl(data, type, i, fflags, refcnt, 3390 offset, fd_is_cap, fd_cap_rights, kif, req); 3391 if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO) 3392 FILEDESC_SLOCK(fdp); 3393 if (error) { 3394 if (error == ENOMEM) { 3395 /* 3396 * The hack to keep the ABI of sysctl 3397 * kern.proc.filedesc intact, but not 3398 * to account a partially copied 3399 * kinfo_file into the oldidx. 3400 */ 3401 req->oldidx = oldidx; 3402 error = 0; 3403 } 3404 break; 3405 } 3406 } 3407 FILEDESC_SUNLOCK(fdp); 3408fail: 3409 if (fdp != NULL) 3410 fddrop(fdp); 3411 free(kif, M_TEMP); 3412 return (error); 3413} 3414 3415int 3416vntype_to_kinfo(int vtype) 3417{ 3418 struct { 3419 int vtype; 3420 int kf_vtype; 3421 } vtypes_table[] = { 3422 { VBAD, KF_VTYPE_VBAD }, 3423 { VBLK, KF_VTYPE_VBLK }, 3424 { VCHR, KF_VTYPE_VCHR }, 3425 { VDIR, KF_VTYPE_VDIR }, 3426 { VFIFO, KF_VTYPE_VFIFO }, 3427 { VLNK, KF_VTYPE_VLNK }, 3428 { VNON, KF_VTYPE_VNON }, 3429 { VREG, KF_VTYPE_VREG }, 3430 { VSOCK, KF_VTYPE_VSOCK } 3431 }; 3432#define NVTYPES (sizeof(vtypes_table) / sizeof(*vtypes_table)) 3433 unsigned int i; 3434 3435 /* 3436 * Perform vtype translation. 3437 */ 3438 for (i = 0; i < NVTYPES; i++) 3439 if (vtypes_table[i].vtype == vtype) 3440 break; 3441 if (i < NVTYPES) 3442 return (vtypes_table[i].kf_vtype); 3443 3444 return (KF_VTYPE_UNKNOWN); 3445} 3446 3447static int 3448fill_vnode_info(struct vnode *vp, struct kinfo_file *kif) 3449{ 3450 struct vattr va; 3451 char *fullpath, *freepath; 3452 int error, vfslocked; 3453 3454 if (vp == NULL) 3455 return (1); 3456 kif->kf_vnode_type = vntype_to_kinfo(vp->v_type); 3457 freepath = NULL; 3458 fullpath = "-"; 3459 error = vn_fullpath(curthread, vp, &fullpath, &freepath); 3460 if (error == 0) { 3461 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); 3462 } 3463 if (freepath != NULL) 3464 free(freepath, M_TEMP); 3465 3466 /* 3467 * Retrieve vnode attributes. 3468 */ 3469 va.va_fsid = VNOVAL; 3470 va.va_rdev = NODEV; 3471 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 3472 vn_lock(vp, LK_SHARED | LK_RETRY); 3473 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 3474 VOP_UNLOCK(vp, 0); 3475 VFS_UNLOCK_GIANT(vfslocked); 3476 if (error != 0) 3477 return (error); 3478 if (va.va_fsid != VNOVAL) 3479 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; 3480 else 3481 kif->kf_un.kf_file.kf_file_fsid = 3482 vp->v_mount->mnt_stat.f_fsid.val[0]; 3483 kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; 3484 kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); 3485 kif->kf_un.kf_file.kf_file_size = va.va_size; 3486 kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; 3487 return (0); 3488} 3489 3490static int 3491fill_socket_info(struct socket *so, struct kinfo_file *kif) 3492{ 3493 struct sockaddr *sa; 3494 struct inpcb *inpcb; 3495 struct unpcb *unpcb; 3496 int error; 3497 3498 if (so == NULL) 3499 return (1); 3500 kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; 3501 kif->kf_sock_type = so->so_type; 3502 kif->kf_sock_protocol = so->so_proto->pr_protocol; 3503 kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; 3504 switch(kif->kf_sock_domain) { 3505 case AF_INET: 3506 case AF_INET6: 3507 if (kif->kf_sock_protocol == IPPROTO_TCP) { 3508 if (so->so_pcb != NULL) { 3509 inpcb = (struct inpcb *)(so->so_pcb); 3510 kif->kf_un.kf_sock.kf_sock_inpcb = 3511 (uintptr_t)inpcb->inp_ppcb; 3512 } 3513 } 3514 break; 3515 case AF_UNIX: 3516 if (so->so_pcb != NULL) { 3517 unpcb = (struct unpcb *)(so->so_pcb); 3518 if (unpcb->unp_conn) { 3519 kif->kf_un.kf_sock.kf_sock_unpconn = 3520 (uintptr_t)unpcb->unp_conn; 3521 kif->kf_un.kf_sock.kf_sock_rcv_sb_state = 3522 so->so_rcv.sb_state; 3523 kif->kf_un.kf_sock.kf_sock_snd_sb_state = 3524 so->so_snd.sb_state; 3525 } 3526 } 3527 break; 3528 } 3529 error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); 3530 if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { 3531 bcopy(sa, &kif->kf_sa_local, sa->sa_len); 3532 free(sa, M_SONAME); 3533 } 3534 error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); 3535 if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { 3536 bcopy(sa, &kif->kf_sa_peer, sa->sa_len); 3537 free(sa, M_SONAME); 3538 } 3539 strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, 3540 sizeof(kif->kf_path)); 3541 return (0); 3542} 3543 3544static int 3545fill_pts_info(struct tty *tp, struct kinfo_file *kif) 3546{ 3547 3548 if (tp == NULL) 3549 return (1); 3550 kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); 3551 strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); 3552 return (0); 3553} 3554 3555static int 3556fill_pipe_info(struct pipe *pi, struct kinfo_file *kif) 3557{ 3558 3559 if (pi == NULL) 3560 return (1); 3561 kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; 3562 kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; 3563 kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; 3564 return (0); 3565} 3566 3567static int 3568fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif) 3569{ 3570 3571 if (pdp == NULL) 3572 return (1); 3573 kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; 3574 return (0); 3575} 3576 3577static int 3578fill_shm_info(struct file *fp, struct kinfo_file *kif) 3579{ 3580 struct thread *td; 3581 struct stat sb; 3582 3583 td = curthread; 3584 if (fp->f_data == NULL) 3585 return (1); 3586 if (fo_stat(fp, &sb, td->td_ucred, td) != 0) 3587 return (1); 3588 shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path)); 3589 kif->kf_un.kf_file.kf_file_mode = sb.st_mode; 3590 kif->kf_un.kf_file.kf_file_size = sb.st_size; 3591 return (0); 3592} 3593 3594static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD, 3595 sysctl_kern_proc_filedesc, "Process filedesc entries"); 3596 3597#ifdef DDB 3598/* 3599 * For the purposes of debugging, generate a human-readable string for the 3600 * file type. 3601 */ 3602static const char * 3603file_type_to_name(short type) 3604{ 3605 3606 switch (type) { 3607 case 0: 3608 return ("zero"); 3609 case DTYPE_VNODE: 3610 return ("vnod"); 3611 case DTYPE_SOCKET: 3612 return ("sock"); 3613 case DTYPE_PIPE: 3614 return ("pipe"); 3615 case DTYPE_FIFO: 3616 return ("fifo"); 3617 case DTYPE_KQUEUE: 3618 return ("kque"); 3619 case DTYPE_CRYPTO: 3620 return ("crpt"); 3621 case DTYPE_MQUEUE: 3622 return ("mque"); 3623 case DTYPE_SHM: 3624 return ("shm"); 3625 case DTYPE_SEM: 3626 return ("ksem"); 3627 default: 3628 return ("unkn"); 3629 } 3630} 3631 3632/* 3633 * For the purposes of debugging, identify a process (if any, perhaps one of 3634 * many) that references the passed file in its file descriptor array. Return 3635 * NULL if none. 3636 */ 3637static struct proc * 3638file_to_first_proc(struct file *fp) 3639{ 3640 struct filedesc *fdp; 3641 struct proc *p; 3642 int n; 3643 3644 FOREACH_PROC_IN_SYSTEM(p) { 3645 if (p->p_state == PRS_NEW) 3646 continue; 3647 fdp = p->p_fd; 3648 if (fdp == NULL) 3649 continue; 3650 for (n = 0; n < fdp->fd_nfiles; n++) { 3651 if (fp == fdp->fd_ofiles[n]) 3652 return (p); 3653 } 3654 } 3655 return (NULL); 3656} 3657 3658static void 3659db_print_file(struct file *fp, int header) 3660{ 3661 struct proc *p; 3662 3663 if (header) 3664 db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", 3665 "File", "Type", "Data", "Flag", "GCFl", "Count", 3666 "MCount", "Vnode", "FPID", "FCmd"); 3667 p = file_to_first_proc(fp); 3668 db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, 3669 file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, 3670 0, fp->f_count, 0, fp->f_vnode, 3671 p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); 3672} 3673 3674DB_SHOW_COMMAND(file, db_show_file) 3675{ 3676 struct file *fp; 3677 3678 if (!have_addr) { 3679 db_printf("usage: show file <addr>\n"); 3680 return; 3681 } 3682 fp = (struct file *)addr; 3683 db_print_file(fp, 1); 3684} 3685 3686DB_SHOW_COMMAND(files, db_show_files) 3687{ 3688 struct filedesc *fdp; 3689 struct file *fp; 3690 struct proc *p; 3691 int header; 3692 int n; 3693 3694 header = 1; 3695 FOREACH_PROC_IN_SYSTEM(p) { 3696 if (p->p_state == PRS_NEW) 3697 continue; 3698 if ((fdp = p->p_fd) == NULL) 3699 continue; 3700 for (n = 0; n < fdp->fd_nfiles; ++n) { 3701 if ((fp = fdp->fd_ofiles[n]) == NULL) 3702 continue; 3703 db_print_file(fp, header); 3704 header = 0; 3705 } 3706 } 3707} 3708#endif 3709 3710SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, 3711 &maxfilesperproc, 0, "Maximum files allowed open per process"); 3712 3713SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, 3714 &maxfiles, 0, "Maximum number of files"); 3715 3716SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, 3717 __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); 3718 3719/* ARGSUSED*/ 3720static void 3721filelistinit(void *dummy) 3722{ 3723 3724 file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, 3725 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 3726 mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); 3727 mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); 3728} 3729SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); 3730 3731/*-------------------------------------------------------------------*/ 3732 3733static int 3734badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, 3735 int flags, struct thread *td) 3736{ 3737 3738 return (EBADF); 3739} 3740 3741static int 3742badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, 3743 struct thread *td) 3744{ 3745 3746 return (EINVAL); 3747} 3748 3749static int 3750badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, 3751 struct thread *td) 3752{ 3753 3754 return (EBADF); 3755} 3756 3757static int 3758badfo_poll(struct file *fp, int events, struct ucred *active_cred, 3759 struct thread *td) 3760{ 3761 3762 return (0); 3763} 3764 3765static int 3766badfo_kqfilter(struct file *fp, struct knote *kn) 3767{ 3768 3769 return (EBADF); 3770} 3771 3772static int 3773badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, 3774 struct thread *td) 3775{ 3776 3777 return (EBADF); 3778} 3779 3780static int 3781badfo_close(struct file *fp, struct thread *td) 3782{ 3783 3784 return (EBADF); 3785} 3786 3787static int 3788badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 3789 struct thread *td) 3790{ 3791 3792 return (EBADF); 3793} 3794 3795static int 3796badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 3797 struct thread *td) 3798{ 3799 3800 return (EBADF); 3801} 3802 3803struct fileops badfileops = { 3804 .fo_read = badfo_readwrite, 3805 .fo_write = badfo_readwrite, 3806 .fo_truncate = badfo_truncate, 3807 .fo_ioctl = badfo_ioctl, 3808 .fo_poll = badfo_poll, 3809 .fo_kqfilter = badfo_kqfilter, 3810 .fo_stat = badfo_stat, 3811 .fo_close = badfo_close, 3812 .fo_chmod = badfo_chmod, 3813 .fo_chown = badfo_chown, 3814}; 3815 3816int 3817invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 3818 struct thread *td) 3819{ 3820 3821 return (EINVAL); 3822} 3823 3824int 3825invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 3826 struct thread *td) 3827{ 3828 3829 return (EINVAL); 3830} 3831 3832/*-------------------------------------------------------------------*/ 3833 3834/* 3835 * File Descriptor pseudo-device driver (/dev/fd/). 3836 * 3837 * Opening minor device N dup()s the file (if any) connected to file 3838 * descriptor N belonging to the calling process. Note that this driver 3839 * consists of only the ``open()'' routine, because all subsequent 3840 * references to this file will be direct to the other driver. 3841 * 3842 * XXX: we could give this one a cloning event handler if necessary. 3843 */ 3844 3845/* ARGSUSED */ 3846static int 3847fdopen(struct cdev *dev, int mode, int type, struct thread *td) 3848{ 3849 3850 /* 3851 * XXX Kludge: set curthread->td_dupfd to contain the value of the 3852 * the file descriptor being sought for duplication. The error 3853 * return ensures that the vnode for this device will be released 3854 * by vn_open. Open will detect this special error and take the 3855 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN 3856 * will simply report the error. 3857 */ 3858 td->td_dupfd = dev2unit(dev); 3859 return (ENODEV); 3860} 3861 3862static struct cdevsw fildesc_cdevsw = { 3863 .d_version = D_VERSION, 3864 .d_open = fdopen, 3865 .d_name = "FD", 3866}; 3867 3868static void 3869fildesc_drvinit(void *unused) 3870{ 3871 struct cdev *dev; 3872 3873 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, 3874 UID_ROOT, GID_WHEEL, 0666, "fd/0"); 3875 make_dev_alias(dev, "stdin"); 3876 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, 3877 UID_ROOT, GID_WHEEL, 0666, "fd/1"); 3878 make_dev_alias(dev, "stdout"); 3879 dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, 3880 UID_ROOT, GID_WHEEL, 0666, "fd/2"); 3881 make_dev_alias(dev, "stderr"); 3882} 3883 3884SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); 3885