/*
 * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/malloc.h>
#include <sys/disk.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>

#include <sys/kdebug.h>

/* XXX the following prototypes should be in a header file somewhere */
extern dev_t	chrtoblk(dev_t dev);
extern int	iskmemdev(dev_t dev);
extern int	bpfkqfilter(dev_t dev, struct knote *kn);
extern int	ptsd_kqfilter(dev_t dev, struct knote *kn);

extern int ignore_is_ssd;

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";

#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };


static void set_blocksize(vnode_t, dev_t);


#define THROTTLE_LEVEL_NONE	-1
#define THROTTLE_LEVEL_TIER0	 0

#define THROTTLE_LEVEL_THROTTLED 1
#define THROTTLE_LEVEL_TIER1	 1
#define THROTTLE_LEVEL_TIER2	 2

#define THROTTLE_LEVEL_START	 0
#define THROTTLE_LEVEL_END	 2


struct _throttle_io_info_t {
	struct timeval	throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
	struct timeval	throttle_last_write_timestamp;
	struct timeval	throttle_start_IO_period_timestamp;

	TAILQ_HEAD(, uthread) throttle_uthlist;	/* List of throttled uthreads */

	lck_mtx_t	throttle_lock;
	thread_call_t	throttle_timer_call;
	int32_t		throttle_timer_running;
	int32_t		throttle_io_count;
	int32_t		throttle_io_count_begin;
	int32_t		throttle_io_period;
	uint32_t	throttle_io_period_num;
	int32_t		throttle_refcnt;
	int32_t		throttle_alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];

static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd);
static int throttle_get_thread_throttle_level(uthread_t ut, int policy);

__private_extern__ int32_t throttle_legacy_process_count = 0;

/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{

	*ap->a_vpp = NULL;
	return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)		/* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	}
	else
		vp->v_specsize = DEV_BSIZE;
}

void
set_fsblocksize(struct vnode *vp)
{

	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		vnode_lock(vp);
		set_blocksize(vp, dev);
		vnode_unlock(vp);
	}
}


/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}

		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}

		devsw_unlock(dev, S_IFCHR);

		if (error == 0 && (D_TYPEMASK & cdevsw[maj].d_type) == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
			int isssd = 0;
			uint64_t throttle_mask = 0;
			uint32_t devbsdunit = 0;

			if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {

				if (throttle_mask != 0 &&
				    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
					/*
					 * as a reasonable approximation, only use the lowest bit of the mask
					 * to generate a disk unit number
					 */
					devbsdunit = num_trailing_0(throttle_mask);

					vnode_lock(vp);

					vp->v_un.vu_specinfo->si_isssd = isssd;
					vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
					vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
					vp->v_un.vu_specinfo->si_throttleable = 1;
					vp->v_un.vu_specinfo->si_initted = 1;

					vnode_unlock(vp);
				}
			}
			if (vp->v_un.vu_specinfo->si_initted == 0) {
				vnode_lock(vp);
				vp->v_un.vu_specinfo->si_initted = 1;
				vnode_unlock(vp);
			}
		}
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && isdisk(dev, VBLK))
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);

		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (!error) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;


			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */

				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
					error = ENXIO;
			}


			vnode_lock(vp);
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
				vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */

			vnode_unlock(vp);

		}
		return (error);
	default:
		panic("spec_open type");
	}
	return (0);
}
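
/*
 * For reference, the geometry that spec_open() probes above is also
 * available from user space through the same disk ioctls.  A minimal
 * sketch (hypothetical user-space code, not part of this file; the
 * device path is an assumption and error handling is elided):
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/disk.h>
 *
 *	int fd = open("/dev/rdisk0", O_RDONLY);
 *	uint32_t blksize;
 *	uint64_t blkcnt;
 *	ioctl(fd, DKIOCGETBLOCKSIZE, &blksize);	// logical block size
 *	ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt);	// block count at that size
 *	uint64_t capacity = (uint64_t)blksize * blkcnt;	// bytes
 */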

/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
			struct _throttle_io_info_t *throttle_info;

			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

			throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);
		}
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);

		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
					       (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}
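
/*
 * Worked example of the VBLK read math above: with a 512-byte device
 * block size and 4096-byte pages, bscale = 8 and bsize = 4096.  A read
 * at uio_offset 6144 gives on = 6144 % 4096 = 2048 and
 * bn = (6144 / 512) & ~7 = 8, so the page-aligned run of device blocks
 * starting at block 8 (byte 4096) is brought in and the copy to the
 * caller begins 2048 bytes into that buffer.
 */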

/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
			struct _throttle_io_info_t *throttle_info;

			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

			throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);

			microuptime(&throttle_info->throttle_last_write_timestamp);
		}
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);

		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;


		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are writing exactly a block on a block
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The write doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the write to what is there */
				n = (int)(vp->v_specdevsize - (u_int64_t)uio->uio_offset);
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			if (io_sync)
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}
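
/*
 * Worked example of the buf_getblk() optimization above: with
 * bsize = 4096, a 4096-byte write at a 4096-byte-aligned offset has
 * on == 0 and n == bsize, so the whole buffer will be overwritten and
 * there is no need to read the old contents first.  Any smaller or
 * misaligned write takes the buf_bread() path so the untouched part of
 * the block is preserved (a read-modify-write).
 */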

/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;
	int retval = 0;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
	    (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

	switch (ap->a_vp->v_type) {

	case VCHR:
		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, p);
		break;

	case VBLK:
		if (kdebug_enable) {
			if (ap->a_command == DKIOCUNMAP) {
				dk_unmap_t	*unmap;
				dk_extent_t	*extent;
				uint32_t	i;

				unmap = (dk_unmap_t *)ap->a_data;
				extent = unmap->extents;

				for (i = 0; i < unmap->extentsCount; i++, extent++) {
					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0);
				}
			}
		}
		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
		break;

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
	    (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

	return (retval);
}

int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}

static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
	dev_t dev;
	int err = EINVAL;

	/*
	 * For a few special kinds of devices, we can attach knotes.
	 * Each filter function must check whether the dev type matches it.
	 */
	dev = vnode_specrdev(vp);

	if (vnode_istty(vp)) {
		/* We can hook into TTYs... */
		err = filt_specattach(kn);
	} else {
		/* Try a bpf device, as defined in bsd/net/bpf.c */
		err = bpfkqfilter(dev, kn);
	}

	return err;
}
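
/*
 * A minimal user-space sketch of what spec_kqfilter() enables
 * (hypothetical code, not part of this file): watching a TTY for
 * readability through kqueue.
 *
 *	#include <fcntl.h>
 *	#include <sys/event.h>
 *
 *	int fd = open("/dev/ttys000", O_RDONLY);	// assumed device path
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// registers via spec_kqfilter()
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	// blocks until fd is readable
 */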

/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

	return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}


/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

void throttle_init(void);


#define LOWPRI_THROTTLE_WINDOW_MSECS		500
#define LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS	200
#define LOWPRI_IO_PERIOD_MSECS			200
#define LOWPRI_IO_PERIOD_SSD_MSECS		20
#define LOWPRI_TIMER_PERIOD_MSECS		10


int	lowpri_throttle_window_msecs = LOWPRI_THROTTLE_WINDOW_MSECS;
int	lowpri_legacy_throttle_window_msecs = LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS;
int	lowpri_io_period_msecs = LOWPRI_IO_PERIOD_MSECS;
int	lowpri_io_period_ssd_msecs = LOWPRI_IO_PERIOD_SSD_MSECS;
int	lowpri_timer_period_msecs = LOWPRI_TIMER_PERIOD_MSECS;

/*
 * If a process requiring legacy iothrottle behavior is running on the
 * system, use legacy limits for throttle window and max IO size.
 */
#if CONFIG_EMBEDDED
#define THROTTLE_WINDOW (lowpri_throttle_window_msecs)
#else
#define THROTTLE_WINDOW (throttle_legacy_process_count == 0 ? lowpri_throttle_window_msecs : lowpri_legacy_throttle_window_msecs)
#endif

#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)	\
	do {							\
		if ((debug_info)->alloc)			\
			printf("%s: "format, __FUNCTION__, ## args); \
	} while (0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_legacy_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_ssd_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_timer_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_timer_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_process_count, CTLFLAG_RD | CTLFLAG_LOCKED, &throttle_legacy_process_count, 0, "");

static lck_grp_t	*throttle_mtx_grp;
static lck_attr_t	*throttle_mtx_attr;
static lck_grp_attr_t	*throttle_mtx_grp_attr;


/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
int
num_trailing_0(uint64_t n)
{
	/*
	 * since in most cases the number of trailing 0s is very small,
	 * we simply count sequentially from the lowest bit
	 */
	if (n == 0)
		return sizeof(n) * 8;
	int count = 0;
	while (!ISSET(n, 1)) {
		n >>= 1;
		++count;
	}
	return count;
}
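
/*
 * Example: a throttle_mask of 0x10 (only bit 4 set) yields
 * num_trailing_0(0x10) == 4 and selects _throttle_io_info[4];
 * num_trailing_0(0) returns 64, the full width of the argument.
 */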

/*
 * Release the reference and if the item was allocated and this is the last
 * reference then free it.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_rel(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
	    info, (int)(oldValue - 1), info);

	/* The reference count just went negative, very bad */
	if (oldValue == 0)
		panic("throttle info ref cnt went negative!");

	/*
	 * Once the reference count is zero, no one else should be able to take a
	 * reference
	 */
	if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);

		lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp);
		FREE(info, M_TEMP);
	}
	return oldValue;
}


/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
	    info, (int)(oldValue - 1), info);
	/* Allocated items should never have a reference of zero */
	if (info->throttle_alloc && (oldValue == 0))
		panic("Taking a reference without calling create throttle info!\n");

	return oldValue;
}


/*
 * on entry the throttle_lock is held...
 * this function is responsible for taking
 * and dropping the reference on the info
 * structure which will keep it from going
 * away while the timer is running if it
 * happens to have been dynamically allocated by
 * a network filesystem kext which is now trying
 * to free it
 */
static uint32_t
throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count)
{
	struct timeval	elapsed;
	uint64_t	elapsed_msecs;
	int		throttle_level;
	uint64_t	deadline;

	if (update_io_count == TRUE) {
		info->throttle_io_count_begin = info->throttle_io_count;
		info->throttle_io_period_num++;

		microuptime(&info->throttle_start_IO_period_timestamp);
	}
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW) {
			/*
			 * we had an I/O occur in this level within
			 * our throttle window, so we need to
			 * make sure the timer continues to run
			 */
			break;
		}
	}
	if (throttle_level >= THROTTLE_LEVEL_END) {
		/*
		 * we're outside all of the throttle windows...
		 * don't start a new timer
		 */
		info->throttle_timer_running = 0;

		return (THROTTLE_LEVEL_END);
	}
	if (info->throttle_timer_running == 0) {
		/*
		 * take a reference for the timer
		 */
		throttle_info_ref(info);

		info->throttle_timer_running = 1;
	}
	clock_interval_to_deadline(lowpri_timer_period_msecs, 1000000, &deadline);

	thread_call_enter_delayed(info->throttle_timer_call, deadline);

	return (throttle_level);
}


static void
throttle_timer(struct _throttle_io_info_t *info)
{
	uthread_t	ut, utlist;
	struct timeval	elapsed;
	uint64_t	elapsed_msecs;
	int		throttle_level;
	boolean_t	update_io_count = FALSE;
	boolean_t	need_wakeup = FALSE;
	boolean_t	need_release = FALSE;

	lck_mtx_lock(&info->throttle_lock);

	microuptime(&elapsed);
	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp);
	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

	if (elapsed_msecs >= (uint64_t)info->throttle_io_period) {
		/*
		 * we're closing out the current IO period...
		 * if we have a waiting thread, wake it up
		 * after we have reset the I/O window info
		 */
		need_wakeup = TRUE;
		update_io_count = TRUE;
	}
	if ((throttle_level = throttle_timer_start(info, update_io_count)) == THROTTLE_LEVEL_END) {
		/*
		 * we are now outside of the throttle window
		 * for all throttle levels...
		 *
		 * the timer is not restarted in this case, so
		 * we need to get rid of the reference we took when
		 * we started up the timer... we can't do this
		 * until we are entirely done playing with 'info'
		 */
		need_release = TRUE;
	}

	TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist, uu_throttlelist, utlist) {
		/*
		 * if we are now outside of the throttle window release
		 * all of the currently blocked threads, otherwise
		 * look for threads that have had their IO policy changed
		 * by someone else and are no longer throttleable, or are
		 * not at the current throttle level and unblock them
		 */
		if (throttle_level == THROTTLE_LEVEL_END || throttle_get_thread_throttle_level(ut, -1) <= throttle_level) {

			TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
			ut->uu_on_throttlelist = 0;

			wakeup(&ut->uu_on_throttlelist);
		}
	}
	if (need_wakeup && !TAILQ_EMPTY(&info->throttle_uthlist)) {
		/*
		 * we've entered a new I/O period and we're still
		 * in the throttle window, so wakeup the next guy in line
		 */
		ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist);
		TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
		ut->uu_on_throttlelist = 0;

		wakeup(&ut->uu_on_throttlelist);
	}
	lck_mtx_unlock(&info->throttle_lock);

	if (need_release == TRUE)
		throttle_info_rel(info);
}


void
throttle_init(void)
{
	struct _throttle_io_info_t *info;
	int	i;

	/*
	 * allocate lock group attribute and group
	 */
	throttle_mtx_grp_attr = lck_grp_attr_alloc_init();
	throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	throttle_mtx_attr = lck_attr_alloc_init();

	for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
		info = &_throttle_io_info[i];

		lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
		info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

		TAILQ_INIT(&info->throttle_uthlist);
	}
}


/*
 * KPI routine
 *
 * wakeup and remove the specified thread from the throttle queue
 * if it's no longer in a throttleable state...
 * takes a valid uthread (which may or may not be on the
 * throttle queue) as input
 */
void
unthrottle_thread(uthread_t ut)
{
	struct _throttle_io_info_t *info;

	if ((info = ut->uu_throttle_info) == NULL)
		return;

	lck_mtx_lock(&info->throttle_lock);

	if (ut->uu_on_throttlelist && throttle_get_thread_throttle_level(ut, -1) <= THROTTLE_LEVEL_THROTTLED) {
		TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
		ut->uu_on_throttlelist = 0;

		wakeup(&ut->uu_on_throttlelist);
	}
	lck_mtx_unlock(&info->throttle_lock);
}


/*
 * KPI routine
 *
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
	struct _throttle_io_info_t *info;

	MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
	/* Should never happen but just in case */
	if (info == NULL)
		return NULL;
	/* Mark that this one was allocated and needs to be freed */
	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info);
	info->throttle_alloc = TRUE;

	lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

	TAILQ_INIT(&info->throttle_uthlist);

	/* Take a reference */
	OSIncrementAtomic(&info->throttle_refcnt);
	return info;
}

/*
 * KPI routine
 *
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release the reference taken by throttle_info_create.
 */
void
throttle_info_release(void *throttle_info)
{
	DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
	    (struct _throttle_io_info_t *)throttle_info,
	    (struct _throttle_io_info_t *)throttle_info);
	if (throttle_info)	/* Just to be careful */
		throttle_info_rel(throttle_info);
}

/*
 * KPI routine
 *
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by the cluster code). File systems that call
 * this in their mount routines must call throttle_info_mount_rel in their
 * unmount routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
	if ((throttle_info == NULL) || (mp == NULL))
		return;
	throttle_info_ref(throttle_info);

	/*
	 * We already have a reference; release it before adding the new one
	 */
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = throttle_info;
}
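
/*
 * A minimal sketch of how a (hypothetical) filesystem kext would use
 * this KPI, per the comments above -- one create/release pair per
 * filesystem instance and one mount_ref/mount_rel pair per mount:
 *
 *	void *ti = throttle_info_create();
 *	throttle_info_mount_ref(mp, ti);	// in the mount path
 *	...
 *	throttle_info_update(ti, 0);		// before each I/O
 *	...
 *	throttle_info_mount_rel(mp);		// in the unmount path
 *	throttle_info_release(ti);		// matches the create
 */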

/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask.  The
 * handle must be released by throttle_info_rel_by_mask
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
{
	int	dev_index;
	struct _throttle_io_info_t *info;

	if (throttle_info_handle == NULL)
		return EINVAL;

	dev_index = num_trailing_0(throttle_mask);
	info = &_throttle_io_info[dev_index];
	throttle_info_ref(info);
	*(struct _throttle_io_info_t **)throttle_info_handle = info;

	return 0;
}

/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
	/*
	 * for now the handle is just a pointer to _throttle_io_info_t
	 */
	throttle_info_rel((struct _throttle_io_info_t *)throttle_info_handle);
}
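
/*
 * Sketch of the mask-based handle KPI above (hypothetical caller; the
 * mask would typically come from DKIOCGETTHROTTLEMASK, as in spec_open()):
 *
 *	throttle_info_handle_t handle;
 *	if (throttle_info_ref_by_mask(throttle_mask, &handle) == 0) {
 *		throttle_info_update_by_mask(handle, 0);	// before an I/O
 *		throttle_info_rel_by_mask(handle);		// drop the reference
 *	}
 */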

/*
 * KPI routine
 *
 * File systems that call throttle_info_mount_ref must call this routine
 * in their unmount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = NULL;
}

void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	*tv = info->throttle_last_write_timestamp;
}

void
update_last_io_time(mount_t mp)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	microuptime(&info->throttle_last_write_timestamp);
}


int
throttle_get_io_policy(uthread_t *ut)
{
	*ut = get_bsdthread_info(current_thread());

	return (proc_get_task_selfdiskacc());
}



static int
throttle_get_thread_throttle_level(uthread_t ut, int policy)
{
	int thread_throttle_level = THROTTLE_LEVEL_NONE;

	if (ut == NULL)
		ut = get_bsdthread_info(current_thread());

	if (policy == -1)
		policy = proc_get_diskacc(ut->uu_thread);

	switch (policy) {

	case IOPOL_DEFAULT:
	case IOPOL_NORMAL:
		thread_throttle_level = THROTTLE_LEVEL_TIER0;
		/* FALLTHROUGH: even normal threads are subject to the uu_throttle_bc check below */
	case IOPOL_PASSIVE:
		if (ut->uu_throttle_bc == TRUE)
			thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_THROTTLE:
		thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_UTILITY:
		thread_throttle_level = THROTTLE_LEVEL_TIER1;
		break;
	default:
		printf("unknown I/O policy %d\n", policy);
		break;
	}
	return (thread_throttle_level);
}


static int
throttle_io_will_be_throttled_internal(void *throttle_info)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval	elapsed;
	uint64_t	elapsed_msecs;
	int		thread_throttle_level;
	int		throttle_level;

	if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL, -1)) < THROTTLE_LEVEL_THROTTLED)
		return (0);

	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW)
			break;
	}
	if (throttle_level >= thread_throttle_level) {
		/*
		 * we're beyond all of the throttle windows
		 * that affect the throttle level of this thread,
		 * so go ahead and treat as normal I/O
		 */
		return (0);
	}
	if (info->throttle_io_count != info->throttle_io_count_begin) {
		/*
		 * we've already issued at least one throttleable I/O
		 * in the current I/O window, so avoid issuing another one
		 */
		return (2);
	}
	/*
	 * we're in the throttle window, so
	 * cut the I/O size back
	 */
	return (1);
}
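
/*
 * To summarize the return values computed above: 0 means no throttle
 * window that matters to this thread is open, so the I/O proceeds
 * normally; 1 means the thread is inside a window but no throttleable
 * I/O has been issued yet in the current period, so the caller should
 * cut the I/O size back; 2 means a throttleable I/O has already been
 * issued this period, so the caller should avoid issuing another one.
 */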

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
{
	void *info;

	/*
	 * Should we just return zero if there is no mount point?
	 */
	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	return throttle_io_will_be_throttled_internal(info);
}


uint32_t
throttle_lowpri_io(int sleep_amount)
{
	uthread_t ut;
	struct _throttle_io_info_t *info;
	int	throttle_type = 0;
	int	sleep_cnt = 0;
	int	locked = 0;
	uint32_t throttle_io_period_num = 0;
	boolean_t insert_tail = TRUE;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window == 0)
		return (0);

	info = ut->uu_throttle_info;

	if ((sleep_amount == 0) || (info == NULL))
		goto done;

	if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
		sleep_amount = 0;

	throttle_io_period_num = info->throttle_io_period_num;

	while ( (throttle_type = throttle_io_will_be_throttled_internal(info)) ) {

		if (throttle_type == 1) {
			if (sleep_amount == 0)
				break;
			if (info->throttle_io_period_num < throttle_io_period_num)
				break;
			if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
				break;
		}
		if (!locked) {
			lck_mtx_lock(&info->throttle_lock);
			locked = 1;
		}
		if (info->throttle_timer_running == 0) {
			/*
			 * try to start the timer since it's
			 * currently not running.  On failure there is no
			 * timer reference to drop since it wasn't started
			 */
			if (throttle_timer_start(info, TRUE) == THROTTLE_LEVEL_END)
				goto done;
		}
		if (sleep_cnt == 0) {
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
			    ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);
		}
		if (ut->uu_on_throttlelist == 0) {
			if (insert_tail == TRUE)
				TAILQ_INSERT_TAIL(&info->throttle_uthlist, ut, uu_throttlelist);
			else
				TAILQ_INSERT_HEAD(&info->throttle_uthlist, ut, uu_throttlelist);

			ut->uu_on_throttlelist = 1;
		}
		msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL);

		sleep_cnt++;

		if (sleep_amount == 0)
			insert_tail = FALSE;
		else if (info->throttle_io_period_num < throttle_io_period_num ||
		    (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
			insert_tail = FALSE;
			sleep_amount = 0;
		}
	}
done:
	if (ut->uu_on_throttlelist) {
		if (!locked) {
			lck_mtx_lock(&info->throttle_lock);
			locked = 1;
		}
		if (ut->uu_on_throttlelist) {
			TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);

			ut->uu_on_throttlelist = 0;
		}
	}
	if (locked)
		lck_mtx_unlock(&info->throttle_lock);

	if (sleep_cnt)
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
		    ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);
	if (info)
		throttle_info_rel(info);

	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = FALSE;
	ut->uu_lowpri_window = 0;

	return (sleep_cnt);
}
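
/*
 * Sketch of the expected call pattern (the real callers live in the
 * system call return and paging paths): throttle_info_update_internal()
 * opens a throttle window on the issuing uthread at I/O time, and the
 * delay itself is taken later, along the lines of
 *
 *	if (ut->uu_lowpri_window)
 *		throttle_lowpri_io(1);	// sleep here, not at I/O issue time
 */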

/*
 * KPI routine
 *
 * set a kernel thread's IO policy.  policy can be
 * IOPOL_NORMAL, IOPOL_THROTTLE, or IOPOL_PASSIVE;
 * these policies are explained in the setiopolicy_np(3) man page
 */
void throttle_set_thread_io_policy(int policy)
{
	proc_apply_thread_selfdiskacc(policy);
}
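
/*
 * The user-space counterpart of this KPI is setiopolicy_np(3); a
 * minimal sketch (hypothetical user-space code, not part of this file):
 *
 *	#include <sys/resource.h>
 *
 *	// throttle all disk I/O issued by the current thread
 *	setiopolicy_np(IOPOL_TYPE_DISK, IOPOL_SCOPE_THREAD, IOPOL_THROTTLE);
 */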

static
void throttle_info_reset_window(uthread_t ut)
{
	struct _throttle_io_info_t *info;

	if ( (info = ut->uu_throttle_info) ) {
		throttle_info_rel(info);

		ut->uu_throttle_info = NULL;
		ut->uu_lowpri_window = 0;
		ut->uu_throttle_bc = FALSE;
	}
}

static
void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle)
{
	if (ut->uu_throttle_info == NULL) {

		ut->uu_throttle_info = info;
		throttle_info_ref(info);
		DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info);

		ut->uu_lowpri_window = THROTTLE_WINDOW;
		ut->uu_throttle_bc = BC_throttle;
	}
}


static
void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd)
{
	int thread_throttle_level;

	if (THROTTLE_WINDOW == 0)
		return;

	if (ut == NULL)
		ut = get_bsdthread_info(current_thread());

	thread_throttle_level = throttle_get_thread_throttle_level(ut, policy);

	if (thread_throttle_level == THROTTLE_LEVEL_TIER0 && ISSET(flags, B_PASSIVE))
		thread_throttle_level = THROTTLE_LEVEL_NONE;

	if (thread_throttle_level != THROTTLE_LEVEL_NONE)
		microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);

	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		if (info->throttle_io_period == 0) {

			if (isssd == TRUE)
				info->throttle_io_period = lowpri_io_period_ssd_msecs;
			else
				info->throttle_io_period = lowpri_io_period_msecs;

			if (info->throttle_io_period < lowpri_timer_period_msecs)
				info->throttle_io_period = lowpri_timer_period_msecs;
		}
		OSAddAtomic(1, &info->throttle_io_count);

		throttle_info_set_initial_window(ut, info, FALSE);
	}
}

void throttle_info_update_by_mount(mount_t mp)
{
	struct _throttle_io_info_t *info;
	uthread_t ut;
	boolean_t isssd = FALSE;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window)
		return;

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	if (info->throttle_io_period == 0) {

		if (isssd == TRUE)
			info->throttle_io_period = lowpri_io_period_ssd_msecs;
		else
			info->throttle_io_period = lowpri_io_period_msecs;

		if (info->throttle_io_period < lowpri_timer_period_msecs)
			info->throttle_io_period = lowpri_timer_period_msecs;
	}
	throttle_info_set_initial_window(ut, info, FALSE);
}


/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update(void *throttle_info, int flags)
{
	if (throttle_info)
		throttle_info_update_internal(throttle_info, NULL, -1, flags, FALSE);
}

/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	void *throttle_info = throttle_info_handle;

	/*
	 * for now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info.  Later if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update in a loop
	 */
	throttle_info_update(throttle_info, flags);
}


int throttle_info_io_will_be_throttled(void *throttle_info, int policy)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval	elapsed;
	uint64_t	elapsed_msecs;
	int		throttle_level;
	int		thread_throttle_level;

	switch (policy) {

	case IOPOL_THROTTLE:
		thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_UTILITY:
		thread_throttle_level = THROTTLE_LEVEL_TIER1;
		break;
	default:
		thread_throttle_level = THROTTLE_LEVEL_TIER0;
		break;
	}
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW)
			break;
	}
	if (throttle_level >= thread_throttle_level) {
		/*
		 * we're beyond all of the throttle windows
		 * so go ahead and treat as normal I/O
		 */
		return (0);
	}
	/*
	 * we're in the throttle window
	 */
	return (1);
}

void
throttle_legacy_process_incr(void)
{
	OSIncrementAtomic(&throttle_legacy_process_count);
}

void
throttle_legacy_process_decr(void)
{
	OSDecrementAtomic(&throttle_legacy_process_count);
}


int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	int	policy;
	dev_t	bdev;
	uthread_t ut;
	mount_t	mp;
	int	strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;
#if !CONFIG_EMBEDDED
	proc_t	curproc = current_proc();
#endif /* !CONFIG_EMBEDDED */

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;

	policy = throttle_get_io_policy(&ut);

	if (bp->b_flags & B_META)
		bp->b_attr.ba_flags |= BA_META;

	if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY) {
		bp->b_flags |= B_THROTTLED_IO;
		bp->b_attr.ba_flags |= BA_THROTTLED_IO;
		bp->b_flags &= ~B_PASSIVE;
	} else if (policy == IOPOL_PASSIVE)
		bp->b_flags |= B_PASSIVE;

#if !CONFIG_EMBEDDED
	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
		bp->b_attr.ba_flags |= BA_DELAYIDLESLEEP;
#endif /* !CONFIG_EMBEDDED */

	bflags = bp->b_flags;

	if (kdebug_enable) {
		int	code = 0;

		if (bflags & B_READ)
			code |= DKIO_READ;
		if (bflags & B_ASYNC)
			code |= DKIO_ASYNC;

		if (bflags & B_META)
			code |= DKIO_META;
		else if (bflags & B_PAGEIO)
			code |= DKIO_PAGING;

		if (bflags & B_THROTTLED_IO)
			code |= DKIO_THROTTLE;
		else if (bflags & B_PASSIVE)
			code |= DKIO_PASSIVE;

		if (bp->b_attr.ba_flags & BA_NOCACHE)
			code |= DKIO_NOCACHE;

		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
		    bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	}
	if (((bflags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
	    mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
		hard_throttle_on_root = 1;

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	throttle_info_update_internal(throttle_info, ut, policy, bflags, isssd);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->throttle_last_write_timestamp);

		if (mp) {
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}
	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache. Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through. Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return so we'll get garbage here. In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE	((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED	((int)0xcafebeef)
	typedef	int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t *)bdevsw[major(bdev)].d_strategy)(bp);

	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);

	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, TRUE);
	}
	return (0);
}
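
/*
 * Example of the kdebug code composed in spec_strategy() above: a
 * throttled, asynchronous metadata read is traced as
 * DKIO_READ | DKIO_ASYNC | DKIO_META | DKIO_THROTTLE under
 * FSDBG_CODE(DBG_DKRW, code).
 */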

/*
 * Block mapping is not applicable to special files, so this
 * is unsupported and simply reports that to the caller.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}


/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;
	int do_rele = 0;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 1 (this is the very
		 * last close), we detach the tty from the session
		 * and drop the session's reference on the vnode.
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
				struct tty *tp;

				session_lock(sessp);
				if (vp == sessp->s_ttyvp) {
					tp = SESSION_TP(sessp);
					sessp->s_ttyvp = NULL;
					sessp->s_ttyvid = 0;
					sessp->s_ttyp = TTY_NULL;
					sessp->s_ttypgrpid = NO_PID;
					do_rele = 1;
				}
				session_unlock(sessp);

				if (do_rele) {
					vnode_rele(vp);
					if (NULL != tp)
						ttyfree(tp);
				}
			}
			session_rele(sessp);
		}

		devsw_lock(dev, S_IFCHR);

		if (--vp->v_specinfo->si_opencount < 0)
			panic("negative open count (c, %u, %u)", major(dev), minor(dev));

		/*
		 * close always, or close on last reference, or close on revoke
		 */
		if ((D_TRACKCLOSE & cdevsw[major(dev)].d_type) != 0 ||
		    vcount(vp) == 0 || (flags & IO_REVOKE) != 0)
			error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);

		devsw_lock(dev, S_IFBLK);

		if (--vp->v_specinfo->si_opencount < 0)
			panic("negative open count (b, %u, %u)", major(dev), minor(dev));

		if (vcount(vp) == 0)
			error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return (EBADF);
	}

	return error;
}

/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{

	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{

	return (EBADF);
}

/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_blktooff: not implemented for VBLK\n");
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_blktooff type");
	}
	/* NOTREACHED */

	return (0);
}

/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_offtoblk: not implemented for VBLK\n");
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_offtoblk type");
	}
	/* NOTREACHED */

	return (0);
}

static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
	.f_isfd		= 1,
	.f_attach	= filt_specattach,
	.f_detach	= filt_specdetach,
	.f_event	= filt_spec,
	.f_peek		= filt_specpeek
};

static int
filter_to_seltype(int16_t filter)
{
	switch (filter) {
	case EVFILT_READ:
		return FREAD;
	case EVFILT_WRITE:
		return FWRITE;
	default:
		panic("filter_to_seltype(): invalid filter %d\n", filter);
		return 0;
	}
}

static int
filt_specattach(struct knote *kn)
{
	vnode_t vp;
	dev_t dev;

	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

	if (major(dev) >= nchrdev) {
		return ENXIO;
	}

	if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
		return EINVAL;
	}

	/* Resulting wql is safe to unlink even if it has never been linked */
	kn->kn_hook = wait_queue_link_allocate();
	if (kn->kn_hook == NULL) {
		return EAGAIN;
	}

	kn->kn_fop = &spec_filtops;
	kn->kn_hookid = vnode_vid(vp);

	knote_markstayqueued(kn);

	return 0;
}

static void
filt_specdetach(struct knote *kn)
{
	kern_return_t ret;

	/*
	 * Given wait queue link and wait queue set, unlink.  This is subtle.
	 * If the device has been revoked from under us, selclearthread() will
	 * have removed our link from the kqueue's wait queue set, which
	 * wait_queue_set_unlink_one() will detect and handle.
	 */
	ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
	if (ret != KERN_SUCCESS) {
		panic("filt_specdetach(): failed to unlink wait queue link.");
	}

	(void)wait_queue_link_free(kn->kn_hook);
	kn->kn_hook = NULL;
	kn->kn_status &= ~KN_STAYQUEUED;
}

static int
filt_spec(struct knote *kn, long hint)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int selres;
	int error;
	int use_offset;
	dev_t dev;
	uint64_t flags;

	assert(kn->kn_hook != NULL);

	if (hint != 0) {
		panic("filt_spec(): nonzero hint?");
	}

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return 1;
	}

	dev = vnode_specrdev(vp);
	flags = cdevsw_flags[major(dev)];
	use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
	assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

	/* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	if (use_offset) {
		if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
			kn->kn_data = 0;
		} else {
			kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
		}
	} else {
		kn->kn_data = selres;
	}

	vnode_put(vp);

	return (kn->kn_data != 0);
}

static unsigned
filt_specpeek(struct knote *kn)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int error, selres;

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		return 1;	/* Just like VNOP_SELECT() on recycled vnode */
	}

	/*
	 * Why pass the link here?  Because we may not have registered in the past...
	 */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	vnode_put(vp);
	return selres;
}