1/* 2 * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ 29/* 30 * Copyright (c) 1989, 1993, 1995 31 * The Regents of the University of California. All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 3. All advertising materials mentioning features or use of this software 42 * must display the following acknowledgement: 43 * This product includes software developed by the University of 44 * California, Berkeley and its contributors. 45 * 4. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 60 * 61 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 62 */ 63 64#include <sys/param.h> 65#include <sys/proc_internal.h> 66#include <sys/kauth.h> 67#include <sys/systm.h> 68#include <sys/kernel.h> 69#include <sys/conf.h> 70#include <sys/buf_internal.h> 71#include <sys/mount_internal.h> 72#include <sys/vnode_internal.h> 73#include <sys/file_internal.h> 74#include <sys/namei.h> 75#include <sys/stat.h> 76#include <sys/errno.h> 77#include <sys/ioctl.h> 78#include <sys/file.h> 79#include <sys/user.h> 80#include <sys/malloc.h> 81#include <sys/disk.h> 82#include <sys/uio_internal.h> 83#include <sys/resource.h> 84#include <miscfs/specfs/specdev.h> 85#include <vfs/vfs_support.h> 86#include <kern/assert.h> 87#include <kern/task.h> 88#include <pexpert/pexpert.h> 89 90#include <sys/kdebug.h> 91 92/* XXX following three prototypes should be in a header file somewhere */ 93extern dev_t chrtoblk(dev_t dev); 94extern int iskmemdev(dev_t dev); 95extern int bpfkqfilter(dev_t dev, struct knote *kn); 96extern int ptsd_kqfilter(dev_t dev, struct knote *kn); 97 98extern int ignore_is_ssd; 99 100struct vnode *speclisth[SPECHSZ]; 101 102/* symbolic sleep message strings for devices */ 103char devopn[] = "devopn"; 104char devio[] = "devio"; 105char devwait[] = "devwait"; 106char devin[] = "devin"; 107char devout[] = "devout"; 108char devioc[] = "devioc"; 109char devcls[] = "devcls"; 110 111#define VOPFUNC int (*)(void *) 112 113int (**spec_vnodeop_p)(void *); 114struct vnodeopv_entry_desc spec_vnodeop_entries[] = { 115 { &vnop_default_desc, (VOPFUNC)vn_default_error }, 116 { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ 117 { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ 118 { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ 119 { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ 120 { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */ 121 { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */ 122 { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */ 123 { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */ 124 { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */ 125 { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */ 126 { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ 127 { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ 128 { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ 129 { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ 130 { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ 131 { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */ 132 { &vnop_link_desc, (VOPFUNC)err_link }, /* link */ 133 { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */ 134 { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ 135 { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ 136 { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ 137 { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ 138 { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ 139 { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */ 140 { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */ 141 { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ 142 { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ 143 { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ 144 { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */ 145 { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ 146 { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ 147 { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ 148 { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ 149 { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */ 150 { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */ 151 { (struct vnodeop_desc*)NULL, (int(*)())NULL } 152}; 153struct vnodeopv_desc spec_vnodeop_opv_desc = 154 { &spec_vnodeop_p, spec_vnodeop_entries }; 155 156 157static void set_blocksize(vnode_t, dev_t); 158 159#define LOWPRI_TIER1_WINDOW_MSECS 25 160#define LOWPRI_TIER2_WINDOW_MSECS 100 161#define LOWPRI_TIER3_WINDOW_MSECS 500 162 163#define LOWPRI_TIER1_IO_PERIOD_MSECS 15 164#define LOWPRI_TIER2_IO_PERIOD_MSECS 50 165#define LOWPRI_TIER3_IO_PERIOD_MSECS 200 166 167#define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS 5 168#define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS 15 169#define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS 25 170 171 172int throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = { 173 0, 174 LOWPRI_TIER1_WINDOW_MSECS, 175 LOWPRI_TIER2_WINDOW_MSECS, 176 LOWPRI_TIER3_WINDOW_MSECS, 177}; 178 179int throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = { 180 0, 181 LOWPRI_TIER1_IO_PERIOD_MSECS, 182 LOWPRI_TIER2_IO_PERIOD_MSECS, 183 LOWPRI_TIER3_IO_PERIOD_MSECS, 184}; 185 186int throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = { 187 0, 188 LOWPRI_TIER1_IO_PERIOD_SSD_MSECS, 189 LOWPRI_TIER2_IO_PERIOD_SSD_MSECS, 190 LOWPRI_TIER3_IO_PERIOD_SSD_MSECS, 191}; 192 193 194int throttled_count[THROTTLE_LEVEL_END + 1]; 195 196struct _throttle_io_info_t { 197 lck_mtx_t throttle_lock; 198 199 struct timeval throttle_last_write_timestamp; 200 struct timeval throttle_min_timer_deadline; 201 struct timeval throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; 202 struct timeval throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1]; 203 pid_t throttle_last_IO_pid[THROTTLE_LEVEL_END + 1]; 204 struct timeval throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1]; 205 206 TAILQ_HEAD( , uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1]; /* Lists of throttled uthreads */ 207 int throttle_next_wake_level; 208 209 thread_call_t throttle_timer_call; 210 int32_t throttle_timer_ref; 211 int32_t throttle_timer_active; 212 213 int32_t throttle_io_count; 214 int32_t throttle_io_count_begin; 215 int *throttle_io_periods; 216 uint32_t throttle_io_period_num; 217 218 int32_t throttle_refcnt; 219 int32_t throttle_alloc; 220}; 221 222struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; 223 224 225int lowpri_throttle_enabled = 1; 226 227 228 229static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd); 230static int throttle_get_thread_throttle_level(uthread_t ut); 231 232/* 233 * Trivial lookup routine that always fails. 234 */ 235int 236spec_lookup(struct vnop_lookup_args *ap) 237{ 238 239 *ap->a_vpp = NULL; 240 return (ENOTDIR); 241} 242 243static void 244set_blocksize(struct vnode *vp, dev_t dev) 245{ 246 int (*size)(dev_t); 247 int rsize; 248 249 if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) { 250 rsize = (*size)(dev); 251 if (rsize <= 0) /* did size fail? */ 252 vp->v_specsize = DEV_BSIZE; 253 else 254 vp->v_specsize = rsize; 255 } 256 else 257 vp->v_specsize = DEV_BSIZE; 258} 259 260void 261set_fsblocksize(struct vnode *vp) 262{ 263 264 if (vp->v_type == VBLK) { 265 dev_t dev = (dev_t)vp->v_rdev; 266 int maj = major(dev); 267 268 if ((u_int)maj >= (u_int)nblkdev) 269 return; 270 271 vnode_lock(vp); 272 set_blocksize(vp, dev); 273 vnode_unlock(vp); 274 } 275 276} 277 278 279/* 280 * Open a special file. 281 */ 282int 283spec_open(struct vnop_open_args *ap) 284{ 285 struct proc *p = vfs_context_proc(ap->a_context); 286 kauth_cred_t cred = vfs_context_ucred(ap->a_context); 287 struct vnode *vp = ap->a_vp; 288 dev_t bdev, dev = (dev_t)vp->v_rdev; 289 int maj = major(dev); 290 int error; 291 292 /* 293 * Don't allow open if fs is mounted -nodev. 294 */ 295 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) 296 return (ENXIO); 297 298 switch (vp->v_type) { 299 300 case VCHR: 301 if ((u_int)maj >= (u_int)nchrdev) 302 return (ENXIO); 303 if (cred != FSCRED && (ap->a_mode & FWRITE)) { 304 /* 305 * When running in very secure mode, do not allow 306 * opens for writing of any disk character devices. 307 */ 308 if (securelevel >= 2 && isdisk(dev, VCHR)) 309 return (EPERM); 310 /* 311 * When running in secure mode, do not allow opens 312 * for writing of /dev/mem, /dev/kmem, or character 313 * devices whose corresponding block devices are 314 * currently mounted. 315 */ 316 if (securelevel >= 1) { 317 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) 318 return (error); 319 if (iskmemdev(dev)) 320 return (EPERM); 321 } 322 } 323 324 devsw_lock(dev, S_IFCHR); 325 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p); 326 327 if (error == 0) { 328 vp->v_specinfo->si_opencount++; 329 } 330 331 devsw_unlock(dev, S_IFCHR); 332 333 if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) { 334 int isssd = 0; 335 uint64_t throttle_mask = 0; 336 uint32_t devbsdunit = 0; 337 338 if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) { 339 340 if (throttle_mask != 0 && 341 VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) { 342 /* 343 * as a reasonable approximation, only use the lowest bit of the mask 344 * to generate a disk unit number 345 */ 346 devbsdunit = num_trailing_0(throttle_mask); 347 348 vnode_lock(vp); 349 350 vp->v_un.vu_specinfo->si_isssd = isssd; 351 vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit; 352 vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask; 353 vp->v_un.vu_specinfo->si_throttleable = 1; 354 vp->v_un.vu_specinfo->si_initted = 1; 355 356 vnode_unlock(vp); 357 } 358 } 359 if (vp->v_un.vu_specinfo->si_initted == 0) { 360 vnode_lock(vp); 361 vp->v_un.vu_specinfo->si_initted = 1; 362 vnode_unlock(vp); 363 } 364 } 365 return (error); 366 367 case VBLK: 368 if ((u_int)maj >= (u_int)nblkdev) 369 return (ENXIO); 370 /* 371 * When running in very secure mode, do not allow 372 * opens for writing of any disk block devices. 373 */ 374 if (securelevel >= 2 && cred != FSCRED && 375 (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) 376 return (EPERM); 377 /* 378 * Do not allow opens of block devices that are 379 * currently mounted. 380 */ 381 if ( (error = vfs_mountedon(vp)) ) 382 return (error); 383 384 devsw_lock(dev, S_IFBLK); 385 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p); 386 if (!error) { 387 vp->v_specinfo->si_opencount++; 388 } 389 devsw_unlock(dev, S_IFBLK); 390 391 if (!error) { 392 u_int64_t blkcnt; 393 u_int32_t blksize; 394 int setsize = 0; 395 u_int32_t size512 = 512; 396 397 398 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) { 399 /* Switch to 512 byte sectors (temporarily) */ 400 401 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) { 402 /* Get the number of 512 byte physical blocks. */ 403 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) { 404 setsize = 1; 405 } 406 } 407 /* If it doesn't set back, we can't recover */ 408 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) 409 error = ENXIO; 410 } 411 412 413 vnode_lock(vp); 414 set_blocksize(vp, dev); 415 416 /* 417 * Cache the size in bytes of the block device for later 418 * use by spec_write(). 419 */ 420 if (setsize) 421 vp->v_specdevsize = blkcnt * (u_int64_t)size512; 422 else 423 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */ 424 425 vnode_unlock(vp); 426 427 } 428 return(error); 429 default: 430 panic("spec_open type"); 431 } 432 return (0); 433} 434 435/* 436 * Vnode op for read 437 */ 438int 439spec_read(struct vnop_read_args *ap) 440{ 441 struct vnode *vp = ap->a_vp; 442 struct uio *uio = ap->a_uio; 443 struct buf *bp; 444 daddr64_t bn, nextbn; 445 long bsize, bscale; 446 int devBlockSize=0; 447 int n, on; 448 int error = 0; 449 dev_t dev; 450 451#if DIAGNOSTIC 452 if (uio->uio_rw != UIO_READ) 453 panic("spec_read mode"); 454 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) 455 panic("spec_read proc"); 456#endif 457 if (uio_resid(uio) == 0) 458 return (0); 459 460 switch (vp->v_type) { 461 462 case VCHR: 463 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { 464 struct _throttle_io_info_t *throttle_info; 465 466 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; 467 468 throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd); 469 } 470 error = (*cdevsw[major(vp->v_rdev)].d_read) 471 (vp->v_rdev, uio, ap->a_ioflag); 472 473 return (error); 474 475 case VBLK: 476 if (uio->uio_offset < 0) 477 return (EINVAL); 478 479 dev = vp->v_rdev; 480 481 devBlockSize = vp->v_specsize; 482 483 if (devBlockSize > PAGE_SIZE) 484 return (EINVAL); 485 486 bscale = PAGE_SIZE / devBlockSize; 487 bsize = bscale * devBlockSize; 488 489 do { 490 on = uio->uio_offset % bsize; 491 492 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1)); 493 494 if (vp->v_speclastr + bscale == bn) { 495 nextbn = bn + bscale; 496 error = buf_breadn(vp, bn, (int)bsize, &nextbn, 497 (int *)&bsize, 1, NOCRED, &bp); 498 } else 499 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp); 500 501 vnode_lock(vp); 502 vp->v_speclastr = bn; 503 vnode_unlock(vp); 504 505 n = bsize - buf_resid(bp); 506 if ((on > n) || error) { 507 if (!error) 508 error = EINVAL; 509 buf_brelse(bp); 510 return (error); 511 } 512 n = min((unsigned)(n - on), uio_resid(uio)); 513 514 error = uiomove((char *)buf_dataptr(bp) + on, n, uio); 515 if (n + on == bsize) 516 buf_markaged(bp); 517 buf_brelse(bp); 518 } while (error == 0 && uio_resid(uio) > 0 && n != 0); 519 return (error); 520 521 default: 522 panic("spec_read type"); 523 } 524 /* NOTREACHED */ 525 526 return (0); 527} 528 529/* 530 * Vnode op for write 531 */ 532int 533spec_write(struct vnop_write_args *ap) 534{ 535 struct vnode *vp = ap->a_vp; 536 struct uio *uio = ap->a_uio; 537 struct buf *bp; 538 daddr64_t bn; 539 int bsize, blkmask, bscale; 540 int io_sync; 541 int devBlockSize=0; 542 int n, on; 543 int error = 0; 544 dev_t dev; 545 546#if DIAGNOSTIC 547 if (uio->uio_rw != UIO_WRITE) 548 panic("spec_write mode"); 549 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) 550 panic("spec_write proc"); 551#endif 552 553 switch (vp->v_type) { 554 555 case VCHR: 556 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { 557 struct _throttle_io_info_t *throttle_info; 558 559 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; 560 561 throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd); 562 563 microuptime(&throttle_info->throttle_last_write_timestamp); 564 } 565 error = (*cdevsw[major(vp->v_rdev)].d_write) 566 (vp->v_rdev, uio, ap->a_ioflag); 567 568 return (error); 569 570 case VBLK: 571 if (uio_resid(uio) == 0) 572 return (0); 573 if (uio->uio_offset < 0) 574 return (EINVAL); 575 576 io_sync = (ap->a_ioflag & IO_SYNC); 577 578 dev = (vp->v_rdev); 579 580 devBlockSize = vp->v_specsize; 581 if (devBlockSize > PAGE_SIZE) 582 return(EINVAL); 583 584 bscale = PAGE_SIZE / devBlockSize; 585 blkmask = bscale - 1; 586 bsize = bscale * devBlockSize; 587 588 589 do { 590 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask); 591 on = uio->uio_offset % bsize; 592 593 n = min((unsigned)(bsize - on), uio_resid(uio)); 594 595 /* 596 * Use buf_getblk() as an optimization IFF: 597 * 598 * 1) We are reading exactly a block on a block 599 * aligned boundary 600 * 2) We know the size of the device from spec_open 601 * 3) The read doesn't span the end of the device 602 * 603 * Otherwise, we fall back on buf_bread(). 604 */ 605 if (n == bsize && 606 vp->v_specdevsize != (u_int64_t)0 && 607 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) { 608 /* reduce the size of the read to what is there */ 609 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize; 610 } 611 612 if (n == bsize) 613 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE); 614 else 615 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp); 616 617 /* Translate downstream error for upstream, if needed */ 618 if (!error) 619 error = (int)buf_error(bp); 620 if (error) { 621 buf_brelse(bp); 622 return (error); 623 } 624 n = min(n, bsize - buf_resid(bp)); 625 626 error = uiomove((char *)buf_dataptr(bp) + on, n, uio); 627 if (error) { 628 buf_brelse(bp); 629 return (error); 630 } 631 buf_markaged(bp); 632 633 if (io_sync) 634 error = buf_bwrite(bp); 635 else { 636 if ((n + on) == bsize) 637 error = buf_bawrite(bp); 638 else 639 error = buf_bdwrite(bp); 640 } 641 } while (error == 0 && uio_resid(uio) > 0 && n != 0); 642 return (error); 643 644 default: 645 panic("spec_write type"); 646 } 647 /* NOTREACHED */ 648 649 return (0); 650} 651 652/* 653 * Device ioctl operation. 654 */ 655int 656spec_ioctl(struct vnop_ioctl_args *ap) 657{ 658 proc_t p = vfs_context_proc(ap->a_context); 659 dev_t dev = ap->a_vp->v_rdev; 660 int retval = 0; 661 662 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START, 663 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0); 664 665 switch (ap->a_vp->v_type) { 666 667 case VCHR: 668 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, 669 ap->a_fflag, p); 670 break; 671 672 case VBLK: 673 if (kdebug_enable) { 674 if (ap->a_command == DKIOCUNMAP) { 675 dk_unmap_t *unmap; 676 dk_extent_t *extent; 677 uint32_t i; 678 679 unmap = (dk_unmap_t *)ap->a_data; 680 extent = unmap->extents; 681 682 for (i = 0; i < unmap->extentsCount; i++, extent++) { 683 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0); 684 } 685 } 686 } 687 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p); 688 break; 689 690 default: 691 panic("spec_ioctl"); 692 /* NOTREACHED */ 693 } 694 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END, 695 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0); 696 697 return (retval); 698} 699 700int 701spec_select(struct vnop_select_args *ap) 702{ 703 proc_t p = vfs_context_proc(ap->a_context); 704 dev_t dev; 705 706 switch (ap->a_vp->v_type) { 707 708 default: 709 return (1); /* XXX */ 710 711 case VCHR: 712 dev = ap->a_vp->v_rdev; 713 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p); 714 } 715} 716 717static int filt_specattach(struct knote *kn); 718 719int 720spec_kqfilter(vnode_t vp, struct knote *kn) 721{ 722 dev_t dev; 723 int err = EINVAL; 724 725 /* 726 * For a few special kinds of devices, we can attach knotes. 727 * Each filter function must check whether the dev type matches it. 728 */ 729 dev = vnode_specrdev(vp); 730 731 if (vnode_istty(vp)) { 732 /* We can hook into TTYs... */ 733 err = filt_specattach(kn); 734 } else { 735#if NETWORKING 736 /* Try a bpf device, as defined in bsd/net/bpf.c */ 737 err = bpfkqfilter(dev, kn); 738#endif 739 } 740 741 return err; 742} 743 744/* 745 * Synch buffers associated with a block device 746 */ 747int 748spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context) 749{ 750 if (vp->v_type == VCHR) 751 return (0); 752 /* 753 * Flush all dirty buffers associated with a block device. 754 */ 755 buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync"); 756 757 return (0); 758} 759 760int 761spec_fsync(struct vnop_fsync_args *ap) 762{ 763 return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context); 764} 765 766 767/* 768 * Just call the device strategy routine 769 */ 770void throttle_init(void); 771 772 773#if 0 774#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \ 775 do { \ 776 if ((debug_info)->alloc) \ 777 printf("%s: "format, __FUNCTION__, ## args); \ 778 } while(0) 779 780#else 781#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) 782#endif 783 784 785SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, ""); 786SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, ""); 787SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, ""); 788 789SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, ""); 790SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, ""); 791SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, ""); 792 793SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, ""); 794SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, ""); 795SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, ""); 796 797SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, ""); 798 799 800static lck_grp_t *throttle_mtx_grp; 801static lck_attr_t *throttle_mtx_attr; 802static lck_grp_attr_t *throttle_mtx_grp_attr; 803 804 805/* 806 * throttled I/O helper function 807 * convert the index of the lowest set bit to a device index 808 */ 809int 810num_trailing_0(uint64_t n) 811{ 812 /* 813 * since in most cases the number of trailing 0s is very small, 814 * we simply counting sequentially from the lowest bit 815 */ 816 if (n == 0) 817 return sizeof(n) * 8; 818 int count = 0; 819 while (!ISSET(n, 1)) { 820 n >>= 1; 821 ++count; 822 } 823 return count; 824} 825 826 827/* 828 * Release the reference and if the item was allocated and this is the last 829 * reference then free it. 830 * 831 * This routine always returns the old value. 832 */ 833static int 834throttle_info_rel(struct _throttle_io_info_t *info) 835{ 836 SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt); 837 838 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n", 839 info, (int)(oldValue -1), info ); 840 841 /* The reference count just went negative, very bad */ 842 if (oldValue == 0) 843 panic("throttle info ref cnt went negative!"); 844 845 /* 846 * Once reference count is zero, no one else should be able to take a 847 * reference 848 */ 849 if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) { 850 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info); 851 852 lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp); 853 FREE(info, M_TEMP); 854 } 855 return oldValue; 856} 857 858 859/* 860 * Just take a reference on the throttle info structure. 861 * 862 * This routine always returns the old value. 863 */ 864static SInt32 865throttle_info_ref(struct _throttle_io_info_t *info) 866{ 867 SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt); 868 869 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n", 870 info, (int)(oldValue -1), info ); 871 /* Allocated items should never have a reference of zero */ 872 if (info->throttle_alloc && (oldValue == 0)) 873 panic("Taking a reference without calling create throttle info!\n"); 874 875 return oldValue; 876} 877 878/* 879 * on entry the throttle_lock is held... 880 * this function is responsible for taking 881 * and dropping the reference on the info 882 * structure which will keep it from going 883 * away while the timer is running if it 884 * happens to have been dynamically allocated by 885 * a network fileystem kext which is now trying 886 * to free it 887 */ 888static uint32_t 889throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel) 890{ 891 struct timeval elapsed; 892 struct timeval now; 893 struct timeval period; 894 uint64_t elapsed_msecs; 895 int throttle_level; 896 int level; 897 int msecs; 898 boolean_t throttled = FALSE; 899 boolean_t need_timer = FALSE; 900 901 microuptime(&now); 902 903 if (update_io_count == TRUE) { 904 info->throttle_io_count_begin = info->throttle_io_count; 905 info->throttle_io_period_num++; 906 907 while (wakelevel >= THROTTLE_LEVEL_THROTTLED) 908 info->throttle_start_IO_period_timestamp[wakelevel--] = now; 909 910 info->throttle_min_timer_deadline = now; 911 912 msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]; 913 period.tv_sec = msecs / 1000; 914 period.tv_usec = (msecs % 1000) * 1000; 915 916 timevaladd(&info->throttle_min_timer_deadline, &period); 917 } 918 for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) { 919 920 elapsed = now; 921 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]); 922 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000); 923 924 for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) { 925 926 if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) { 927 928 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level]) { 929 /* 930 * we had an I/O occur at a higher priority tier within 931 * this tier's throttle window 932 */ 933 throttled = TRUE; 934 } 935 /* 936 * we assume that the windows are the same or longer 937 * as we drop through the throttling tiers... thus 938 * we can stop looking once we run into a tier with 939 * threads to schedule regardless of whether it's 940 * still in its throttling window or not 941 */ 942 break; 943 } 944 } 945 if (throttled == TRUE) 946 break; 947 } 948 if (throttled == TRUE) { 949 uint64_t deadline = 0; 950 struct timeval target; 951 struct timeval min_target; 952 953 /* 954 * we've got at least one tier still in a throttled window 955 * so we need a timer running... compute the next deadline 956 * and schedule it 957 */ 958 for (level = throttle_level+1; level <= THROTTLE_LEVEL_END; level++) { 959 960 if (TAILQ_EMPTY(&info->throttle_uthlist[level])) 961 continue; 962 963 target = info->throttle_start_IO_period_timestamp[level]; 964 965 msecs = info->throttle_io_periods[level]; 966 period.tv_sec = msecs / 1000; 967 period.tv_usec = (msecs % 1000) * 1000; 968 969 timevaladd(&target, &period); 970 971 if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) { 972 min_target = target; 973 need_timer = TRUE; 974 } 975 } 976 if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) { 977 if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >)) 978 min_target = info->throttle_min_timer_deadline; 979 } 980 981 if (info->throttle_timer_active) { 982 if (thread_call_cancel(info->throttle_timer_call) == FALSE) { 983 /* 984 * couldn't kill the timer because it's already 985 * been dispatched, so don't try to start a new 986 * one... once we drop the lock, the timer will 987 * proceed and eventually re-run this function 988 */ 989 need_timer = FALSE; 990 } else 991 info->throttle_timer_active = 0; 992 } 993 if (need_timer == TRUE) { 994 /* 995 * This is defined as an int (32-bit) rather than a 64-bit 996 * value because it would need a really big period in the 997 * order of ~500 days to overflow this. So, we let this be 998 * 32-bit which allows us to use the clock_interval_to_deadline() 999 * routine. 1000 */ 1001 int target_msecs; 1002 1003 if (info->throttle_timer_ref == 0) { 1004 /* 1005 * take a reference for the timer 1006 */ 1007 throttle_info_ref(info); 1008 1009 info->throttle_timer_ref = 1; 1010 } 1011 elapsed = min_target; 1012 timevalsub(&elapsed, &now); 1013 target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000; 1014 1015 if (target_msecs <= 0) { 1016 /* 1017 * we may have computed a deadline slightly in the past 1018 * due to various factors... if so, just set the timer 1019 * to go off in the near future (we don't need to be precise) 1020 */ 1021 target_msecs = 1; 1022 } 1023 clock_interval_to_deadline(target_msecs, 1000000, &deadline); 1024 1025 thread_call_enter_delayed(info->throttle_timer_call, deadline); 1026 info->throttle_timer_active = 1; 1027 } 1028 } 1029 return (throttle_level); 1030} 1031 1032 1033static void 1034throttle_timer(struct _throttle_io_info_t *info) 1035{ 1036 uthread_t ut, utlist; 1037 struct timeval elapsed; 1038 struct timeval now; 1039 uint64_t elapsed_msecs; 1040 int throttle_level; 1041 int level; 1042 int wake_level; 1043 caddr_t wake_address = NULL; 1044 boolean_t update_io_count = FALSE; 1045 boolean_t need_wakeup = FALSE; 1046 boolean_t need_release = FALSE; 1047 1048 ut = NULL; 1049 lck_mtx_lock(&info->throttle_lock); 1050 1051 info->throttle_timer_active = 0; 1052 microuptime(&now); 1053 1054 elapsed = now; 1055 timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]); 1056 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000); 1057 1058 if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) { 1059 1060 wake_level = info->throttle_next_wake_level; 1061 1062 for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) { 1063 1064 elapsed = now; 1065 timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]); 1066 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000); 1067 1068 if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) { 1069 /* 1070 * we're closing out the current IO period... 1071 * if we have a waiting thread, wake it up 1072 * after we have reset the I/O window info 1073 */ 1074 need_wakeup = TRUE; 1075 update_io_count = TRUE; 1076 1077 info->throttle_next_wake_level = wake_level - 1; 1078 1079 if (info->throttle_next_wake_level == THROTTLE_LEVEL_START) 1080 info->throttle_next_wake_level = THROTTLE_LEVEL_END; 1081 1082 break; 1083 } 1084 wake_level--; 1085 1086 if (wake_level == THROTTLE_LEVEL_START) 1087 wake_level = THROTTLE_LEVEL_END; 1088 } 1089 } 1090 if (need_wakeup == TRUE) { 1091 if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) { 1092 1093 ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]); 1094 TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist); 1095 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; 1096 1097 wake_address = (caddr_t)&ut->uu_on_throttlelist; 1098 } 1099 } else 1100 wake_level = THROTTLE_LEVEL_START; 1101 1102 throttle_level = throttle_timer_start(info, update_io_count, wake_level); 1103 1104 if (wake_address != NULL) 1105 wakeup(wake_address); 1106 1107 for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) { 1108 1109 TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) { 1110 1111 TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist); 1112 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; 1113 1114 wakeup(&ut->uu_on_throttlelist); 1115 } 1116 } 1117 if (info->throttle_timer_active == 0 && info->throttle_timer_ref) { 1118 info->throttle_timer_ref = 0; 1119 need_release = TRUE; 1120 } 1121 lck_mtx_unlock(&info->throttle_lock); 1122 1123 if (need_release == TRUE) 1124 throttle_info_rel(info); 1125} 1126 1127 1128static int 1129throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail) 1130{ 1131 boolean_t start_timer = FALSE; 1132 int level = THROTTLE_LEVEL_START; 1133 1134 if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) { 1135 info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel]; 1136 start_timer = TRUE; 1137 } 1138 1139 if (insert_tail == TRUE) 1140 TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist); 1141 else 1142 TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist); 1143 1144 ut->uu_on_throttlelist = mylevel; 1145 1146 if (start_timer == TRUE) { 1147 /* we may need to start or rearm the timer */ 1148 level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START); 1149 1150 if (level == THROTTLE_LEVEL_END) { 1151 if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) { 1152 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist); 1153 1154 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; 1155 } 1156 } 1157 } 1158 return (level); 1159} 1160 1161static void 1162throttle_init_throttle_window(void) 1163{ 1164 int throttle_window_size; 1165 1166 /* 1167 * The hierarchy of throttle window values is as follows: 1168 * - Global defaults 1169 * - Device tree properties 1170 * - Boot-args 1171 * All values are specified in msecs. 1172 */ 1173 1174 /* Override global values with device-tree properties */ 1175 if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) 1176 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size; 1177 1178 if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) 1179 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size; 1180 1181 if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) 1182 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size; 1183 1184 /* Override with boot-args */ 1185 if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) 1186 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size; 1187 1188 if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) 1189 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size; 1190 1191 if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) 1192 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size; 1193} 1194 1195static void 1196throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd) 1197{ 1198 int throttle_period_size; 1199 1200 /* 1201 * The hierarchy of throttle period values is as follows: 1202 * - Global defaults 1203 * - Device tree properties 1204 * - Boot-args 1205 * All values are specified in msecs. 1206 */ 1207 1208 /* Assign global defaults */ 1209 if (isssd == TRUE) 1210 info->throttle_io_periods = &throttle_io_period_ssd_msecs[0]; 1211 else 1212 info->throttle_io_periods = &throttle_io_period_msecs[0]; 1213 1214 /* Override global values with device-tree properties */ 1215 if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) 1216 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size; 1217 1218 if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) 1219 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size; 1220 1221 if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) 1222 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size; 1223 1224 /* Override with boot-args */ 1225 if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) 1226 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size; 1227 1228 if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) 1229 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size; 1230 1231 if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) 1232 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size; 1233 1234} 1235 1236void 1237throttle_init(void) 1238{ 1239 struct _throttle_io_info_t *info; 1240 int i; 1241 int level; 1242 1243 /* 1244 * allocate lock group attribute and group 1245 */ 1246 throttle_mtx_grp_attr = lck_grp_attr_alloc_init(); 1247 throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr); 1248 1249 /* Update throttle parameters based on device tree configuration */ 1250 throttle_init_throttle_window(); 1251 1252 /* 1253 * allocate the lock attribute 1254 */ 1255 throttle_mtx_attr = lck_attr_alloc_init(); 1256 1257 for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) { 1258 info = &_throttle_io_info[i]; 1259 1260 lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr); 1261 info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); 1262 1263 for (level = 0; level <= THROTTLE_LEVEL_END; level++) { 1264 TAILQ_INIT(&info->throttle_uthlist[level]); 1265 info->throttle_last_IO_pid[level] = 0; 1266 } 1267 info->throttle_next_wake_level = THROTTLE_LEVEL_END; 1268 } 1269} 1270 1271void 1272sys_override_io_throttle(int flag) 1273{ 1274 if (flag == THROTTLE_IO_ENABLE) 1275 lowpri_throttle_enabled = 1; 1276 if (flag == THROTTLE_IO_DISABLE) 1277 lowpri_throttle_enabled = 0; 1278} 1279 1280int rethrottle_removed_from_list = 0; 1281int rethrottle_moved_to_new_list = 0; 1282 1283/* 1284 * move a throttled thread to the appropriate state based 1285 * on it's new throttle level... throttle_add_to_list will 1286 * reset the timer deadline if necessary... it may also 1287 * leave the thread off of the queue if we're already outside 1288 * the throttle window for the new level 1289 * takes a valid uthread (which may or may not be on the 1290 * throttle queue) as input 1291 * 1292 * NOTE: This is called with the task lock held. 1293 */ 1294 1295void 1296rethrottle_thread(uthread_t ut) 1297{ 1298 struct _throttle_io_info_t *info; 1299 int my_new_level; 1300 1301 if ((info = ut->uu_throttle_info) == NULL) 1302 return; 1303 1304 lck_mtx_lock(&info->throttle_lock); 1305 1306 if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) { 1307 1308 my_new_level = throttle_get_thread_throttle_level(ut); 1309 1310 if (my_new_level != ut->uu_on_throttlelist) { 1311 1312 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist); 1313 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; 1314 1315 if (my_new_level >= THROTTLE_LEVEL_THROTTLED) { 1316 throttle_add_to_list(info, ut, my_new_level, TRUE); 1317 rethrottle_moved_to_new_list++; 1318 } 1319 1320 /* Thread no longer in window, need to wake it up */ 1321 if (ut->uu_on_throttlelist == THROTTLE_LEVEL_NONE) { 1322 wakeup(&ut->uu_on_throttlelist); 1323 rethrottle_removed_from_list++; 1324 } 1325 } 1326 } 1327 1328 lck_mtx_unlock(&info->throttle_lock); 1329} 1330 1331 1332/* 1333 * KPI routine 1334 * 1335 * Create and take a reference on a throttle info structure and return a 1336 * pointer for the file system to use when calling throttle_info_update. 1337 * Calling file system must have a matching release for every create. 1338 */ 1339void * 1340throttle_info_create(void) 1341{ 1342 struct _throttle_io_info_t *info; 1343 int level; 1344 1345 MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK); 1346 /* Should never happen but just in case */ 1347 if (info == NULL) 1348 return NULL; 1349 /* Mark that this one was allocated and needs to be freed */ 1350 DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info ); 1351 info->throttle_alloc = TRUE; 1352 1353 lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr); 1354 info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); 1355 1356 for (level = 0; level <= THROTTLE_LEVEL_END; level++) { 1357 TAILQ_INIT(&info->throttle_uthlist[level]); 1358 } 1359 info->throttle_next_wake_level = THROTTLE_LEVEL_END; 1360 1361 /* Take a reference */ 1362 OSIncrementAtomic(&info->throttle_refcnt); 1363 return info; 1364} 1365 1366/* 1367 * KPI routine 1368 * 1369 * Release the throttle info pointer if all the reference are gone. Should be 1370 * called to release reference taken by throttle_info_create 1371 */ 1372void 1373throttle_info_release(void *throttle_info) 1374{ 1375 DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n", 1376 (struct _throttle_io_info_t *)throttle_info, 1377 (struct _throttle_io_info_t *)throttle_info); 1378 if (throttle_info) /* Just to be careful */ 1379 throttle_info_rel(throttle_info); 1380} 1381 1382/* 1383 * KPI routine 1384 * 1385 * File Systems that create an info structure, need to call this routine in 1386 * their mount routine (used by cluster code). File Systems that call this in 1387 * their mount routines must call throttle_info_mount_rel in their unmount 1388 * routines. 1389 */ 1390void 1391throttle_info_mount_ref(mount_t mp, void *throttle_info) 1392{ 1393 if ((throttle_info == NULL) || (mp == NULL)) 1394 return; 1395 throttle_info_ref(throttle_info); 1396 1397 /* 1398 * We already have a reference release it before adding the new one 1399 */ 1400 if (mp->mnt_throttle_info) 1401 throttle_info_rel(mp->mnt_throttle_info); 1402 mp->mnt_throttle_info = throttle_info; 1403} 1404 1405/* 1406 * Private KPI routine 1407 * 1408 * return a handle for accessing throttle_info given a throttle_mask. The 1409 * handle must be released by throttle_info_rel_by_mask 1410 */ 1411int 1412throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle) 1413{ 1414 int dev_index; 1415 struct _throttle_io_info_t *info; 1416 1417 if (throttle_info_handle == NULL) 1418 return EINVAL; 1419 1420 dev_index = num_trailing_0(throttle_mask); 1421 info = &_throttle_io_info[dev_index]; 1422 throttle_info_ref(info); 1423 *(struct _throttle_io_info_t**)throttle_info_handle = info; 1424 1425 return 0; 1426} 1427 1428/* 1429 * Private KPI routine 1430 * 1431 * release the handle obtained by throttle_info_ref_by_mask 1432 */ 1433void 1434throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle) 1435{ 1436 /* 1437 * for now the handle is just a pointer to _throttle_io_info_t 1438 */ 1439 throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle); 1440} 1441 1442/* 1443 * KPI routine 1444 * 1445 * File Systems that throttle_info_mount_ref, must call this routine in their 1446 * umount routine. 1447 */ 1448void 1449throttle_info_mount_rel(mount_t mp) 1450{ 1451 if (mp->mnt_throttle_info) 1452 throttle_info_rel(mp->mnt_throttle_info); 1453 mp->mnt_throttle_info = NULL; 1454} 1455 1456void 1457throttle_info_get_last_io_time(mount_t mp, struct timeval *tv) 1458{ 1459 struct _throttle_io_info_t *info; 1460 1461 if (mp == NULL) 1462 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; 1463 else if (mp->mnt_throttle_info == NULL) 1464 info = &_throttle_io_info[mp->mnt_devbsdunit]; 1465 else 1466 info = mp->mnt_throttle_info; 1467 1468 *tv = info->throttle_last_write_timestamp; 1469} 1470 1471void 1472update_last_io_time(mount_t mp) 1473{ 1474 struct _throttle_io_info_t *info; 1475 1476 if (mp == NULL) 1477 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; 1478 else if (mp->mnt_throttle_info == NULL) 1479 info = &_throttle_io_info[mp->mnt_devbsdunit]; 1480 else 1481 info = mp->mnt_throttle_info; 1482 1483 microuptime(&info->throttle_last_write_timestamp); 1484 if (mp != NULL) 1485 mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp; 1486} 1487 1488 1489int 1490throttle_get_io_policy(uthread_t *ut) 1491{ 1492 if (ut != NULL) 1493 *ut = get_bsdthread_info(current_thread()); 1494 1495 return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO)); 1496} 1497 1498int 1499throttle_get_passive_io_policy(uthread_t *ut) 1500{ 1501 if (ut != NULL) 1502 *ut = get_bsdthread_info(current_thread()); 1503 1504 return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO)); 1505} 1506 1507 1508static int 1509throttle_get_thread_throttle_level(uthread_t ut) 1510{ 1511 int thread_throttle_level; 1512 1513 if (ut == NULL) 1514 ut = get_bsdthread_info(current_thread()); 1515 1516 thread_throttle_level = proc_get_effective_thread_policy(ut->uu_thread, TASK_POLICY_IO); 1517 1518 /* Bootcache misses should always be throttled */ 1519 if (ut->uu_throttle_bc == TRUE) 1520 thread_throttle_level = THROTTLE_LEVEL_TIER3; 1521 1522 return (thread_throttle_level); 1523} 1524 1525 1526static int 1527throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level) 1528{ 1529 struct _throttle_io_info_t *info = throttle_info; 1530 struct timeval elapsed; 1531 uint64_t elapsed_msecs; 1532 int thread_throttle_level; 1533 int throttle_level; 1534 1535 if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED) 1536 return (THROTTLE_DISENGAGED); 1537 1538 for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) { 1539 1540 microuptime(&elapsed); 1541 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]); 1542 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000); 1543 1544 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) 1545 break; 1546 } 1547 if (throttle_level >= thread_throttle_level) { 1548 /* 1549 * we're beyond all of the throttle windows 1550 * that affect the throttle level of this thread, 1551 * so go ahead and treat as normal I/O 1552 */ 1553 return (THROTTLE_DISENGAGED); 1554 } 1555 if (mylevel) 1556 *mylevel = thread_throttle_level; 1557 if (throttling_level) 1558 *throttling_level = throttle_level; 1559 1560 if (info->throttle_io_count != info->throttle_io_count_begin) { 1561 /* 1562 * we've already issued at least one throttleable I/O 1563 * in the current I/O window, so avoid issuing another one 1564 */ 1565 return (THROTTLE_NOW); 1566 } 1567 /* 1568 * we're in the throttle window, so 1569 * cut the I/O size back 1570 */ 1571 return (THROTTLE_ENGAGED); 1572} 1573 1574/* 1575 * If we have a mount point and it has a throttle info pointer then 1576 * use it to do the check, otherwise use the device unit number to find 1577 * the correct throttle info array element. 1578 */ 1579int 1580throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp) 1581{ 1582 void *info; 1583 1584 /* 1585 * Should we just return zero if no mount point 1586 */ 1587 if (mp == NULL) 1588 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; 1589 else if (mp->mnt_throttle_info == NULL) 1590 info = &_throttle_io_info[mp->mnt_devbsdunit]; 1591 else 1592 info = mp->mnt_throttle_info; 1593 1594 return throttle_io_will_be_throttled_internal(info, NULL, NULL); 1595} 1596 1597/* 1598 * Routine to increment I/O throttling counters maintained in the proc 1599 */ 1600 1601static void 1602throttle_update_proc_stats(pid_t throttling_pid) 1603{ 1604 proc_t throttling_proc; 1605 proc_t throttled_proc = current_proc(); 1606 1607 /* The throttled_proc is always the current proc; so we are not concerned with refs */ 1608 OSAddAtomic64(1, &(throttled_proc->was_throttled)); 1609 1610 /* The throttling pid might have exited by now */ 1611 throttling_proc = proc_find(throttling_pid); 1612 if (throttling_proc != PROC_NULL) { 1613 OSAddAtomic64(1, &(throttling_proc->did_throttle)); 1614 proc_rele(throttling_proc); 1615 } 1616} 1617 1618/* 1619 * Block until woken up by the throttle timer or by a rethrottle call. 1620 * As long as we hold the throttle_lock while querying the throttle tier, we're 1621 * safe against seeing an old throttle tier after a rethrottle. 1622 */ 1623uint32_t 1624throttle_lowpri_io(int sleep_amount) 1625{ 1626 uthread_t ut; 1627 struct _throttle_io_info_t *info; 1628 int throttle_type = 0; 1629 int mylevel = 0; 1630 int throttling_level = THROTTLE_LEVEL_NONE; 1631 int sleep_cnt = 0; 1632 uint32_t throttle_io_period_num = 0; 1633 boolean_t insert_tail = TRUE; 1634 1635 ut = get_bsdthread_info(current_thread()); 1636 1637 if (ut->uu_lowpri_window == 0) 1638 return (0); 1639 1640 info = ut->uu_throttle_info; 1641 1642 if (info == NULL) { 1643 ut->uu_throttle_bc = FALSE; 1644 ut->uu_lowpri_window = 0; 1645 return (0); 1646 } 1647 1648 lck_mtx_lock(&info->throttle_lock); 1649 1650 if (sleep_amount == 0) 1651 goto done; 1652 1653 if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE) 1654 sleep_amount = 0; 1655 1656 throttle_io_period_num = info->throttle_io_period_num; 1657 1658 while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) { 1659 1660 if (throttle_type == THROTTLE_ENGAGED) { 1661 if (sleep_amount == 0) 1662 break; 1663 if (info->throttle_io_period_num < throttle_io_period_num) 1664 break; 1665 if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) 1666 break; 1667 } 1668 if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) { 1669 if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END) 1670 goto done; 1671 } 1672 assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END); 1673 throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level]); 1674 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE, 1675 info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0); 1676 1677 1678 if (sleep_cnt == 0) { 1679 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, 1680 throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0); 1681 throttled_count[mylevel]++; 1682 } 1683 msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL); 1684 1685 sleep_cnt++; 1686 1687 if (sleep_amount == 0) 1688 insert_tail = FALSE; 1689 else if (info->throttle_io_period_num < throttle_io_period_num || 1690 (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) { 1691 insert_tail = FALSE; 1692 sleep_amount = 0; 1693 } 1694 } 1695done: 1696 if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) { 1697 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist); 1698 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; 1699 } 1700 1701 lck_mtx_unlock(&info->throttle_lock); 1702 1703 if (sleep_cnt) { 1704 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, 1705 throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0); 1706 } 1707 1708 throttle_info_rel(info); 1709 1710 ut->uu_throttle_info = NULL; 1711 ut->uu_throttle_bc = FALSE; 1712 ut->uu_lowpri_window = 0; 1713 1714 return (sleep_cnt); 1715} 1716 1717/* 1718 * KPI routine 1719 * 1720 * set a kernel thread's IO policy. policy can be: 1721 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD 1722 * 1723 * explanations about these policies are in the man page of setiopolicy_np 1724 */ 1725void throttle_set_thread_io_policy(int policy) 1726{ 1727 proc_set_task_policy(current_task(), current_thread(), 1728 TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, 1729 policy); 1730} 1731 1732 1733static 1734void throttle_info_reset_window(uthread_t ut) 1735{ 1736 struct _throttle_io_info_t *info; 1737 1738 if ( (info = ut->uu_throttle_info) ) { 1739 throttle_info_rel(info); 1740 1741 ut->uu_throttle_info = NULL; 1742 ut->uu_lowpri_window = 0; 1743 ut->uu_throttle_bc = FALSE; 1744 } 1745} 1746 1747static 1748void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd) 1749{ 1750 if (lowpri_throttle_enabled == 0) 1751 return; 1752 1753 if (info->throttle_io_periods == 0) { 1754 throttle_init_throttle_period(info, isssd); 1755 } 1756 if (ut->uu_throttle_info == NULL) { 1757 1758 ut->uu_throttle_info = info; 1759 throttle_info_ref(info); 1760 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info ); 1761 1762 ut->uu_lowpri_window = 1; 1763 ut->uu_throttle_bc = BC_throttle; 1764 } 1765} 1766 1767 1768static 1769void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd) 1770{ 1771 int thread_throttle_level; 1772 1773 if (lowpri_throttle_enabled == 0) 1774 return; 1775 1776 if (ut == NULL) 1777 ut = get_bsdthread_info(current_thread()); 1778 1779 thread_throttle_level = throttle_get_thread_throttle_level(ut); 1780 1781 if (thread_throttle_level != THROTTLE_LEVEL_NONE) { 1782 if(!ISSET(flags, B_PASSIVE)) { 1783 microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]); 1784 info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid(); 1785 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE, 1786 current_proc()->p_pid, thread_throttle_level, 0, 0, 0); 1787 } 1788 microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]); 1789 } 1790 1791 1792 if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) { 1793 /* 1794 * I'd really like to do the IOSleep here, but 1795 * we may be holding all kinds of filesystem related locks 1796 * and the pages for this I/O marked 'busy'... 1797 * we don't want to cause a normal task to block on 1798 * one of these locks while we're throttling a task marked 1799 * for low priority I/O... we'll mark the uthread and 1800 * do the delay just before we return from the system 1801 * call that triggered this I/O or from vnode_pagein 1802 */ 1803 OSAddAtomic(1, &info->throttle_io_count); 1804 1805 throttle_info_set_initial_window(ut, info, FALSE, isssd); 1806 } 1807} 1808 1809void *throttle_info_update_by_mount(mount_t mp) 1810{ 1811 struct _throttle_io_info_t *info; 1812 uthread_t ut; 1813 boolean_t isssd = FALSE; 1814 1815 ut = get_bsdthread_info(current_thread()); 1816 1817 if (mp != NULL) { 1818 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) 1819 isssd = TRUE; 1820 info = &_throttle_io_info[mp->mnt_devbsdunit]; 1821 } else 1822 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; 1823 1824 if (!ut->uu_lowpri_window) 1825 throttle_info_set_initial_window(ut, info, FALSE, isssd); 1826 1827 return info; 1828} 1829 1830 1831/* 1832 * KPI routine 1833 * 1834 * this is usually called before every I/O, used for throttled I/O 1835 * book keeping. This routine has low overhead and does not sleep 1836 */ 1837void throttle_info_update(void *throttle_info, int flags) 1838{ 1839 if (throttle_info) 1840 throttle_info_update_internal(throttle_info, NULL, flags, FALSE); 1841} 1842 1843/* 1844 * KPI routine 1845 * 1846 * this is usually called before every I/O, used for throttled I/O 1847 * book keeping. This routine has low overhead and does not sleep 1848 */ 1849void throttle_info_update_by_mask(void *throttle_info_handle, int flags) 1850{ 1851 void *throttle_info = throttle_info_handle; 1852 1853 /* 1854 * for now we only use the lowest bit of the throttle mask, so the 1855 * handle is the same as the throttle_info. Later if we store a 1856 * set of throttle infos in the handle, we will want to loop through 1857 * them and call throttle_info_update in a loop 1858 */ 1859 throttle_info_update(throttle_info, flags); 1860} 1861 1862/* 1863 * KPI routine (private) 1864 * Called to determine if this IO is being throttled to this level so that it can be treated specially 1865 */ 1866int throttle_info_io_will_be_throttled(void * throttle_info, int policy) 1867{ 1868 struct _throttle_io_info_t *info = throttle_info; 1869 struct timeval elapsed; 1870 uint64_t elapsed_msecs; 1871 int throttle_level; 1872 int thread_throttle_level; 1873 1874 switch (policy) { 1875 1876 case IOPOL_THROTTLE: 1877 thread_throttle_level = THROTTLE_LEVEL_TIER3; 1878 break; 1879 case IOPOL_UTILITY: 1880 thread_throttle_level = THROTTLE_LEVEL_TIER2; 1881 break; 1882 case IOPOL_STANDARD: 1883 thread_throttle_level = THROTTLE_LEVEL_TIER1; 1884 break; 1885 default: 1886 thread_throttle_level = THROTTLE_LEVEL_TIER0; 1887 break; 1888 } 1889 for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) { 1890 1891 microuptime(&elapsed); 1892 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]); 1893 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000); 1894 1895 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) 1896 break; 1897 } 1898 if (throttle_level >= thread_throttle_level) { 1899 /* 1900 * we're beyond all of the throttle windows 1901 * so go ahead and treat as normal I/O 1902 */ 1903 return (THROTTLE_DISENGAGED); 1904 } 1905 /* 1906 * we're in the throttle window 1907 */ 1908 return (THROTTLE_ENGAGED); 1909} 1910 1911int 1912spec_strategy(struct vnop_strategy_args *ap) 1913{ 1914 buf_t bp; 1915 int bflags; 1916 int io_tier; 1917 int passive; 1918 dev_t bdev; 1919 uthread_t ut; 1920 mount_t mp; 1921 struct bufattr *bap; 1922 int strategy_ret; 1923 struct _throttle_io_info_t *throttle_info; 1924 boolean_t isssd = FALSE; 1925 proc_t curproc = current_proc(); 1926 1927 bp = ap->a_bp; 1928 bdev = buf_device(bp); 1929 mp = buf_vnode(bp)->v_mount; 1930 bap = &bp->b_attr; 1931 1932 io_tier = throttle_get_io_policy(&ut); 1933 passive = throttle_get_passive_io_policy(&ut); 1934 1935 if (bp->b_flags & B_META) 1936 bap->ba_flags |= BA_META; 1937 1938 SET_BUFATTR_IO_TIER(bap, io_tier); 1939 1940 if (passive) 1941 bp->b_flags |= B_PASSIVE; 1942 1943 if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)) 1944 bap->ba_flags |= BA_DELAYIDLESLEEP; 1945 1946 bflags = bp->b_flags; 1947 1948 if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0)) 1949 bufattr_markquickcomplete(bap); 1950 1951 if (kdebug_enable) { 1952 int code = 0; 1953 1954 if (bflags & B_READ) 1955 code |= DKIO_READ; 1956 if (bflags & B_ASYNC) 1957 code |= DKIO_ASYNC; 1958 1959 if (bflags & B_META) 1960 code |= DKIO_META; 1961 else if (bflags & B_PAGEIO) 1962 code |= DKIO_PAGING; 1963 1964 if (io_tier != 0) 1965 code |= DKIO_THROTTLE; 1966 1967 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK); 1968 1969 if (bflags & B_PASSIVE) 1970 code |= DKIO_PASSIVE; 1971 1972 if (bap->ba_flags & BA_NOCACHE) 1973 code |= DKIO_NOCACHE; 1974 1975 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, 1976 buf_kernel_addrperm_addr(bp), bdev, (int)buf_blkno(bp), buf_count(bp), 0); 1977 } 1978 if (mp != NULL) { 1979 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) 1980 isssd = TRUE; 1981 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit]; 1982 } else 1983 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; 1984 1985 throttle_info_update_internal(throttle_info, ut, bflags, isssd); 1986 1987 if ((bflags & B_READ) == 0) { 1988 microuptime(&throttle_info->throttle_last_write_timestamp); 1989 1990 if (mp) { 1991 mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp; 1992 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size); 1993 } 1994 } else if (mp) { 1995 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size); 1996 } 1997 /* 1998 * The BootCache may give us special information about 1999 * the IO, so it returns special values that we check 2000 * for here. 2001 * 2002 * IO_SATISFIED_BY_CACHE 2003 * The read has been satisfied by the boot cache. Don't 2004 * throttle the thread unnecessarily. 2005 * 2006 * IO_SHOULD_BE_THROTTLED 2007 * The boot cache is playing back a playlist and this IO 2008 * cut through. Throttle it so we're not cutting through 2009 * the boot cache too often. 2010 * 2011 * Note that typical strategy routines are defined with 2012 * a void return so we'll get garbage here. In the 2013 * unlikely case the garbage matches our special return 2014 * value, it's not a big deal since we're only adjusting 2015 * the throttling delay. 2016 */ 2017#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed) 2018#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef) 2019 typedef int strategy_fcn_ret_t(struct buf *bp); 2020 2021 strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp); 2022 2023 if (IO_SATISFIED_BY_CACHE == strategy_ret) { 2024 /* 2025 * If this was a throttled IO satisfied by the boot cache, 2026 * don't delay the thread. 2027 */ 2028 throttle_info_reset_window(ut); 2029 2030 } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) { 2031 /* 2032 * If the boot cache indicates this IO should be throttled, 2033 * delay the thread. 2034 */ 2035 throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd); 2036 } 2037 return (0); 2038} 2039 2040 2041/* 2042 * This is a noop, simply returning what one has been given. 2043 */ 2044int 2045spec_blockmap(__unused struct vnop_blockmap_args *ap) 2046{ 2047 return (ENOTSUP); 2048} 2049 2050 2051/* 2052 * Device close routine 2053 */ 2054int 2055spec_close(struct vnop_close_args *ap) 2056{ 2057 struct vnode *vp = ap->a_vp; 2058 dev_t dev = vp->v_rdev; 2059 int error = 0; 2060 int flags = ap->a_fflag; 2061 struct proc *p = vfs_context_proc(ap->a_context); 2062 struct session *sessp; 2063 int do_rele = 0; 2064 2065 switch (vp->v_type) { 2066 2067 case VCHR: 2068 /* 2069 * Hack: a tty device that is a controlling terminal 2070 * has a reference from the session structure. 2071 * We cannot easily tell that a character device is 2072 * a controlling terminal, unless it is the closing 2073 * process' controlling terminal. In that case, 2074 * if the reference count is 1 (this is the very 2075 * last close) 2076 */ 2077 sessp = proc_session(p); 2078 devsw_lock(dev, S_IFCHR); 2079 if (sessp != SESSION_NULL) { 2080 if (vp == sessp->s_ttyvp && vcount(vp) == 1) { 2081 struct tty *tp; 2082 2083 devsw_unlock(dev, S_IFCHR); 2084 session_lock(sessp); 2085 if (vp == sessp->s_ttyvp) { 2086 tp = SESSION_TP(sessp); 2087 sessp->s_ttyvp = NULL; 2088 sessp->s_ttyvid = 0; 2089 sessp->s_ttyp = TTY_NULL; 2090 sessp->s_ttypgrpid = NO_PID; 2091 do_rele = 1; 2092 } 2093 session_unlock(sessp); 2094 2095 if (do_rele) { 2096 vnode_rele(vp); 2097 if (NULL != tp) 2098 ttyfree(tp); 2099 } 2100 devsw_lock(dev, S_IFCHR); 2101 } 2102 session_rele(sessp); 2103 } 2104 2105 if (--vp->v_specinfo->si_opencount < 0) 2106 panic("negative open count (c, %u, %u)", major(dev), minor(dev)); 2107 2108 /* 2109 * close on last reference or on vnode revoke call 2110 */ 2111 if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0) 2112 error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p); 2113 2114 devsw_unlock(dev, S_IFCHR); 2115 break; 2116 2117 case VBLK: 2118 /* 2119 * If there is more than one outstanding open, don't 2120 * send the close to the device. 2121 */ 2122 devsw_lock(dev, S_IFBLK); 2123 if (vcount(vp) > 1) { 2124 vp->v_specinfo->si_opencount--; 2125 devsw_unlock(dev, S_IFBLK); 2126 return (0); 2127 } 2128 devsw_unlock(dev, S_IFBLK); 2129 2130 /* 2131 * On last close of a block device (that isn't mounted) 2132 * we must invalidate any in core blocks, so that 2133 * we can, for instance, change floppy disks. 2134 */ 2135 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) 2136 return (error); 2137 2138 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); 2139 if (error) 2140 return (error); 2141 2142 devsw_lock(dev, S_IFBLK); 2143 2144 if (--vp->v_specinfo->si_opencount < 0) 2145 panic("negative open count (b, %u, %u)", major(dev), minor(dev)); 2146 2147 if (vcount(vp) == 0) 2148 error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p); 2149 2150 devsw_unlock(dev, S_IFBLK); 2151 break; 2152 2153 default: 2154 panic("spec_close: not special"); 2155 return(EBADF); 2156 } 2157 2158 return error; 2159} 2160 2161/* 2162 * Return POSIX pathconf information applicable to special devices. 2163 */ 2164int 2165spec_pathconf(struct vnop_pathconf_args *ap) 2166{ 2167 2168 switch (ap->a_name) { 2169 case _PC_LINK_MAX: 2170 *ap->a_retval = LINK_MAX; 2171 return (0); 2172 case _PC_MAX_CANON: 2173 *ap->a_retval = MAX_CANON; 2174 return (0); 2175 case _PC_MAX_INPUT: 2176 *ap->a_retval = MAX_INPUT; 2177 return (0); 2178 case _PC_PIPE_BUF: 2179 *ap->a_retval = PIPE_BUF; 2180 return (0); 2181 case _PC_CHOWN_RESTRICTED: 2182 *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */ 2183 return (0); 2184 case _PC_VDISABLE: 2185 *ap->a_retval = _POSIX_VDISABLE; 2186 return (0); 2187 default: 2188 return (EINVAL); 2189 } 2190 /* NOTREACHED */ 2191} 2192 2193/* 2194 * Special device failed operation 2195 */ 2196int 2197spec_ebadf(__unused void *dummy) 2198{ 2199 2200 return (EBADF); 2201} 2202 2203/* Blktooff derives file offset from logical block number */ 2204int 2205spec_blktooff(struct vnop_blktooff_args *ap) 2206{ 2207 struct vnode *vp = ap->a_vp; 2208 2209 switch (vp->v_type) { 2210 case VCHR: 2211 *ap->a_offset = (off_t)-1; /* failure */ 2212 return (ENOTSUP); 2213 2214 case VBLK: 2215 printf("spec_blktooff: not implemented for VBLK\n"); 2216 *ap->a_offset = (off_t)-1; /* failure */ 2217 return (ENOTSUP); 2218 2219 default: 2220 panic("spec_blktooff type"); 2221 } 2222 /* NOTREACHED */ 2223 2224 return (0); 2225} 2226 2227/* Offtoblk derives logical block number from file offset */ 2228int 2229spec_offtoblk(struct vnop_offtoblk_args *ap) 2230{ 2231 struct vnode *vp = ap->a_vp; 2232 2233 switch (vp->v_type) { 2234 case VCHR: 2235 *ap->a_lblkno = (daddr64_t)-1; /* failure */ 2236 return (ENOTSUP); 2237 2238 case VBLK: 2239 printf("spec_offtoblk: not implemented for VBLK\n"); 2240 *ap->a_lblkno = (daddr64_t)-1; /* failure */ 2241 return (ENOTSUP); 2242 2243 default: 2244 panic("spec_offtoblk type"); 2245 } 2246 /* NOTREACHED */ 2247 2248 return (0); 2249} 2250 2251static void filt_specdetach(struct knote *kn); 2252static int filt_spec(struct knote *kn, long hint); 2253static unsigned filt_specpeek(struct knote *kn); 2254 2255struct filterops spec_filtops = { 2256 .f_isfd = 1, 2257 .f_attach = filt_specattach, 2258 .f_detach = filt_specdetach, 2259 .f_event = filt_spec, 2260 .f_peek = filt_specpeek 2261}; 2262 2263static int 2264filter_to_seltype(int16_t filter) 2265{ 2266 switch (filter) { 2267 case EVFILT_READ: 2268 return FREAD; 2269 case EVFILT_WRITE: 2270 return FWRITE; 2271 break; 2272 default: 2273 panic("filt_to_seltype(): invalid filter %d\n", filter); 2274 return 0; 2275 } 2276} 2277 2278static int 2279filt_specattach(struct knote *kn) 2280{ 2281 vnode_t vp; 2282 dev_t dev; 2283 2284 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */ 2285 2286 assert(vnode_ischr(vp)); 2287 2288 dev = vnode_specrdev(vp); 2289 2290 if (major(dev) > nchrdev) { 2291 return ENXIO; 2292 } 2293 2294 if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) { 2295 return EINVAL; 2296 } 2297 2298 /* Resulting wql is safe to unlink even if it has never been linked */ 2299 kn->kn_hook = wait_queue_link_allocate(); 2300 if (kn->kn_hook == NULL) { 2301 return EAGAIN; 2302 } 2303 2304 kn->kn_fop = &spec_filtops; 2305 kn->kn_hookid = vnode_vid(vp); 2306 2307 knote_markstayqueued(kn); 2308 2309 return 0; 2310} 2311 2312static void 2313filt_specdetach(struct knote *kn) 2314{ 2315 kern_return_t ret; 2316 2317 /* 2318 * Given wait queue link and wait queue set, unlink. This is subtle. 2319 * If the device has been revoked from under us, selclearthread() will 2320 * have removed our link from the kqueue's wait queue set, which 2321 * wait_queue_set_unlink_one() will detect and handle. 2322 */ 2323 ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook); 2324 if (ret != KERN_SUCCESS) { 2325 panic("filt_specdetach(): failed to unlink wait queue link."); 2326 } 2327 2328 (void)wait_queue_link_free(kn->kn_hook); 2329 kn->kn_hook = NULL; 2330 kn->kn_status &= ~KN_STAYQUEUED; 2331} 2332 2333static int 2334filt_spec(struct knote *kn, long hint) 2335{ 2336 vnode_t vp; 2337 uthread_t uth; 2338 wait_queue_set_t old_wqs; 2339 vfs_context_t ctx; 2340 int selres; 2341 int error; 2342 int use_offset; 2343 dev_t dev; 2344 uint64_t flags; 2345 2346 assert(kn->kn_hook != NULL); 2347 2348 if (hint != 0) { 2349 panic("filt_spec(): nonzero hint?"); 2350 } 2351 2352 uth = get_bsdthread_info(current_thread()); 2353 ctx = vfs_context_current(); 2354 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; 2355 2356 error = vnode_getwithvid(vp, kn->kn_hookid); 2357 if (error != 0) { 2358 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 2359 return 1; 2360 } 2361 2362 dev = vnode_specrdev(vp); 2363 flags = cdevsw_flags[major(dev)]; 2364 use_offset = ((flags & CDEVSW_USE_OFFSET) != 0); 2365 assert((flags & CDEVSW_SELECT_KQUEUE) != 0); 2366 2367 /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */ 2368 old_wqs = uth->uu_wqset; 2369 uth->uu_wqset = kn->kn_kq->kq_wqs; 2370 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx); 2371 uth->uu_wqset = old_wqs; 2372 2373 if (use_offset) { 2374 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) { 2375 kn->kn_data = 0; 2376 } else { 2377 kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset; 2378 } 2379 } else { 2380 kn->kn_data = selres; 2381 } 2382 2383 vnode_put(vp); 2384 2385 return (kn->kn_data != 0); 2386} 2387 2388static unsigned 2389filt_specpeek(struct knote *kn) 2390{ 2391 vnode_t vp; 2392 uthread_t uth; 2393 wait_queue_set_t old_wqs; 2394 vfs_context_t ctx; 2395 int error, selres; 2396 2397 uth = get_bsdthread_info(current_thread()); 2398 ctx = vfs_context_current(); 2399 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; 2400 2401 error = vnode_getwithvid(vp, kn->kn_hookid); 2402 if (error != 0) { 2403 return 1; /* Just like VNOP_SELECT() on recycled vnode */ 2404 } 2405 2406 /* 2407 * Why pass the link here? Because we may not have registered in the past... 2408 */ 2409 old_wqs = uth->uu_wqset; 2410 uth->uu_wqset = kn->kn_kq->kq_wqs; 2411 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx); 2412 uth->uu_wqset = old_wqs; 2413 2414 vnode_put(vp); 2415 return selres; 2416} 2417 2418