/*
 * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/malloc.h>
#include <sys/disk.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <pexpert/pexpert.h>

#include <sys/kdebug.h>

/* XXX following three prototypes should be in a header file somewhere */
extern dev_t chrtoblk(dev_t dev);
extern boolean_t iskmemdev(dev_t dev);
extern int bpfkqfilter(dev_t dev, struct knote *kn);
extern int ptsd_kqfilter(dev_t dev, struct knote *kn);

extern int ignore_is_ssd;

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";

#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };


static void set_blocksize(vnode_t, dev_t);

#define LOWPRI_TIER1_WINDOW_MSECS	 25
#define LOWPRI_TIER2_WINDOW_MSECS	100
#define LOWPRI_TIER3_WINDOW_MSECS	500

#define LOWPRI_TIER1_IO_PERIOD_MSECS	 15
#define LOWPRI_TIER2_IO_PERIOD_MSECS	 50
#define LOWPRI_TIER3_IO_PERIOD_MSECS	200

#define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS	 5
#define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS	15
#define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS	25


int throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_WINDOW_MSECS,
	LOWPRI_TIER2_WINDOW_MSECS,
	LOWPRI_TIER3_WINDOW_MSECS,
};

int throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_MSECS,
};

int throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
};


int	throttled_count[THROTTLE_LEVEL_END + 1];

struct _throttle_io_info_t {
	lck_mtx_t	throttle_lock;

	struct timeval	throttle_last_write_timestamp;
	struct timeval	throttle_min_timer_deadline;
	struct timeval	throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1];
	struct timeval	throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
	pid_t		throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
	struct timeval	throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];

	TAILQ_HEAD( , uthread)	throttle_uthlist[THROTTLE_LEVEL_END + 1];	/* Lists of throttled uthreads */
	int	throttle_next_wake_level;

	thread_call_t	throttle_timer_call;
	int32_t		throttle_timer_ref;
	int32_t		throttle_timer_active;

	int32_t	throttle_io_count;
	int32_t	throttle_io_count_begin;
	int	*throttle_io_periods;
	uint32_t throttle_io_period_num;

	int32_t	throttle_refcnt;
	int32_t	throttle_alloc;
	int32_t	throttle_disabled;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];


int	lowpri_throttle_enabled = 1;


static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd);
static int throttle_get_thread_throttle_level(uthread_t ut);

/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{
	*ap->a_vpp = NULL;
	return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)		/* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	} else
		vp->v_specsize = DEV_BSIZE;
}

void
set_fsblocksize(struct vnode *vp)
{
	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		vnode_lock(vp);
		set_blocksize(vp, dev);
		vnode_unlock(vp);
	}
}
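
/*
 * Illustrative sketch (not part of the original source and not compiled):
 * how the per-tier tables above translate a throttle level into a window
 * and an I/O period.  The helper name is hypothetical and is shown only
 * for exposition.
 */
#if 0	/* example only */
static int
example_window_and_period(int level, boolean_t isssd, int *window_msecs, int *period_msecs)
{
	if (level < THROTTLE_LEVEL_THROTTLED || level > THROTTLE_LEVEL_END)
		return EINVAL;

	*window_msecs = throttle_windows_msecs[level];
	*period_msecs = isssd ? throttle_io_period_ssd_msecs[level] :
	    throttle_io_period_msecs[level];
	return 0;
}
#endif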
/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);

			/* Never allow writing to /dev/mem or /dev/kmem */
			if (iskmemdev(dev))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens for
			 * writing of character devices whose corresponding block
			 * devices are currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
			}
		}

		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}

		devsw_unlock(dev, S_IFCHR);

		if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
			int isssd = 0;
			uint64_t throttle_mask = 0;
			uint32_t devbsdunit = 0;

			if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {

				if (throttle_mask != 0 &&
				    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
					/*
					 * as a reasonable approximation, only use the lowest bit of the mask
					 * to generate a disk unit number
					 */
					devbsdunit = num_trailing_0(throttle_mask);

					vnode_lock(vp);

					vp->v_un.vu_specinfo->si_isssd = isssd;
					vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
					vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
					vp->v_un.vu_specinfo->si_throttleable = 1;
					vp->v_un.vu_specinfo->si_initted = 1;

					vnode_unlock(vp);
				}
			}
			if (vp->v_un.vu_specinfo->si_initted == 0) {
				vnode_lock(vp);
				vp->v_un.vu_specinfo->si_initted = 1;
				vnode_unlock(vp);
			}
		}
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ((error = vfs_mountedon(vp)))
			return (error);

		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (!error) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;

			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */

				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
					error = ENXIO;
			}

			vnode_lock(vp);
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
				vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */

			vnode_unlock(vp);
		}
		return (error);
	default:
		panic("spec_open type");
	}
	return (0);
}

/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
			struct _throttle_io_info_t *throttle_info;

			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
			throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd);
		}
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);

		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~(bscale - 1));

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
				    (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}
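
/*
 * Worked example for the VBLK arithmetic above (example values only, not
 * taken from the source): with devBlockSize = 512 and PAGE_SIZE = 4096,
 * bscale = 8 and bsize = 4096.  A read at uio_offset = 10240 gives
 * on = 10240 % 4096 = 2048 and bn = (10240 / 512) & ~7 = 20 & ~7 = 16,
 * i.e. the I/O is clustered on the 4KB-aligned buffer starting at sector 16,
 * and the caller's data begins 2048 bytes into that buffer.
 */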
/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
			struct _throttle_io_info_t *throttle_info;

			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

			throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd);

			microuptime(&throttle_info->throttle_last_write_timestamp);
		}
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);

		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;

		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are writing exactly a block on a block
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The write doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the write to what is there */
				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			if (io_sync)
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}
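
/*
 * Example of the decision above (illustrative numbers only): with
 * bsize = 4096, a 4096-byte write aligned on a 4KB boundary takes the
 * buf_getblk() path and never reads the media, while a 512-byte write at
 * offset 2048 first reads the surrounding 4KB block with buf_bread(),
 * patches 512 bytes in core, and then writes the block back
 * (read-modify-write).
 */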
/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;
	int retval = 0;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
	    (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

	switch (ap->a_vp->v_type) {

	case VCHR:
		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, p);
		break;

	case VBLK:
		if (kdebug_enable) {
			if (ap->a_command == DKIOCUNMAP) {
				dk_unmap_t	*unmap;
				dk_extent_t	*extent;
				uint32_t	i;

				unmap = (dk_unmap_t *)ap->a_data;
				extent = unmap->extents;

				for (i = 0; i < unmap->extentsCount; i++, extent++) {
					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0);
				}
			}
		}
		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
		break;

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
	    (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

	return (retval);
}

int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}

static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
	dev_t dev;
	int err = EINVAL;

	/*
	 * For a few special kinds of devices, we can attach knotes.
	 * Each filter function must check whether the dev type matches it.
	 */
	dev = vnode_specrdev(vp);

	if (vnode_istty(vp)) {
		/* We can hook into TTYs... */
		err = filt_specattach(kn);
	} else {
#if NETWORKING
		/* Try a bpf device, as defined in bsd/net/bpf.c */
		err = bpfkqfilter(dev, kn);
#endif
	}

	return err;
}

/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

	return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}


/*
 * Just call the device strategy routine
 */
void throttle_init(void);


#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)	\
	do {							\
		if ((debug_info)->alloc)			\
			printf("%s: "format, __FUNCTION__, ## args);	\
	} while (0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");


static lck_grp_t	*throttle_mtx_grp;
static lck_attr_t	*throttle_mtx_attr;
static lck_grp_attr_t	*throttle_mtx_grp_attr;


/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
int
num_trailing_0(uint64_t n)
{
	/*
	 * since in most cases the number of trailing 0s is very small,
	 * we simply count sequentially from the lowest bit
	 */
	if (n == 0)
		return sizeof(n) * 8;
	int count = 0;
	while (!ISSET(n, 1)) {
		n >>= 1;
		++count;
	}
	return count;
}


/*
 * Release the reference and, if the item was allocated and this is the last
 * reference, free it.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_rel(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
	    info, (int)(oldValue - 1), info);

	/* The reference count just went negative, very bad */
	if (oldValue == 0)
		panic("throttle info ref cnt went negative!");

	/*
	 * Once the reference count is zero, no one else should be able to take a
	 * reference
	 */
	if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);

		lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp);
		FREE(info, M_TEMP);
	}
	return oldValue;
}
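
/*
 * Worked example for num_trailing_0() above (illustrative values only):
 * a DKIOCGETTHROTTLEMASK value of 0x8 has its lowest set bit at position 3,
 * so num_trailing_0(0x8) == 3 and the vnode is charged against
 * _throttle_io_info[3].  A mask of 0 never reaches the lookup: spec_open
 * checks for it explicitly, since num_trailing_0(0) returns 64.
 */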
/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
	    info, (int)(oldValue - 1), info);
	/* Allocated items should never have a reference of zero */
	if (info->throttle_alloc && (oldValue == 0))
		panic("Taking a reference without calling create throttle info!\n");

	return oldValue;
}

/*
 * on entry the throttle_lock is held...
 * this function is responsible for taking
 * and dropping the reference on the info
 * structure which will keep it from going
 * away while the timer is running if it
 * happens to have been dynamically allocated by
 * a network filesystem kext which is now trying
 * to free it
 */
static uint32_t
throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
{
	struct timeval	elapsed;
	struct timeval	now;
	struct timeval	period;
	uint64_t	elapsed_msecs;
	int		throttle_level;
	int		level;
	int		msecs;
	boolean_t	throttled = FALSE;
	boolean_t	need_timer = FALSE;

	microuptime(&now);

	if (update_io_count == TRUE) {
		info->throttle_io_count_begin = info->throttle_io_count;
		info->throttle_io_period_num++;

		while (wakelevel >= THROTTLE_LEVEL_THROTTLED)
			info->throttle_start_IO_period_timestamp[wakelevel--] = now;

		info->throttle_min_timer_deadline = now;

		msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
		period.tv_sec = msecs / 1000;
		period.tv_usec = (msecs % 1000) * 1000;

		timevaladd(&info->throttle_min_timer_deadline, &period);
	}
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {

		elapsed = now;
		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {

			if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {

				if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level]) {
					/*
					 * we had an I/O occur at a higher priority tier within
					 * this tier's throttle window
					 */
					throttled = TRUE;
				}
				/*
				 * we assume that the windows are the same or longer
				 * as we drop through the throttling tiers... thus
				 * we can stop looking once we run into a tier with
				 * threads to schedule regardless of whether it's
				 * still in its throttling window or not
				 */
				break;
			}
		}
		if (throttled == TRUE)
			break;
	}
	if (throttled == TRUE) {
		uint64_t	deadline = 0;
		struct timeval	target;
		struct timeval	min_target;

		/*
		 * we've got at least one tier still in a throttled window
		 * so we need a timer running... compute the next deadline
		 * and schedule it
		 */
		for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {

			if (TAILQ_EMPTY(&info->throttle_uthlist[level]))
				continue;

			target = info->throttle_start_IO_period_timestamp[level];

			msecs = info->throttle_io_periods[level];
			period.tv_sec = msecs / 1000;
			period.tv_usec = (msecs % 1000) * 1000;

			timevaladd(&target, &period);

			if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
				min_target = target;
				need_timer = TRUE;
			}
		}
		if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
			if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >))
				min_target = info->throttle_min_timer_deadline;
		}

		if (info->throttle_timer_active) {
			if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
				/*
				 * couldn't kill the timer because it's already
				 * been dispatched, so don't try to start a new
				 * one... once we drop the lock, the timer will
				 * proceed and eventually re-run this function
				 */
				need_timer = FALSE;
			} else
				info->throttle_timer_active = 0;
		}
		if (need_timer == TRUE) {
			/*
			 * This is defined as an int (32-bit) rather than a 64-bit
			 * value because it would need a really big period in the
			 * order of ~500 days to overflow this. So, we let this be
			 * 32-bit which allows us to use the clock_interval_to_deadline()
			 * routine.
			 */
			int	target_msecs;

			if (info->throttle_timer_ref == 0) {
				/*
				 * take a reference for the timer
				 */
				throttle_info_ref(info);

				info->throttle_timer_ref = 1;
			}
			elapsed = min_target;
			timevalsub(&elapsed, &now);
			target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

			if (target_msecs <= 0) {
				/*
				 * we may have computed a deadline slightly in the past
				 * due to various factors... if so, just set the timer
				 * to go off in the near future (we don't need to be precise)
				 */
				target_msecs = 1;
			}
			clock_interval_to_deadline(target_msecs, 1000000, &deadline);

			thread_call_enter_delayed(info->throttle_timer_call, deadline);
			info->throttle_timer_active = 1;
		}
	}
	return (throttle_level);
}


static void
throttle_timer(struct _throttle_io_info_t *info)
{
	uthread_t	ut, utlist;
	struct timeval	elapsed;
	struct timeval	now;
	uint64_t	elapsed_msecs;
	int		throttle_level;
	int		level;
	int		wake_level;
	caddr_t		wake_address = NULL;
	boolean_t	update_io_count = FALSE;
	boolean_t	need_wakeup = FALSE;
	boolean_t	need_release = FALSE;

	ut = NULL;
	lck_mtx_lock(&info->throttle_lock);

	info->throttle_timer_active = 0;
	microuptime(&now);

	elapsed = now;
	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {

		wake_level = info->throttle_next_wake_level;

		for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {

			elapsed = now;
			timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
			elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

			if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
				/*
				 * we're closing out the current IO period...
				 * if we have a waiting thread, wake it up
				 * after we have reset the I/O window info
				 */
				need_wakeup = TRUE;
				update_io_count = TRUE;

				info->throttle_next_wake_level = wake_level - 1;

				if (info->throttle_next_wake_level == THROTTLE_LEVEL_START)
					info->throttle_next_wake_level = THROTTLE_LEVEL_END;

				break;
			}
			wake_level--;

			if (wake_level == THROTTLE_LEVEL_START)
				wake_level = THROTTLE_LEVEL_END;
		}
	}
	if (need_wakeup == TRUE) {
		if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {

			ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
			TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;

			wake_address = (caddr_t)&ut->uu_on_throttlelist;
		}
	} else
		wake_level = THROTTLE_LEVEL_START;

	throttle_level = throttle_timer_start(info, update_io_count, wake_level);

	if (wake_address != NULL)
		wakeup(wake_address);

	for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {

		TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {

			TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;

			wakeup(&ut->uu_on_throttlelist);
		}
	}
	if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
		info->throttle_timer_ref = 0;
		need_release = TRUE;
	}
	lck_mtx_unlock(&info->throttle_lock);

	if (need_release == TRUE)
		throttle_info_rel(info);
}
static int
throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
{
	boolean_t start_timer = FALSE;
	int level = THROTTLE_LEVEL_START;

	if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
		info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
		start_timer = TRUE;
	}

	if (insert_tail == TRUE)
		TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
	else
		TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);

	ut->uu_on_throttlelist = mylevel;

	if (start_timer == TRUE) {
		/* we may need to start or rearm the timer */
		level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);

		if (level == THROTTLE_LEVEL_END) {
			if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
				TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);

				ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			}
		}
	}
	return (level);
}

static void
throttle_init_throttle_window(void)
{
	int throttle_window_size;

	/*
	 * The hierarchy of throttle window values is as follows:
	 * - Global defaults
	 * - Device tree properties
	 * - Boot-args
	 * All values are specified in msecs.
	 */

	/* Override global values with device-tree properties */
	if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;

	if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;

	if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;

	/* Override with boot-args */
	if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;

	if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;

	if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
}
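
/*
 * Example (illustrative only): because boot-args are applied last in the
 * hierarchy above, booting with "io_throttle_window_tier3=250" overrides
 * both the LOWPRI_TIER3_WINDOW_MSECS default (500) and any device-tree
 * value for kern.io_throttle_window_tier3, shrinking the tier-3 throttle
 * window to 250 msecs.
 */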
static void
throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
{
	int throttle_period_size;

	/*
	 * The hierarchy of throttle period values is as follows:
	 * - Global defaults
	 * - Device tree properties
	 * - Boot-args
	 * All values are specified in msecs.
	 */

	/* Assign global defaults */
	if (isssd == TRUE)
		info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
	else
		info->throttle_io_periods = &throttle_io_period_msecs[0];

	/* Override global values with device-tree properties */
	if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;

	if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;

	if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;

	/* Override with boot-args */
	if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;

	if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;

	if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
}

#if CONFIG_IOSCHED
extern void vm_io_reprioritize_init(void);
int	iosched_enabled = 1;
#endif

void
throttle_init(void)
{
	struct _throttle_io_info_t *info;
	int	i;
	int	level;
#if CONFIG_IOSCHED
	int	iosched;
#endif
	/*
	 * allocate lock group attribute and group
	 */
	throttle_mtx_grp_attr = lck_grp_attr_alloc_init();
	throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr);

	/* Update throttle parameters based on device tree configuration */
	throttle_init_throttle_window();

	/*
	 * allocate the lock attribute
	 */
	throttle_mtx_attr = lck_attr_alloc_init();

	for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
		info = &_throttle_io_info[i];

		lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
		info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

		for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
			TAILQ_INIT(&info->throttle_uthlist[level]);
			info->throttle_last_IO_pid[level] = 0;
		}
		info->throttle_next_wake_level = THROTTLE_LEVEL_END;
		info->throttle_disabled = 0;
	}
#if CONFIG_IOSCHED
	if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
		iosched_enabled = iosched;
	}
	if (iosched_enabled) {
		/* Initialize I/O Reprioritization mechanism */
		vm_io_reprioritize_init();
	}
#endif
}

void
sys_override_io_throttle(int flag)
{
	if (flag == THROTTLE_IO_ENABLE)
		lowpri_throttle_enabled = 1;

	if (flag == THROTTLE_IO_DISABLE)
		lowpri_throttle_enabled = 0;
}

int rethrottle_removed_from_list = 0;
int rethrottle_moved_to_new_list = 0;
/*
 * move a throttled thread to the appropriate state based
 * on its new throttle level... throttle_add_to_list will
 * reset the timer deadline if necessary... it may also
 * leave the thread off of the queue if we're already outside
 * the throttle window for the new level
 * takes a valid uthread (which may or may not be on the
 * throttle queue) as input
 *
 * NOTE: This is called with the task lock held.
 */

void
rethrottle_thread(uthread_t ut)
{
	struct _throttle_io_info_t *info;
	int my_new_level;

	if ((info = ut->uu_throttle_info) == NULL)
		return;

	lck_mtx_lock(&info->throttle_lock);

	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {

		my_new_level = throttle_get_thread_throttle_level(ut);

		if (my_new_level != ut->uu_on_throttlelist) {

			TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;

			if (my_new_level >= THROTTLE_LEVEL_THROTTLED) {
				throttle_add_to_list(info, ut, my_new_level, TRUE);
				rethrottle_moved_to_new_list++;
			}

			/* Thread no longer in window, need to wake it up */
			if (ut->uu_on_throttlelist == THROTTLE_LEVEL_NONE) {
				wakeup(&ut->uu_on_throttlelist);
				rethrottle_removed_from_list++;
			}
		}
	}

	lck_mtx_unlock(&info->throttle_lock);
}


/*
 * KPI routine
 *
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * The calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
	struct _throttle_io_info_t *info;
	int	level;

	MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
	/* Should never happen but just in case */
	if (info == NULL)
		return NULL;
	/* Mark that this one was allocated and needs to be freed */
	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info);
	info->throttle_alloc = TRUE;

	lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

	for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
		TAILQ_INIT(&info->throttle_uthlist[level]);
	}
	info->throttle_next_wake_level = THROTTLE_LEVEL_END;

	/* Take a reference */
	OSIncrementAtomic(&info->throttle_refcnt);
	return info;
}

/*
 * KPI routine
 *
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release the reference taken by throttle_info_create
 */
void
throttle_info_release(void *throttle_info)
{
	DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
	    (struct _throttle_io_info_t *)throttle_info,
	    (struct _throttle_io_info_t *)throttle_info);
	if (throttle_info)	/* Just to be careful */
		throttle_info_rel(throttle_info);
}
/*
 * KPI routine
 *
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by cluster code). File systems that call this in
 * their mount routines must call throttle_info_mount_rel in their unmount
 * routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
	if ((throttle_info == NULL) || (mp == NULL))
		return;
	throttle_info_ref(throttle_info);

	/*
	 * We already have a reference; release it before adding the new one
	 */
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = throttle_info;
}

/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask. The
 * handle must be released by throttle_info_rel_by_mask
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
{
	int	dev_index;
	struct _throttle_io_info_t *info;

	if (throttle_info_handle == NULL)
		return EINVAL;

	dev_index = num_trailing_0(throttle_mask);
	info = &_throttle_io_info[dev_index];
	throttle_info_ref(info);
	*(struct _throttle_io_info_t**)throttle_info_handle = info;

	return 0;
}

/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
	/*
	 * for now the handle is just a pointer to _throttle_io_info_t
	 */
	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
}

/*
 * KPI routine
 *
 * File systems that call throttle_info_mount_ref must call this routine in
 * their unmount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = NULL;
}

void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	*tv = info->throttle_last_write_timestamp;
}

void
update_last_io_time(mount_t mp)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	microuptime(&info->throttle_last_write_timestamp);
	if (mp != NULL)
		mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
}


int
throttle_get_io_policy(uthread_t *ut)
{
	if (ut != NULL)
		*ut = get_bsdthread_info(current_thread());

	return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO));
}

int
throttle_get_passive_io_policy(uthread_t *ut)
{
	if (ut != NULL)
		*ut = get_bsdthread_info(current_thread());

	return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO));
}
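
/*
 * Illustrative sketch (not part of the original source and not compiled):
 * how a driver or filesystem might use the mask-based KPI routines in this
 * file around an I/O.  The function name and the flags value are hypothetical.
 */
#if 0	/* example only */
static void
example_throttled_io(uint64_t throttle_mask, int flags)
{
	throttle_info_handle_t handle;

	if (throttle_info_ref_by_mask(throttle_mask, &handle) != 0)
		return;

	/* book-keeping before issuing the I/O; does not sleep */
	throttle_info_update_by_mask((void *)handle, flags);

	/* ... issue the I/O here ... */

	throttle_info_rel_by_mask(handle);
}
#endif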
static int
throttle_get_thread_throttle_level(uthread_t ut)
{
	int	thread_throttle_level;

	if (ut == NULL)
		ut = get_bsdthread_info(current_thread());

	thread_throttle_level = proc_get_effective_thread_policy(ut->uu_thread, TASK_POLICY_IO);

	/* Bootcache misses should always be throttled */
	if (ut->uu_throttle_bc == TRUE)
		thread_throttle_level = THROTTLE_LEVEL_TIER3;

	return (thread_throttle_level);
}


static int
throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	uint64_t elapsed_msecs;
	int	thread_throttle_level;
	int	throttle_level;

	if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED)
		return (THROTTLE_DISENGAGED);

	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
			break;
	}
	if (throttle_level >= thread_throttle_level) {
		/*
		 * we're beyond all of the throttle windows
		 * that affect the throttle level of this thread,
		 * so go ahead and treat as normal I/O
		 */
		return (THROTTLE_DISENGAGED);
	}
	if (mylevel)
		*mylevel = thread_throttle_level;
	if (throttling_level)
		*throttling_level = throttle_level;

	if (info->throttle_io_count != info->throttle_io_count_begin) {
		/*
		 * we've already issued at least one throttleable I/O
		 * in the current I/O window, so avoid issuing another one
		 */
		return (THROTTLE_NOW);
	}
	/*
	 * we're in the throttle window, so
	 * cut the I/O size back
	 */
	return (THROTTLE_ENGAGED);
}

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
{
	struct _throttle_io_info_t *info;

	/*
	 * Should we just return zero if no mount point
	 */
	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	if (info->throttle_disabled)
		return (THROTTLE_DISENGAGED);
	else
		return throttle_io_will_be_throttled_internal(info, NULL, NULL);
}

/*
 * Routine to increment I/O throttling counters maintained in the proc
 */

static void
throttle_update_proc_stats(pid_t throttling_pid, int count)
{
	proc_t throttling_proc;
	proc_t throttled_proc = current_proc();

	/* The throttled_proc is always the current proc; so we are not concerned with refs */
	OSAddAtomic64(count, &(throttled_proc->was_throttled));

	/* The throttling pid might have exited by now */
	throttling_proc = proc_find(throttling_pid);
	if (throttling_proc != PROC_NULL) {
		OSAddAtomic64(count, &(throttling_proc->did_throttle));
		proc_rele(throttling_proc);
	}
}
/*
 * Block until woken up by the throttle timer or by a rethrottle call.
 * As long as we hold the throttle_lock while querying the throttle tier, we're
 * safe against seeing an old throttle tier after a rethrottle.
 */
uint32_t
throttle_lowpri_io(int sleep_amount)
{
	uthread_t ut;
	struct _throttle_io_info_t *info;
	int	throttle_type = 0;
	int	mylevel = 0;
	int	throttling_level = THROTTLE_LEVEL_NONE;
	int	sleep_cnt = 0;
	uint32_t  throttle_io_period_num = 0;
	boolean_t insert_tail = TRUE;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window == 0)
		return (0);

	info = ut->uu_throttle_info;

	if (info == NULL) {
		ut->uu_throttle_bc = FALSE;
		ut->uu_lowpri_window = 0;
		return (0);
	}

	lck_mtx_lock(&info->throttle_lock);

	if (sleep_amount == 0)
		goto done;

	if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
		sleep_amount = 0;

	throttle_io_period_num = info->throttle_io_period_num;

	while ((throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level))) {

		if (throttle_type == THROTTLE_ENGAGED) {
			if (sleep_amount == 0)
				break;
			if (info->throttle_io_period_num < throttle_io_period_num)
				break;
			if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
				break;
		}
		if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
			if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END)
				goto done;
		}
		assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
		    info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);


		if (sleep_cnt == 0) {
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
			    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
			throttled_count[mylevel]++;
		}
		msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL);

		sleep_cnt++;

		if (sleep_amount == 0)
			insert_tail = FALSE;
		else if (info->throttle_io_period_num < throttle_io_period_num ||
			 (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
			insert_tail = FALSE;
			sleep_amount = 0;
		}
	}
done:
	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
		TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
		ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	}

	lck_mtx_unlock(&info->throttle_lock);

	if (sleep_cnt) {
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
		    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
		/*
		 * We update the stats for the last pid which opened a throttle window for the throttled thread.
		 * This might not be completely accurate since the multiple throttles seen by the lower tier pid
		 * might have been caused by various higher prio pids. However, updating these stats accurately
		 * means doing a proc_find while holding the throttle lock which leads to deadlock.
		 */
		throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
	}

	throttle_info_rel(info);

	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = FALSE;
	ut->uu_lowpri_window = 0;

	return (sleep_cnt);
}

/*
 * KPI routine
 *
 * set a kernel thread's IO policy.  policy can be:
 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
 *
 * explanations about these policies are in the man page of setiopolicy_np
 */
void throttle_set_thread_io_policy(int policy)
{
	proc_set_task_policy(current_task(), current_thread(),
	    TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL,
	    policy);
}


void throttle_info_reset_window(uthread_t ut)
{
	struct _throttle_io_info_t *info;

	if ((info = ut->uu_throttle_info)) {
		throttle_info_rel(info);

		ut->uu_throttle_info = NULL;
		ut->uu_lowpri_window = 0;
		ut->uu_throttle_bc = FALSE;
	}
}

static
void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
{
	if (lowpri_throttle_enabled == 0 || info->throttle_disabled)
		return;

	if (info->throttle_io_periods == 0) {
		throttle_init_throttle_period(info, isssd);
	}
	if (ut->uu_throttle_info == NULL) {

		ut->uu_throttle_info = info;
		throttle_info_ref(info);
		DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info);

		ut->uu_lowpri_window = 1;
		ut->uu_throttle_bc = BC_throttle;
	}
}
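
/*
 * Illustrative sketch (not part of the original source and not compiled):
 * the lifecycle a filesystem kext would follow with the allocation KPI --
 * create an info structure at mount, hang it off the mount, and drop both
 * references at unmount, matching the comments on throttle_info_create()
 * and throttle_info_mount_ref() above.  The function names are hypothetical.
 */
#if 0	/* example only */
static void *
example_fs_mount_throttle(mount_t mp)
{
	void *ti = throttle_info_create();

	if (ti != NULL)
		throttle_info_mount_ref(mp, ti);	/* mount holds its own ref */
	return ti;
}

static void
example_fs_unmount_throttle(mount_t mp, void *ti)
{
	throttle_info_mount_rel(mp);	/* drop the mount's ref */
	throttle_info_release(ti);	/* drop the ref taken by create */
}
#endif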

void throttle_info_reset_window(uthread_t ut)
{
	struct _throttle_io_info_t *info;

	if ( (info = ut->uu_throttle_info) ) {
		throttle_info_rel(info);

		ut->uu_throttle_info = NULL;
		ut->uu_lowpri_window = 0;
		ut->uu_throttle_bc = FALSE;
	}
}

static
void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
{
	if (lowpri_throttle_enabled == 0 || info->throttle_disabled)
		return;

	if (info->throttle_io_periods == 0) {
		throttle_init_throttle_period(info, isssd);
	}
	if (ut->uu_throttle_info == NULL) {

		ut->uu_throttle_info = info;
		throttle_info_ref(info);
		DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );

		ut->uu_lowpri_window = 1;
		ut->uu_throttle_bc = BC_throttle;
	}
}


static
void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd)
{
	int thread_throttle_level;

	if (lowpri_throttle_enabled == 0 || info->throttle_disabled)
		return;

	if (ut == NULL)
		ut = get_bsdthread_info(current_thread());

	thread_throttle_level = throttle_get_thread_throttle_level(ut);

	if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
		if (!ISSET(flags, B_PASSIVE)) {
			microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
			info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
					      current_proc()->p_pid, thread_throttle_level, 0, 0, 0);
		}
		microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
	}


	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem-related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low-priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O, or from vnode_pagein.
		 */
		OSAddAtomic(1, &info->throttle_io_count);

		throttle_info_set_initial_window(ut, info, FALSE, isssd);
	}
}

void *throttle_info_update_by_mount(mount_t mp)
{
	struct _throttle_io_info_t *info;
	uthread_t ut;
	boolean_t isssd = FALSE;

	ut = get_bsdthread_info(current_thread());

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	if (!ut->uu_lowpri_window)
		throttle_info_set_initial_window(ut, info, FALSE, isssd);

	return info;
}


/*
 * KPI routine
 *
 * This is usually called before every I/O and is used for throttled-I/O
 * bookkeeping.  This routine has low overhead and does not sleep.
 */
void throttle_info_update(void *throttle_info, int flags)
{
	if (throttle_info)
		throttle_info_update_internal(throttle_info, NULL, flags, FALSE);
}

/*
 * KPI routine
 *
 * This is usually called before every I/O and is used for throttled-I/O
 * bookkeeping.  This routine has low overhead and does not sleep.
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	void *throttle_info = throttle_info_handle;

	/*
	 * For now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info.  Later, if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update() on each.
	 */
	throttle_info_update(throttle_info, flags);
}
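
/*
 * Illustrative sketch (not part of the original source): a filesystem or
 * driver that issues I/O on behalf of the current thread would typically do
 * its throttle bookkeeping just before handing the buffer to the device.
 * The function and variable names below are hypothetical.
 *
 *	static void
 *	example_issue_io(mount_t mp, int bflags)
 *	{
 *		void *tinfo;
 *
 *		tinfo = throttle_info_update_by_mount(mp);	// open a window for this thread if needed
 *		throttle_info_update(tinfo, bflags);		// count the I/O against the device's throttle info
 *		// ... hand the buffer to the device ...
 *	}
 */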

/*
 * KPI routine
 *
 * This routine marks the throttle info as disabled.  Used for mount points
 * that support I/O scheduling.
 */

void throttle_info_disable_throttle(int devno)
{
	struct _throttle_io_info_t *info;

	if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV)
		panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);

	info = &_throttle_io_info[devno];
	info->throttle_disabled = 1;
	return;
}


/*
 * KPI routine (private)
 *
 * Called to determine whether this I/O is currently being throttled at this
 * level, so that it can be treated specially.
 */
int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	uint64_t elapsed_msecs;
	int	throttle_level;
	int	thread_throttle_level;

	switch (policy) {

	case IOPOL_THROTTLE:
		thread_throttle_level = THROTTLE_LEVEL_TIER3;
		break;
	case IOPOL_UTILITY:
		thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_STANDARD:
		thread_throttle_level = THROTTLE_LEVEL_TIER1;
		break;
	default:
		thread_throttle_level = THROTTLE_LEVEL_TIER0;
		break;
	}
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
			break;
	}
	if (throttle_level >= thread_throttle_level) {
		/*
		 * we're beyond all of the throttle windows,
		 * so go ahead and treat as normal I/O
		 */
		return (THROTTLE_DISENGAGED);
	}
	/*
	 * we're in the throttle window
	 */
	return (THROTTLE_ENGAGED);
}
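
/*
 * Illustrative sketch (not part of the original source): a caller that issues
 * I/O on behalf of another thread can ask whether that I/O would currently
 * land in a throttle window and, if so, treat it specially (defer it, batch
 * it, etc.).  The function name below is hypothetical; "tinfo" is a throttle
 * info pointer such as the one returned by throttle_info_update_by_mount()
 * above.
 *
 *	static boolean_t
 *	example_should_defer_io(void *tinfo, int iopolicy)
 *	{
 *		return (throttle_info_io_will_be_throttled(tinfo, iopolicy) == THROTTLE_ENGAGED);
 *	}
 */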

int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	int	io_tier;
	int	passive;
	dev_t	bdev;
	uthread_t ut;
	mount_t mp;
	struct bufattr *bap;
	int	strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;
	int	code = 0;

	proc_t curproc = current_proc();

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;
	bap = &bp->b_attr;

	io_tier = throttle_get_io_policy(&ut);
	passive = throttle_get_passive_io_policy(&ut);

	if (bp->b_flags & B_META)
		bap->ba_flags |= BA_META;

#if CONFIG_IOSCHED
	/*
	 * For I/O scheduling, we currently do not have a way to track and expedite metadata I/Os.
	 * To ensure we don't get into priority inversions due to metadata I/Os, we use the following rules:
	 * - For metadata reads, cap the I/O tier at IOSCHED_METADATA_TIER and mark the I/O passive if the tier was upgraded.
	 * - For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive.
	 */
	if (bap->ba_flags & BA_META) {
		if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
			if (bp->b_flags & B_READ) {
				if (io_tier > IOSCHED_METADATA_TIER) {
					io_tier = IOSCHED_METADATA_TIER;
					passive = 1;
				}
			} else {
				io_tier = IOSCHED_METADATA_TIER;
				passive = 1;
			}
		}
	}
#endif /* CONFIG_IOSCHED */

	SET_BUFATTR_IO_TIER(bap, io_tier);

	if (passive) {
		bp->b_flags |= B_PASSIVE;
		bap->ba_flags |= BA_PASSIVE;
	}

	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
		bap->ba_flags |= BA_DELAYIDLESLEEP;

	bflags = bp->b_flags;

	if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0))
		bufattr_markquickcomplete(bap);

	if (bflags & B_READ)
		code |= DKIO_READ;
	if (bflags & B_ASYNC)
		code |= DKIO_ASYNC;
	if (bflags & B_META)
		code |= DKIO_META;
	else if (bflags & B_PAGEIO)
		code |= DKIO_PAGING;

	if (io_tier != 0)
		code |= DKIO_THROTTLE;

	code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);

	if (bflags & B_PASSIVE)
		code |= DKIO_PASSIVE;

	if (bap->ba_flags & BA_NOCACHE)
		code |= DKIO_NOCACHE;

	if (kdebug_enable) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
					  buf_kernel_addrperm_addr(bp), bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	}

	thread_update_io_stats(current_thread(), buf_count(bp), code);

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	throttle_info_update_internal(throttle_info, ut, bflags, isssd);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->throttle_last_write_timestamp);

		if (mp) {
			mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}
	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache.  Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through.  Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return, so we'll get garbage here.  In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE	((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED	((int)0xcafebeef)
	typedef int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);

	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
	}
	return (0);
}


/*
 * Block mapping is not supported for special files; return ENOTSUP.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}


/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.  We
		 * cannot easily tell that a character device is a
		 * controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case, if
		 * the reference count is 1 (this is the very last
		 * close), we detach the tty from the session before
		 * closing the device.
		 */
		sessp = proc_session(p);
		devsw_lock(dev, S_IFCHR);
		if (sessp != SESSION_NULL) {
			if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
				struct tty *tp = TTY_NULL;

				devsw_unlock(dev, S_IFCHR);
				session_lock(sessp);
				if (vp == sessp->s_ttyvp) {
					tp = SESSION_TP(sessp);
					sessp->s_ttyvp = NULL;
					sessp->s_ttyvid = 0;
					sessp->s_ttyp = TTY_NULL;
					sessp->s_ttypgrpid = NO_PID;
				}
				session_unlock(sessp);

				if (tp != TTY_NULL) {
					/*
					 * We may have won a race with a proc_exit
					 * of the session leader; the winner
					 * clears the flag (even if not set).
					 */
					tty_lock(tp);
					ttyclrpgrphup(tp);
					tty_unlock(tp);

					ttyfree(tp);
				}
				devsw_lock(dev, S_IFCHR);
			}
			session_rele(sessp);
		}

		if (--vp->v_specinfo->si_opencount < 0)
			panic("negative open count (c, %u, %u)", major(dev), minor(dev));

		/*
		 * close on last reference or on vnode revoke call
		 */
		if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0)
			error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in-core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);

		devsw_lock(dev, S_IFBLK);

		if (--vp->v_specinfo->si_opencount < 0)
			panic("negative open count (b, %u, %u)", major(dev), minor(dev));

		if (vcount(vp) == 0)
			error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return (EBADF);
	}

	return error;
}

/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{

	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{

	return (EBADF);
}

/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_blktooff: not implemented for VBLK\n");
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_blktooff type");
	}
	/* NOTREACHED */

	return (0);
}

/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_offtoblk: not implemented for VBLK\n");
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_offtoblk type");
	}
	/* NOTREACHED */

	return (0);
}

static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
	.f_isfd		= 1,
	.f_attach	= filt_specattach,
	.f_detach	= filt_specdetach,
	.f_event	= filt_spec,
	.f_peek		= filt_specpeek
};

static int
filter_to_seltype(int16_t filter)
{
	switch (filter) {
	case EVFILT_READ:
		return FREAD;
	case EVFILT_WRITE:
		return FWRITE;
	default:
		panic("filter_to_seltype(): invalid filter %d\n", filter);
		return 0;
	}
}

static int
filt_specattach(struct knote *kn)
{
	vnode_t vp;
	dev_t dev;

	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

	if (major(dev) >= nchrdev) {
		return ENXIO;
	}

	if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
		return EINVAL;
	}

	/* Resulting wql is safe to unlink even if it has never been linked */
	kn->kn_hook = wait_queue_link_allocate();
	if (kn->kn_hook == NULL) {
		return EAGAIN;
	}

	kn->kn_fop = &spec_filtops;
	kn->kn_hookid = vnode_vid(vp);

	knote_markstayqueued(kn);

	return 0;
}

static void
filt_specdetach(struct knote *kn)
{
	kern_return_t ret;

	/*
	 * Given wait queue link and wait queue set, unlink.  This is subtle.
	 * If the device has been revoked from under us, selclearthread() will
	 * have removed our link from the kqueue's wait queue set, which
	 * wait_queue_set_unlink_one() will detect and handle.
	 */
	ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
	if (ret != KERN_SUCCESS) {
		panic("filt_specdetach(): failed to unlink wait queue link.");
	}

	(void)wait_queue_link_free(kn->kn_hook);
	kn->kn_hook = NULL;
	kn->kn_status &= ~KN_STAYQUEUED;
}

static int
filt_spec(struct knote *kn, long hint)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int selres;
	int error;
	int use_offset;
	dev_t dev;
	uint64_t flags;

	assert(kn->kn_hook != NULL);

	if (hint != 0) {
		panic("filt_spec(): nonzero hint?");
	}

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return 1;
	}

	dev = vnode_specrdev(vp);
	flags = cdevsw_flags[major(dev)];
	use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
	assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

	/* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	if (use_offset) {
		if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
			kn->kn_data = 0;
		} else {
			kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
		}
	} else {
		kn->kn_data = selres;
	}

	vnode_put(vp);

	return (kn->kn_data != 0);
}

static unsigned
filt_specpeek(struct knote *kn)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int error, selres;

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		return 1; /* Just like VNOP_SELECT() on recycled vnode */
	}

	/*
	 * Why pass the link here?  Because we may not have registered in the past...
	 */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	vnode_put(vp);
	return selres;
}
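
/*
 * Illustrative sketch (not part of the original source): from user space, the
 * filter implementation above is exercised by registering a kevent on a
 * character-device file descriptor.  The device path below is hypothetical,
 * and the device must advertise kqueue support via CDEVSW_SELECT_KQUEUE or
 * the registration fails with EINVAL (see filt_specattach()).
 *
 *	#include <sys/event.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/example", O_RDONLY);	// hypothetical device node
 *	int kq = kqueue();
 *	struct kevent ev;
 *
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &ev, 1, NULL, 0, NULL);	// attach: runs filt_specattach()
 *	(void) kevent(kq, NULL, 0, &ev, 1, NULL);	// wait: readiness comes from filt_spec()
 */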