1/* 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ 29/*- 30 * Copyright (c) 1994 Christopher G. Demetriou 31 * Copyright (c) 1982, 1986, 1989, 1993 32 * The Regents of the University of California. All rights reserved. 33 * (c) UNIX System Laboratories, Inc. 34 * All or some portions of this file are derived from material licensed 35 * to the University of California by American Telephone and Telegraph 36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 37 * the permission of UNIX System Laboratories, Inc. 38 * 39 * Redistribution and use in source and binary forms, with or without 40 * modification, are permitted provided that the following conditions 41 * are met: 42 * 1. Redistributions of source code must retain the above copyright 43 * notice, this list of conditions and the following disclaimer. 44 * 2. Redistributions in binary form must reproduce the above copyright 45 * notice, this list of conditions and the following disclaimer in the 46 * documentation and/or other materials provided with the distribution. 47 * 3. All advertising materials mentioning features or use of this software 48 * must display the following acknowledgement: 49 * This product includes software developed by the University of 50 * California, Berkeley and its contributors. 51 * 4. Neither the name of the University nor the names of its contributors 52 * may be used to endorse or promote products derived from this software 53 * without specific prior written permission. 54 * 55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 58 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 65 * SUCH DAMAGE. 66 * 67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 68 */ 69 70/* 71 * Some references: 72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) 73 * Leffler, et al.: The Design and Implementation of the 4.3BSD 74 * UNIX Operating System (Addison Welley, 1989) 75 */ 76 77#include <sys/param.h> 78#include <sys/systm.h> 79#include <sys/proc_internal.h> 80#include <sys/buf_internal.h> 81#include <sys/vnode_internal.h> 82#include <sys/mount_internal.h> 83#include <sys/trace.h> 84#include <sys/malloc.h> 85#include <sys/resourcevar.h> 86#include <miscfs/specfs/specdev.h> 87#include <sys/ubc.h> 88#include <sys/kauth.h> 89#if DIAGNOSTIC 90#include <kern/assert.h> 91#endif /* DIAGNOSTIC */ 92#include <kern/task.h> 93#include <kern/zalloc.h> 94#include <kern/lock.h> 95 96#include <sys/fslog.h> /* fslog_io_error() */ 97 98#include <mach/mach_types.h> 99#include <mach/memory_object_types.h> 100#include <kern/sched_prim.h> /* thread_block() */ 101 102#include <vm/vm_kern.h> 103#include <vm/vm_pageout.h> 104 105#include <sys/kdebug.h> 106 107#include <libkern/OSAtomic.h> 108#include <libkern/OSDebug.h> 109#include <sys/ubc_internal.h> 110 111#include <sys/sdt.h> 112#include <sys/cprotect.h> 113 114 115#if BALANCE_QUEUES 116static __inline__ void bufqinc(int q); 117static __inline__ void bufqdec(int q); 118#endif 119 120int bcleanbuf(buf_t bp, boolean_t discard); 121static int brecover_data(buf_t bp); 122static boolean_t incore(vnode_t vp, daddr64_t blkno); 123/* timeout is in msecs */ 124static buf_t getnewbuf(int slpflag, int slptimeo, int *queue); 125static void bremfree_locked(buf_t bp); 126static void buf_reassign(buf_t bp, vnode_t newvp); 127static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo); 128static int buf_iterprepare(vnode_t vp, struct buflists *, int flags); 129static void buf_itercomplete(vnode_t vp, struct buflists *, int flags); 130static boolean_t buffer_cache_gc(int); 131static buf_t buf_brelse_shadow(buf_t bp); 132static void buf_free_meta_store(buf_t bp); 133 134static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy, 135 uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv); 136 137 138__private_extern__ int bdwrite_internal(buf_t, int); 139 140/* zone allocated buffer headers */ 141static void bufzoneinit(void); 142static void bcleanbuf_thread_init(void); 143static void bcleanbuf_thread(void); 144 145static zone_t buf_hdr_zone; 146static int buf_hdr_count; 147 148 149/* 150 * Definitions for the buffer hash lists. 151 */ 152#define BUFHASH(dvp, lbn) \ 153 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) 154LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; 155u_long bufhash; 156 157static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp); 158 159/* Definitions for the buffer stats. 
*/ 160struct bufstats bufstats; 161 162/* Number of delayed write buffers */ 163long nbdwrite = 0; 164int blaundrycnt = 0; 165static int boot_nbuf_headers = 0; 166 167static TAILQ_HEAD(delayqueue, buf) delaybufqueue; 168 169static TAILQ_HEAD(ioqueue, buf) iobufqueue; 170static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; 171static int needbuffer; 172static int need_iobuffer; 173 174static lck_grp_t *buf_mtx_grp; 175static lck_attr_t *buf_mtx_attr; 176static lck_grp_attr_t *buf_mtx_grp_attr; 177static lck_mtx_t *iobuffer_mtxp; 178static lck_mtx_t *buf_mtxp; 179 180static int buf_busycount; 181 182static __inline__ int 183buf_timestamp(void) 184{ 185 struct timeval t; 186 microuptime(&t); 187 return (t.tv_sec); 188} 189 190/* 191 * Insq/Remq for the buffer free lists. 192 */ 193#if BALANCE_QUEUES 194#define binsheadfree(bp, dp, whichq) do { \ 195 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \ 196 bufqinc((whichq)); \ 197 } while (0) 198 199#define binstailfree(bp, dp, whichq) do { \ 200 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \ 201 bufqinc((whichq)); \ 202 } while (0) 203#else 204#define binsheadfree(bp, dp, whichq) do { \ 205 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \ 206 } while (0) 207 208#define binstailfree(bp, dp, whichq) do { \ 209 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \ 210 } while (0) 211#endif 212 213 214#define BHASHENTCHECK(bp) \ 215 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \ 216 panic("%p: b_hash.le_prev is not deadbeef", (bp)); 217 218#define BLISTNONE(bp) \ 219 (bp)->b_hash.le_next = (struct buf *)0; \ 220 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef; 221 222/* 223 * Insq/Remq for the vnode usage lists. 224 */ 225#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 226#define bufremvn(bp) { \ 227 LIST_REMOVE(bp, b_vnbufs); \ 228 (bp)->b_vnbufs.le_next = NOLIST; \ 229} 230 231/* 232 * Time in seconds before a buffer on a list is 233 * considered as a stale buffer 234 */ 235#define LRU_IS_STALE 120 /* default value for the LRU */ 236#define AGE_IS_STALE 60 /* default value for the AGE */ 237#define META_IS_STALE 180 /* default value for the BQ_META */ 238 239int lru_is_stale = LRU_IS_STALE; 240int age_is_stale = AGE_IS_STALE; 241int meta_is_stale = META_IS_STALE; 242 243#define MAXLAUNDRY 10 244 245/* LIST_INSERT_HEAD() with assertions */ 246static __inline__ void 247blistenterhead(struct bufhashhdr * head, buf_t bp) 248{ 249 if ((bp->b_hash.le_next = (head)->lh_first) != NULL) 250 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next; 251 (head)->lh_first = bp; 252 bp->b_hash.le_prev = &(head)->lh_first; 253 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) 254 panic("blistenterhead: le_prev is deadbeef"); 255} 256 257static __inline__ void 258binshash(buf_t bp, struct bufhashhdr *dp) 259{ 260#if DIAGNOSTIC 261 buf_t nbp; 262#endif /* DIAGNOSTIC */ 263 264 BHASHENTCHECK(bp); 265 266#if DIAGNOSTIC 267 nbp = dp->lh_first; 268 for(; nbp != NULL; nbp = nbp->b_hash.le_next) { 269 if(nbp == bp) 270 panic("buf already in hashlist"); 271 } 272#endif /* DIAGNOSTIC */ 273 274 blistenterhead(dp, bp); 275} 276 277static __inline__ void 278bremhash(buf_t bp) 279{ 280 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) 281 panic("bremhash le_prev is deadbeef"); 282 if (bp->b_hash.le_next == bp) 283 panic("bremhash: next points to self"); 284 285 if (bp->b_hash.le_next != NULL) 286 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev; 287 *bp->b_hash.le_prev = (bp)->b_hash.le_next; 288} 289 290/* 291 * buf_mtxp held. 
292 */ 293static __inline__ void 294bmovelaundry(buf_t bp) 295{ 296 bp->b_whichq = BQ_LAUNDRY; 297 bp->b_timestamp = buf_timestamp(); 298 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); 299 blaundrycnt++; 300} 301 302static __inline__ void 303buf_release_credentials(buf_t bp) 304{ 305 if (IS_VALID_CRED(bp->b_rcred)) { 306 kauth_cred_unref(&bp->b_rcred); 307 } 308 if (IS_VALID_CRED(bp->b_wcred)) { 309 kauth_cred_unref(&bp->b_wcred); 310 } 311} 312 313 314int 315buf_valid(buf_t bp) { 316 317 if ( (bp->b_flags & (B_DONE | B_DELWRI)) ) 318 return 1; 319 return 0; 320} 321 322int 323buf_fromcache(buf_t bp) { 324 325 if ( (bp->b_flags & B_CACHE) ) 326 return 1; 327 return 0; 328} 329 330void 331buf_markinvalid(buf_t bp) { 332 333 SET(bp->b_flags, B_INVAL); 334} 335 336void 337buf_markdelayed(buf_t bp) { 338 339 if (!ISSET(bp->b_flags, B_DELWRI)) { 340 SET(bp->b_flags, B_DELWRI); 341 342 OSAddAtomicLong(1, &nbdwrite); 343 buf_reassign(bp, bp->b_vp); 344 } 345 SET(bp->b_flags, B_DONE); 346} 347 348void 349buf_markclean(buf_t bp) { 350 351 if (ISSET(bp->b_flags, B_DELWRI)) { 352 CLR(bp->b_flags, B_DELWRI); 353 354 OSAddAtomicLong(-1, &nbdwrite); 355 buf_reassign(bp, bp->b_vp); 356 } 357} 358 359void 360buf_markeintr(buf_t bp) { 361 362 SET(bp->b_flags, B_EINTR); 363} 364 365 366void 367buf_markaged(buf_t bp) { 368 369 SET(bp->b_flags, B_AGE); 370} 371 372int 373buf_fua(buf_t bp) { 374 375 if ((bp->b_flags & B_FUA) == B_FUA) 376 return 1; 377 return 0; 378} 379 380void 381buf_markfua(buf_t bp) { 382 383 SET(bp->b_flags, B_FUA); 384} 385 386#if CONFIG_PROTECT 387void 388buf_setcpaddr(buf_t bp, struct cprotect *entry) { 389 bp->b_attr.ba_cpentry = entry; 390} 391 392void 393buf_setcpoff (buf_t bp, uint64_t foffset) { 394 bp->b_attr.ba_cp_file_off = foffset; 395} 396 397void * 398bufattr_cpaddr(bufattr_t bap) { 399 return (bap->ba_cpentry); 400} 401 402uint64_t 403bufattr_cpoff(bufattr_t bap) { 404 return (bap->ba_cp_file_off); 405} 406 407void 408bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) { 409 bap->ba_cpentry = cp_entry_addr; 410} 411 412void 413bufattr_setcpoff(bufattr_t bap, uint64_t foffset) { 414 bap->ba_cp_file_off = foffset; 415} 416 417#else 418void * 419bufattr_cpaddr(bufattr_t bap __unused) { 420 return NULL; 421} 422 423uint64_t 424bufattr_cpoff(bufattr_t bap __unused) { 425 return 0; 426} 427 428void 429bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) { 430} 431 432void 433bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) { 434 return; 435} 436#endif /* CONFIG_PROTECT */ 437 438bufattr_t 439bufattr_alloc() { 440 bufattr_t bap; 441 MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK); 442 if (bap == NULL) 443 return NULL; 444 445 bzero(bap, sizeof(struct bufattr)); 446 return bap; 447} 448 449void 450bufattr_free(bufattr_t bap) { 451 if (bap) 452 FREE(bap, M_TEMP); 453} 454 455int 456bufattr_rawencrypted(bufattr_t bap) { 457 if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) ) 458 return 1; 459 return 0; 460} 461 462int 463bufattr_throttled(bufattr_t bap) { 464 return (GET_BUFATTR_IO_TIER(bap)); 465} 466 467int 468bufattr_nocache(bufattr_t bap) { 469 if ( (bap->ba_flags & BA_NOCACHE) ) 470 return 1; 471 return 0; 472} 473 474int 475bufattr_meta(bufattr_t bap) { 476 if ( (bap->ba_flags & BA_META) ) 477 return 1; 478 return 0; 479} 480 481int 482bufattr_delayidlesleep(bufattr_t bap) 483{ 484 if ( (bap->ba_flags & BA_DELAYIDLESLEEP) ) 485 return 1; 486 return 0; 487} 488 489bufattr_t 490buf_attr(buf_t bp) { 491 return &bp->b_attr; 
492} 493 494void 495buf_markstatic(buf_t bp __unused) { 496 SET(bp->b_flags, B_STATICCONTENT); 497} 498 499int 500buf_static(buf_t bp) { 501 if ( (bp->b_flags & B_STATICCONTENT) ) 502 return 1; 503 return 0; 504} 505 506void 507bufattr_markgreedymode(bufattr_t bap) { 508 SET(bap->ba_flags, BA_GREEDY_MODE); 509} 510 511int 512bufattr_greedymode(bufattr_t bap) { 513 if ( (bap->ba_flags & BA_GREEDY_MODE) ) 514 return 1; 515 return 0; 516} 517 518void 519bufattr_markquickcomplete(bufattr_t bap) { 520 SET(bap->ba_flags, BA_QUICK_COMPLETE); 521} 522 523int 524bufattr_quickcomplete(bufattr_t bap) { 525 if ( (bap->ba_flags & BA_QUICK_COMPLETE) ) 526 return 1; 527 return 0; 528} 529 530errno_t 531buf_error(buf_t bp) { 532 533 return (bp->b_error); 534} 535 536void 537buf_seterror(buf_t bp, errno_t error) { 538 539 if ((bp->b_error = error)) 540 SET(bp->b_flags, B_ERROR); 541 else 542 CLR(bp->b_flags, B_ERROR); 543} 544 545void 546buf_setflags(buf_t bp, int32_t flags) { 547 548 SET(bp->b_flags, (flags & BUF_X_WRFLAGS)); 549} 550 551void 552buf_clearflags(buf_t bp, int32_t flags) { 553 554 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS)); 555} 556 557int32_t 558buf_flags(buf_t bp) { 559 560 return ((bp->b_flags & BUF_X_RDFLAGS)); 561} 562 563void 564buf_reset(buf_t bp, int32_t io_flags) { 565 566 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA)); 567 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE))); 568 569 bp->b_error = 0; 570} 571 572uint32_t 573buf_count(buf_t bp) { 574 575 return (bp->b_bcount); 576} 577 578void 579buf_setcount(buf_t bp, uint32_t bcount) { 580 581 bp->b_bcount = bcount; 582} 583 584uint32_t 585buf_size(buf_t bp) { 586 587 return (bp->b_bufsize); 588} 589 590void 591buf_setsize(buf_t bp, uint32_t bufsize) { 592 593 bp->b_bufsize = bufsize; 594} 595 596uint32_t 597buf_resid(buf_t bp) { 598 599 return (bp->b_resid); 600} 601 602void 603buf_setresid(buf_t bp, uint32_t resid) { 604 605 bp->b_resid = resid; 606} 607 608uint32_t 609buf_dirtyoff(buf_t bp) { 610 611 return (bp->b_dirtyoff); 612} 613 614uint32_t 615buf_dirtyend(buf_t bp) { 616 617 return (bp->b_dirtyend); 618} 619 620void 621buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) { 622 623 bp->b_dirtyoff = dirtyoff; 624} 625 626void 627buf_setdirtyend(buf_t bp, uint32_t dirtyend) { 628 629 bp->b_dirtyend = dirtyend; 630} 631 632uintptr_t 633buf_dataptr(buf_t bp) { 634 635 return (bp->b_datap); 636} 637 638void 639buf_setdataptr(buf_t bp, uintptr_t data) { 640 641 bp->b_datap = data; 642} 643 644vnode_t 645buf_vnode(buf_t bp) { 646 647 return (bp->b_vp); 648} 649 650void 651buf_setvnode(buf_t bp, vnode_t vp) { 652 653 bp->b_vp = vp; 654} 655 656 657void * 658buf_callback(buf_t bp) 659{ 660 if ( !(bp->b_flags & B_CALL) ) 661 return ((void *) NULL); 662 663 return ((void *)bp->b_iodone); 664} 665 666 667errno_t 668buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction) 669{ 670 if (callback) 671 bp->b_flags |= (B_CALL | B_ASYNC); 672 else 673 bp->b_flags &= ~B_CALL; 674 bp->b_transaction = transaction; 675 bp->b_iodone = callback; 676 677 return (0); 678} 679 680errno_t 681buf_setupl(buf_t bp, upl_t upl, uint32_t offset) 682{ 683 684 if ( !(bp->b_lflags & BL_IOBUF) ) 685 return (EINVAL); 686 687 if (upl) 688 bp->b_flags |= B_CLUSTER; 689 else 690 bp->b_flags &= ~B_CLUSTER; 691 bp->b_upl = upl; 692 bp->b_uploffset = offset; 693 694 return (0); 695} 696 697buf_t 698buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg) 
699{ 700 buf_t io_bp; 701 702 if (io_offset < 0 || io_size < 0) 703 return (NULL); 704 705 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount) 706 return (NULL); 707 708 if (bp->b_flags & B_CLUSTER) { 709 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK)) 710 return (NULL); 711 712 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount)) 713 return (NULL); 714 } 715 io_bp = alloc_io_buf(bp->b_vp, 0); 716 717 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA); 718 719 if (iodone) { 720 io_bp->b_transaction = arg; 721 io_bp->b_iodone = iodone; 722 io_bp->b_flags |= B_CALL; 723 } 724 if (bp->b_flags & B_CLUSTER) { 725 io_bp->b_upl = bp->b_upl; 726 io_bp->b_uploffset = bp->b_uploffset + io_offset; 727 } else { 728 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset); 729 } 730 io_bp->b_bcount = io_size; 731 732 return (io_bp); 733} 734 735 736int 737buf_shadow(buf_t bp) 738{ 739 if (bp->b_lflags & BL_SHADOW) 740 return 1; 741 return 0; 742} 743 744 745buf_t 746buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) 747{ 748 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1)); 749} 750 751buf_t 752buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) 753{ 754 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0)); 755} 756 757 758static buf_t 759buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv) 760{ 761 buf_t io_bp; 762 763 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0); 764 765 if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) { 766 767 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0); 768 return (NULL); 769 } 770#ifdef BUF_MAKE_PRIVATE 771 if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0) 772 panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref); 773#endif 774 io_bp = alloc_io_buf(bp->b_vp, priv); 775 776 io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA); 777 io_bp->b_blkno = bp->b_blkno; 778 io_bp->b_lblkno = bp->b_lblkno; 779 780 if (iodone) { 781 io_bp->b_transaction = arg; 782 io_bp->b_iodone = iodone; 783 io_bp->b_flags |= B_CALL; 784 } 785 if (force_copy == FALSE) { 786 io_bp->b_bcount = bp->b_bcount; 787 io_bp->b_bufsize = bp->b_bufsize; 788 789 if (external_storage) { 790 io_bp->b_datap = external_storage; 791#ifdef BUF_MAKE_PRIVATE 792 io_bp->b_data_store = NULL; 793#endif 794 } else { 795 io_bp->b_datap = bp->b_datap; 796#ifdef BUF_MAKE_PRIVATE 797 io_bp->b_data_store = bp; 798#endif 799 } 800 *(buf_t *)(&io_bp->b_orig) = bp; 801 802 lck_mtx_lock_spin(buf_mtxp); 803 804 io_bp->b_lflags |= BL_SHADOW; 805 io_bp->b_shadow = bp->b_shadow; 806 bp->b_shadow = io_bp; 807 bp->b_shadow_ref++; 808 809#ifdef BUF_MAKE_PRIVATE 810 if (external_storage) 811 io_bp->b_lflags |= BL_EXTERNAL; 812 else 813 bp->b_data_ref++; 814#endif 815 lck_mtx_unlock(buf_mtxp); 816 } else { 817 if (external_storage) { 818#ifdef BUF_MAKE_PRIVATE 819 io_bp->b_lflags |= BL_EXTERNAL; 820#endif 821 io_bp->b_bcount = bp->b_bcount; 822 io_bp->b_bufsize = bp->b_bufsize; 823 io_bp->b_datap = external_storage; 824 } else { 825 allocbuf(io_bp, bp->b_bcount); 826 827 
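			/*
			 * allocbuf() just gave this shadow its own private copy of the
			 * data; BL_IOBUF_ALLOC (set below) records that, presumably so
			 * the storage can be released again when the shadow buf is
			 * freed rather than being treated as borrowed storage.
			 */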
io_bp->b_lflags |= BL_IOBUF_ALLOC; 828 } 829 bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount); 830 831#ifdef BUF_MAKE_PRIVATE 832 io_bp->b_data_store = NULL; 833#endif 834 } 835 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0); 836 837 return (io_bp); 838} 839 840 841#ifdef BUF_MAKE_PRIVATE 842errno_t 843buf_make_private(buf_t bp) 844{ 845 buf_t ds_bp; 846 buf_t t_bp; 847 struct buf my_buf; 848 849 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0); 850 851 if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) { 852 853 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); 854 return (EINVAL); 855 } 856 my_buf.b_flags = B_META; 857 my_buf.b_datap = (uintptr_t)NULL; 858 allocbuf(&my_buf, bp->b_bcount); 859 860 bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount); 861 862 lck_mtx_lock_spin(buf_mtxp); 863 864 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { 865 if ( !ISSET(bp->b_lflags, BL_EXTERNAL)) 866 break; 867 } 868 ds_bp = t_bp; 869 870 if (ds_bp == NULL && bp->b_data_ref) 871 panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL"); 872 873 if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0)) 874 panic("buf_make_private: ref_count == 0 && ds_bp != NULL"); 875 876 if (ds_bp == NULL) { 877 lck_mtx_unlock(buf_mtxp); 878 879 buf_free_meta_store(&my_buf); 880 881 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); 882 return (EINVAL); 883 } 884 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { 885 if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL)) 886 t_bp->b_data_store = ds_bp; 887 } 888 ds_bp->b_data_ref = bp->b_data_ref; 889 890 bp->b_data_ref = 0; 891 bp->b_datap = my_buf.b_datap; 892 893 lck_mtx_unlock(buf_mtxp); 894 895 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0); 896 return (0); 897} 898#endif 899 900 901void 902buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction, 903 void (**old_iodone)(buf_t, void *), void **old_transaction) 904{ 905 if (old_iodone) 906 *old_iodone = bp->b_iodone; 907 if (old_transaction) 908 *old_transaction = bp->b_transaction; 909 910 bp->b_transaction = transaction; 911 bp->b_iodone = filter; 912 if (filter) 913 bp->b_flags |= B_FILTER; 914 else 915 bp->b_flags &= ~B_FILTER; 916} 917 918 919daddr64_t 920buf_blkno(buf_t bp) { 921 922 return (bp->b_blkno); 923} 924 925daddr64_t 926buf_lblkno(buf_t bp) { 927 928 return (bp->b_lblkno); 929} 930 931void 932buf_setblkno(buf_t bp, daddr64_t blkno) { 933 934 bp->b_blkno = blkno; 935} 936 937void 938buf_setlblkno(buf_t bp, daddr64_t lblkno) { 939 940 bp->b_lblkno = lblkno; 941} 942 943dev_t 944buf_device(buf_t bp) { 945 946 return (bp->b_dev); 947} 948 949errno_t 950buf_setdevice(buf_t bp, vnode_t vp) { 951 952 if ((vp->v_type != VBLK) && (vp->v_type != VCHR)) 953 return EINVAL; 954 bp->b_dev = vp->v_rdev; 955 956 return 0; 957} 958 959 960void * 961buf_drvdata(buf_t bp) { 962 963 return (bp->b_drvdata); 964} 965 966void 967buf_setdrvdata(buf_t bp, void *drvdata) { 968 969 bp->b_drvdata = drvdata; 970} 971 972void * 973buf_fsprivate(buf_t bp) { 974 975 return (bp->b_fsprivate); 976} 977 978void 979buf_setfsprivate(buf_t bp, void *fsprivate) { 980 981 bp->b_fsprivate = fsprivate; 982} 983 984kauth_cred_t 985buf_rcred(buf_t bp) { 986 987 return (bp->b_rcred); 988} 989 990kauth_cred_t 991buf_wcred(buf_t bp) { 992 993 return (bp->b_wcred); 994} 995 996void * 997buf_upl(buf_t bp) { 998 999 return 
(bp->b_upl); 1000} 1001 1002uint32_t 1003buf_uploffset(buf_t bp) { 1004 1005 return ((uint32_t)(bp->b_uploffset)); 1006} 1007 1008proc_t 1009buf_proc(buf_t bp) { 1010 1011 return (bp->b_proc); 1012} 1013 1014 1015errno_t 1016buf_map(buf_t bp, caddr_t *io_addr) 1017{ 1018 buf_t real_bp; 1019 vm_offset_t vaddr; 1020 kern_return_t kret; 1021 1022 if ( !(bp->b_flags & B_CLUSTER)) { 1023 *io_addr = (caddr_t)bp->b_datap; 1024 return (0); 1025 } 1026 real_bp = (buf_t)(bp->b_real_bp); 1027 1028 if (real_bp && real_bp->b_datap) { 1029 /* 1030 * b_real_bp is only valid if B_CLUSTER is SET 1031 * if it's non-zero, than someone did a cluster_bp call 1032 * if the backing physical pages were already mapped 1033 * in before the call to cluster_bp (non-zero b_datap), 1034 * than we just use that mapping 1035 */ 1036 *io_addr = (caddr_t)real_bp->b_datap; 1037 return (0); 1038 } 1039 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */ 1040 1041 if (kret != KERN_SUCCESS) { 1042 *io_addr = NULL; 1043 1044 return(ENOMEM); 1045 } 1046 vaddr += bp->b_uploffset; 1047 1048 *io_addr = (caddr_t)vaddr; 1049 1050 return (0); 1051} 1052 1053errno_t 1054buf_unmap(buf_t bp) 1055{ 1056 buf_t real_bp; 1057 kern_return_t kret; 1058 1059 if ( !(bp->b_flags & B_CLUSTER)) 1060 return (0); 1061 /* 1062 * see buf_map for the explanation 1063 */ 1064 real_bp = (buf_t)(bp->b_real_bp); 1065 1066 if (real_bp && real_bp->b_datap) 1067 return (0); 1068 1069 if ((bp->b_lflags & BL_IOBUF) && 1070 ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) { 1071 /* 1072 * ignore pageins... the 'right' thing will 1073 * happen due to the way we handle speculative 1074 * clusters... 1075 * 1076 * when we commit these pages, we'll hit 1077 * it with UPL_COMMIT_INACTIVE which 1078 * will clear the reference bit that got 1079 * turned on when we touched the mapping 1080 */ 1081 bp->b_flags |= B_AGE; 1082 } 1083 kret = ubc_upl_unmap(bp->b_upl); 1084 1085 if (kret != KERN_SUCCESS) 1086 return (EINVAL); 1087 return (0); 1088} 1089 1090 1091void 1092buf_clear(buf_t bp) { 1093 caddr_t baddr; 1094 1095 if (buf_map(bp, &baddr) == 0) { 1096 bzero(baddr, bp->b_bcount); 1097 buf_unmap(bp); 1098 } 1099 bp->b_resid = 0; 1100} 1101 1102/* 1103 * Read or write a buffer that is not contiguous on disk. 1104 * buffer is marked done/error at the conclusion 1105 */ 1106static int 1107buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes) 1108{ 1109 vnode_t vp = buf_vnode(bp); 1110 buf_t io_bp; /* For reading or writing a single block */ 1111 int io_direction; 1112 int io_resid; 1113 size_t io_contig_bytes; 1114 daddr64_t io_blkno; 1115 int error = 0; 1116 int bmap_flags; 1117 1118 /* 1119 * save our starting point... the bp was already mapped 1120 * in buf_strategy before we got called 1121 * no sense doing it again. 1122 */ 1123 io_blkno = bp->b_blkno; 1124 /* 1125 * Make sure we redo this mapping for the next I/O 1126 * i.e. 
this can never be a 'permanent' mapping
	 */
	bp->b_blkno = bp->b_lblkno;

	/*
	 * Get an io buffer to do the deblocking
	 */
	io_bp = alloc_io_buf(devvp, 0);

	io_bp->b_lblkno = bp->b_lblkno;
	io_bp->b_datap = bp->b_datap;
	io_resid = bp->b_bcount;
	io_direction = bp->b_flags & B_READ;
	io_contig_bytes = contig_bytes;

	if (bp->b_flags & B_READ)
		bmap_flags = VNODE_READ;
	else
		bmap_flags = VNODE_WRITE;

	for (;;) {
		if (io_blkno == -1)
			/*
			 * this is unexpected, but we'll allow for it
			 */
			bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
		else {
			io_bp->b_bcount = io_contig_bytes;
			io_bp->b_bufsize = io_contig_bytes;
			io_bp->b_resid = io_contig_bytes;
			io_bp->b_blkno = io_blkno;

			buf_reset(io_bp, io_direction);

			/*
			 * Call the device to do the I/O and wait for it.
			 * Make sure the appropriate party is charged for the write.
			 */

			if (!ISSET(bp->b_flags, B_READ))
				OSAddAtomic(1, &devvp->v_numoutput);

			if ((error = VNOP_STRATEGY(io_bp)))
				break;
			if ((error = (int)buf_biowait(io_bp)))
				break;
			if (io_bp->b_resid) {
				io_resid -= (io_contig_bytes - io_bp->b_resid);
				break;
			}
		}
		if ((io_resid -= io_contig_bytes) == 0)
			break;
		f_offset += io_contig_bytes;
		io_bp->b_datap += io_contig_bytes;

		/*
		 * Map the current position to a physical block number
		 */
		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
			break;
	}
	buf_free(io_bp);

	if (error)
		buf_seterror(bp, error);
	bp->b_resid = io_resid;
	/*
	 * This I/O is now complete
	 */
	buf_biodone(bp);

	return error;
}


/*
 * struct vnop_strategy_args {
 *      struct buf *a_bp;
 * } *ap;
 */
errno_t
buf_strategy(vnode_t devvp, void *ap)
{
	buf_t	bp = ((struct vnop_strategy_args *)ap)->a_bp;
	vnode_t	vp = bp->b_vp;
	int	bmap_flags;
	errno_t error;
#if CONFIG_DTRACE
	int dtrace_io_start_flag = 0;	/* We only want to trip the io:::start
					 * probe once, with the true physical
					 * block in place (b_blkno)
					 */

#endif

	if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
		panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
	/*
	 * associate the physical device with
	 * this buf_t even if we don't
	 * end up issuing the I/O...
1227 */ 1228 bp->b_dev = devvp->v_rdev; 1229 1230 if (bp->b_flags & B_READ) 1231 bmap_flags = VNODE_READ; 1232 else 1233 bmap_flags = VNODE_WRITE; 1234 1235 if ( !(bp->b_flags & B_CLUSTER)) { 1236 1237 if ( (bp->b_upl) ) { 1238 /* 1239 * we have a UPL associated with this bp 1240 * go through cluster_bp which knows how 1241 * to deal with filesystem block sizes 1242 * that aren't equal to the page size 1243 */ 1244 DTRACE_IO1(start, buf_t, bp); 1245 return (cluster_bp(bp)); 1246 } 1247 if (bp->b_blkno == bp->b_lblkno) { 1248 off_t f_offset; 1249 size_t contig_bytes; 1250 1251 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) { 1252 DTRACE_IO1(start, buf_t, bp); 1253 buf_seterror(bp, error); 1254 buf_biodone(bp); 1255 1256 return (error); 1257 } 1258 1259 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) { 1260 DTRACE_IO1(start, buf_t, bp); 1261 buf_seterror(bp, error); 1262 buf_biodone(bp); 1263 1264 return (error); 1265 } 1266 1267 DTRACE_IO1(start, buf_t, bp); 1268#if CONFIG_DTRACE 1269 dtrace_io_start_flag = 1; 1270#endif /* CONFIG_DTRACE */ 1271 1272 if ((bp->b_blkno == -1) || (contig_bytes == 0)) { 1273 /* Set block number to force biodone later */ 1274 bp->b_blkno = -1; 1275 buf_clear(bp); 1276 } 1277 else if ((long)contig_bytes < bp->b_bcount) { 1278 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes)); 1279 } 1280 } 1281 1282#if CONFIG_DTRACE 1283 if (dtrace_io_start_flag == 0) { 1284 DTRACE_IO1(start, buf_t, bp); 1285 dtrace_io_start_flag = 1; 1286 } 1287#endif /* CONFIG_DTRACE */ 1288 1289 if (bp->b_blkno == -1) { 1290 buf_biodone(bp); 1291 return (0); 1292 } 1293 } 1294 1295#if CONFIG_DTRACE 1296 if (dtrace_io_start_flag == 0) 1297 DTRACE_IO1(start, buf_t, bp); 1298#endif /* CONFIG_DTRACE */ 1299 1300#if CONFIG_PROTECT 1301 /* Capture f_offset in the bufattr*/ 1302 if (bp->b_attr.ba_cpentry != 0) { 1303 /* No need to go here for older EAs */ 1304 if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) { 1305 off_t f_offset; 1306 if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset))) 1307 return error; 1308 1309 /* 1310 * Attach the file offset to this buffer. The 1311 * bufattr attributes will be passed down the stack 1312 * until they reach IOFlashStorage. IOFlashStorage 1313 * will retain the offset in a local variable when it 1314 * issues its I/Os to the NAND controller. 1315 * 1316 * Note that LwVM may end up splitting this I/O 1317 * into sub-I/Os if it crosses a chunk boundary. In this 1318 * case, LwVM will update this field when it dispatches 1319 * each I/O to IOFlashStorage. But from our perspective 1320 * we have only issued a single I/O. 1321 */ 1322 bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset); 1323 } 1324 } 1325#endif 1326 1327 /* 1328 * we can issue the I/O because... 1329 * either B_CLUSTER is set which 1330 * means that the I/O is properly set 1331 * up to be a multiple of the page size, or 1332 * we were able to successfully set up the 1333 * physical block mapping 1334 */ 1335 error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap); 1336 DTRACE_FSINFO(strategy, vnode_t, vp); 1337 return (error); 1338} 1339 1340 1341 1342buf_t 1343buf_alloc(vnode_t vp) 1344{ 1345 return(alloc_io_buf(vp, 0)); 1346} 1347 1348void 1349buf_free(buf_t bp) { 1350 1351 free_io_buf(bp); 1352} 1353 1354 1355/* 1356 * iterate buffers for the specified vp. 
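 *
 * a minimal usage sketch (count_dirty and cnt are hypothetical and not part
 * of this file; buf_iterate() and the BUF_* constants are the real interface):
 *
 *	static int
 *	count_dirty(buf_t bp, void *arg)
 *	{
 *		if (bp == NULL)
 *			return (BUF_RETURNED);	-- busy buf reported via BUF_NOTIFY_BUSY
 *		(*(int *)arg)++;
 *		return (BUF_RETURNED);		-- buf_iterate will buf_brelse() it
 *	}
 *
 *	int cnt = 0;
 *	buf_iterate(vp, count_dirty, BUF_SCAN_DIRTY | BUF_NOTIFY_BUSY, &cnt);
 *
 * the callout returns BUF_RETURNED, BUF_CLAIMED, BUF_RETURNED_DONE or
 * BUF_CLAIMED_DONE; the *_DONE forms stop the iteration, and the CLAIMED
 * forms mean the callout has taken responsibility for releasing the buf.
 *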
1357 * if BUF_SCAN_DIRTY is set, do the dirty list 1358 * if BUF_SCAN_CLEAN is set, do the clean list 1359 * if neither flag is set, default to BUF_SCAN_DIRTY 1360 * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages 1361 */ 1362 1363struct buf_iterate_info_t { 1364 int flag; 1365 struct buflists *listhead; 1366}; 1367 1368void 1369buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) 1370{ 1371 buf_t bp; 1372 int retval; 1373 struct buflists local_iterblkhd; 1374 int lock_flags = BAC_NOWAIT | BAC_REMOVE; 1375 int notify_busy = flags & BUF_NOTIFY_BUSY; 1376 struct buf_iterate_info_t list[2]; 1377 int num_lists, i; 1378 1379 if (flags & BUF_SKIP_LOCKED) 1380 lock_flags |= BAC_SKIP_LOCKED; 1381 if (flags & BUF_SKIP_NONLOCKED) 1382 lock_flags |= BAC_SKIP_NONLOCKED; 1383 1384 if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN))) 1385 flags |= BUF_SCAN_DIRTY; 1386 1387 num_lists = 0; 1388 1389 if (flags & BUF_SCAN_DIRTY) { 1390 list[num_lists].flag = VBI_DIRTY; 1391 list[num_lists].listhead = &vp->v_dirtyblkhd; 1392 num_lists++; 1393 } 1394 if (flags & BUF_SCAN_CLEAN) { 1395 list[num_lists].flag = VBI_CLEAN; 1396 list[num_lists].listhead = &vp->v_cleanblkhd; 1397 num_lists++; 1398 } 1399 1400 for (i = 0; i < num_lists; i++) { 1401 lck_mtx_lock(buf_mtxp); 1402 1403 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) { 1404 lck_mtx_unlock(buf_mtxp); 1405 continue; 1406 } 1407 while (!LIST_EMPTY(&local_iterblkhd)) { 1408 bp = LIST_FIRST(&local_iterblkhd); 1409 LIST_REMOVE(bp, b_vnbufs); 1410 LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs); 1411 1412 if (buf_acquire_locked(bp, lock_flags, 0, 0)) { 1413 if (notify_busy) { 1414 bp = NULL; 1415 } else { 1416 continue; 1417 } 1418 } 1419 1420 lck_mtx_unlock(buf_mtxp); 1421 1422 retval = callout(bp, arg); 1423 1424 switch (retval) { 1425 case BUF_RETURNED: 1426 if (bp) 1427 buf_brelse(bp); 1428 break; 1429 case BUF_CLAIMED: 1430 break; 1431 case BUF_RETURNED_DONE: 1432 if (bp) 1433 buf_brelse(bp); 1434 lck_mtx_lock(buf_mtxp); 1435 goto out; 1436 case BUF_CLAIMED_DONE: 1437 lck_mtx_lock(buf_mtxp); 1438 goto out; 1439 } 1440 lck_mtx_lock(buf_mtxp); 1441 } /* while list has more nodes */ 1442 out: 1443 buf_itercomplete(vp, &local_iterblkhd, list[i].flag); 1444 lck_mtx_unlock(buf_mtxp); 1445 } /* for each list */ 1446} /* buf_iterate */ 1447 1448 1449/* 1450 * Flush out and invalidate all buffers associated with a vnode. 1451 */ 1452int 1453buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) 1454{ 1455 buf_t bp; 1456 int aflags; 1457 int error = 0; 1458 int must_rescan = 1; 1459 struct buflists local_iterblkhd; 1460 1461 1462 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) 1463 return (0); 1464 1465 lck_mtx_lock(buf_mtxp); 1466 1467 for (;;) { 1468 if (must_rescan == 0) 1469 /* 1470 * the lists may not be empty, but all that's left at this 1471 * point are metadata or B_LOCKED buffers which are being 1472 * skipped... we know this because we made it through both 1473 * the clean and dirty lists without dropping buf_mtxp... 
1474 * each time we drop buf_mtxp we bump "must_rescan" 1475 */ 1476 break; 1477 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) 1478 break; 1479 must_rescan = 0; 1480 /* 1481 * iterate the clean list 1482 */ 1483 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) { 1484 goto try_dirty_list; 1485 } 1486 while (!LIST_EMPTY(&local_iterblkhd)) { 1487 1488 bp = LIST_FIRST(&local_iterblkhd); 1489 1490 LIST_REMOVE(bp, b_vnbufs); 1491 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs); 1492 1493 /* 1494 * some filesystems distinguish meta data blocks with a negative logical block # 1495 */ 1496 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) 1497 continue; 1498 1499 aflags = BAC_REMOVE; 1500 1501 if ( !(flags & BUF_INVALIDATE_LOCKED) ) 1502 aflags |= BAC_SKIP_LOCKED; 1503 1504 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) { 1505 if (error == EDEADLK) 1506 /* 1507 * this buffer was marked B_LOCKED... 1508 * we didn't drop buf_mtxp, so we 1509 * we don't need to rescan 1510 */ 1511 continue; 1512 if (error == EAGAIN) { 1513 /* 1514 * found a busy buffer... we blocked and 1515 * dropped buf_mtxp, so we're going to 1516 * need to rescan after this pass is completed 1517 */ 1518 must_rescan++; 1519 continue; 1520 } 1521 /* 1522 * got some kind of 'real' error out of the msleep 1523 * in buf_acquire_locked, terminate the scan and return the error 1524 */ 1525 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN); 1526 1527 lck_mtx_unlock(buf_mtxp); 1528 return (error); 1529 } 1530 lck_mtx_unlock(buf_mtxp); 1531 1532 if (bp->b_flags & B_LOCKED) 1533 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0); 1534 1535 CLR(bp->b_flags, B_LOCKED); 1536 SET(bp->b_flags, B_INVAL); 1537 buf_brelse(bp); 1538 1539 lck_mtx_lock(buf_mtxp); 1540 1541 /* 1542 * by dropping buf_mtxp, we allow new 1543 * buffers to be added to the vnode list(s) 1544 * we'll have to rescan at least once more 1545 * if the queues aren't empty 1546 */ 1547 must_rescan++; 1548 } 1549 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN); 1550 1551try_dirty_list: 1552 /* 1553 * Now iterate on dirty blks 1554 */ 1555 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) { 1556 continue; 1557 } 1558 while (!LIST_EMPTY(&local_iterblkhd)) { 1559 bp = LIST_FIRST(&local_iterblkhd); 1560 1561 LIST_REMOVE(bp, b_vnbufs); 1562 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); 1563 1564 /* 1565 * some filesystems distinguish meta data blocks with a negative logical block # 1566 */ 1567 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) 1568 continue; 1569 1570 aflags = BAC_REMOVE; 1571 1572 if ( !(flags & BUF_INVALIDATE_LOCKED) ) 1573 aflags |= BAC_SKIP_LOCKED; 1574 1575 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) { 1576 if (error == EDEADLK) 1577 /* 1578 * this buffer was marked B_LOCKED... 1579 * we didn't drop buf_mtxp, so we 1580 * we don't need to rescan 1581 */ 1582 continue; 1583 if (error == EAGAIN) { 1584 /* 1585 * found a busy buffer... 
we blocked and 1586 * dropped buf_mtxp, so we're going to 1587 * need to rescan after this pass is completed 1588 */ 1589 must_rescan++; 1590 continue; 1591 } 1592 /* 1593 * got some kind of 'real' error out of the msleep 1594 * in buf_acquire_locked, terminate the scan and return the error 1595 */ 1596 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); 1597 1598 lck_mtx_unlock(buf_mtxp); 1599 return (error); 1600 } 1601 lck_mtx_unlock(buf_mtxp); 1602 1603 if (bp->b_flags & B_LOCKED) 1604 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0); 1605 1606 CLR(bp->b_flags, B_LOCKED); 1607 SET(bp->b_flags, B_INVAL); 1608 1609 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA)) 1610 (void) VNOP_BWRITE(bp); 1611 else 1612 buf_brelse(bp); 1613 1614 lck_mtx_lock(buf_mtxp); 1615 /* 1616 * by dropping buf_mtxp, we allow new 1617 * buffers to be added to the vnode list(s) 1618 * we'll have to rescan at least once more 1619 * if the queues aren't empty 1620 */ 1621 must_rescan++; 1622 } 1623 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); 1624 } 1625 lck_mtx_unlock(buf_mtxp); 1626 1627 return (0); 1628} 1629 1630void 1631buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) { 1632 1633 (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg); 1634 return; 1635} 1636 1637int 1638buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) { 1639 buf_t bp; 1640 int writes_issued = 0; 1641 errno_t error; 1642 int busy = 0; 1643 struct buflists local_iterblkhd; 1644 int lock_flags = BAC_NOWAIT | BAC_REMOVE; 1645 int any_locked = 0; 1646 1647 if (flags & BUF_SKIP_LOCKED) 1648 lock_flags |= BAC_SKIP_LOCKED; 1649 if (flags & BUF_SKIP_NONLOCKED) 1650 lock_flags |= BAC_SKIP_NONLOCKED; 1651loop: 1652 lck_mtx_lock(buf_mtxp); 1653 1654 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) { 1655 while (!LIST_EMPTY(&local_iterblkhd)) { 1656 bp = LIST_FIRST(&local_iterblkhd); 1657 LIST_REMOVE(bp, b_vnbufs); 1658 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); 1659 1660 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) { 1661 busy++; 1662 } 1663 if (error) { 1664 /* 1665 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED, 1666 * we may want to do somethign differently if a locked or unlocked 1667 * buffer was encountered (depending on the arg specified). 1668 * In this case, we know that one of those two was set, and the 1669 * buf acquisition failed above. 1670 * 1671 * If it failed with EDEADLK, then save state which can be emitted 1672 * later on to the caller. Most callers should not care. 1673 */ 1674 if (error == EDEADLK) { 1675 any_locked++; 1676 } 1677 continue; 1678 } 1679 lck_mtx_unlock(buf_mtxp); 1680 1681 bp->b_flags &= ~B_LOCKED; 1682 1683 /* 1684 * Wait for I/O associated with indirect blocks to complete, 1685 * since there is no way to quickly wait for them below. 1686 */ 1687 if ((bp->b_vp == vp) || (wait == 0)) 1688 (void) buf_bawrite(bp); 1689 else 1690 (void) VNOP_BWRITE(bp); 1691 writes_issued++; 1692 1693 lck_mtx_lock(buf_mtxp); 1694 } 1695 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); 1696 } 1697 lck_mtx_unlock(buf_mtxp); 1698 1699 if (wait) { 1700 (void)vnode_waitforwrites(vp, 0, 0, 0, msg); 1701 1702 if (vp->v_dirtyblkhd.lh_first && busy) { 1703 /* 1704 * we had one or more BUSY buffers on 1705 * the dirtyblock list... most likely 1706 * these are due to delayed writes that 1707 * were moved to the bclean queue but 1708 * have not yet been 'written'. 
1709 * if we issued some writes on the 1710 * previous pass, we try again immediately 1711 * if we didn't, we'll sleep for some time 1712 * to allow the state to change... 1713 */ 1714 if (writes_issued == 0) { 1715 (void)tsleep((caddr_t)&vp->v_numoutput, 1716 PRIBIO + 1, "vnode_flushdirtyblks", hz/20); 1717 } 1718 writes_issued = 0; 1719 busy = 0; 1720 1721 goto loop; 1722 } 1723 } 1724 1725 return any_locked; 1726} 1727 1728 1729/* 1730 * called with buf_mtxp held... 1731 * this lock protects the queue manipulation 1732 */ 1733static int 1734buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags) 1735{ 1736 struct buflists * listheadp; 1737 1738 if (flags & VBI_DIRTY) 1739 listheadp = &vp->v_dirtyblkhd; 1740 else 1741 listheadp = &vp->v_cleanblkhd; 1742 1743 while (vp->v_iterblkflags & VBI_ITER) { 1744 vp->v_iterblkflags |= VBI_ITERWANT; 1745 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL); 1746 } 1747 if (LIST_EMPTY(listheadp)) { 1748 LIST_INIT(iterheadp); 1749 return(EINVAL); 1750 } 1751 vp->v_iterblkflags |= VBI_ITER; 1752 1753 iterheadp->lh_first = listheadp->lh_first; 1754 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first; 1755 LIST_INIT(listheadp); 1756 1757 return(0); 1758} 1759 1760/* 1761 * called with buf_mtxp held... 1762 * this lock protects the queue manipulation 1763 */ 1764static void 1765buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags) 1766{ 1767 struct buflists * listheadp; 1768 buf_t bp; 1769 1770 if (flags & VBI_DIRTY) 1771 listheadp = &vp->v_dirtyblkhd; 1772 else 1773 listheadp = &vp->v_cleanblkhd; 1774 1775 while (!LIST_EMPTY(iterheadp)) { 1776 bp = LIST_FIRST(iterheadp); 1777 LIST_REMOVE(bp, b_vnbufs); 1778 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs); 1779 } 1780 vp->v_iterblkflags &= ~VBI_ITER; 1781 1782 if (vp->v_iterblkflags & VBI_ITERWANT) { 1783 vp->v_iterblkflags &= ~VBI_ITERWANT; 1784 wakeup(&vp->v_iterblkflags); 1785 } 1786} 1787 1788 1789static void 1790bremfree_locked(buf_t bp) 1791{ 1792 struct bqueues *dp = NULL; 1793 int whichq; 1794 1795 whichq = bp->b_whichq; 1796 1797 if (whichq == -1) { 1798 if (bp->b_shadow_ref == 0) 1799 panic("bremfree_locked: %p not on freelist", bp); 1800 /* 1801 * there are clones pointing to 'bp'... 1802 * therefore, it was not put on a freelist 1803 * when buf_brelse was last called on 'bp' 1804 */ 1805 return; 1806 } 1807 /* 1808 * We only calculate the head of the freelist when removing 1809 * the last element of the list as that is the only time that 1810 * it is needed (e.g. to reset the tail pointer). 1811 * 1812 * NB: This makes an assumption about how tailq's are implemented. 1813 */ 1814 if (bp->b_freelist.tqe_next == NULL) { 1815 dp = &bufqueues[whichq]; 1816 1817 if (dp->tqh_last != &bp->b_freelist.tqe_next) 1818 panic("bremfree: lost tail"); 1819 } 1820 TAILQ_REMOVE(dp, bp, b_freelist); 1821 1822#if BALANCE_QUEUES 1823 bufqdec(whichq); 1824#endif 1825 if (whichq == BQ_LAUNDRY) 1826 blaundrycnt--; 1827 1828 bp->b_whichq = -1; 1829 bp->b_timestamp = 0; 1830 bp->b_shadow = 0; 1831} 1832 1833/* 1834 * Associate a buffer with a vnode. 1835 * buf_mtxp must be locked on entry 1836 */ 1837static void 1838bgetvp_locked(vnode_t vp, buf_t bp) 1839{ 1840 1841 if (bp->b_vp != vp) 1842 panic("bgetvp_locked: not free"); 1843 1844 if (vp->v_type == VBLK || vp->v_type == VCHR) 1845 bp->b_dev = vp->v_rdev; 1846 else 1847 bp->b_dev = NODEV; 1848 /* 1849 * Insert onto list for new vnode. 
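 * (a newly associated buf always starts out on the vnode's clean list;
 *  buf_reassign() moves it to the dirty list if it is later marked B_DELWRI)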
1850 */ 1851 bufinsvn(bp, &vp->v_cleanblkhd); 1852} 1853 1854/* 1855 * Disassociate a buffer from a vnode. 1856 * buf_mtxp must be locked on entry 1857 */ 1858static void 1859brelvp_locked(buf_t bp) 1860{ 1861 /* 1862 * Delete from old vnode list, if on one. 1863 */ 1864 if (bp->b_vnbufs.le_next != NOLIST) 1865 bufremvn(bp); 1866 1867 bp->b_vp = (vnode_t)NULL; 1868} 1869 1870/* 1871 * Reassign a buffer from one vnode to another. 1872 * Used to assign file specific control information 1873 * (indirect blocks) to the vnode to which they belong. 1874 */ 1875static void 1876buf_reassign(buf_t bp, vnode_t newvp) 1877{ 1878 struct buflists *listheadp; 1879 1880 if (newvp == NULL) { 1881 printf("buf_reassign: NULL"); 1882 return; 1883 } 1884 lck_mtx_lock_spin(buf_mtxp); 1885 1886 /* 1887 * Delete from old vnode list, if on one. 1888 */ 1889 if (bp->b_vnbufs.le_next != NOLIST) 1890 bufremvn(bp); 1891 /* 1892 * If dirty, put on list of dirty buffers; 1893 * otherwise insert onto list of clean buffers. 1894 */ 1895 if (ISSET(bp->b_flags, B_DELWRI)) 1896 listheadp = &newvp->v_dirtyblkhd; 1897 else 1898 listheadp = &newvp->v_cleanblkhd; 1899 bufinsvn(bp, listheadp); 1900 1901 lck_mtx_unlock(buf_mtxp); 1902} 1903 1904static __inline__ void 1905bufhdrinit(buf_t bp) 1906{ 1907 bzero((char *)bp, sizeof *bp); 1908 bp->b_dev = NODEV; 1909 bp->b_rcred = NOCRED; 1910 bp->b_wcred = NOCRED; 1911 bp->b_vnbufs.le_next = NOLIST; 1912 bp->b_flags = B_INVAL; 1913 1914 return; 1915} 1916 1917/* 1918 * Initialize buffers and hash links for buffers. 1919 */ 1920__private_extern__ void 1921bufinit(void) 1922{ 1923 buf_t bp; 1924 struct bqueues *dp; 1925 int i; 1926 1927 nbuf_headers = 0; 1928 /* Initialize the buffer queues ('freelists') and the hash table */ 1929 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) 1930 TAILQ_INIT(dp); 1931 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash); 1932 1933 buf_busycount = 0; 1934 1935 /* Initialize the buffer headers */ 1936 for (i = 0; i < max_nbuf_headers; i++) { 1937 nbuf_headers++; 1938 bp = &buf_headers[i]; 1939 bufhdrinit(bp); 1940 1941 BLISTNONE(bp); 1942 dp = &bufqueues[BQ_EMPTY]; 1943 bp->b_whichq = BQ_EMPTY; 1944 bp->b_timestamp = buf_timestamp(); 1945 binsheadfree(bp, dp, BQ_EMPTY); 1946 binshash(bp, &invalhash); 1947 } 1948 boot_nbuf_headers = nbuf_headers; 1949 1950 TAILQ_INIT(&iobufqueue); 1951 TAILQ_INIT(&delaybufqueue); 1952 1953 for (; i < nbuf_headers + niobuf_headers; i++) { 1954 bp = &buf_headers[i]; 1955 bufhdrinit(bp); 1956 bp->b_whichq = -1; 1957 binsheadfree(bp, &iobufqueue, -1); 1958 } 1959 1960 /* 1961 * allocate lock group attribute and group 1962 */ 1963 buf_mtx_grp_attr = lck_grp_attr_alloc_init(); 1964 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr); 1965 1966 /* 1967 * allocate the lock attribute 1968 */ 1969 buf_mtx_attr = lck_attr_alloc_init(); 1970 1971 /* 1972 * allocate and initialize mutex's for the buffer and iobuffer pools 1973 */ 1974 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr); 1975 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr); 1976 1977 if (iobuffer_mtxp == NULL) 1978 panic("couldn't create iobuffer mutex"); 1979 1980 if (buf_mtxp == NULL) 1981 panic("couldn't create buf mutex"); 1982 1983 /* 1984 * allocate and initialize cluster specific global locks... 
1985 */ 1986 cluster_init(); 1987 1988 printf("using %d buffer headers and %d cluster IO buffer headers\n", 1989 nbuf_headers, niobuf_headers); 1990 1991 /* Set up zones used by the buffer cache */ 1992 bufzoneinit(); 1993 1994 /* start the bcleanbuf() thread */ 1995 bcleanbuf_thread_init(); 1996 1997 /* Register a callout for relieving vm pressure */ 1998 if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) { 1999 panic("Couldn't register buffer cache callout for vm pressure!\n"); 2000 } 2001 2002#if BALANCE_QUEUES 2003 { 2004 static void bufq_balance_thread_init(void); 2005 /* create a thread to do dynamic buffer queue balancing */ 2006 bufq_balance_thread_init(); 2007 } 2008#endif /* notyet */ 2009} 2010 2011 2012 2013/* 2014 * Zones for the meta data buffers 2015 */ 2016 2017#define MINMETA 512 2018#define MAXMETA 8192 2019 2020struct meta_zone_entry { 2021 zone_t mz_zone; 2022 vm_size_t mz_size; 2023 vm_size_t mz_max; 2024 const char *mz_name; 2025}; 2026 2027struct meta_zone_entry meta_zones[] = { 2028 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" }, 2029 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" }, 2030 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" }, 2031 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" }, 2032 {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" }, 2033 {NULL, 0, 0, "" } /* End */ 2034}; 2035 2036/* 2037 * Initialize the meta data zones 2038 */ 2039static void 2040bufzoneinit(void) 2041{ 2042 int i; 2043 2044 for (i = 0; meta_zones[i].mz_size != 0; i++) { 2045 meta_zones[i].mz_zone = 2046 zinit(meta_zones[i].mz_size, 2047 meta_zones[i].mz_max, 2048 PAGE_SIZE, 2049 meta_zones[i].mz_name); 2050 zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE); 2051 } 2052 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers"); 2053 zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE); 2054} 2055 2056static __inline__ zone_t 2057getbufzone(size_t size) 2058{ 2059 int i; 2060 2061 if ((size % 512) || (size < MINMETA) || (size > MAXMETA)) 2062 panic("getbufzone: incorect size = %lu", size); 2063 2064 for (i = 0; meta_zones[i].mz_size != 0; i++) { 2065 if (meta_zones[i].mz_size >= size) 2066 break; 2067 } 2068 2069 return (meta_zones[i].mz_zone); 2070} 2071 2072 2073 2074static struct buf * 2075bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype) 2076{ 2077 buf_t bp; 2078 2079 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype); 2080 2081 /* 2082 * If buffer does not have data valid, start a read. 2083 * Note that if buffer is B_INVAL, buf_getblk() won't return it. 2084 * Therefore, it's valid if it's I/O has completed or been delayed. 2085 */ 2086 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) { 2087 struct proc *p; 2088 2089 p = current_proc(); 2090 2091 /* Start I/O for the buffer (keeping credentials). */ 2092 SET(bp->b_flags, B_READ | async); 2093 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) { 2094 kauth_cred_ref(cred); 2095 bp->b_rcred = cred; 2096 } 2097 2098 VNOP_STRATEGY(bp); 2099 2100 trace(TR_BREADMISS, pack(vp, size), blkno); 2101 2102 /* Pay for the read. 
*/ 2103 if (p && p->p_stats) { 2104 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */ 2105 OSAddAtomic64(size, &p->p_stats->ri_diskiobytes.ri_bytesread); 2106 } 2107 2108 if (async) { 2109 /* 2110 * since we asked for an ASYNC I/O 2111 * the biodone will do the brelse 2112 * we don't want to pass back a bp 2113 * that we don't 'own' 2114 */ 2115 bp = NULL; 2116 } 2117 } else if (async) { 2118 buf_brelse(bp); 2119 bp = NULL; 2120 } 2121 2122 trace(TR_BREADHIT, pack(vp, size), blkno); 2123 2124 return (bp); 2125} 2126 2127/* 2128 * Perform the reads for buf_breadn() and buf_meta_breadn(). 2129 * Trivial modification to the breada algorithm presented in Bach (p.55). 2130 */ 2131static errno_t 2132do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, 2133 int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype) 2134{ 2135 buf_t bp; 2136 int i; 2137 2138 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype); 2139 2140 /* 2141 * For each of the read-ahead blocks, start a read, if necessary. 2142 */ 2143 for (i = 0; i < nrablks; i++) { 2144 /* If it's in the cache, just go on to next one. */ 2145 if (incore(vp, rablks[i])) 2146 continue; 2147 2148 /* Get a buffer for the read-ahead block */ 2149 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype); 2150 } 2151 2152 /* Otherwise, we had to start a read for it; wait until it's valid. */ 2153 return (buf_biowait(bp)); 2154} 2155 2156 2157/* 2158 * Read a disk block. 2159 * This algorithm described in Bach (p.54). 2160 */ 2161errno_t 2162buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp) 2163{ 2164 buf_t bp; 2165 2166 /* Get buffer for block. */ 2167 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ); 2168 2169 /* Wait for the read to complete, and return result. */ 2170 return (buf_biowait(bp)); 2171} 2172 2173/* 2174 * Read a disk block. [bread() for meta-data] 2175 * This algorithm described in Bach (p.54). 2176 */ 2177errno_t 2178buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp) 2179{ 2180 buf_t bp; 2181 2182 /* Get buffer for block. */ 2183 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META); 2184 2185 /* Wait for the read to complete, and return result. */ 2186 return (buf_biowait(bp)); 2187} 2188 2189/* 2190 * Read-ahead multiple disk blocks. The first is sync, the rest async. 2191 */ 2192errno_t 2193buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp) 2194{ 2195 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ)); 2196} 2197 2198/* 2199 * Read-ahead multiple disk blocks. The first is sync, the rest async. 2200 * [buf_breadn() for meta-data] 2201 */ 2202errno_t 2203buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp) 2204{ 2205 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META)); 2206} 2207 2208/* 2209 * Block write. Described in Bach (p.56) 2210 */ 2211errno_t 2212buf_bwrite(buf_t bp) 2213{ 2214 int sync, wasdelayed; 2215 errno_t rv; 2216 proc_t p = current_proc(); 2217 vnode_t vp = bp->b_vp; 2218 2219 if (bp->b_datap == 0) { 2220 if (brecover_data(bp) == 0) 2221 return (0); 2222 } 2223 /* Remember buffer type, to switch on it later. 
*/ 2224 sync = !ISSET(bp->b_flags, B_ASYNC); 2225 wasdelayed = ISSET(bp->b_flags, B_DELWRI); 2226 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI)); 2227 2228 if (wasdelayed) 2229 OSAddAtomicLong(-1, &nbdwrite); 2230 2231 if (!sync) { 2232 /* 2233 * If not synchronous, pay for the I/O operation and make 2234 * sure the buf is on the correct vnode queue. We have 2235 * to do this now, because if we don't, the vnode may not 2236 * be properly notified that its I/O has completed. 2237 */ 2238 if (wasdelayed) 2239 buf_reassign(bp, vp); 2240 else 2241 if (p && p->p_stats) { 2242 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */ 2243 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten); 2244 } 2245 } 2246 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno); 2247 2248 /* Initiate disk write. Make sure the appropriate party is charged. */ 2249 2250 OSAddAtomic(1, &vp->v_numoutput); 2251 2252 VNOP_STRATEGY(bp); 2253 2254 if (sync) { 2255 /* 2256 * If I/O was synchronous, wait for it to complete. 2257 */ 2258 rv = buf_biowait(bp); 2259 2260 /* 2261 * Pay for the I/O operation, if it's not been paid for, and 2262 * make sure it's on the correct vnode queue. (async operatings 2263 * were payed for above.) 2264 */ 2265 if (wasdelayed) 2266 buf_reassign(bp, vp); 2267 else 2268 if (p && p->p_stats) { 2269 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */ 2270 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten); 2271 } 2272 2273 /* Release the buffer. */ 2274 // XXXdbg - only if the unused bit is set 2275 if (!ISSET(bp->b_flags, B_NORELSE)) { 2276 buf_brelse(bp); 2277 } else { 2278 CLR(bp->b_flags, B_NORELSE); 2279 } 2280 2281 return (rv); 2282 } else { 2283 return (0); 2284 } 2285} 2286 2287int 2288vn_bwrite(struct vnop_bwrite_args *ap) 2289{ 2290 return (buf_bwrite(ap->a_bp)); 2291} 2292 2293/* 2294 * Delayed write. 2295 * 2296 * The buffer is marked dirty, but is not queued for I/O. 2297 * This routine should be used when the buffer is expected 2298 * to be modified again soon, typically a small write that 2299 * partially fills a buffer. 2300 * 2301 * NB: magnetic tapes cannot be delayed; they must be 2302 * written in the order that the writes are requested. 2303 * 2304 * Described in Leffler, et al. (pp. 208-213). 2305 * 2306 * Note: With the ability to allocate additional buffer 2307 * headers, we can get in to the situation where "too" many 2308 * buf_bdwrite()s can create situation where the kernel can create 2309 * buffers faster than the disks can service. Doing a buf_bawrite() in 2310 * cases where we have "too many" outstanding buf_bdwrite()s avoids that. 2311 */ 2312__private_extern__ int 2313bdwrite_internal(buf_t bp, int return_error) 2314{ 2315 proc_t p = current_proc(); 2316 vnode_t vp = bp->b_vp; 2317 2318 /* 2319 * If the block hasn't been seen before: 2320 * (1) Mark it as having been seen, 2321 * (2) Charge for the write. 
 * (3) Make sure it's on its vnode's correct block list.
 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		if (p && p->p_stats) {
			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
			OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
		}
		OSAddAtomicLong(1, &nbdwrite);
		buf_reassign(bp, vp);
	}

	/*
	 * If we're not LOCKED, but the total number of delayed writes
	 * has climbed above 75% of the total buffers in the system,
	 * return an error if the caller has indicated that it can
	 * handle one in this case; otherwise schedule the I/O now.
	 * This is done to prevent us from allocating tons of extra
	 * buffers when dealing with virtual disks (i.e. DiskImages),
	 * because additional buffers are dynamically allocated to prevent
	 * deadlocks from occurring.
	 *
	 * However, we can't do a buf_bawrite() if the LOCKED bit is set, because
	 * the buffer is part of a transaction and can't go to disk until
	 * the LOCKED bit is cleared.
	 */
	if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
		if (return_error)
			return (EAGAIN);
		/*
		 * If the vnode has "too many" write operations in progress,
		 * wait for them to finish the I/O
		 */
		(void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");

		return (buf_bawrite(bp));
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->b_flags, B_DONE);
	buf_brelse(bp);
	return (0);
}

errno_t
buf_bdwrite(buf_t bp)
{
	return (bdwrite_internal(bp, 0));
}


/*
 * Asynchronous block write; just an asynchronous buf_bwrite().
 *
 * Note: With the ability to allocate additional buffer
 * headers, we can get into a situation where "too" many
 * buf_bawrite()s can leave the kernel creating buffers
 * faster than the disks can service them.
 * We limit the number of "in flight" writes a vnode can have to
 * avoid this.
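 *
 * Illustrative contrast (a sketch, not a normative statement of the API):
 * buf_bdwrite() normally just marks the buffer B_DELWRI and returns,
 * falling back to buf_bawrite() only when delayed writes pile up, while
 * buf_bawrite() always pushes the buffer to VNOP_BWRITE() with B_ASYNC
 * set, first blocking in vnode_waitforwrites() if the vnode already has
 * VNODE_ASYNC_THROTTLE writes in flight.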
2382 */ 2383static int 2384bawrite_internal(buf_t bp, int throttle) 2385{ 2386 vnode_t vp = bp->b_vp; 2387 2388 if (vp) { 2389 if (throttle) 2390 /* 2391 * If the vnode has "too many" write operations in progress 2392 * wait for them to finish the IO 2393 */ 2394 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite"); 2395 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE) 2396 /* 2397 * return to the caller and 2398 * let him decide what to do 2399 */ 2400 return (EWOULDBLOCK); 2401 } 2402 SET(bp->b_flags, B_ASYNC); 2403 2404 return (VNOP_BWRITE(bp)); 2405} 2406 2407errno_t 2408buf_bawrite(buf_t bp) 2409{ 2410 return (bawrite_internal(bp, 1)); 2411} 2412 2413 2414 2415static void 2416buf_free_meta_store(buf_t bp) 2417{ 2418 if (bp->b_bufsize) { 2419 if (ISSET(bp->b_flags, B_ZALLOC)) { 2420 zone_t z; 2421 2422 z = getbufzone(bp->b_bufsize); 2423 zfree(z, (void *)bp->b_datap); 2424 } else 2425 kmem_free(kernel_map, bp->b_datap, bp->b_bufsize); 2426 2427 bp->b_datap = (uintptr_t)NULL; 2428 bp->b_bufsize = 0; 2429 } 2430} 2431 2432 2433static buf_t 2434buf_brelse_shadow(buf_t bp) 2435{ 2436 buf_t bp_head; 2437 buf_t bp_temp; 2438 buf_t bp_return = NULL; 2439#ifdef BUF_MAKE_PRIVATE 2440 buf_t bp_data; 2441 int data_ref = 0; 2442#endif 2443 int need_wakeup = 0; 2444 2445 lck_mtx_lock_spin(buf_mtxp); 2446 2447 bp_head = (buf_t)bp->b_orig; 2448 2449 if (bp_head->b_whichq != -1) 2450 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq); 2451 2452#ifdef BUF_MAKE_PRIVATE 2453 if (bp_data = bp->b_data_store) { 2454 bp_data->b_data_ref--; 2455 /* 2456 * snapshot the ref count so that we can check it 2457 * outside of the lock... we only want the guy going 2458 * from 1 -> 0 to try and release the storage 2459 */ 2460 data_ref = bp_data->b_data_ref; 2461 } 2462#endif 2463 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0); 2464 2465 bp_head->b_shadow_ref--; 2466 2467 for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow); 2468 2469 if (bp_temp == NULL) 2470 panic("buf_brelse_shadow: bp not on list %p", bp_head); 2471 2472 bp_temp->b_shadow = bp_temp->b_shadow->b_shadow; 2473 2474#ifdef BUF_MAKE_PRIVATE 2475 /* 2476 * we're about to free the current 'owner' of the data buffer and 2477 * there is at least one other shadow buf_t still pointing at it 2478 * so transfer it to the first shadow buf left in the chain 2479 */ 2480 if (bp == bp_data && data_ref) { 2481 if ((bp_data = bp_head->b_shadow) == NULL) 2482 panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp); 2483 2484 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow) 2485 bp_temp->b_data_store = bp_data; 2486 bp_data->b_data_ref = data_ref; 2487 } 2488#endif 2489 if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow) 2490 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp); 2491 if (bp_head->b_shadow_ref && bp_head->b_shadow == 0) 2492 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp); 2493 2494 if (bp_head->b_shadow_ref == 0) { 2495 if (!ISSET(bp_head->b_lflags, BL_BUSY)) { 2496 2497 CLR(bp_head->b_flags, B_AGE); 2498 bp_head->b_timestamp = buf_timestamp(); 2499 2500 if (ISSET(bp_head->b_flags, B_LOCKED)) { 2501 bp_head->b_whichq = BQ_LOCKED; 2502 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED); 2503 } else { 2504 bp_head->b_whichq = BQ_META; 2505 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META); 2506 } 2507 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) { 
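			/*
			 * the original buf_t has been waiting for its last
			 * shadow to drain; hand it back to the caller
			 * (buf_brelse) so it can finish the teardown at
			 * finish_shadow_master
			 */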
2508 CLR(bp_head->b_lflags, BL_WAITSHADOW); 2509 2510 bp_return = bp_head; 2511 } 2512 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) { 2513 CLR(bp_head->b_lflags, BL_WANTED_REF); 2514 need_wakeup = 1; 2515 } 2516 } 2517 lck_mtx_unlock(buf_mtxp); 2518 2519 if (need_wakeup) 2520 wakeup(bp_head); 2521 2522#ifdef BUF_MAKE_PRIVATE 2523 if (bp == bp_data && data_ref == 0) 2524 buf_free_meta_store(bp); 2525 2526 bp->b_data_store = NULL; 2527#endif 2528 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0); 2529 2530 return (bp_return); 2531} 2532 2533 2534/* 2535 * Release a buffer on to the free lists. 2536 * Described in Bach (p. 46). 2537 */ 2538void 2539buf_brelse(buf_t bp) 2540{ 2541 struct bqueues *bufq; 2542 long whichq; 2543 upl_t upl; 2544 int need_wakeup = 0; 2545 int need_bp_wakeup = 0; 2546 2547 2548 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY)) 2549 panic("buf_brelse: bad buffer = %p\n", bp); 2550 2551#ifdef JOE_DEBUG 2552 (void) OSBacktrace(&bp->b_stackbrelse[0], 6); 2553 2554 bp->b_lastbrelse = current_thread(); 2555 bp->b_tag = 0; 2556#endif 2557 if (bp->b_lflags & BL_IOBUF) { 2558 buf_t shadow_master_bp = NULL; 2559 2560 if (ISSET(bp->b_lflags, BL_SHADOW)) 2561 shadow_master_bp = buf_brelse_shadow(bp); 2562 else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC)) 2563 buf_free_meta_store(bp); 2564 free_io_buf(bp); 2565 2566 if (shadow_master_bp) { 2567 bp = shadow_master_bp; 2568 goto finish_shadow_master; 2569 } 2570 return; 2571 } 2572 2573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START, 2574 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap, 2575 bp->b_flags, 0); 2576 2577 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 2578 2579 /* 2580 * if we're invalidating a buffer that has the B_FILTER bit 2581 * set then call the b_iodone function so it gets cleaned 2582 * up properly. 2583 * 2584 * the HFS journal code depends on this 2585 */ 2586 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { 2587 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */ 2588 void (*iodone_func)(struct buf *, void *) = bp->b_iodone; 2589 void *arg = bp->b_transaction; 2590 2591 CLR(bp->b_flags, B_FILTER); /* but note callout done */ 2592 bp->b_iodone = NULL; 2593 bp->b_transaction = NULL; 2594 2595 if (iodone_func == NULL) { 2596 panic("brelse: bp @ %p has NULL b_iodone!\n", bp); 2597 } 2598 (*iodone_func)(bp, arg); 2599 } 2600 } 2601 /* 2602 * I/O is done. 
Cleanup the UPL state 2603 */ 2604 upl = bp->b_upl; 2605 2606 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { 2607 kern_return_t kret; 2608 int upl_flags; 2609 2610 if (upl == NULL) { 2611 if ( !ISSET(bp->b_flags, B_INVAL)) { 2612 kret = ubc_create_upl(bp->b_vp, 2613 ubc_blktooff(bp->b_vp, bp->b_lblkno), 2614 bp->b_bufsize, 2615 &upl, 2616 NULL, 2617 UPL_PRECIOUS); 2618 2619 if (kret != KERN_SUCCESS) 2620 panic("brelse: Failed to create UPL"); 2621#if UPL_DEBUG 2622 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5); 2623#endif /* UPL_DEBUG */ 2624 } 2625 } else { 2626 if (bp->b_datap) { 2627 kret = ubc_upl_unmap(upl); 2628 2629 if (kret != KERN_SUCCESS) 2630 panic("ubc_upl_unmap failed"); 2631 bp->b_datap = (uintptr_t)NULL; 2632 } 2633 } 2634 if (upl) { 2635 if (bp->b_flags & (B_ERROR | B_INVAL)) { 2636 if (bp->b_flags & (B_READ | B_INVAL)) 2637 upl_flags = UPL_ABORT_DUMP_PAGES; 2638 else 2639 upl_flags = 0; 2640 2641 ubc_upl_abort(upl, upl_flags); 2642 } else { 2643 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY)) 2644 upl_flags = UPL_COMMIT_SET_DIRTY ; 2645 else 2646 upl_flags = UPL_COMMIT_CLEAR_DIRTY ; 2647 2648 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags | 2649 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); 2650 } 2651 bp->b_upl = NULL; 2652 } 2653 } else { 2654 if ( (upl) ) 2655 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp); 2656 } 2657 2658 /* 2659 * If it's locked, don't report an error; try again later. 2660 */ 2661 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR)) 2662 CLR(bp->b_flags, B_ERROR); 2663 /* 2664 * If it's not cacheable, or an error, mark it invalid. 2665 */ 2666 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR))) 2667 SET(bp->b_flags, B_INVAL); 2668 2669 if ((bp->b_bufsize <= 0) || 2670 ISSET(bp->b_flags, B_INVAL) || 2671 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) { 2672 2673 boolean_t delayed_buf_free_meta_store = FALSE; 2674 2675 /* 2676 * If it's invalid or empty, dissociate it from its vnode, 2677 * release its storage if B_META, and 2678 * clean it up a bit and put it on the EMPTY queue 2679 */ 2680 if (ISSET(bp->b_flags, B_DELWRI)) 2681 OSAddAtomicLong(-1, &nbdwrite); 2682 2683 if (ISSET(bp->b_flags, B_META)) { 2684 if (bp->b_shadow_ref) 2685 delayed_buf_free_meta_store = TRUE; 2686 else 2687 buf_free_meta_store(bp); 2688 } 2689 /* 2690 * nuke any credentials we were holding 2691 */ 2692 buf_release_credentials(bp); 2693 2694 lck_mtx_lock_spin(buf_mtxp); 2695 2696 if (bp->b_shadow_ref) { 2697 SET(bp->b_lflags, BL_WAITSHADOW); 2698 2699 lck_mtx_unlock(buf_mtxp); 2700 2701 return; 2702 } 2703 if (delayed_buf_free_meta_store == TRUE) { 2704 2705 lck_mtx_unlock(buf_mtxp); 2706finish_shadow_master: 2707 buf_free_meta_store(bp); 2708 2709 lck_mtx_lock_spin(buf_mtxp); 2710 } 2711 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); 2712 2713 if (bp->b_vp) 2714 brelvp_locked(bp); 2715 2716 bremhash(bp); 2717 BLISTNONE(bp); 2718 binshash(bp, &invalhash); 2719 2720 bp->b_whichq = BQ_EMPTY; 2721 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); 2722 } else { 2723 2724 /* 2725 * It has valid data. Put it on the end of the appropriate 2726 * queue, so that it'll stick around for as long as possible. 
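	 *
	 * (the whichq selection below sends B_LOCKED buffers to BQ_LOCKED,
	 * B_META to BQ_META, B_AGE to BQ_AGE, and everything else to BQ_LRU)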
2727 */ 2728 if (ISSET(bp->b_flags, B_LOCKED)) 2729 whichq = BQ_LOCKED; /* locked in core */ 2730 else if (ISSET(bp->b_flags, B_META)) 2731 whichq = BQ_META; /* meta-data */ 2732 else if (ISSET(bp->b_flags, B_AGE)) 2733 whichq = BQ_AGE; /* stale but valid data */ 2734 else 2735 whichq = BQ_LRU; /* valid data */ 2736 bufq = &bufqueues[whichq]; 2737 2738 bp->b_timestamp = buf_timestamp(); 2739 2740 lck_mtx_lock_spin(buf_mtxp); 2741 2742 /* 2743 * the buf_brelse_shadow routine doesn't take 'ownership' 2744 * of the parent buf_t... it updates state that is protected by 2745 * the buf_mtxp, and checks for BL_BUSY to determine whether to 2746 * put the buf_t back on a free list. b_shadow_ref is protected 2747 * by the lock, and since we have not yet cleared B_BUSY, we need 2748 * to check it while holding the lock to insure that one of us 2749 * puts this buf_t back on a free list when it is safe to do so 2750 */ 2751 if (bp->b_shadow_ref == 0) { 2752 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); 2753 bp->b_whichq = whichq; 2754 binstailfree(bp, bufq, whichq); 2755 } else { 2756 /* 2757 * there are still cloned buf_t's pointing 2758 * at this guy... need to keep it off the 2759 * freelists until a buf_brelse is done on 2760 * the last clone 2761 */ 2762 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE)); 2763 } 2764 } 2765 if (needbuffer) { 2766 /* 2767 * needbuffer is a global 2768 * we're currently using buf_mtxp to protect it 2769 * delay doing the actual wakeup until after 2770 * we drop buf_mtxp 2771 */ 2772 needbuffer = 0; 2773 need_wakeup = 1; 2774 } 2775 if (ISSET(bp->b_lflags, BL_WANTED)) { 2776 /* 2777 * delay the actual wakeup until after we 2778 * clear BL_BUSY and we've dropped buf_mtxp 2779 */ 2780 need_bp_wakeup = 1; 2781 } 2782 /* 2783 * Unlock the buffer. 2784 */ 2785 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); 2786 buf_busycount--; 2787 2788 lck_mtx_unlock(buf_mtxp); 2789 2790 if (need_wakeup) { 2791 /* 2792 * Wake up any processes waiting for any buffer to become free. 2793 */ 2794 wakeup(&needbuffer); 2795 } 2796 if (need_bp_wakeup) { 2797 /* 2798 * Wake up any proceeses waiting for _this_ buffer to become free. 2799 */ 2800 wakeup(bp); 2801 } 2802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END, 2803 bp, bp->b_datap, bp->b_flags, 0, 0); 2804} 2805 2806/* 2807 * Determine if a block is in the cache. 2808 * Just look on what would be its hash chain. If it's there, return 2809 * a pointer to it, unless it's marked invalid. If it's marked invalid, 2810 * we normally don't return the buffer, unless the caller explicitly 2811 * wants us to. 
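 *
 * (cost note: a lookup only walks the single BUFHASH(vp, blkno) chain
 * under buf_mtxp, not the whole cache, so a miss stays cheap even when
 * many buffers are cached)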
2812 */ 2813static boolean_t 2814incore(vnode_t vp, daddr64_t blkno) 2815{ 2816 boolean_t retval; 2817 struct bufhashhdr *dp; 2818 2819 dp = BUFHASH(vp, blkno); 2820 2821 lck_mtx_lock_spin(buf_mtxp); 2822 2823 if (incore_locked(vp, blkno, dp)) 2824 retval = TRUE; 2825 else 2826 retval = FALSE; 2827 lck_mtx_unlock(buf_mtxp); 2828 2829 return (retval); 2830} 2831 2832 2833static buf_t 2834incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp) 2835{ 2836 struct buf *bp; 2837 2838 /* Search hash chain */ 2839 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) { 2840 if (bp->b_lblkno == blkno && bp->b_vp == vp && 2841 !ISSET(bp->b_flags, B_INVAL)) { 2842 return (bp); 2843 } 2844 } 2845 return (NULL); 2846} 2847 2848 2849void 2850buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno) 2851{ 2852 buf_t bp; 2853 struct bufhashhdr *dp; 2854 2855 dp = BUFHASH(vp, blkno); 2856 2857 lck_mtx_lock_spin(buf_mtxp); 2858 2859 for (;;) { 2860 if ((bp = incore_locked(vp, blkno, dp)) == NULL) 2861 break; 2862 2863 if (bp->b_shadow_ref == 0) 2864 break; 2865 2866 SET(bp->b_lflags, BL_WANTED_REF); 2867 2868 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL); 2869 } 2870 lck_mtx_unlock(buf_mtxp); 2871} 2872 2873/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */ 2874/* 2875 * Get a block of requested size that is associated with 2876 * a given vnode and block offset. If it is found in the 2877 * block cache, mark it as having been found, make it busy 2878 * and return it. Otherwise, return an empty block of the 2879 * correct size. It is up to the caller to insure that the 2880 * cached blocks be of the correct size. 2881 */ 2882buf_t 2883buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation) 2884{ 2885 buf_t bp; 2886 int err; 2887 upl_t upl; 2888 upl_page_info_t *pl; 2889 kern_return_t kret; 2890 int ret_only_valid; 2891 struct timespec ts; 2892 int upl_flags; 2893 struct bufhashhdr *dp; 2894 2895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START, 2896 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0); 2897 2898 ret_only_valid = operation & BLK_ONLYVALID; 2899 operation &= ~BLK_ONLYVALID; 2900 dp = BUFHASH(vp, blkno); 2901start: 2902 lck_mtx_lock_spin(buf_mtxp); 2903 2904 if ((bp = incore_locked(vp, blkno, dp))) { 2905 /* 2906 * Found in the Buffer Cache 2907 */ 2908 if (ISSET(bp->b_lflags, BL_BUSY)) { 2909 /* 2910 * but is busy 2911 */ 2912 switch (operation) { 2913 case BLK_READ: 2914 case BLK_WRITE: 2915 case BLK_META: 2916 SET(bp->b_lflags, BL_WANTED); 2917 bufstats.bufs_busyincore++; 2918 2919 /* 2920 * don't retake the mutex after being awakened... 
2921 * the time out is in msecs 2922 */ 2923 ts.tv_sec = (slptimeo/1000); 2924 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000; 2925 2926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE, 2927 (uintptr_t)blkno, size, operation, 0, 0); 2928 2929 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts); 2930 2931 /* 2932 * Callers who call with PCATCH or timeout are 2933 * willing to deal with the NULL pointer 2934 */ 2935 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo))) 2936 return (NULL); 2937 goto start; 2938 /*NOTREACHED*/ 2939 break; 2940 2941 default: 2942 /* 2943 * unknown operation requested 2944 */ 2945 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation); 2946 /*NOTREACHED*/ 2947 break; 2948 } 2949 } else { 2950 /* 2951 * buffer in core and not busy 2952 */ 2953 SET(bp->b_lflags, BL_BUSY); 2954 SET(bp->b_flags, B_CACHE); 2955 buf_busycount++; 2956 2957 bremfree_locked(bp); 2958 bufstats.bufs_incore++; 2959 2960 lck_mtx_unlock(buf_mtxp); 2961#ifdef JOE_DEBUG 2962 bp->b_owner = current_thread(); 2963 bp->b_tag = 1; 2964#endif 2965 if ( (bp->b_upl) ) 2966 panic("buffer has UPL, but not marked BUSY: %p", bp); 2967 2968 if ( !ret_only_valid && bp->b_bufsize != size) 2969 allocbuf(bp, size); 2970 2971 upl_flags = 0; 2972 switch (operation) { 2973 case BLK_WRITE: 2974 /* 2975 * "write" operation: let the UPL subsystem 2976 * know that we intend to modify the buffer 2977 * cache pages we're gathering. 2978 */ 2979 upl_flags |= UPL_WILL_MODIFY; 2980 case BLK_READ: 2981 upl_flags |= UPL_PRECIOUS; 2982 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { 2983 kret = ubc_create_upl(vp, 2984 ubc_blktooff(vp, bp->b_lblkno), 2985 bp->b_bufsize, 2986 &upl, 2987 &pl, 2988 upl_flags); 2989 if (kret != KERN_SUCCESS) 2990 panic("Failed to create UPL"); 2991 2992 bp->b_upl = upl; 2993 2994 if (upl_valid_page(pl, 0)) { 2995 if (upl_dirty_page(pl, 0)) 2996 SET(bp->b_flags, B_WASDIRTY); 2997 else 2998 CLR(bp->b_flags, B_WASDIRTY); 2999 } else 3000 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI)); 3001 3002 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap)); 3003 3004 if (kret != KERN_SUCCESS) 3005 panic("getblk: ubc_upl_map() failed with (%d)", kret); 3006 } 3007 break; 3008 3009 case BLK_META: 3010 /* 3011 * VM is not involved in IO for the meta data 3012 * buffer already has valid data 3013 */ 3014 break; 3015 3016 default: 3017 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation); 3018 /*NOTREACHED*/ 3019 break; 3020 } 3021 } 3022 } else { /* not incore() */ 3023 int queue = BQ_EMPTY; /* Start with no preference */ 3024 3025 if (ret_only_valid) { 3026 lck_mtx_unlock(buf_mtxp); 3027 return (NULL); 3028 } 3029 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/) 3030 operation = BLK_META; 3031 3032 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL) 3033 goto start; 3034 3035 /* 3036 * getnewbuf may block for a number of different reasons... 3037 * if it does, it's then possible for someone else to 3038 * create a buffer for the same block and insert it into 3039 * the hash... 
if we see it incore at this point we dump 3040 * the buffer we were working on and start over 3041 */ 3042 if (incore_locked(vp, blkno, dp)) { 3043 SET(bp->b_flags, B_INVAL); 3044 binshash(bp, &invalhash); 3045 3046 lck_mtx_unlock(buf_mtxp); 3047 3048 buf_brelse(bp); 3049 goto start; 3050 } 3051 /* 3052 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN 3053 * CALLED! BE CAREFUL. 3054 */ 3055 3056 /* 3057 * mark the buffer as B_META if indicated 3058 * so that when buffer is released it will goto META queue 3059 */ 3060 if (operation == BLK_META) 3061 SET(bp->b_flags, B_META); 3062 3063 bp->b_blkno = bp->b_lblkno = blkno; 3064 bp->b_vp = vp; 3065 3066 /* 3067 * Insert in the hash so that incore() can find it 3068 */ 3069 binshash(bp, BUFHASH(vp, blkno)); 3070 3071 bgetvp_locked(vp, bp); 3072 3073 lck_mtx_unlock(buf_mtxp); 3074 3075 allocbuf(bp, size); 3076 3077 upl_flags = 0; 3078 switch (operation) { 3079 case BLK_META: 3080 /* 3081 * buffer data is invalid... 3082 * 3083 * I don't want to have to retake buf_mtxp, 3084 * so the miss and vmhits counters are done 3085 * with Atomic updates... all other counters 3086 * in bufstats are protected with either 3087 * buf_mtxp or iobuffer_mtxp 3088 */ 3089 OSAddAtomicLong(1, &bufstats.bufs_miss); 3090 break; 3091 3092 case BLK_WRITE: 3093 /* 3094 * "write" operation: let the UPL subsystem know 3095 * that we intend to modify the buffer cache pages 3096 * we're gathering. 3097 */ 3098 upl_flags |= UPL_WILL_MODIFY; 3099 case BLK_READ: 3100 { off_t f_offset; 3101 size_t contig_bytes; 3102 int bmap_flags; 3103 3104 if ( (bp->b_upl) ) 3105 panic("bp already has UPL: %p",bp); 3106 3107 f_offset = ubc_blktooff(vp, blkno); 3108 3109 upl_flags |= UPL_PRECIOUS; 3110 kret = ubc_create_upl(vp, 3111 f_offset, 3112 bp->b_bufsize, 3113 &upl, 3114 &pl, 3115 upl_flags); 3116 3117 if (kret != KERN_SUCCESS) 3118 panic("Failed to create UPL"); 3119#if UPL_DEBUG 3120 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4); 3121#endif /* UPL_DEBUG */ 3122 bp->b_upl = upl; 3123 3124 if (upl_valid_page(pl, 0)) { 3125 3126 if (operation == BLK_READ) 3127 bmap_flags = VNODE_READ; 3128 else 3129 bmap_flags = VNODE_WRITE; 3130 3131 SET(bp->b_flags, B_CACHE | B_DONE); 3132 3133 OSAddAtomicLong(1, &bufstats.bufs_vmhits); 3134 3135 bp->b_validoff = 0; 3136 bp->b_dirtyoff = 0; 3137 3138 if (upl_dirty_page(pl, 0)) { 3139 /* page is dirty */ 3140 SET(bp->b_flags, B_WASDIRTY); 3141 3142 bp->b_validend = bp->b_bcount; 3143 bp->b_dirtyend = bp->b_bcount; 3144 } else { 3145 /* page is clean */ 3146 bp->b_validend = bp->b_bcount; 3147 bp->b_dirtyend = 0; 3148 } 3149 /* 3150 * try to recreate the physical block number associated with 3151 * this buffer... 
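	 *
	 * (e.g. if VNOP_BLOCKMAP reports fewer contiguous bytes than
	 * b_bcount, the cached mapping is abandoned by resetting b_blkno
	 * back to b_lblkno, and the block will have to be translated
	 * again at I/O time)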
3152 */ 3153 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL)) 3154 panic("getblk: VNOP_BLOCKMAP failed"); 3155 /* 3156 * if the extent represented by this buffer 3157 * is not completely physically contiguous on 3158 * disk, than we can't cache the physical mapping 3159 * in the buffer header 3160 */ 3161 if ((long)contig_bytes < bp->b_bcount) 3162 bp->b_blkno = bp->b_lblkno; 3163 } else { 3164 OSAddAtomicLong(1, &bufstats.bufs_miss); 3165 } 3166 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap)); 3167 3168 if (kret != KERN_SUCCESS) 3169 panic("getblk: ubc_upl_map() failed with (%d)", kret); 3170 break; 3171 } 3172 default: 3173 panic("getblk: paging or unknown operation - %x", operation); 3174 /*NOTREACHED*/ 3175 break; 3176 } 3177 } 3178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END, 3179 bp, bp->b_datap, bp->b_flags, 3, 0); 3180 3181#ifdef JOE_DEBUG 3182 (void) OSBacktrace(&bp->b_stackgetblk[0], 6); 3183#endif 3184 return (bp); 3185} 3186 3187/* 3188 * Get an empty, disassociated buffer of given size. 3189 */ 3190buf_t 3191buf_geteblk(int size) 3192{ 3193 buf_t bp = NULL; 3194 int queue = BQ_EMPTY; 3195 3196 do { 3197 lck_mtx_lock_spin(buf_mtxp); 3198 3199 bp = getnewbuf(0, 0, &queue); 3200 } while (bp == NULL); 3201 3202 SET(bp->b_flags, (B_META|B_INVAL)); 3203 3204#if DIAGNOSTIC 3205 assert(queue == BQ_EMPTY); 3206#endif /* DIAGNOSTIC */ 3207 /* XXX need to implement logic to deal with other queues */ 3208 3209 binshash(bp, &invalhash); 3210 bufstats.bufs_eblk++; 3211 3212 lck_mtx_unlock(buf_mtxp); 3213 3214 allocbuf(bp, size); 3215 3216 return (bp); 3217} 3218 3219uint32_t 3220buf_redundancy_flags(buf_t bp) 3221{ 3222 return bp->b_redundancy_flags; 3223} 3224 3225void 3226buf_set_redundancy_flags(buf_t bp, uint32_t flags) 3227{ 3228 SET(bp->b_redundancy_flags, flags); 3229} 3230 3231void 3232buf_clear_redundancy_flags(buf_t bp, uint32_t flags) 3233{ 3234 CLR(bp->b_redundancy_flags, flags); 3235} 3236 3237/* 3238 * With UBC, there is no need to expand / shrink the file data 3239 * buffer. The VM uses the same pages, hence no waste. 3240 * All the file data buffers can have one size. 3241 * In fact expand / shrink would be an expensive operation. 3242 * 3243 * Only exception to this is meta-data buffers. Most of the 3244 * meta data operations are smaller than PAGE_SIZE. Having the 3245 * meta-data buffers grow and shrink as needed, optimizes use 3246 * of the kernel wired memory. 
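 *
 * (sizing sketch, for illustration: in allocbuf() below a meta-data
 * request is rounded up to a MINMETA multiple and served from a zalloc
 * zone while it fits within MAXMETA, with larger requests falling back
 * to kmem_alloc_kobject(); file-data buffers simply keep their
 * UPL-backed pages and only b_bufsize/b_bcount are adjusted)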
3247 */ 3248 3249int 3250allocbuf(buf_t bp, int size) 3251{ 3252 vm_size_t desired_size; 3253 3254 desired_size = roundup(size, CLBYTES); 3255 3256 if (desired_size < PAGE_SIZE) 3257 desired_size = PAGE_SIZE; 3258 if (desired_size > MAXBSIZE) 3259 panic("allocbuf: buffer larger than MAXBSIZE requested"); 3260 3261 if (ISSET(bp->b_flags, B_META)) { 3262 zone_t zprev, z; 3263 int nsize = roundup(size, MINMETA); 3264 3265 if (bp->b_datap) { 3266 vm_offset_t elem = (vm_offset_t)bp->b_datap; 3267 3268 if (ISSET(bp->b_flags, B_ZALLOC)) { 3269 if (bp->b_bufsize < nsize) { 3270 /* reallocate to a bigger size */ 3271 3272 zprev = getbufzone(bp->b_bufsize); 3273 if (nsize <= MAXMETA) { 3274 desired_size = nsize; 3275 z = getbufzone(nsize); 3276 /* b_datap not really a ptr */ 3277 *(void **)(&bp->b_datap) = zalloc(z); 3278 } else { 3279 bp->b_datap = (uintptr_t)NULL; 3280 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); 3281 CLR(bp->b_flags, B_ZALLOC); 3282 } 3283 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize); 3284 zfree(zprev, (void *)elem); 3285 } else { 3286 desired_size = bp->b_bufsize; 3287 } 3288 3289 } else { 3290 if ((vm_size_t)bp->b_bufsize < desired_size) { 3291 /* reallocate to a bigger size */ 3292 bp->b_datap = (uintptr_t)NULL; 3293 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); 3294 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize); 3295 kmem_free(kernel_map, elem, bp->b_bufsize); 3296 } else { 3297 desired_size = bp->b_bufsize; 3298 } 3299 } 3300 } else { 3301 /* new allocation */ 3302 if (nsize <= MAXMETA) { 3303 desired_size = nsize; 3304 z = getbufzone(nsize); 3305 /* b_datap not really a ptr */ 3306 *(void **)(&bp->b_datap) = zalloc(z); 3307 SET(bp->b_flags, B_ZALLOC); 3308 } else 3309 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); 3310 } 3311 3312 if (bp->b_datap == 0) 3313 panic("allocbuf: NULL b_datap"); 3314 } 3315 bp->b_bufsize = desired_size; 3316 bp->b_bcount = size; 3317 3318 return (0); 3319} 3320 3321/* 3322 * Get a new buffer from one of the free lists. 3323 * 3324 * Request for a queue is passes in. The queue from which the buffer was taken 3325 * from is returned. Out of range queue requests get BQ_EMPTY. Request for 3326 * BQUEUE means no preference. Use heuristics in that case. 3327 * Heuristics is as follows: 3328 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order. 3329 * If none available block till one is made available. 3330 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps. 3331 * Pick the most stale buffer. 3332 * If found buffer was marked delayed write, start the async. write 3333 * and restart the search. 3334 * Initialize the fields and disassociate the buffer from the vnode. 3335 * Remove the buffer from the hash. Return the buffer and the queue 3336 * on which it was found. 
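 *
 * Worked example (stale thresholds are illustrative only): with
 * lru_is_stale = 120s and age_is_stale = 60s, an LRU head that has sat
 * idle for 150s is preferred over an AGE head idle for 30s; in every
 * other combination the AGE head is taken.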
3337 * 3338 * buf_mtxp is held upon entry 3339 * returns with buf_mtxp locked if new buf available 3340 * returns with buf_mtxp UNlocked if new buf NOT available 3341 */ 3342 3343static buf_t 3344getnewbuf(int slpflag, int slptimeo, int * queue) 3345{ 3346 buf_t bp; 3347 buf_t lru_bp; 3348 buf_t age_bp; 3349 buf_t meta_bp; 3350 int age_time, lru_time, bp_time, meta_time; 3351 int req = *queue; /* save it for restarts */ 3352 struct timespec ts; 3353 3354start: 3355 /* 3356 * invalid request gets empty queue 3357 */ 3358 if ((*queue >= BQUEUES) || (*queue < 0) 3359 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED)) 3360 *queue = BQ_EMPTY; 3361 3362 3363 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first)) 3364 goto found; 3365 3366 /* 3367 * need to grow number of bufs, add another one rather than recycling 3368 */ 3369 if (nbuf_headers < max_nbuf_headers) { 3370 /* 3371 * Increment count now as lock 3372 * is dropped for allocation. 3373 * That avoids over commits 3374 */ 3375 nbuf_headers++; 3376 goto add_newbufs; 3377 } 3378 /* Try for the requested queue first */ 3379 bp = bufqueues[*queue].tqh_first; 3380 if (bp) 3381 goto found; 3382 3383 /* Unable to use requested queue */ 3384 age_bp = bufqueues[BQ_AGE].tqh_first; 3385 lru_bp = bufqueues[BQ_LRU].tqh_first; 3386 meta_bp = bufqueues[BQ_META].tqh_first; 3387 3388 if (!age_bp && !lru_bp && !meta_bp) { 3389 /* 3390 * Unavailble on AGE or LRU or META queues 3391 * Try the empty list first 3392 */ 3393 bp = bufqueues[BQ_EMPTY].tqh_first; 3394 if (bp) { 3395 *queue = BQ_EMPTY; 3396 goto found; 3397 } 3398 /* 3399 * We have seen is this is hard to trigger. 3400 * This is an overcommit of nbufs but needed 3401 * in some scenarios with diskiamges 3402 */ 3403 3404add_newbufs: 3405 lck_mtx_unlock(buf_mtxp); 3406 3407 /* Create a new temporary buffer header */ 3408 bp = (struct buf *)zalloc(buf_hdr_zone); 3409 3410 if (bp) { 3411 bufhdrinit(bp); 3412 bp->b_whichq = BQ_EMPTY; 3413 bp->b_timestamp = buf_timestamp(); 3414 BLISTNONE(bp); 3415 SET(bp->b_flags, B_HDRALLOC); 3416 *queue = BQ_EMPTY; 3417 } 3418 lck_mtx_lock_spin(buf_mtxp); 3419 3420 if (bp) { 3421 binshash(bp, &invalhash); 3422 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); 3423 buf_hdr_count++; 3424 goto found; 3425 } 3426 /* subtract already accounted bufcount */ 3427 nbuf_headers--; 3428 3429 bufstats.bufs_sleeps++; 3430 3431 /* wait for a free buffer of any kind */ 3432 needbuffer = 1; 3433 /* hz value is 100 */ 3434 ts.tv_sec = (slptimeo/1000); 3435 /* the hz value is 100; which leads to 10ms */ 3436 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10; 3437 3438 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts); 3439 return (NULL); 3440 } 3441 3442 /* Buffer available either on AGE or LRU or META */ 3443 bp = NULL; 3444 *queue = -1; 3445 3446 /* Buffer available either on AGE or LRU */ 3447 if (!age_bp) { 3448 bp = lru_bp; 3449 *queue = BQ_LRU; 3450 } else if (!lru_bp) { 3451 bp = age_bp; 3452 *queue = BQ_AGE; 3453 } else { /* buffer available on both AGE and LRU */ 3454 int t = buf_timestamp(); 3455 3456 age_time = t - age_bp->b_timestamp; 3457 lru_time = t - lru_bp->b_timestamp; 3458 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */ 3459 bp = age_bp; 3460 *queue = BQ_AGE; 3461 /* 3462 * we should probably re-timestamp eveything in the 3463 * queues at this point with the current time 3464 */ 3465 } else { 3466 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) { 3467 bp = lru_bp; 3468 *queue = BQ_LRU; 3469 } 
else { 3470 bp = age_bp; 3471 *queue = BQ_AGE; 3472 } 3473 } 3474 } 3475 3476 if (!bp) { /* Neither on AGE nor on LRU */ 3477 bp = meta_bp; 3478 *queue = BQ_META; 3479 } else if (meta_bp) { 3480 int t = buf_timestamp(); 3481 3482 bp_time = t - bp->b_timestamp; 3483 meta_time = t - meta_bp->b_timestamp; 3484 3485 if (!(bp_time < 0) && !(meta_time < 0)) { 3486 /* time not set backwards */ 3487 int bp_is_stale; 3488 bp_is_stale = (*queue == BQ_LRU) ? 3489 lru_is_stale : age_is_stale; 3490 3491 if ((meta_time >= meta_is_stale) && 3492 (bp_time < bp_is_stale)) { 3493 bp = meta_bp; 3494 *queue = BQ_META; 3495 } 3496 } 3497 } 3498found: 3499 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY)) 3500 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags); 3501 3502 /* Clean it */ 3503 if (bcleanbuf(bp, FALSE)) { 3504 /* 3505 * moved to the laundry thread, buffer not ready 3506 */ 3507 *queue = req; 3508 goto start; 3509 } 3510 return (bp); 3511} 3512 3513 3514/* 3515 * Clean a buffer. 3516 * Returns 0 if buffer is ready to use, 3517 * Returns 1 if issued a buf_bawrite() to indicate 3518 * that the buffer is not ready. 3519 * 3520 * buf_mtxp is held upon entry 3521 * returns with buf_mtxp locked 3522 */ 3523int 3524bcleanbuf(buf_t bp, boolean_t discard) 3525{ 3526 /* Remove from the queue */ 3527 bremfree_locked(bp); 3528 3529#ifdef JOE_DEBUG 3530 bp->b_owner = current_thread(); 3531 bp->b_tag = 2; 3532#endif 3533 /* 3534 * If buffer was a delayed write, start the IO by queuing 3535 * it on the LAUNDRY queue, and return 1 3536 */ 3537 if (ISSET(bp->b_flags, B_DELWRI)) { 3538 if (discard) { 3539 SET(bp->b_lflags, BL_WANTDEALLOC); 3540 } 3541 3542 bmovelaundry(bp); 3543 3544 lck_mtx_unlock(buf_mtxp); 3545 3546 wakeup(&bufqueues[BQ_LAUNDRY]); 3547 /* 3548 * and give it a chance to run 3549 */ 3550 (void)thread_block(THREAD_CONTINUE_NULL); 3551 3552 lck_mtx_lock_spin(buf_mtxp); 3553 3554 return (1); 3555 } 3556#ifdef JOE_DEBUG 3557 bp->b_owner = current_thread(); 3558 bp->b_tag = 8; 3559#endif 3560 /* 3561 * Buffer is no longer on any free list... we own it 3562 */ 3563 SET(bp->b_lflags, BL_BUSY); 3564 buf_busycount++; 3565 3566 bremhash(bp); 3567 3568 /* 3569 * disassociate us from our vnode, if we had one... 3570 */ 3571 if (bp->b_vp) 3572 brelvp_locked(bp); 3573 3574 lck_mtx_unlock(buf_mtxp); 3575 3576 BLISTNONE(bp); 3577 3578 if (ISSET(bp->b_flags, B_META)) 3579 buf_free_meta_store(bp); 3580 3581 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 3582 3583 buf_release_credentials(bp); 3584 3585 /* If discarding, just move to the empty queue */ 3586 if (discard) { 3587 lck_mtx_lock_spin(buf_mtxp); 3588 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); 3589 bp->b_whichq = BQ_EMPTY; 3590 binshash(bp, &invalhash); 3591 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); 3592 CLR(bp->b_lflags, BL_BUSY); 3593 buf_busycount--; 3594 } else { 3595 /* Not discarding: clean up and prepare for reuse */ 3596 bp->b_bufsize = 0; 3597 bp->b_datap = (uintptr_t)NULL; 3598 bp->b_upl = (void *)NULL; 3599 /* 3600 * preserve the state of whether this buffer 3601 * was allocated on the fly or not... 3602 * the only other flag that should be set at 3603 * this point is BL_BUSY... 
3604 */ 3605#ifdef JOE_DEBUG 3606 bp->b_owner = current_thread(); 3607 bp->b_tag = 3; 3608#endif 3609 bp->b_lflags = BL_BUSY; 3610 bp->b_flags = (bp->b_flags & B_HDRALLOC); 3611 bp->b_dev = NODEV; 3612 bp->b_blkno = bp->b_lblkno = 0; 3613 bp->b_iodone = NULL; 3614 bp->b_error = 0; 3615 bp->b_resid = 0; 3616 bp->b_bcount = 0; 3617 bp->b_dirtyoff = bp->b_dirtyend = 0; 3618 bp->b_validoff = bp->b_validend = 0; 3619 bzero(&bp->b_attr, sizeof(struct bufattr)); 3620 3621 lck_mtx_lock_spin(buf_mtxp); 3622 } 3623 return (0); 3624} 3625 3626 3627 3628errno_t 3629buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags) 3630{ 3631 buf_t bp; 3632 errno_t error; 3633 struct bufhashhdr *dp; 3634 3635 dp = BUFHASH(vp, lblkno); 3636 3637relook: 3638 lck_mtx_lock_spin(buf_mtxp); 3639 3640 if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) { 3641 lck_mtx_unlock(buf_mtxp); 3642 return (0); 3643 } 3644 if (ISSET(bp->b_lflags, BL_BUSY)) { 3645 if ( !ISSET(flags, BUF_WAIT)) { 3646 lck_mtx_unlock(buf_mtxp); 3647 return (EBUSY); 3648 } 3649 SET(bp->b_lflags, BL_WANTED); 3650 3651 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL); 3652 3653 if (error) { 3654 return (error); 3655 } 3656 goto relook; 3657 } 3658 bremfree_locked(bp); 3659 SET(bp->b_lflags, BL_BUSY); 3660 SET(bp->b_flags, B_INVAL); 3661 buf_busycount++; 3662#ifdef JOE_DEBUG 3663 bp->b_owner = current_thread(); 3664 bp->b_tag = 4; 3665#endif 3666 lck_mtx_unlock(buf_mtxp); 3667 buf_brelse(bp); 3668 3669 return (0); 3670} 3671 3672 3673void 3674buf_drop(buf_t bp) 3675{ 3676 int need_wakeup = 0; 3677 3678 lck_mtx_lock_spin(buf_mtxp); 3679 3680 if (ISSET(bp->b_lflags, BL_WANTED)) { 3681 /* 3682 * delay the actual wakeup until after we 3683 * clear BL_BUSY and we've dropped buf_mtxp 3684 */ 3685 need_wakeup = 1; 3686 } 3687#ifdef JOE_DEBUG 3688 bp->b_owner = current_thread(); 3689 bp->b_tag = 9; 3690#endif 3691 /* 3692 * Unlock the buffer. 3693 */ 3694 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); 3695 buf_busycount--; 3696 3697 lck_mtx_unlock(buf_mtxp); 3698 3699 if (need_wakeup) { 3700 /* 3701 * Wake up any proceeses waiting for _this_ buffer to become free. 
3702 */ 3703 wakeup(bp); 3704 } 3705} 3706 3707 3708errno_t 3709buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) { 3710 errno_t error; 3711 3712 lck_mtx_lock_spin(buf_mtxp); 3713 3714 error = buf_acquire_locked(bp, flags, slpflag, slptimeo); 3715 3716 lck_mtx_unlock(buf_mtxp); 3717 3718 return (error); 3719} 3720 3721 3722static errno_t 3723buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo) 3724{ 3725 errno_t error; 3726 struct timespec ts; 3727 3728 if (ISSET(bp->b_flags, B_LOCKED)) { 3729 if ((flags & BAC_SKIP_LOCKED)) 3730 return (EDEADLK); 3731 } else { 3732 if ((flags & BAC_SKIP_NONLOCKED)) 3733 return (EDEADLK); 3734 } 3735 if (ISSET(bp->b_lflags, BL_BUSY)) { 3736 /* 3737 * since the lck_mtx_lock may block, the buffer 3738 * may become BUSY, so we need to 3739 * recheck for a NOWAIT request 3740 */ 3741 if (flags & BAC_NOWAIT) 3742 return (EBUSY); 3743 SET(bp->b_lflags, BL_WANTED); 3744 3745 /* the hz value is 100; which leads to 10ms */ 3746 ts.tv_sec = (slptimeo/100); 3747 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; 3748 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts); 3749 3750 if (error) 3751 return (error); 3752 return (EAGAIN); 3753 } 3754 if (flags & BAC_REMOVE) 3755 bremfree_locked(bp); 3756 SET(bp->b_lflags, BL_BUSY); 3757 buf_busycount++; 3758 3759#ifdef JOE_DEBUG 3760 bp->b_owner = current_thread(); 3761 bp->b_tag = 5; 3762#endif 3763 return (0); 3764} 3765 3766 3767/* 3768 * Wait for operations on the buffer to complete. 3769 * When they do, extract and return the I/O's error value. 3770 */ 3771errno_t 3772buf_biowait(buf_t bp) 3773{ 3774 while (!ISSET(bp->b_flags, B_DONE)) { 3775 3776 lck_mtx_lock_spin(buf_mtxp); 3777 3778 if (!ISSET(bp->b_flags, B_DONE)) { 3779 DTRACE_IO1(wait__start, buf_t, bp); 3780 (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL); 3781 DTRACE_IO1(wait__done, buf_t, bp); 3782 } else 3783 lck_mtx_unlock(buf_mtxp); 3784 } 3785 /* check for interruption of I/O (e.g. via NFS), then errors. */ 3786 if (ISSET(bp->b_flags, B_EINTR)) { 3787 CLR(bp->b_flags, B_EINTR); 3788 return (EINTR); 3789 } else if (ISSET(bp->b_flags, B_ERROR)) 3790 return (bp->b_error ? bp->b_error : EIO); 3791 else 3792 return (0); 3793} 3794 3795 3796/* 3797 * Mark I/O complete on a buffer. 3798 * 3799 * If a callback has been requested, e.g. the pageout 3800 * daemon, do so. Otherwise, awaken waiting processes. 3801 * 3802 * [ Leffler, et al., says on p.247: 3803 * "This routine wakes up the blocked process, frees the buffer 3804 * for an asynchronous write, or, for a request by the pagedaemon 3805 * process, invokes a procedure specified in the buffer structure" ] 3806 * 3807 * In real life, the pagedaemon (or other system processes) wants 3808 * to do async stuff to, and doesn't want the buffer buf_brelse()'d. 3809 * (for swap pager, that puts swap buffers on the free lists (!!!), 3810 * for the vn device, that puts malloc'd buffers on the free lists!) 
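 *
 * (note on the callout flags handled below: a B_CALL callout takes
 * ownership of the bp and must release it itself, while a B_FILTER
 * callout acts purely as a filter and the normal wakeup / buf_brelse()
 * handling here still runs afterwards)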
3811 */ 3812 3813void 3814buf_biodone(buf_t bp) 3815{ 3816 mount_t mp; 3817 struct bufattr *bap; 3818 3819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START, 3820 bp, bp->b_datap, bp->b_flags, 0, 0); 3821 3822 if (ISSET(bp->b_flags, B_DONE)) 3823 panic("biodone already"); 3824 3825 if (ISSET(bp->b_flags, B_ERROR)) { 3826 fslog_io_error(bp); 3827 } 3828 3829 bap = &bp->b_attr; 3830 3831 if (bp->b_vp && bp->b_vp->v_mount) { 3832 mp = bp->b_vp->v_mount; 3833 } else { 3834 mp = NULL; 3835 } 3836 3837 if (mp && (bp->b_flags & B_READ) == 0) { 3838 update_last_io_time(mp); 3839 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size); 3840 } else if (mp) { 3841 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size); 3842 } 3843 3844 if (kdebug_enable) { 3845 int code = DKIO_DONE; 3846 int io_tier = GET_BUFATTR_IO_TIER(bap); 3847 3848 if (bp->b_flags & B_READ) 3849 code |= DKIO_READ; 3850 if (bp->b_flags & B_ASYNC) 3851 code |= DKIO_ASYNC; 3852 3853 if (bp->b_flags & B_META) 3854 code |= DKIO_META; 3855 else if (bp->b_flags & B_PAGEIO) 3856 code |= DKIO_PAGING; 3857 3858 if (io_tier != 0) 3859 code |= DKIO_THROTTLE; 3860 3861 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK); 3862 3863 if (bp->b_flags & B_PASSIVE) 3864 code |= DKIO_PASSIVE; 3865 3866 if (bap->ba_flags & BA_NOCACHE) 3867 code |= DKIO_NOCACHE; 3868 3869 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, 3870 buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0); 3871 } 3872 3873 /* 3874 * I/O was done, so don't believe 3875 * the DIRTY state from VM anymore... 3876 * and we need to reset the THROTTLED/PASSIVE 3877 * indicators 3878 */ 3879 CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE)); 3880 CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP)); 3881 3882 SET_BUFATTR_IO_TIER(bap, 0); 3883 3884 DTRACE_IO1(done, buf_t, bp); 3885 3886 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) 3887 /* 3888 * wake up any writer's blocked 3889 * on throttle or waiting for I/O 3890 * to drain 3891 */ 3892 vnode_writedone(bp->b_vp); 3893 3894 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */ 3895 void (*iodone_func)(struct buf *, void *) = bp->b_iodone; 3896 void *arg = bp->b_transaction; 3897 int callout = ISSET(bp->b_flags, B_CALL); 3898 3899 if (iodone_func == NULL) 3900 panic("biodone: bp @ %p has NULL b_iodone!\n", bp); 3901 3902 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */ 3903 bp->b_iodone = NULL; 3904 bp->b_transaction = NULL; 3905 3906 if (callout) 3907 SET(bp->b_flags, B_DONE); /* note that it's done */ 3908 3909 (*iodone_func)(bp, arg); 3910 3911 if (callout) { 3912 /* 3913 * assumes that the callback function takes 3914 * ownership of the bp and deals with releasing it if necessary 3915 */ 3916 goto biodone_done; 3917 } 3918 /* 3919 * in this case the call back function is acting 3920 * strictly as a filter... it does not take 3921 * ownership of the bp and is expecting us 3922 * to finish cleaning up... this is currently used 3923 * by the HFS journaling code 3924 */ 3925 } 3926 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */ 3927 SET(bp->b_flags, B_DONE); /* note that it's done */ 3928 3929 buf_brelse(bp); 3930 } else { /* or just wakeup the buffer */ 3931 /* 3932 * by taking the mutex, we serialize 3933 * the buf owner calling buf_biowait so that we'll 3934 * only see him in one of 2 states... 
3935 * state 1: B_DONE wasn't set and he's 3936 * blocked in msleep 3937 * state 2: he's blocked trying to take the 3938 * mutex before looking at B_DONE 3939 * BL_WANTED is cleared in case anyone else 3940 * is blocked waiting for the buffer... note 3941 * that we haven't cleared B_BUSY yet, so if 3942 * they do get to run, their going to re-set 3943 * BL_WANTED and go back to sleep 3944 */ 3945 lck_mtx_lock_spin(buf_mtxp); 3946 3947 CLR(bp->b_lflags, BL_WANTED); 3948 SET(bp->b_flags, B_DONE); /* note that it's done */ 3949 3950 lck_mtx_unlock(buf_mtxp); 3951 3952 wakeup(bp); 3953 } 3954biodone_done: 3955 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END, 3956 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0); 3957} 3958 3959/* 3960 * Obfuscate buf pointers. 3961 */ 3962vm_offset_t 3963buf_kernel_addrperm_addr(void * addr) 3964{ 3965 if ((vm_offset_t)addr == 0) 3966 return 0; 3967 else 3968 return ((vm_offset_t)addr + buf_kernel_addrperm); 3969} 3970 3971/* 3972 * Return a count of buffers on the "locked" queue. 3973 */ 3974int 3975count_lock_queue(void) 3976{ 3977 buf_t bp; 3978 int n = 0; 3979 3980 lck_mtx_lock_spin(buf_mtxp); 3981 3982 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; 3983 bp = bp->b_freelist.tqe_next) 3984 n++; 3985 lck_mtx_unlock(buf_mtxp); 3986 3987 return (n); 3988} 3989 3990/* 3991 * Return a count of 'busy' buffers. Used at the time of shutdown. 3992 * note: This is also called from the mach side in debug context in kdp.c 3993 */ 3994int 3995count_busy_buffers(void) 3996{ 3997 return buf_busycount + bufstats.bufs_iobufinuse; 3998} 3999 4000#if DIAGNOSTIC 4001/* 4002 * Print out statistics on the current allocation of the buffer pool. 4003 * Can be enabled to print out on every ``sync'' by setting "syncprt" 4004 * in vfs_syscalls.c using sysctl. 4005 */ 4006void 4007vfs_bufstats() 4008{ 4009 int i, j, count; 4010 struct buf *bp; 4011 struct bqueues *dp; 4012 int counts[MAXBSIZE/CLBYTES+1]; 4013 static char *bname[BQUEUES] = 4014 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; 4015 4016 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { 4017 count = 0; 4018 for (j = 0; j <= MAXBSIZE/CLBYTES; j++) 4019 counts[j] = 0; 4020 4021 lck_mtx_lock(buf_mtxp); 4022 4023 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { 4024 counts[bp->b_bufsize/CLBYTES]++; 4025 count++; 4026 } 4027 lck_mtx_unlock(buf_mtxp); 4028 4029 printf("%s: total-%d", bname[i], count); 4030 for (j = 0; j <= MAXBSIZE/CLBYTES; j++) 4031 if (counts[j] != 0) 4032 printf(", %d-%d", j * CLBYTES, counts[j]); 4033 printf("\n"); 4034 } 4035} 4036#endif /* DIAGNOSTIC */ 4037 4038#define NRESERVEDIOBUFS 128 4039 4040 4041buf_t 4042alloc_io_buf(vnode_t vp, int priv) 4043{ 4044 buf_t bp; 4045 4046 lck_mtx_lock_spin(iobuffer_mtxp); 4047 4048 while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) || 4049 (bp = iobufqueue.tqh_first) == NULL) { 4050 bufstats.bufs_iobufsleeps++; 4051 4052 need_iobuffer = 1; 4053 (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL); 4054 } 4055 TAILQ_REMOVE(&iobufqueue, bp, b_freelist); 4056 4057 bufstats.bufs_iobufinuse++; 4058 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) 4059 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse; 4060 4061 lck_mtx_unlock(iobuffer_mtxp); 4062 4063 /* 4064 * initialize various fields 4065 * we don't need to hold the mutex since the buffer 4066 * is now private... 
the vp should have a reference 4067 * on it and is not protected by this mutex in any event 4068 */ 4069 bp->b_timestamp = 0; 4070 bp->b_proc = NULL; 4071 4072 bp->b_datap = 0; 4073 bp->b_flags = 0; 4074 bp->b_lflags = BL_BUSY | BL_IOBUF; 4075 bp->b_redundancy_flags = 0; 4076 bp->b_blkno = bp->b_lblkno = 0; 4077#ifdef JOE_DEBUG 4078 bp->b_owner = current_thread(); 4079 bp->b_tag = 6; 4080#endif 4081 bp->b_iodone = NULL; 4082 bp->b_error = 0; 4083 bp->b_resid = 0; 4084 bp->b_bcount = 0; 4085 bp->b_bufsize = 0; 4086 bp->b_upl = NULL; 4087 bp->b_vp = vp; 4088 bzero(&bp->b_attr, sizeof(struct bufattr)); 4089 4090 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) 4091 bp->b_dev = vp->v_rdev; 4092 else 4093 bp->b_dev = NODEV; 4094 4095 return (bp); 4096} 4097 4098 4099void 4100free_io_buf(buf_t bp) 4101{ 4102 int need_wakeup = 0; 4103 4104 /* 4105 * put buffer back on the head of the iobufqueue 4106 */ 4107 bp->b_vp = NULL; 4108 bp->b_flags = B_INVAL; 4109 4110 lck_mtx_lock_spin(iobuffer_mtxp); 4111 4112 binsheadfree(bp, &iobufqueue, -1); 4113 4114 if (need_iobuffer) { 4115 /* 4116 * Wake up any processes waiting because they need an io buffer 4117 * 4118 * do the wakeup after we drop the mutex... it's possible that the 4119 * wakeup will be superfluous if need_iobuffer gets set again and 4120 * another thread runs this path, but it's highly unlikely, doesn't 4121 * hurt, and it means we don't hold up I/O progress if the wakeup blocks 4122 * trying to grab a task related lock... 4123 */ 4124 need_iobuffer = 0; 4125 need_wakeup = 1; 4126 } 4127 if (bufstats.bufs_iobufinuse <= 0) 4128 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp); 4129 4130 bufstats.bufs_iobufinuse--; 4131 4132 lck_mtx_unlock(iobuffer_mtxp); 4133 4134 if (need_wakeup) 4135 wakeup(&need_iobuffer); 4136} 4137 4138 4139void 4140buf_list_lock(void) 4141{ 4142 lck_mtx_lock_spin(buf_mtxp); 4143} 4144 4145void 4146buf_list_unlock(void) 4147{ 4148 lck_mtx_unlock(buf_mtxp); 4149} 4150 4151/* 4152 * If getnewbuf() calls bcleanbuf() on the same thread 4153 * there is a potential for stack overrun and deadlocks. 
4154 * So we always handoff the work to a worker thread for completion 4155 */ 4156 4157 4158static void 4159bcleanbuf_thread_init(void) 4160{ 4161 thread_t thread = THREAD_NULL; 4162 4163 /* create worker thread */ 4164 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread); 4165 thread_deallocate(thread); 4166} 4167 4168typedef int (*bcleanbufcontinuation)(int); 4169 4170static void 4171bcleanbuf_thread(void) 4172{ 4173 struct buf *bp; 4174 int error = 0; 4175 int loopcnt = 0; 4176 4177 for (;;) { 4178 lck_mtx_lock_spin(buf_mtxp); 4179 4180 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) { 4181 (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread); 4182 } 4183 4184 /* 4185 * Remove from the queue 4186 */ 4187 bremfree_locked(bp); 4188 4189 /* 4190 * Buffer is no longer on any free list 4191 */ 4192 SET(bp->b_lflags, BL_BUSY); 4193 buf_busycount++; 4194 4195#ifdef JOE_DEBUG 4196 bp->b_owner = current_thread(); 4197 bp->b_tag = 10; 4198#endif 4199 4200 lck_mtx_unlock(buf_mtxp); 4201 /* 4202 * do the IO 4203 */ 4204 error = bawrite_internal(bp, 0); 4205 4206 if (error) { 4207 bp->b_whichq = BQ_LAUNDRY; 4208 bp->b_timestamp = buf_timestamp(); 4209 4210 lck_mtx_lock_spin(buf_mtxp); 4211 4212 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); 4213 blaundrycnt++; 4214 4215 /* we never leave a busy page on the laundry queue */ 4216 CLR(bp->b_lflags, BL_BUSY); 4217 buf_busycount--; 4218#ifdef JOE_DEBUG 4219 bp->b_owner = current_thread(); 4220 bp->b_tag = 11; 4221#endif 4222 4223 lck_mtx_unlock(buf_mtxp); 4224 4225 if (loopcnt > MAXLAUNDRY) { 4226 /* 4227 * bawrite_internal() can return errors if we're throttled. If we've 4228 * done several I/Os and failed, give the system some time to unthrottle 4229 * the vnode 4230 */ 4231 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1); 4232 loopcnt = 0; 4233 } else { 4234 /* give other threads a chance to run */ 4235 (void)thread_block(THREAD_CONTINUE_NULL); 4236 loopcnt++; 4237 } 4238 } 4239 } 4240} 4241 4242 4243static int 4244brecover_data(buf_t bp) 4245{ 4246 int upl_offset; 4247 upl_t upl; 4248 upl_page_info_t *pl; 4249 kern_return_t kret; 4250 vnode_t vp = bp->b_vp; 4251 int upl_flags; 4252 4253 4254 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0) 4255 goto dump_buffer; 4256 4257 upl_flags = UPL_PRECIOUS; 4258 if (! (buf_flags(bp) & B_READ)) { 4259 /* 4260 * "write" operation: let the UPL subsystem know 4261 * that we intend to modify the buffer cache pages we're 4262 * gathering. 
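		 *
		 * (recovery only succeeds if every page backing the buffer is
		 * still resident and dirty; otherwise the UPL is aborted and
		 * the buffer is invalidated and dumped below)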
4263 */ 4264 upl_flags |= UPL_WILL_MODIFY; 4265 } 4266 4267 kret = ubc_create_upl(vp, 4268 ubc_blktooff(vp, bp->b_lblkno), 4269 bp->b_bufsize, 4270 &upl, 4271 &pl, 4272 upl_flags); 4273 if (kret != KERN_SUCCESS) 4274 panic("Failed to create UPL"); 4275 4276 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) { 4277 4278 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) { 4279 ubc_upl_abort(upl, 0); 4280 goto dump_buffer; 4281 } 4282 } 4283 bp->b_upl = upl; 4284 4285 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap)); 4286 4287 if (kret != KERN_SUCCESS) 4288 panic("getblk: ubc_upl_map() failed with (%d)", kret); 4289 return (1); 4290 4291dump_buffer: 4292 bp->b_bufsize = 0; 4293 SET(bp->b_flags, B_INVAL); 4294 buf_brelse(bp); 4295 4296 return(0); 4297} 4298 4299boolean_t 4300buffer_cache_gc(int all) 4301{ 4302 buf_t bp; 4303 boolean_t did_large_zfree = FALSE; 4304 boolean_t need_wakeup = FALSE; 4305 int now = buf_timestamp(); 4306 uint32_t found = 0; 4307 struct bqueues privq; 4308 int thresh_hold = BUF_STALE_THRESHHOLD; 4309 4310 if (all) 4311 thresh_hold = 0; 4312 /* 4313 * We only care about metadata (incore storage comes from zalloc()). 4314 * Unless "all" is set (used to evict meta data buffers in preparation 4315 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers 4316 * that have not been accessed in the last 30s. This limit controls both 4317 * the hold time of the global lock "buf_mtxp" and the length of time 4318 * we spend compute bound in the GC thread which calls this function 4319 */ 4320 lck_mtx_lock(buf_mtxp); 4321 4322 do { 4323 found = 0; 4324 TAILQ_INIT(&privq); 4325 need_wakeup = FALSE; 4326 4327 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) && 4328 (now > bp->b_timestamp) && 4329 (now - bp->b_timestamp > thresh_hold) && 4330 (found < BUF_MAX_GC_BATCH_SIZE)) { 4331 4332 /* Remove from free list */ 4333 bremfree_locked(bp); 4334 found++; 4335 4336#ifdef JOE_DEBUG 4337 bp->b_owner = current_thread(); 4338 bp->b_tag = 12; 4339#endif 4340 4341 /* If dirty, move to laundry queue and remember to do wakeup */ 4342 if (ISSET(bp->b_flags, B_DELWRI)) { 4343 SET(bp->b_lflags, BL_WANTDEALLOC); 4344 4345 bmovelaundry(bp); 4346 need_wakeup = TRUE; 4347 4348 continue; 4349 } 4350 4351 /* 4352 * Mark busy and put on private list. We could technically get 4353 * away without setting BL_BUSY here. 4354 */ 4355 SET(bp->b_lflags, BL_BUSY); 4356 buf_busycount++; 4357 4358 /* 4359 * Remove from hash and dissociate from vp. 
4360 */ 4361 bremhash(bp); 4362 if (bp->b_vp) { 4363 brelvp_locked(bp); 4364 } 4365 4366 TAILQ_INSERT_TAIL(&privq, bp, b_freelist); 4367 } 4368 4369 if (found == 0) { 4370 break; 4371 } 4372 4373 /* Drop lock for batch processing */ 4374 lck_mtx_unlock(buf_mtxp); 4375 4376 /* Wakeup and yield for laundry if need be */ 4377 if (need_wakeup) { 4378 wakeup(&bufqueues[BQ_LAUNDRY]); 4379 (void)thread_block(THREAD_CONTINUE_NULL); 4380 } 4381 4382 /* Clean up every buffer on private list */ 4383 TAILQ_FOREACH(bp, &privq, b_freelist) { 4384 /* Take note if we've definitely freed at least a page to a zone */ 4385 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) { 4386 did_large_zfree = TRUE; 4387 } 4388 4389 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 4390 4391 /* Free Storage */ 4392 buf_free_meta_store(bp); 4393 4394 /* Release credentials */ 4395 buf_release_credentials(bp); 4396 4397 /* Prepare for moving to empty queue */ 4398 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED 4399 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); 4400 bp->b_whichq = BQ_EMPTY; 4401 BLISTNONE(bp); 4402 } 4403 lck_mtx_lock(buf_mtxp); 4404 4405 /* Back under lock, move them all to invalid hash and clear busy */ 4406 TAILQ_FOREACH(bp, &privq, b_freelist) { 4407 binshash(bp, &invalhash); 4408 CLR(bp->b_lflags, BL_BUSY); 4409 buf_busycount--; 4410 4411#ifdef JOE_DEBUG 4412 if (bp->b_owner != current_thread()) { 4413 panic("Buffer stolen from buffer_cache_gc()"); 4414 } 4415 bp->b_owner = current_thread(); 4416 bp->b_tag = 13; 4417#endif 4418 } 4419 4420 /* And do a big bulk move to the empty queue */ 4421 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist); 4422 4423 } while (all && (found == BUF_MAX_GC_BATCH_SIZE)); 4424 4425 lck_mtx_unlock(buf_mtxp); 4426 4427 return did_large_zfree; 4428} 4429 4430 4431/* 4432 * disabled for now 4433 */ 4434 4435#if FLUSH_QUEUES 4436 4437#define NFLUSH 32 4438 4439static int 4440bp_cmp(void *a, void *b) 4441{ 4442 buf_t *bp_a = *(buf_t **)a, 4443 *bp_b = *(buf_t **)b; 4444 daddr64_t res; 4445 4446 // don't have to worry about negative block 4447 // numbers so this is ok to do. 
4448 // 4449 res = (bp_a->b_blkno - bp_b->b_blkno); 4450 4451 return (int)res; 4452} 4453 4454 4455int 4456bflushq(int whichq, mount_t mp) 4457{ 4458 buf_t bp, next; 4459 int i, buf_count; 4460 int total_writes = 0; 4461 static buf_t flush_table[NFLUSH]; 4462 4463 if (whichq < 0 || whichq >= BQUEUES) { 4464 return (0); 4465 } 4466 4467 restart: 4468 lck_mtx_lock(buf_mtxp); 4469 4470 bp = TAILQ_FIRST(&bufqueues[whichq]); 4471 4472 for (buf_count = 0; bp; bp = next) { 4473 next = bp->b_freelist.tqe_next; 4474 4475 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) { 4476 continue; 4477 } 4478 4479 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) { 4480 4481 bremfree_locked(bp); 4482#ifdef JOE_DEBUG 4483 bp->b_owner = current_thread(); 4484 bp->b_tag = 7; 4485#endif 4486 SET(bp->b_lflags, BL_BUSY); 4487 buf_busycount++; 4488 4489 flush_table[buf_count] = bp; 4490 buf_count++; 4491 total_writes++; 4492 4493 if (buf_count >= NFLUSH) { 4494 lck_mtx_unlock(buf_mtxp); 4495 4496 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); 4497 4498 for (i = 0; i < buf_count; i++) { 4499 buf_bawrite(flush_table[i]); 4500 } 4501 goto restart; 4502 } 4503 } 4504 } 4505 lck_mtx_unlock(buf_mtxp); 4506 4507 if (buf_count > 0) { 4508 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); 4509 4510 for (i = 0; i < buf_count; i++) { 4511 buf_bawrite(flush_table[i]); 4512 } 4513 } 4514 4515 return (total_writes); 4516} 4517#endif 4518 4519 4520#if BALANCE_QUEUES 4521 4522/* XXX move this to a separate file */ 4523 4524/* 4525 * NOTE: THIS CODE HAS NOT BEEN UPDATED 4526 * WITH RESPECT TO THE NEW LOCKING MODEL 4527 */ 4528 4529 4530/* 4531 * Dynamic Scaling of the Buffer Queues 4532 */ 4533 4534typedef long long blsize_t; 4535 4536blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */ 4537/* Global tunable limits */ 4538blsize_t nbufh; /* number of buffer headers */ 4539blsize_t nbuflow; /* minimum number of buffer headers required */ 4540blsize_t nbufhigh; /* maximum number of buffer headers allowed */ 4541blsize_t nbuftarget; /* preferred number of buffer headers */ 4542 4543/* 4544 * assertions: 4545 * 4546 * 1. 0 < nbuflow <= nbufh <= nbufhigh 4547 * 2. nbufhigh <= MAXNBUF 4548 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh 4549 * 4. nbufh can not be set by sysctl(). 4550 */ 4551 4552/* Per queue tunable limits */ 4553 4554struct bufqlim { 4555 blsize_t bl_nlow; /* minimum number of buffer headers required */ 4556 blsize_t bl_num; /* number of buffer headers on the queue */ 4557 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */ 4558 blsize_t bl_target; /* preferred number of buffer headers */ 4559 long bl_stale; /* Seconds after which a buffer is considered stale */ 4560} bufqlim[BQUEUES]; 4561 4562/* 4563 * assertions: 4564 * 4565 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh 4566 * 2. bl_nlhigh <= MAXNBUF 4567 * 3. bufqlim[BQ_META].bl_nlow != 0 4568 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent 4569 * file system IO operations) 4570 * 5. bl_num can not be set by sysctl(). 4571 * 6. bl_nhigh <= nbufhigh 4572 */ 4573 4574/* 4575 * Rationale: 4576 * ---------- 4577 * Defining it blsize_t as long permits 2^31 buffer headers per queue. 4578 * Which can describe (2^31 * PAGE_SIZE) memory per queue. 4579 * 4580 * These limits are exported to by means of sysctl(). 4581 * It was decided to define blsize_t as a 64 bit quantity. 
/*
 * Rationale:
 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity instead.
 * This makes sure that we will not be required to change it
 * as long as we do not exceed the 64 bit address space of the kernel.
 *
 * The low and high limits are initialized at compile time,
 * and boot arguments can be used to override them. sysctl()
 * does not change those values. sysctl() can get all the values
 * but can set only the target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 *	Keep enough bufs on BQ_EMPTY.
 *		getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *		getnewbuf() performs best if a buffer was found there.
 *		Also this minimizes the possibility of starting IO
 *		from getnewbuf(). That's a performance win, too.
 *
 *	Localize complex logic [balancing as well as time aging]
 *		to balancebufq().
 *
 *	Simplify getnewbuf() logic by elimination of time aging code.
 */

/*
 * Algorithm:
 * ----------
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
 *
 * There would be a thread which will be responsible for "balancing"
 * the buffer cache queues.
 *
 * The scan order would be:	AGE, LRU, META, EMPTY.
 */

long bufqscanwait = 0;

static void bufqscan_thread(void);
static int balancebufq(int q);
static int btrimempty(int n);
static __inline__ int initbufqscan(void);
static __inline__ int nextbufq(int q);
static void buqlimprt(int all);


static __inline__ void
bufqinc(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num++;
	return;
}

static __inline__ void
bufqdec(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num--;
	return;
}

static void
bufq_balance_thread_init(void)
{
	thread_t	thread = THREAD_NULL;

	if (bufqscanwait++ == 0) {

		/* Initialize globals */
		MAXNBUF = (sane_size / PAGE_SIZE);
		nbufh = nbuf_headers;
		nbuflow = min(nbufh, 100);
		nbufhigh = min(MAXNBUF, max(nbufh, 2048));
		nbuftarget = (sane_size >> 5) / PAGE_SIZE;
		nbuftarget = max(nbuflow, nbuftarget);
		nbuftarget = min(nbufhigh, nbuftarget);

		/*
		 * Initialize the bufqlim
		 */

		/* LOCKED queue */
		bufqlim[BQ_LOCKED].bl_nlow = 0;
		bufqlim[BQ_LOCKED].bl_nlhigh = 32;
		bufqlim[BQ_LOCKED].bl_target = 0;
		bufqlim[BQ_LOCKED].bl_stale = 30;

		/* LRU queue */
		bufqlim[BQ_LRU].bl_nlow = 0;
		bufqlim[BQ_LRU].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_LRU].bl_target = nbuftarget / 4;
		bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;

		/* AGE queue */
		bufqlim[BQ_AGE].bl_nlow = 0;
		bufqlim[BQ_AGE].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_AGE].bl_target = nbuftarget / 4;
		bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;

		/* EMPTY queue */
		bufqlim[BQ_EMPTY].bl_nlow = 0;
		bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_EMPTY].bl_target = nbuftarget / 4;
		bufqlim[BQ_EMPTY].bl_stale = 600000;

		/* META queue */
		bufqlim[BQ_META].bl_nlow = 0;
		bufqlim[BQ_META].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_META].bl_target = nbuftarget / 4;
		bufqlim[BQ_META].bl_stale = META_IS_STALE;

		/* LAUNDRY queue */
		bufqlim[BQ_LAUNDRY].bl_nlow = 0;
		bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
		bufqlim[BQ_LAUNDRY].bl_target = 0;
		bufqlim[BQ_LAUNDRY].bl_stale = 30;

		buqlimprt(1);
	}

	/* create worker thread */
	kernel_thread_start((thread_continue_t)bufqscan_thread, NULL, &thread);
	thread_deallocate(thread);
}
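/*
 * Worked example (illustration only; these numbers are assumptions, not
 * values taken from any particular machine): with sane_size = 512MB,
 * PAGE_SIZE = 4KB and nbuf_headers = 16384, the initialization above gives
 *
 *	MAXNBUF    = 512MB / 4KB                    = 131072
 *	nbufh      = 16384
 *	nbuflow    = min(16384, 100)                = 100
 *	nbufhigh   = min(131072, max(16384, 2048))  = 16384
 *	nbuftarget = (512MB >> 5) / 4KB             = 4096	(already within [100, 16384])
 *
 * so the LRU, AGE, EMPTY and META queues each end up with
 * bl_nlhigh = 16384 / 4 = 4096 and bl_target = 4096 / 4 = 1024.
 */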
/* The workloop for the buffer balancing thread */
static void
bufqscan_thread(void)
{
	int moretodo = 0;

	for (;;) {
		do {
			int q;	/* buffer queue to process */

			q = initbufqscan();
			for (; q; ) {
				moretodo |= balancebufq(q);
				q = nextbufq(q);
			}
		} while (moretodo);

#if DIAGNOSTIC
		vfs_bufstats();
		buqlimprt(0);
#endif
		(void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
		moretodo = 0;
	}
}

/* Seed for the buffer queue balancing */
static __inline__ int
initbufqscan(void)
{
	/* Start with AGE queue */
	return (BQ_AGE);
}

/* Pick next buffer queue to balance */
static __inline__ int
nextbufq(int q)
{
	const int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
	unsigned int i;

	/* return the queue following 'q' in the scan order; 0 terminates the scan */
	for (i = 0; i < (sizeof(order) / sizeof(order[0])) - 1; i++) {
		if (order[i] == q)
			return (order[i + 1]);
	}
	return (0);
}

/* function to balance the buffer queues */
static int
balancebufq(int q)
{
	int moretodo = 0;
	int n, t;

	/* reject invalid q */
	if ((q < 0) || (q >= BQUEUES))
		goto out;

	/* LOCKED or LAUNDRY queue MUST not be balanced */
	if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
		goto out;

	n = (bufqlim[q].bl_num - bufqlim[q].bl_target);

	/* If queue has less than target nothing more to do */
	if (n < 0)
		goto out;

	if (n > 8) {
		/* Balance only a small amount (12.5%) at a time */
		n >>= 3;
	}

	/* EMPTY queue needs special handling */
	if (q == BQ_EMPTY) {
		moretodo |= btrimempty(n);
		goto out;
	}

	t = buf_timestamp();

	for (; n > 0; n--) {
		struct buf *bp = bufqueues[q].tqh_first;
		if (!bp)
			break;

		/* check if it's stale */
		if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
			if (bcleanbuf(bp, FALSE)) {
				/* buf_bawrite() issued, bp not ready */
				moretodo = 1;
			} else {
				/* release the cleaned buffer to BQ_EMPTY */
				SET(bp->b_flags, B_INVAL);
				buf_brelse(bp);
			}
		} else
			break;
	}

out:
	return (moretodo);
}

static int
btrimempty(int n)
{
	/*
	 * When struct buf are allocated dynamically, this would
	 * reclaim up to 'n' struct buf from the empty queue.
	 */

	return (0);
}

static void
buqlimprt(int all)
{
	int i;
	static char *bname[BQUEUES] =
		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };

	if (all)
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
			printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
			printf("target = %ld, ", (long)bufqlim[i].bl_target);
			printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
		}
	else
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
		}
}

#endif /* BALANCE_QUEUES */