/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/vnode_internal.h>
#include <sys/mount_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/ubc.h>
#include <sys/kauth.h>
#if DIAGNOSTIC
#include <kern/assert.h>
#endif /* DIAGNOSTIC */
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/lock.h>

#include <sys/fslog.h>		/* fslog_io_error() */

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <kern/sched_prim.h>	/* thread_block() */

#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>

#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>
#include <sys/ubc_internal.h>

#include <sys/sdt.h>
#include <sys/cprotect.h>


#if BALANCE_QUEUES
static __inline__ void bufqinc(int q);
static __inline__ void bufqdec(int q);
#endif

int	bcleanbuf(buf_t bp, boolean_t discard);
static int	brecover_data(buf_t bp);
static boolean_t	incore(vnode_t vp, daddr64_t blkno);
/* timeout is in msecs */
static buf_t	getnewbuf(int slpflag, int slptimeo, int *queue);
static void	bremfree_locked(buf_t bp);
static void	buf_reassign(buf_t bp, vnode_t newvp);
static errno_t	buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
static int	buf_iterprepare(vnode_t vp, struct buflists *, int flags);
static void	buf_itercomplete(vnode_t vp, struct buflists *, int flags);
static boolean_t buffer_cache_gc(int);
static buf_t	buf_brelse_shadow(buf_t bp);
static void	buf_free_meta_store(buf_t bp);

static buf_t	buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
    uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);


__private_extern__ int	bdwrite_internal(buf_t, int);

/* zone allocated buffer headers */
static void	bufzoneinit(void) __attribute__((section("__TEXT, initcode")));
static void	bcleanbuf_thread_init(void) __attribute__((section("__TEXT, initcode")));
static void	bcleanbuf_thread(void);

static zone_t	buf_hdr_zone;
static int	buf_hdr_count;


/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;

static buf_t	incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
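/*
 * Illustrative sketch (not part of the original file): how the BUFHASH
 * macro above folds a vnode pointer and a logical block number into a
 * hash bucket.  The helper name is hypothetical; `bufhash' is the
 * power-of-two-minus-one mask produced by hashinit() in bufinit().
 */
#if 0
static struct bufhashhdr *
example_bufhash_bucket(vnode_t dvp, daddr64_t lbn)
{
	/* scale the pointer down by the pointee size, add the block
	 * number, then mask into the table -- same arithmetic as BUFHASH */
	return (BUFHASH(dvp, lbn));
}
#endif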
/* Definitions for the buffer stats. */
struct bufstats bufstats;

/* Number of delayed write buffers */
long nbdwrite = 0;
int blaundrycnt = 0;
static int boot_nbuf_headers = 0;

static TAILQ_HEAD(delayqueue, buf) delaybufqueue;

static TAILQ_HEAD(ioqueue, buf) iobufqueue;
static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
static int needbuffer;
static int need_iobuffer;

static lck_grp_t	*buf_mtx_grp;
static lck_attr_t	*buf_mtx_attr;
static lck_grp_attr_t	*buf_mtx_grp_attr;
static lck_mtx_t	*iobuffer_mtxp;
static lck_mtx_t	*buf_mtxp;

static int buf_busycount;

static __inline__ int
buf_timestamp(void)
{
	struct	timeval		t;
	microuptime(&t);
	return (t.tv_sec);
}

/*
 * Insq/Remq for the buffer free lists.
 */
#if BALANCE_QUEUES
#define	binsheadfree(bp, dp, whichq)	do { \
			TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
			bufqinc((whichq)); \
		} while (0)

#define	binstailfree(bp, dp, whichq)	do { \
			TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
			bufqinc((whichq)); \
		} while (0)
#else
#define	binsheadfree(bp, dp, whichq)	do { \
			TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
		} while (0)

#define	binstailfree(bp, dp, whichq)	do { \
			TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
		} while (0)
#endif


#define BHASHENTCHECK(bp)	\
	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)	\
		panic("%p: b_hash.le_prev is not deadbeef", (bp));

#define BLISTNONE(bp)	\
	(bp)->b_hash.le_next = (struct buf *)0;	\
	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

/*
 * Time in seconds before a buffer on a list is
 * considered stale.
 */
#define	LRU_IS_STALE 120	/* default value for the LRU */
#define	AGE_IS_STALE 60		/* default value for the AGE */
#define	META_IS_STALE 180	/* default value for the BQ_META */

int lru_is_stale = LRU_IS_STALE;
int age_is_stale = AGE_IS_STALE;
int meta_is_stale = META_IS_STALE;

#define MAXLAUNDRY	10

/* LIST_INSERT_HEAD() with assertions */
static __inline__ void
blistenterhead(struct bufhashhdr * head, buf_t bp)
{
	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
		(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
	(head)->lh_first = bp;
	bp->b_hash.le_prev = &(head)->lh_first;
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("blistenterhead: le_prev is deadbeef");
}

static __inline__ void
binshash(buf_t bp, struct bufhashhdr *dp)
{
#if DIAGNOSTIC
	buf_t	nbp;
#endif /* DIAGNOSTIC */

	BHASHENTCHECK(bp);

#if DIAGNOSTIC
	nbp = dp->lh_first;
	for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
		if(nbp == bp)
			panic("buf already in hashlist");
	}
#endif /* DIAGNOSTIC */

	blistenterhead(dp, bp);
}

static __inline__ void
bremhash(buf_t	bp)
{
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("bremhash le_prev is deadbeef");
	if (bp->b_hash.le_next == bp)
		panic("bremhash: next points to self");

	if (bp->b_hash.le_next != NULL)
		bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
	*bp->b_hash.le_prev = (bp)->b_hash.le_next;
}

/*
 * buf_mtxp held.
 */
static __inline__ void
bmovelaundry(buf_t bp)
{
	bp->b_whichq = BQ_LAUNDRY;
	bp->b_timestamp = buf_timestamp();
	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
	blaundrycnt++;
}

static __inline__ void
buf_release_credentials(buf_t bp)
{
	if (IS_VALID_CRED(bp->b_rcred)) {
		kauth_cred_unref(&bp->b_rcred);
	}
	if (IS_VALID_CRED(bp->b_wcred)) {
		kauth_cred_unref(&bp->b_wcred);
	}
}


int
buf_valid(buf_t bp) {

	if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
		return 1;
	return 0;
}

int
buf_fromcache(buf_t bp) {

	if ( (bp->b_flags & B_CACHE) )
		return 1;
	return 0;
}

void
buf_markinvalid(buf_t bp) {

	SET(bp->b_flags, B_INVAL);
}

void
buf_markdelayed(buf_t bp) {

	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);

		OSAddAtomicLong(1, &nbdwrite);
		buf_reassign(bp, bp->b_vp);
	}
	SET(bp->b_flags, B_DONE);
}

void
buf_markclean(buf_t bp) {

	if (ISSET(bp->b_flags, B_DELWRI)) {
		CLR(bp->b_flags, B_DELWRI);

		OSAddAtomicLong(-1, &nbdwrite);
		buf_reassign(bp, bp->b_vp);
	}
}

void
buf_markeintr(buf_t bp) {

	SET(bp->b_flags, B_EINTR);
}


void
buf_markaged(buf_t bp) {

	SET(bp->b_flags, B_AGE);
}

int
buf_fua(buf_t bp) {

	if ((bp->b_flags & B_FUA) == B_FUA)
		return 1;
	return 0;
}

void
buf_markfua(buf_t bp) {

	SET(bp->b_flags, B_FUA);
}

#if CONFIG_PROTECT
void
buf_setcpaddr(buf_t bp, struct cprotect *entry) {
	bp->b_attr.ba_cpentry = entry;
}

void
buf_setcpoff(buf_t bp, uint64_t foffset) {
	bp->b_attr.ba_cp_file_off = foffset;
}

void *
bufattr_cpaddr(bufattr_t bap) {
	return (bap->ba_cpentry);
}

uint64_t
bufattr_cpoff(bufattr_t bap) {
	return (bap->ba_cp_file_off);
}

void
bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) {
	bap->ba_cpentry = cp_entry_addr;
}

void
bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
	bap->ba_cp_file_off = foffset;
}

#else
void *
bufattr_cpaddr(bufattr_t bap __unused) {
	return NULL;
}

uint64_t
bufattr_cpoff(bufattr_t bap __unused) {
	return 0;
}

void
bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) {
}

void
bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
	return;
}
#endif /* CONFIG_PROTECT */

bufattr_t
bufattr_alloc() {
	bufattr_t bap;
	MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
	if (bap == NULL)
		return NULL;

	bzero(bap, sizeof(struct bufattr));
	return bap;
}

void
bufattr_free(bufattr_t bap) {
	if (bap)
		FREE(bap, M_TEMP);
}

int
bufattr_rawencrypted(bufattr_t bap) {
	if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
		return 1;
	return 0;
}

int
bufattr_throttled(bufattr_t bap) {
	if ( (bap->ba_flags & BA_THROTTLED_IO) )
		return 1;
	return 0;
}

int
bufattr_nocache(bufattr_t bap) {
	if ( (bap->ba_flags & BA_NOCACHE) )
		return 1;
	return 0;
}

int
bufattr_meta(bufattr_t bap) {
	if ( (bap->ba_flags & BA_META) )
		return 1;
	return 0;
}
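/*
 * Illustrative sketch (not part of the original file): a driver or
 * filesystem layer can classify an I/O by querying the bufattr accessors
 * above.  The function name and the policy it implements are hypothetical.
 */
#if 0
static int
example_is_low_priority_io(bufattr_t bap)
{
	/* throttled or uncached I/O: treat as background work */
	if (bufattr_throttled(bap) || bufattr_nocache(bap))
		return 1;
	/* metadata I/O is never down-prioritized in this sketch */
	if (bufattr_meta(bap))
		return 0;
	return 0;
}
#endif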
int
#if !CONFIG_EMBEDDED
bufattr_delayidlesleep(bufattr_t bap)
#else /* !CONFIG_EMBEDDED */
bufattr_delayidlesleep(__unused bufattr_t bap)
#endif /* !CONFIG_EMBEDDED */
{
#if !CONFIG_EMBEDDED
	if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
		return 1;
#endif /* !CONFIG_EMBEDDED */
	return 0;
}

bufattr_t
buf_attr(buf_t bp) {
	return &bp->b_attr;
}

void
buf_markstatic(buf_t bp __unused) {
	SET(bp->b_flags, B_STATICCONTENT);
}

int
buf_static(buf_t bp) {
	if ( (bp->b_flags & B_STATICCONTENT) )
		return 1;
	return 0;
}

errno_t
buf_error(buf_t bp) {

	return (bp->b_error);
}

void
buf_seterror(buf_t bp, errno_t error) {

	if ((bp->b_error = error))
		SET(bp->b_flags, B_ERROR);
	else
		CLR(bp->b_flags, B_ERROR);
}

void
buf_setflags(buf_t bp, int32_t flags) {

	SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
}

void
buf_clearflags(buf_t bp, int32_t flags) {

	CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
}

int32_t
buf_flags(buf_t bp) {

	return ((bp->b_flags & BUF_X_RDFLAGS));
}

void
buf_reset(buf_t bp, int32_t io_flags) {

	CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
	SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));

	bp->b_error = 0;
}

uint32_t
buf_count(buf_t bp) {

	return (bp->b_bcount);
}

void
buf_setcount(buf_t bp, uint32_t bcount) {

	bp->b_bcount = bcount;
}

uint32_t
buf_size(buf_t bp) {

	return (bp->b_bufsize);
}

void
buf_setsize(buf_t bp, uint32_t bufsize) {

	bp->b_bufsize = bufsize;
}

uint32_t
buf_resid(buf_t bp) {

	return (bp->b_resid);
}

void
buf_setresid(buf_t bp, uint32_t resid) {

	bp->b_resid = resid;
}

uint32_t
buf_dirtyoff(buf_t bp) {

	return (bp->b_dirtyoff);
}

uint32_t
buf_dirtyend(buf_t bp) {

	return (bp->b_dirtyend);
}

void
buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {

	bp->b_dirtyoff = dirtyoff;
}

void
buf_setdirtyend(buf_t bp, uint32_t dirtyend) {

	bp->b_dirtyend = dirtyend;
}

uintptr_t
buf_dataptr(buf_t bp) {

	return (bp->b_datap);
}

void
buf_setdataptr(buf_t bp, uintptr_t data) {

	bp->b_datap = data;
}

vnode_t
buf_vnode(buf_t bp) {

	return (bp->b_vp);
}

void
buf_setvnode(buf_t bp, vnode_t vp) {

	bp->b_vp = vp;
}


void *
buf_callback(buf_t bp)
{
	if ( !(bp->b_flags & B_CALL) )
		return ((void *) NULL);

	return ((void *)bp->b_iodone);
}


errno_t
buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
{
	if (callback)
		bp->b_flags |= (B_CALL | B_ASYNC);
	else
		bp->b_flags &= ~B_CALL;
	bp->b_transaction = transaction;
	bp->b_iodone = callback;

	return (0);
}

errno_t
buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
{

	if ( !(bp->b_lflags & BL_IOBUF) )
		return (EINVAL);

	if (upl)
		bp->b_flags |= B_CLUSTER;
	else
		bp->b_flags &= ~B_CLUSTER;
	bp->b_upl = upl;
	bp->b_uploffset = offset;

	return (0);
}
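/*
 * Illustrative sketch (not part of the original file): registering an
 * iodone callback with buf_setcallback() above.  Note that setting a
 * callback implicitly makes the I/O asynchronous (B_CALL | B_ASYNC) and
 * the callback fires from buf_biodone().  The names below are hypothetical.
 */
#if 0
struct example_ctx {
	int	pending;
};

static void
example_done(buf_t bp, void *transaction)
{
	/* `transaction' is the pointer registered below */
	struct example_ctx *ctx = transaction;

	ctx->pending--;
	buf_brelse(bp);
}

static void
example_start_async(buf_t bp, struct example_ctx *ctx)
{
	ctx->pending++;
	buf_setcallback(bp, example_done, ctx);
	VNOP_STRATEGY(bp);
}
#endif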
buf_t
buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
{
	buf_t	io_bp;

	if (io_offset < 0 || io_size < 0)
		return (NULL);

	if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
		return (NULL);

	if (bp->b_flags & B_CLUSTER) {
		if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
			return (NULL);

		if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
			return (NULL);
	}
	io_bp = alloc_io_buf(bp->b_vp, 0);

	io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);

	if (iodone) {
		io_bp->b_transaction = arg;
		io_bp->b_iodone = iodone;
		io_bp->b_flags |= B_CALL;
	}
	if (bp->b_flags & B_CLUSTER) {
		io_bp->b_upl = bp->b_upl;
		io_bp->b_uploffset = bp->b_uploffset + io_offset;
	} else {
		io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
	}
	io_bp->b_bcount = io_size;

	return (io_bp);
}
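/*
 * Illustrative sketch (not part of the original file): buf_clone() lets a
 * lower layer split one buffer into sub-I/Os that alias the parent's UPL
 * or data pointer; for B_CLUSTER buffers the interior cut must be page
 * aligned.  The helper name is hypothetical.
 */
#if 0
static errno_t
example_split_in_half(buf_t bp)
{
	int	half = bp->b_bcount / 2;
	buf_t	front, back;

	/* both halves share the parent's storage; no data is copied */
	front = buf_clone(bp, 0, half, NULL, NULL);
	back  = buf_clone(bp, half, bp->b_bcount - half, NULL, NULL);

	if (front == NULL || back == NULL)
		return (EINVAL);	/* e.g. the cut wasn't page aligned */

	/* a real caller would now issue both pieces and buf_free() them */
	return (0);
}
#endif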
int
buf_shadow(buf_t bp)
{
	if (bp->b_lflags & BL_SHADOW)
		return 1;
	return 0;
}


buf_t
buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
{
	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
}

buf_t
buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
{
	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
}


static buf_t
buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
{
	buf_t	io_bp;

	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);

	if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {

		KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
		return (NULL);
	}
#ifdef BUF_MAKE_PRIVATE
	if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
		panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
#endif
	io_bp = alloc_io_buf(bp->b_vp, priv);

	io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
	io_bp->b_blkno = bp->b_blkno;
	io_bp->b_lblkno = bp->b_lblkno;

	if (iodone) {
		io_bp->b_transaction = arg;
		io_bp->b_iodone = iodone;
		io_bp->b_flags |= B_CALL;
	}
	if (force_copy == FALSE) {
		io_bp->b_bcount = bp->b_bcount;
		io_bp->b_bufsize = bp->b_bufsize;

		if (external_storage) {
			io_bp->b_datap = external_storage;
#ifdef BUF_MAKE_PRIVATE
			io_bp->b_data_store = NULL;
#endif
		} else {
			io_bp->b_datap = bp->b_datap;
#ifdef BUF_MAKE_PRIVATE
			io_bp->b_data_store = bp;
#endif
		}
		*(buf_t *)(&io_bp->b_orig) = bp;

		lck_mtx_lock_spin(buf_mtxp);

		io_bp->b_lflags |= BL_SHADOW;
		io_bp->b_shadow = bp->b_shadow;
		bp->b_shadow = io_bp;
		bp->b_shadow_ref++;

#ifdef BUF_MAKE_PRIVATE
		if (external_storage)
			io_bp->b_lflags |= BL_EXTERNAL;
		else
			bp->b_data_ref++;
#endif
		lck_mtx_unlock(buf_mtxp);
	} else {
		if (external_storage) {
#ifdef BUF_MAKE_PRIVATE
			io_bp->b_lflags |= BL_EXTERNAL;
#endif
			io_bp->b_bcount = bp->b_bcount;
			io_bp->b_bufsize = bp->b_bufsize;
			io_bp->b_datap = external_storage;
		} else {
			allocbuf(io_bp, bp->b_bcount);

			io_bp->b_lflags |= BL_IOBUF_ALLOC;
		}
		bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);

#ifdef BUF_MAKE_PRIVATE
		io_bp->b_data_store = NULL;
#endif
	}
	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);

	return (io_bp);
}


#ifdef BUF_MAKE_PRIVATE
errno_t
buf_make_private(buf_t bp)
{
	buf_t	ds_bp;
	buf_t	t_bp;
	struct buf my_buf;

	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);

	if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {

		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
		return (EINVAL);
	}
	my_buf.b_flags = B_META;
	my_buf.b_datap = (uintptr_t)NULL;
	allocbuf(&my_buf, bp->b_bcount);

	bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);

	lck_mtx_lock_spin(buf_mtxp);

	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
		if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
			break;
	}
	ds_bp = t_bp;

	if (ds_bp == NULL && bp->b_data_ref)
		panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");

	if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
		panic("buf_make_private: ref_count == 0 && ds_bp != NULL");

	if (ds_bp == NULL) {
		lck_mtx_unlock(buf_mtxp);

		buf_free_meta_store(&my_buf);

		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
		return (EINVAL);
	}
	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
		if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
			t_bp->b_data_store = ds_bp;
	}
	ds_bp->b_data_ref = bp->b_data_ref;

	bp->b_data_ref = 0;
	bp->b_datap = my_buf.b_datap;

	lck_mtx_unlock(buf_mtxp);

	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
	return (0);
}
#endif
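/*
 * Illustrative sketch (not part of the original file): a journaling layer
 * might use buf_create_shadow() above to hold a stable snapshot of a
 * metadata buffer while the original keeps being modified.  With
 * force_copy = TRUE the shadow gets its own private copy of the data.
 * The helper name is hypothetical.
 */
#if 0
static buf_t
example_snapshot_for_journal(buf_t bp)
{
	/* NULL external storage: the shadow allocates its own copy */
	return (buf_create_shadow(bp, TRUE, (uintptr_t)NULL, NULL, NULL));
}
#endif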
void
buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
    void (**old_iodone)(buf_t, void *), void **old_transaction)
{
	if (old_iodone)
		*old_iodone = bp->b_iodone;
	if (old_transaction)
		*old_transaction = bp->b_transaction;

	bp->b_transaction = transaction;
	bp->b_iodone = filter;
	if (filter)
		bp->b_flags |= B_FILTER;
	else
		bp->b_flags &= ~B_FILTER;
}


daddr64_t
buf_blkno(buf_t bp) {

	return (bp->b_blkno);
}

daddr64_t
buf_lblkno(buf_t bp) {

	return (bp->b_lblkno);
}

void
buf_setblkno(buf_t bp, daddr64_t blkno) {

	bp->b_blkno = blkno;
}

void
buf_setlblkno(buf_t bp, daddr64_t lblkno) {

	bp->b_lblkno = lblkno;
}

dev_t
buf_device(buf_t bp) {

	return (bp->b_dev);
}

errno_t
buf_setdevice(buf_t bp, vnode_t vp) {

	if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
		return EINVAL;
	bp->b_dev = vp->v_rdev;

	return 0;
}


void *
buf_drvdata(buf_t bp) {

	return (bp->b_drvdata);
}

void
buf_setdrvdata(buf_t bp, void *drvdata) {

	bp->b_drvdata = drvdata;
}

void *
buf_fsprivate(buf_t bp) {

	return (bp->b_fsprivate);
}

void
buf_setfsprivate(buf_t bp, void *fsprivate) {

	bp->b_fsprivate = fsprivate;
}

kauth_cred_t
buf_rcred(buf_t bp) {

	return (bp->b_rcred);
}

kauth_cred_t
buf_wcred(buf_t bp) {

	return (bp->b_wcred);
}

void *
buf_upl(buf_t bp) {

	return (bp->b_upl);
}

uint32_t
buf_uploffset(buf_t bp) {

	return ((uint32_t)(bp->b_uploffset));
}

proc_t
buf_proc(buf_t bp) {

	return (bp->b_proc);
}


errno_t
buf_map(buf_t bp, caddr_t *io_addr)
{
	buf_t		real_bp;
	vm_offset_t	vaddr;
	kern_return_t	kret;

	if ( !(bp->b_flags & B_CLUSTER)) {
		*io_addr = (caddr_t)bp->b_datap;
		return (0);
	}
	real_bp = (buf_t)(bp->b_real_bp);

	if (real_bp && real_bp->b_datap) {
		/*
		 * b_real_bp is only valid if B_CLUSTER is SET
		 * if it's non-zero, then someone did a cluster_bp call
		 * if the backing physical pages were already mapped
		 * in before the call to cluster_bp (non-zero b_datap),
		 * then we just use that mapping
		 */
		*io_addr = (caddr_t)real_bp->b_datap;
		return (0);
	}
	kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */

	if (kret != KERN_SUCCESS) {
		*io_addr = NULL;

		return(ENOMEM);
	}
	vaddr += bp->b_uploffset;

	*io_addr = (caddr_t)vaddr;

	return (0);
}

errno_t
buf_unmap(buf_t bp)
{
	buf_t		real_bp;
	kern_return_t	kret;

	if ( !(bp->b_flags & B_CLUSTER))
		return (0);
	/*
	 * see buf_map for the explanation
	 */
	real_bp = (buf_t)(bp->b_real_bp);

	if (real_bp && real_bp->b_datap)
		return (0);

	if ((bp->b_lflags & BL_IOBUF) &&
	    ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
		/*
		 * ignore pageins... the 'right' thing will
		 * happen due to the way we handle speculative
		 * clusters...
		 *
		 * when we commit these pages, we'll hit
		 * it with UPL_COMMIT_INACTIVE which
		 * will clear the reference bit that got
		 * turned on when we touched the mapping
		 */
		bp->b_flags |= B_AGE;
	}
	kret = ubc_upl_unmap(bp->b_upl);

	if (kret != KERN_SUCCESS)
		return (EINVAL);
	return (0);
}


void
buf_clear(buf_t bp) {
	caddr_t baddr;

	if (buf_map(bp, &baddr) == 0) {
		bzero(baddr, bp->b_bcount);
		buf_unmap(bp);
	}
	bp->b_resid = 0;
}
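/*
 * Illustrative sketch (not part of the original file): the canonical
 * map/modify/unmap pattern for a B_CLUSTER buffer; buf_clear() just above
 * is a real in-file instance of it.  The helper name is hypothetical.
 */
#if 0
static errno_t
example_fill_pattern(buf_t bp, char pattern)
{
	caddr_t	addr;
	errno_t	error;

	if ((error = buf_map(bp, &addr)))
		return (error);		/* ENOMEM if the UPL couldn't be mapped */

	memset(addr, pattern, bp->b_bcount);

	return (buf_unmap(bp));
}
#endif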
/*
 * Read or write a buffer that is not contiguous on disk.
 * buffer is marked done/error at the conclusion
 */
static int
buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
{
	vnode_t	vp = buf_vnode(bp);
	buf_t	io_bp;			/* For reading or writing a single block */
	int	io_direction;
	int	io_resid;
	size_t	io_contig_bytes;
	daddr64_t io_blkno;
	int	error = 0;
	int	bmap_flags;

	/*
	 * save our starting point... the bp was already mapped
	 * in buf_strategy before we got called
	 * no sense doing it again.
	 */
	io_blkno = bp->b_blkno;
	/*
	 * Make sure we redo this mapping for the next I/O
	 * i.e. this can never be a 'permanent' mapping
	 */
	bp->b_blkno = bp->b_lblkno;

	/*
	 * Get an io buffer to do the deblocking
	 */
	io_bp = alloc_io_buf(devvp, 0);

	io_bp->b_lblkno = bp->b_lblkno;
	io_bp->b_datap = bp->b_datap;
	io_resid = bp->b_bcount;
	io_direction = bp->b_flags & B_READ;
	io_contig_bytes = contig_bytes;

	if (bp->b_flags & B_READ)
		bmap_flags = VNODE_READ;
	else
		bmap_flags = VNODE_WRITE;

	for (;;) {
		if (io_blkno == -1)
			/*
			 * this is unexpected, but we'll allow for it
			 */
			bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
		else {
			io_bp->b_bcount = io_contig_bytes;
			io_bp->b_bufsize = io_contig_bytes;
			io_bp->b_resid = io_contig_bytes;
			io_bp->b_blkno = io_blkno;

			buf_reset(io_bp, io_direction);

			/*
			 * Call the device to do the I/O and wait for it.  Make sure the appropriate party is charged for write
			 */

			if (!ISSET(bp->b_flags, B_READ))
				OSAddAtomic(1, &devvp->v_numoutput);

			if ((error = VNOP_STRATEGY(io_bp)))
				break;
			if ((error = (int)buf_biowait(io_bp)))
				break;
			if (io_bp->b_resid) {
				io_resid -= (io_contig_bytes - io_bp->b_resid);
				break;
			}
		}
		if ((io_resid -= io_contig_bytes) == 0)
			break;
		f_offset += io_contig_bytes;
		io_bp->b_datap += io_contig_bytes;

		/*
		 * Map the current position to a physical block number
		 */
		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
			break;
	}
	buf_free(io_bp);

	if (error)
		buf_seterror(bp, error);
	bp->b_resid = io_resid;
	/*
	 * This I/O is now complete
	 */
	buf_biodone(bp);

	return error;
}


/*
 * struct vnop_strategy_args {
 *      struct buf *a_bp;
 * } *ap;
 */
errno_t
buf_strategy(vnode_t devvp, void *ap)
{
	buf_t	bp = ((struct vnop_strategy_args *)ap)->a_bp;
	vnode_t	vp = bp->b_vp;
	int	bmap_flags;
	errno_t error;
#if CONFIG_DTRACE
	int dtrace_io_start_flag = 0;	/* We only want to trip the io:::start
					 * probe once, with the true physical
					 * block in place (b_blkno)
					 */

#endif

	if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
		panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
	/*
	 * associate the physical device with
	 * this buf_t even if we don't
	 * end up issuing the I/O...
	 */
	bp->b_dev = devvp->v_rdev;

	if (bp->b_flags & B_READ)
		bmap_flags = VNODE_READ;
	else
		bmap_flags = VNODE_WRITE;

	if ( !(bp->b_flags & B_CLUSTER)) {

		if ( (bp->b_upl) ) {
			/*
			 * we have a UPL associated with this bp
			 * go through cluster_bp which knows how
			 * to deal with filesystem block sizes
			 * that aren't equal to the page size
			 */
			DTRACE_IO1(start, buf_t, bp);
			return (cluster_bp(bp));
		}
		if (bp->b_blkno == bp->b_lblkno) {
			off_t	f_offset;
			size_t	contig_bytes;

			if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
				DTRACE_IO1(start, buf_t, bp);
				buf_seterror(bp, error);
				buf_biodone(bp);

				return (error);
			}

			if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
				DTRACE_IO1(start, buf_t, bp);
				buf_seterror(bp, error);
				buf_biodone(bp);

				return (error);
			}

			DTRACE_IO1(start, buf_t, bp);
#if CONFIG_DTRACE
			dtrace_io_start_flag = 1;
#endif /* CONFIG_DTRACE */

			if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
				/* Set block number to force biodone later */
				bp->b_blkno = -1;
				buf_clear(bp);
			}
			else if ((long)contig_bytes < bp->b_bcount) {
				return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
			}
		}

#if CONFIG_DTRACE
		if (dtrace_io_start_flag == 0) {
			DTRACE_IO1(start, buf_t, bp);
			dtrace_io_start_flag = 1;
		}
#endif /* CONFIG_DTRACE */

		if (bp->b_blkno == -1) {
			buf_biodone(bp);
			return (0);
		}
	}

#if CONFIG_DTRACE
	if (dtrace_io_start_flag == 0)
		DTRACE_IO1(start, buf_t, bp);
#endif /* CONFIG_DTRACE */

#if CONFIG_PROTECT
	/* Capture f_offset in the bufattr */
	if (bp->b_attr.ba_cpentry != 0) {
		/* No need to go here for older EAs */
		if (bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
			off_t f_offset;
			if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
				return error;

			/*
			 * Attach the file offset to this buffer.  The
			 * bufattr attributes will be passed down the stack
			 * until they reach IOFlashStorage.  IOFlashStorage
			 * will retain the offset in a local variable when it
			 * issues its I/Os to the NAND controller.
			 *
			 * Note that LwVM may end up splitting this I/O
			 * into sub-I/Os if it crosses a chunk boundary.  In this
			 * case, LwVM will update this field when it dispatches
			 * each I/O to IOFlashStorage.  But from our perspective
			 * we have only issued a single I/O.
			 */
			bufattr_setcpoff(&(bp->b_attr), (u_int64_t)f_offset);
		}
	}
#endif

	/*
	 * we can issue the I/O because...
	 * either B_CLUSTER is set which
	 * means that the I/O is properly set
	 * up to be a multiple of the page size, or
	 * we were able to successfully set up the
	 * physical block mapping
	 */
	return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
}



buf_t
buf_alloc(vnode_t vp)
{
	return(alloc_io_buf(vp, 0));
}

void
buf_free(buf_t bp) {

	free_io_buf(bp);
}
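/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * vnop_strategy entry point typically just forwards to buf_strategy(),
 * passing the device vnode so the logical-to-physical mapping above can
 * run.  `example_mount_devvp' is a hypothetical accessor for the mount's
 * device vnode.
 */
#if 0
static int
example_vnop_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp = ap->a_bp;
	vnode_t	devvp = example_mount_devvp(vnode_mount(buf_vnode(bp)));

	return (buf_strategy(devvp, ap));
}
#endif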
/*
 * iterate buffers for the specified vp.
 *   if BUF_SCAN_DIRTY is set, do the dirty list
 *   if BUF_SCAN_CLEAN is set, do the clean list
 *   if neither flag is set, default to BUF_SCAN_DIRTY
 *   if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
 */

struct buf_iterate_info_t {
	int flag;
	struct buflists *listhead;
};

void
buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
{
	buf_t	bp;
	int	retval;
	struct	buflists local_iterblkhd;
	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
	int	notify_busy = flags & BUF_NOTIFY_BUSY;
	struct buf_iterate_info_t list[2];
	int	num_lists, i;

	if (flags & BUF_SKIP_LOCKED)
		lock_flags |= BAC_SKIP_LOCKED;
	if (flags & BUF_SKIP_NONLOCKED)
		lock_flags |= BAC_SKIP_NONLOCKED;

	if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
		flags |= BUF_SCAN_DIRTY;

	num_lists = 0;

	if (flags & BUF_SCAN_DIRTY) {
		list[num_lists].flag = VBI_DIRTY;
		list[num_lists].listhead = &vp->v_dirtyblkhd;
		num_lists++;
	}
	if (flags & BUF_SCAN_CLEAN) {
		list[num_lists].flag = VBI_CLEAN;
		list[num_lists].listhead = &vp->v_cleanblkhd;
		num_lists++;
	}

	for (i = 0; i < num_lists; i++) {
		lck_mtx_lock(buf_mtxp);

		if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
			lck_mtx_unlock(buf_mtxp);
			continue;
		}
		while (!LIST_EMPTY(&local_iterblkhd)) {
			bp = LIST_FIRST(&local_iterblkhd);
			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);

			if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
				if (notify_busy) {
					bp = NULL;
				} else {
					continue;
				}
			}

			lck_mtx_unlock(buf_mtxp);

			retval = callout(bp, arg);

			switch (retval) {
			case BUF_RETURNED:
				if (bp)
					buf_brelse(bp);
				break;
			case BUF_CLAIMED:
				break;
			case BUF_RETURNED_DONE:
				if (bp)
					buf_brelse(bp);
				lck_mtx_lock(buf_mtxp);
				goto out;
			case BUF_CLAIMED_DONE:
				lck_mtx_lock(buf_mtxp);
				goto out;
			}
			lck_mtx_lock(buf_mtxp);
		} /* while list has more nodes */
	  out:
		buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
		lck_mtx_unlock(buf_mtxp);
	} /* for each list */
} /* buf_iterate */
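/*
 * Illustrative sketch (not part of the original file): a buf_iterate()
 * callout that pushes every delayed-write buffer of a vnode to disk.
 * The callout owns any buffer it is handed and must claim or return it;
 * with BUF_NOTIFY_BUSY set, busy buffers arrive as a NULL bp.
 * Hypothetical usage:
 *	buf_iterate(vp, example_flush_one, BUF_SCAN_DIRTY | BUF_NOTIFY_BUSY, NULL);
 */
#if 0
static int
example_flush_one(buf_t bp, __unused void *arg)
{
	if (bp == NULL)			/* busy buffer notification */
		return (BUF_RETURNED);

	if (buf_flags(bp) & B_DELWRI) {
		(void) buf_bawrite(bp);	/* consumes the buffer */
		return (BUF_CLAIMED);
	}
	return (BUF_RETURNED);		/* buf_iterate() will buf_brelse() it */
}
#endif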
/*
 * Flush out and invalidate all buffers associated with a vnode.
 */
int
buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
{
	buf_t	bp;
	int	aflags;
	int	error = 0;
	int	must_rescan = 1;
	struct	buflists local_iterblkhd;


	if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
		return (0);

	lck_mtx_lock(buf_mtxp);

	for (;;) {
		if (must_rescan == 0)
			/*
			 * the lists may not be empty, but all that's left at this
			 * point are metadata or B_LOCKED buffers which are being
			 * skipped... we know this because we made it through both
			 * the clean and dirty lists without dropping buf_mtxp...
			 * each time we drop buf_mtxp we bump "must_rescan"
			 */
			break;
		if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
			break;
		must_rescan = 0;
		/*
		 * iterate the clean list
		 */
		if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
			goto try_dirty_list;
		}
		while (!LIST_EMPTY(&local_iterblkhd)) {

			bp = LIST_FIRST(&local_iterblkhd);

			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);

			/*
			 * some filesystems distinguish meta data blocks with a negative logical block #
			 */
			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
				continue;

			aflags = BAC_REMOVE;

			if ( !(flags & BUF_INVALIDATE_LOCKED) )
				aflags |= BAC_SKIP_LOCKED;

			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
				if (error == EDEADLK)
					/*
					 * this buffer was marked B_LOCKED...
					 * we didn't drop buf_mtxp, so we
					 * don't need to rescan
					 */
					continue;
				if (error == EAGAIN) {
					/*
					 * found a busy buffer... we blocked and
					 * dropped buf_mtxp, so we're going to
					 * need to rescan after this pass is completed
					 */
					must_rescan++;
					continue;
				}
				/*
				 * got some kind of 'real' error out of the msleep
				 * in buf_acquire_locked, terminate the scan and return the error
				 */
				buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);

				lck_mtx_unlock(buf_mtxp);
				return (error);
			}
			lck_mtx_unlock(buf_mtxp);

			if (bp->b_flags & B_LOCKED)
				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);

			CLR(bp->b_flags, B_LOCKED);
			SET(bp->b_flags, B_INVAL);
			buf_brelse(bp);

			lck_mtx_lock(buf_mtxp);

			/*
			 * by dropping buf_mtxp, we allow new
			 * buffers to be added to the vnode list(s)
			 * we'll have to rescan at least once more
			 * if the queues aren't empty
			 */
			must_rescan++;
		}
		buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);

try_dirty_list:
		/*
		 * Now iterate on dirty blks
		 */
		if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
			continue;
		}
		while (!LIST_EMPTY(&local_iterblkhd)) {
			bp = LIST_FIRST(&local_iterblkhd);

			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);

			/*
			 * some filesystems distinguish meta data blocks with a negative logical block #
			 */
			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
				continue;

			aflags = BAC_REMOVE;

			if ( !(flags & BUF_INVALIDATE_LOCKED) )
				aflags |= BAC_SKIP_LOCKED;

			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
				if (error == EDEADLK)
					/*
					 * this buffer was marked B_LOCKED...
					 * we didn't drop buf_mtxp, so we
					 * don't need to rescan
					 */
					continue;
				if (error == EAGAIN) {
					/*
					 * found a busy buffer...
					 * we blocked and
					 * dropped buf_mtxp, so we're going to
					 * need to rescan after this pass is completed
					 */
					must_rescan++;
					continue;
				}
				/*
				 * got some kind of 'real' error out of the msleep
				 * in buf_acquire_locked, terminate the scan and return the error
				 */
				buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);

				lck_mtx_unlock(buf_mtxp);
				return (error);
			}
			lck_mtx_unlock(buf_mtxp);

			if (bp->b_flags & B_LOCKED)
				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);

			CLR(bp->b_flags, B_LOCKED);
			SET(bp->b_flags, B_INVAL);

			if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
				(void) VNOP_BWRITE(bp);
			else
				buf_brelse(bp);

			lck_mtx_lock(buf_mtxp);
			/*
			 * by dropping buf_mtxp, we allow new
			 * buffers to be added to the vnode list(s)
			 * we'll have to rescan at least once more
			 * if the queues aren't empty
			 */
			must_rescan++;
		}
		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
	}
	lck_mtx_unlock(buf_mtxp);

	return (0);
}
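/*
 * Illustrative sketch (not part of the original file): typical
 * reclaim-time usage of buf_invalidateblks() above -- push delayed
 * writes to disk, then invalidate everything, skipping nothing.
 */
#if 0
static int
example_reclaim_buffers(vnode_t vp)
{
	/* no slpflag/slptimeo: wait as long as needed for busy buffers */
	return (buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0));
}
#endif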
void
buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {

	(void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
	return;
}

int
buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
	buf_t	bp;
	int	writes_issued = 0;
	errno_t	error;
	int	busy = 0;
	struct	buflists local_iterblkhd;
	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
	int	any_locked = 0;

	if (flags & BUF_SKIP_LOCKED)
		lock_flags |= BAC_SKIP_LOCKED;
	if (flags & BUF_SKIP_NONLOCKED)
		lock_flags |= BAC_SKIP_NONLOCKED;
loop:
	lck_mtx_lock(buf_mtxp);

	if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
		while (!LIST_EMPTY(&local_iterblkhd)) {
			bp = LIST_FIRST(&local_iterblkhd);
			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);

			if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
				busy++;
			}
			if (error) {
				/*
				 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
				 * we may want to do something differently if a locked or unlocked
				 * buffer was encountered (depending on the arg specified).
				 * In this case, we know that one of those two was set, and the
				 * buf acquisition failed above.
				 *
				 * If it failed with EDEADLK, then save state which can be emitted
				 * later on to the caller.  Most callers should not care.
				 */
				if (error == EDEADLK) {
					any_locked++;
				}
				continue;
			}
			lck_mtx_unlock(buf_mtxp);

			bp->b_flags &= ~B_LOCKED;

			/*
			 * Wait for I/O associated with indirect blocks to complete,
			 * since there is no way to quickly wait for them below.
			 */
			if ((bp->b_vp == vp) || (wait == 0))
				(void) buf_bawrite(bp);
			else
				(void) VNOP_BWRITE(bp);
			writes_issued++;

			lck_mtx_lock(buf_mtxp);
		}
		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
	}
	lck_mtx_unlock(buf_mtxp);

	if (wait) {
		(void)vnode_waitforwrites(vp, 0, 0, 0, msg);

		if (vp->v_dirtyblkhd.lh_first && busy) {
			/*
			 * we had one or more BUSY buffers on
			 * the dirtyblock list... most likely
			 * these are due to delayed writes that
			 * were moved to the bclean queue but
			 * have not yet been 'written'.
			 * if we issued some writes on the
			 * previous pass, we try again immediately
			 * if we didn't, we'll sleep for some time
			 * to allow the state to change...
			 */
			if (writes_issued == 0) {
				(void)tsleep((caddr_t)&vp->v_numoutput,
					     PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
			}
			writes_issued = 0;
			busy = 0;

			goto loop;
		}
	}

	return any_locked;
}


/*
 * called with buf_mtxp held...
 * this lock protects the queue manipulation
 */
static int
buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
{
	struct buflists * listheadp;

	if (flags & VBI_DIRTY)
		listheadp = &vp->v_dirtyblkhd;
	else
		listheadp = &vp->v_cleanblkhd;

	while (vp->v_iterblkflags & VBI_ITER) {
		vp->v_iterblkflags |= VBI_ITERWANT;
		msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return(EINVAL);
	}
	vp->v_iterblkflags |= VBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return(0);
}

/*
 * called with buf_mtxp held...
 * this lock protects the queue manipulation
 */
static void
buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
{
	struct buflists * listheadp;
	buf_t bp;

	if (flags & VBI_DIRTY)
		listheadp = &vp->v_dirtyblkhd;
	else
		listheadp = &vp->v_cleanblkhd;

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, b_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
	}
	vp->v_iterblkflags &= ~VBI_ITER;

	if (vp->v_iterblkflags & VBI_ITERWANT) {
		vp->v_iterblkflags &= ~VBI_ITERWANT;
		wakeup(&vp->v_iterblkflags);
	}
}


static void
bremfree_locked(buf_t bp)
{
	struct bqueues *dp = NULL;
	int whichq;

	whichq = bp->b_whichq;

	if (whichq == -1) {
		if (bp->b_shadow_ref == 0)
			panic("bremfree_locked: %p not on freelist", bp);
		/*
		 * there are clones pointing to 'bp'...
		 * therefore, it was not put on a freelist
		 * when buf_brelse was last called on 'bp'
		 */
		return;
	}
	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
		dp = &bufqueues[whichq];

		if (dp->tqh_last != &bp->b_freelist.tqe_next)
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);

#if BALANCE_QUEUES
	bufqdec(whichq);
#endif
	if (whichq == BQ_LAUNDRY)
		blaundrycnt--;

	bp->b_whichq = -1;
	bp->b_timestamp = 0;
	bp->b_shadow = 0;
}

/*
 * Associate a buffer with a vnode.
 * buf_mtxp must be locked on entry
 */
static void
bgetvp_locked(vnode_t vp, buf_t bp)
{

	if (bp->b_vp != vp)
		panic("bgetvp_locked: not free");

	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 * buf_mtxp must be locked on entry
 */
static void
brelvp_locked(buf_t bp)
{
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);

	bp->b_vp = (vnode_t)NULL;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
static void
buf_reassign(buf_t bp, vnode_t newvp)
{
	struct buflists *listheadp;

	if (newvp == NULL) {
		printf("buf_reassign: NULL");
		return;
	}
	lck_mtx_lock_spin(buf_mtxp);

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if (ISSET(bp->b_flags, B_DELWRI))
		listheadp = &newvp->v_dirtyblkhd;
	else
		listheadp = &newvp->v_cleanblkhd;
	bufinsvn(bp, listheadp);

	lck_mtx_unlock(buf_mtxp);
}

static __inline__ void
bufhdrinit(buf_t bp)
{
	bzero((char *)bp, sizeof *bp);
	bp->b_dev = NODEV;
	bp->b_rcred = NOCRED;
	bp->b_wcred = NOCRED;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_flags = B_INVAL;

	return;
}

/*
 * Initialize buffers and hash links for buffers.
 */
__private_extern__ void
bufinit(void)
{
	buf_t	bp;
	struct bqueues *dp;
	int	i;

	nbuf_headers = 0;
	/* Initialize the buffer queues ('freelists') and the hash table */
	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);

	buf_busycount = 0;

	/* Initialize the buffer headers */
	for (i = 0; i < max_nbuf_headers; i++) {
		nbuf_headers++;
		bp = &buf_headers[i];
		bufhdrinit(bp);

		BLISTNONE(bp);
		dp = &bufqueues[BQ_EMPTY];
		bp->b_whichq = BQ_EMPTY;
		bp->b_timestamp = buf_timestamp();
		binsheadfree(bp, dp, BQ_EMPTY);
		binshash(bp, &invalhash);
	}
	boot_nbuf_headers = nbuf_headers;

	TAILQ_INIT(&iobufqueue);
	TAILQ_INIT(&delaybufqueue);

	for (; i < nbuf_headers + niobuf_headers; i++) {
		bp = &buf_headers[i];
		bufhdrinit(bp);
		bp->b_whichq = -1;
		binsheadfree(bp, &iobufqueue, -1);
	}

	/*
	 * allocate lock group attribute and group
	 */
	buf_mtx_grp_attr = lck_grp_attr_alloc_init();
	buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	buf_mtx_attr = lck_attr_alloc_init();

	/*
	 * allocate and initialize mutex's for the buffer and iobuffer pools
	 */
	buf_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
	iobuffer_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);

	if (iobuffer_mtxp == NULL)
		panic("couldn't create iobuffer mutex");

	if (buf_mtxp == NULL)
		panic("couldn't create buf mutex");

	/*
	 * allocate and initialize cluster specific global locks...
	 */
	cluster_init();

	printf("using %d buffer headers and %d cluster IO buffer headers\n",
		nbuf_headers, niobuf_headers);

	/* Set up zones used by the buffer cache */
	bufzoneinit();

	/* start the bcleanbuf() thread */
	bcleanbuf_thread_init();

#ifndef __arm__
	/* Register a callout for relieving vm pressure */
	if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
		panic("Couldn't register buffer cache callout for vm pressure!\n");
	}
#endif

#if BALANCE_QUEUES
	{
	static void bufq_balance_thread_init(void) __attribute__((section("__TEXT, initcode")));
	/* create a thread to do dynamic buffer queue balancing */
	bufq_balance_thread_init();
	}
#endif /* notyet */
}



/*
 * Zones for the meta data buffers
 */

#define MINMETA 512
#define MAXMETA 8192

struct meta_zone_entry {
	zone_t mz_zone;
	vm_size_t mz_size;
	vm_size_t mz_max;
	const char *mz_name;
};

struct meta_zone_entry meta_zones[] = {
	{NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
	{NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
	{NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
	{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
	{NULL, 0, 0, "" } /* End */
};

/*
 * Initialize the meta data zones
 */
static void
bufzoneinit(void)
{
	int i;

	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		meta_zones[i].mz_zone =
				zinit(meta_zones[i].mz_size,
					meta_zones[i].mz_max,
					PAGE_SIZE,
					meta_zones[i].mz_name);
		zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
	}
	buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
	zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
}

static __inline__ zone_t
getbufzone(size_t size)
{
	int i;

	if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
		panic("getbufzone: incorrect size = %lu", size);

	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		if (meta_zones[i].mz_size >= size)
			break;
	}

	return (meta_zones[i].mz_zone);
}
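/*
 * Illustrative sketch (not part of the original file): how getbufzone()
 * above resolves a request against the meta_zones[] table.  A size is
 * rounded up to the first zone whose element size can hold it, so a
 * 1536-byte request (a multiple of 512) comes out of "buf.2048".
 */
#if 0
static void
example_zone_lookups(void)
{
	zone_t z1 = getbufzone(512);	/* exact fit:  "buf.512"  */
	zone_t z2 = getbufzone(1536);	/* rounds up:  "buf.2048" */
	zone_t z3 = getbufzone(8192);	/* largest:    "buf.8192" */

	(void)z1; (void)z2; (void)z3;
}
#endif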
static struct buf *
bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
{
	buf_t	bp;

	bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);

	/*
	 * If buffer does not have data valid, start a read.
	 * Note that if buffer is B_INVAL, buf_getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		struct proc *p;

		p = current_proc();

		/* Start I/O for the buffer (keeping credentials). */
		SET(bp->b_flags, B_READ | async);
		if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
			kauth_cred_ref(cred);
			bp->b_rcred = cred;
		}

		VNOP_STRATEGY(bp);

		trace(TR_BREADMISS, pack(vp, size), blkno);

		/* Pay for the read. */
		if (p && p->p_stats)
			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock);	/* XXX */

		if (async) {
			/*
			 * since we asked for an ASYNC I/O
			 * the biodone will do the brelse
			 * we don't want to pass back a bp
			 * that we don't 'own'
			 */
			bp = NULL;
		}
	} else if (async) {
		buf_brelse(bp);
		bp = NULL;
	}

	trace(TR_BREADHIT, pack(vp, size), blkno);

	return (bp);
}

/*
 * Perform the reads for buf_breadn() and buf_meta_breadn().
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
static errno_t
do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
		   int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
{
	buf_t	bp;
	int	i;

	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
	}

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (buf_biowait(bp));
}


/*
 * Read a disk block.
 * This algorithm described in Bach (p.54).
 */
errno_t
buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
{
	buf_t	bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);

	/* Wait for the read to complete, and return result. */
	return (buf_biowait(bp));
}

/*
 * Read a disk block. [bread() for meta-data]
 * This algorithm described in Bach (p.54).
 */
errno_t
buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
{
	buf_t	bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);

	/* Wait for the read to complete, and return result. */
	return (buf_biowait(bp));
}

/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 */
errno_t
buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
{
	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
}

/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 * [buf_breadn() for meta-data]
 */
errno_t
buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
{
	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
}
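/*
 * Illustrative sketch (not part of the original file): the classic
 * bread/brelse cycle built on buf_bread() above.  The block number and
 * size are hypothetical; buf_bread() hands back a buffer even when the
 * read fails, so it must be released on both paths.
 */
#if 0
static errno_t
example_read_block_zero(vnode_t vp, kauth_cred_t cred)
{
	buf_t	bp;
	errno_t	error;

	if ((error = buf_bread(vp, (daddr64_t)0, 4096, cred, &bp))) {
		buf_brelse(bp);
		return (error);
	}
	/* data is now valid at buf_dataptr(bp) for inspection */
	buf_brelse(bp);
	return (0);
}
#endif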
/*
 * Block write.  Described in Bach (p.56)
 */
errno_t
buf_bwrite(buf_t bp)
{
	int	sync, wasdelayed;
	errno_t	rv;
	proc_t	p = current_proc();
	vnode_t	vp = bp->b_vp;

	if (bp->b_datap == 0) {
		if (brecover_data(bp) == 0)
			return (0);
	}
	/* Remember buffer type, to switch on it later. */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

	if (wasdelayed)
		OSAddAtomicLong(-1, &nbdwrite);

	if (!sync) {
		/*
		 * If not synchronous, pay for the I/O operation and make
		 * sure the buf is on the correct vnode queue.  We have
		 * to do this now, because if we don't, the vnode may not
		 * be properly notified that its I/O has completed.
		 */
		if (wasdelayed)
			buf_reassign(bp, vp);
		else
		if (p && p->p_stats)
			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
	}
	trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);

	/* Initiate disk write.  Make sure the appropriate party is charged. */

	OSAddAtomic(1, &vp->v_numoutput);

	VNOP_STRATEGY(bp);

	if (sync) {
		/*
		 * If I/O was synchronous, wait for it to complete.
		 */
		rv = buf_biowait(bp);

		/*
		 * Pay for the I/O operation, if it's not been paid for, and
		 * make sure it's on the correct vnode queue. (asynchronous
		 * operations were paid for above.)
		 */
		if (wasdelayed)
			buf_reassign(bp, vp);
		else
		if (p && p->p_stats)
			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */

		/* Release the buffer. */
		// XXXdbg - only if the unused bit is set
		if (!ISSET(bp->b_flags, B_NORELSE)) {
			buf_brelse(bp);
		} else {
			CLR(bp->b_flags, B_NORELSE);
		}

		return (rv);
	} else {
		return (0);
	}
}

int
vn_bwrite(struct vnop_bwrite_args *ap)
{
	return (buf_bwrite(ap->a_bp));
}
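/*
 * Illustrative sketch (not part of the original file): the three write
 * flavors implemented in this file differ only in when the I/O is issued
 * and who waits for it.  The dispatcher below is hypothetical.
 */
#if 0
static errno_t
example_write(buf_t bp, int how)
{
	switch (how) {
	case 0:
		return (buf_bwrite(bp));	/* issue now, wait for completion */
	case 1:
		return (buf_bawrite(bp));	/* issue now, don't wait */
	default:
		return (buf_bdwrite(bp));	/* mark dirty, write later */
	}
}
#endif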
DiskImages), 2317 * because additional buffers are dynamically allocated to prevent 2318 * deadlocks from occurring 2319 * 2320 * however, can't do a buf_bawrite() if the LOCKED bit is set because the 2321 * buffer is part of a transaction and can't go to disk until 2322 * the LOCKED bit is cleared. 2323 */ 2324 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) { 2325 if (return_error) 2326 return (EAGAIN); 2327 /* 2328 * If the vnode has "too many" write operations in progress 2329 * wait for them to finish the IO 2330 */ 2331 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite"); 2332 2333 return (buf_bawrite(bp)); 2334 } 2335 2336 /* Otherwise, the "write" is done, so mark and release the buffer. */ 2337 SET(bp->b_flags, B_DONE); 2338 buf_brelse(bp); 2339 return (0); 2340} 2341 2342errno_t 2343buf_bdwrite(buf_t bp) 2344{ 2345 return (bdwrite_internal(bp, 0)); 2346} 2347 2348 2349/* 2350 * Asynchronous block write; just an asynchronous buf_bwrite(). 2351 * 2352 * Note: With the abilitty to allocate additional buffer 2353 * headers, we can get in to the situation where "too" many 2354 * buf_bawrite()s can create situation where the kernel can create 2355 * buffers faster than the disks can service. 2356 * We limit the number of "in flight" writes a vnode can have to 2357 * avoid this. 2358 */ 2359static int 2360bawrite_internal(buf_t bp, int throttle) 2361{ 2362 vnode_t vp = bp->b_vp; 2363 2364 if (vp) { 2365 if (throttle) 2366 /* 2367 * If the vnode has "too many" write operations in progress 2368 * wait for them to finish the IO 2369 */ 2370 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite"); 2371 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE) 2372 /* 2373 * return to the caller and 2374 * let him decide what to do 2375 */ 2376 return (EWOULDBLOCK); 2377 } 2378 SET(bp->b_flags, B_ASYNC); 2379 2380 return (VNOP_BWRITE(bp)); 2381} 2382 2383errno_t 2384buf_bawrite(buf_t bp) 2385{ 2386 return (bawrite_internal(bp, 1)); 2387} 2388 2389 2390 2391static void 2392buf_free_meta_store(buf_t bp) 2393{ 2394 if (bp->b_bufsize) { 2395 if (ISSET(bp->b_flags, B_ZALLOC)) { 2396 zone_t z; 2397 2398 z = getbufzone(bp->b_bufsize); 2399 zfree(z, (void *)bp->b_datap); 2400 } else 2401 kmem_free(kernel_map, bp->b_datap, bp->b_bufsize); 2402 2403 bp->b_datap = (uintptr_t)NULL; 2404 bp->b_bufsize = 0; 2405 } 2406} 2407 2408 2409static buf_t 2410buf_brelse_shadow(buf_t bp) 2411{ 2412 buf_t bp_head; 2413 buf_t bp_temp; 2414 buf_t bp_return = NULL; 2415#ifdef BUF_MAKE_PRIVATE 2416 buf_t bp_data; 2417 int data_ref = 0; 2418#endif 2419 int need_wakeup = 0; 2420 2421 lck_mtx_lock_spin(buf_mtxp); 2422 2423 bp_head = (buf_t)bp->b_orig; 2424 2425 if (bp_head->b_whichq != -1) 2426 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq); 2427 2428#ifdef BUF_MAKE_PRIVATE 2429 if (bp_data = bp->b_data_store) { 2430 bp_data->b_data_ref--; 2431 /* 2432 * snapshot the ref count so that we can check it 2433 * outside of the lock... 
		 * we only want the guy going
2434		 * from 1 -> 0 to try and release the storage
2435		 */
2436		data_ref = bp_data->b_data_ref;
2437	}
2438 #endif
2439	KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2440
2441	bp_head->b_shadow_ref--;
2442
2443	for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2444
2445	if (bp_temp == NULL)
2446		panic("buf_brelse_shadow: bp not on list %p", bp_head);
2447
2448	bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2449
2450 #ifdef BUF_MAKE_PRIVATE
2451	/*
2452	 * we're about to free the current 'owner' of the data buffer and
2453	 * there is at least one other shadow buf_t still pointing at it
2454	 * so transfer it to the first shadow buf left in the chain
2455	 */
2456	if (bp == bp_data && data_ref) {
2457		if ((bp_data = bp_head->b_shadow) == NULL)
2458			panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2459
2460		for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2461			bp_temp->b_data_store = bp_data;
2462		bp_data->b_data_ref = data_ref;
2463	}
2464 #endif
2465	if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
2466		panic("buf_brelse_shadow: b_shadow != NULL && b_shadow_ref == 0  bp(%p)", bp);
2467	if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
2468		panic("buf_brelse_shadow: b_shadow == NULL && b_shadow_ref != 0  bp(%p)", bp);
2469
2470	if (bp_head->b_shadow_ref == 0) {
2471		if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2472
2473			CLR(bp_head->b_flags, B_AGE);
2474			bp_head->b_timestamp = buf_timestamp();
2475
2476			if (ISSET(bp_head->b_flags, B_LOCKED)) {
2477				bp_head->b_whichq = BQ_LOCKED;
2478				binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2479			} else {
2480				bp_head->b_whichq = BQ_META;
2481				binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2482			}
2483		} else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2484			CLR(bp_head->b_lflags, BL_WAITSHADOW);
2485
2486			bp_return = bp_head;
2487		}
2488		if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2489			CLR(bp_head->b_lflags, BL_WANTED_REF);
2490			need_wakeup = 1;
2491		}
2492	}
2493	lck_mtx_unlock(buf_mtxp);
2494
2495	if (need_wakeup) {
2496		wakeup(bp_head);
2497	}
2498
2499 #ifdef BUF_MAKE_PRIVATE
2500	if (bp == bp_data && data_ref == 0)
2501		buf_free_meta_store(bp);
2502
2503	bp->b_data_store = NULL;
2504 #endif
2505	KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2506
2507	return (bp_return);
2508 }
2509
2510
2511 /*
2512  * Release a buffer onto the free lists.
2513  * Described in Bach (p. 46).
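 *
 * Every buffer obtained from buf_getblk()/buf_bread() and not handed off
 * to I/O completion must eventually be released through here.  A typical
 * caller pattern (illustrative only):
 *
 *	error = buf_bread(vp, blkno, size, cred, &bp);
 *	if (!error)
 *		... inspect the data at buf_dataptr(bp) ...
 *	buf_brelse(bp);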
2514 */ 2515void 2516buf_brelse(buf_t bp) 2517{ 2518 struct bqueues *bufq; 2519 long whichq; 2520 upl_t upl; 2521 int need_wakeup = 0; 2522 int need_bp_wakeup = 0; 2523 2524 2525 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY)) 2526 panic("buf_brelse: bad buffer = %p\n", bp); 2527 2528#ifdef JOE_DEBUG 2529 (void) OSBacktrace(&bp->b_stackbrelse[0], 6); 2530 2531 bp->b_lastbrelse = current_thread(); 2532 bp->b_tag = 0; 2533#endif 2534 if (bp->b_lflags & BL_IOBUF) { 2535 buf_t shadow_master_bp = NULL; 2536 2537 if (ISSET(bp->b_lflags, BL_SHADOW)) 2538 shadow_master_bp = buf_brelse_shadow(bp); 2539 else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC)) 2540 buf_free_meta_store(bp); 2541 free_io_buf(bp); 2542 2543 if (shadow_master_bp) { 2544 bp = shadow_master_bp; 2545 goto finish_shadow_master; 2546 } 2547 return; 2548 } 2549 2550 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START, 2551 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap, 2552 bp->b_flags, 0); 2553 2554 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 2555 2556 /* 2557 * if we're invalidating a buffer that has the B_FILTER bit 2558 * set then call the b_iodone function so it gets cleaned 2559 * up properly. 2560 * 2561 * the HFS journal code depends on this 2562 */ 2563 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { 2564 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */ 2565 void (*iodone_func)(struct buf *, void *) = bp->b_iodone; 2566 void *arg = bp->b_transaction; 2567 2568 CLR(bp->b_flags, B_FILTER); /* but note callout done */ 2569 bp->b_iodone = NULL; 2570 bp->b_transaction = NULL; 2571 2572 if (iodone_func == NULL) { 2573 panic("brelse: bp @ %p has NULL b_iodone!\n", bp); 2574 } 2575 (*iodone_func)(bp, arg); 2576 } 2577 } 2578 /* 2579 * I/O is done. Cleanup the UPL state 2580 */ 2581 upl = bp->b_upl; 2582 2583 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { 2584 kern_return_t kret; 2585 int upl_flags; 2586 2587 if (upl == NULL) { 2588 if ( !ISSET(bp->b_flags, B_INVAL)) { 2589 kret = ubc_create_upl(bp->b_vp, 2590 ubc_blktooff(bp->b_vp, bp->b_lblkno), 2591 bp->b_bufsize, 2592 &upl, 2593 NULL, 2594 UPL_PRECIOUS); 2595 2596 if (kret != KERN_SUCCESS) 2597 panic("brelse: Failed to create UPL"); 2598#if UPL_DEBUG 2599 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5); 2600#endif /* UPL_DEBUG */ 2601 } 2602 } else { 2603 if (bp->b_datap) { 2604 kret = ubc_upl_unmap(upl); 2605 2606 if (kret != KERN_SUCCESS) 2607 panic("ubc_upl_unmap failed"); 2608 bp->b_datap = (uintptr_t)NULL; 2609 } 2610 } 2611 if (upl) { 2612 if (bp->b_flags & (B_ERROR | B_INVAL)) { 2613 if (bp->b_flags & (B_READ | B_INVAL)) 2614 upl_flags = UPL_ABORT_DUMP_PAGES; 2615 else 2616 upl_flags = 0; 2617 2618 ubc_upl_abort(upl, upl_flags); 2619 } else { 2620 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY)) 2621 upl_flags = UPL_COMMIT_SET_DIRTY ; 2622 else 2623 upl_flags = UPL_COMMIT_CLEAR_DIRTY ; 2624 2625 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags | 2626 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); 2627 } 2628 bp->b_upl = NULL; 2629 } 2630 } else { 2631 if ( (upl) ) 2632 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp); 2633 } 2634 2635 /* 2636 * If it's locked, don't report an error; try again later. 2637 */ 2638 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR)) 2639 CLR(bp->b_flags, B_ERROR); 2640 /* 2641 * If it's not cacheable, or an error, mark it invalid. 
2642 */ 2643 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR))) 2644 SET(bp->b_flags, B_INVAL); 2645 2646 if ((bp->b_bufsize <= 0) || 2647 ISSET(bp->b_flags, B_INVAL) || 2648 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) { 2649 2650 boolean_t delayed_buf_free_meta_store = FALSE; 2651 2652 /* 2653 * If it's invalid or empty, dissociate it from its vnode, 2654 * release its storage if B_META, and 2655 * clean it up a bit and put it on the EMPTY queue 2656 */ 2657 if (ISSET(bp->b_flags, B_DELWRI)) 2658 OSAddAtomicLong(-1, &nbdwrite); 2659 2660 if (ISSET(bp->b_flags, B_META)) { 2661 if (bp->b_shadow_ref) 2662 delayed_buf_free_meta_store = TRUE; 2663 else 2664 buf_free_meta_store(bp); 2665 } 2666 /* 2667 * nuke any credentials we were holding 2668 */ 2669 buf_release_credentials(bp); 2670 2671 lck_mtx_lock_spin(buf_mtxp); 2672 2673 if (bp->b_shadow_ref) { 2674 SET(bp->b_lflags, BL_WAITSHADOW); 2675 2676 lck_mtx_unlock(buf_mtxp); 2677 2678 return; 2679 } 2680 if (delayed_buf_free_meta_store == TRUE) { 2681 2682 lck_mtx_unlock(buf_mtxp); 2683finish_shadow_master: 2684 buf_free_meta_store(bp); 2685 2686 lck_mtx_lock_spin(buf_mtxp); 2687 } 2688 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); 2689 2690 if (bp->b_vp) 2691 brelvp_locked(bp); 2692 2693 bremhash(bp); 2694 BLISTNONE(bp); 2695 binshash(bp, &invalhash); 2696 2697 bp->b_whichq = BQ_EMPTY; 2698 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); 2699 } else { 2700 2701 /* 2702 * It has valid data. Put it on the end of the appropriate 2703 * queue, so that it'll stick around for as long as possible. 2704 */ 2705 if (ISSET(bp->b_flags, B_LOCKED)) 2706 whichq = BQ_LOCKED; /* locked in core */ 2707 else if (ISSET(bp->b_flags, B_META)) 2708 whichq = BQ_META; /* meta-data */ 2709 else if (ISSET(bp->b_flags, B_AGE)) 2710 whichq = BQ_AGE; /* stale but valid data */ 2711 else 2712 whichq = BQ_LRU; /* valid data */ 2713 bufq = &bufqueues[whichq]; 2714 2715 bp->b_timestamp = buf_timestamp(); 2716 2717 lck_mtx_lock_spin(buf_mtxp); 2718 2719 /* 2720 * the buf_brelse_shadow routine doesn't take 'ownership' 2721 * of the parent buf_t... it updates state that is protected by 2722 * the buf_mtxp, and checks for BL_BUSY to determine whether to 2723 * put the buf_t back on a free list. b_shadow_ref is protected 2724 * by the lock, and since we have not yet cleared B_BUSY, we need 2725 * to check it while holding the lock to insure that one of us 2726 * puts this buf_t back on a free list when it is safe to do so 2727 */ 2728 if (bp->b_shadow_ref == 0) { 2729 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); 2730 bp->b_whichq = whichq; 2731 binstailfree(bp, bufq, whichq); 2732 } else { 2733 /* 2734 * there are still cloned buf_t's pointing 2735 * at this guy... need to keep it off the 2736 * freelists until a buf_brelse is done on 2737 * the last clone 2738 */ 2739 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE)); 2740 } 2741 } 2742 if (needbuffer) { 2743 /* 2744 * needbuffer is a global 2745 * we're currently using buf_mtxp to protect it 2746 * delay doing the actual wakeup until after 2747 * we drop buf_mtxp 2748 */ 2749 needbuffer = 0; 2750 need_wakeup = 1; 2751 } 2752 if (ISSET(bp->b_lflags, BL_WANTED)) { 2753 /* 2754 * delay the actual wakeup until after we 2755 * clear BL_BUSY and we've dropped buf_mtxp 2756 */ 2757 need_bp_wakeup = 1; 2758 } 2759 /* 2760 * Unlock the buffer. 
2761 */ 2762 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); 2763 buf_busycount--; 2764 2765 lck_mtx_unlock(buf_mtxp); 2766 2767 if (need_wakeup) { 2768 /* 2769 * Wake up any processes waiting for any buffer to become free. 2770 */ 2771 wakeup(&needbuffer); 2772 } 2773 if (need_bp_wakeup) { 2774 /* 2775 * Wake up any proceeses waiting for _this_ buffer to become free. 2776 */ 2777 wakeup(bp); 2778 } 2779 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END, 2780 bp, bp->b_datap, bp->b_flags, 0, 0); 2781} 2782 2783/* 2784 * Determine if a block is in the cache. 2785 * Just look on what would be its hash chain. If it's there, return 2786 * a pointer to it, unless it's marked invalid. If it's marked invalid, 2787 * we normally don't return the buffer, unless the caller explicitly 2788 * wants us to. 2789 */ 2790static boolean_t 2791incore(vnode_t vp, daddr64_t blkno) 2792{ 2793 boolean_t retval; 2794 struct bufhashhdr *dp; 2795 2796 dp = BUFHASH(vp, blkno); 2797 2798 lck_mtx_lock_spin(buf_mtxp); 2799 2800 if (incore_locked(vp, blkno, dp)) 2801 retval = TRUE; 2802 else 2803 retval = FALSE; 2804 lck_mtx_unlock(buf_mtxp); 2805 2806 return (retval); 2807} 2808 2809 2810static buf_t 2811incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp) 2812{ 2813 struct buf *bp; 2814 2815 /* Search hash chain */ 2816 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) { 2817 if (bp->b_lblkno == blkno && bp->b_vp == vp && 2818 !ISSET(bp->b_flags, B_INVAL)) { 2819 return (bp); 2820 } 2821 } 2822 return (NULL); 2823} 2824 2825void 2826buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno) 2827{ 2828 buf_t bp; 2829 struct bufhashhdr *dp; 2830 2831 dp = BUFHASH(vp, blkno); 2832 2833 lck_mtx_lock_spin(buf_mtxp); 2834 2835 for (;;) { 2836 if ((bp = incore_locked(vp, blkno, dp)) == NULL) 2837 break; 2838 2839 if (bp->b_shadow_ref == 0) 2840 break; 2841 2842 SET(bp->b_lflags, BL_WANTED_REF); 2843 2844 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL); 2845 } 2846 lck_mtx_unlock(buf_mtxp); 2847} 2848 2849/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */ 2850/* 2851 * Get a block of requested size that is associated with 2852 * a given vnode and block offset. If it is found in the 2853 * block cache, mark it as having been found, make it busy 2854 * and return it. Otherwise, return an empty block of the 2855 * correct size. It is up to the caller to insure that the 2856 * cached blocks be of the correct size. 2857 */ 2858buf_t 2859buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation) 2860{ 2861 buf_t bp; 2862 int err; 2863 upl_t upl; 2864 upl_page_info_t *pl; 2865 kern_return_t kret; 2866 int ret_only_valid; 2867 struct timespec ts; 2868 int upl_flags; 2869 struct bufhashhdr *dp; 2870 2871 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START, 2872 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0); 2873 2874 ret_only_valid = operation & BLK_ONLYVALID; 2875 operation &= ~BLK_ONLYVALID; 2876 dp = BUFHASH(vp, blkno); 2877start: 2878 lck_mtx_lock_spin(buf_mtxp); 2879 2880 if ((bp = incore_locked(vp, blkno, dp))) { 2881 /* 2882 * Found in the Buffer Cache 2883 */ 2884 if (ISSET(bp->b_lflags, BL_BUSY)) { 2885 /* 2886 * but is busy 2887 */ 2888 switch (operation) { 2889 case BLK_READ: 2890 case BLK_WRITE: 2891 case BLK_META: 2892 SET(bp->b_lflags, BL_WANTED); 2893 bufstats.bufs_busyincore++; 2894 2895 /* 2896 * don't retake the mutex after being awakened... 
2897 * the time out is in msecs 2898 */ 2899 ts.tv_sec = (slptimeo/1000); 2900 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000; 2901 2902 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE, 2903 (uintptr_t)blkno, size, operation, 0, 0); 2904 2905 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts); 2906 2907 /* 2908 * Callers who call with PCATCH or timeout are 2909 * willing to deal with the NULL pointer 2910 */ 2911 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo))) 2912 return (NULL); 2913 goto start; 2914 /*NOTREACHED*/ 2915 break; 2916 2917 default: 2918 /* 2919 * unknown operation requested 2920 */ 2921 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation); 2922 /*NOTREACHED*/ 2923 break; 2924 } 2925 } else { 2926 /* 2927 * buffer in core and not busy 2928 */ 2929 SET(bp->b_lflags, BL_BUSY); 2930 SET(bp->b_flags, B_CACHE); 2931 buf_busycount++; 2932 2933 bremfree_locked(bp); 2934 bufstats.bufs_incore++; 2935 2936 lck_mtx_unlock(buf_mtxp); 2937#ifdef JOE_DEBUG 2938 bp->b_owner = current_thread(); 2939 bp->b_tag = 1; 2940#endif 2941 if ( (bp->b_upl) ) 2942 panic("buffer has UPL, but not marked BUSY: %p", bp); 2943 2944 if ( !ret_only_valid && bp->b_bufsize != size) 2945 allocbuf(bp, size); 2946 2947 upl_flags = 0; 2948 switch (operation) { 2949 case BLK_WRITE: 2950 /* 2951 * "write" operation: let the UPL subsystem 2952 * know that we intend to modify the buffer 2953 * cache pages we're gathering. 2954 */ 2955 upl_flags |= UPL_WILL_MODIFY; 2956 case BLK_READ: 2957 upl_flags |= UPL_PRECIOUS; 2958 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) { 2959 kret = ubc_create_upl(vp, 2960 ubc_blktooff(vp, bp->b_lblkno), 2961 bp->b_bufsize, 2962 &upl, 2963 &pl, 2964 upl_flags); 2965 if (kret != KERN_SUCCESS) 2966 panic("Failed to create UPL"); 2967 2968 bp->b_upl = upl; 2969 2970 if (upl_valid_page(pl, 0)) { 2971 if (upl_dirty_page(pl, 0)) 2972 SET(bp->b_flags, B_WASDIRTY); 2973 else 2974 CLR(bp->b_flags, B_WASDIRTY); 2975 } else 2976 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI)); 2977 2978 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap)); 2979 2980 if (kret != KERN_SUCCESS) 2981 panic("getblk: ubc_upl_map() failed with (%d)", kret); 2982 } 2983 break; 2984 2985 case BLK_META: 2986 /* 2987 * VM is not involved in IO for the meta data 2988 * buffer already has valid data 2989 */ 2990 break; 2991 2992 default: 2993 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation); 2994 /*NOTREACHED*/ 2995 break; 2996 } 2997 } 2998 } else { /* not incore() */ 2999 int queue = BQ_EMPTY; /* Start with no preference */ 3000 3001 if (ret_only_valid) { 3002 lck_mtx_unlock(buf_mtxp); 3003 return (NULL); 3004 } 3005 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/) 3006 operation = BLK_META; 3007 3008 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL) 3009 goto start; 3010 3011 /* 3012 * getnewbuf may block for a number of different reasons... 3013 * if it does, it's then possible for someone else to 3014 * create a buffer for the same block and insert it into 3015 * the hash... 
if we see it incore at this point we dump 3016 * the buffer we were working on and start over 3017 */ 3018 if (incore_locked(vp, blkno, dp)) { 3019 SET(bp->b_flags, B_INVAL); 3020 binshash(bp, &invalhash); 3021 3022 lck_mtx_unlock(buf_mtxp); 3023 3024 buf_brelse(bp); 3025 goto start; 3026 } 3027 /* 3028 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN 3029 * CALLED! BE CAREFUL. 3030 */ 3031 3032 /* 3033 * mark the buffer as B_META if indicated 3034 * so that when buffer is released it will goto META queue 3035 */ 3036 if (operation == BLK_META) 3037 SET(bp->b_flags, B_META); 3038 3039 bp->b_blkno = bp->b_lblkno = blkno; 3040 bp->b_vp = vp; 3041 3042 /* 3043 * Insert in the hash so that incore() can find it 3044 */ 3045 binshash(bp, BUFHASH(vp, blkno)); 3046 3047 bgetvp_locked(vp, bp); 3048 3049 lck_mtx_unlock(buf_mtxp); 3050 3051 allocbuf(bp, size); 3052 3053 upl_flags = 0; 3054 switch (operation) { 3055 case BLK_META: 3056 /* 3057 * buffer data is invalid... 3058 * 3059 * I don't want to have to retake buf_mtxp, 3060 * so the miss and vmhits counters are done 3061 * with Atomic updates... all other counters 3062 * in bufstats are protected with either 3063 * buf_mtxp or iobuffer_mtxp 3064 */ 3065 OSAddAtomicLong(1, &bufstats.bufs_miss); 3066 break; 3067 3068 case BLK_WRITE: 3069 /* 3070 * "write" operation: let the UPL subsystem know 3071 * that we intend to modify the buffer cache pages 3072 * we're gathering. 3073 */ 3074 upl_flags |= UPL_WILL_MODIFY; 3075 case BLK_READ: 3076 { off_t f_offset; 3077 size_t contig_bytes; 3078 int bmap_flags; 3079 3080 if ( (bp->b_upl) ) 3081 panic("bp already has UPL: %p",bp); 3082 3083 f_offset = ubc_blktooff(vp, blkno); 3084 3085 upl_flags |= UPL_PRECIOUS; 3086 kret = ubc_create_upl(vp, 3087 f_offset, 3088 bp->b_bufsize, 3089 &upl, 3090 &pl, 3091 upl_flags); 3092 3093 if (kret != KERN_SUCCESS) 3094 panic("Failed to create UPL"); 3095#if UPL_DEBUG 3096 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4); 3097#endif /* UPL_DEBUG */ 3098 bp->b_upl = upl; 3099 3100 if (upl_valid_page(pl, 0)) { 3101 3102 if (operation == BLK_READ) 3103 bmap_flags = VNODE_READ; 3104 else 3105 bmap_flags = VNODE_WRITE; 3106 3107 SET(bp->b_flags, B_CACHE | B_DONE); 3108 3109 OSAddAtomicLong(1, &bufstats.bufs_vmhits); 3110 3111 bp->b_validoff = 0; 3112 bp->b_dirtyoff = 0; 3113 3114 if (upl_dirty_page(pl, 0)) { 3115 /* page is dirty */ 3116 SET(bp->b_flags, B_WASDIRTY); 3117 3118 bp->b_validend = bp->b_bcount; 3119 bp->b_dirtyend = bp->b_bcount; 3120 } else { 3121 /* page is clean */ 3122 bp->b_validend = bp->b_bcount; 3123 bp->b_dirtyend = 0; 3124 } 3125 /* 3126 * try to recreate the physical block number associated with 3127 * this buffer... 
3128					 */
3129					if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3130						panic("getblk: VNOP_BLOCKMAP failed");
3131					/*
3132					 * if the extent represented by this buffer
3133					 * is not completely physically contiguous on
3134					 * disk, then we can't cache the physical mapping
3135					 * in the buffer header
3136					 */
3137					if ((long)contig_bytes < bp->b_bcount)
3138						bp->b_blkno = bp->b_lblkno;
3139				} else {
3140					OSAddAtomicLong(1, &bufstats.bufs_miss);
3141				}
3142				kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3143
3144				if (kret != KERN_SUCCESS)
3145					panic("getblk: ubc_upl_map() failed with (%d)", kret);
3146				break;
3147			}
3148			default:
3149				panic("getblk: paging or unknown operation - %x", operation);
3150				/*NOTREACHED*/
3151				break;
3152			}
3153	}
3154	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3155		     bp, bp->b_datap, bp->b_flags, 3, 0);
3156
3157 #ifdef JOE_DEBUG
3158	(void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3159 #endif
3160	return (bp);
3161 }
3162
3163 /*
3164  * Get an empty, disassociated buffer of given size.
3165  */
3166 buf_t
3167 buf_geteblk(int size)
3168 {
3169	buf_t	bp = NULL;
3170	int queue = BQ_EMPTY;
3171
3172	do {
3173		lck_mtx_lock_spin(buf_mtxp);
3174
3175		bp = getnewbuf(0, 0, &queue);
3176	} while (bp == NULL);
3177
3178	SET(bp->b_flags, (B_META|B_INVAL));
3179
3180 #if DIAGNOSTIC
3181	assert(queue == BQ_EMPTY);
3182 #endif /* DIAGNOSTIC */
3183	/* XXX need to implement logic to deal with other queues */
3184
3185	binshash(bp, &invalhash);
3186	bufstats.bufs_eblk++;
3187
3188	lck_mtx_unlock(buf_mtxp);
3189
3190	allocbuf(bp, size);
3191
3192	return (bp);
3193 }
3194
3195 uint32_t
3196 buf_redundancy_flags(buf_t bp)
3197 {
3198	return bp->b_redundancy_flags;
3199 }
3200
3201 void
3202 buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3203 {
3204	SET(bp->b_redundancy_flags, flags);
3205 }
3206
3207 void
3208 buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3209 {
3210	CLR(bp->b_redundancy_flags, flags);
3211 }
3212
3213 /*
3214  * With UBC, there is no need to expand / shrink the file data
3215  * buffer. The VM uses the same pages, hence no waste.
3216  * All the file data buffers can have one size.
3217  * In fact expand / shrink would be an expensive operation.
3218  *
3219  * Only exception to this is meta-data buffers. Most of the
3220  * meta data operations are smaller than PAGE_SIZE. Having the
3221  * meta-data buffers grow and shrink as needed, optimizes use
3222  * of the kernel wired memory.
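 *
 * Meta-data buffers of MAXMETA bytes or less are carved out of
 * per-size zalloc() zones (see getbufzone()); larger requests are
 * backed directly by kmem_alloc_kobject().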
3223 */ 3224 3225int 3226allocbuf(buf_t bp, int size) 3227{ 3228 vm_size_t desired_size; 3229 3230 desired_size = roundup(size, CLBYTES); 3231 3232 if (desired_size < PAGE_SIZE) 3233 desired_size = PAGE_SIZE; 3234 if (desired_size > MAXBSIZE) 3235 panic("allocbuf: buffer larger than MAXBSIZE requested"); 3236 3237 if (ISSET(bp->b_flags, B_META)) { 3238 zone_t zprev, z; 3239 int nsize = roundup(size, MINMETA); 3240 3241 if (bp->b_datap) { 3242 vm_offset_t elem = (vm_offset_t)bp->b_datap; 3243 3244 if (ISSET(bp->b_flags, B_ZALLOC)) { 3245 if (bp->b_bufsize < nsize) { 3246 /* reallocate to a bigger size */ 3247 3248 zprev = getbufzone(bp->b_bufsize); 3249 if (nsize <= MAXMETA) { 3250 desired_size = nsize; 3251 z = getbufzone(nsize); 3252 /* b_datap not really a ptr */ 3253 *(void **)(&bp->b_datap) = zalloc(z); 3254 } else { 3255 bp->b_datap = (uintptr_t)NULL; 3256 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); 3257 CLR(bp->b_flags, B_ZALLOC); 3258 } 3259 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize); 3260 zfree(zprev, (void *)elem); 3261 } else { 3262 desired_size = bp->b_bufsize; 3263 } 3264 3265 } else { 3266 if ((vm_size_t)bp->b_bufsize < desired_size) { 3267 /* reallocate to a bigger size */ 3268 bp->b_datap = (uintptr_t)NULL; 3269 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); 3270 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize); 3271 kmem_free(kernel_map, elem, bp->b_bufsize); 3272 } else { 3273 desired_size = bp->b_bufsize; 3274 } 3275 } 3276 } else { 3277 /* new allocation */ 3278 if (nsize <= MAXMETA) { 3279 desired_size = nsize; 3280 z = getbufzone(nsize); 3281 /* b_datap not really a ptr */ 3282 *(void **)(&bp->b_datap) = zalloc(z); 3283 SET(bp->b_flags, B_ZALLOC); 3284 } else 3285 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size); 3286 } 3287 3288 if (bp->b_datap == 0) 3289 panic("allocbuf: NULL b_datap"); 3290 } 3291 bp->b_bufsize = desired_size; 3292 bp->b_bcount = size; 3293 3294 return (0); 3295} 3296 3297/* 3298 * Get a new buffer from one of the free lists. 3299 * 3300 * Request for a queue is passes in. The queue from which the buffer was taken 3301 * from is returned. Out of range queue requests get BQ_EMPTY. Request for 3302 * BQUEUE means no preference. Use heuristics in that case. 3303 * Heuristics is as follows: 3304 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order. 3305 * If none available block till one is made available. 3306 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps. 3307 * Pick the most stale buffer. 3308 * If found buffer was marked delayed write, start the async. write 3309 * and restart the search. 3310 * Initialize the fields and disassociate the buffer from the vnode. 3311 * Remove the buffer from the hash. Return the buffer and the queue 3312 * on which it was found. 
3313 * 3314 * buf_mtxp is held upon entry 3315 * returns with buf_mtxp locked if new buf available 3316 * returns with buf_mtxp UNlocked if new buf NOT available 3317 */ 3318 3319static buf_t 3320getnewbuf(int slpflag, int slptimeo, int * queue) 3321{ 3322 buf_t bp; 3323 buf_t lru_bp; 3324 buf_t age_bp; 3325 buf_t meta_bp; 3326 int age_time, lru_time, bp_time, meta_time; 3327 int req = *queue; /* save it for restarts */ 3328 struct timespec ts; 3329 3330start: 3331 /* 3332 * invalid request gets empty queue 3333 */ 3334 if ((*queue >= BQUEUES) || (*queue < 0) 3335 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED)) 3336 *queue = BQ_EMPTY; 3337 3338 3339 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first)) 3340 goto found; 3341 3342 /* 3343 * need to grow number of bufs, add another one rather than recycling 3344 */ 3345 if (nbuf_headers < max_nbuf_headers) { 3346 /* 3347 * Increment count now as lock 3348 * is dropped for allocation. 3349 * That avoids over commits 3350 */ 3351 nbuf_headers++; 3352 goto add_newbufs; 3353 } 3354 /* Try for the requested queue first */ 3355 bp = bufqueues[*queue].tqh_first; 3356 if (bp) 3357 goto found; 3358 3359 /* Unable to use requested queue */ 3360 age_bp = bufqueues[BQ_AGE].tqh_first; 3361 lru_bp = bufqueues[BQ_LRU].tqh_first; 3362 meta_bp = bufqueues[BQ_META].tqh_first; 3363 3364 if (!age_bp && !lru_bp && !meta_bp) { 3365 /* 3366 * Unavailble on AGE or LRU or META queues 3367 * Try the empty list first 3368 */ 3369 bp = bufqueues[BQ_EMPTY].tqh_first; 3370 if (bp) { 3371 *queue = BQ_EMPTY; 3372 goto found; 3373 } 3374 /* 3375 * We have seen is this is hard to trigger. 3376 * This is an overcommit of nbufs but needed 3377 * in some scenarios with diskiamges 3378 */ 3379 3380add_newbufs: 3381 lck_mtx_unlock(buf_mtxp); 3382 3383 /* Create a new temporary buffer header */ 3384 bp = (struct buf *)zalloc(buf_hdr_zone); 3385 3386 if (bp) { 3387 bufhdrinit(bp); 3388 bp->b_whichq = BQ_EMPTY; 3389 bp->b_timestamp = buf_timestamp(); 3390 BLISTNONE(bp); 3391 SET(bp->b_flags, B_HDRALLOC); 3392 *queue = BQ_EMPTY; 3393 } 3394 lck_mtx_lock_spin(buf_mtxp); 3395 3396 if (bp) { 3397 binshash(bp, &invalhash); 3398 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); 3399 buf_hdr_count++; 3400 goto found; 3401 } 3402 /* subtract already accounted bufcount */ 3403 nbuf_headers--; 3404 3405 bufstats.bufs_sleeps++; 3406 3407 /* wait for a free buffer of any kind */ 3408 needbuffer = 1; 3409 /* hz value is 100 */ 3410 ts.tv_sec = (slptimeo/1000); 3411 /* the hz value is 100; which leads to 10ms */ 3412 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10; 3413 3414 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts); 3415 return (NULL); 3416 } 3417 3418 /* Buffer available either on AGE or LRU or META */ 3419 bp = NULL; 3420 *queue = -1; 3421 3422 /* Buffer available either on AGE or LRU */ 3423 if (!age_bp) { 3424 bp = lru_bp; 3425 *queue = BQ_LRU; 3426 } else if (!lru_bp) { 3427 bp = age_bp; 3428 *queue = BQ_AGE; 3429 } else { /* buffer available on both AGE and LRU */ 3430 int t = buf_timestamp(); 3431 3432 age_time = t - age_bp->b_timestamp; 3433 lru_time = t - lru_bp->b_timestamp; 3434 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */ 3435 bp = age_bp; 3436 *queue = BQ_AGE; 3437 /* 3438 * we should probably re-timestamp eveything in the 3439 * queues at this point with the current time 3440 */ 3441 } else { 3442 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) { 3443 bp = lru_bp; 3444 *queue = BQ_LRU; 3445 } 
else { 3446 bp = age_bp; 3447 *queue = BQ_AGE; 3448 } 3449 } 3450 } 3451 3452 if (!bp) { /* Neither on AGE nor on LRU */ 3453 bp = meta_bp; 3454 *queue = BQ_META; 3455 } else if (meta_bp) { 3456 int t = buf_timestamp(); 3457 3458 bp_time = t - bp->b_timestamp; 3459 meta_time = t - meta_bp->b_timestamp; 3460 3461 if (!(bp_time < 0) && !(meta_time < 0)) { 3462 /* time not set backwards */ 3463 int bp_is_stale; 3464 bp_is_stale = (*queue == BQ_LRU) ? 3465 lru_is_stale : age_is_stale; 3466 3467 if ((meta_time >= meta_is_stale) && 3468 (bp_time < bp_is_stale)) { 3469 bp = meta_bp; 3470 *queue = BQ_META; 3471 } 3472 } 3473 } 3474found: 3475 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY)) 3476 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags); 3477 3478 /* Clean it */ 3479 if (bcleanbuf(bp, FALSE)) { 3480 /* 3481 * moved to the laundry thread, buffer not ready 3482 */ 3483 *queue = req; 3484 goto start; 3485 } 3486 return (bp); 3487} 3488 3489 3490/* 3491 * Clean a buffer. 3492 * Returns 0 if buffer is ready to use, 3493 * Returns 1 if issued a buf_bawrite() to indicate 3494 * that the buffer is not ready. 3495 * 3496 * buf_mtxp is held upon entry 3497 * returns with buf_mtxp locked 3498 */ 3499int 3500bcleanbuf(buf_t bp, boolean_t discard) 3501{ 3502 /* Remove from the queue */ 3503 bremfree_locked(bp); 3504 3505#ifdef JOE_DEBUG 3506 bp->b_owner = current_thread(); 3507 bp->b_tag = 2; 3508#endif 3509 /* 3510 * If buffer was a delayed write, start the IO by queuing 3511 * it on the LAUNDRY queue, and return 1 3512 */ 3513 if (ISSET(bp->b_flags, B_DELWRI)) { 3514 if (discard) { 3515 SET(bp->b_lflags, BL_WANTDEALLOC); 3516 } 3517 3518 bmovelaundry(bp); 3519 3520 lck_mtx_unlock(buf_mtxp); 3521 3522 wakeup(&bufqueues[BQ_LAUNDRY]); 3523 /* 3524 * and give it a chance to run 3525 */ 3526 (void)thread_block(THREAD_CONTINUE_NULL); 3527 3528 lck_mtx_lock_spin(buf_mtxp); 3529 3530 return (1); 3531 } 3532#ifdef JOE_DEBUG 3533 bp->b_owner = current_thread(); 3534 bp->b_tag = 8; 3535#endif 3536 /* 3537 * Buffer is no longer on any free list... we own it 3538 */ 3539 SET(bp->b_lflags, BL_BUSY); 3540 buf_busycount++; 3541 3542 bremhash(bp); 3543 3544 /* 3545 * disassociate us from our vnode, if we had one... 3546 */ 3547 if (bp->b_vp) 3548 brelvp_locked(bp); 3549 3550 lck_mtx_unlock(buf_mtxp); 3551 3552 BLISTNONE(bp); 3553 3554 if (ISSET(bp->b_flags, B_META)) 3555 buf_free_meta_store(bp); 3556 3557 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 3558 3559 buf_release_credentials(bp); 3560 3561 /* If discarding, just move to the empty queue */ 3562 if (discard) { 3563 lck_mtx_lock_spin(buf_mtxp); 3564 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); 3565 bp->b_whichq = BQ_EMPTY; 3566 binshash(bp, &invalhash); 3567 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); 3568 CLR(bp->b_lflags, BL_BUSY); 3569 buf_busycount--; 3570 } else { 3571 /* Not discarding: clean up and prepare for reuse */ 3572 bp->b_bufsize = 0; 3573 bp->b_datap = (uintptr_t)NULL; 3574 bp->b_upl = (void *)NULL; 3575 /* 3576 * preserve the state of whether this buffer 3577 * was allocated on the fly or not... 3578 * the only other flag that should be set at 3579 * this point is BL_BUSY... 
3580		 */
3581 #ifdef JOE_DEBUG
3582		bp->b_owner = current_thread();
3583		bp->b_tag   = 3;
3584 #endif
3585		bp->b_lflags = BL_BUSY;
3586		bp->b_flags = (bp->b_flags & B_HDRALLOC);
3587		bp->b_dev = NODEV;
3588		bp->b_blkno = bp->b_lblkno = 0;
3589		bp->b_iodone = NULL;
3590		bp->b_error = 0;
3591		bp->b_resid = 0;
3592		bp->b_bcount = 0;
3593		bp->b_dirtyoff = bp->b_dirtyend = 0;
3594		bp->b_validoff = bp->b_validend = 0;
3595		bzero(&bp->b_attr, sizeof(struct bufattr));
3596
3597		lck_mtx_lock_spin(buf_mtxp);
3598	}
3599	return (0);
3600 }
3601
3602
3603
3604 errno_t
3605 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3606 {
3607	buf_t	bp;
3608	errno_t	error;
3609	struct bufhashhdr *dp;
3610
3611	dp = BUFHASH(vp, lblkno);
3612
3613 relook:
3614	lck_mtx_lock_spin(buf_mtxp);
3615
3616	if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3617		lck_mtx_unlock(buf_mtxp);
3618		return (0);
3619	}
3620	if (ISSET(bp->b_lflags, BL_BUSY)) {
3621		if ( !ISSET(flags, BUF_WAIT)) {
3622			lck_mtx_unlock(buf_mtxp);
3623			return (EBUSY);
3624		}
3625		SET(bp->b_lflags, BL_WANTED);
3626
3627		error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3628
3629		if (error) {
3630			return (error);
3631		}
3632		goto relook;
3633	}
3634	bremfree_locked(bp);
3635	SET(bp->b_lflags, BL_BUSY);
3636	SET(bp->b_flags, B_INVAL);
3637	buf_busycount++;
3638 #ifdef JOE_DEBUG
3639	bp->b_owner = current_thread();
3640	bp->b_tag   = 4;
3641 #endif
3642	lck_mtx_unlock(buf_mtxp);
3643	buf_brelse(bp);
3644
3645	return (0);
3646 }
3647
3648
3649 void
3650 buf_drop(buf_t bp)
3651 {
3652	int need_wakeup = 0;
3653
3654	lck_mtx_lock_spin(buf_mtxp);
3655
3656	if (ISSET(bp->b_lflags, BL_WANTED)) {
3657		/*
3658		 * delay the actual wakeup until after we
3659		 * clear BL_BUSY and we've dropped buf_mtxp
3660		 */
3661		need_wakeup = 1;
3662	}
3663 #ifdef JOE_DEBUG
3664	bp->b_owner = current_thread();
3665	bp->b_tag   = 9;
3666 #endif
3667	/*
3668	 * Unlock the buffer.
3669	 */
3670	CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3671	buf_busycount--;
3672
3673	lck_mtx_unlock(buf_mtxp);
3674
3675	if (need_wakeup) {
3676		/*
3677		 * Wake up any processes waiting for _this_ buffer to become free.
3678 */ 3679 wakeup(bp); 3680 } 3681} 3682 3683 3684errno_t 3685buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) { 3686 errno_t error; 3687 3688 lck_mtx_lock_spin(buf_mtxp); 3689 3690 error = buf_acquire_locked(bp, flags, slpflag, slptimeo); 3691 3692 lck_mtx_unlock(buf_mtxp); 3693 3694 return (error); 3695} 3696 3697 3698static errno_t 3699buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo) 3700{ 3701 errno_t error; 3702 struct timespec ts; 3703 3704 if (ISSET(bp->b_flags, B_LOCKED)) { 3705 if ((flags & BAC_SKIP_LOCKED)) 3706 return (EDEADLK); 3707 } else { 3708 if ((flags & BAC_SKIP_NONLOCKED)) 3709 return (EDEADLK); 3710 } 3711 if (ISSET(bp->b_lflags, BL_BUSY)) { 3712 /* 3713 * since the lck_mtx_lock may block, the buffer 3714 * may become BUSY, so we need to 3715 * recheck for a NOWAIT request 3716 */ 3717 if (flags & BAC_NOWAIT) 3718 return (EBUSY); 3719 SET(bp->b_lflags, BL_WANTED); 3720 3721 /* the hz value is 100; which leads to 10ms */ 3722 ts.tv_sec = (slptimeo/100); 3723 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; 3724 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts); 3725 3726 if (error) 3727 return (error); 3728 return (EAGAIN); 3729 } 3730 if (flags & BAC_REMOVE) 3731 bremfree_locked(bp); 3732 SET(bp->b_lflags, BL_BUSY); 3733 buf_busycount++; 3734 3735#ifdef JOE_DEBUG 3736 bp->b_owner = current_thread(); 3737 bp->b_tag = 5; 3738#endif 3739 return (0); 3740} 3741 3742 3743/* 3744 * Wait for operations on the buffer to complete. 3745 * When they do, extract and return the I/O's error value. 3746 */ 3747errno_t 3748buf_biowait(buf_t bp) 3749{ 3750 while (!ISSET(bp->b_flags, B_DONE)) { 3751 3752 lck_mtx_lock_spin(buf_mtxp); 3753 3754 if (!ISSET(bp->b_flags, B_DONE)) { 3755 DTRACE_IO1(wait__start, buf_t, bp); 3756 (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL); 3757 DTRACE_IO1(wait__done, buf_t, bp); 3758 } else 3759 lck_mtx_unlock(buf_mtxp); 3760 } 3761 /* check for interruption of I/O (e.g. via NFS), then errors. */ 3762 if (ISSET(bp->b_flags, B_EINTR)) { 3763 CLR(bp->b_flags, B_EINTR); 3764 return (EINTR); 3765 } else if (ISSET(bp->b_flags, B_ERROR)) 3766 return (bp->b_error ? bp->b_error : EIO); 3767 else 3768 return (0); 3769} 3770 3771 3772/* 3773 * Mark I/O complete on a buffer. 3774 * 3775 * If a callback has been requested, e.g. the pageout 3776 * daemon, do so. Otherwise, awaken waiting processes. 3777 * 3778 * [ Leffler, et al., says on p.247: 3779 * "This routine wakes up the blocked process, frees the buffer 3780 * for an asynchronous write, or, for a request by the pagedaemon 3781 * process, invokes a procedure specified in the buffer structure" ] 3782 * 3783 * In real life, the pagedaemon (or other system processes) wants 3784 * to do async stuff to, and doesn't want the buffer buf_brelse()'d. 3785 * (for swap pager, that puts swap buffers on the free lists (!!!), 3786 * for the vn device, that puts malloc'd buffers on the free lists!) 
3787 */ 3788extern struct timeval priority_IO_timestamp_for_root; 3789extern int hard_throttle_on_root; 3790 3791void 3792buf_biodone(buf_t bp) 3793{ 3794 mount_t mp; 3795 3796 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START, 3797 bp, bp->b_datap, bp->b_flags, 0, 0); 3798 3799 if (ISSET(bp->b_flags, B_DONE)) 3800 panic("biodone already"); 3801 3802 if (ISSET(bp->b_flags, B_ERROR)) { 3803 fslog_io_error(bp); 3804 } 3805 3806 if (bp->b_vp && bp->b_vp->v_mount) { 3807 mp = bp->b_vp->v_mount; 3808 } else { 3809 mp = NULL; 3810 } 3811 3812 if (mp && (bp->b_flags & B_READ) == 0) { 3813 update_last_io_time(mp); 3814 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size); 3815 } else if (mp) { 3816 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size); 3817 } 3818 3819 if (kdebug_enable) { 3820 int code = DKIO_DONE; 3821 3822 if (bp->b_flags & B_READ) 3823 code |= DKIO_READ; 3824 if (bp->b_flags & B_ASYNC) 3825 code |= DKIO_ASYNC; 3826 3827 if (bp->b_flags & B_META) 3828 code |= DKIO_META; 3829 else if (bp->b_flags & B_PAGEIO) 3830 code |= DKIO_PAGING; 3831 3832 if (bp->b_flags & B_THROTTLED_IO) 3833 code |= DKIO_THROTTLE; 3834 else if (bp->b_flags & B_PASSIVE) 3835 code |= DKIO_PASSIVE; 3836 3837 if (bp->b_attr.ba_flags & BA_NOCACHE) 3838 code |= DKIO_NOCACHE; 3839 3840 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, 3841 bp, (uintptr_t)bp->b_vp, 3842 bp->b_resid, bp->b_error, 0); 3843 } 3844 if ((bp->b_vp != NULLVP) && 3845 ((bp->b_flags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ | B_THROTTLED_IO | B_PASSIVE)) == (B_PAGEIO | B_READ)) && 3846 (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { 3847 microuptime(&priority_IO_timestamp_for_root); 3848 hard_throttle_on_root = 0; 3849 } 3850 3851 /* 3852 * I/O was done, so don't believe 3853 * the DIRTY state from VM anymore... 3854 * and we need to reset the THROTTLED/PASSIVE 3855 * indicators 3856 */ 3857 CLR(bp->b_flags, (B_WASDIRTY | B_THROTTLED_IO | B_PASSIVE)); 3858 CLR(bp->b_attr.ba_flags, (BA_META | BA_NOCACHE)); 3859#if !CONFIG_EMBEDDED 3860 CLR(bp->b_attr.ba_flags, (BA_THROTTLED_IO | BA_DELAYIDLESLEEP)); 3861#else 3862 CLR(bp->b_attr.ba_flags, BA_THROTTLED_IO); 3863#endif /* !CONFIG_EMBEDDED */ 3864 DTRACE_IO1(done, buf_t, bp); 3865 3866 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) 3867 /* 3868 * wake up any writer's blocked 3869 * on throttle or waiting for I/O 3870 * to drain 3871 */ 3872 vnode_writedone(bp->b_vp); 3873 3874 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */ 3875 void (*iodone_func)(struct buf *, void *) = bp->b_iodone; 3876 void *arg = bp->b_transaction; 3877 int callout = ISSET(bp->b_flags, B_CALL); 3878 3879 if (iodone_func == NULL) 3880 panic("biodone: bp @ %p has NULL b_iodone!\n", bp); 3881 3882 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */ 3883 bp->b_iodone = NULL; 3884 bp->b_transaction = NULL; 3885 3886 if (callout) 3887 SET(bp->b_flags, B_DONE); /* note that it's done */ 3888 3889 (*iodone_func)(bp, arg); 3890 3891 if (callout) { 3892 /* 3893 * assumes that the callback function takes 3894 * ownership of the bp and deals with releasing it if necessary 3895 */ 3896 goto biodone_done; 3897 } 3898 /* 3899 * in this case the call back function is acting 3900 * strictly as a filter... it does not take 3901 * ownership of the bp and is expecting us 3902 * to finish cleaning up... 
this is currently used
3903	 * by the HFS journaling code
3904	 */
3905	}
3906	if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release it */
3907		SET(bp->b_flags, B_DONE);	/* note that it's done */
3908
3909		buf_brelse(bp);
3910	} else {				/* or just wakeup the buffer */
3911		/*
3912		 * by taking the mutex, we serialize
3913		 * the buf owner calling buf_biowait so that we'll
3914		 * only see him in one of 2 states...
3915		 * state 1: B_DONE wasn't set and he's
3916		 * blocked in msleep
3917		 * state 2: he's blocked trying to take the
3918		 * mutex before looking at B_DONE
3919		 * BL_WANTED is cleared in case anyone else
3920		 * is blocked waiting for the buffer... note
3921		 * that we haven't cleared BL_BUSY yet, so if
3922		 * they do get to run, they're going to re-set
3923		 * BL_WANTED and go back to sleep
3924		 */
3925		lck_mtx_lock_spin(buf_mtxp);
3926
3927		CLR(bp->b_lflags, BL_WANTED);
3928		SET(bp->b_flags, B_DONE);		/* note that it's done */
3929
3930		lck_mtx_unlock(buf_mtxp);
3931
3932		wakeup(bp);
3933	}
3934 biodone_done:
3935	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3936		     (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
3937 }
3938
3939 /*
3940  * Return a count of buffers on the "locked" queue.
3941  */
3942 int
3943 count_lock_queue(void)
3944 {
3945	buf_t	bp;
3946	int	n = 0;
3947
3948	lck_mtx_lock_spin(buf_mtxp);
3949
3950	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3951	     bp = bp->b_freelist.tqe_next)
3952		n++;
3953	lck_mtx_unlock(buf_mtxp);
3954
3955	return (n);
3956 }
3957
3958 /*
3959  * Return a count of 'busy' buffers. Used at the time of shutdown.
3960  * note: This is also called from the mach side in debug context in kdp.c
3961  */
3962 int
3963 count_busy_buffers(void)
3964 {
3965	return buf_busycount + bufstats.bufs_iobufinuse;
3966 }
3967
3968 #if DIAGNOSTIC
3969 /*
3970  * Print out statistics on the current allocation of the buffer pool.
3971  * Can be enabled to print out on every ``sync'' by setting "syncprt"
3972  * in vfs_syscalls.c using sysctl.
3973 */ 3974void 3975vfs_bufstats() 3976{ 3977 int i, j, count; 3978 struct buf *bp; 3979 struct bqueues *dp; 3980 int counts[MAXBSIZE/CLBYTES+1]; 3981 static char *bname[BQUEUES] = 3982 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; 3983 3984 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { 3985 count = 0; 3986 for (j = 0; j <= MAXBSIZE/CLBYTES; j++) 3987 counts[j] = 0; 3988 3989 lck_mtx_lock(buf_mtxp); 3990 3991 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { 3992 counts[bp->b_bufsize/CLBYTES]++; 3993 count++; 3994 } 3995 lck_mtx_unlock(buf_mtxp); 3996 3997 printf("%s: total-%d", bname[i], count); 3998 for (j = 0; j <= MAXBSIZE/CLBYTES; j++) 3999 if (counts[j] != 0) 4000 printf(", %d-%d", j * CLBYTES, counts[j]); 4001 printf("\n"); 4002 } 4003} 4004#endif /* DIAGNOSTIC */ 4005 4006#define NRESERVEDIOBUFS 128 4007 4008 4009buf_t 4010alloc_io_buf(vnode_t vp, int priv) 4011{ 4012 buf_t bp; 4013 4014 lck_mtx_lock_spin(iobuffer_mtxp); 4015 4016 while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) || 4017 (bp = iobufqueue.tqh_first) == NULL) { 4018 bufstats.bufs_iobufsleeps++; 4019 4020 need_iobuffer = 1; 4021 (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL); 4022 } 4023 TAILQ_REMOVE(&iobufqueue, bp, b_freelist); 4024 4025 bufstats.bufs_iobufinuse++; 4026 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) 4027 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse; 4028 4029 lck_mtx_unlock(iobuffer_mtxp); 4030 4031 /* 4032 * initialize various fields 4033 * we don't need to hold the mutex since the buffer 4034 * is now private... the vp should have a reference 4035 * on it and is not protected by this mutex in any event 4036 */ 4037 bp->b_timestamp = 0; 4038 bp->b_proc = NULL; 4039 4040 bp->b_datap = 0; 4041 bp->b_flags = 0; 4042 bp->b_lflags = BL_BUSY | BL_IOBUF; 4043 bp->b_redundancy_flags = 0; 4044 bp->b_blkno = bp->b_lblkno = 0; 4045#ifdef JOE_DEBUG 4046 bp->b_owner = current_thread(); 4047 bp->b_tag = 6; 4048#endif 4049 bp->b_iodone = NULL; 4050 bp->b_error = 0; 4051 bp->b_resid = 0; 4052 bp->b_bcount = 0; 4053 bp->b_bufsize = 0; 4054 bp->b_upl = NULL; 4055 bp->b_vp = vp; 4056 bzero(&bp->b_attr, sizeof(struct bufattr)); 4057 4058 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) 4059 bp->b_dev = vp->v_rdev; 4060 else 4061 bp->b_dev = NODEV; 4062 4063 return (bp); 4064} 4065 4066 4067void 4068free_io_buf(buf_t bp) 4069{ 4070 int need_wakeup = 0; 4071 4072 /* 4073 * put buffer back on the head of the iobufqueue 4074 */ 4075 bp->b_vp = NULL; 4076 bp->b_flags = B_INVAL; 4077 4078 lck_mtx_lock_spin(iobuffer_mtxp); 4079 4080 binsheadfree(bp, &iobufqueue, -1); 4081 4082 if (need_iobuffer) { 4083 /* 4084 * Wake up any processes waiting because they need an io buffer 4085 * 4086 * do the wakeup after we drop the mutex... it's possible that the 4087 * wakeup will be superfluous if need_iobuffer gets set again and 4088 * another thread runs this path, but it's highly unlikely, doesn't 4089 * hurt, and it means we don't hold up I/O progress if the wakeup blocks 4090 * trying to grab a task related lock... 
4091 */ 4092 need_iobuffer = 0; 4093 need_wakeup = 1; 4094 } 4095 if (bufstats.bufs_iobufinuse <= 0) 4096 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp); 4097 4098 bufstats.bufs_iobufinuse--; 4099 4100 lck_mtx_unlock(iobuffer_mtxp); 4101 4102 if (need_wakeup) 4103 wakeup(&need_iobuffer); 4104} 4105 4106 4107void 4108buf_list_lock(void) 4109{ 4110 lck_mtx_lock_spin(buf_mtxp); 4111} 4112 4113void 4114buf_list_unlock(void) 4115{ 4116 lck_mtx_unlock(buf_mtxp); 4117} 4118 4119/* 4120 * If getnewbuf() calls bcleanbuf() on the same thread 4121 * there is a potential for stack overrun and deadlocks. 4122 * So we always handoff the work to a worker thread for completion 4123 */ 4124 4125 4126static void 4127bcleanbuf_thread_init(void) 4128{ 4129 thread_t thread = THREAD_NULL; 4130 4131 /* create worker thread */ 4132 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread); 4133 thread_deallocate(thread); 4134} 4135 4136typedef int (*bcleanbufcontinuation)(int); 4137 4138static void 4139bcleanbuf_thread(void) 4140{ 4141 struct buf *bp; 4142 int error = 0; 4143 int loopcnt = 0; 4144 4145 for (;;) { 4146 lck_mtx_lock_spin(buf_mtxp); 4147 4148 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) { 4149 (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread); 4150 } 4151 4152 /* 4153 * Remove from the queue 4154 */ 4155 bremfree_locked(bp); 4156 4157 /* 4158 * Buffer is no longer on any free list 4159 */ 4160 SET(bp->b_lflags, BL_BUSY); 4161 buf_busycount++; 4162 4163#ifdef JOE_DEBUG 4164 bp->b_owner = current_thread(); 4165 bp->b_tag = 10; 4166#endif 4167 4168 lck_mtx_unlock(buf_mtxp); 4169 /* 4170 * do the IO 4171 */ 4172 error = bawrite_internal(bp, 0); 4173 4174 if (error) { 4175 bp->b_whichq = BQ_LAUNDRY; 4176 bp->b_timestamp = buf_timestamp(); 4177 4178 lck_mtx_lock_spin(buf_mtxp); 4179 4180 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); 4181 blaundrycnt++; 4182 4183 /* we never leave a busy page on the laundry queue */ 4184 CLR(bp->b_lflags, BL_BUSY); 4185 buf_busycount--; 4186#ifdef JOE_DEBUG 4187 bp->b_owner = current_thread(); 4188 bp->b_tag = 11; 4189#endif 4190 4191 lck_mtx_unlock(buf_mtxp); 4192 4193 if (loopcnt > MAXLAUNDRY) { 4194 /* 4195 * bawrite_internal() can return errors if we're throttled. If we've 4196 * done several I/Os and failed, give the system some time to unthrottle 4197 * the vnode 4198 */ 4199 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1); 4200 loopcnt = 0; 4201 } else { 4202 /* give other threads a chance to run */ 4203 (void)thread_block(THREAD_CONTINUE_NULL); 4204 loopcnt++; 4205 } 4206 } 4207 } 4208} 4209 4210 4211static int 4212brecover_data(buf_t bp) 4213{ 4214 int upl_offset; 4215 upl_t upl; 4216 upl_page_info_t *pl; 4217 kern_return_t kret; 4218 vnode_t vp = bp->b_vp; 4219 int upl_flags; 4220 4221 4222 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0) 4223 goto dump_buffer; 4224 4225 upl_flags = UPL_PRECIOUS; 4226 if (! (buf_flags(bp) & B_READ)) { 4227 /* 4228 * "write" operation: let the UPL subsystem know 4229 * that we intend to modify the buffer cache pages we're 4230 * gathering. 
4231 */ 4232 upl_flags |= UPL_WILL_MODIFY; 4233 } 4234 4235 kret = ubc_create_upl(vp, 4236 ubc_blktooff(vp, bp->b_lblkno), 4237 bp->b_bufsize, 4238 &upl, 4239 &pl, 4240 upl_flags); 4241 if (kret != KERN_SUCCESS) 4242 panic("Failed to create UPL"); 4243 4244 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) { 4245 4246 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) { 4247 ubc_upl_abort(upl, 0); 4248 goto dump_buffer; 4249 } 4250 } 4251 bp->b_upl = upl; 4252 4253 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap)); 4254 4255 if (kret != KERN_SUCCESS) 4256 panic("getblk: ubc_upl_map() failed with (%d)", kret); 4257 return (1); 4258 4259dump_buffer: 4260 bp->b_bufsize = 0; 4261 SET(bp->b_flags, B_INVAL); 4262 buf_brelse(bp); 4263 4264 return(0); 4265} 4266 4267boolean_t 4268buffer_cache_gc(int all) 4269{ 4270 buf_t bp; 4271 boolean_t did_large_zfree = FALSE; 4272 boolean_t need_wakeup = FALSE; 4273 int now = buf_timestamp(); 4274 uint32_t found = 0; 4275 struct bqueues privq; 4276 int thresh_hold = BUF_STALE_THRESHHOLD; 4277 4278 if (all) 4279 thresh_hold = 0; 4280 /* 4281 * We only care about metadata (incore storage comes from zalloc()). 4282 * Unless "all" is set (used to evict meta data buffers in preparation 4283 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers 4284 * that have not been accessed in the last 30s. This limit controls both 4285 * the hold time of the global lock "buf_mtxp" and the length of time 4286 * we spend compute bound in the GC thread which calls this function 4287 */ 4288 lck_mtx_lock(buf_mtxp); 4289 4290 do { 4291 found = 0; 4292 TAILQ_INIT(&privq); 4293 need_wakeup = FALSE; 4294 4295 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) && 4296 (now > bp->b_timestamp) && 4297 (now - bp->b_timestamp > thresh_hold) && 4298 (found < BUF_MAX_GC_BATCH_SIZE)) { 4299 4300 /* Remove from free list */ 4301 bremfree_locked(bp); 4302 found++; 4303 4304#ifdef JOE_DEBUG 4305 bp->b_owner = current_thread(); 4306 bp->b_tag = 12; 4307#endif 4308 4309 /* If dirty, move to laundry queue and remember to do wakeup */ 4310 if (ISSET(bp->b_flags, B_DELWRI)) { 4311 SET(bp->b_lflags, BL_WANTDEALLOC); 4312 4313 bmovelaundry(bp); 4314 need_wakeup = TRUE; 4315 4316 continue; 4317 } 4318 4319 /* 4320 * Mark busy and put on private list. We could technically get 4321 * away without setting BL_BUSY here. 4322 */ 4323 SET(bp->b_lflags, BL_BUSY); 4324 buf_busycount++; 4325 4326 /* 4327 * Remove from hash and dissociate from vp. 
4328 */ 4329 bremhash(bp); 4330 if (bp->b_vp) { 4331 brelvp_locked(bp); 4332 } 4333 4334 TAILQ_INSERT_TAIL(&privq, bp, b_freelist); 4335 } 4336 4337 if (found == 0) { 4338 break; 4339 } 4340 4341 /* Drop lock for batch processing */ 4342 lck_mtx_unlock(buf_mtxp); 4343 4344 /* Wakeup and yield for laundry if need be */ 4345 if (need_wakeup) { 4346 wakeup(&bufqueues[BQ_LAUNDRY]); 4347 (void)thread_block(THREAD_CONTINUE_NULL); 4348 } 4349 4350 /* Clean up every buffer on private list */ 4351 TAILQ_FOREACH(bp, &privq, b_freelist) { 4352 /* Take note if we've definitely freed at least a page to a zone */ 4353 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) { 4354 did_large_zfree = TRUE; 4355 } 4356 4357 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 4358 4359 /* Free Storage */ 4360 buf_free_meta_store(bp); 4361 4362 /* Release credentials */ 4363 buf_release_credentials(bp); 4364 4365 /* Prepare for moving to empty queue */ 4366 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED 4367 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); 4368 bp->b_whichq = BQ_EMPTY; 4369 BLISTNONE(bp); 4370 } 4371 lck_mtx_lock(buf_mtxp); 4372 4373 /* Back under lock, move them all to invalid hash and clear busy */ 4374 TAILQ_FOREACH(bp, &privq, b_freelist) { 4375 binshash(bp, &invalhash); 4376 CLR(bp->b_lflags, BL_BUSY); 4377 buf_busycount--; 4378 4379#ifdef JOE_DEBUG 4380 if (bp->b_owner != current_thread()) { 4381 panic("Buffer stolen from buffer_cache_gc()"); 4382 } 4383 bp->b_owner = current_thread(); 4384 bp->b_tag = 13; 4385#endif 4386 } 4387 4388 /* And do a big bulk move to the empty queue */ 4389 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist); 4390 4391 } while (all && (found == BUF_MAX_GC_BATCH_SIZE)); 4392 4393 lck_mtx_unlock(buf_mtxp); 4394 4395 return did_large_zfree; 4396} 4397 4398 4399/* 4400 * disabled for now 4401 */ 4402 4403#if FLUSH_QUEUES 4404 4405#define NFLUSH 32 4406 4407static int 4408bp_cmp(void *a, void *b) 4409{ 4410 buf_t *bp_a = *(buf_t **)a, 4411 *bp_b = *(buf_t **)b; 4412 daddr64_t res; 4413 4414 // don't have to worry about negative block 4415 // numbers so this is ok to do. 
4416 // 4417 res = (bp_a->b_blkno - bp_b->b_blkno); 4418 4419 return (int)res; 4420} 4421 4422 4423int 4424bflushq(int whichq, mount_t mp) 4425{ 4426 buf_t bp, next; 4427 int i, buf_count; 4428 int total_writes = 0; 4429 static buf_t flush_table[NFLUSH]; 4430 4431 if (whichq < 0 || whichq >= BQUEUES) { 4432 return (0); 4433 } 4434 4435 restart: 4436 lck_mtx_lock(buf_mtxp); 4437 4438 bp = TAILQ_FIRST(&bufqueues[whichq]); 4439 4440 for (buf_count = 0; bp; bp = next) { 4441 next = bp->b_freelist.tqe_next; 4442 4443 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) { 4444 continue; 4445 } 4446 4447 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) { 4448 4449 bremfree_locked(bp); 4450#ifdef JOE_DEBUG 4451 bp->b_owner = current_thread(); 4452 bp->b_tag = 7; 4453#endif 4454 SET(bp->b_lflags, BL_BUSY); 4455 buf_busycount++; 4456 4457 flush_table[buf_count] = bp; 4458 buf_count++; 4459 total_writes++; 4460 4461 if (buf_count >= NFLUSH) { 4462 lck_mtx_unlock(buf_mtxp); 4463 4464 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); 4465 4466 for (i = 0; i < buf_count; i++) { 4467 buf_bawrite(flush_table[i]); 4468 } 4469 goto restart; 4470 } 4471 } 4472 } 4473 lck_mtx_unlock(buf_mtxp); 4474 4475 if (buf_count > 0) { 4476 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); 4477 4478 for (i = 0; i < buf_count; i++) { 4479 buf_bawrite(flush_table[i]); 4480 } 4481 } 4482 4483 return (total_writes); 4484} 4485#endif 4486 4487 4488#if BALANCE_QUEUES 4489 4490/* XXX move this to a separate file */ 4491 4492/* 4493 * NOTE: THIS CODE HAS NOT BEEN UPDATED 4494 * WITH RESPECT TO THE NEW LOCKING MODEL 4495 */ 4496 4497 4498/* 4499 * Dynamic Scaling of the Buffer Queues 4500 */ 4501 4502typedef long long blsize_t; 4503 4504blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */ 4505/* Global tunable limits */ 4506blsize_t nbufh; /* number of buffer headers */ 4507blsize_t nbuflow; /* minimum number of buffer headers required */ 4508blsize_t nbufhigh; /* maximum number of buffer headers allowed */ 4509blsize_t nbuftarget; /* preferred number of buffer headers */ 4510 4511/* 4512 * assertions: 4513 * 4514 * 1. 0 < nbuflow <= nbufh <= nbufhigh 4515 * 2. nbufhigh <= MAXNBUF 4516 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh 4517 * 4. nbufh can not be set by sysctl(). 4518 */ 4519 4520/* Per queue tunable limits */ 4521 4522struct bufqlim { 4523 blsize_t bl_nlow; /* minimum number of buffer headers required */ 4524 blsize_t bl_num; /* number of buffer headers on the queue */ 4525 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */ 4526 blsize_t bl_target; /* preferred number of buffer headers */ 4527 long bl_stale; /* Seconds after which a buffer is considered stale */ 4528} bufqlim[BQUEUES]; 4529 4530/* 4531 * assertions: 4532 * 4533 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh 4534 * 2. bl_nlhigh <= MAXNBUF 4535 * 3. bufqlim[BQ_META].bl_nlow != 0 4536 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent 4537 * file system IO operations) 4538 * 5. bl_num can not be set by sysctl(). 4539 * 6. bl_nhigh <= nbufhigh 4540 */ 4541 4542/* 4543 * Rationale: 4544 * ---------- 4545 * Defining it blsize_t as long permits 2^31 buffer headers per queue. 4546 * Which can describe (2^31 * PAGE_SIZE) memory per queue. 4547 * 4548 * These limits are exported to by means of sysctl(). 4549 * It was decided to define blsize_t as a 64 bit quantity. 
#if BALANCE_QUEUES

/* XXX move this to a separate file */

/*
 * NOTE: THIS CODE HAS NOT BEEN UPDATED
 * WITH RESPECT TO THE NEW LOCKING MODEL
 */


/*
 * Dynamic Scaling of the Buffer Queues
 */

typedef long long blsize_t;

blsize_t MAXNBUF;	/* initialize to (sane_size / PAGE_SIZE) */
/* Global tunable limits */
blsize_t nbufh;		/* number of buffer headers */
blsize_t nbuflow;	/* minimum number of buffer headers required */
blsize_t nbufhigh;	/* maximum number of buffer headers allowed */
blsize_t nbuftarget;	/* preferred number of buffer headers */

/*
 * assertions:
 *
 * 1. 0 < nbuflow <= nbufh <= nbufhigh
 * 2. nbufhigh <= MAXNBUF
 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
 * 4. nbufh can not be set by sysctl().
 */

/* Per queue tunable limits */

struct bufqlim {
	blsize_t bl_nlow;	/* minimum number of buffer headers required */
	blsize_t bl_num;	/* number of buffer headers on the queue */
	blsize_t bl_nlhigh;	/* maximum number of buffer headers allowed */
	blsize_t bl_target;	/* preferred number of buffer headers */
	long	bl_stale;	/* Seconds after which a buffer is considered stale */
} bufqlim[BQUEUES];

/*
 * assertions:
 *
 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
 * 2. bl_nlhigh <= MAXNBUF
 * 3. bufqlim[BQ_META].bl_nlow != 0
 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
 *    file system IO operations)
 * 5. bl_num can not be set by sysctl().
 * 6. bl_nlhigh <= nbufhigh
 */

/*
 * Rationale:
 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported by means of sysctl().  It was decided to
 * define blsize_t as a 64 bit quantity so that it will not need to
 * change as long as the kernel does not exceed a 64 bit address space.
 *
 * The low and high limits are initialized at compile time; boot
 * arguments can be used to override them, but sysctl() cannot change
 * them.  sysctl() can read all of the values but can set only the
 * target; num is the current level.
 *
 * Advantages of having a "bufqscan" thread do the balancing:
 *   - It keeps enough bufs on BQ_EMPTY.  getnewbuf() by default always
 *     selects a buffer from BQ_EMPTY and performs best when one is
 *     found there.  This also minimizes the possibility of starting IO
 *     from getnewbuf(), which is a performance win, too.
 *   - It localizes the complex logic (balancing as well as time aging)
 *     to balancebufq().
 *   - It simplifies getnewbuf() by eliminating its time aging code.
 */

/*
 * Algorithm:
 * ----------
 * The goal of the dynamic scaling of the buffer queues is to keep the
 * size of the LRU close to bl_target.  Buffers on a queue are time
 * aged.
 *
 * A dedicated thread is responsible for "balancing" the buffer cache
 * queues.
 *
 * The scan order is: AGE, LRU, META, EMPTY.
 */

long bufqscanwait = 0;

static void bufqscan_thread(void);
static int balancebufq(int q);
static int btrimempty(int n);
static __inline__ int initbufqscan(void);
static __inline__ int nextbufq(int q);
static void buqlimprt(int all);


static __inline__ void
bufqinc(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num++;
}

static __inline__ void
bufqdec(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num--;
}
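

/*
 * The assertion comments earlier in this block are documentation only;
 * nothing here enforces them at runtime.  A minimal user-space sketch
 * of how the global limits could be validated and the target clamped
 * follows, with sample values in the spirit of the initialization done
 * by bufq_balance_thread_init() below; check_global_limits() and
 * clamp_target() are hypothetical helpers, not part of this file.
 */
#if 0	/* illustrative sketch only; never compiled */
#include <assert.h>

typedef long long blsize_t;

/* clamp the preferred header count into [lo, hi] */
static blsize_t
clamp_target(blsize_t want, blsize_t lo, blsize_t hi)
{
	if (want < lo)
		want = lo;
	if (want > hi)
		want = hi;
	return (want);
}

/* check the documented invariants on the global tunables */
static void
check_global_limits(blsize_t maxnbuf, blsize_t nbufh,
    blsize_t nbuflow, blsize_t nbufhigh, blsize_t nbuftarget)
{
	assert(0 < nbuflow && nbuflow <= nbufh && nbufh <= nbufhigh);
	assert(nbufhigh <= maxnbuf);
	assert(nbuflow <= nbuftarget && nbuftarget <= nbufhigh);
}

int
main(void)
{
	blsize_t maxnbuf = 262144, nbufh = 1024;
	blsize_t nbuflow = 100, nbufhigh = 2048;
	blsize_t nbuftarget = clamp_target(8192, nbuflow, nbufhigh);

	check_global_limits(maxnbuf, nbufh, nbuflow, nbufhigh, nbuftarget);
	return (0);
}
#endif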
static void
bufq_balance_thread_init(void)
{
	thread_t thread = THREAD_NULL;

	if (bufqscanwait++ == 0) {

		/* Initialize globals */
		MAXNBUF = (sane_size / PAGE_SIZE);
		nbufh = nbuf_headers;
		nbuflow = min(nbufh, 100);
		nbufhigh = min(MAXNBUF, max(nbufh, 2048));
		nbuftarget = (sane_size >> 5) / PAGE_SIZE;
		nbuftarget = max(nbuflow, nbuftarget);
		nbuftarget = min(nbufhigh, nbuftarget);

		/*
		 * Initialize the bufqlim
		 */

		/* LOCKED queue */
		bufqlim[BQ_LOCKED].bl_nlow = 0;
		bufqlim[BQ_LOCKED].bl_nlhigh = 32;
		bufqlim[BQ_LOCKED].bl_target = 0;
		bufqlim[BQ_LOCKED].bl_stale = 30;

		/* LRU queue */
		bufqlim[BQ_LRU].bl_nlow = 0;
		bufqlim[BQ_LRU].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_LRU].bl_target = nbuftarget / 4;
		bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;

		/* AGE queue */
		bufqlim[BQ_AGE].bl_nlow = 0;
		bufqlim[BQ_AGE].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_AGE].bl_target = nbuftarget / 4;
		bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;

		/* EMPTY queue */
		bufqlim[BQ_EMPTY].bl_nlow = 0;
		bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_EMPTY].bl_target = nbuftarget / 4;
		bufqlim[BQ_EMPTY].bl_stale = 600000;

		/* META queue */
		bufqlim[BQ_META].bl_nlow = 0;
		bufqlim[BQ_META].bl_nlhigh = nbufhigh / 4;
		bufqlim[BQ_META].bl_target = nbuftarget / 4;
		bufqlim[BQ_META].bl_stale = META_IS_STALE;

		/* LAUNDRY queue */
		bufqlim[BQ_LAUNDRY].bl_nlow = 0;
		bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
		bufqlim[BQ_LAUNDRY].bl_target = 0;
		bufqlim[BQ_LAUNDRY].bl_stale = 30;

		buqlimprt(1);
	}

	/* create worker thread */
	kernel_thread_start((thread_continue_t)bufqscan_thread, NULL, &thread);
	thread_deallocate(thread);
}

/* The workloop for the buffer balancing thread */
static void
bufqscan_thread(void)
{
	int moretodo;

	for (;;) {
		do {
			int q;	/* buffer queue to process */

			/* reset for this pass so the loop can terminate */
			moretodo = 0;

			q = initbufqscan();
			for (; q; ) {
				moretodo |= balancebufq(q);
				q = nextbufq(q);
			}
		} while (moretodo);

#if DIAGNOSTIC
		vfs_bufstats();
		buqlimprt(0);
#endif
		(void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
	}
}

/* Seed for the buffer queue balancing */
static __inline__ int
initbufqscan(void)
{
	/* Start with AGE queue */
	return (BQ_AGE);
}

/* Pick next buffer queue to balance */
static __inline__ int
nextbufq(int q)
{
	static const int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
	int i;

	/*
	 * Find the current queue in the scan order and return its
	 * successor; 0 terminates the scan.
	 */
	for (i = 0; order[i]; i++) {
		if (order[i] == q)
			return (order[i + 1]);
	}
	return (0);
}

/* function to balance the buffer queues */
static int
balancebufq(int q)
{
	int moretodo = 0;
	int n, t;

	/* reject invalid q */
	if ((q < 0) || (q >= BQUEUES))
		goto out;

	/* LOCKED or LAUNDRY queue MUST not be balanced */
	if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
		goto out;

	n = (bufqlim[q].bl_num - bufqlim[q].bl_target);

	/* If the queue has fewer buffers than the target, there is nothing to do */
	if (n < 0)
		goto out;

	if (n > 8) {
		/* Balance only a small amount (12.5%) at a time */
		n >>= 3;
	}

	/* EMPTY queue needs special handling */
	if (q == BQ_EMPTY) {
		moretodo |= btrimempty(n);
		goto out;
	}

	t = buf_timestamp();

	for (; n > 0; n--) {
		struct buf *bp = bufqueues[q].tqh_first;
		if (!bp)
			break;

		/* check if it's stale */
		if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
			if (bcleanbuf(bp, FALSE)) {
				/* buf_bawrite() issued, bp not ready */
				moretodo = 1;
			} else {
				/* release the cleaned buffer to BQ_EMPTY */
				SET(bp->b_flags, B_INVAL);
				buf_brelse(bp);
			}
		} else
			break;
	}

out:
	return (moretodo);
}

static int
btrimempty(int n)
{
	/*
	 * When struct bufs are allocated dynamically, this would
	 * reclaim up to 'n' struct bufs from the empty queue.
	 */

	return (0);
}

static void
buqlimprt(int all)
{
	int i;
	static const char *bname[BQUEUES] =
	    { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };

	if (all) {
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
			printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
			printf("target = %ld, ", (long)bufqlim[i].bl_target);
			printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
		}
	} else {
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
		}
		printf("\n");
	}
}

#endif /* BALANCE_QUEUES */
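

/*
 * balancebufq() above trims a queue from the front and stops at the
 * first non-stale buffer, relying on buffers being kept in ascending
 * timestamp order.  A user-space sketch of that time-aging walk
 * follows; struct entry, trim_stale(), and the fixed "clock" values
 * are hypothetical, not part of this file.
 */
#if 0	/* illustrative sketch only; never compiled */
#include <stdio.h>
#include <sys/queue.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	long timestamp;		/* seconds at which the entry was queued */
};
TAILQ_HEAD(entryq, entry);

/*
 * Evict up to 'limit' stale entries from the front of an age-ordered
 * queue; stop at the first entry younger than 'stale' seconds, since
 * everything behind it is younger still.
 */
static int
trim_stale(struct entryq *q, long now, long stale, int limit)
{
	struct entry *e;
	int evicted = 0;

	while (limit-- > 0 && (e = TAILQ_FIRST(q)) != NULL) {
		if ((now - e->timestamp) <= stale)
			break;			/* front is fresh; done */
		TAILQ_REMOVE(q, e, link);
		evicted++;
	}
	return (evicted);
}

int
main(void)
{
	struct entryq q = TAILQ_HEAD_INITIALIZER(q);
	struct entry e1 = { .timestamp = 10 }, e2 = { .timestamp = 90 };

	TAILQ_INSERT_TAIL(&q, &e1, link);
	TAILQ_INSERT_TAIL(&q, &e2, link);

	/* with now=100 and stale=30, only e1 (age 90) is evicted */
	printf("evicted %d\n", trim_stale(&q, 100, 30, 8));
	return (0);
}
#endif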