/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kpi_mbuf.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>

kern_return_t thread_terminate(thread_t); /* XXX */

#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
int nfs_nbdwrite;
int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

lck_grp_t *nfs_buf_lck_grp;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP	6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP	3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP	(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC	4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC	2

#define NFS_BUF_FREEUP() \
	do { \
		/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		     (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)
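/*
 * Illustrative sizing note (not from the original source): assuming 4KB
 * pages and a 32KB default buffer size (NFS_RWSIZE), a machine where
 * sane_size is 512MB gets nfsbufmax = 131072 / (2 * 8) = 8192 buffers in
 * nfs_nbinit() below, i.e. 8192 * 32KB = 256MB -- half of sane_size.
 * With LRU_FREEUP_MIN_FRAC == 4, NFS_BUF_FREEUP() then considers a freeup
 * worthwhile only once more than a quarter of all buffers (2048 here)
 * are sitting idle on the nfsbuffree list.
 */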
/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);

	nfsbufcnt = nfsbufmetacnt =
	nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	nfsbufmin = 128;
	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;
	nfsneedbuffer = 0;
	nfs_nbdwrite = 0;

	nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}

/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
		NFSBUF_FREE_PERIOD * 1000);
}

/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp)
			break;
		if (fbp->nb_refs)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp)
			break;
		if (fbp->nb_refs)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
	NFSBUFCNTCHK();

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (IS_VALID_CRED(fbp->nb_rcred))
			kauth_cred_unref(&fbp->nb_rcred);
		if (IS_VALID_CRED(fbp->nb_wcred))
			kauth_cred_unref(&fbp->nb_wcred);
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
			kfree(fbp->nb_data, fbp->nb_bufsize);
		FREE(fbp, M_TEMP);
	}
}
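/*
 * Note on the structure of nfs_buf_freeup() above: buffers are unhashed
 * and collected onto the local nfsbuffreeup list while nfs_buf_mutex is
 * held, but the credential releases and kfree()/FREE() calls happen only
 * after the mutex is dropped, keeping potentially slow teardown work
 * outside the global buffer lock.
 */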
/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST)
		panic("nfsbuf not on free list");
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
	NFSBUFCNTCHK();
}

/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;
	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno))
		rv = TRUE;
	else
		rv = FALSE;
	lck_mtx_unlock(nfs_buf_mutex);
	return (rv);
}

/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next)
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return (bp);
			}
		}
	return (NULL);
}

/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int error = 0;

	if (!nmp)
		return (ENXIO);

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
	if (!bp)
		goto out;
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if ((bp->nb_dirtyend > start) &&
		    (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
			/*
			 * Before returning the bad news, move the
			 * buffer to the start of the delwri list and
			 * give the list a push to try to flush the
			 * buffer out.
			 */
			error = EBUSY;
			nfs_buf_remfree(bp);
			TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_delwri_push(1);
		}
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return (error);
}

/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);

	upl_flags = UPL_PRECIOUS;
	if (!ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
				&upl, NULL, upl_flags);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return (EINVAL);
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return (EIO);
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return (0);
}
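/*
 * The UPL created above is tagged UPL_PRECIOUS so the VM system
 * preserves the page contents for us, plus UPL_WILL_MODIFY for writes
 * to signal that the gathered pages may be dirtied.  Note the two
 * distinct failure modes: KERN_INVALID_ARGUMENT is interpreted as
 * "the vm object is gone" and mapped to EINVAL, while any other
 * failure is logged and mapped to EIO.
 */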
/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return;

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(NFSTOV(bp->nb_np));
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize)
		SET(bp->nb_flags, NB_CACHE);
	else
		CLR(bp->nb_flags, NB_CACHE);

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

	for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize)
			break;
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp,i);
		if (upl_dirty_page(pl, i))
			NBPGDIRTY_SET(bp, i);
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize)
			bp->nb_validend = filesize - fileoffset;
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}

/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data)
		return (0);
	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (EINVAL);

	kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS)
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	if (bp->nb_data == 0)
		panic("ubc_upl_map mapped 0");
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return (0);
}
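/*
 * Worked example for the normalization below (illustrative, assuming
 * 4KB pages and an 8KB buffer): if pages 0 and 1 are valid but
 * nb_validoff was left at 100, the first loop pulls nb_validoff back
 * to 0; a nb_validend of 5000 is pushed forward to 8192; and if EOF
 * falls inside the buffer, nb_validend is then clipped back to the
 * EOF offset within the buffer.
 */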
/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
{
	int pg, npg;
	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff/PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp,pg))
		pg--;
	bp->nb_validoff = (pg+1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize/PAGE_SIZE;
	pg = bp->nb_validend/PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp,pg))
		pg++;
	bp->nb_validend = pg * PAGE_SIZE;
	/* clip to EOF */
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
		bp->nb_validend = np->n_size % bp->nb_bufsize;
}

/*
 * process some entries on the delayed write queue
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_delwri_service(void)
{
	struct nfsbuf *bp;
	nfsnode_t np;
	int error, i = 0;

	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		np = bp->nb_np;
		nfs_buf_remfree(bp);
		nfs_buf_refget(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
		nfs_buf_refrele(bp);
		if (error)
			break;
		if (!bp->nb_np) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
			nfs_buf_check_write_verifier(np, bp);
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
}

/*
 * thread to service the delayed write queue when asked
 */
void
nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
{
	struct timespec ts = { 30, 0 };
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	while (!error) {
		nfs_buf_delwri_service();
		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
	}
	nfsbufdelwrithd = NULL;
	lck_mtx_unlock(nfs_buf_mutex);
	/* terminate ourselves (the handle was cleared above, so don't pass it) */
	thread_terminate(current_thread());
}

/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
void
nfs_buf_delwri_push(int locked)
{
	if (TAILQ_EMPTY(&nfsbufdelwri))
		return;
	if (!locked)
		lck_mtx_lock(nfs_buf_mutex);
	/* wake up the delayed write service thread */
	if (nfsbufdelwrithd)
		wakeup(&nfsbufdelwrithd);
	else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS)
		thread_deallocate(nfsbufdelwrithd);
	/* otherwise, try to do some of the work ourselves */
	if (!nfsbufdelwrithd)
		nfs_buf_delwri_service();
	if (!locked)
		lck_mtx_unlock(nfs_buf_mutex);
}
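/*
 * Design note: nfs_buf_delwri_push() prefers to wake (or spawn) the
 * delayed write service thread rather than write buffers itself, so
 * the pushing thread isn't blocked issuing I/O; only if the service
 * thread could not be started does it fall back to calling
 * nfs_buf_delwri_service() inline.
 */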
/*
 * Get an nfs buffer.
 *
 * Returns errno on error, 0 otherwise.
 * Any buffer is returned in *bpp.
 *
 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 *
 * Check for existence of buffer in cache.
 * Or attempt to reuse a buffer from one of the free lists.
 * Or allocate a new buffer if we haven't already hit max allocation.
 * Or wait for a free buffer.
 *
 * If available buffer found, prepare it, and return it.
 *
 * If the calling process is interrupted by a signal for
 * an interruptible mount point, return EINTR.
 */
int
nfs_buf_get(
	nfsnode_t np,
	daddr64_t blkno,
	uint32_t size,
	thread_t thd,
	int flags,
	struct nfsbuf **bpp)
{
	vnode_t vp = NFSTOV(np);
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	uint32_t bufsize;
	int slpflag = PCATCH;
	int operation = (flags & NBLK_OPMASK);
	int error = 0;
	struct timespec ts;

	FSDBG_TOP(541, np, blkno, size, flags);
	*bpp = NULL;

	bufsize = size;
	if (bufsize > NFS_MAXBSIZE)
		panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");

	if (!nmp) {
		FSDBG_BOT(541, np, blkno, 0, ENXIO);
		return (ENXIO);
	}

	if (!UBCINFOEXISTS(vp)) {
		operation = NBLK_META;
	} else if (bufsize < (uint32_t)nmp->nm_biosize) {
		/* reg files should always have biosize blocks */
		bufsize = nmp->nm_biosize;
	}

	/* if NBLK_WRITE, check for too many delayed/uncommitted writes */
	if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
		FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);

		/* poke the delwri list */
		nfs_buf_delwri_push(0);

		/* sleep to let other threads run... */
		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
		FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
	}

loop:
	lck_mtx_lock(nfs_buf_mutex);

	/* wait for any buffer invalidation/flushing to complete */
	while (np->n_bflag & NBINVALINPROG) {
		np->n_bflag |= NBINVALWANT;
		ts.tv_sec = 2;
		ts.tv_nsec = 0;
		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, np, blkno, 0, error);
			return (error);
		}
		if (np->n_bflag & NBINVALINPROG)
			slpflag = 0;
	}

	/* check for existence of nfsbuf in cache */
	if ((bp = nfs_buf_incore(np, blkno))) {
		/* if busy, set wanted and wait */
		if (ISSET(bp->nb_lflags, NBL_BUSY)) {
			if (flags & NBLK_NOWAIT) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
				return (0);
			}
			FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
			SET(bp->nb_lflags, NBL_WANTED);

			ts.tv_sec = 2;
			ts.tv_nsec = 0;
			msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
				"nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
			slpflag = 0;
			FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
		if (bp->nb_bufsize != bufsize)
			panic("nfsbuf size mismatch");
		SET(bp->nb_lflags, NBL_BUSY);
		SET(bp->nb_flags, NB_CACHE);
		nfs_buf_remfree(bp);
		/* additional paranoia: */
		if (ISSET(bp->nb_flags, NB_PAGELIST))
			panic("pagelist buffer was not busy");
		goto buffer_setup;
	}

	if (flags & NBLK_ONLYVALID) {
		lck_mtx_unlock(nfs_buf_mutex);
		FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
		return (0);
	}

	/*
	 * where to get a free buffer:
	 * - if meta and maxmeta reached, must reuse meta
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */

	if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
		/* if we've hit max meta buffers, must reuse a meta buffer */
		bp = TAILQ_FIRST(&nfsbuffreemeta);
	} else if ((nfsbufcnt > nfsbufmin) &&
	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
		/* try to pull an nfsbuf off a free list */
		struct nfsbuf *lrubp, *metabp;
		struct timeval now;
		microuptime(&now);

		/* if the next LRU or META buffer is invalid or stale, use it */
		lrubp = TAILQ_FIRST(&nfsbuffree);
		if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
		    ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
			bp = lrubp;
		metabp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
		    ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
			bp = metabp;

		if (!bp && (nfsbufcnt >= nfsbufmax)) {
			/* we've already allocated all bufs, so */
			/* choose the buffer that'll go stale first */
			if (!metabp)
				bp = lrubp;
			else if (!lrubp)
				bp = metabp;
			else {
				int32_t lru_stale_time, meta_stale_time;
				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
				if (lru_stale_time <= meta_stale_time)
					bp = lrubp;
				else
					bp = metabp;
			}
		}
	}

	if (bp) {
		/* we have a buffer to reuse */
		FSDBG(544, np, blkno, bp, bp->nb_flags);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_DELWRI))
			panic("nfs_buf_get: delwri");
		SET(bp->nb_lflags, NBL_BUSY);
		/* disassociate buffer from previous nfsnode */
		if (bp->nb_np) {
			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
			}
			bp->nb_np = NULL;
		}
		LIST_REMOVE(bp, nb_hash);
		/* nuke any creds we're holding */
		if (IS_VALID_CRED(bp->nb_rcred))
			kauth_cred_unref(&bp->nb_rcred);
		if (IS_VALID_CRED(bp->nb_wcred))
			kauth_cred_unref(&bp->nb_wcred);
		/* if buf will no longer be NB_META, dump old buffer */
		if (operation == NBLK_META) {
			if (!ISSET(bp->nb_flags, NB_META))
				nfsbufmetacnt++;
		} else if (ISSET(bp->nb_flags, NB_META)) {
			if (bp->nb_data) {
				kfree(bp->nb_data, bp->nb_bufsize);
				bp->nb_data = NULL;
			}
			nfsbufmetacnt--;
		}
		/* re-init buf fields */
		bp->nb_error = 0;
		bp->nb_validoff = bp->nb_validend = -1;
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
		bp->nb_valid = 0;
		bp->nb_dirty = 0;
		bp->nb_verf = 0;
	} else {
		/* no buffer to reuse */
		if ((nfsbufcnt < nfsbufmax) &&
		    ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
			/* just alloc a new one */
			MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
			if (!bp) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0, error);
				return (ENOMEM);
			}
			nfsbufcnt++;

			/*
			 * If any excess bufs, make sure the timer
			 * is running to free them up later.
			 */
			if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
				nfs_buf_timer_on = 1;
				nfs_interval_timer_start(nfs_buf_timer_call,
					NFSBUF_FREE_PERIOD * 1000);
			}

			if (operation == NBLK_META)
				nfsbufmetacnt++;
			NFSBUFCNTCHK();
			/* init nfsbuf */
			bzero(bp, sizeof(*bp));
			bp->nb_free.tqe_next = NFSNOLIST;
			bp->nb_validoff = bp->nb_validend = -1;
			FSDBG(545, np, blkno, bp, 0);
		} else {
			/* too many bufs... wait for buffers to free up */
			FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);

			/* poke the delwri list */
			nfs_buf_delwri_push(1);

			nfsneedbuffer = 1;
			msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL);
			FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
	}

	/* set up nfsbuf */
	SET(bp->nb_lflags, NBL_BUSY);
	bp->nb_flags = 0;
	bp->nb_lblkno = blkno;
	/* insert buf in hash */
	LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
	/* associate buffer with new nfsnode */
	bp->nb_np = np;
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);

buffer_setup:

	/* unlock hash */
	lck_mtx_unlock(nfs_buf_mutex);

	switch (operation) {
	case NBLK_META:
		SET(bp->nb_flags, NB_META);
		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
			kfree(bp->nb_data, bp->nb_bufsize);
			bp->nb_data = NULL;
			bp->nb_validoff = bp->nb_validend = -1;
			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			bp->nb_valid = 0;
			bp->nb_dirty = 0;
			CLR(bp->nb_flags, NB_CACHE);
		}
		if (!bp->nb_data)
			bp->nb_data = kalloc(bufsize);
		if (!bp->nb_data) {
			/* Ack! couldn't allocate the data buffer! */
			/* clean up buffer and return error */
			lck_mtx_lock(nfs_buf_mutex);
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
			bp->nb_np = NULL;
			/* invalidate usage timestamp to allow immediate freeing */
			NBUFSTAMPINVALIDATE(bp);
			if (bp->nb_free.tqe_next != NFSNOLIST)
				panic("nfsbuf on freelist");
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
			return (ENOMEM);
		}
		bp->nb_bufsize = bufsize;
		break;

	case NBLK_READ:
	case NBLK_WRITE:
		/*
		 * Set or clear NB_READ now to let the UPL subsystem know
		 * if we intend to modify the pages or not.
		 */
		if (operation == NBLK_READ) {
			SET(bp->nb_flags, NB_READ);
		} else {
			CLR(bp->nb_flags, NB_READ);
		}
		if (bufsize < PAGE_SIZE)
			bufsize = PAGE_SIZE;
		bp->nb_bufsize = bufsize;
		bp->nb_validoff = bp->nb_validend = -1;

		if (UBCINFOEXISTS(vp)) {
			/* set up upl */
			if (nfs_buf_upl_setup(bp)) {
				/* unable to create upl */
				/* vm object must no longer exist */
				/* clean up buffer and return error */
				lck_mtx_lock(nfs_buf_mutex);
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
				bp->nb_np = NULL;
				/* invalidate usage timestamp to allow immediate freeing */
				NBUFSTAMPINVALIDATE(bp);
				if (bp->nb_free.tqe_next != NFSNOLIST)
					panic("nfsbuf on freelist");
				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
				nfsbuffreecnt++;
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
				return (EIO);
			}
			nfs_buf_upl_check(bp);
		}
		break;

	default:
		panic("nfs_buf_get: %d unknown operation", operation);
	}

	*bpp = bp;

	FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);

	return (0);
}
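/*
 * Typical nfs_buf_get() usage (illustrative sketch, not from the
 * original source): ask for the biosize block covering a file offset
 * and release the buffer when done, e.g.
 *
 *	struct nfsbuf *bp;
 *	error = nfs_buf_get(np, offset / nmp->nm_biosize, nmp->nm_biosize,
 *	    thd, NBLK_READ, &bp);
 *	if (!error && bp)
 *		nfs_buf_release(bp, 1);
 *
 * Note that with NBLK_NOWAIT or NBLK_ONLYVALID a zero return may still
 * leave *bpp NULL, so callers must check both the error and the pointer.
 */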
void
nfs_buf_release(struct nfsbuf *bp, int freeup)
{
	nfsnode_t np = bp->nb_np;
	vnode_t vp;
	struct timeval now;
	int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;

	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);

	vp = np ? NFSTOV(np) : NULL;
	if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
		int upl_flags, rv;
		upl_t upl;
		uint32_t i;

		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
			rv = nfs_buf_upl_setup(bp);
			if (rv)
				printf("nfs_buf_release: upl create failed %d\n", rv);
			else
				nfs_buf_upl_check(bp);
		}
		upl = bp->nb_pagelist;
		if (!upl)
			goto pagelist_cleanup_done;
		if (bp->nb_data) {
			if (ubc_upl_unmap(upl) != KERN_SUCCESS)
				panic("ubc_upl_unmap failed");
			bp->nb_data = NULL;
		}
		/*
		 * Abort the pages on error or: if this is an invalid or
		 * non-needcommit nocache buffer AND no pages are dirty.
		 */
		if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
		    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
			if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE)))
				upl_flags = UPL_ABORT_DUMP_PAGES;
			else
				upl_flags = 0;
			ubc_upl_abort(upl, upl_flags);
			goto pagelist_cleanup_done;
		}
		for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
			if (!NBPGVALID(bp,i))
				ubc_upl_abort_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					UPL_ABORT_DUMP_PAGES |
					UPL_ABORT_FREE_ON_EMPTY);
			else {
				if (NBPGDIRTY(bp,i))
					upl_flags = UPL_COMMIT_SET_DIRTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;

				if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))
					upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;

				ubc_upl_commit_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					upl_flags |
					UPL_COMMIT_INACTIVATE |
					UPL_COMMIT_FREE_ON_EMPTY);
			}
		}
pagelist_cleanup_done:
		/* invalidate any pages past EOF */
		if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
			off_t start, end;
			start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
			end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
			if (start < NBOFF(bp))
				start = NBOFF(bp);
			if (end > start) {
				if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE)))
					printf("nfs_buf_release(): ubc_msync failed! error %d\n", rv);
			}
		}
		CLR(bp->nb_flags, NB_PAGELIST);
		bp->nb_pagelist = NULL;
	}

	lck_mtx_lock(nfs_buf_mutex);

	wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;

	/* Wake up any processes waiting for any buffer to become free. */
	if (nfsneedbuffer) {
		nfsneedbuffer = 0;
		wakeup_needbuffer = 1;
	}
	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		CLR(bp->nb_lflags, NBL_WANTED);
		wakeup_buffer = 1;
	}

	/* If it's non-needcommit nocache, or an error, mark it invalid. */
	if (ISSET(bp->nb_flags, NB_ERROR) ||
	    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
		SET(bp->nb_flags, NB_INVAL);

	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
		/* If it's invalid or empty, dissociate it from its nfsnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_np = NULL;
		/* if this was a delayed write, wakeup anyone */
		/* waiting for delayed writes to complete */
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			CLR(bp->nb_flags, NB_DELWRI);
			nfs_nbdwrite--;
			NFSBUFCNTCHK();
			wakeup_nbdwrite = 1;
		}
		/* invalidate usage timestamp to allow immediate freeing */
		NBUFSTAMPINVALIDATE(bp);
		/* put buffer at head of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		SET(bp->nb_flags, NB_INVAL);
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
		/* put buffer at end of delwri list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
		nfsbufdelwricnt++;
		freeup = 0;
	} else {
		/* update usage timestamp */
		microuptime(&now);
		bp->nb_timestamp = now.tv_sec;
		/* put buffer at end of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	}

	NFSBUFCNTCHK();

	/* Unlock the buffer. */
	CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
	CLR(bp->nb_lflags, NBL_BUSY);

	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);

	lck_mtx_unlock(nfs_buf_mutex);

	if (wakeup_needbuffer)
		wakeup(&nfsneedbuffer);
	if (wakeup_buffer)
		wakeup(bp);
	if (wakeup_nbdwrite)
		wakeup(&nfs_nbdwrite);
	if (freeup)
		NFS_BUF_FREEUP();
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	lck_mtx_lock(nfs_buf_mutex);

	while (!ISSET(bp->nb_flags, NB_DONE))
		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);

	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return (EINTR);
	} else if (ISSET(bp->nb_flags, NB_ERROR))
		return (bp->nb_error ? bp->nb_error : EIO);
	return (0);
}
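/*
 * nfs_buf_iowait() pairs with nfs_buf_iodone() below: synchronous
 * callers sleep on the buffer until NB_DONE is set, while NB_ASYNC
 * buffers never have a waiter and are instead released directly from
 * nfs_buf_iodone().
 */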
/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{
	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE))
		panic("nfs_buf_iodone already");

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		/*
		 * vnode_writedone() takes care of waking up
		 * any throttled write operations
		 */
		vnode_writedone(NFSTOV(bp->nb_np));
		nfs_node_lock_force(bp->nb_np);
		bp->nb_np->n_numoutput--;
		nfs_node_unlock(bp->nb_np);
	}
	if (ISSET(bp->nb_flags, NB_ASYNC)) {	/* if async, release it */
		SET(bp->nb_flags, NB_DONE);		/* note that it's done */
		nfs_buf_release(bp, 1);
	} else {					/* or just wakeup the buffer */
		lck_mtx_lock(nfs_buf_mutex);
		SET(bp->nb_flags, NB_DONE);		/* note that it's done */
		CLR(bp->nb_lflags, NBL_WANTED);
		lck_mtx_unlock(nfs_buf_mutex);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}

void
nfs_buf_write_delayed(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Make sure it's on its node's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		/* move to dirty list */
		lck_mtx_lock(nfs_buf_mutex);
		nfs_nbdwrite++;
		NFSBUFCNTCHK();
		if (bp->nb_vnbufs.le_next != NFSNOLIST)
			LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(nfs_buf_mutex);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");

	/* the file is in a modified state, so make sure the flag's set */
	nfs_node_lock_force(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);

	/*
	 * If we have too many delayed write buffers,
	 * just fall back to doing the async write.
	 */
	if (nfs_nbdwrite < 0)
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
	if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
	return;
}

/*
 * Check that a "needcommit" buffer can still be committed.
 * If the write verifier has changed, we need to clear the
 * needcommit flag.
 */
void
nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
{
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
		return;

	nmp = NFSTONMP(np);
	if (!nmp)
		return;
	if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf))
		return;

	/* write verifier changed, clear commit/wverf flags */
	CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
	bp->nb_verf = 0;
	nfs_node_lock_force(np);
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_node_unlock(np);
}

/*
 * add a reference to a buffer so it doesn't disappear while being used
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refget(struct nfsbuf *bp)
{
	bp->nb_refs++;
}
/*
 * release a reference on a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refrele(struct nfsbuf *bp)
{
	bp->nb_refs--;
}

/*
 * mark a particular buffer as BUSY
 * (must be called with nfs_buf_mutex held)
 */
errno_t
nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
{
	errno_t error;
	struct timespec ts;

	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		/*
		 * since the lck_mtx_lock may block, the buffer
		 * may become BUSY, so we need to recheck for
		 * a NOWAIT request
		 */
		if (flags & NBAC_NOWAIT)
			return (EBUSY);
		SET(bp->nb_lflags, NBL_WANTED);

		ts.tv_sec = (slptimeo/100);
		/* the hz value is 100, so each slptimeo unit is 10ms */
		ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;

		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
			"nfs_buf_acquire", &ts);
		if (error)
			return (error);
		return (EAGAIN);
	}
	if (flags & NBAC_REMOVE)
		nfs_buf_remfree(bp);
	SET(bp->nb_lflags, NBL_BUSY);

	return (0);
}

/*
 * simply drop the BUSY status of a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_drop(struct nfsbuf *bp)
{
	int need_wakeup = 0;

	if (!ISSET(bp->nb_lflags, NBL_BUSY))
		panic("nfs_buf_drop: buffer not busy!");
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		/* delay the actual wakeup until after we clear NBL_BUSY */
		need_wakeup = 1;
	}
	/* Unlock the buffer. */
	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));

	if (need_wakeup)
		wakeup(bp);
}
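/*
 * Illustrative acquire/drop pattern (mirroring nfs_buf_delwri_service()
 * above): take a reference so the buffer can't be freed out from under
 * us, then retry nfs_buf_acquire() while it returns EAGAIN:
 *
 *	nfs_buf_refget(bp);
 *	while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN)
 *		;
 *	nfs_buf_refrele(bp);
 */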
/*
 * prepare for iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
int
nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists *listheadp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
		LIST_INIT(iterheadp);
		return (EWOULDBLOCK);
	}

	while (np->n_bufiterflags & NBI_ITER) {
		np->n_bufiterflags |= NBI_ITERWANT;
		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return (EINVAL);
	}
	np->n_bufiterflags |= NBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return (0);
}

/*
 * clean up after iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists * listheadp;
	struct nfsbuf *bp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
	}

	np->n_bufiterflags &= ~NBI_ITER;
	if (np->n_bufiterflags & NBI_ITERWANT) {
		np->n_bufiterflags &= ~NBI_ITERWANT;
		wakeup(&np->n_bufiterflags);
	}
}

/*
 * Read an NFS buffer for a file.
 */
int
nfs_buf_read(struct nfsbuf *bp)
{
	int error = 0;
	nfsnode_t np;
	thread_t thd;
	kauth_cred_t cred;

	np = bp->nb_np;
	cred = bp->nb_rcred;
	if (IS_VALID_CRED(cred))
		kauth_cred_ref(cred);
	thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();

	/* sanity checks */
	if (!ISSET(bp->nb_flags, NB_READ))
		panic("nfs_buf_read: !NB_READ");
	if (ISSET(bp->nb_flags, NB_DONE))
		CLR(bp->nb_flags, NB_DONE);

	NFS_BUF_MAP(bp);

	OSAddAtomic64(1, &nfsstats.read_bios);

	error = nfs_buf_read_rpc(bp, thd, cred);
	/*
	 * For async I/O, the callbacks will finish up the
	 * read.  Otherwise, the read has already been finished.
	 */

	if (IS_VALID_CRED(cred))
		kauth_cred_unref(&cred);
	return (error);
}

/*
 * finish the reading of a buffer
 */
void
nfs_buf_read_finish(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_ERROR)) {
		/* update valid range */
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_endio;
		if (bp->nb_endio < (int)bp->nb_bufsize) {
			/*
			 * The read may be short because we have unflushed writes
			 * that are extending the file size and the reads hit the
			 * (old) EOF on the server.  So, just make sure nb_validend
			 * correctly tracks EOF.
			 * Note that the missing data should have already been zeroed
			 * in nfs_buf_read_rpc_finish().
			 */
			off_t boff = NBOFF(bp);
			if ((off_t)np->n_size >= (boff + bp->nb_bufsize))
				bp->nb_validend = bp->nb_bufsize;
			else if ((off_t)np->n_size >= boff)
				bp->nb_validend = np->n_size - boff;
			else
				bp->nb_validend = 0;
		}
		if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
		    ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL))
			bp->nb_validend = 0x100000000LL - NBOFF(bp);
		bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
		if (bp->nb_validend & PAGE_MASK) {
			/* zero-fill remainder of last page */
			bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
		}
	}
	nfs_buf_iodone(bp);
}
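/*
 * Example of the nb_valid computation above (illustrative, assuming
 * 4KB pages): a nb_validend of 0x1800 (6KB) rounds up to two pages,
 * so nb_valid becomes (1 << 2) - 1 == 0x3 (pages 0 and 1 valid), and
 * the trailing 2KB of page 1 is zero-filled past the valid data.
 */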
1512 */ 1513 off_t boff = NBOFF(bp); 1514 if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) 1515 bp->nb_validend = bp->nb_bufsize; 1516 else if ((off_t)np->n_size >= boff) 1517 bp->nb_validend = np->n_size - boff; 1518 else 1519 bp->nb_validend = 0; 1520 } 1521 if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) && 1522 ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) 1523 bp->nb_validend = 0x100000000LL - NBOFF(bp); 1524 bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; 1525 if (bp->nb_validend & PAGE_MASK) { 1526 /* zero-fill remainder of last page */ 1527 bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); 1528 } 1529 } 1530 nfs_buf_iodone(bp); 1531} 1532 1533/* 1534 * initiate the NFS READ RPC(s) for a buffer 1535 */ 1536int 1537nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) 1538{ 1539 struct nfsmount *nmp; 1540 nfsnode_t np = bp->nb_np; 1541 int error = 0, nfsvers, async; 1542 int offset, nrpcs; 1543 uint32_t nmrsize, length, len; 1544 off_t boff; 1545 struct nfsreq *req; 1546 struct nfsreq_cbinfo cb; 1547 1548 nmp = NFSTONMP(np); 1549 if (!nmp) { 1550 bp->nb_error = error = ENXIO; 1551 SET(bp->nb_flags, NB_ERROR); 1552 nfs_buf_iodone(bp); 1553 return (error); 1554 } 1555 nfsvers = nmp->nm_vers; 1556 nmrsize = nmp->nm_rsize; 1557 1558 boff = NBOFF(bp); 1559 offset = 0; 1560 length = bp->nb_bufsize; 1561 1562 if (nfsvers == NFS_VER2) { 1563 if (boff > 0xffffffffLL) { 1564 bp->nb_error = error = EFBIG; 1565 SET(bp->nb_flags, NB_ERROR); 1566 nfs_buf_iodone(bp); 1567 return (error); 1568 } 1569 if ((boff + length - 1) > 0xffffffffLL) 1570 length = 0x100000000LL - boff; 1571 } 1572 1573 /* Note: Can only do async I/O if nfsiods are configured. */ 1574 async = (bp->nb_flags & NB_ASYNC); 1575 cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL; 1576 cb.rcb_bp = bp; 1577 1578 bp->nb_offio = bp->nb_endio = 0; 1579 bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize; 1580 if (async && (nrpcs > 1)) { 1581 SET(bp->nb_flags, NB_MULTASYNCRPC); 1582 } else { 1583 CLR(bp->nb_flags, NB_MULTASYNCRPC); 1584 } 1585 1586 while (length > 0) { 1587 if (ISSET(bp->nb_flags, NB_ERROR)) { 1588 error = bp->nb_error; 1589 break; 1590 } 1591 len = (length > nmrsize) ? nmrsize : length; 1592 cb.rcb_args[0] = offset; 1593 cb.rcb_args[1] = len; 1594 if (nmp->nm_vers >= NFS_VER4) 1595 cb.rcb_args[2] = nmp->nm_stategenid; 1596 req = NULL; 1597 error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req); 1598 if (error) 1599 break; 1600 offset += len; 1601 length -= len; 1602 if (async) 1603 continue; 1604 nfs_buf_read_rpc_finish(req); 1605 if (ISSET(bp->nb_flags, NB_ERROR)) { 1606 error = bp->nb_error; 1607 break; 1608 } 1609 } 1610 1611 if (length > 0) { 1612 /* 1613 * Something bad happened while trying to send the RPC(s). 1614 * Wait for any outstanding requests to complete. 
1615 */ 1616 bp->nb_error = error; 1617 SET(bp->nb_flags, NB_ERROR); 1618 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { 1619 nrpcs = (length + nmrsize - 1) / nmrsize; 1620 lck_mtx_lock(nfs_buf_mutex); 1621 bp->nb_rpcs -= nrpcs; 1622 if (bp->nb_rpcs == 0) { 1623 /* No RPCs left, so the buffer's done */ 1624 lck_mtx_unlock(nfs_buf_mutex); 1625 nfs_buf_iodone(bp); 1626 } else { 1627 /* wait for the last RPC to mark it done */ 1628 while (bp->nb_rpcs > 0) 1629 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, 1630 "nfs_buf_read_rpc_cancel", NULL); 1631 lck_mtx_unlock(nfs_buf_mutex); 1632 } 1633 } else { 1634 nfs_buf_iodone(bp); 1635 } 1636 } 1637 1638 return (error); 1639} 1640 1641/* 1642 * finish up an NFS READ RPC on a buffer 1643 */ 1644void 1645nfs_buf_read_rpc_finish(struct nfsreq *req) 1646{ 1647 struct nfsmount *nmp; 1648 size_t rlen; 1649 struct nfsreq_cbinfo cb; 1650 struct nfsbuf *bp; 1651 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished; 1652 void *wakeme = NULL; 1653 struct nfsreq *rreq = NULL; 1654 nfsnode_t np; 1655 thread_t thd; 1656 kauth_cred_t cred; 1657 uio_t auio; 1658 char uio_buf [ UIO_SIZEOF(1) ]; 1659 1660finish: 1661 np = req->r_np; 1662 thd = req->r_thread; 1663 cred = req->r_cred; 1664 if (IS_VALID_CRED(cred)) 1665 kauth_cred_ref(cred); 1666 cb = req->r_callback; 1667 bp = cb.rcb_bp; 1668 if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ 1669 nfs_request_ref(req, 0); 1670 1671 nmp = NFSTONMP(np); 1672 if (!nmp) { 1673 SET(bp->nb_flags, NB_ERROR); 1674 bp->nb_error = error = ENXIO; 1675 } 1676 if (error || ISSET(bp->nb_flags, NB_ERROR)) { 1677 /* just drop it */ 1678 nfs_request_async_cancel(req); 1679 goto out; 1680 } 1681 1682 nfsvers = nmp->nm_vers; 1683 offset = cb.rcb_args[0]; 1684 rlen = length = cb.rcb_args[1]; 1685 1686 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, 1687 UIO_READ, &uio_buf, sizeof(uio_buf)); 1688 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); 1689 1690 /* finish the RPC */ 1691 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); 1692 if ((error == EINPROGRESS) && cb.rcb_func) { 1693 /* async request restarted */ 1694 if (cb.rcb_func) 1695 nfs_request_rele(req); 1696 if (IS_VALID_CRED(cred)) 1697 kauth_cred_unref(&cred); 1698 return; 1699 } 1700 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { 1701 lck_mtx_lock(&nmp->nm_lock); 1702 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { 1703 NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", 1704 error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); 1705 nfs_need_recover(nmp, error); 1706 } 1707 lck_mtx_unlock(&nmp->nm_lock); 1708 if (np->n_flag & NREVOKE) { 1709 error = EIO; 1710 } else { 1711 if (error == NFSERR_GRACE) { 1712 if (cb.rcb_func) { 1713 /* 1714 * For an async I/O request, handle a grace delay just like 1715 * jukebox errors. Set the resend time and queue it up. 
					struct timeval now;
					if (req->r_nmrep.nmc_mhead) {
						mbuf_freem(req->r_nmrep.nmc_mhead);
						req->r_nmrep.nmc_mhead = NULL;
					}
					req->r_error = 0;
					microuptime(&now);
					lck_mtx_lock(&req->r_mtx);
					req->r_resendtime = now.tv_sec + 2;
					req->r_xid = 0;			// get a new XID
					req->r_flags |= R_RESTART;
					req->r_start = 0;
					nfs_asyncio_resend(req);
					lck_mtx_unlock(&req->r_mtx);
					if (IS_VALID_CRED(cred))
						kauth_cred_unref(&cred);
					/* Note: nfsreq reference taken will be dropped later when finished */
					return;
				}
				/* otherwise, just pause a couple seconds and retry */
				tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
			}
			if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
				rlen = 0;
				goto readagain;
			}
		}
	}
	if (error) {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
		goto out;
	}

	if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen)))
		bp->nb_endio = offset + rlen;

	if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
		/* zero out the remaining data (up to EOF) */
		off_t rpcrem, eofrem, rem;
		rpcrem = (length - rlen);
		eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
		rem = (rpcrem < eofrem) ? rpcrem : eofrem;
		if (rem > 0)
			bzero(bp->nb_data + offset + rlen, rem);
	} else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
		/*
		 * short read
		 *
		 * We haven't hit EOF and we didn't get all the data
		 * requested, so we need to issue another read for the rest.
		 * (Don't bother if the buffer already hit an error.)
		 */
readagain:
		offset += rlen;
		length -= rlen;
		cb.rcb_args[0] = offset;
		cb.rcb_args[1] = length;
		if (nmp->nm_vers >= NFS_VER4)
			cb.rcb_args[2] = nmp->nm_stategenid;
		error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
		if (!error) {
			if (IS_VALID_CRED(cred))
				kauth_cred_unref(&cred);
			if (!cb.rcb_func) {
				/* if !async we'll need to wait for this RPC to finish */
				req = rreq;
				rreq = NULL;
				goto finish;
			}
			nfs_request_rele(req);
			/*
			 * We're done here.
			 * Outstanding RPC count is unchanged.
			 * Callback will be called when RPC is done.
			 */
			return;
		}
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
	}

out:
	if (cb.rcb_func)
		nfs_request_rele(req);
	if (IS_VALID_CRED(cred))
		kauth_cred_unref(&cred);

	/*
	 * Decrement outstanding RPC count on buffer
	 * and call nfs_buf_read_finish on last RPC.
	 *
	 * (Note: when there are multiple async RPCs issued for a
	 * buffer we need nfs_buf_mutex to avoid problems when
	 * aborting a partially-initiated set of RPCs)
	 */

	multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
	if (multasyncrpc)
		lck_mtx_lock(nfs_buf_mutex);

	bp->nb_rpcs--;
	finished = (bp->nb_rpcs == 0);

	if (multasyncrpc)
		lck_mtx_unlock(nfs_buf_mutex);

	if (finished) {
		if (multasyncrpc)
			wakeme = &bp->nb_rpcs;
		nfs_buf_read_finish(bp);
		if (wakeme)
			wakeup(wakeme);
	}
}
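/*
 * Note on the nb_rpcs accounting above: a large buffer may be split
 * across several READ RPCs (NB_MULTASYNCRPC).  Each completion
 * decrements nb_rpcs under nfs_buf_mutex, and only the completion that
 * drops the count to zero calls nfs_buf_read_finish(), so the buffer
 * is finished exactly once no matter how the RPCs interleave.
 */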
/*
 * Do buffer readahead.
 * Initiate async I/O to read buffers not in cache.
 */
int
nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
{
	struct nfsmount *nmp = NFSTONMP(np);
	struct nfsbuf *bp;
	int error = 0;
	uint32_t nra;

	if (!nmp)
		return (ENXIO);
	if (nmp->nm_readahead <= 0)
		return (0);
	if (*rabnp > lastrabn)
		return (0);

	for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
		/* check if block exists and is valid. */
		if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
			/* stop reading ahead if we're beyond EOF */
			*rabnp = lastrabn;
			break;
		}
		error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp);
		if (error)
			break;
		nfs_node_lock_force(np);
		np->n_lastrahead = *rabnp;
		nfs_node_unlock(np);
		if (!bp)
			continue;
		if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
		    !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) {
			CLR(bp->nb_flags, NB_CACHE);
			bp->nb_valid = 0;
			bp->nb_validoff = bp->nb_validend = -1;
		}
		if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
		    !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) {
			SET(bp->nb_flags, (NB_READ|NB_ASYNC));
			if (ioflag & IO_NOCACHE)
				SET(bp->nb_flags, NB_NCRDAHEAD);
			if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
				kauth_cred_ref(cred);
				bp->nb_rcred = cred;
			}
			if ((error = nfs_buf_read(bp)))
				break;
			continue;
		}
		nfs_buf_release(bp, 1);
	}
	return (error);
}
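/*
 * Readahead window example (illustrative): with nm_readahead == 4 and
 * a sequential reader at block lbn, nfs_bioread() below computes
 * lastrabn = MIN(maxrabn, lbn + nm_readahead) and this function issues
 * async reads for at most that many blocks, stopping early at EOF or
 * when it finds a block that's already cached.
 */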
1934 */ 1935 1936 if (ISSET(np->n_flag, NUPDATESIZE)) 1937 nfs_data_update_size(np, 0); 1938 1939 if ((error = nfs_node_lock(np))) { 1940 FSDBG_BOT(514, np, 0xd1e0222, 0, error); 1941 return (error); 1942 } 1943 1944 if (np->n_flag & NNEEDINVALIDATE) { 1945 np->n_flag &= ~NNEEDINVALIDATE; 1946 nfs_node_unlock(np); 1947 error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); 1948 if (!error) 1949 error = nfs_node_lock(np); 1950 if (error) { 1951 FSDBG_BOT(514, np, 0xd1e0322, 0, error); 1952 return (error); 1953 } 1954 } 1955 1956 modified = (np->n_flag & NMODIFIED); 1957 nfs_node_unlock(np); 1958 /* nfs_getattr() will check changed and purge caches */ 1959 error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED); 1960 if (error) { 1961 FSDBG_BOT(514, np, 0xd1e0004, 0, error); 1962 return (error); 1963 } 1964 1965 if (uio_resid(uio) == 0) { 1966 FSDBG_BOT(514, np, 0xd1e0001, 0, 0); 1967 return (0); 1968 } 1969 if (uio_offset(uio) < 0) { 1970 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL); 1971 return (EINVAL); 1972 } 1973 1974 /* 1975 * set up readahead - which may be limited by: 1976 * + current request length (for IO_NOCACHE) 1977 * + readahead setting 1978 * + file size 1979 */ 1980 if (nmp->nm_readahead > 0) { 1981 off_t end = uio_offset(uio) + uio_resid(uio); 1982 if (end > (off_t)np->n_size) 1983 end = np->n_size; 1984 rabn = uio_offset(uio) / biosize; 1985 maxrabn = (end - 1) / biosize; 1986 nfs_node_lock_force(np); 1987 if (!(ioflag & IO_NOCACHE) && 1988 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) { 1989 maxrabn += nmp->nm_readahead; 1990 if ((maxrabn * biosize) >= (off_t)np->n_size) 1991 maxrabn = ((off_t)np->n_size - 1)/biosize; 1992 } 1993 if (maxrabn < np->n_lastrahead) 1994 np->n_lastrahead = -1; 1995 if (rabn < np->n_lastrahead) 1996 rabn = np->n_lastrahead + 1; 1997 nfs_node_unlock(np); 1998 } else { 1999 rabn = maxrabn = 0; 2000 } 2001 2002 do { 2003 2004 nfs_data_lock(np, NFS_DATA_LOCK_SHARED); 2005 lbn = uio_offset(uio) / biosize; 2006 2007 /* 2008 * Copy directly from any cached pages without grabbing the bufs. 2009 * (If we are NOCACHE and we've issued readahead requests, we need 2010 * to grab the NB_NCRDAHEAD bufs to drop them.) 2011 */ 2012 if ((!(ioflag & IO_NOCACHE) || !readaheads) && 2013 ((uio->uio_segflg == UIO_USERSPACE32 || 2014 uio->uio_segflg == UIO_USERSPACE64 || 2015 uio->uio_segflg == UIO_USERSPACE))) { 2016 io_resid = uio_resid(uio); 2017 diff = np->n_size - uio_offset(uio); 2018 if (diff < io_resid) 2019 io_resid = diff; 2020 if (io_resid > 0) { 2021 int count = (io_resid > INT_MAX) ? 
INT_MAX : io_resid; 2022 error = cluster_copy_ubc_data(vp, uio, &count, 0); 2023 if (error) { 2024 nfs_data_unlock(np); 2025 FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error); 2026 return (error); 2027 } 2028 } 2029 /* count any biocache reads that we just copied directly */ 2030 if (lbn != (uio_offset(uio)/biosize)) { 2031 OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads); 2032 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); 2033 } 2034 } 2035 2036 lbn = uio_offset(uio) / biosize; 2037 on = uio_offset(uio) % biosize; 2038 nfs_node_lock_force(np); 2039 np->n_lastread = (uio_offset(uio) - 1) / biosize; 2040 nfs_node_unlock(np); 2041 2042 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { 2043 nfs_data_unlock(np); 2044 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); 2045 return (0); 2046 } 2047 2048 /* adjust readahead block number, if necessary */ 2049 if (rabn < lbn) 2050 rabn = lbn; 2051 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead); 2052 if (rabn <= lastrabn) { /* start readaheads */ 2053 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred); 2054 if (error) { 2055 nfs_data_unlock(np); 2056 FSDBG_BOT(514, np, 0xd1e000b, 1, error); 2057 return (error); 2058 } 2059 readaheads = 1; 2060 } 2061 2062 OSAddAtomic64(1, &nfsstats.biocache_reads); 2063 2064 /* 2065 * If the block is in the cache and has the required data 2066 * in a valid region, just copy it out. 2067 * Otherwise, get the block and write back/read in, 2068 * as required. 2069 */ 2070again: 2071 io_resid = uio_resid(uio); 2072 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid; 2073 diff = np->n_size - uio_offset(uio); 2074 if (diff < n) 2075 n = diff; 2076 2077 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp); 2078 if (error) { 2079 nfs_data_unlock(np); 2080 FSDBG_BOT(514, np, 0xd1e000c, 0, error); 2081 return (error); 2082 } 2083 2084 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) { 2085 /* 2086 * IO_NOCACHE found a cached buffer. 2087 * Flush the buffer if it's dirty. 2088 * Invalidate the data if it wasn't just read 2089 * in as part of a "nocache readahead". 2090 */ 2091 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) { 2092 /* so write the buffer out and try again */ 2093 SET(bp->nb_flags, NB_NOCACHE); 2094 goto flushbuffer; 2095 } 2096 if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) { 2097 CLR(bp->nb_flags, NB_NCRDAHEAD); 2098 SET(bp->nb_flags, NB_NOCACHE); 2099 } 2100 } 2101 2102 /* if any pages are valid... 
*/ 2103 if (bp->nb_valid) { 2104 /* ...check for any invalid pages in the read range */ 2105 int pg, firstpg, lastpg, dirtypg; 2106 dirtypg = firstpg = lastpg = -1; 2107 pg = on/PAGE_SIZE; 2108 while (pg <= (on + n - 1)/PAGE_SIZE) { 2109 if (!NBPGVALID(bp,pg)) { 2110 if (firstpg < 0) 2111 firstpg = pg; 2112 lastpg = pg; 2113 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg)) 2114 dirtypg = pg; 2115 pg++; 2116 } 2117 2118 /* if there are no invalid pages, we're all set */ 2119 if (firstpg < 0) { 2120 if (bp->nb_validoff < 0) { 2121 /* valid range isn't set up, so */ 2122 /* set it to what we know is valid */ 2123 bp->nb_validoff = trunc_page(on); 2124 bp->nb_validend = round_page(on+n); 2125 nfs_buf_normalize_valid_range(np, bp); 2126 } 2127 goto buffer_ready; 2128 } 2129 2130 /* there are invalid pages in the read range */ 2131 if (((dirtypg > firstpg) && (dirtypg < lastpg)) || 2132 (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) { 2133 /* there are also dirty page(s) (or range) in the read range, */ 2134 /* so write the buffer out and try again */ 2135flushbuffer: 2136 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); 2137 SET(bp->nb_flags, NB_ASYNC); 2138 if (!IS_VALID_CRED(bp->nb_wcred)) { 2139 kauth_cred_ref(cred); 2140 bp->nb_wcred = cred; 2141 } 2142 error = nfs_buf_write(bp); 2143 if (error) { 2144 nfs_data_unlock(np); 2145 FSDBG_BOT(514, np, 0xd1e000d, 0, error); 2146 return (error); 2147 } 2148 goto again; 2149 } 2150 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 && 2151 (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) { 2152 /* we need to read in more than half the buffer and the */ 2153 /* buffer's not dirty, so just fetch the whole buffer */ 2154 bp->nb_valid = 0; 2155 } else { 2156 /* read the page range in */ 2157 uio_t auio; 2158 char uio_buf[ UIO_SIZEOF(1) ]; 2159 2160 NFS_BUF_MAP(bp); 2161 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64), 2162 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); 2163 if (!auio) { 2164 error = ENOMEM; 2165 } else { 2166 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)), 2167 ((lastpg - firstpg + 1) * PAGE_SIZE)); 2168 error = nfs_read_rpc(np, auio, ctx); 2169 } 2170 if (error) { 2171 if (ioflag & IO_NOCACHE) 2172 SET(bp->nb_flags, NB_NOCACHE); 2173 nfs_buf_release(bp, 1); 2174 nfs_data_unlock(np); 2175 FSDBG_BOT(514, np, 0xd1e000e, 0, error); 2176 return (error); 2177 } 2178 /* Make sure that the valid range is set to cover this read. 
*/ 2179 bp->nb_validoff = trunc_page_32(on); 2180 bp->nb_validend = round_page_32(on+n); 2181 nfs_buf_normalize_valid_range(np, bp); 2182 if (uio_resid(auio) > 0) { 2183 /* if short read, must have hit EOF, */ 2184 /* so zero the rest of the range */ 2185 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); 2186 } 2187 /* mark the pages (successfully read) as valid */ 2188 for (pg=firstpg; pg <= lastpg; pg++) 2189 NBPGVALID_SET(bp,pg); 2190 } 2191 } 2192 /* if no pages are valid, read the whole block */ 2193 if (!bp->nb_valid) { 2194 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) { 2195 kauth_cred_ref(cred); 2196 bp->nb_rcred = cred; 2197 } 2198 SET(bp->nb_flags, NB_READ); 2199 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL)); 2200 error = nfs_buf_read(bp); 2201 if (ioflag & IO_NOCACHE) 2202 SET(bp->nb_flags, NB_NOCACHE); 2203 if (error) { 2204 nfs_data_unlock(np); 2205 nfs_buf_release(bp, 1); 2206 FSDBG_BOT(514, np, 0xd1e000f, 0, error); 2207 return (error); 2208 } 2209 } 2210buffer_ready: 2211 /* validate read range against valid range and clip */ 2212 if (bp->nb_validend > 0) { 2213 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on); 2214 if (diff < n) 2215 n = diff; 2216 } 2217 if (n > 0) { 2218 NFS_BUF_MAP(bp); 2219 error = uiomove(bp->nb_data + on, n, uio); 2220 } 2221 2222 nfs_buf_release(bp, 1); 2223 nfs_data_unlock(np); 2224 nfs_node_lock_force(np); 2225 np->n_lastread = (uio_offset(uio) - 1) / biosize; 2226 nfs_node_unlock(np); 2227 } while (error == 0 && uio_resid(uio) > 0 && n > 0); 2228 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error); 2229 return (error); 2230} 2231 2232/* 2233 * limit the number of outstanding async I/O writes 2234 */ 2235int 2236nfs_async_write_start(struct nfsmount *nmp) 2237{ 2238 int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; 2239 struct timespec ts = {1, 0}; 2240 2241 if (nfs_max_async_writes <= 0) 2242 return (0); 2243 lck_mtx_lock(&nmp->nm_lock); 2244 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) { 2245 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) 2246 break; 2247 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts); 2248 slpflag = 0; 2249 } 2250 if (!error) 2251 nmp->nm_asyncwrites++; 2252 lck_mtx_unlock(&nmp->nm_lock); 2253 return (error); 2254} 2255void 2256nfs_async_write_done(struct nfsmount *nmp) 2257{ 2258 if (nmp->nm_asyncwrites <= 0) 2259 return; 2260 lck_mtx_lock(&nmp->nm_lock); 2261 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) 2262 wakeup(&nmp->nm_asyncwrites); 2263 lck_mtx_unlock(&nmp->nm_lock); 2264} 2265 2266/* 2267 * write (or commit) the given NFS buffer 2268 * 2269 * Commit the buffer if we can. 2270 * Write out any dirty range. 2271 * If any dirty pages remain, write them out. 2272 * Mark buffer done. 2273 * 2274 * For async requests, all the work beyond sending the initial 2275 * write RPC is handled in the RPC callback(s). 
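 *
 * Rough call flow (an illustrative summary of the functions below):
 *
 *	sync:	nfs_buf_write -> nfs_buf_write_rpc -> nfs_buf_write_rpc_finish
 *		-> nfs_buf_write_finish, then nfs_buf_iowait() collects the result
 *	async:	nfs_buf_write returns once the RPC(s) are sent; the nfsiod
 *		callback later runs nfs_buf_write_rpc_finish/nfs_buf_write_finish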
2276 */ 2277int 2278nfs_buf_write(struct nfsbuf *bp) 2279{ 2280 int error = 0, oldflags, async; 2281 nfsnode_t np; 2282 thread_t thd; 2283 kauth_cred_t cred; 2284 proc_t p = current_proc(); 2285 int iomode, doff, dend, firstpg, lastpg; 2286 uint32_t pagemask; 2287 2288 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0); 2289 2290 if (!ISSET(bp->nb_lflags, NBL_BUSY)) 2291 panic("nfs_buf_write: buffer is not busy???"); 2292 2293 np = bp->nb_np; 2294 async = ISSET(bp->nb_flags, NB_ASYNC); 2295 oldflags = bp->nb_flags; 2296 2297 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); 2298 if (ISSET(oldflags, NB_DELWRI)) { 2299 lck_mtx_lock(nfs_buf_mutex); 2300 nfs_nbdwrite--; 2301 NFSBUFCNTCHK(); 2302 lck_mtx_unlock(nfs_buf_mutex); 2303 wakeup(&nfs_nbdwrite); 2304 } 2305 2306 /* move to clean list */ 2307 if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) { 2308 lck_mtx_lock(nfs_buf_mutex); 2309 if (bp->nb_vnbufs.le_next != NFSNOLIST) 2310 LIST_REMOVE(bp, nb_vnbufs); 2311 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); 2312 lck_mtx_unlock(nfs_buf_mutex); 2313 } 2314 nfs_node_lock_force(np); 2315 np->n_numoutput++; 2316 nfs_node_unlock(np); 2317 vnode_startwrite(NFSTOV(np)); 2318 2319 if (p && p->p_stats) 2320 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); 2321 2322 cred = bp->nb_wcred; 2323 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) 2324 cred = bp->nb_rcred; /* shouldn't really happen, but... */ 2325 if (IS_VALID_CRED(cred)) 2326 kauth_cred_ref(cred); 2327 thd = async ? NULL : current_thread(); 2328 2329 /* We need to make sure the pages are locked before doing I/O. */ 2330 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) { 2331 if (!ISSET(bp->nb_flags, NB_PAGELIST)) { 2332 error = nfs_buf_upl_setup(bp); 2333 if (error) { 2334 printf("nfs_buf_write: upl create failed %d\n", error); 2335 SET(bp->nb_flags, NB_ERROR); 2336 bp->nb_error = error = EIO; 2337 nfs_buf_iodone(bp); 2338 goto out; 2339 } 2340 nfs_buf_upl_check(bp); 2341 } 2342 } 2343 2344 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. 
*/ 2345 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) 2346 nfs_buf_check_write_verifier(np, bp); 2347 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { 2348 struct nfsmount *nmp = NFSTONMP(np); 2349 if (!nmp) { 2350 SET(bp->nb_flags, NB_ERROR); 2351 bp->nb_error = error = EIO; 2352 nfs_buf_iodone(bp); 2353 goto out; 2354 } 2355 SET(bp->nb_flags, NB_WRITEINPROG); 2356 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, 2357 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); 2358 CLR(bp->nb_flags, NB_WRITEINPROG); 2359 if (error) { 2360 if (error != NFSERR_STALEWRITEVERF) { 2361 SET(bp->nb_flags, NB_ERROR); 2362 bp->nb_error = error; 2363 } 2364 nfs_buf_iodone(bp); 2365 goto out; 2366 } 2367 bp->nb_dirtyoff = bp->nb_dirtyend = 0; 2368 CLR(bp->nb_flags, NB_NEEDCOMMIT); 2369 nfs_node_lock_force(np); 2370 np->n_needcommitcnt--; 2371 CHECK_NEEDCOMMITCNT(np); 2372 nfs_node_unlock(np); 2373 } 2374 if (!error && (bp->nb_dirtyend > 0)) { 2375 /* sanity check the dirty range */ 2376 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) { 2377 bp->nb_dirtyend = np->n_size - NBOFF(bp); 2378 if (bp->nb_dirtyoff >= bp->nb_dirtyend) 2379 bp->nb_dirtyoff = bp->nb_dirtyend = 0; 2380 } 2381 } 2382 if (!error && (bp->nb_dirtyend > 0)) { 2383 /* there's a dirty range that needs to be written out */ 2384 NFS_BUF_MAP(bp); 2385 2386 doff = bp->nb_dirtyoff; 2387 dend = bp->nb_dirtyend; 2388 2389 /* if doff page is dirty, move doff to start of page */ 2390 if (NBPGDIRTY(bp, doff / PAGE_SIZE)) 2391 doff -= doff & PAGE_MASK; 2392 /* try to expand write range to include preceding dirty pages */ 2393 if (!(doff & PAGE_MASK)) 2394 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) 2395 doff -= PAGE_SIZE; 2396 /* if dend page is dirty, move dend to start of next page */ 2397 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) 2398 dend = round_page_32(dend); 2399 /* try to expand write range to include trailing dirty pages */ 2400 if (!(dend & PAGE_MASK)) 2401 while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) 2402 dend += PAGE_SIZE; 2403 /* make sure to keep dend clipped to EOF */ 2404 if ((NBOFF(bp) + dend) > (off_t) np->n_size) 2405 dend = np->n_size - NBOFF(bp); 2406 /* calculate range of complete pages being written */ 2407 firstpg = round_page_32(doff) / PAGE_SIZE; 2408 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE; 2409 /* calculate mask for that page range */ 2410 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); 2411 2412 /* 2413 * compare page mask to nb_dirty; if there are other dirty pages 2414 * then write FILESYNC; otherwise, write UNSTABLE if async and 2415 * not needcommit/stable; otherwise write FILESYNC 2416 */ 2417 if (bp->nb_dirty & ~pagemask) 2418 iomode = NFS_WRITE_FILESYNC; 2419 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) 2420 iomode = NFS_WRITE_UNSTABLE; 2421 else 2422 iomode = NFS_WRITE_FILESYNC; 2423 2424 /* write the whole contiguous dirty range */ 2425 bp->nb_offio = doff; 2426 bp->nb_endio = dend; 2427 2428 OSAddAtomic64(1, &nfsstats.write_bios); 2429 2430 SET(bp->nb_flags, NB_WRITEINPROG); 2431 error = nfs_buf_write_rpc(bp, iomode, thd, cred); 2432 /* 2433 * For async I/O, the callbacks will finish up the 2434 * write and push out any dirty pages. Otherwise, 2435 * the write has already been finished and any dirty 2436 * pages pushed out. 
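 *
 * (Worked example for the page mask computed above, assuming 4K pages:
 * doff = 4096 and dend = 16384 give firstpg = 1 and lastpg = 3, so
 * pagemask = ((1 << 4) - 1) & ~((1 << 1) - 1) = 0x0e, i.e. pages 1-3.)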
2437 */ 2438 } else { 2439 if (!error && bp->nb_dirty) /* write out any dirty pages */ 2440 error = nfs_buf_write_dirty_pages(bp, thd, cred); 2441 nfs_buf_iodone(bp); 2442 } 2443 /* note: bp is still valid only for !async case */ 2444out: 2445 if (!async) { 2446 error = nfs_buf_iowait(bp); 2447 /* move to clean list */ 2448 if (oldflags & NB_DELWRI) { 2449 lck_mtx_lock(nfs_buf_mutex); 2450 if (bp->nb_vnbufs.le_next != NFSNOLIST) 2451 LIST_REMOVE(bp, nb_vnbufs); 2452 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); 2453 lck_mtx_unlock(nfs_buf_mutex); 2454 } 2455 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error); 2456 nfs_buf_release(bp, 1); 2457 /* check if we need to invalidate (and we can) */ 2458 if ((np->n_flag & NNEEDINVALIDATE) && 2459 !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) { 2460 int invalidate = 0; 2461 nfs_node_lock_force(np); 2462 if (np->n_flag & NNEEDINVALIDATE) { 2463 invalidate = 1; 2464 np->n_flag &= ~NNEEDINVALIDATE; 2465 } 2466 nfs_node_unlock(np); 2467 if (invalidate) { 2468 /* 2469 * There was a write error and we need to 2470 * invalidate attrs and flush buffers in 2471 * order to sync up with the server. 2472 * (if this write was extending the file, 2473 * we may no longer know the correct size) 2474 * 2475 * But we couldn't call vinvalbuf while holding 2476 * the buffer busy. So we call vinvalbuf() after 2477 * releasing the buffer. 2478 */ 2479 nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1); 2480 } 2481 } 2482 } 2483 2484 if (IS_VALID_CRED(cred)) 2485 kauth_cred_unref(&cred); 2486 return (error); 2487} 2488 2489/* 2490 * finish the writing of a buffer 2491 */ 2492void 2493nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) 2494{ 2495 nfsnode_t np = bp->nb_np; 2496 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0; 2497 int firstpg, lastpg; 2498 uint32_t pagemask; 2499 2500 if ((error == EINTR) || (error == ERESTART)) { 2501 CLR(bp->nb_flags, NB_ERROR); 2502 SET(bp->nb_flags, NB_EINTR); 2503 } 2504 2505 if (!error) { 2506 /* calculate range of complete pages being written */ 2507 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE; 2508 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE; 2509 /* calculate mask for that page range written */ 2510 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1); 2511 /* clear dirty bits for pages we've written */ 2512 bp->nb_dirty &= ~pagemask; 2513 } 2514 2515 /* manage needcommit state */ 2516 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) { 2517 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { 2518 nfs_node_lock_force(np); 2519 np->n_needcommitcnt++; 2520 nfs_node_unlock(np); 2521 SET(bp->nb_flags, NB_NEEDCOMMIT); 2522 } 2523 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */ 2524 bp->nb_dirtyoff = bp->nb_offio; 2525 bp->nb_dirtyend = bp->nb_endio; 2526 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { 2527 nfs_node_lock_force(np); 2528 np->n_needcommitcnt--; 2529 CHECK_NEEDCOMMITCNT(np); 2530 nfs_node_unlock(np); 2531 CLR(bp->nb_flags, NB_NEEDCOMMIT); 2532 } 2533 2534 CLR(bp->nb_flags, NB_WRITEINPROG); 2535 2536 /* 2537 * For an unstable write, the buffer is still treated as dirty until 2538 * a commit (or stable (re)write) is performed. Buffers needing only 2539 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags. 2540 * 2541 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR 2542 * because that would cause the buffer to be dropped. The buffer is 2543 * still valid and simply needs to be written again. 
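 *
 * Summary of the cases handled below (a sketch of this function's
 * existing logic, not new behavior):
 *
 *	EINTR/ERESTART		-> keep NB_DELWRI, requeue on the dirty list
 *	unstable write done	-> NB_DELWRI | NB_NEEDCOMMIT until committed
 *	other write error	-> NWRITEERR/NNEEDINVALIDATE set on the node
 *	stable write done	-> dirty range cleared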
2544 */
2545 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2546 CLR(bp->nb_flags, NB_INVAL);
2547 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2548 SET(bp->nb_flags, NB_DELWRI);
2549 lck_mtx_lock(nfs_buf_mutex);
2550 nfs_nbdwrite++;
2551 NFSBUFCNTCHK();
2552 lck_mtx_unlock(nfs_buf_mutex);
2553 }
2554 /*
2555 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2556 * clean list, we have to reassign it back to the dirty one. Ugh.
2557 */
2558 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2559 /* move to dirty list */
2560 lck_mtx_lock(nfs_buf_mutex);
2561 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2562 LIST_REMOVE(bp, nb_vnbufs);
2563 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2564 lck_mtx_unlock(nfs_buf_mutex);
2565 }
2566 } else {
2567 /* either there's an error or we don't need to commit */
2568 if (error) {
2569 /*
2570 * There was a write error and we need to invalidate
2571 * attrs and flush buffers in order to sync up with the
2572 * server. (if this write was extending the file, we
2573 * may no longer know the correct size)
2574 *
2575 * But we can't call vinvalbuf while holding this
2576 * buffer busy. Set a flag to do it after releasing
2577 * the buffer.
2578 */
2579 nfs_node_lock_force(np);
2580 np->n_error = error;
2581 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2582 NATTRINVALIDATE(np);
2583 nfs_node_unlock(np);
2584 }
2585 /* clear the dirty range */
2586 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2587 }
2588
2589 if (!error && bp->nb_dirty)
2590 nfs_buf_write_dirty_pages(bp, thd, cred);
2591 nfs_buf_iodone(bp);
2592 }
2593
2594 /*
2595 * write out any pages marked dirty in a buffer
2596 *
2597 * We do use unstable writes and follow up with a commit.
2598 * If we catch the write verifier changing we'll restart and
2599 * do the writes filesync.
2600 */
2601 int
2602 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2603 {
2604 nfsnode_t np = bp->nb_np;
2605 struct nfsmount *nmp = NFSTONMP(np);
2606 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2607 uint32_t dirty = bp->nb_dirty;
2608 uint64_t wverf;
2609 uio_t auio;
2610 char uio_buf [ UIO_SIZEOF(1) ];
2611
2612 if (!bp->nb_dirty)
2613 return (0);
2614
2615 /* there are pages marked dirty that need to be written out */
2616 OSAddAtomic64(1, &nfsstats.write_bios);
2617 NFS_BUF_MAP(bp);
2618 SET(bp->nb_flags, NB_WRITEINPROG);
2619 npages = bp->nb_bufsize / PAGE_SIZE;
2620 iomode = NFS_WRITE_UNSTABLE;
2621
2622 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2623 &uio_buf, sizeof(uio_buf));
2624
2625 again:
2626 dirty = bp->nb_dirty;
2627 wverf = bp->nb_verf;
2628 commit = NFS_WRITE_FILESYNC;
2629 for (pg = 0; pg < npages; pg++) {
2630 if (!NBPGDIRTY(bp, pg))
2631 continue;
2632 count = 1;
2633 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
2634 count++;
2635 /* write count pages starting with page pg */
2636 off = pg * PAGE_SIZE;
2637 len = count * PAGE_SIZE;
2638 /* clip writes to EOF */
2639 if (NBOFF(bp) + off + len > (off_t) np->n_size)
2640 len -= (NBOFF(bp) + off + len) - np->n_size;
2641 if (len > 0) {
2642 iomode2 = iomode;
2643 uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2644 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2645 error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2646 if (error)
2647 break;
2648 if (iomode2 < commit) /* Retain the lowest commitment level returned.
*/ 2649 commit = iomode2; 2650 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) { 2651 /* verifier changed, redo all the writes filesync */ 2652 iomode = NFS_WRITE_FILESYNC; 2653 goto again; 2654 } 2655 } 2656 /* clear dirty bits */ 2657 while (count--) { 2658 dirty &= ~(1 << pg); 2659 if (count) /* leave pg on last page */ 2660 pg++; 2661 } 2662 } 2663 CLR(bp->nb_flags, NB_WRITEINPROG); 2664 2665 if (!error && (commit != NFS_WRITE_FILESYNC)) { 2666 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); 2667 if (error == NFSERR_STALEWRITEVERF) { 2668 /* verifier changed, so we need to restart all the writes */ 2669 iomode = NFS_WRITE_FILESYNC; 2670 goto again; 2671 } 2672 } 2673 if (!error) { 2674 bp->nb_dirty = dirty; 2675 } else { 2676 SET(bp->nb_flags, NB_ERROR); 2677 bp->nb_error = error; 2678 } 2679 return (error); 2680} 2681 2682/* 2683 * initiate the NFS WRITE RPC(s) for a buffer 2684 */ 2685int 2686nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred) 2687{ 2688 struct nfsmount *nmp; 2689 nfsnode_t np = bp->nb_np; 2690 int error = 0, nfsvers, async; 2691 int offset, nrpcs; 2692 uint32_t nmwsize, length, len; 2693 struct nfsreq *req; 2694 struct nfsreq_cbinfo cb; 2695 uio_t auio; 2696 char uio_buf [ UIO_SIZEOF(1) ]; 2697 2698 nmp = NFSTONMP(np); 2699 if (!nmp) { 2700 bp->nb_error = error = ENXIO; 2701 SET(bp->nb_flags, NB_ERROR); 2702 nfs_buf_iodone(bp); 2703 return (error); 2704 } 2705 nfsvers = nmp->nm_vers; 2706 nmwsize = nmp->nm_wsize; 2707 2708 offset = bp->nb_offio; 2709 length = bp->nb_endio - bp->nb_offio; 2710 2711 /* Note: Can only do async I/O if nfsiods are configured. */ 2712 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0); 2713 bp->nb_commitlevel = NFS_WRITE_FILESYNC; 2714 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL; 2715 cb.rcb_bp = bp; 2716 2717 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) { 2718 bp->nb_error = error = EFBIG; 2719 SET(bp->nb_flags, NB_ERROR); 2720 nfs_buf_iodone(bp); 2721 return (error); 2722 } 2723 2724 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, 2725 UIO_WRITE, &uio_buf, sizeof(uio_buf)); 2726 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); 2727 2728 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize; 2729 if (async && (nrpcs > 1)) { 2730 SET(bp->nb_flags, NB_MULTASYNCRPC); 2731 } else { 2732 CLR(bp->nb_flags, NB_MULTASYNCRPC); 2733 } 2734 2735 while (length > 0) { 2736 if (ISSET(bp->nb_flags, NB_ERROR)) { 2737 error = bp->nb_error; 2738 break; 2739 } 2740 len = (length > nmwsize) ? nmwsize : length; 2741 cb.rcb_args[0] = offset; 2742 cb.rcb_args[1] = len; 2743 if (nmp->nm_vers >= NFS_VER4) 2744 cb.rcb_args[2] = nmp->nm_stategenid; 2745 if (async && ((error = nfs_async_write_start(nmp)))) 2746 break; 2747 req = NULL; 2748 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred, 2749 iomode, &cb, &req); 2750 if (error) { 2751 if (async) 2752 nfs_async_write_done(nmp); 2753 break; 2754 } 2755 offset += len; 2756 length -= len; 2757 if (async) 2758 continue; 2759 nfs_buf_write_rpc_finish(req); 2760 } 2761 2762 if (length > 0) { 2763 /* 2764 * Something bad happened while trying to send the RPCs. 2765 * Wait for any outstanding requests to complete. 
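 *
 * (Illustrative example: if the buffer was split into 4 RPCs and the
 * send loop failed after issuing 2, the remaining length yields
 * nrpcs = 2 below, so nb_rpcs is reduced by exactly the never-sent
 * requests; the 2 in flight still finish via nfs_buf_write_rpc_finish().)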
2766 */ 2767 bp->nb_error = error; 2768 SET(bp->nb_flags, NB_ERROR); 2769 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { 2770 nrpcs = (length + nmwsize - 1) / nmwsize; 2771 lck_mtx_lock(nfs_buf_mutex); 2772 bp->nb_rpcs -= nrpcs; 2773 if (bp->nb_rpcs == 0) { 2774 /* No RPCs left, so the buffer's done */ 2775 lck_mtx_unlock(nfs_buf_mutex); 2776 nfs_buf_write_finish(bp, thd, cred); 2777 } else { 2778 /* wait for the last RPC to mark it done */ 2779 while (bp->nb_rpcs > 0) 2780 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, 2781 "nfs_buf_write_rpc_cancel", NULL); 2782 lck_mtx_unlock(nfs_buf_mutex); 2783 } 2784 } else { 2785 nfs_buf_write_finish(bp, thd, cred); 2786 } 2787 /* It may have just been an interrupt... that's OK */ 2788 if (!ISSET(bp->nb_flags, NB_ERROR)) 2789 error = 0; 2790 } 2791 2792 return (error); 2793} 2794 2795/* 2796 * finish up an NFS WRITE RPC on a buffer 2797 */ 2798void 2799nfs_buf_write_rpc_finish(struct nfsreq *req) 2800{ 2801 int error = 0, nfsvers, offset, length, multasyncrpc, finished; 2802 int committed = NFS_WRITE_FILESYNC; 2803 uint64_t wverf = 0; 2804 size_t rlen; 2805 void *wakeme = NULL; 2806 struct nfsreq_cbinfo cb; 2807 struct nfsreq *wreq = NULL; 2808 struct nfsbuf *bp; 2809 struct nfsmount *nmp; 2810 nfsnode_t np; 2811 thread_t thd; 2812 kauth_cred_t cred; 2813 uio_t auio; 2814 char uio_buf [ UIO_SIZEOF(1) ]; 2815 2816finish: 2817 np = req->r_np; 2818 thd = req->r_thread; 2819 cred = req->r_cred; 2820 if (IS_VALID_CRED(cred)) 2821 kauth_cred_ref(cred); 2822 cb = req->r_callback; 2823 bp = cb.rcb_bp; 2824 if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ 2825 nfs_request_ref(req, 0); 2826 2827 nmp = NFSTONMP(np); 2828 if (!nmp) { 2829 SET(bp->nb_flags, NB_ERROR); 2830 bp->nb_error = error = ENXIO; 2831 } 2832 if (error || ISSET(bp->nb_flags, NB_ERROR)) { 2833 /* just drop it */ 2834 nfs_request_async_cancel(req); 2835 goto out; 2836 } 2837 nfsvers = nmp->nm_vers; 2838 2839 offset = cb.rcb_args[0]; 2840 rlen = length = cb.rcb_args[1]; 2841 2842 /* finish the RPC */ 2843 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); 2844 if ((error == EINPROGRESS) && cb.rcb_func) { 2845 /* async request restarted */ 2846 if (cb.rcb_func) 2847 nfs_request_rele(req); 2848 if (IS_VALID_CRED(cred)) 2849 kauth_cred_unref(&cred); 2850 return; 2851 } 2852 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { 2853 lck_mtx_lock(&nmp->nm_lock); 2854 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { 2855 NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", 2856 error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); 2857 nfs_need_recover(nmp, error); 2858 } 2859 lck_mtx_unlock(&nmp->nm_lock); 2860 if (np->n_flag & NREVOKE) { 2861 error = EIO; 2862 } else { 2863 if (error == NFSERR_GRACE) { 2864 if (cb.rcb_func) { 2865 /* 2866 * For an async I/O request, handle a grace delay just like 2867 * jukebox errors. Set the resend time and queue it up. 
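 * (The steps below mirror the jukebox/delay handling: free any partial
 * reply, clear r_error, stamp r_resendtime two seconds out, zero r_xid
 * so a fresh XID is assigned, and hand the request to
 * nfs_asyncio_resend() for the mount's resend queue.)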
2868 */ 2869 struct timeval now; 2870 if (req->r_nmrep.nmc_mhead) { 2871 mbuf_freem(req->r_nmrep.nmc_mhead); 2872 req->r_nmrep.nmc_mhead = NULL; 2873 } 2874 req->r_error = 0; 2875 microuptime(&now); 2876 lck_mtx_lock(&req->r_mtx); 2877 req->r_resendtime = now.tv_sec + 2; 2878 req->r_xid = 0; // get a new XID 2879 req->r_flags |= R_RESTART; 2880 req->r_start = 0; 2881 nfs_asyncio_resend(req); 2882 lck_mtx_unlock(&req->r_mtx); 2883 if (IS_VALID_CRED(cred)) 2884 kauth_cred_unref(&cred); 2885 /* Note: nfsreq reference taken will be dropped later when finished */ 2886 return; 2887 } 2888 /* otherwise, just pause a couple seconds and retry */ 2889 tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); 2890 } 2891 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { 2892 rlen = 0; 2893 goto writeagain; 2894 } 2895 } 2896 } 2897 if (error) { 2898 SET(bp->nb_flags, NB_ERROR); 2899 bp->nb_error = error; 2900 } 2901 if (error || (nfsvers == NFS_VER2)) 2902 goto out; 2903 if (rlen <= 0) { 2904 SET(bp->nb_flags, NB_ERROR); 2905 bp->nb_error = error = EIO; 2906 goto out; 2907 } 2908 2909 /* save lowest commit level returned */ 2910 if (committed < bp->nb_commitlevel) 2911 bp->nb_commitlevel = committed; 2912 2913 /* check the write verifier */ 2914 if (!bp->nb_verf) { 2915 bp->nb_verf = wverf; 2916 } else if (bp->nb_verf != wverf) { 2917 /* verifier changed, so buffer will need to be rewritten */ 2918 bp->nb_flags |= NB_STALEWVERF; 2919 bp->nb_commitlevel = NFS_WRITE_UNSTABLE; 2920 bp->nb_verf = wverf; 2921 } 2922 2923 /* 2924 * check for a short write 2925 * 2926 * If the server didn't write all the data, then we 2927 * need to issue another write for the rest of it. 2928 * (Don't bother if the buffer hit an error or stale wverf.) 2929 */ 2930 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) { 2931writeagain: 2932 offset += rlen; 2933 length -= rlen; 2934 2935 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE, 2936 UIO_WRITE, &uio_buf, sizeof(uio_buf)); 2937 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length); 2938 2939 cb.rcb_args[0] = offset; 2940 cb.rcb_args[1] = length; 2941 if (nmp->nm_vers >= NFS_VER4) 2942 cb.rcb_args[2] = nmp->nm_stategenid; 2943 2944 // XXX iomode should really match the original request 2945 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred, 2946 NFS_WRITE_FILESYNC, &cb, &wreq); 2947 if (!error) { 2948 if (IS_VALID_CRED(cred)) 2949 kauth_cred_unref(&cred); 2950 if (!cb.rcb_func) { 2951 /* if !async we'll need to wait for this RPC to finish */ 2952 req = wreq; 2953 wreq = NULL; 2954 goto finish; 2955 } 2956 nfs_request_rele(req); 2957 /* 2958 * We're done here. 2959 * Outstanding RPC count is unchanged. 2960 * Callback will be called when RPC is done. 2961 */ 2962 return; 2963 } 2964 SET(bp->nb_flags, NB_ERROR); 2965 bp->nb_error = error; 2966 } 2967 2968out: 2969 if (cb.rcb_func) { 2970 nfs_async_write_done(nmp); 2971 nfs_request_rele(req); 2972 } 2973 /* 2974 * Decrement outstanding RPC count on buffer 2975 * and call nfs_buf_write_finish on last RPC. 
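 * (When only a single RPC was issued, no lock is needed around the
 * decrement below: this callback is the sole updater of nb_rpcs.)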
2976 * 2977 * (Note: when there are multiple async RPCs issued for a 2978 * buffer we need nfs_buffer_mutex to avoid problems when 2979 * aborting a partially-initiated set of RPCs) 2980 */ 2981 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); 2982 if (multasyncrpc) 2983 lck_mtx_lock(nfs_buf_mutex); 2984 2985 bp->nb_rpcs--; 2986 finished = (bp->nb_rpcs == 0); 2987 2988 if (multasyncrpc) 2989 lck_mtx_unlock(nfs_buf_mutex); 2990 2991 if (finished) { 2992 if (multasyncrpc) 2993 wakeme = &bp->nb_rpcs; 2994 nfs_buf_write_finish(bp, thd, cred); 2995 if (wakeme) 2996 wakeup(wakeme); 2997 } 2998 2999 if (IS_VALID_CRED(cred)) 3000 kauth_cred_unref(&cred); 3001} 3002 3003/* 3004 * Send commit(s) for the given node's "needcommit" buffers 3005 */ 3006int 3007nfs_flushcommits(nfsnode_t np, int nowait) 3008{ 3009 struct nfsmount *nmp; 3010 struct nfsbuf *bp, *prevlbp, *lbp; 3011 struct nfsbuflists blist, commitlist; 3012 int error = 0, retv, wcred_set, flags, dirty; 3013 u_quad_t off, endoff, toff; 3014 uint64_t wverf; 3015 u_int32_t count; 3016 kauth_cred_t wcred = NULL; 3017 3018 FSDBG_TOP(557, np, 0, 0, 0); 3019 3020 /* 3021 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the 3022 * server, but nas not been committed to stable storage on the server 3023 * yet. The byte range is worked out for as many nfsbufs as we can handle 3024 * and the commit rpc is done. 3025 */ 3026 if (!LIST_EMPTY(&np->n_dirtyblkhd)) { 3027 error = nfs_node_lock(np); 3028 if (error) 3029 goto done; 3030 np->n_flag |= NMODIFIED; 3031 nfs_node_unlock(np); 3032 } 3033 3034 off = (u_quad_t)-1; 3035 endoff = 0; 3036 wcred_set = 0; 3037 LIST_INIT(&commitlist); 3038 3039 nmp = NFSTONMP(np); 3040 if (!nmp) { 3041 error = ENXIO; 3042 goto done; 3043 } 3044 if (nmp->nm_vers == NFS_VER2) { 3045 error = EINVAL; 3046 goto done; 3047 } 3048 3049 flags = NBI_DIRTY; 3050 if (nowait) 3051 flags |= NBI_NOWAIT; 3052 lck_mtx_lock(nfs_buf_mutex); 3053 wverf = nmp->nm_verf; 3054 if (!nfs_buf_iterprepare(np, &blist, flags)) { 3055 while ((bp = LIST_FIRST(&blist))) { 3056 LIST_REMOVE(bp, nb_vnbufs); 3057 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); 3058 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0); 3059 if (error) 3060 continue; 3061 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) 3062 nfs_buf_check_write_verifier(np, bp); 3063 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || 3064 (bp->nb_verf != wverf)) { 3065 nfs_buf_drop(bp); 3066 continue; 3067 } 3068 nfs_buf_remfree(bp); 3069 3070 /* buffer UPLs will be grabbed *in order* below */ 3071 3072 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty); 3073 FSDBG(557, bp->nb_validoff, bp->nb_validend, 3074 bp->nb_dirtyoff, bp->nb_dirtyend); 3075 3076 /* 3077 * Work out if all buffers are using the same cred 3078 * so we can deal with them all with one commit. 3079 * 3080 * Note: creds in bp's must be obtained by kauth_cred_ref 3081 * on the same original cred in order for them to be equal. 3082 */ 3083 if (wcred_set == 0) { 3084 wcred = bp->nb_wcred; 3085 if (!IS_VALID_CRED(wcred)) 3086 panic("nfs: needcommit w/out wcred"); 3087 wcred_set = 1; 3088 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) { 3089 wcred_set = -1; 3090 } 3091 SET(bp->nb_flags, NB_WRITEINPROG); 3092 3093 /* 3094 * Add this buffer to the list of buffers we are committing. 3095 * Buffers are inserted into the list in ascending order so that 3096 * we can take the UPLs in order after the list is complete. 
3097 */ 3098 prevlbp = NULL; 3099 LIST_FOREACH(lbp, &commitlist, nb_vnbufs) { 3100 if (bp->nb_lblkno < lbp->nb_lblkno) 3101 break; 3102 prevlbp = lbp; 3103 } 3104 LIST_REMOVE(bp, nb_vnbufs); 3105 if (prevlbp) 3106 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs); 3107 else 3108 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs); 3109 3110 /* update commit range start, end */ 3111 toff = NBOFF(bp) + bp->nb_dirtyoff; 3112 if (toff < off) 3113 off = toff; 3114 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff); 3115 if (toff > endoff) 3116 endoff = toff; 3117 } 3118 nfs_buf_itercomplete(np, &blist, NBI_DIRTY); 3119 } 3120 lck_mtx_unlock(nfs_buf_mutex); 3121 3122 if (LIST_EMPTY(&commitlist)) { 3123 error = ENOBUFS; 3124 goto done; 3125 } 3126 3127 /* 3128 * We need a UPL to prevent others from accessing the buffers during 3129 * our commit RPC(s). 3130 * 3131 * We used to also check for dirty pages here; if there were any we'd 3132 * abort the commit and force the entire buffer to be written again. 3133 * Instead of doing that, we just go ahead and commit the dirty range, 3134 * and then leave the buffer around with dirty pages that will be 3135 * written out later. 3136 */ 3137 LIST_FOREACH(bp, &commitlist, nb_vnbufs) { 3138 if (!ISSET(bp->nb_flags, NB_PAGELIST)) { 3139 retv = nfs_buf_upl_setup(bp); 3140 if (retv) { 3141 /* Unable to create the UPL, the VM object probably no longer exists. */ 3142 printf("nfs_flushcommits: upl create failed %d\n", retv); 3143 bp->nb_valid = bp->nb_dirty = 0; 3144 } 3145 } 3146 nfs_buf_upl_check(bp); 3147 } 3148 3149 /* 3150 * Commit data on the server, as required. 3151 * If all bufs are using the same wcred, then use that with 3152 * one call for all of them, otherwise commit each one 3153 * separately. 3154 */ 3155 if (wcred_set == 1) { 3156 /* 3157 * Note, it's possible the commit range could be >2^32-1. 3158 * If it is, we'll send one commit that covers the whole file. 3159 */ 3160 if ((endoff - off) > 0xffffffff) 3161 count = 0; 3162 else 3163 count = (endoff - off); 3164 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); 3165 } else { 3166 retv = 0; 3167 LIST_FOREACH(bp, &commitlist, nb_vnbufs) { 3168 toff = NBOFF(bp) + bp->nb_dirtyoff; 3169 count = bp->nb_dirtyend - bp->nb_dirtyoff; 3170 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); 3171 if (retv) 3172 break; 3173 } 3174 } 3175 3176 /* 3177 * Now, either mark the blocks I/O done or mark the 3178 * blocks dirty, depending on whether the commit 3179 * succeeded. 3180 */ 3181 while ((bp = LIST_FIRST(&commitlist))) { 3182 LIST_REMOVE(bp, nb_vnbufs); 3183 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty); 3184 nfs_node_lock_force(np); 3185 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG)); 3186 np->n_needcommitcnt--; 3187 CHECK_NEEDCOMMITCNT(np); 3188 nfs_node_unlock(np); 3189 3190 if (retv) { 3191 /* move back to dirty list */ 3192 lck_mtx_lock(nfs_buf_mutex); 3193 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); 3194 lck_mtx_unlock(nfs_buf_mutex); 3195 nfs_buf_release(bp, 1); 3196 continue; 3197 } 3198 3199 nfs_node_lock_force(np); 3200 np->n_numoutput++; 3201 nfs_node_unlock(np); 3202 vnode_startwrite(NFSTOV(np)); 3203 if (ISSET(bp->nb_flags, NB_DELWRI)) { 3204 lck_mtx_lock(nfs_buf_mutex); 3205 nfs_nbdwrite--; 3206 NFSBUFCNTCHK(); 3207 lck_mtx_unlock(nfs_buf_mutex); 3208 wakeup(&nfs_nbdwrite); 3209 } 3210 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI)); 3211 /* if block still has dirty pages, we don't want it to */ 3212 /* be released in nfs_buf_iodone(). 
So, don't set NB_ASYNC. */ 3213 if (!(dirty = bp->nb_dirty)) 3214 SET(bp->nb_flags, NB_ASYNC); 3215 else 3216 CLR(bp->nb_flags, NB_ASYNC); 3217 3218 /* move to clean list */ 3219 lck_mtx_lock(nfs_buf_mutex); 3220 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); 3221 lck_mtx_unlock(nfs_buf_mutex); 3222 3223 bp->nb_dirtyoff = bp->nb_dirtyend = 0; 3224 3225 nfs_buf_iodone(bp); 3226 if (dirty) { 3227 /* throw it back in as a delayed write buffer */ 3228 CLR(bp->nb_flags, NB_DONE); 3229 nfs_buf_write_delayed(bp); 3230 } 3231 } 3232 3233done: 3234 FSDBG_BOT(557, np, 0, 0, error); 3235 return (error); 3236} 3237 3238/* 3239 * Flush all the blocks associated with a vnode. 3240 * Walk through the buffer pool and push any dirty pages 3241 * associated with the vnode. 3242 */ 3243int 3244nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) 3245{ 3246 struct nfsbuf *bp; 3247 struct nfsbuflists blist; 3248 struct nfsmount *nmp = NFSTONMP(np); 3249 int error = 0, error2, slptimeo = 0, slpflag = 0; 3250 int nfsvers, flags, passone = 1; 3251 3252 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0); 3253 3254 if (!nmp) { 3255 error = ENXIO; 3256 goto out; 3257 } 3258 nfsvers = nmp->nm_vers; 3259 if (NMFLAG(nmp, INTR)) 3260 slpflag = PCATCH; 3261 3262 if (!LIST_EMPTY(&np->n_dirtyblkhd)) { 3263 nfs_node_lock_force(np); 3264 np->n_flag |= NMODIFIED; 3265 nfs_node_unlock(np); 3266 } 3267 3268 lck_mtx_lock(nfs_buf_mutex); 3269 while (np->n_bflag & NBFLUSHINPROG) { 3270 np->n_bflag |= NBFLUSHWANT; 3271 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); 3272 if ((error && (error != EWOULDBLOCK)) || 3273 ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { 3274 lck_mtx_unlock(nfs_buf_mutex); 3275 goto out; 3276 } 3277 } 3278 np->n_bflag |= NBFLUSHINPROG; 3279 3280 /* 3281 * On the first pass, start async/unstable writes on all 3282 * delayed write buffers. Then wait for all writes to complete 3283 * and call nfs_flushcommits() to commit any uncommitted buffers. 3284 * On all subsequent passes, start STABLE writes on any remaining 3285 * dirty buffers. Then wait for all writes to complete. 3286 */ 3287again: 3288 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0); 3289 if (!NFSTONMP(np)) { 3290 lck_mtx_unlock(nfs_buf_mutex); 3291 error = ENXIO; 3292 goto done; 3293 } 3294 3295 /* Start/do any write(s) that are required. */ 3296 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { 3297 while ((bp = LIST_FIRST(&blist))) { 3298 LIST_REMOVE(bp, nb_vnbufs); 3299 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); 3300 flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? 
NBAC_NOWAIT : 0;
3301 if (flags != NBAC_NOWAIT)
3302 nfs_buf_refget(bp);
3303 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3304 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3305 if (error == EBUSY)
3306 break;
3307 if (error) {
3308 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3309 if (error2) {
3310 if (flags != NBAC_NOWAIT)
3311 nfs_buf_refrele(bp);
3312 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3313 lck_mtx_unlock(nfs_buf_mutex);
3314 error = error2;
3315 goto done;
3316 }
3317 if (slpflag == PCATCH) {
3318 slpflag = 0;
3319 slptimeo = 2 * hz;
3320 }
3321 }
3322 }
3323 if (flags != NBAC_NOWAIT)
3324 nfs_buf_refrele(bp);
3325 if (error == EBUSY)
3326 continue;
3327 if (!bp->nb_np) {
3328 /* buffer is no longer valid */
3329 nfs_buf_drop(bp);
3330 continue;
3331 }
3332 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3333 nfs_buf_check_write_verifier(np, bp);
3334 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3335 /* buffer is no longer dirty */
3336 nfs_buf_drop(bp);
3337 continue;
3338 }
3339 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3340 if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3341 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3342 nfs_buf_drop(bp);
3343 continue;
3344 }
3345 nfs_buf_remfree(bp);
3346 lck_mtx_unlock(nfs_buf_mutex);
3347 if (ISSET(bp->nb_flags, NB_ERROR)) {
3348 nfs_node_lock_force(np);
3349 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3350 np->n_flag |= NWRITEERR;
3351 nfs_node_unlock(np);
3352 nfs_buf_release(bp, 1);
3353 lck_mtx_lock(nfs_buf_mutex);
3354 continue;
3355 }
3356 SET(bp->nb_flags, NB_ASYNC);
3357 if (!passone) {
3358 /* NB_STABLE forces this to be written FILESYNC */
3359 SET(bp->nb_flags, NB_STABLE);
3360 }
3361 nfs_buf_write(bp);
3362 lck_mtx_lock(nfs_buf_mutex);
3363 }
3364 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3365 }
3366 lck_mtx_unlock(nfs_buf_mutex);
3367
3368 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3369 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3370 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3371 if (error2) {
3372 error = error2;
3373 goto done;
3374 }
3375 if (slpflag == PCATCH) {
3376 slpflag = 0;
3377 slptimeo = 2 * hz;
3378 }
3379 }
3380 }
3381
3382 if (nfsvers != NFS_VER2) {
3383 /* loop while it looks like there are still buffers to be */
3384 /* committed and nfs_flushcommits() seems to be handling them. */
3385 while (np->n_needcommitcnt)
3386 if (nfs_flushcommits(np, 0))
3387 break;
3388 }
3389
3390 if (passone) {
3391 passone = 0;
3392 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3393 nfs_node_lock_force(np);
3394 np->n_flag |= NMODIFIED;
3395 nfs_node_unlock(np);
3396 }
3397 lck_mtx_lock(nfs_buf_mutex);
3398 goto again;
3399 }
3400
3401 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3402 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3403 nfs_node_lock_force(np);
3404 np->n_flag |= NMODIFIED;
3405 nfs_node_unlock(np);
3406 }
3407 lck_mtx_lock(nfs_buf_mutex);
3408 if (!LIST_EMPTY(&np->n_dirtyblkhd))
3409 goto again;
3410 lck_mtx_unlock(nfs_buf_mutex);
3411 nfs_node_lock_force(np);
3412 /*
3413 * OK, it looks like there are no dirty blocks. If we have no
3414 * writes in flight and no one in the write code, we can clear
3415 * the modified flag. In order to make sure we see the latest
3416 * attributes and size, we also invalidate the attributes and
3417 * advance the attribute cache XID to guarantee that attributes
3418 * newer than our clearing of NMODIFIED will get loaded next.
3419 * (If we don't do this, it's possible for the flush's final 3420 * write/commit (xid1) to be executed in parallel with a subsequent 3421 * getattr request (xid2). The getattr could return attributes 3422 * from *before* the write/commit completed but the stale attributes 3423 * would be preferred because of the xid ordering.) 3424 */ 3425 if (!np->n_wrbusy && !np->n_numoutput) { 3426 np->n_flag &= ~NMODIFIED; 3427 NATTRINVALIDATE(np); 3428 nfs_get_xid(&np->n_xid); 3429 } 3430 } else { 3431 nfs_node_lock_force(np); 3432 } 3433 3434 FSDBG(526, np->n_flag, np->n_error, 0, 0); 3435 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) { 3436 error = np->n_error; 3437 np->n_flag &= ~NWRITEERR; 3438 } 3439 nfs_node_unlock(np); 3440done: 3441 lck_mtx_lock(nfs_buf_mutex); 3442 flags = np->n_bflag; 3443 np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT); 3444 lck_mtx_unlock(nfs_buf_mutex); 3445 if (flags & NBFLUSHWANT) 3446 wakeup(&np->n_bflag); 3447out: 3448 FSDBG_BOT(517, np, error, ignore_writeerr, 0); 3449 return (error); 3450} 3451 3452/* 3453 * Flush out and invalidate all buffers associated with a vnode. 3454 * Called with the underlying object locked. 3455 */ 3456int 3457nfs_vinvalbuf_internal( 3458 nfsnode_t np, 3459 int flags, 3460 thread_t thd, 3461 kauth_cred_t cred, 3462 int slpflag, 3463 int slptimeo) 3464{ 3465 struct nfsbuf *bp; 3466 struct nfsbuflists blist; 3467 int list, error = 0; 3468 3469 if (flags & V_SAVE) { 3470 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) 3471 return (error); 3472 } 3473 3474 lck_mtx_lock(nfs_buf_mutex); 3475 for (;;) { 3476 list = NBI_CLEAN; 3477 if (nfs_buf_iterprepare(np, &blist, list)) { 3478 list = NBI_DIRTY; 3479 if (nfs_buf_iterprepare(np, &blist, list)) 3480 break; 3481 } 3482 while ((bp = LIST_FIRST(&blist))) { 3483 LIST_REMOVE(bp, nb_vnbufs); 3484 if (list == NBI_CLEAN) 3485 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); 3486 else 3487 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); 3488 nfs_buf_refget(bp); 3489 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) { 3490 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags); 3491 if (error != EAGAIN) { 3492 FSDBG(554, np, bp, -1, error); 3493 nfs_buf_refrele(bp); 3494 nfs_buf_itercomplete(np, &blist, list); 3495 lck_mtx_unlock(nfs_buf_mutex); 3496 return (error); 3497 } 3498 } 3499 nfs_buf_refrele(bp); 3500 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags); 3501 lck_mtx_unlock(nfs_buf_mutex); 3502 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np && 3503 (NBOFF(bp) < (off_t)np->n_size)) { 3504 /* extra paranoia: make sure we're not */ 3505 /* somehow leaving any dirty data around */ 3506 int mustwrite = 0; 3507 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ? 
3508 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize; 3509 if (!ISSET(bp->nb_flags, NB_PAGELIST)) { 3510 error = nfs_buf_upl_setup(bp); 3511 if (error == EINVAL) { 3512 /* vm object must no longer exist */ 3513 /* hopefully we don't need to do */ 3514 /* anything for this buffer */ 3515 } else if (error) 3516 printf("nfs_vinvalbuf: upl setup failed %d\n", error); 3517 bp->nb_valid = bp->nb_dirty = 0; 3518 } 3519 nfs_buf_upl_check(bp); 3520 /* check for any dirty data before the EOF */ 3521 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) { 3522 /* clip dirty range to EOF */ 3523 if (bp->nb_dirtyend > end) { 3524 bp->nb_dirtyend = end; 3525 if (bp->nb_dirtyoff >= bp->nb_dirtyend) 3526 bp->nb_dirtyoff = bp->nb_dirtyend = 0; 3527 } 3528 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) 3529 mustwrite++; 3530 } 3531 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1; 3532 if (bp->nb_dirty) 3533 mustwrite++; 3534 /* also make sure we'll have a credential to do the write */ 3535 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) { 3536 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n"); 3537 mustwrite = 0; 3538 } 3539 if (mustwrite) { 3540 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags); 3541 if (!ISSET(bp->nb_flags, NB_PAGELIST)) 3542 panic("nfs_vinvalbuf: dirty buffer without upl"); 3543 /* gotta write out dirty data before invalidating */ 3544 /* (NB_STABLE indicates that data writes should be FILESYNC) */ 3545 /* (NB_NOCACHE indicates buffer should be discarded) */ 3546 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC)); 3547 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE); 3548 if (!IS_VALID_CRED(bp->nb_wcred)) { 3549 kauth_cred_ref(cred); 3550 bp->nb_wcred = cred; 3551 } 3552 error = nfs_buf_write(bp); 3553 // Note: bp has been released 3554 if (error) { 3555 FSDBG(554, bp, 0xd00dee, 0xbad, error); 3556 nfs_node_lock_force(np); 3557 if ((error != EINTR) && (error != ERESTART)) { 3558 np->n_error = error; 3559 np->n_flag |= NWRITEERR; 3560 } 3561 /* 3562 * There was a write error and we need to 3563 * invalidate attrs to sync with server. 3564 * (if this write was extending the file, 3565 * we may no longer know the correct size) 3566 */ 3567 NATTRINVALIDATE(np); 3568 nfs_node_unlock(np); 3569 if ((error == EINTR) || (error == ERESTART)) { 3570 /* 3571 * Abort on EINTR. If we don't, we could 3572 * be stuck in this loop forever because 3573 * the buffer will continue to stay dirty. 3574 */ 3575 lck_mtx_lock(nfs_buf_mutex); 3576 nfs_buf_itercomplete(np, &blist, list); 3577 lck_mtx_unlock(nfs_buf_mutex); 3578 return (error); 3579 } 3580 error = 0; 3581 } 3582 lck_mtx_lock(nfs_buf_mutex); 3583 continue; 3584 } 3585 } 3586 SET(bp->nb_flags, NB_INVAL); 3587 // hold off on FREEUPs until we're done here 3588 nfs_buf_release(bp, 0); 3589 lck_mtx_lock(nfs_buf_mutex); 3590 } 3591 nfs_buf_itercomplete(np, &blist, list); 3592 } 3593 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) 3594 panic("nfs_vinvalbuf: flush/inval failed"); 3595 lck_mtx_unlock(nfs_buf_mutex); 3596 nfs_node_lock_force(np); 3597 if (!(flags & V_SAVE)) 3598 np->n_flag &= ~NMODIFIED; 3599 if (vnode_vtype(NFSTOV(np)) == VREG) 3600 np->n_lastrahead = -1; 3601 nfs_node_unlock(np); 3602 NFS_BUF_FREEUP(); 3603 return (0); 3604} 3605 3606 3607/* 3608 * Flush and invalidate all dirty buffers. If another process is already 3609 * doing the flush, just wait for completion. 
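 *
 * Typical usage (a sketch; the flags shown are the ones handled in
 * nfs_vinvalbuf_internal() above):
 *
 *	error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
 *	// V_SAVE writes dirty data back to the server before invalidating;
 *	// the final argument allows interruption, honored only on INTR mounts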
3610 */ 3611int 3612nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg) 3613{ 3614 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg); 3615} 3616 3617int 3618nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg) 3619{ 3620 nfsnode_t np = VTONFS(vp); 3621 struct nfsmount *nmp = VTONMP(vp); 3622 int error, slpflag, slptimeo, nflags, retry = 0; 3623 struct timespec ts = { 2, 0 }; 3624 off_t size; 3625 3626 FSDBG_TOP(554, np, flags, intrflg, 0); 3627 3628 if (nmp && !NMFLAG(nmp, INTR)) 3629 intrflg = 0; 3630 if (intrflg) { 3631 slpflag = PCATCH; 3632 slptimeo = 2 * hz; 3633 } else { 3634 slpflag = 0; 3635 slptimeo = 0; 3636 } 3637 3638 /* First wait for any other process doing a flush to complete. */ 3639 lck_mtx_lock(nfs_buf_mutex); 3640 while (np->n_bflag & NBINVALINPROG) { 3641 np->n_bflag |= NBINVALWANT; 3642 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); 3643 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { 3644 lck_mtx_unlock(nfs_buf_mutex); 3645 return (error); 3646 } 3647 if (np->n_bflag & NBINVALINPROG) 3648 slpflag = 0; 3649 } 3650 np->n_bflag |= NBINVALINPROG; 3651 lck_mtx_unlock(nfs_buf_mutex); 3652 3653 /* Now, flush as required. */ 3654again: 3655 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); 3656 while (error) { 3657 FSDBG(554, np, 0, 0, error); 3658 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) 3659 goto done; 3660 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo); 3661 } 3662 3663 /* get the pages out of vm also */ 3664 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) 3665 if ((error = ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) { 3666 if (error == EINVAL) 3667 panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); 3668 if (retry++ < 10) /* retry invalidating a few times */ 3669 goto again; 3670 /* give up */ 3671 printf("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); 3672 3673 } 3674done: 3675 lck_mtx_lock(nfs_buf_mutex); 3676 nflags = np->n_bflag; 3677 np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT); 3678 lck_mtx_unlock(nfs_buf_mutex); 3679 if (nflags & NBINVALWANT) 3680 wakeup(&np->n_bflag); 3681 3682 FSDBG_BOT(554, np, flags, intrflg, error); 3683 return (error); 3684} 3685 3686/* 3687 * Wait for any busy buffers to complete. 
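 * (nfs_wait_bufs() below walks both the clean and the dirty lists,
 * acquiring and immediately dropping each buffer; nfs_buf_acquire()
 * called with no NBAC_NOWAIT flag blocks until the current owner
 * is done with the buffer.)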
3688 */ 3689void 3690nfs_wait_bufs(nfsnode_t np) 3691{ 3692 struct nfsbuf *bp; 3693 struct nfsbuflists blist; 3694 int error = 0; 3695 3696 lck_mtx_lock(nfs_buf_mutex); 3697 if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { 3698 while ((bp = LIST_FIRST(&blist))) { 3699 LIST_REMOVE(bp, nb_vnbufs); 3700 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); 3701 nfs_buf_refget(bp); 3702 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { 3703 if (error != EAGAIN) { 3704 nfs_buf_refrele(bp); 3705 nfs_buf_itercomplete(np, &blist, NBI_CLEAN); 3706 lck_mtx_unlock(nfs_buf_mutex); 3707 return; 3708 } 3709 } 3710 nfs_buf_refrele(bp); 3711 nfs_buf_drop(bp); 3712 } 3713 nfs_buf_itercomplete(np, &blist, NBI_CLEAN); 3714 } 3715 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { 3716 while ((bp = LIST_FIRST(&blist))) { 3717 LIST_REMOVE(bp, nb_vnbufs); 3718 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); 3719 nfs_buf_refget(bp); 3720 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { 3721 if (error != EAGAIN) { 3722 nfs_buf_refrele(bp); 3723 nfs_buf_itercomplete(np, &blist, NBI_DIRTY); 3724 lck_mtx_unlock(nfs_buf_mutex); 3725 return; 3726 } 3727 } 3728 nfs_buf_refrele(bp); 3729 nfs_buf_drop(bp); 3730 } 3731 nfs_buf_itercomplete(np, &blist, NBI_DIRTY); 3732 } 3733 lck_mtx_unlock(nfs_buf_mutex); 3734} 3735 3736 3737/* 3738 * Add an async I/O request to the mount's async I/O queue and make 3739 * sure that an nfsiod will service it. 3740 */ 3741void 3742nfs_asyncio_finish(struct nfsreq *req) 3743{ 3744 struct nfsmount *nmp; 3745 struct nfsiod *niod; 3746 int started = 0; 3747 3748 FSDBG_TOP(552, nmp, 0, 0, 0); 3749again: 3750 if (((nmp = req->r_nmp)) == NULL) 3751 return; 3752 lck_mtx_lock(nfsiod_mutex); 3753 niod = nmp->nm_niod; 3754 3755 /* grab an nfsiod if we don't have one already */ 3756 if (!niod) { 3757 niod = TAILQ_FIRST(&nfsiodfree); 3758 if (niod) { 3759 TAILQ_REMOVE(&nfsiodfree, niod, niod_link); 3760 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link); 3761 niod->niod_nmp = nmp; 3762 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) { 3763 /* 3764 * Try starting a new thread. 3765 * We may try a couple times if other callers 3766 * get the new threads before we do. 3767 */ 3768 lck_mtx_unlock(nfsiod_mutex); 3769 started++; 3770 if (!nfsiod_start()) 3771 goto again; 3772 lck_mtx_lock(nfsiod_mutex); 3773 } 3774 } 3775 3776 if (req->r_achain.tqe_next == NFSREQNOLIST) 3777 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain); 3778 3779 /* If this mount doesn't already have an nfsiod working on it... */ 3780 if (!nmp->nm_niod) { 3781 if (niod) { /* give it the nfsiod we just grabbed */ 3782 nmp->nm_niod = niod; 3783 lck_mtx_unlock(nfsiod_mutex); 3784 wakeup(niod); 3785 } else if (nfsiod_thread_count > 0) { 3786 /* just queue it up on nfsiod mounts queue */ 3787 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); 3788 lck_mtx_unlock(nfsiod_mutex); 3789 } else { 3790 printf("nfs_asyncio(): no nfsiods? 
%d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started); 3791 lck_mtx_unlock(nfsiod_mutex); 3792 /* we have no other option but to be persistent */ 3793 started = 0; 3794 goto again; 3795 } 3796 } else { 3797 lck_mtx_unlock(nfsiod_mutex); 3798 } 3799 3800 FSDBG_BOT(552, nmp, 0, 0, 0); 3801} 3802 3803/* 3804 * queue up async I/O request for resend 3805 */ 3806void 3807nfs_asyncio_resend(struct nfsreq *req) 3808{ 3809 struct nfsmount *nmp = req->r_nmp; 3810 3811 if (!nmp) 3812 return; 3813 nfs_gss_clnt_rpcdone(req); 3814 lck_mtx_lock(&nmp->nm_lock); 3815 if (!(req->r_flags & R_RESENDQ)) { 3816 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain); 3817 req->r_flags |= R_RESENDQ; 3818 } 3819 nfs_mount_sock_thread_wake(nmp); 3820 lck_mtx_unlock(&nmp->nm_lock); 3821} 3822 3823/* 3824 * Read directory data into a buffer. 3825 * 3826 * Buffer will be filled (unless EOF is hit). 3827 * Buffers after this one may also be completely/partially filled. 3828 */ 3829int 3830nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx) 3831{ 3832 nfsnode_t np = bp->nb_np; 3833 struct nfsmount *nmp = NFSTONMP(np); 3834 int error = 0; 3835 3836 if (!nmp) 3837 return (ENXIO); 3838 3839 if (nmp->nm_vers < NFS_VER4) 3840 error = nfs3_readdir_rpc(np, bp, ctx); 3841 else 3842 error = nfs4_readdir_rpc(np, bp, ctx); 3843 3844 if (error && (error != NFSERR_DIRBUFDROPPED)) { 3845 SET(bp->nb_flags, NB_ERROR); 3846 bp->nb_error = error; 3847 } 3848 return (error); 3849} 3850