/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>

#define CL_READ		0x01
#define CL_WRITE	0x02
#define CL_ASYNC	0x04
#define CL_COMMIT	0x08
#define CL_PAGEOUT	0x10
#define CL_AGE		0x20
#define CL_NOZERO	0x40
#define CL_PAGEIN	0x80
#define CL_DEV_MEMORY	0x100
#define CL_PRESERVE	0x200
#define CL_THROTTLE	0x400
#define CL_KEEPCACHED	0x800
#define CL_DIRECT_IO	0x1000
#define CL_PASSIVE	0x2000


struct clios {
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};

static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_mtxp;


#define	IO_UNKNOWN	0
#define	IO_DIRECT	1
#define IO_CONTIG	2
#define IO_COPY		3

#define	PUSH_DELAY	0x01
#define PUSH_ALL	0x02
#define	PUSH_SYNC	0x04


static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags);
static int cluster_hard_throttle_on(vnode_t vp);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
		int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
		off_t headOff, off_t
		tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
		int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
		int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg);

static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg);
static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int is_file_clean(vnode_t, off_t);

/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 256)
#define MAX_IO_CONTIG_SIZE	(MAX_UPL_SIZE * PAGE_SIZE)
#define MAX_VECTS		16
#define MIN_DIRECT_WRITE_SIZE	(4 * PAGE_SIZE)


#define MAX_CLUSTER_SIZE(vp)	(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp)	(cluster_max_io_size(vp->v_mount, CL_READ) * 3)


int speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT	0
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;


void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	/*
	 * allocate and initialize mutex's used to protect updates and waits
	 * on the cluster_io context
	 */
	cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_mtxp");
}


uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch(type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
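		/*
		 * For illustration only (hypothetical device values): with
		 * mnt_segreadcnt = 32, mnt_maxreadcnt = 512 * 1024 and 4K pages,
		 * the clamping at the end of this routine computes
		 * min(32 * 4096, 524288) = 131072; if that is still below the
		 * MAX_UPL_TRANSFER * PAGE_SIZE floor, the old fixed limit is
		 * returned instead, otherwise the value is truncated to a
		 * PAGE_SIZE multiple.
		 */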
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > MAX_UPL_SIZE) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}




#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins...
the other callers 341 * will release the now unnecessary storage 342 * 343 * if CLW_RETURNLOCKED is set, grab (blocking if necessary) 344 * the lock associated with the write behind context before 345 * returning 346 */ 347 348static struct cl_writebehind * 349cluster_get_wbp(vnode_t vp, int flags) 350{ 351 struct ubc_info *ubc; 352 struct cl_writebehind *wbp; 353 354 ubc = vp->v_ubcinfo; 355 356 if ((wbp = ubc->cl_wbehind) == NULL) { 357 358 if ( !(flags & CLW_ALLOCATE)) 359 return ((struct cl_writebehind *)NULL); 360 361 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK); 362 363 bzero(wbp, sizeof *wbp); 364 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr); 365 366 vnode_lock(vp); 367 368 if (ubc->cl_wbehind == NULL) 369 ubc->cl_wbehind = wbp; 370 else { 371 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); 372 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); 373 wbp = ubc->cl_wbehind; 374 } 375 vnode_unlock(vp); 376 } 377 if (flags & CLW_RETURNLOCKED) 378 lck_mtx_lock(&wbp->cl_lockw); 379 380 return (wbp); 381} 382 383 384static void 385cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg) 386{ 387 struct cl_writebehind *wbp; 388 389 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) { 390 391 if (wbp->cl_number) { 392 lck_mtx_lock(&wbp->cl_lockw); 393 394 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg); 395 396 lck_mtx_unlock(&wbp->cl_lockw); 397 } 398 } 399} 400 401 402static int 403cluster_hard_throttle_on(vnode_t vp) 404{ 405 static struct timeval hard_throttle_maxelapsed = { 0, 200000 }; 406 407 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) { 408 struct timeval elapsed; 409 410 if (hard_throttle_on_root) 411 return(1); 412 413 microuptime(&elapsed); 414 timevalsub(&elapsed, &priority_IO_timestamp_for_root); 415 416 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) 417 return(1); 418 } 419 struct uthread *ut; 420 if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) { 421 size_t devbsdunit; 422 if (vp->v_mount != NULL) 423 devbsdunit = vp->v_mount->mnt_devbsdunit; 424 else 425 devbsdunit = LOWPRI_MAX_NUM_DEV - 1; 426 if (throttle_io_will_be_throttled(-1, devbsdunit)) { 427 return(1); 428 } 429 } 430 return(0); 431} 432 433 434static int 435cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags) 436{ 437 int upl_abort_code = 0; 438 int page_in = 0; 439 int page_out = 0; 440 441 if (io_flags & B_PHYS) 442 /* 443 * direct write of any flavor, or a direct read that wasn't aligned 444 */ 445 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY); 446 else { 447 if (io_flags & B_PAGEIO) { 448 if (io_flags & B_READ) 449 page_in = 1; 450 else 451 page_out = 1; 452 } 453 if (io_flags & B_CACHE) 454 /* 455 * leave pages in the cache unchanged on error 456 */ 457 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; 458 else if (page_out && (error != ENXIO)) 459 /* 460 * transient error... 
leave pages unchanged 461 */ 462 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; 463 else if (page_in) 464 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; 465 else 466 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; 467 468 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code); 469 } 470 return (upl_abort_code); 471} 472 473 474static int 475cluster_iodone(buf_t bp, void *callback_arg) 476{ 477 int b_flags; 478 int error; 479 int total_size; 480 int total_resid; 481 int upl_offset; 482 int zero_offset; 483 int pg_offset = 0; 484 int commit_size = 0; 485 int upl_flags = 0; 486 int transaction_size = 0; 487 upl_t upl; 488 buf_t cbp; 489 buf_t cbp_head; 490 buf_t cbp_next; 491 buf_t real_bp; 492 struct clios *iostate; 493 boolean_t transaction_complete = FALSE; 494 495 cbp_head = (buf_t)(bp->b_trans_head); 496 497 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, 498 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); 499 500 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { 501 /* 502 * all I/O requests that are part of this transaction 503 * have to complete before we can process it 504 */ 505 if ( !(cbp->b_flags & B_DONE)) { 506 507 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, 508 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0); 509 510 return 0; 511 } 512 if (cbp->b_flags & B_EOT) 513 transaction_complete = TRUE; 514 } 515 if (transaction_complete == FALSE) { 516 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, 517 (int)cbp_head, 0, 0, 0, 0); 518 519 return 0; 520 } 521 error = 0; 522 total_size = 0; 523 total_resid = 0; 524 525 cbp = cbp_head; 526 upl_offset = cbp->b_uploffset; 527 upl = cbp->b_upl; 528 b_flags = cbp->b_flags; 529 real_bp = cbp->b_real_bp; 530 zero_offset= cbp->b_validend; 531 iostate = (struct clios *)cbp->b_iostate; 532 533 if (real_bp) 534 real_bp->b_dev = cbp->b_dev; 535 536 while (cbp) { 537 if ((cbp->b_flags & B_ERROR) && error == 0) 538 error = cbp->b_error; 539 540 total_resid += cbp->b_resid; 541 total_size += cbp->b_bcount; 542 543 cbp_next = cbp->b_trans_next; 544 545 if (cbp_next == NULL) 546 /* 547 * compute the overall size of the transaction 548 * in case we created one that has 'holes' in it 549 * 'total_size' represents the amount of I/O we 550 * did, not the span of the transaction w/r to the UPL 551 */ 552 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset; 553 554 if (cbp != cbp_head) 555 free_io_buf(cbp); 556 557 cbp = cbp_next; 558 } 559 if (error == 0 && total_resid) 560 error = EIO; 561 562 if (error == 0) { 563 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone); 564 565 if (cliodone_func != NULL) { 566 cbp_head->b_bcount = transaction_size; 567 568 error = (*cliodone_func)(cbp_head, callback_arg); 569 } 570 } 571 if (zero_offset) 572 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp); 573 574 free_io_buf(cbp_head); 575 576 if (iostate) { 577 int need_wakeup = 0; 578 579 /* 580 * someone has issued multiple I/Os asynchrounsly 581 * and is waiting for them to complete (streaming) 582 */ 583 lck_mtx_lock_spin(cl_mtxp); 584 585 if (error && iostate->io_error == 0) 586 iostate->io_error = error; 587 588 iostate->io_completed += total_size; 589 590 if (iostate->io_wanted) { 591 /* 592 * someone is waiting for the state of 593 * this io stream to change 594 */ 595 iostate->io_wanted = 0; 596 need_wakeup = 1; 597 } 598 lck_mtx_unlock(cl_mtxp); 599 600 if (need_wakeup) 601 
wakeup((caddr_t)&iostate->io_wanted); 602 } 603 604 if (b_flags & B_COMMIT_UPL) { 605 606 pg_offset = upl_offset & PAGE_MASK; 607 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 608 609 if (error) 610 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags); 611 else { 612 upl_flags = UPL_COMMIT_FREE_ON_EMPTY; 613 614 if ((b_flags & B_PHYS) && (b_flags & B_READ)) 615 upl_flags |= UPL_COMMIT_SET_DIRTY; 616 617 if (b_flags & B_AGE) 618 upl_flags |= UPL_COMMIT_INACTIVATE; 619 620 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags); 621 } 622 } 623 if ((b_flags & B_NEED_IODONE) && real_bp) { 624 if (error) { 625 real_bp->b_flags |= B_ERROR; 626 real_bp->b_error = error; 627 } 628 real_bp->b_resid = total_resid; 629 630 buf_biodone(real_bp); 631 } 632 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, 633 (int)upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0); 634 635 return (error); 636} 637 638 639void 640cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp) 641{ 642 643 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START, 644 upl_offset, size, (int)bp, 0, 0); 645 646 if (bp == NULL || bp->b_datap == 0) { 647 upl_page_info_t *pl; 648 addr64_t zero_addr; 649 650 pl = ubc_upl_pageinfo(upl); 651 652 if (upl_device_page(pl) == TRUE) { 653 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset; 654 655 bzero_phys_nc(zero_addr, size); 656 } else { 657 while (size) { 658 int page_offset; 659 int page_index; 660 int zero_cnt; 661 662 page_index = upl_offset / PAGE_SIZE; 663 page_offset = upl_offset & PAGE_MASK; 664 665 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset; 666 zero_cnt = min(PAGE_SIZE - page_offset, size); 667 668 bzero_phys(zero_addr, zero_cnt); 669 670 size -= zero_cnt; 671 upl_offset += zero_cnt; 672 } 673 } 674 } else 675 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size); 676 677 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END, 678 upl_offset, size, 0, 0, 0); 679} 680 681 682static void 683cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset) 684{ 685 cbp_head->b_validend = zero_offset; 686 cbp_tail->b_flags |= B_EOT; 687} 688 689static void 690cluster_wait_IO(buf_t cbp_head, int async) 691{ 692 buf_t cbp; 693 694 if (async) { 695 /* 696 * async callback completion will not normally 697 * generate a wakeup upon I/O completion... 698 * by setting BL_WANTED, we will force a wakeup 699 * to occur as any outstanding I/Os complete... 700 * I/Os already completed will have BL_CALLDONE already 701 * set and we won't block in buf_biowait_callback.. 702 * note that we're actually waiting for the bp to have 703 * completed the callback function... 
only then 704 * can we safely take back ownership of the bp 705 * need the main buf mutex in order to safely 706 * update b_lflags 707 */ 708 buf_list_lock(); 709 710 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) 711 cbp->b_lflags |= BL_WANTED; 712 713 buf_list_unlock(); 714 } 715 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { 716 if (async) 717 buf_biowait_callback(cbp); 718 else 719 buf_biowait(cbp); 720 } 721} 722 723static void 724cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait) 725{ 726 buf_t cbp; 727 int error; 728 729 /* 730 * cluster_complete_transaction will 731 * only be called if we've issued a complete chain in synchronous mode 732 * or, we've already done a cluster_wait_IO on an incomplete chain 733 */ 734 if (needwait) { 735 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) 736 buf_biowait(cbp); 737 } 738 error = cluster_iodone(*cbp_head, callback_arg); 739 740 if ( !(flags & CL_ASYNC) && error && *retval == 0) { 741 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) 742 *retval = error; 743 } 744 *cbp_head = (buf_t)NULL; 745} 746 747 748static int 749cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size, 750 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg) 751{ 752 buf_t cbp; 753 u_int size; 754 u_int io_size; 755 int io_flags; 756 int bmap_flags; 757 int error = 0; 758 int retval = 0; 759 buf_t cbp_head = NULL; 760 buf_t cbp_tail = NULL; 761 int trans_count = 0; 762 int max_trans_count; 763 u_int pg_count; 764 int pg_offset; 765 u_int max_iosize; 766 u_int max_vectors; 767 int priv; 768 int zero_offset = 0; 769 int async_throttle = 0; 770 mount_t mp; 771 vm_offset_t upl_end_offset; 772 boolean_t need_EOT = FALSE; 773 774 /* 775 * we currently don't support buffers larger than a page 776 */ 777 if (real_bp && non_rounded_size > PAGE_SIZE) 778 panic("%s(): Called with real buffer of size %d bytes which " 779 "is greater than the maximum allowed size of " 780 "%d bytes (the system PAGE_SIZE).\n", 781 __FUNCTION__, non_rounded_size, PAGE_SIZE); 782 783 mp = vp->v_mount; 784 785 /* 786 * we don't want to do any funny rounding of the size for IO requests 787 * coming through the DIRECT or CONTIGUOUS paths... those pages don't 788 * belong to us... we can't extend (nor do we need to) the I/O to fill 789 * out a page 790 */ 791 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) { 792 /* 793 * round the requested size up so that this I/O ends on a 794 * page boundary in case this is a 'write'... if the filesystem 795 * has blocks allocated to back the page beyond the EOF, we want to 796 * make sure to write out the zero's that are sitting beyond the EOF 797 * so that in case the filesystem doesn't explicitly zero this area 798 * if a hole is created via a lseek/write beyond the current EOF, 799 * it will return zeros when it's read back from the disk. If the 800 * physical allocation doesn't extend for the whole page, we'll 801 * only write/read from the disk up to the end of this allocation 802 * via the extent info returned from the VNOP_BLOCKMAP call. 
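	 *
	 * as a worked example with made-up numbers (assuming 4K pages):
	 * an upl_offset of 512 gives pg_offset = 512, and a non_rounded_size
	 * of 3000 yields size = ((3000 + 512 + 4095) & ~4095) - 512
	 * = 4096 - 512 = 3584, i.e. the transfer is extended from 3000 to
	 * 3584 bytes so that upl_offset + size lands exactly on the next
	 * page boundary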
803 */ 804 pg_offset = upl_offset & PAGE_MASK; 805 806 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset; 807 } else { 808 /* 809 * anyone advertising a blocksize of 1 byte probably 810 * can't deal with us rounding up the request size 811 * AFP is one such filesystem/device 812 */ 813 size = non_rounded_size; 814 } 815 upl_end_offset = upl_offset + size; 816 817 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0); 818 819 /* 820 * Set the maximum transaction size to the maximum desired number of 821 * buffers. 822 */ 823 max_trans_count = 8; 824 if (flags & CL_DEV_MEMORY) 825 max_trans_count = 16; 826 827 if (flags & CL_READ) { 828 io_flags = B_READ; 829 bmap_flags = VNODE_READ; 830 831 max_iosize = mp->mnt_maxreadcnt; 832 max_vectors = mp->mnt_segreadcnt; 833 } else { 834 io_flags = B_WRITE; 835 bmap_flags = VNODE_WRITE; 836 837 max_iosize = mp->mnt_maxwritecnt; 838 max_vectors = mp->mnt_segwritecnt; 839 } 840 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0); 841 842 /* 843 * make sure the maximum iosize is a 844 * multiple of the page size 845 */ 846 max_iosize &= ~PAGE_MASK; 847 848 /* 849 * Ensure the maximum iosize is sensible. 850 */ 851 if (!max_iosize) 852 max_iosize = PAGE_SIZE; 853 854 if (flags & CL_THROTTLE) { 855 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) { 856 if (max_iosize > HARD_THROTTLE_MAXSIZE) 857 max_iosize = HARD_THROTTLE_MAXSIZE; 858 async_throttle = HARD_THROTTLE_MAXCNT; 859 } else { 860 if ( (flags & CL_DEV_MEMORY) ) 861 async_throttle = VNODE_ASYNC_THROTTLE; 862 else { 863 u_int max_cluster; 864 u_int max_cluster_size; 865 u_int max_prefetch; 866 867 max_cluster_size = MAX_CLUSTER_SIZE(vp); 868 max_prefetch = MAX_PREFETCH(vp); 869 870 if (max_iosize > max_cluster_size) 871 max_cluster = max_cluster_size; 872 else 873 max_cluster = max_iosize; 874 875 if (size < max_cluster) 876 max_cluster = size; 877 878 async_throttle = min(VNODE_ASYNC_THROTTLE, (max_prefetch / max_cluster) - 1); 879 } 880 } 881 } 882 if (flags & CL_AGE) 883 io_flags |= B_AGE; 884 if (flags & (CL_PAGEIN | CL_PAGEOUT)) 885 io_flags |= B_PAGEIO; 886 if (flags & CL_COMMIT) 887 io_flags |= B_COMMIT_UPL; 888 if (flags & CL_PRESERVE) 889 io_flags |= B_PHYS; 890 if (flags & CL_KEEPCACHED) 891 io_flags |= B_CACHE; 892 if (flags & CL_PASSIVE) 893 io_flags |= B_PASSIVE; 894 if (vp->v_flag & VSYSTEM) 895 io_flags |= B_META; 896 897 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) { 898 /* 899 * then we are going to end up 900 * with a page that we can't complete (the file size wasn't a multiple 901 * of PAGE_SIZE and we're trying to read to the end of the file 902 * so we'll go ahead and zero out the portion of the page we can't 903 * read in from the file 904 */ 905 zero_offset = upl_offset + non_rounded_size; 906 } 907 while (size) { 908 daddr64_t blkno; 909 daddr64_t lblkno; 910 u_int io_size_wanted; 911 912 if (size > max_iosize) 913 io_size = max_iosize; 914 else 915 io_size = size; 916 917 io_size_wanted = io_size; 918 919 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) 920 break; 921 922 if (io_size > io_size_wanted) 923 io_size = io_size_wanted; 924 925 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) 926 real_bp->b_blkno = blkno; 927 928 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE, 929 (int)f_offset, 
			      (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_sync_range, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t last_cbp;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released just because all of the current
				 * I/O linked to this transaction has completed...
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				/*
				 * we've got a transaction that
				 * includes the page we're about to push out through vnode_pageout...
				 * find the last bp in the list which will be the one that
				 * includes the head of this page and round its iosize down
				 * to a page boundary...
				 */
				for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
					last_cbp = cbp;

				cbp->b_bcount &= ~PAGE_MASK;

				if (cbp->b_bcount == 0) {
					/*
					 * this buf no longer has any I/O associated with it
					 */
					free_io_buf(cbp);

					if (cbp == cbp_head) {
						/*
						 * the buf we just freed was the only buf in
						 * this transaction...
so there's no I/O to do 1024 */ 1025 cbp_head = NULL; 1026 } else { 1027 /* 1028 * remove the buf we just freed from 1029 * the transaction list 1030 */ 1031 last_cbp->b_trans_next = NULL; 1032 cbp_tail = last_cbp; 1033 } 1034 } 1035 if (cbp_head) { 1036 /* 1037 * there was more to the current transaction 1038 * than just the page we are pushing out via vnode_pageout... 1039 * mark it as finished and complete it... we've already 1040 * waited for the I/Os to complete above in the call to cluster_wait_IO 1041 */ 1042 cluster_EOT(cbp_head, cbp_tail, 0); 1043 1044 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0); 1045 1046 trans_count = 0; 1047 } 1048 } 1049 if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) { 1050 error = EINVAL; 1051 break; 1052 } 1053 e_offset = round_page_64(f_offset + 1); 1054 io_size = e_offset - f_offset; 1055 1056 f_offset += io_size; 1057 upl_offset += io_size; 1058 1059 if (size >= io_size) 1060 size -= io_size; 1061 else 1062 size = 0; 1063 /* 1064 * keep track of how much of the original request 1065 * that we've actually completed... non_rounded_size 1066 * may go negative due to us rounding the request 1067 * to a page size multiple (i.e. size > non_rounded_size) 1068 */ 1069 non_rounded_size -= io_size; 1070 1071 if (non_rounded_size <= 0) { 1072 /* 1073 * we've transferred all of the data in the original 1074 * request, but we were unable to complete the tail 1075 * of the last page because the file didn't have 1076 * an allocation to back that portion... this is ok. 1077 */ 1078 size = 0; 1079 } 1080 continue; 1081 } 1082 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); 1083 /* 1084 * we have now figured out how much I/O we can do - this is in 'io_size' 1085 * pg_offset is the starting point in the first page for the I/O 1086 * pg_count is the number of full and partial pages that 'io_size' encompasses 1087 */ 1088 pg_offset = upl_offset & PAGE_MASK; 1089 1090 if (flags & CL_DEV_MEMORY) { 1091 /* 1092 * treat physical requests as one 'giant' page 1093 */ 1094 pg_count = 1; 1095 } else 1096 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE; 1097 1098 if ((flags & CL_READ) && blkno == -1) { 1099 vm_offset_t commit_offset; 1100 int bytes_to_zero; 1101 int complete_transaction_now = 0; 1102 1103 /* 1104 * if we're reading and blkno == -1, then we've got a 1105 * 'hole' in the file that we need to deal with by zeroing 1106 * out the affected area in the upl 1107 */ 1108 if (io_size >= (u_int)non_rounded_size) { 1109 /* 1110 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE 1111 * than 'zero_offset' will be non-zero 1112 * if the 'hole' returned by vnop_blockmap extends all the way to the eof 1113 * (indicated by the io_size finishing off the I/O request for this UPL) 1114 * than we're not going to issue an I/O for the 1115 * last page in this upl... 
we need to zero both the hole and the tail 1116 * of the page beyond the EOF, since the delayed zero-fill won't kick in 1117 */ 1118 bytes_to_zero = non_rounded_size; 1119 if (!(flags & CL_NOZERO)) 1120 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset; 1121 1122 zero_offset = 0; 1123 } else 1124 bytes_to_zero = io_size; 1125 1126 pg_count = 0; 1127 1128 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp); 1129 1130 if (cbp_head) { 1131 int pg_resid; 1132 1133 /* 1134 * if there is a current I/O chain pending 1135 * then the first page of the group we just zero'd 1136 * will be handled by the I/O completion if the zero 1137 * fill started in the middle of the page 1138 */ 1139 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK; 1140 1141 pg_resid = commit_offset - upl_offset; 1142 1143 if (bytes_to_zero >= pg_resid) { 1144 /* 1145 * the last page of the current I/O 1146 * has been completed... 1147 * compute the number of fully zero'd 1148 * pages that are beyond it 1149 * plus the last page if its partial 1150 * and we have no more I/O to issue... 1151 * otherwise a partial page is left 1152 * to begin the next I/O 1153 */ 1154 if ((int)io_size >= non_rounded_size) 1155 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE; 1156 else 1157 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE; 1158 1159 complete_transaction_now = 1; 1160 } 1161 } else { 1162 /* 1163 * no pending I/O to deal with 1164 * so, commit all of the fully zero'd pages 1165 * plus the last page if its partial 1166 * and we have no more I/O to issue... 1167 * otherwise a partial page is left 1168 * to begin the next I/O 1169 */ 1170 if ((int)io_size >= non_rounded_size) 1171 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE; 1172 else 1173 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE; 1174 1175 commit_offset = upl_offset & ~PAGE_MASK; 1176 } 1177 if ( (flags & CL_COMMIT) && pg_count) { 1178 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE, 1179 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY); 1180 } 1181 upl_offset += io_size; 1182 f_offset += io_size; 1183 size -= io_size; 1184 1185 /* 1186 * keep track of how much of the original request 1187 * that we've actually completed... non_rounded_size 1188 * may go negative due to us rounding the request 1189 * to a page size multiple (i.e. size > non_rounded_size) 1190 */ 1191 non_rounded_size -= io_size; 1192 1193 if (non_rounded_size <= 0) { 1194 /* 1195 * we've transferred all of the data in the original 1196 * request, but we were unable to complete the tail 1197 * of the last page because the file didn't have 1198 * an allocation to back that portion... this is ok. 1199 */ 1200 size = 0; 1201 } 1202 if (cbp_head && (complete_transaction_now || size == 0)) { 1203 cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); 1204 1205 cluster_EOT(cbp_head, cbp_tail, size == 0 ? 
zero_offset : 0); 1206 1207 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0); 1208 1209 trans_count = 0; 1210 } 1211 continue; 1212 } 1213 if (pg_count > max_vectors) { 1214 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) { 1215 io_size = PAGE_SIZE - pg_offset; 1216 pg_count = 1; 1217 } else { 1218 io_size -= (pg_count - max_vectors) * PAGE_SIZE; 1219 pg_count = max_vectors; 1220 } 1221 } 1222 /* 1223 * If the transaction is going to reach the maximum number of 1224 * desired elements, truncate the i/o to the nearest page so 1225 * that the actual i/o is initiated after this buffer is 1226 * created and added to the i/o chain. 1227 * 1228 * I/O directed to physically contiguous memory 1229 * doesn't have a requirement to make sure we 'fill' a page 1230 */ 1231 if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count && 1232 ((upl_offset + io_size) & PAGE_MASK)) { 1233 vm_offset_t aligned_ofs; 1234 1235 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK; 1236 /* 1237 * If the io_size does not actually finish off even a 1238 * single page we have to keep adding buffers to the 1239 * transaction despite having reached the desired limit. 1240 * 1241 * Eventually we get here with the page being finished 1242 * off (and exceeded) and then we truncate the size of 1243 * this i/o request so that it is page aligned so that 1244 * we can finally issue the i/o on the transaction. 1245 */ 1246 if (aligned_ofs > upl_offset) { 1247 io_size = aligned_ofs - upl_offset; 1248 pg_count--; 1249 } 1250 } 1251 1252 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) 1253 /* 1254 * if we're not targeting a virtual device i.e. a disk image 1255 * it's safe to dip into the reserve pool since real devices 1256 * can complete this I/O request without requiring additional 1257 * bufs from the alloc_io_buf pool 1258 */ 1259 priv = 1; 1260 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT)) 1261 /* 1262 * Throttle the speculative IO 1263 */ 1264 priv = 0; 1265 else 1266 priv = 1; 1267 1268 cbp = alloc_io_buf(vp, priv); 1269 1270 if (flags & CL_PAGEOUT) { 1271 u_int i; 1272 1273 for (i = 0; i < pg_count; i++) { 1274 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) 1275 panic("BUSY bp found in cluster_io"); 1276 } 1277 } 1278 if (flags & CL_ASYNC) { 1279 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) 1280 panic("buf_setcallback failed\n"); 1281 } 1282 cbp->b_cliodone = (void *)callback; 1283 cbp->b_flags |= io_flags; 1284 1285 cbp->b_lblkno = lblkno; 1286 cbp->b_blkno = blkno; 1287 cbp->b_bcount = io_size; 1288 1289 if (buf_setupl(cbp, upl, upl_offset)) 1290 panic("buf_setupl failed\n"); 1291 1292 cbp->b_trans_next = (buf_t)NULL; 1293 1294 if ((cbp->b_iostate = (void *)iostate)) 1295 /* 1296 * caller wants to track the state of this 1297 * io... 
bump the amount issued against this stream 1298 */ 1299 iostate->io_issued += io_size; 1300 1301 if (flags & CL_READ) { 1302 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, 1303 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); 1304 } 1305 else { 1306 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE, 1307 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); 1308 } 1309 1310 if (cbp_head) { 1311 cbp_tail->b_trans_next = cbp; 1312 cbp_tail = cbp; 1313 } else { 1314 cbp_head = cbp; 1315 cbp_tail = cbp; 1316 1317 if ( (cbp_head->b_real_bp = real_bp) ) { 1318 cbp_head->b_flags |= B_NEED_IODONE; 1319 real_bp = (buf_t)NULL; 1320 } 1321 } 1322 *(buf_t *)(&cbp->b_trans_head) = cbp_head; 1323 1324 trans_count++; 1325 1326 upl_offset += io_size; 1327 f_offset += io_size; 1328 size -= io_size; 1329 /* 1330 * keep track of how much of the original request 1331 * that we've actually completed... non_rounded_size 1332 * may go negative due to us rounding the request 1333 * to a page size multiple (i.e. size > non_rounded_size) 1334 */ 1335 non_rounded_size -= io_size; 1336 1337 if (non_rounded_size <= 0) { 1338 /* 1339 * we've transferred all of the data in the original 1340 * request, but we were unable to complete the tail 1341 * of the last page because the file didn't have 1342 * an allocation to back that portion... this is ok. 1343 */ 1344 size = 0; 1345 } 1346 if (size == 0) { 1347 /* 1348 * we have no more I/O to issue, so go 1349 * finish the final transaction 1350 */ 1351 need_EOT = TRUE; 1352 } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) && 1353 ((flags & CL_ASYNC) || trans_count > max_trans_count) ) { 1354 /* 1355 * I/O directed to physically contiguous memory... 1356 * which doesn't have a requirement to make sure we 'fill' a page 1357 * or... 1358 * the current I/O we've prepared fully 1359 * completes the last page in this request 1360 * and ... 1361 * it's either an ASYNC request or 1362 * we've already accumulated more than 8 I/O's into 1363 * this transaction so mark it as complete so that 1364 * it can finish asynchronously or via the cluster_complete_transaction 1365 * below if the request is synchronous 1366 */ 1367 need_EOT = TRUE; 1368 } 1369 if (need_EOT == TRUE) 1370 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0); 1371 1372 if (flags & CL_THROTTLE) 1373 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io"); 1374 1375 if ( !(io_flags & B_READ)) 1376 vnode_startwrite(vp); 1377 1378 (void) VNOP_STRATEGY(cbp); 1379 1380 if (need_EOT == TRUE) { 1381 if ( !(flags & CL_ASYNC)) 1382 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1); 1383 1384 need_EOT = FALSE; 1385 trans_count = 0; 1386 cbp_head = NULL; 1387 } 1388 } 1389 if (error) { 1390 int abort_size; 1391 1392 io_size = 0; 1393 1394 if (cbp_head) { 1395 /* 1396 * first wait until all of the outstanding I/O 1397 * for this partial transaction has completed 1398 */ 1399 cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); 1400 1401 /* 1402 * Rewind the upl offset to the beginning of the 1403 * transaction. 
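		 * (cbp_head->b_uploffset is where the first buf in this partial
		 * chain began; the loop below also totals each buf's b_bcount into
		 * 'io_size' so the iostate->io_issued accounting can be backed out,
		 * and the rewound upl_offset is what the CL_COMMIT abort below is
		 * measured from)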
1404 */ 1405 upl_offset = cbp_head->b_uploffset; 1406 1407 for (cbp = cbp_head; cbp;) { 1408 buf_t cbp_next; 1409 1410 size += cbp->b_bcount; 1411 io_size += cbp->b_bcount; 1412 1413 cbp_next = cbp->b_trans_next; 1414 free_io_buf(cbp); 1415 cbp = cbp_next; 1416 } 1417 } 1418 if (iostate) { 1419 int need_wakeup = 0; 1420 1421 /* 1422 * update the error condition for this stream 1423 * since we never really issued the io 1424 * just go ahead and adjust it back 1425 */ 1426 lck_mtx_lock_spin(cl_mtxp); 1427 1428 if (iostate->io_error == 0) 1429 iostate->io_error = error; 1430 iostate->io_issued -= io_size; 1431 1432 if (iostate->io_wanted) { 1433 /* 1434 * someone is waiting for the state of 1435 * this io stream to change 1436 */ 1437 iostate->io_wanted = 0; 1438 need_wakeup = 1; 1439 } 1440 lck_mtx_unlock(cl_mtxp); 1441 1442 if (need_wakeup) 1443 wakeup((caddr_t)&iostate->io_wanted); 1444 } 1445 if (flags & CL_COMMIT) { 1446 int upl_flags; 1447 1448 pg_offset = upl_offset & PAGE_MASK; 1449 abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK; 1450 1451 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags); 1452 1453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE, 1454 (int)upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0); 1455 } 1456 if (retval == 0) 1457 retval = error; 1458 } else if (cbp_head) 1459 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__); 1460 1461 if (real_bp) { 1462 /* 1463 * can get here if we either encountered an error 1464 * or we completely zero-filled the request and 1465 * no I/O was issued 1466 */ 1467 if (error) { 1468 real_bp->b_flags |= B_ERROR; 1469 real_bp->b_error = error; 1470 } 1471 buf_biodone(real_bp); 1472 } 1473 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0); 1474 1475 return (retval); 1476} 1477 1478 1479static int 1480cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag) 1481{ 1482 int pages_in_prefetch; 1483 1484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START, 1485 (int)f_offset, size, (int)filesize, 0, 0); 1486 1487 if (f_offset >= filesize) { 1488 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END, 1489 (int)f_offset, 0, 0, 0, 0); 1490 return(0); 1491 } 1492 if ((off_t)size > (filesize - f_offset)) 1493 size = filesize - f_offset; 1494 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE; 1495 1496 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag); 1497 1498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END, 1499 (int)f_offset + size, pages_in_prefetch, 0, 1, 0); 1500 1501 return (pages_in_prefetch); 1502} 1503 1504 1505 1506static void 1507cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg, 1508 int bflag) 1509{ 1510 daddr64_t r_addr; 1511 off_t f_offset; 1512 int size_of_prefetch; 1513 u_int max_prefetch; 1514 1515 1516 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START, 1517 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0); 1518 1519 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) { 1520 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1521 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0); 1522 return; 1523 } 1524 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) { 
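		/*
		 * either there is no read history yet (cl_lastr == -1) or this
		 * read neither repeats nor immediately follows the last block
		 * read... treat it as non-sequential and reset the read-ahead
		 * window; once a sequential pattern shows up again, cl_ralen
		 * starts over at 1 and doubles on each sequential hit, capped
		 * at max_prefetch / PAGE_SIZE
		 */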
1525 rap->cl_ralen = 0; 1526 rap->cl_maxra = 0; 1527 1528 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1529 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0); 1530 1531 return; 1532 } 1533 max_prefetch = MAX_PREFETCH(vp); 1534 1535 if (extent->e_addr < rap->cl_maxra) { 1536 if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { 1537 1538 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1539 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0); 1540 return; 1541 } 1542 } 1543 r_addr = max(extent->e_addr, rap->cl_maxra) + 1; 1544 f_offset = (off_t)(r_addr * PAGE_SIZE_64); 1545 1546 size_of_prefetch = 0; 1547 1548 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch); 1549 1550 if (size_of_prefetch) { 1551 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1552 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0); 1553 return; 1554 } 1555 if (f_offset < filesize) { 1556 daddr64_t read_size; 1557 1558 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1; 1559 1560 read_size = (extent->e_addr + 1) - extent->b_addr; 1561 1562 if (read_size > rap->cl_ralen) { 1563 if (read_size > max_prefetch / PAGE_SIZE) 1564 rap->cl_ralen = max_prefetch / PAGE_SIZE; 1565 else 1566 rap->cl_ralen = read_size; 1567 } 1568 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag); 1569 1570 if (size_of_prefetch) 1571 rap->cl_maxra = (r_addr + size_of_prefetch) - 1; 1572 } 1573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1574 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0); 1575} 1576 1577 1578int 1579cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, 1580 int size, off_t filesize, int flags) 1581{ 1582 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); 1583 1584} 1585 1586 1587int 1588cluster_pageout_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, 1589 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 1590{ 1591 int io_size; 1592 int rounded_size; 1593 off_t max_size; 1594 int local_flags; 1595 1596 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) 1597 /* 1598 * if we know we're issuing this I/O to a virtual device (i.e. disk image) 1599 * then we don't want to enforce this throttle... if we do, we can 1600 * potentially deadlock since we're stalling the pageout thread at a time 1601 * when the disk image might need additional memory (which won't be available 1602 * if the pageout thread can't run)... instead we'll just depend on the throttle 1603 * that the pageout thread now has in place to deal with external files 1604 */ 1605 local_flags = CL_PAGEOUT; 1606 else 1607 local_flags = CL_PAGEOUT | CL_THROTTLE; 1608 1609 if ((flags & UPL_IOSYNC) == 0) 1610 local_flags |= CL_ASYNC; 1611 if ((flags & UPL_NOCOMMIT) == 0) 1612 local_flags |= CL_COMMIT; 1613 if ((flags & UPL_KEEPCACHED)) 1614 local_flags |= CL_KEEPCACHED; 1615 if (flags & IO_PASSIVE) 1616 local_flags |= CL_PASSIVE; 1617 1618 1619 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, 1620 (int)f_offset, size, (int)filesize, local_flags, 0); 1621 1622 /* 1623 * If they didn't specify any I/O, then we are done... 
1624 * we can't issue an abort because we don't know how 1625 * big the upl really is 1626 */ 1627 if (size <= 0) 1628 return (EINVAL); 1629 1630 if (vp->v_mount->mnt_flag & MNT_RDONLY) { 1631 if (local_flags & CL_COMMIT) 1632 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY); 1633 return (EROFS); 1634 } 1635 /* 1636 * can't page-in from a negative offset 1637 * or if we're starting beyond the EOF 1638 * or if the file offset isn't page aligned 1639 * or the size requested isn't a multiple of PAGE_SIZE 1640 */ 1641 if (f_offset < 0 || f_offset >= filesize || 1642 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) { 1643 if (local_flags & CL_COMMIT) 1644 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY); 1645 return (EINVAL); 1646 } 1647 max_size = filesize - f_offset; 1648 1649 if (size < max_size) 1650 io_size = size; 1651 else 1652 io_size = max_size; 1653 1654 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 1655 1656 if (size > rounded_size) { 1657 if (local_flags & CL_COMMIT) 1658 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size, 1659 UPL_ABORT_FREE_ON_EMPTY); 1660 } 1661 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, 1662 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg)); 1663} 1664 1665 1666int 1667cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, 1668 int size, off_t filesize, int flags) 1669{ 1670 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); 1671} 1672 1673 1674int 1675cluster_pagein_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, 1676 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 1677{ 1678 u_int io_size; 1679 int rounded_size; 1680 off_t max_size; 1681 int retval; 1682 int local_flags = 0; 1683 1684 if (upl == NULL || size < 0) 1685 panic("cluster_pagein: NULL upl passed in"); 1686 1687 if ((flags & UPL_IOSYNC) == 0) 1688 local_flags |= CL_ASYNC; 1689 if ((flags & UPL_NOCOMMIT) == 0) 1690 local_flags |= CL_COMMIT; 1691 if (flags & IO_PASSIVE) 1692 local_flags |= CL_PASSIVE; 1693 1694 1695 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, 1696 (int)f_offset, size, (int)filesize, local_flags, 0); 1697 1698 /* 1699 * can't page-in from a negative offset 1700 * or if we're starting beyond the EOF 1701 * or if the file offset isn't page aligned 1702 * or the size requested isn't a multiple of PAGE_SIZE 1703 */ 1704 if (f_offset < 0 || f_offset >= filesize || 1705 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) { 1706 if (local_flags & CL_COMMIT) 1707 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1708 return (EINVAL); 1709 } 1710 max_size = filesize - f_offset; 1711 1712 if (size < max_size) 1713 io_size = size; 1714 else 1715 io_size = max_size; 1716 1717 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 1718 1719 if (size > rounded_size && (local_flags & CL_COMMIT)) 1720 ubc_upl_abort_range(upl, upl_offset + rounded_size, 1721 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1722 1723 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, 1724 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 1725 1726 return (retval); 1727} 1728 1729 1730int 1731cluster_bp(buf_t bp) 1732{ 1733 return cluster_bp_ext(bp, NULL, NULL); 1734} 1735 1736 1737int 1738cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), 
void *callback_arg) 1739{ 1740 off_t f_offset; 1741 int flags; 1742 1743 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START, 1744 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); 1745 1746 if (bp->b_flags & B_READ) 1747 flags = CL_ASYNC | CL_READ; 1748 else 1749 flags = CL_ASYNC; 1750 if (bp->b_flags & B_PASSIVE) 1751 flags |= CL_PASSIVE; 1752 1753 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); 1754 1755 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg)); 1756} 1757 1758 1759 1760int 1761cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags) 1762{ 1763 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL); 1764} 1765 1766 1767int 1768cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, 1769 int xflags, int (*callback)(buf_t, void *), void *callback_arg) 1770{ 1771 user_ssize_t cur_resid; 1772 int retval = 0; 1773 int flags; 1774 int zflags; 1775 int bflag; 1776 int write_type = IO_COPY; 1777 u_int32_t write_length; 1778 1779 flags = xflags; 1780 1781 if (flags & IO_PASSIVE) 1782 bflag = CL_PASSIVE; 1783 else 1784 bflag = 0; 1785 1786 if (vp->v_flag & VNOCACHE_DATA) 1787 flags |= IO_NOCACHE; 1788 1789 if (uio == NULL) { 1790 /* 1791 * no user data... 1792 * this call is being made to zero-fill some range in the file 1793 */ 1794 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg); 1795 1796 return(retval); 1797 } 1798 /* 1799 * do a write through the cache if one of the following is true.... 1800 * NOCACHE is not true and 1801 * the uio request doesn't target USERSPACE 1802 * otherwise, find out if we want the direct or contig variant for 1803 * the first vector in the uio request 1804 */ 1805 if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) 1806 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 1807 1808 if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) 1809 /* 1810 * must go through the cached variant in this case 1811 */ 1812 write_type = IO_COPY; 1813 1814 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) { 1815 1816 switch (write_type) { 1817 1818 case IO_COPY: 1819 /* 1820 * make sure the uio_resid isn't too big... 1821 * internally, we want to handle all of the I/O in 1822 * chunk sizes that fit in a 32 bit int 1823 */ 1824 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) { 1825 /* 1826 * we're going to have to call cluster_write_copy 1827 * more than once... 
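				 * (with made-up numbers: a 600MB uio_resid would be handed to
				 * cluster_write_copy as two 256MB passes plus an 88MB final
				 * pass, since MAX_IO_REQUEST_SIZE is 256MB)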
1828 * 1829 * only want the last call to cluster_write_copy to 1830 * have the IO_TAILZEROFILL flag set and only the 1831 * first call should have IO_HEADZEROFILL 1832 */ 1833 zflags = flags & ~IO_TAILZEROFILL; 1834 flags &= ~IO_HEADZEROFILL; 1835 1836 write_length = MAX_IO_REQUEST_SIZE; 1837 } else { 1838 /* 1839 * last call to cluster_write_copy 1840 */ 1841 zflags = flags; 1842 1843 write_length = (u_int32_t)cur_resid; 1844 } 1845 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg); 1846 break; 1847 1848 case IO_CONTIG: 1849 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL); 1850 1851 if (flags & IO_HEADZEROFILL) { 1852 /* 1853 * only do this once per request 1854 */ 1855 flags &= ~IO_HEADZEROFILL; 1856 1857 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset, 1858 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 1859 if (retval) 1860 break; 1861 } 1862 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag); 1863 1864 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) { 1865 /* 1866 * we're done with the data from the user specified buffer(s) 1867 * and we've been requested to zero fill at the tail 1868 * treat this as an IO_HEADZEROFILL which doesn't require a uio 1869 * by rearranging the args and passing in IO_HEADZEROFILL 1870 */ 1871 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset, 1872 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 1873 } 1874 break; 1875 1876 case IO_DIRECT: 1877 /* 1878 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL 1879 */ 1880 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg); 1881 break; 1882 1883 case IO_UNKNOWN: 1884 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 1885 break; 1886 } 1887 } 1888 return (retval); 1889} 1890 1891 1892static int 1893cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length, 1894 int flags, int (*callback)(buf_t, void *), void *callback_arg) 1895{ 1896 upl_t upl; 1897 upl_page_info_t *pl; 1898 vm_offset_t upl_offset; 1899 u_int32_t io_req_size; 1900 u_int32_t offset_in_file; 1901 u_int32_t offset_in_iovbase; 1902 u_int32_t io_size; 1903 int io_flag; 1904 int bflag; 1905 vm_size_t upl_size; 1906 vm_size_t upl_needed_size; 1907 mach_msg_type_number_t pages_in_pl; 1908 int upl_flags; 1909 kern_return_t kret; 1910 mach_msg_type_number_t i; 1911 int force_data_sync; 1912 int retval = 0; 1913 int first_IO = 1; 1914 struct clios iostate; 1915 user_addr_t iov_base; 1916 u_int32_t mem_alignment_mask; 1917 u_int32_t devblocksize; 1918 u_int32_t max_upl_size; 1919 1920 1921 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 1922 1923 if (flags & IO_PASSIVE) 1924 bflag = CL_PASSIVE; 1925 else 1926 bflag = 0; 1927 1928 /* 1929 * When we enter this routine, we know 1930 * -- the resid will not exceed iov_len 1931 */ 1932 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, 1933 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 1934 1935 iostate.io_completed = 0; 1936 iostate.io_issued = 0; 1937 iostate.io_error = 0; 1938 iostate.io_wanted = 0; 1939 1940 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 1941 devblocksize = 
(u_int32_t)vp->v_mount->mnt_devblocksize; 1942 1943 if (devblocksize == 1) { 1944 /* 1945 * the AFP client advertises a devblocksize of 1 1946 * however, its BLOCKMAP routine maps to physical 1947 * blocks that are PAGE_SIZE in size... 1948 * therefore we can't ask for I/Os that aren't page aligned 1949 * or aren't multiples of PAGE_SIZE in size 1950 * by setting devblocksize to PAGE_SIZE, we re-instate 1951 * the old behavior we had before the mem_alignment_mask 1952 * changes went in... 1953 */ 1954 devblocksize = PAGE_SIZE; 1955 } 1956 1957next_dwrite: 1958 io_req_size = *write_length; 1959 iov_base = uio_curriovbase(uio); 1960 1961 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK; 1962 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 1963 1964 if (offset_in_file || offset_in_iovbase) { 1965 /* 1966 * one of the 2 important offsets is misaligned 1967 * so fire an I/O through the cache for this entire vector 1968 */ 1969 goto wait_for_dwrites; 1970 } 1971 if (iov_base & (devblocksize - 1)) { 1972 /* 1973 * the offset in memory must be on a device block boundary 1974 * so that we can guarantee that we can generate an 1975 * I/O that ends on a page boundary in cluster_io 1976 */ 1977 goto wait_for_dwrites; 1978 } 1979 1980 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { 1981 1982 if (first_IO) { 1983 cluster_syncup(vp, newEOF, callback, callback_arg); 1984 first_IO = 0; 1985 } 1986 io_size = io_req_size & ~PAGE_MASK; 1987 iov_base = uio_curriovbase(uio); 1988 1989 if (io_size > max_upl_size) 1990 io_size = max_upl_size; 1991 1992 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 1993 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 1994 1995 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, 1996 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 1997 1998 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 1999 pages_in_pl = 0; 2000 upl_size = upl_needed_size; 2001 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2002 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2003 2004 kret = vm_map_get_upl(current_map(), 2005 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2006 &upl_size, 2007 &upl, 2008 NULL, 2009 &pages_in_pl, 2010 &upl_flags, 2011 force_data_sync); 2012 2013 if (kret != KERN_SUCCESS) { 2014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2015 0, 0, 0, kret, 0); 2016 /* 2017 * failed to get pagelist 2018 * 2019 * we may have already spun some portion of this request 2020 * off as async requests... we need to wait for the I/O 2021 * to complete before returning 2022 */ 2023 goto wait_for_dwrites; 2024 } 2025 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 2026 pages_in_pl = upl_size / PAGE_SIZE; 2027 2028 for (i = 0; i < pages_in_pl; i++) { 2029 if (!upl_valid_page(pl, i)) 2030 break; 2031 } 2032 if (i == pages_in_pl) 2033 break; 2034 2035 /* 2036 * didn't get all the pages back that we 2037 * needed... release this upl and try again 2038 */ 2039 ubc_upl_abort(upl, 0); 2040 } 2041 if (force_data_sync >= 3) { 2042 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2043 i, pages_in_pl, upl_size, kret, 0); 2044 /* 2045 * for some reason, we couldn't acquire a hold on all 2046 * the pages needed in the user's address space 2047 * 2048 * we may have already spun some portion of this request 2049 * off as async requests... 
we need to wait for the I/O 2050 * to complete before returning 2051 */ 2052 goto wait_for_dwrites; 2053 } 2054 2055 /* 2056 * Consider the possibility that upl_size wasn't satisfied. 2057 */ 2058 if (upl_size < upl_needed_size) { 2059 if (upl_size && upl_offset == 0) 2060 io_size = upl_size; 2061 else 2062 io_size = 0; 2063 } 2064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2065 (int)upl_offset, upl_size, (int)iov_base, io_size, 0); 2066 2067 if (io_size == 0) { 2068 ubc_upl_abort(upl, 0); 2069 /* 2070 * we may have already spun some portion of this request 2071 * off as async requests... we need to wait for the I/O 2072 * to complete before returning 2073 */ 2074 goto wait_for_dwrites; 2075 } 2076 2077 /* 2078 * Now look for pages already in the cache 2079 * and throw them away. 2080 * uio->uio_offset is page aligned within the file 2081 * io_size is a multiple of PAGE_SIZE 2082 */ 2083 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL); 2084 2085 /* 2086 * we want push out these writes asynchronously so that we can overlap 2087 * the preparation of the next I/O 2088 * if there are already too many outstanding writes 2089 * wait until some complete before issuing the next 2090 */ 2091 lck_mtx_lock(cl_mtxp); 2092 2093 while ((iostate.io_issued - iostate.io_completed) > (2 * max_upl_size)) { 2094 2095 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 2096 iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0); 2097 2098 iostate.io_wanted = 1; 2099 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); 2100 2101 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 2102 iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0); 2103 } 2104 lck_mtx_unlock(cl_mtxp); 2105 2106 if (iostate.io_error) { 2107 /* 2108 * one of the earlier writes we issued ran into a hard error 2109 * don't issue any more writes, cleanup the UPL 2110 * that was just created but not used, then 2111 * go wait for all writes that are part of this stream 2112 * to complete before returning the error to the caller 2113 */ 2114 ubc_upl_abort(upl, 0); 2115 2116 goto wait_for_dwrites; 2117 } 2118 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO | bflag; 2119 2120 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, 2121 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); 2122 2123 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, 2124 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2125 2126 /* 2127 * update the uio structure to 2128 * reflect the I/O that we just issued 2129 */ 2130 uio_update(uio, (user_size_t)io_size); 2131 2132 io_req_size -= io_size; 2133 2134 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, 2135 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0); 2136 2137 } /* end while */ 2138 2139 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) { 2140 2141 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE); 2142 2143 if (retval == 0 && *write_type == IO_DIRECT) { 2144 2145 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE, 2146 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 2147 2148 goto next_dwrite; 2149 } 2150 } 2151 2152wait_for_dwrites: 2153 if (iostate.io_issued) { 2154 /* 2155 * make sure all async writes issued as part of this stream 2156 * have completed before we return 2157 */ 2158 lck_mtx_lock(cl_mtxp); 2159 2160 while (iostate.io_issued != iostate.io_completed) { 
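/*
 * rough sketch of the iostate handshake used by the direct and contig
 * paths in this file (the wakeup half lives in the I/O completion
 * path, which is outside this excerpt):
 *
 *	issuing side (this code):
 *		lck_mtx_lock(cl_mtxp);
 *		while (iostate.io_issued != iostate.io_completed) {
 *			iostate.io_wanted = 1;
 *			msleep(&iostate.io_wanted, cl_mtxp, PRIBIO + 1, ...);
 *		}
 *		lck_mtx_unlock(cl_mtxp);
 *
 *	completion side (per transaction, under the same mutex):
 *		advance io_completed (and latch io_error on failure),
 *		then wakeup(&io_wanted) if a waiter announced itself
 *
 * the same handshake is used above to keep no more than 2 * max_upl_size
 * bytes of write I/O in flight while the next chunk is being prepared
 */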
2161 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 2162 iostate.io_issued, iostate.io_completed, 0, 0, 0); 2163 2164 iostate.io_wanted = 1; 2165 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); 2166 2167 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 2168 iostate.io_issued, iostate.io_completed, 0, 0, 0); 2169 } 2170 lck_mtx_unlock(cl_mtxp); 2171 } 2172 if (iostate.io_error) 2173 retval = iostate.io_error; 2174 2175 if (io_req_size && retval == 0) { 2176 /* 2177 * we couldn't handle the tail of this request in DIRECT mode 2178 * so fire it through the copy path 2179 * 2180 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set 2181 * so we can just pass 0 in for the headOff and tailOff 2182 */ 2183 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg); 2184 2185 *write_type = IO_UNKNOWN; 2186 } 2187 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, 2188 (int)uio->uio_offset, io_req_size, retval, 4, 0); 2189 2190 return (retval); 2191} 2192 2193 2194static int 2195cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length, 2196 int (*callback)(buf_t, void *), void *callback_arg, int bflag) 2197{ 2198 upl_page_info_t *pl; 2199 addr64_t src_paddr = 0; 2200 upl_t upl[MAX_VECTS]; 2201 vm_offset_t upl_offset; 2202 u_int32_t tail_size = 0; 2203 u_int32_t io_size; 2204 u_int32_t xsize; 2205 vm_size_t upl_size; 2206 vm_size_t upl_needed_size; 2207 mach_msg_type_number_t pages_in_pl; 2208 int upl_flags; 2209 kern_return_t kret; 2210 struct clios iostate; 2211 int error = 0; 2212 int cur_upl = 0; 2213 int num_upl = 0; 2214 int n; 2215 user_addr_t iov_base; 2216 u_int32_t devblocksize; 2217 u_int32_t mem_alignment_mask; 2218 2219 /* 2220 * When we enter this routine, we know 2221 * -- the io_req_size will not exceed iov_len 2222 * -- the target address is physically contiguous 2223 */ 2224 cluster_syncup(vp, newEOF, callback, callback_arg); 2225 2226 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 2227 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 2228 2229 iostate.io_completed = 0; 2230 iostate.io_issued = 0; 2231 iostate.io_error = 0; 2232 iostate.io_wanted = 0; 2233 2234next_cwrite: 2235 io_size = *write_length; 2236 2237 iov_base = uio_curriovbase(uio); 2238 2239 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 2240 upl_needed_size = upl_offset + io_size; 2241 2242 pages_in_pl = 0; 2243 upl_size = upl_needed_size; 2244 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2245 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2246 2247 kret = vm_map_get_upl(current_map(), 2248 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2249 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 2250 2251 if (kret != KERN_SUCCESS) { 2252 /* 2253 * failed to get pagelist 2254 */ 2255 error = EINVAL; 2256 goto wait_for_cwrites; 2257 } 2258 num_upl++; 2259 2260 /* 2261 * Consider the possibility that upl_size wasn't satisfied. 2262 */ 2263 if (upl_size < upl_needed_size) { 2264 /* 2265 * This is a failure in the physical memory case. 
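 * unlike the direct path above, there is no retry with force_data_sync and
 * no fallback to the cached path here: the caller's buffer is supposed to
 * be a single physically contiguous region, so if vm_map_get_upl couldn't
 * wire the whole thing we simply fail the request with EINVAL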
2266 */ 2267 error = EINVAL; 2268 goto wait_for_cwrites; 2269 } 2270 pl = ubc_upl_pageinfo(upl[cur_upl]); 2271 2272 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; 2273 2274 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 2275 u_int32_t head_size; 2276 2277 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 2278 2279 if (head_size > io_size) 2280 head_size = io_size; 2281 2282 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg); 2283 2284 if (error) 2285 goto wait_for_cwrites; 2286 2287 upl_offset += head_size; 2288 src_paddr += head_size; 2289 io_size -= head_size; 2290 2291 iov_base += head_size; 2292 } 2293 if ((u_int32_t)iov_base & mem_alignment_mask) { 2294 /* 2295 * request doesn't set up on a memory boundary 2296 * the underlying DMA engine can handle... 2297 * return an error instead of going through 2298 * the slow copy path since the intent of this 2299 * path is direct I/O from device memory 2300 */ 2301 error = EINVAL; 2302 goto wait_for_cwrites; 2303 } 2304 2305 tail_size = io_size & (devblocksize - 1); 2306 io_size -= tail_size; 2307 2308 while (io_size && error == 0) { 2309 2310 if (io_size > MAX_IO_CONTIG_SIZE) 2311 xsize = MAX_IO_CONTIG_SIZE; 2312 else 2313 xsize = io_size; 2314 /* 2315 * request asynchronously so that we can overlap 2316 * the preparation of the next I/O... we'll do 2317 * the commit after all the I/O has completed 2318 * since its all issued against the same UPL 2319 * if there are already too many outstanding writes 2320 * wait until some have completed before issuing the next 2321 */ 2322 if (iostate.io_issued) { 2323 lck_mtx_lock(cl_mtxp); 2324 2325 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) { 2326 2327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 2328 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); 2329 2330 iostate.io_wanted = 1; 2331 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); 2332 2333 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 2334 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); 2335 } 2336 lck_mtx_unlock(cl_mtxp); 2337 } 2338 if (iostate.io_error) { 2339 /* 2340 * one of the earlier writes we issued ran into a hard error 2341 * don't issue any more writes... 
2342 * go wait for all writes that are part of this stream 2343 * to complete before returning the error to the caller 2344 */ 2345 goto wait_for_cwrites; 2346 } 2347 /* 2348 * issue an asynchronous write to cluster_io 2349 */ 2350 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, 2351 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg); 2352 2353 if (error == 0) { 2354 /* 2355 * The cluster_io write completed successfully, 2356 * update the uio structure 2357 */ 2358 uio_update(uio, (user_size_t)xsize); 2359 2360 upl_offset += xsize; 2361 src_paddr += xsize; 2362 io_size -= xsize; 2363 } 2364 } 2365 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) { 2366 2367 error = cluster_io_type(uio, write_type, write_length, 0); 2368 2369 if (error == 0 && *write_type == IO_CONTIG) { 2370 cur_upl++; 2371 goto next_cwrite; 2372 } 2373 } else 2374 *write_type = IO_UNKNOWN; 2375 2376wait_for_cwrites: 2377 /* 2378 * make sure all async writes that are part of this stream 2379 * have completed before we proceed 2380 */ 2381 lck_mtx_lock(cl_mtxp); 2382 2383 while (iostate.io_issued != iostate.io_completed) { 2384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 2385 iostate.io_issued, iostate.io_completed, 0, 0, 0); 2386 2387 iostate.io_wanted = 1; 2388 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); 2389 2390 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 2391 iostate.io_issued, iostate.io_completed, 0, 0, 0); 2392 } 2393 lck_mtx_unlock(cl_mtxp); 2394 2395 if (iostate.io_error) 2396 error = iostate.io_error; 2397 2398 if (error == 0 && tail_size) 2399 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); 2400 2401 for (n = 0; n < num_upl; n++) 2402 /* 2403 * just release our hold on each physically contiguous 2404 * region without changing any state 2405 */ 2406 ubc_upl_abort(upl[n], 0); 2407 2408 return (error); 2409} 2410 2411 2412static int 2413cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff, 2414 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg) 2415{ 2416 upl_page_info_t *pl; 2417 upl_t upl; 2418 vm_offset_t upl_offset = 0; 2419 vm_size_t upl_size; 2420 off_t upl_f_offset; 2421 int pages_in_upl; 2422 int start_offset; 2423 int xfer_resid; 2424 int io_size; 2425 int io_offset; 2426 int bytes_to_zero; 2427 int bytes_to_move; 2428 kern_return_t kret; 2429 int retval = 0; 2430 int io_resid; 2431 long long total_size; 2432 long long zero_cnt; 2433 off_t zero_off; 2434 long long zero_cnt1; 2435 off_t zero_off1; 2436 struct cl_extent cl; 2437 struct cl_writebehind *wbp; 2438 int bflag; 2439 u_int max_cluster_pgcount; 2440 u_int max_io_size; 2441 2442 if (flags & IO_PASSIVE) 2443 bflag = CL_PASSIVE; 2444 else 2445 bflag = 0; 2446 2447 if (uio) { 2448 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2449 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0); 2450 2451 io_resid = io_req_size; 2452 } else { 2453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2454 0, 0, (int)oldEOF, (int)newEOF, 0); 2455 2456 io_resid = 0; 2457 } 2458 zero_cnt = 0; 2459 zero_cnt1 = 0; 2460 zero_off = 0; 2461 zero_off1 = 0; 2462 2463 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 2464 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 2465 2466 if (flags & IO_HEADZEROFILL) { 2467 /* 2468 * some 
filesystems (HFS is one) don't support unallocated holes within a file... 2469 * so we zero fill the intervening space between the old EOF and the offset 2470 * where the next chunk of real data begins.... ftruncate will also use this 2471 * routine to zero fill to the new EOF when growing a file... in this case, the 2472 * uio structure will not be provided 2473 */ 2474 if (uio) { 2475 if (headOff < uio->uio_offset) { 2476 zero_cnt = uio->uio_offset - headOff; 2477 zero_off = headOff; 2478 } 2479 } else if (headOff < newEOF) { 2480 zero_cnt = newEOF - headOff; 2481 zero_off = headOff; 2482 } 2483 } 2484 if (flags & IO_TAILZEROFILL) { 2485 if (uio) { 2486 zero_off1 = uio->uio_offset + io_req_size; 2487 2488 if (zero_off1 < tailOff) 2489 zero_cnt1 = tailOff - zero_off1; 2490 } 2491 } 2492 if (zero_cnt == 0 && uio == (struct uio *) 0) { 2493 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, 2494 retval, 0, 0, 0, 0); 2495 return (0); 2496 } 2497 2498 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { 2499 /* 2500 * for this iteration of the loop, figure out where our starting point is 2501 */ 2502 if (zero_cnt) { 2503 start_offset = (int)(zero_off & PAGE_MASK_64); 2504 upl_f_offset = zero_off - start_offset; 2505 } else if (io_resid) { 2506 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2507 upl_f_offset = uio->uio_offset - start_offset; 2508 } else { 2509 start_offset = (int)(zero_off1 & PAGE_MASK_64); 2510 upl_f_offset = zero_off1 - start_offset; 2511 } 2512 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE, 2513 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0); 2514 2515 if (total_size > max_io_size) 2516 total_size = max_io_size; 2517 2518 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); 2519 2520 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) { 2521 /* 2522 * assumption... total_size <= io_resid 2523 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set 2524 */ 2525 if ((start_offset + total_size) > max_io_size) 2526 total_size -= start_offset; 2527 xfer_resid = total_size; 2528 2529 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); 2530 2531 if (retval) 2532 break; 2533 2534 io_resid -= (total_size - xfer_resid); 2535 total_size = xfer_resid; 2536 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2537 upl_f_offset = uio->uio_offset - start_offset; 2538 2539 if (total_size == 0) { 2540 if (start_offset) { 2541 /* 2542 * the write did not finish on a page boundary 2543 * which will leave upl_f_offset pointing to the 2544 * beginning of the last page written instead of 2545 * the page beyond it... bump it in this case 2546 * so that the cluster code records the last page 2547 * written as dirty 2548 */ 2549 upl_f_offset += PAGE_SIZE_64; 2550 } 2551 upl_size = 0; 2552 2553 goto check_cluster; 2554 } 2555 } 2556 /* 2557 * compute the size of the upl needed to encompass 2558 * the requested write... limit each call to cluster_io 2559 * to the maximum UPL size... 
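 * (e.g. with 4K pages, a start_offset of 512 and a total_size of 10000
 *  round up to upl_size = (512 + 10000 + 4095) & ~PAGE_MASK = 12288,
 *  i.e. 3 pages)...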
cluster_io will clip if 2560 * this exceeds the maximum io_size for the device, 2561 * make sure to account for 2562 * a starting offset that's not page aligned 2563 */ 2564 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 2565 2566 if (upl_size > max_io_size) 2567 upl_size = max_io_size; 2568 2569 pages_in_upl = upl_size / PAGE_SIZE; 2570 io_size = upl_size - start_offset; 2571 2572 if ((long long)io_size > total_size) 2573 io_size = total_size; 2574 2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0); 2576 2577 2578 /* 2579 * Gather the pages from the buffer cache. 2580 * The UPL_WILL_MODIFY flag lets the UPL subsystem know 2581 * that we intend to modify these pages. 2582 */ 2583 kret = ubc_create_upl(vp, 2584 upl_f_offset, 2585 upl_size, 2586 &upl, 2587 &pl, 2588 UPL_SET_LITE | UPL_WILL_MODIFY); 2589 if (kret != KERN_SUCCESS) 2590 panic("cluster_write_copy: failed to get pagelist"); 2591 2592 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, 2593 (int)upl, (int)upl_f_offset, start_offset, 0, 0); 2594 2595 if (start_offset && !upl_valid_page(pl, 0)) { 2596 int read_size; 2597 2598 /* 2599 * we're starting in the middle of the first page of the upl 2600 * and the page isn't currently valid, so we're going to have 2601 * to read it in first... this is a synchronous operation 2602 */ 2603 read_size = PAGE_SIZE; 2604 2605 if ((upl_f_offset + read_size) > newEOF) 2606 read_size = newEOF - upl_f_offset; 2607 2608 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, 2609 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 2610 if (retval) { 2611 /* 2612 * we had an error during the read which causes us to abort 2613 * the current cluster_write request... before we do, we need 2614 * to release the rest of the pages in the upl without modifying 2615 * there state and mark the failed page in error 2616 */ 2617 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 2618 2619 if (upl_size > PAGE_SIZE) 2620 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 2621 2622 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2623 (int)upl, 0, 0, retval, 0); 2624 break; 2625 } 2626 } 2627 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) { 2628 /* 2629 * the last offset we're writing to in this upl does not end on a page 2630 * boundary... if it's not beyond the old EOF, then we'll also need to 2631 * pre-read this page in if it isn't already valid 2632 */ 2633 upl_offset = upl_size - PAGE_SIZE; 2634 2635 if ((upl_f_offset + start_offset + io_size) < oldEOF && 2636 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) { 2637 int read_size; 2638 2639 read_size = PAGE_SIZE; 2640 2641 if ((upl_f_offset + upl_offset + read_size) > newEOF) 2642 read_size = newEOF - (upl_f_offset + upl_offset); 2643 2644 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, 2645 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 2646 if (retval) { 2647 /* 2648 * we had an error during the read which causes us to abort 2649 * the current cluster_write request... 
before we do, we 2650 * need to release the rest of the pages in the upl without 2651 * modifying there state and mark the failed page in error 2652 */ 2653 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 2654 2655 if (upl_size > PAGE_SIZE) 2656 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 2657 2658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2659 (int)upl, 0, 0, retval, 0); 2660 break; 2661 } 2662 } 2663 } 2664 xfer_resid = io_size; 2665 io_offset = start_offset; 2666 2667 while (zero_cnt && xfer_resid) { 2668 2669 if (zero_cnt < (long long)xfer_resid) 2670 bytes_to_zero = zero_cnt; 2671 else 2672 bytes_to_zero = xfer_resid; 2673 2674 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { 2675 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2676 } else { 2677 int zero_pg_index; 2678 2679 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64)); 2680 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64); 2681 2682 if ( !upl_valid_page(pl, zero_pg_index)) { 2683 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2684 2685 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY && 2686 !upl_dirty_page(pl, zero_pg_index)) { 2687 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2688 } 2689 } 2690 xfer_resid -= bytes_to_zero; 2691 zero_cnt -= bytes_to_zero; 2692 zero_off += bytes_to_zero; 2693 io_offset += bytes_to_zero; 2694 } 2695 if (xfer_resid && io_resid) { 2696 u_int32_t io_requested; 2697 2698 bytes_to_move = min(io_resid, xfer_resid); 2699 io_requested = bytes_to_move; 2700 2701 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested); 2702 2703 if (retval) { 2704 2705 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 2706 2707 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2708 (int)upl, 0, 0, retval, 0); 2709 } else { 2710 io_resid -= bytes_to_move; 2711 xfer_resid -= bytes_to_move; 2712 io_offset += bytes_to_move; 2713 } 2714 } 2715 while (xfer_resid && zero_cnt1 && retval == 0) { 2716 2717 if (zero_cnt1 < (long long)xfer_resid) 2718 bytes_to_zero = zero_cnt1; 2719 else 2720 bytes_to_zero = xfer_resid; 2721 2722 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { 2723 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2724 } else { 2725 int zero_pg_index; 2726 2727 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64)); 2728 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64); 2729 2730 if ( !upl_valid_page(pl, zero_pg_index)) { 2731 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2732 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY && 2733 !upl_dirty_page(pl, zero_pg_index)) { 2734 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2735 } 2736 } 2737 xfer_resid -= bytes_to_zero; 2738 zero_cnt1 -= bytes_to_zero; 2739 zero_off1 += bytes_to_zero; 2740 io_offset += bytes_to_zero; 2741 } 2742 2743 if (retval == 0) { 2744 int cl_index; 2745 int ret_cluster_try_push; 2746 2747 io_size += start_offset; 2748 2749 if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) { 2750 /* 2751 * if we're extending the file with this write 2752 * we'll zero fill the rest of the page so that 2753 * if the file gets extended again in such a way as to leave a 2754 * hole starting at this EOF, we'll have zero's in the correct spot 2755 */ 2756 cluster_zero(upl, io_size, upl_size - io_size, NULL); 2757 } 2758 /* 2759 * release the upl now 
if we hold one since... 2760 * 1) pages in it may be present in the sparse cluster map 2761 * and may span 2 separate buckets there... if they do and 2762 * we happen to have to flush a bucket to make room and it intersects 2763 * this upl, a deadlock may result on page BUSY 2764 * 2) we're delaying the I/O... from this point forward we're just updating 2765 * the cluster state... no need to hold the pages, so commit them 2766 * 3) IO_SYNC is set... 2767 * because we had to ask for a UPL that provides currenty non-present pages, the 2768 * UPL has been automatically set to clear the dirty flags (both software and hardware) 2769 * upon committing it... this is not the behavior we want since it's possible for 2770 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. 2771 * we'll pick these pages back up later with the correct behavior specified. 2772 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush 2773 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages 2774 * we hold since the flushing context is holding the cluster lock. 2775 */ 2776 ubc_upl_commit_range(upl, 0, upl_size, 2777 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); 2778check_cluster: 2779 /* 2780 * calculate the last logical block number 2781 * that this delayed I/O encompassed 2782 */ 2783 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); 2784 2785 if (flags & IO_SYNC) 2786 /* 2787 * if the IO_SYNC flag is set than we need to 2788 * bypass any clusters and immediately issue 2789 * the I/O 2790 */ 2791 goto issue_io; 2792 2793 /* 2794 * take the lock to protect our accesses 2795 * of the writebehind and sparse cluster state 2796 */ 2797 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); 2798 2799 if (wbp->cl_scmap) { 2800 2801 if ( !(flags & IO_NOCACHE)) { 2802 /* 2803 * we've fallen into the sparse 2804 * cluster method of delaying dirty pages 2805 */ 2806 sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg); 2807 2808 lck_mtx_unlock(&wbp->cl_lockw); 2809 2810 continue; 2811 } 2812 /* 2813 * must have done cached writes that fell into 2814 * the sparse cluster mechanism... we've switched 2815 * to uncached writes on the file, so go ahead 2816 * and push whatever's in the sparse map 2817 * and switch back to normal clustering 2818 */ 2819 wbp->cl_number = 0; 2820 2821 sparse_cluster_push(wbp, vp, newEOF, PUSH_ALL, callback, callback_arg); 2822 /* 2823 * no clusters of either type present at this point 2824 * so just go directly to start_new_cluster since 2825 * we know we need to delay this I/O since we've 2826 * already released the pages back into the cache 2827 * to avoid the deadlock with sparse_cluster_push 2828 */ 2829 goto start_new_cluster; 2830 } 2831 if (wbp->cl_number == 0) 2832 /* 2833 * no clusters currently present 2834 */ 2835 goto start_new_cluster; 2836 2837 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 2838 /* 2839 * check each cluster that we currently hold 2840 * try to merge some or all of this write into 2841 * one or more of the existing clusters... 
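 * (each entry in wbp->cl_clusters is just a [b_addr, e_addr) range of
 *  page-sized blocks whose pages are sitting dirty in the cache; writes
 *  are held back here in the hope that later sequential writes can be
 *  coalesced into larger I/Os, and no cluster is ever allowed to span
 *  more than max_cluster_pgcount pages)...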
if 2842 * any portion of the write remains, start a 2843 * new cluster 2844 */ 2845 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) { 2846 /* 2847 * the current write starts at or after the current cluster 2848 */ 2849 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 2850 /* 2851 * we have a write that fits entirely 2852 * within the existing cluster limits 2853 */ 2854 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) 2855 /* 2856 * update our idea of where the cluster ends 2857 */ 2858 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 2859 break; 2860 } 2861 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 2862 /* 2863 * we have a write that starts in the middle of the current cluster 2864 * but extends beyond the cluster's limit... we know this because 2865 * of the previous checks 2866 * we'll extend the current cluster to the max 2867 * and update the b_addr for the current write to reflect that 2868 * the head of it was absorbed into this cluster... 2869 * note that we'll always have a leftover tail in this case since 2870 * full absorbtion would have occurred in the clause above 2871 */ 2872 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; 2873 2874 cl.b_addr = wbp->cl_clusters[cl_index].e_addr; 2875 } 2876 /* 2877 * we come here for the case where the current write starts 2878 * beyond the limit of the existing cluster or we have a leftover 2879 * tail after a partial absorbtion 2880 * 2881 * in either case, we'll check the remaining clusters before 2882 * starting a new one 2883 */ 2884 } else { 2885 /* 2886 * the current write starts in front of the cluster we're currently considering 2887 */ 2888 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) { 2889 /* 2890 * we can just merge the new request into 2891 * this cluster and leave it in the cache 2892 * since the resulting cluster is still 2893 * less than the maximum allowable size 2894 */ 2895 wbp->cl_clusters[cl_index].b_addr = cl.b_addr; 2896 2897 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) { 2898 /* 2899 * the current write completely 2900 * envelops the existing cluster and since 2901 * each write is limited to at most max_cluster_pgcount pages 2902 * we can just use the start and last blocknos of the write 2903 * to generate the cluster limits 2904 */ 2905 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 2906 } 2907 break; 2908 } 2909 2910 /* 2911 * if we were to combine this write with the current cluster 2912 * we would exceed the cluster size limit.... so, 2913 * let's see if there's any overlap of the new I/O with 2914 * the cluster we're currently considering... in fact, we'll 2915 * stretch the cluster out to it's full limit and see if we 2916 * get an intersection with the current write 2917 * 2918 */ 2919 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { 2920 /* 2921 * the current write extends into the proposed cluster 2922 * clip the length of the current write after first combining it's 2923 * tail with the newly shaped cluster 2924 */ 2925 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; 2926 2927 cl.e_addr = wbp->cl_clusters[cl_index].b_addr; 2928 } 2929 /* 2930 * if we get here, there was no way to merge 2931 * any portion of this write with this cluster 2932 * or we could only merge part of it which 2933 * will leave a tail... 
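 * (illustration with hypothetical numbers, max_cluster_pgcount = 32: an
 *  existing cluster covering pages [100, 110) and a write covering
 *  [60, 105) can't be merged whole since 110 - 60 > 32... the cluster is
 *  stretched back to [78, 110), the write is clipped to [60, 78), and the
 *  clipped remainder is what gets checked against the other clusters)...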
2934 * we'll check the remaining clusters before starting a new one 2935 */ 2936 } 2937 } 2938 if (cl_index < wbp->cl_number) 2939 /* 2940 * we found an existing cluster(s) that we 2941 * could entirely merge this I/O into 2942 */ 2943 goto delay_io; 2944 2945 if (wbp->cl_number < MAX_CLUSTERS) 2946 /* 2947 * we didn't find an existing cluster to 2948 * merge into, but there's room to start 2949 * a new one 2950 */ 2951 goto start_new_cluster; 2952 2953 /* 2954 * no exisitng cluster to merge with and no 2955 * room to start a new one... we'll try 2956 * pushing one of the existing ones... if none of 2957 * them are able to be pushed, we'll switch 2958 * to the sparse cluster mechanism 2959 * cluster_try_push updates cl_number to the 2960 * number of remaining clusters... and 2961 * returns the number of currently unused clusters 2962 */ 2963 ret_cluster_try_push = 0; 2964 2965 /* 2966 * if writes are not deferred, call cluster push immediately 2967 */ 2968 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { 2969 2970 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg); 2971 } 2972 2973 /* 2974 * execute following regardless of writes being deferred or not 2975 */ 2976 if (ret_cluster_try_push == 0) { 2977 /* 2978 * no more room in the normal cluster mechanism 2979 * so let's switch to the more expansive but expensive 2980 * sparse mechanism.... 2981 */ 2982 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); 2983 sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg); 2984 2985 lck_mtx_unlock(&wbp->cl_lockw); 2986 2987 continue; 2988 } 2989 /* 2990 * we pushed one cluster successfully, so we must be sequentially writing this file 2991 * otherwise, we would have failed and fallen into the sparse cluster support 2992 * so let's take the opportunity to push out additional clusters... 2993 * this will give us better I/O locality if we're in a copy loop 2994 * (i.e. we won't jump back and forth between the read and write points 2995 */ 2996 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { 2997 while (wbp->cl_number) 2998 cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg); 2999 } 3000 3001start_new_cluster: 3002 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; 3003 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; 3004 3005 wbp->cl_clusters[wbp->cl_number].io_flags = 0; 3006 3007 if (flags & IO_NOCACHE) 3008 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE; 3009 3010 if (bflag & CL_PASSIVE) 3011 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE; 3012 3013 wbp->cl_number++; 3014delay_io: 3015 lck_mtx_unlock(&wbp->cl_lockw); 3016 3017 continue; 3018issue_io: 3019 /* 3020 * we don't hold the lock at this point 3021 * 3022 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set 3023 * so that we correctly deal with a change in state of the hardware modify bit... 3024 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force 3025 * cluster_push_now to wait until all the I/Os have completed... 
cluster_push_now is also 3026 * responsible for generating the correct sized I/O(s) 3027 */ 3028 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg); 3029 } 3030 } 3031 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0); 3032 3033 return (retval); 3034} 3035 3036 3037 3038int 3039cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags) 3040{ 3041 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL); 3042} 3043 3044 3045int 3046cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg) 3047{ 3048 int retval = 0; 3049 int flags; 3050 user_ssize_t cur_resid; 3051 u_int32_t io_size; 3052 u_int32_t read_length = 0; 3053 int read_type = IO_COPY; 3054 3055 flags = xflags; 3056 3057 if (vp->v_flag & VNOCACHE_DATA) 3058 flags |= IO_NOCACHE; 3059 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) 3060 flags |= IO_RAOFF; 3061 3062 /* 3063 * do a read through the cache if one of the following is true.... 3064 * NOCACHE is not true 3065 * the uio request doesn't target USERSPACE 3066 * otherwise, find out if we want the direct or contig variant for 3067 * the first vector in the uio request 3068 */ 3069 if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) 3070 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3071 3072 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { 3073 3074 switch (read_type) { 3075 3076 case IO_COPY: 3077 /* 3078 * make sure the uio_resid isn't too big... 3079 * internally, we want to handle all of the I/O in 3080 * chunk sizes that fit in a 32 bit int 3081 */ 3082 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) 3083 io_size = MAX_IO_REQUEST_SIZE; 3084 else 3085 io_size = (u_int32_t)cur_resid; 3086 3087 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg); 3088 break; 3089 3090 case IO_DIRECT: 3091 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg); 3092 break; 3093 3094 case IO_CONTIG: 3095 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags); 3096 break; 3097 3098 case IO_UNKNOWN: 3099 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3100 break; 3101 } 3102 } 3103 return (retval); 3104} 3105 3106 3107 3108static void 3109cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags) 3110{ 3111 int range; 3112 int abort_flags = UPL_ABORT_FREE_ON_EMPTY; 3113 3114 if ((range = last_pg - start_pg)) { 3115 if ( !(flags & IO_NOCACHE)) 3116 abort_flags |= UPL_ABORT_REFERENCE; 3117 3118 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags); 3119 } 3120} 3121 3122 3123static int 3124cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 3125{ 3126 upl_page_info_t *pl; 3127 upl_t upl; 3128 vm_offset_t upl_offset; 3129 u_int32_t upl_size; 3130 off_t upl_f_offset; 3131 int start_offset; 3132 int start_pg; 3133 int last_pg; 3134 int uio_last = 0; 3135 int pages_in_upl; 3136 off_t max_size; 3137 off_t last_ioread_offset; 3138 off_t last_request_offset; 3139 kern_return_t kret; 3140 int error = 0; 3141 int retval = 0; 3142 u_int32_t size_of_prefetch; 3143 u_int32_t xsize; 3144 u_int32_t io_size; 3145 u_int32_t max_rd_size; 3146 u_int32_t max_io_size; 3147 u_int32_t max_prefetch; 3148 u_int rd_ahead_enabled 
= 1; 3149 u_int prefetch_enabled = 1; 3150 struct cl_readahead * rap; 3151 struct clios iostate; 3152 struct cl_extent extent; 3153 int bflag; 3154 int take_reference = 1; 3155 struct uthread *ut; 3156 int policy = IOPOL_DEFAULT; 3157 3158 policy = current_proc()->p_iopol_disk; 3159 3160 ut = get_bsdthread_info(current_thread()); 3161 3162 if (ut->uu_iopol_disk != IOPOL_DEFAULT) 3163 policy = ut->uu_iopol_disk; 3164 3165 if (policy == IOPOL_THROTTLE) 3166 take_reference = 0; 3167 3168 if (flags & IO_PASSIVE) 3169 bflag = CL_PASSIVE; 3170 else 3171 bflag = 0; 3172 3173 max_prefetch = MAX_PREFETCH(vp); 3174 max_rd_size = max_prefetch; 3175 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 3176 3177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, 3178 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); 3179 3180 last_request_offset = uio->uio_offset + io_req_size; 3181 3182 if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) { 3183 rd_ahead_enabled = 0; 3184 rap = NULL; 3185 } else { 3186 if (cluster_hard_throttle_on(vp)) { 3187 rd_ahead_enabled = 0; 3188 prefetch_enabled = 0; 3189 3190 max_rd_size = HARD_THROTTLE_MAXSIZE; 3191 } 3192 if ((rap = cluster_get_rap(vp)) == NULL) 3193 rd_ahead_enabled = 0; 3194 } 3195 if (last_request_offset > filesize) 3196 last_request_offset = filesize; 3197 extent.b_addr = uio->uio_offset / PAGE_SIZE_64; 3198 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; 3199 3200 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) { 3201 /* 3202 * determine if we already have a read-ahead in the pipe courtesy of the 3203 * last read systemcall that was issued... 3204 * if so, pick up it's extent to determine where we should start 3205 * with respect to any read-ahead that might be necessary to 3206 * garner all the data needed to complete this read systemcall 3207 */ 3208 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64; 3209 3210 if (last_ioread_offset < uio->uio_offset) 3211 last_ioread_offset = (off_t)0; 3212 else if (last_ioread_offset > last_request_offset) 3213 last_ioread_offset = last_request_offset; 3214 } else 3215 last_ioread_offset = (off_t)0; 3216 3217 while (io_req_size && uio->uio_offset < filesize && retval == 0) { 3218 /* 3219 * compute the size of the upl needed to encompass 3220 * the requested read... limit each call to cluster_io 3221 * to the maximum UPL size... cluster_io will clip if 3222 * this exceeds the maximum io_size for the device, 3223 * make sure to account for 3224 * a starting offset that's not page aligned 3225 */ 3226 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 3227 upl_f_offset = uio->uio_offset - (off_t)start_offset; 3228 max_size = filesize - uio->uio_offset; 3229 3230 if ((off_t)(io_req_size) < max_size) 3231 io_size = io_req_size; 3232 else 3233 io_size = max_size; 3234 3235 if (!(flags & IO_NOCACHE)) { 3236 3237 while (io_size) { 3238 u_int32_t io_resid; 3239 u_int32_t io_requested; 3240 3241 /* 3242 * if we keep finding the pages we need already in the cache, then 3243 * don't bother to call cluster_read_prefetch since it costs CPU cycles 3244 * to determine that we have all the pages we need... 
once we miss in 3245 * the cache and have issued an I/O, than we'll assume that we're likely 3246 * to continue to miss in the cache and it's to our advantage to try and prefetch 3247 */ 3248 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) { 3249 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) { 3250 /* 3251 * we've already issued I/O for this request and 3252 * there's still work to do and 3253 * our prefetch stream is running dry, so issue a 3254 * pre-fetch I/O... the I/O latency will overlap 3255 * with the copying of the data 3256 */ 3257 if (size_of_prefetch > max_rd_size) 3258 size_of_prefetch = max_rd_size; 3259 3260 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); 3261 3262 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); 3263 3264 if (last_ioread_offset > last_request_offset) 3265 last_ioread_offset = last_request_offset; 3266 } 3267 } 3268 /* 3269 * limit the size of the copy we're about to do so that 3270 * we can notice that our I/O pipe is running dry and 3271 * get the next I/O issued before it does go dry 3272 */ 3273 if (last_ioread_offset && io_size > (max_io_size / 4)) 3274 io_resid = (max_io_size / 4); 3275 else 3276 io_resid = io_size; 3277 3278 io_requested = io_resid; 3279 3280 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); 3281 3282 xsize = io_requested - io_resid; 3283 3284 io_size -= xsize; 3285 io_req_size -= xsize; 3286 3287 if (retval || io_resid) 3288 /* 3289 * if we run into a real error or 3290 * a page that is not in the cache 3291 * we need to leave streaming mode 3292 */ 3293 break; 3294 3295 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) { 3296 /* 3297 * we're already finished the I/O for this read request 3298 * let's see if we should do a read-ahead 3299 */ 3300 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3301 } 3302 } 3303 if (retval) 3304 break; 3305 if (io_size == 0) { 3306 if (rap != NULL) { 3307 if (extent.e_addr < rap->cl_lastr) 3308 rap->cl_maxra = 0; 3309 rap->cl_lastr = extent.e_addr; 3310 } 3311 break; 3312 } 3313 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 3314 upl_f_offset = uio->uio_offset - (off_t)start_offset; 3315 max_size = filesize - uio->uio_offset; 3316 } 3317 if (io_size > max_rd_size) 3318 io_size = max_rd_size; 3319 3320 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 3321 3322 if (flags & IO_NOCACHE) { 3323 if (upl_size > max_io_size) 3324 upl_size = max_io_size; 3325 } else { 3326 if (upl_size > max_io_size / 4) 3327 upl_size = max_io_size / 4; 3328 } 3329 pages_in_upl = upl_size / PAGE_SIZE; 3330 3331 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START, 3332 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); 3333 3334 kret = ubc_create_upl(vp, 3335 upl_f_offset, 3336 upl_size, 3337 &upl, 3338 &pl, 3339 UPL_FILE_IO | UPL_SET_LITE); 3340 if (kret != KERN_SUCCESS) 3341 panic("cluster_read_copy: failed to get pagelist"); 3342 3343 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END, 3344 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); 3345 3346 /* 3347 * scan from the beginning of the upl looking for the first 3348 * non-valid page.... this will become the first page in 3349 * the request we're going to make to 'cluster_io'... 
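 * (e.g. if the upl's pages come back as V V I I I V V, start_pg ends up
 *  as 2 and last_pg as 5, so the I/O covers pages 2..4 only... the valid
 *  pages on either side are dealt with separately below)...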
if all 3350 * of the pages are valid, we won't call through to 'cluster_io' 3351 */ 3352 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) { 3353 if (!upl_valid_page(pl, start_pg)) 3354 break; 3355 } 3356 3357 /* 3358 * scan from the starting invalid page looking for a valid 3359 * page before the end of the upl is reached, if we 3360 * find one, then it will be the last page of the request to 3361 * 'cluster_io' 3362 */ 3363 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 3364 if (upl_valid_page(pl, last_pg)) 3365 break; 3366 } 3367 iostate.io_completed = 0; 3368 iostate.io_issued = 0; 3369 iostate.io_error = 0; 3370 iostate.io_wanted = 0; 3371 3372 if (start_pg < last_pg) { 3373 /* 3374 * we found a range of 'invalid' pages that must be filled 3375 * if the last page in this range is the last page of the file 3376 * we may have to clip the size of it to keep from reading past 3377 * the end of the last physical block associated with the file 3378 */ 3379 upl_offset = start_pg * PAGE_SIZE; 3380 io_size = (last_pg - start_pg) * PAGE_SIZE; 3381 3382 if ((upl_f_offset + upl_offset + io_size) > filesize) 3383 io_size = filesize - (upl_f_offset + upl_offset); 3384 3385 /* 3386 * issue an asynchronous read to cluster_io 3387 */ 3388 3389 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, 3390 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); 3391 } 3392 if (error == 0) { 3393 /* 3394 * if the read completed successfully, or there was no I/O request 3395 * issued, than copy the data into user land via 'cluster_upl_copy_data' 3396 * we'll first add on any 'valid' 3397 * pages that were present in the upl when we acquired it. 3398 */ 3399 u_int val_size; 3400 3401 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) { 3402 if (!upl_valid_page(pl, uio_last)) 3403 break; 3404 } 3405 if (uio_last < pages_in_upl) { 3406 /* 3407 * there were some invalid pages beyond the valid pages 3408 * that we didn't issue an I/O for, just release them 3409 * unchanged now, so that any prefetch/readahed can 3410 * include them 3411 */ 3412 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE, 3413 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); 3414 } 3415 3416 /* 3417 * compute size to transfer this round, if io_req_size is 3418 * still non-zero after this attempt, we'll loop around and 3419 * set up for another I/O. 3420 */ 3421 val_size = (uio_last * PAGE_SIZE) - start_offset; 3422 3423 if (val_size > max_size) 3424 val_size = max_size; 3425 3426 if (val_size > io_req_size) 3427 val_size = io_req_size; 3428 3429 if ((uio->uio_offset + val_size) > last_ioread_offset) 3430 last_ioread_offset = uio->uio_offset + val_size; 3431 3432 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) { 3433 3434 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) { 3435 /* 3436 * if there's still I/O left to do for this request, and... 3437 * we're not in hard throttle mode, and... 3438 * we're close to using up the previous prefetch, then issue a 3439 * new pre-fetch I/O... 
the I/O latency will overlap 3440 * with the copying of the data 3441 */ 3442 if (size_of_prefetch > max_rd_size) 3443 size_of_prefetch = max_rd_size; 3444 3445 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); 3446 3447 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); 3448 3449 if (last_ioread_offset > last_request_offset) 3450 last_ioread_offset = last_request_offset; 3451 } 3452 3453 } else if ((uio->uio_offset + val_size) == last_request_offset) { 3454 /* 3455 * this transfer will finish this request, so... 3456 * let's try to read ahead if we're in 3457 * a sequential access pattern and we haven't 3458 * explicitly disabled it 3459 */ 3460 if (rd_ahead_enabled) 3461 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3462 3463 if (rap != NULL) { 3464 if (extent.e_addr < rap->cl_lastr) 3465 rap->cl_maxra = 0; 3466 rap->cl_lastr = extent.e_addr; 3467 } 3468 } 3469 lck_mtx_lock(cl_mtxp); 3470 3471 while (iostate.io_issued != iostate.io_completed) { 3472 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 3473 iostate.io_issued, iostate.io_completed, 0, 0, 0); 3474 3475 iostate.io_wanted = 1; 3476 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL); 3477 3478 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 3479 iostate.io_issued, iostate.io_completed, 0, 0, 0); 3480 } 3481 lck_mtx_unlock(cl_mtxp); 3482 3483 if (iostate.io_error) 3484 error = iostate.io_error; 3485 else { 3486 u_int32_t io_requested; 3487 3488 io_requested = val_size; 3489 3490 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested); 3491 3492 io_req_size -= (val_size - io_requested); 3493 } 3494 } 3495 if (start_pg < last_pg) { 3496 /* 3497 * compute the range of pages that we actually issued an I/O for 3498 * and either commit them as valid if the I/O succeeded 3499 * or abort them if the I/O failed or we're not supposed to 3500 * keep them in the cache 3501 */ 3502 io_size = (last_pg - start_pg) * PAGE_SIZE; 3503 3504 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0); 3505 3506 if (error || (flags & IO_NOCACHE)) 3507 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size, 3508 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 3509 else 3510 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, 3511 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE); 3512 3513 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0); 3514 } 3515 if ((last_pg - start_pg) < pages_in_upl) { 3516 /* 3517 * the set of pages that we issued an I/O for did not encompass 3518 * the entire upl... so just release these without modifying 3519 * their state 3520 */ 3521 if (error) 3522 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 3523 else { 3524 3525 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, 3526 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0); 3527 3528 /* 3529 * handle any valid pages at the beginning of 3530 * the upl... release these appropriately 3531 */ 3532 cluster_read_upl_release(upl, 0, start_pg, flags); 3533 3534 /* 3535 * handle any valid pages immediately after the 3536 * pages we issued I/O for... ... 
release these appropriately 3537 */ 3538 cluster_read_upl_release(upl, last_pg, uio_last, flags); 3539 3540 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, -1, -1, 0, 0); 3541 } 3542 } 3543 if (retval == 0) 3544 retval = error; 3545 3546 if (io_req_size) { 3547 if (cluster_hard_throttle_on(vp)) { 3548 rd_ahead_enabled = 0; 3549 prefetch_enabled = 0; 3550 3551 max_rd_size = HARD_THROTTLE_MAXSIZE; 3552 } else { 3553 if (max_rd_size == HARD_THROTTLE_MAXSIZE) { 3554 /* 3555 * coming out of throttled state 3556 */ 3557 if (rap != NULL) 3558 rd_ahead_enabled = 1; 3559 prefetch_enabled = 1; 3560 3561 max_rd_size = max_prefetch; 3562 last_ioread_offset = 0; 3563 } 3564 } 3565 } 3566 } 3567 if (rap != NULL) { 3568 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 3569 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); 3570 3571 lck_mtx_unlock(&rap->cl_lockr); 3572 } else { 3573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 3574 (int)uio->uio_offset, io_req_size, 0, retval, 0); 3575 } 3576 3577 return (retval); 3578} 3579 3580 3581static int 3582cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 3583 int flags, int (*callback)(buf_t, void *), void *callback_arg) 3584{ 3585 upl_t upl; 3586 upl_page_info_t *pl; 3587 off_t max_io_size; 3588 vm_offset_t upl_offset; 3589 vm_size_t upl_size; 3590 vm_size_t upl_needed_size; 3591 unsigned int pages_in_pl; 3592 int upl_flags; 3593 int bflag; 3594 kern_return_t kret; 3595 unsigned int i; 3596 int force_data_sync; 3597 int retval = 0; 3598 int no_zero_fill = 0; 3599 int abort_flag = 0; 3600 int io_flag = 0; 3601 int misaligned = 0; 3602 struct clios iostate; 3603 user_addr_t iov_base; 3604 u_int32_t io_req_size; 3605 u_int32_t offset_in_file; 3606 u_int32_t offset_in_iovbase; 3607 u_int32_t io_size; 3608 u_int32_t io_min; 3609 u_int32_t xsize; 3610 u_int32_t devblocksize; 3611 u_int32_t mem_alignment_mask; 3612 u_int32_t max_upl_size; 3613 u_int32_t max_rd_size; 3614 u_int32_t max_rd_ahead; 3615 3616 3617 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); 3618 3619 max_rd_size = max_upl_size; 3620 max_rd_ahead = max_rd_size * 2; 3621 3622 3623 if (flags & IO_PASSIVE) 3624 bflag = CL_PASSIVE; 3625 else 3626 bflag = 0; 3627 3628 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, 3629 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 3630 3631 iostate.io_completed = 0; 3632 iostate.io_issued = 0; 3633 iostate.io_error = 0; 3634 iostate.io_wanted = 0; 3635 3636 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 3637 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 3638 3639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 3640 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0); 3641 3642 if (devblocksize == 1) { 3643 /* 3644 * the AFP client advertises a devblocksize of 1 3645 * however, its BLOCKMAP routine maps to physical 3646 * blocks that are PAGE_SIZE in size... 3647 * therefore we can't ask for I/Os that aren't page aligned 3648 * or aren't multiples of PAGE_SIZE in size 3649 * by setting devblocksize to PAGE_SIZE, we re-instate 3650 * the old behavior we had before the mem_alignment_mask 3651 * changes went in... 
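 * a side effect worth noting: with devblocksize forced to PAGE_SIZE, the
 * (iov_base & (devblocksize - 1)) test below effectively requires the
 * user's buffer to be page aligned before the direct path will be used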
3652 */ 3653 devblocksize = PAGE_SIZE; 3654 } 3655next_dread: 3656 io_req_size = *read_length; 3657 iov_base = uio_curriovbase(uio); 3658 3659 max_io_size = filesize - uio->uio_offset; 3660 3661 if ((off_t)io_req_size > max_io_size) 3662 io_req_size = max_io_size; 3663 3664 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1); 3665 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 3666 3667 if (offset_in_file || offset_in_iovbase) { 3668 /* 3669 * one of the 2 important offsets is misaligned 3670 * so fire an I/O through the cache for this entire vector 3671 */ 3672 misaligned = 1; 3673 } 3674 if (iov_base & (devblocksize - 1)) { 3675 /* 3676 * the offset in memory must be on a device block boundary 3677 * so that we can guarantee that we can generate an 3678 * I/O that ends on a page boundary in cluster_io 3679 */ 3680 misaligned = 1; 3681 } 3682 /* 3683 * When we get to this point, we know... 3684 * -- the offset into the file is on a devblocksize boundary 3685 */ 3686 3687 while (io_req_size && retval == 0) { 3688 u_int32_t io_start; 3689 3690 if (cluster_hard_throttle_on(vp)) { 3691 max_rd_size = HARD_THROTTLE_MAXSIZE; 3692 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; 3693 } else { 3694 max_rd_size = max_upl_size; 3695 max_rd_ahead = max_rd_size * 2; 3696 } 3697 io_start = io_size = io_req_size; 3698 3699 /* 3700 * First look for pages already in the cache 3701 * and move them to user space. 3702 * 3703 * cluster_copy_ubc_data returns the resid 3704 * in io_size 3705 */ 3706 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); 3707 3708 /* 3709 * calculate the number of bytes actually copied 3710 * starting size - residual 3711 */ 3712 xsize = io_start - io_size; 3713 3714 io_req_size -= xsize; 3715 3716 /* 3717 * check to see if we are finished with this request... 3718 */ 3719 if (io_req_size == 0 || misaligned) { 3720 /* 3721 * see if there's another uio vector to 3722 * process that's of type IO_DIRECT 3723 * 3724 * break out of while loop to get there 3725 */ 3726 break; 3727 } 3728 /* 3729 * assume the request ends on a device block boundary 3730 */ 3731 io_min = devblocksize; 3732 3733 /* 3734 * we can handle I/O's in multiples of the device block size 3735 * however, if io_size isn't a multiple of devblocksize we 3736 * want to clip it back to the nearest page boundary since 3737 * we are going to have to go through cluster_read_copy to 3738 * deal with the 'overhang'... by clipping it to a PAGE_SIZE 3739 * multiple, we avoid asking the drive for the same physical 3740 * blocks twice.. once for the partial page at the end of the 3741 * request and a 2nd time for the page we read into the cache 3742 * (which overlaps the end of the direct read) in order to 3743 * get at the overhang bytes 3744 */ 3745 if (io_size & (devblocksize - 1)) { 3746 /* 3747 * request does NOT end on a device block boundary 3748 * so clip it back to a PAGE_SIZE boundary 3749 */ 3750 io_size &= ~PAGE_MASK; 3751 io_min = PAGE_SIZE; 3752 } 3753 if (retval || io_size < io_min) { 3754 /* 3755 * either an error or we only have the tail left to 3756 * complete via the copy path... 3757 * we may have already spun some portion of this request 3758 * off as async requests... 
we need to wait for the I/O 3759 * to complete before returning 3760 */ 3761 goto wait_for_dreads; 3762 } 3763 if ((xsize = io_size) > max_rd_size) 3764 xsize = max_rd_size; 3765 3766 io_size = 0; 3767 3768 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); 3769 3770 if (io_size == 0) { 3771 /* 3772 * a page must have just come into the cache 3773 * since the first page in this range is no 3774 * longer absent, go back and re-evaluate 3775 */ 3776 continue; 3777 } 3778 iov_base = uio_curriovbase(uio); 3779 3780 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 3781 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 3782 3783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, 3784 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 3785 3786 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) { 3787 no_zero_fill = 1; 3788 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY; 3789 } else { 3790 no_zero_fill = 0; 3791 abort_flag = UPL_ABORT_FREE_ON_EMPTY; 3792 } 3793 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 3794 pages_in_pl = 0; 3795 upl_size = upl_needed_size; 3796 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 3797 3798 if (no_zero_fill) 3799 upl_flags |= UPL_NOZEROFILL; 3800 if (force_data_sync) 3801 upl_flags |= UPL_FORCE_DATA_SYNC; 3802 3803 kret = vm_map_create_upl(current_map(), 3804 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 3805 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); 3806 3807 if (kret != KERN_SUCCESS) { 3808 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 3809 (int)upl_offset, upl_size, io_size, kret, 0); 3810 /* 3811 * failed to get pagelist 3812 * 3813 * we may have already spun some portion of this request 3814 * off as async requests... we need to wait for the I/O 3815 * to complete before returning 3816 */ 3817 goto wait_for_dreads; 3818 } 3819 pages_in_pl = upl_size / PAGE_SIZE; 3820 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 3821 3822 for (i = 0; i < pages_in_pl; i++) { 3823 if (!upl_valid_page(pl, i)) 3824 break; 3825 } 3826 if (i == pages_in_pl) 3827 break; 3828 3829 ubc_upl_abort(upl, abort_flag); 3830 } 3831 if (force_data_sync >= 3) { 3832 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 3833 (int)upl_offset, upl_size, io_size, kret, 0); 3834 3835 goto wait_for_dreads; 3836 } 3837 /* 3838 * Consider the possibility that upl_size wasn't satisfied. 
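 *
 * (the loop above makes up to three attempts to wire the user buffer:
 * call vm_map_create_upl, verify every returned page with upl_valid_page,
 * and if any page is missing abort the UPL and retry with
 * UPL_FORCE_DATA_SYNC set... if the resulting upl_size still comes back
 * short, the partial UPL is only usable when the user buffer started
 * page aligned -- upl_offset == 0 -- in which case io_size is clipped to
 * what was actually wired; otherwise io_size is forced to 0 and we bail
 * out to wait_for_dreads below)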
3839 */ 3840 if (upl_size < upl_needed_size) { 3841 if (upl_size && upl_offset == 0) 3842 io_size = upl_size; 3843 else 3844 io_size = 0; 3845 } 3846 if (io_size == 0) { 3847 ubc_upl_abort(upl, abort_flag); 3848 goto wait_for_dreads; 3849 } 3850 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 3851 (int)upl_offset, upl_size, io_size, kret, 0); 3852 3853 /* 3854 * request asynchronously so that we can overlap 3855 * the preparation of the next I/O 3856 * if there are already too many outstanding reads 3857 * wait until some have completed before issuing the next read 3858 */ 3859 lck_mtx_lock(cl_mtxp); 3860 3861 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { 3862 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 3863 iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); 3864 3865 iostate.io_wanted = 1; 3866 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); 3867 3868 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 3869 iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); 3870 } 3871 lck_mtx_unlock(cl_mtxp); 3872 3873 if (iostate.io_error) { 3874 /* 3875 * one of the earlier reads we issued ran into a hard error 3876 * don't issue any more reads, cleanup the UPL 3877 * that was just created but not used, then 3878 * go wait for any other reads to complete before 3879 * returning the error to the caller 3880 */ 3881 ubc_upl_abort(upl, abort_flag); 3882 3883 goto wait_for_dreads; 3884 } 3885 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, 3886 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); 3887 3888 if (no_zero_fill) 3889 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | bflag; 3890 else 3891 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | CL_PRESERVE | bflag; 3892 3893 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 3894 3895 /* 3896 * update the uio structure 3897 */ 3898 uio_update(uio, (user_size_t)io_size); 3899 3900 io_req_size -= io_size; 3901 3902 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, 3903 (int)upl, (int)uio->uio_offset, io_req_size, retval, 0); 3904 3905 } /* end while */ 3906 3907 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) { 3908 3909 retval = cluster_io_type(uio, read_type, read_length, 0); 3910 3911 if (retval == 0 && *read_type == IO_DIRECT) { 3912 3913 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 3914 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 3915 3916 goto next_dread; 3917 } 3918 } 3919 3920wait_for_dreads: 3921 if (iostate.io_issued) { 3922 /* 3923 * make sure all async reads that are part of this stream 3924 * have completed before we return 3925 */ 3926 lck_mtx_lock(cl_mtxp); 3927 3928 while (iostate.io_issued != iostate.io_completed) { 3929 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 3930 iostate.io_issued, iostate.io_completed, 0, 0, 0); 3931 3932 iostate.io_wanted = 1; 3933 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); 3934 3935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 3936 iostate.io_issued, iostate.io_completed, 0, 0, 0); 3937 } 3938 lck_mtx_unlock(cl_mtxp); 3939 } 3940 3941 if (iostate.io_error) 3942 retval = iostate.io_error; 3943 3944 if (io_req_size && retval == 0) { 3945 /* 3946 * we couldn't handle the tail of this request in DIRECT mode 3947 * so fire it 
through the copy path 3948 */ 3949 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg); 3950 3951 *read_type = IO_UNKNOWN; 3952 } 3953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, 3954 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0); 3955 3956 return (retval); 3957} 3958 3959 3960static int 3961cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 3962 int (*callback)(buf_t, void *), void *callback_arg, int flags) 3963{ 3964 upl_page_info_t *pl; 3965 upl_t upl[MAX_VECTS]; 3966 vm_offset_t upl_offset; 3967 addr64_t dst_paddr = 0; 3968 user_addr_t iov_base; 3969 off_t max_size; 3970 vm_size_t upl_size; 3971 vm_size_t upl_needed_size; 3972 mach_msg_type_number_t pages_in_pl; 3973 int upl_flags; 3974 kern_return_t kret; 3975 struct clios iostate; 3976 int error= 0; 3977 int cur_upl = 0; 3978 int num_upl = 0; 3979 int n; 3980 u_int32_t xsize; 3981 u_int32_t io_size; 3982 u_int32_t devblocksize; 3983 u_int32_t mem_alignment_mask; 3984 u_int32_t tail_size = 0; 3985 int bflag; 3986 3987 if (flags & IO_PASSIVE) 3988 bflag = CL_PASSIVE; 3989 else 3990 bflag = 0; 3991 3992 /* 3993 * When we enter this routine, we know 3994 * -- the read_length will not exceed the current iov_len 3995 * -- the target address is physically contiguous for read_length 3996 */ 3997 cluster_syncup(vp, filesize, callback, callback_arg); 3998 3999 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 4000 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 4001 4002 iostate.io_completed = 0; 4003 iostate.io_issued = 0; 4004 iostate.io_error = 0; 4005 iostate.io_wanted = 0; 4006 4007next_cread: 4008 io_size = *read_length; 4009 4010 max_size = filesize - uio->uio_offset; 4011 4012 if (io_size > max_size) 4013 io_size = max_size; 4014 4015 iov_base = uio_curriovbase(uio); 4016 4017 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 4018 upl_needed_size = upl_offset + io_size; 4019 4020 pages_in_pl = 0; 4021 upl_size = upl_needed_size; 4022 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 4023 4024 4025 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START, 4026 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0); 4027 4028 kret = vm_map_get_upl(current_map(), 4029 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4030 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 4031 4032 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END, 4033 (int)upl_offset, upl_size, io_size, kret, 0); 4034 4035 if (kret != KERN_SUCCESS) { 4036 /* 4037 * failed to get pagelist 4038 */ 4039 error = EINVAL; 4040 goto wait_for_creads; 4041 } 4042 num_upl++; 4043 4044 if (upl_size < upl_needed_size) { 4045 /* 4046 * The upl_size wasn't satisfied. 
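 *
 * (for the successful case below: any unaligned 'head' of the transfer is
 * peeled off first -- e.g. with devblocksize = 512 and uio_offset = 4660
 * the offset sits 52 bytes into a device block, so a 460 byte head is
 * moved synchronously via cluster_align_phys_io -- and likewise a 'tail'
 * of io_size & (devblocksize - 1) bytes is copied the same way once the
 * main device-memory reads have completed... the numbers here are purely
 * illustrative, not taken from the source)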
4047 */ 4048 error = EINVAL; 4049 goto wait_for_creads; 4050 } 4051 pl = ubc_upl_pageinfo(upl[cur_upl]); 4052 4053 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; 4054 4055 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 4056 u_int32_t head_size; 4057 4058 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 4059 4060 if (head_size > io_size) 4061 head_size = io_size; 4062 4063 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg); 4064 4065 if (error) 4066 goto wait_for_creads; 4067 4068 upl_offset += head_size; 4069 dst_paddr += head_size; 4070 io_size -= head_size; 4071 4072 iov_base += head_size; 4073 } 4074 if ((u_int32_t)iov_base & mem_alignment_mask) { 4075 /* 4076 * request doesn't set up on a memory boundary 4077 * the underlying DMA engine can handle... 4078 * return an error instead of going through 4079 * the slow copy path since the intent of this 4080 * path is direct I/O to device memory 4081 */ 4082 error = EINVAL; 4083 goto wait_for_creads; 4084 } 4085 4086 tail_size = io_size & (devblocksize - 1); 4087 4088 io_size -= tail_size; 4089 4090 while (io_size && error == 0) { 4091 4092 if (io_size > MAX_IO_CONTIG_SIZE) 4093 xsize = MAX_IO_CONTIG_SIZE; 4094 else 4095 xsize = io_size; 4096 /* 4097 * request asynchronously so that we can overlap 4098 * the preparation of the next I/O... we'll do 4099 * the commit after all the I/O has completed 4100 * since its all issued against the same UPL 4101 * if there are already too many outstanding reads 4102 * wait until some have completed before issuing the next 4103 */ 4104 if (iostate.io_issued) { 4105 lck_mtx_lock(cl_mtxp); 4106 4107 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) { 4108 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 4109 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); 4110 4111 iostate.io_wanted = 1; 4112 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); 4113 4114 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 4115 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); 4116 } 4117 lck_mtx_unlock(cl_mtxp); 4118 } 4119 if (iostate.io_error) { 4120 /* 4121 * one of the earlier reads we issued ran into a hard error 4122 * don't issue any more reads... 
4123 * go wait for any other reads to complete before 4124 * returning the error to the caller 4125 */ 4126 goto wait_for_creads; 4127 } 4128 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize, 4129 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag, 4130 (buf_t)NULL, &iostate, callback, callback_arg); 4131 /* 4132 * if the cluster_io read was issued successfully, 4133 * update the uio structure 4134 */ 4135 if (error == 0) { 4136 uio_update(uio, (user_size_t)xsize); 4137 4138 dst_paddr += xsize; 4139 upl_offset += xsize; 4140 io_size -= xsize; 4141 } 4142 } 4143 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) { 4144 4145 error = cluster_io_type(uio, read_type, read_length, 0); 4146 4147 if (error == 0 && *read_type == IO_CONTIG) { 4148 cur_upl++; 4149 goto next_cread; 4150 } 4151 } else 4152 *read_type = IO_UNKNOWN; 4153 4154wait_for_creads: 4155 /* 4156 * make sure all async reads that are part of this stream 4157 * have completed before we proceed 4158 */ 4159 lck_mtx_lock(cl_mtxp); 4160 4161 while (iostate.io_issued != iostate.io_completed) { 4162 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, 4163 iostate.io_issued, iostate.io_completed, 0, 0, 0); 4164 4165 iostate.io_wanted = 1; 4166 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); 4167 4168 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, 4169 iostate.io_issued, iostate.io_completed, 0, 0, 0); 4170 } 4171 lck_mtx_unlock(cl_mtxp); 4172 4173 if (iostate.io_error) 4174 error = iostate.io_error; 4175 4176 if (error == 0 && tail_size) 4177 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); 4178 4179 for (n = 0; n < num_upl; n++) 4180 /* 4181 * just release our hold on each physically contiguous 4182 * region without changing any state 4183 */ 4184 ubc_upl_abort(upl[n], 0); 4185 4186 return (error); 4187} 4188 4189 4190static int 4191cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length) 4192{ 4193 user_size_t iov_len; 4194 user_addr_t iov_base = 0; 4195 upl_t upl; 4196 vm_size_t upl_size; 4197 int upl_flags; 4198 int retval = 0; 4199 4200 /* 4201 * skip over any empty vectors 4202 */ 4203 uio_update(uio, (user_size_t)0); 4204 4205 iov_len = uio_curriovlen(uio); 4206 4207 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, (int)uio, (int)iov_len, 0, 0, 0); 4208 4209 if (iov_len) { 4210 iov_base = uio_curriovbase(uio); 4211 /* 4212 * make sure the size of the vector isn't too big...
4213 * internally, we want to handle all of the I/O in 4214 * chunk sizes that fit in a 32 bit int 4215 */ 4216 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) 4217 upl_size = MAX_IO_REQUEST_SIZE; 4218 else 4219 upl_size = (u_int32_t)iov_len; 4220 4221 upl_flags = UPL_QUERY_OBJECT_TYPE; 4222 4223 if ((vm_map_get_upl(current_map(), 4224 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4225 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { 4226 /* 4227 * the user app must have passed in an invalid address 4228 */ 4229 retval = EFAULT; 4230 } 4231 if (upl_size == 0) 4232 retval = EFAULT; 4233 4234 *io_length = upl_size; 4235 4236 if (upl_flags & UPL_PHYS_CONTIG) 4237 *io_type = IO_CONTIG; 4238 else if (iov_len >= min_length) 4239 *io_type = IO_DIRECT; 4240 else 4241 *io_type = IO_COPY; 4242 } else { 4243 /* 4244 * nothing left to do for this uio 4245 */ 4246 *io_length = 0; 4247 *io_type = IO_UNKNOWN; 4248 } 4249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, (int)iov_base, *io_type, *io_length, retval, 0); 4250 4251 return (retval); 4252} 4253 4254 4255/* 4256 * generate advisory I/O's in the largest chunks possible 4257 * the completed pages will be released into the VM cache 4258 */ 4259int 4260advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid) 4261{ 4262 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE); 4263} 4264 4265int 4266advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag) 4267{ 4268 upl_page_info_t *pl; 4269 upl_t upl; 4270 vm_offset_t upl_offset; 4271 int upl_size; 4272 off_t upl_f_offset; 4273 int start_offset; 4274 int start_pg; 4275 int last_pg; 4276 int pages_in_upl; 4277 off_t max_size; 4278 int io_size; 4279 kern_return_t kret; 4280 int retval = 0; 4281 int issued_io; 4282 int skip_range; 4283 uint32_t max_io_size; 4284 4285 4286 if ( !UBCINFOEXISTS(vp)) 4287 return(EINVAL); 4288 4289 if (resid < 0) 4290 return(EINVAL); 4291 4292 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 4293 4294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, 4295 (int)f_offset, resid, (int)filesize, 0, 0); 4296 4297 while (resid && f_offset < filesize && retval == 0) { 4298 /* 4299 * compute the size of the upl needed to encompass 4300 * the requested read... limit each call to cluster_io 4301 * to the maximum UPL size... 
cluster_io will clip if 4302 * this exceeds the maximum io_size for the device, 4303 * make sure to account for 4304 * a starting offset that's not page aligned 4305 */ 4306 start_offset = (int)(f_offset & PAGE_MASK_64); 4307 upl_f_offset = f_offset - (off_t)start_offset; 4308 max_size = filesize - f_offset; 4309 4310 if (resid < max_size) 4311 io_size = resid; 4312 else 4313 io_size = max_size; 4314 4315 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 4316 if ((uint32_t)upl_size > max_io_size) 4317 upl_size = max_io_size; 4318 4319 skip_range = 0; 4320 /* 4321 * return the number of contiguously present pages in the cache 4322 * starting at upl_f_offset within the file 4323 */ 4324 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range); 4325 4326 if (skip_range) { 4327 /* 4328 * skip over pages already present in the cache 4329 */ 4330 io_size = skip_range - start_offset; 4331 4332 f_offset += io_size; 4333 resid -= io_size; 4334 4335 if (skip_range == upl_size) 4336 continue; 4337 /* 4338 * have to issue some real I/O 4339 * at this point, we know it's starting on a page boundary 4340 * because we've skipped over at least the first page in the request 4341 */ 4342 start_offset = 0; 4343 upl_f_offset += skip_range; 4344 upl_size -= skip_range; 4345 } 4346 pages_in_upl = upl_size / PAGE_SIZE; 4347 4348 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START, 4349 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); 4350 4351 kret = ubc_create_upl(vp, 4352 upl_f_offset, 4353 upl_size, 4354 &upl, 4355 &pl, 4356 UPL_RET_ONLY_ABSENT | UPL_SET_LITE); 4357 if (kret != KERN_SUCCESS) 4358 return(retval); 4359 issued_io = 0; 4360 4361 /* 4362 * before we start marching forward, we must make sure we end on 4363 * a present page, otherwise we will be working with a freed 4364 * upl 4365 */ 4366 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 4367 if (upl_page_present(pl, last_pg)) 4368 break; 4369 } 4370 pages_in_upl = last_pg + 1; 4371 4372 4373 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END, 4374 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); 4375 4376 4377 for (last_pg = 0; last_pg < pages_in_upl; ) { 4378 /* 4379 * scan from the beginning of the upl looking for the first 4380 * page that is present.... this will become the first page in 4381 * the request we're going to make to 'cluster_io'... 
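 * (each present run [start_pg, last_pg) found by the two scans below is
 * turned into one asynchronous cluster_io request; for example, with a
 * present/absent pattern of P P A A P P P in the UPL, pages 0-1 and then
 * 4-6 would be issued as two separate reads -- purely an illustrative
 * pattern, not one taken from the source)...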
if all 4382 * of the pages are absent, we won't call through to 'cluster_io' 4383 */ 4384 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 4385 if (upl_page_present(pl, start_pg)) 4386 break; 4387 } 4388 4389 /* 4390 * scan from the starting present page looking for an absent 4391 * page before the end of the upl is reached, if we 4392 * find one, then it will terminate the range of pages being 4393 * presented to 'cluster_io' 4394 */ 4395 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 4396 if (!upl_page_present(pl, last_pg)) 4397 break; 4398 } 4399 4400 if (last_pg > start_pg) { 4401 /* 4402 * we found a range of pages that must be filled 4403 * if the last page in this range is the last page of the file 4404 * we may have to clip the size of it to keep from reading past 4405 * the end of the last physical block associated with the file 4406 */ 4407 upl_offset = start_pg * PAGE_SIZE; 4408 io_size = (last_pg - start_pg) * PAGE_SIZE; 4409 4410 if ((upl_f_offset + upl_offset + io_size) > filesize) 4411 io_size = filesize - (upl_f_offset + upl_offset); 4412 4413 /* 4414 * issue an asynchronous read to cluster_io 4415 */ 4416 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 4417 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 4418 4419 issued_io = 1; 4420 } 4421 } 4422 if (issued_io == 0) 4423 ubc_upl_abort(upl, 0); 4424 4425 io_size = upl_size - start_offset; 4426 4427 if (io_size > resid) 4428 io_size = resid; 4429 f_offset += io_size; 4430 resid -= io_size; 4431 } 4432 4433 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END, 4434 (int)f_offset, resid, retval, 0, 0); 4435 4436 return(retval); 4437} 4438 4439 4440int 4441cluster_push(vnode_t vp, int flags) 4442{ 4443 return cluster_push_ext(vp, flags, NULL, NULL); 4444} 4445 4446 4447int 4448cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg) 4449{ 4450 int retval; 4451 struct cl_writebehind *wbp; 4452 4453 if ( !UBCINFOEXISTS(vp)) { 4454 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0); 4455 return (0); 4456 } 4457 /* return if deferred write is set */ 4458 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) { 4459 return (0); 4460 } 4461 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { 4462 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0); 4463 return (0); 4464 } 4465 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) { 4466 lck_mtx_unlock(&wbp->cl_lockw); 4467 4468 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0); 4469 return(0); 4470 } 4471 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, 4472 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0); 4473 4474 if (wbp->cl_scmap) { 4475 sparse_cluster_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); 4476 4477 retval = 1; 4478 } else 4479 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); 4480 4481 lck_mtx_unlock(&wbp->cl_lockw); 4482 4483 if (flags & IO_SYNC) 4484 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push"); 4485 4486 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, 4487 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0); 4488 4489 return (retval); 4490} 4491 4492 4493__private_extern__ void 4494cluster_release(struct ubc_info *ubc) 4495{ 4496 struct cl_writebehind *wbp; 4497 struct 
cl_readahead *rap; 4498 4499 if ((wbp = ubc->cl_wbehind)) { 4500 4501 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); 4502 4503 if (wbp->cl_scmap) 4504 vfs_drt_control(&(wbp->cl_scmap), 0); 4505 } else { 4506 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0); 4507 } 4508 4509 rap = ubc->cl_rahead; 4510 4511 if (wbp != NULL) { 4512 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); 4513 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); 4514 } 4515 if ((rap = ubc->cl_rahead)) { 4516 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp); 4517 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD); 4518 } 4519 ubc->cl_rahead = NULL; 4520 ubc->cl_wbehind = NULL; 4521 4522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0); 4523} 4524 4525 4526static int 4527cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) 4528{ 4529 int cl_index; 4530 int cl_index1; 4531 int min_index; 4532 int cl_len; 4533 int cl_pushed = 0; 4534 struct cl_wextent l_clusters[MAX_CLUSTERS]; 4535 u_int max_cluster_pgcount; 4536 4537 4538 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 4539 /* 4540 * the write behind context exists and has 4541 * already been locked... 4542 */ 4543 if (wbp->cl_number == 0) 4544 /* 4545 * no clusters to push 4546 * return number of empty slots 4547 */ 4548 return (MAX_CLUSTERS); 4549 4550 /* 4551 * make a local 'sorted' copy of the clusters 4552 * and clear wbp->cl_number so that new clusters can 4553 * be developed 4554 */ 4555 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 4556 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) { 4557 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) 4558 continue; 4559 if (min_index == -1) 4560 min_index = cl_index1; 4561 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) 4562 min_index = cl_index1; 4563 } 4564 if (min_index == -1) 4565 break; 4566 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr; 4567 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr; 4568 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags; 4569 4570 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr; 4571 } 4572 wbp->cl_number = 0; 4573 4574 cl_len = cl_index; 4575 4576 if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) { 4577 int i; 4578 4579 /* 4580 * determine if we appear to be writing the file sequentially 4581 * if not, by returning without having pushed any clusters 4582 * we will cause this vnode to be pushed into the sparse cluster mechanism 4583 * used for managing more random I/O patterns 4584 * 4585 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them... 4586 * that's why we're in try_push with PUSH_DELAY... 4587 * 4588 * check to make sure that all the clusters except the last one are 'full'... and that each cluster 4589 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above 4590 * so we can just make a simple pass through, up to, but not including the last one... 4591 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they 4592 * are sequential 4593 * 4594 * we let the last one be partial as long as it was adjacent to the previous one... 
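 *
 * illustrative example (the numbers are hypothetical, not from the source):
 * with MAX_CLUSTERS == 4 and max_cluster_pgcount == 32, the sorted layout
 *	[  0, 32) [ 32, 64) [ 64, 96) [ 96,100)
 * passes the check -- the first three clusters are full and each one's
 * e_addr equals the next one's b_addr -- so the push proceeds, while
 *	[  0, 32) [ 40, 72) [ 72,104) [104,136)
 * fails at i == 0 (e_addr 32 != next b_addr 40), no clusters are pushed,
 * and the vnode is steered toward the sparse cluster mechanism...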
4595 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out 4596 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world... 4597 */ 4598 for (i = 0; i < MAX_CLUSTERS - 1; i++) { 4599 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) 4600 goto dont_try; 4601 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr) 4602 goto dont_try; 4603 } 4604 } 4605 for (cl_index = 0; cl_index < cl_len; cl_index++) { 4606 int flags; 4607 struct cl_extent cl; 4608 4609 /* 4610 * try to push each cluster in turn... 4611 */ 4612 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) 4613 flags = IO_NOCACHE; 4614 else 4615 flags = 0; 4616 4617 if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE)) 4618 flags |= IO_PASSIVE; 4619 4620 if (push_flag & PUSH_SYNC) 4621 flags |= IO_SYNC; 4622 4623 cl.b_addr = l_clusters[cl_index].b_addr; 4624 cl.e_addr = l_clusters[cl_index].e_addr; 4625 4626 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); 4627 4628 l_clusters[cl_index].b_addr = 0; 4629 l_clusters[cl_index].e_addr = 0; 4630 4631 cl_pushed++; 4632 4633 if ( !(push_flag & PUSH_ALL) ) 4634 break; 4635 } 4636dont_try: 4637 if (cl_len > cl_pushed) { 4638 /* 4639 * we didn't push all of the clusters, so 4640 * lets try to merge them back in to the vnode 4641 */ 4642 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) { 4643 /* 4644 * we picked up some new clusters while we were trying to 4645 * push the old ones... this can happen because I've dropped 4646 * the vnode lock... the sum of the 4647 * leftovers plus the new cluster count exceeds our ability 4648 * to represent them, so switch to the sparse cluster mechanism 4649 * 4650 * collect the active public clusters... 
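 *
 * (the sequence below is therefore: the first sparse_cluster_switch pulls
 * the clusters the vnode accumulated while the lock was dropped into the
 * sparse map, then the un-pushed leftovers in l_clusters are copied back
 * into wbp->cl_clusters, and a second sparse_cluster_switch moves those
 * into the sparse map as well, leaving wbp->cl_number at 0)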
4651 */ 4652 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 4653 4654 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) { 4655 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 4656 continue; 4657 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 4658 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 4659 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 4660 4661 cl_index1++; 4662 } 4663 /* 4664 * update the cluster count 4665 */ 4666 wbp->cl_number = cl_index1; 4667 4668 /* 4669 * and collect the original clusters that were moved into the 4670 * local storage for sorting purposes 4671 */ 4672 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 4673 4674 } else { 4675 /* 4676 * we've got room to merge the leftovers back in 4677 * just append them starting at the next 'hole' 4678 * represented by wbp->cl_number 4679 */ 4680 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) { 4681 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 4682 continue; 4683 4684 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 4685 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 4686 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 4687 4688 cl_index1++; 4689 } 4690 /* 4691 * update the cluster count 4692 */ 4693 wbp->cl_number = cl_index1; 4694 } 4695 } 4696 return (MAX_CLUSTERS - wbp->cl_number); 4697} 4698 4699 4700 4701static int 4702cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg) 4703{ 4704 upl_page_info_t *pl; 4705 upl_t upl; 4706 vm_offset_t upl_offset; 4707 int upl_size; 4708 off_t upl_f_offset; 4709 int pages_in_upl; 4710 int start_pg; 4711 int last_pg; 4712 int io_size; 4713 int io_flags; 4714 int upl_flags; 4715 int bflag; 4716 int size; 4717 int error = 0; 4718 int retval; 4719 kern_return_t kret; 4720 4721 if (flags & IO_PASSIVE) 4722 bflag = CL_PASSIVE; 4723 else 4724 bflag = 0; 4725 4726 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, 4727 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); 4728 4729 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) { 4730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0); 4731 4732 return (0); 4733 } 4734 upl_size = pages_in_upl * PAGE_SIZE; 4735 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 4736 4737 if (upl_f_offset + upl_size >= EOF) { 4738 4739 if (upl_f_offset >= EOF) { 4740 /* 4741 * must have truncated the file and missed 4742 * clearing a dangling cluster (i.e. it's completely 4743 * beyond the new EOF 4744 */ 4745 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0); 4746 4747 return(0); 4748 } 4749 size = EOF - upl_f_offset; 4750 4751 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 4752 pages_in_upl = upl_size / PAGE_SIZE; 4753 } else 4754 size = upl_size; 4755 4756 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0); 4757 4758 /* 4759 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior 4760 * 4761 * - only pages that are currently dirty are returned... these are the ones we need to clean 4762 * - the hardware dirty bit is cleared when the page is gathered into the UPL... 
the software dirty bit is set 4763 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page 4764 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if 4765 * someone dirties this page while the I/O is in progress, we don't lose track of the new state 4766 * 4767 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard) 4768 */ 4769 4770 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) 4771 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED; 4772 else 4773 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE; 4774 4775 kret = ubc_create_upl(vp, 4776 upl_f_offset, 4777 upl_size, 4778 &upl, 4779 &pl, 4780 upl_flags); 4781 if (kret != KERN_SUCCESS) 4782 panic("cluster_push: failed to get pagelist"); 4783 4784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0); 4785 4786 /* 4787 * since we only asked for the dirty pages back 4788 * it's possible that we may only get a few or even none, so... 4789 * before we start marching forward, we must make sure we know 4790 * where the last present page is in the UPL, otherwise we could 4791 * end up working with a freed upl due to the FREE_ON_EMPTY semantics 4792 * employed by commit_range and abort_range. 4793 */ 4794 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 4795 if (upl_page_present(pl, last_pg)) 4796 break; 4797 } 4798 pages_in_upl = last_pg + 1; 4799 4800 if (pages_in_upl == 0) { 4801 ubc_upl_abort(upl, 0); 4802 4803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0); 4804 return(0); 4805 } 4806 4807 for (last_pg = 0; last_pg < pages_in_upl; ) { 4808 /* 4809 * find the next dirty page in the UPL 4810 * this will become the first page in the 4811 * next I/O to generate 4812 */ 4813 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 4814 if (upl_dirty_page(pl, start_pg)) 4815 break; 4816 if (upl_page_present(pl, start_pg)) 4817 /* 4818 * RET_ONLY_DIRTY will return non-dirty 'precious' pages 4819 * just release these unchanged since we're not going 4820 * to steal them or change their state 4821 */ 4822 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); 4823 } 4824 if (start_pg >= pages_in_upl) 4825 /* 4826 * done... 
no more dirty pages to push 4827 */ 4828 break; 4829 if (start_pg > last_pg) 4830 /* 4831 * skipped over some non-dirty pages 4832 */ 4833 size -= ((start_pg - last_pg) * PAGE_SIZE); 4834 4835 /* 4836 * find a range of dirty pages to write 4837 */ 4838 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 4839 if (!upl_dirty_page(pl, last_pg)) 4840 break; 4841 } 4842 upl_offset = start_pg * PAGE_SIZE; 4843 4844 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE); 4845 4846 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag; 4847 4848 if ( !(flags & IO_SYNC)) 4849 io_flags |= CL_ASYNC; 4850 4851 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 4852 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 4853 4854 if (error == 0 && retval) 4855 error = retval; 4856 4857 size -= io_size; 4858 } 4859 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0); 4860 4861 return(error); 4862} 4863 4864 4865/* 4866 * sparse_cluster_switch is called with the write behind lock held 4867 */ 4868static void 4869sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 4870{ 4871 int cl_index; 4872 4873 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); 4874 4875 if (wbp->cl_scmap == NULL) 4876 wbp->cl_scdirty = 0; 4877 4878 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 4879 int flags; 4880 struct cl_extent cl; 4881 4882 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) { 4883 4884 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) { 4885 if (flags & UPL_POP_DIRTY) { 4886 cl.e_addr = cl.b_addr + 1; 4887 4888 sparse_cluster_add(wbp, vp, &cl, EOF, callback, callback_arg); 4889 } 4890 } 4891 } 4892 } 4893 wbp->cl_number = 0; 4894 4895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); 4896} 4897 4898 4899/* 4900 * sparse_cluster_push is called with the write behind lock held 4901 */ 4902static void 4903sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) 4904{ 4905 struct cl_extent cl; 4906 off_t offset; 4907 u_int length; 4908 4909 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_flag, 0); 4910 4911 if (push_flag & PUSH_ALL) 4912 vfs_drt_control(&(wbp->cl_scmap), 1); 4913 4914 for (;;) { 4915 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS) 4916 break; 4917 4918 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); 4919 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); 4920 4921 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr); 4922 4923 cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg); 4924 4925 if ( !(push_flag & PUSH_ALL) ) 4926 break; 4927 } 4928 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); 4929} 4930 4931 4932/* 4933 * sparse_cluster_add is called with the write behind lock held 4934 */ 4935static void 4936sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 4937{ 4938 u_int new_dirty; 4939 u_int length; 4940 off_t offset; 4941 4942 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, 
(int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0); 4943 4944 offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 4945 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE; 4946 4947 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) { 4948 /* 4949 * no room left in the map 4950 * only a partial update was done 4951 * push out some pages and try again 4952 */ 4953 wbp->cl_scdirty += new_dirty; 4954 4955 sparse_cluster_push(wbp, vp, EOF, 0, callback, callback_arg); 4956 4957 offset += (new_dirty * PAGE_SIZE_64); 4958 length -= (new_dirty * PAGE_SIZE); 4959 } 4960 wbp->cl_scdirty += new_dirty; 4961 4962 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); 4963} 4964 4965 4966static int 4967cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 4968{ 4969 upl_page_info_t *pl; 4970 upl_t upl; 4971 addr64_t ubc_paddr; 4972 kern_return_t kret; 4973 int error = 0; 4974 int did_read = 0; 4975 int abort_flags; 4976 int upl_flags; 4977 int bflag; 4978 4979 if (flags & IO_PASSIVE) 4980 bflag = CL_PASSIVE; 4981 else 4982 bflag = 0; 4983 4984 upl_flags = UPL_SET_LITE; 4985 4986 if ( !(flags & CL_READ) ) { 4987 /* 4988 * "write" operation: let the UPL subsystem know 4989 * that we intend to modify the buffer cache pages 4990 * we're gathering. 4991 */ 4992 upl_flags |= UPL_WILL_MODIFY; 4993 } else { 4994 /* 4995 * indicate that there is no need to pull the 4996 * mapping for this page... we're only going 4997 * to read from it, not modify it. 4998 */ 4999 upl_flags |= UPL_FILE_IO; 5000 } 5001 kret = ubc_create_upl(vp, 5002 uio->uio_offset & ~PAGE_MASK_64, 5003 PAGE_SIZE, 5004 &upl, 5005 &pl, 5006 upl_flags); 5007 5008 if (kret != KERN_SUCCESS) 5009 return(EINVAL); 5010 5011 if (!upl_valid_page(pl, 0)) { 5012 /* 5013 * issue a synchronous read to cluster_io 5014 */ 5015 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5016 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5017 if (error) { 5018 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 5019 5020 return(error); 5021 } 5022 did_read = 1; 5023 } 5024 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); 5025 5026/* 5027 * NOTE: There is no prototype for the following in BSD. It, and the definitions 5028 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in 5029 * osfmk/ppc/mappings.h. They are not included here because there appears to be no 5030 * way to do so without exporting them to kexts as well. 
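 *
 * For what it's worth, the literal flag values used below line up with the
 * commented-out symbolic calls: cppvPsnk = 1, cppvPsrc = 2, cppvFsnk = 4 and
 * cppvFsrc = 8, so 2 | 1 | 4 requests a physical-to-physical copy that
 * flushes the destination and 2 | 1 | 8 one that flushes the source.
 * (This mapping is inferred from the commented-out calls, not from the
 * missing header.)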
5031 */ 5032 if (flags & CL_READ) 5033// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */ 5034 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */ 5035 else 5036// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */ 5037 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */ 5038 5039 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) { 5040 /* 5041 * issue a synchronous write to cluster_io 5042 */ 5043 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5044 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5045 } 5046 if (error == 0) 5047 uio_update(uio, (user_size_t)xsize); 5048 5049 if (did_read) 5050 abort_flags = UPL_ABORT_FREE_ON_EMPTY; 5051 else 5052 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; 5053 5054 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags); 5055 5056 return (error); 5057} 5058 5059 5060 5061int 5062cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) 5063{ 5064 int pg_offset; 5065 int pg_index; 5066 int csize; 5067 int segflg; 5068 int retval = 0; 5069 int xsize; 5070 upl_page_info_t *pl; 5071 5072 xsize = *io_resid; 5073 5074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5075 (int)uio->uio_offset, upl_offset, xsize, 0, 0); 5076 5077 segflg = uio->uio_segflg; 5078 5079 switch(segflg) { 5080 5081 case UIO_USERSPACE32: 5082 case UIO_USERISPACE32: 5083 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5084 break; 5085 5086 case UIO_USERSPACE: 5087 case UIO_USERISPACE: 5088 uio->uio_segflg = UIO_PHYS_USERSPACE; 5089 break; 5090 5091 case UIO_USERSPACE64: 5092 case UIO_USERISPACE64: 5093 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5094 break; 5095 5096 case UIO_SYSSPACE32: 5097 uio->uio_segflg = UIO_PHYS_SYSSPACE32; 5098 break; 5099 5100 case UIO_SYSSPACE: 5101 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5102 break; 5103 5104 case UIO_SYSSPACE64: 5105 uio->uio_segflg = UIO_PHYS_SYSSPACE64; 5106 break; 5107 } 5108 pl = ubc_upl_pageinfo(upl); 5109 5110 pg_index = upl_offset / PAGE_SIZE; 5111 pg_offset = upl_offset & PAGE_MASK; 5112 csize = min(PAGE_SIZE - pg_offset, xsize); 5113 5114 while (xsize && retval == 0) { 5115 addr64_t paddr; 5116 5117 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset; 5118 5119 retval = uiomove64(paddr, csize, uio); 5120 5121 pg_index += 1; 5122 pg_offset = 0; 5123 xsize -= csize; 5124 csize = min(PAGE_SIZE, xsize); 5125 } 5126 *io_resid = xsize; 5127 5128 uio->uio_segflg = segflg; 5129 5130 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5131 (int)uio->uio_offset, xsize, retval, segflg, 0); 5132 5133 return (retval); 5134} 5135 5136 5137int 5138cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty) 5139{ 5140 5141 return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1)); 5142} 5143 5144 5145static int 5146cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference) 5147{ 5148 int segflg; 5149 int io_size; 5150 int xsize; 5151 int start_offset; 5152 int retval = 0; 5153 memory_object_control_t control; 5154 5155 io_size = *io_resid; 5156 5157 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5158 (int)uio->uio_offset, 0, io_size, 0, 0); 5159 5160 control = ubc_getobject(vp, 
UBC_FLAGS_NONE); 5161 5162 if (control == MEMORY_OBJECT_CONTROL_NULL) { 5163 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5164 (int)uio->uio_offset, io_size, retval, 3, 0); 5165 5166 return(0); 5167 } 5168 segflg = uio->uio_segflg; 5169 5170 switch(segflg) { 5171 5172 case UIO_USERSPACE32: 5173 case UIO_USERISPACE32: 5174 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5175 break; 5176 5177 case UIO_USERSPACE64: 5178 case UIO_USERISPACE64: 5179 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5180 break; 5181 5182 case UIO_SYSSPACE32: 5183 uio->uio_segflg = UIO_PHYS_SYSSPACE32; 5184 break; 5185 5186 case UIO_SYSSPACE64: 5187 uio->uio_segflg = UIO_PHYS_SYSSPACE64; 5188 break; 5189 5190 case UIO_USERSPACE: 5191 case UIO_USERISPACE: 5192 uio->uio_segflg = UIO_PHYS_USERSPACE; 5193 break; 5194 5195 case UIO_SYSSPACE: 5196 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5197 break; 5198 } 5199 5200 if ( (io_size = *io_resid) ) { 5201 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 5202 xsize = uio_resid(uio); 5203 5204 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio, 5205 start_offset, io_size, mark_dirty, take_reference); 5206 xsize -= uio_resid(uio); 5207 io_size -= xsize; 5208 } 5209 uio->uio_segflg = segflg; 5210 *io_resid = io_size; 5211 5212 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5213 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0); 5214 5215 return(retval); 5216} 5217 5218 5219int 5220is_file_clean(vnode_t vp, off_t filesize) 5221{ 5222 off_t f_offset; 5223 int flags; 5224 int total_dirty = 0; 5225 5226 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) { 5227 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) { 5228 if (flags & UPL_POP_DIRTY) { 5229 total_dirty++; 5230 } 5231 } 5232 } 5233 if (total_dirty) 5234 return(EINVAL); 5235 5236 return (0); 5237} 5238 5239 5240 5241/* 5242 * Dirty region tracking/clustering mechanism. 5243 * 5244 * This code (vfs_drt_*) provides a mechanism for tracking and clustering 5245 * dirty regions within a larger space (file). It is primarily intended to 5246 * support clustering in large files with many dirty areas. 5247 * 5248 * The implementation assumes that the dirty regions are pages. 5249 * 5250 * To represent dirty pages within the file, we store bit vectors in a 5251 * variable-size circular hash. 5252 */ 5253 5254/* 5255 * Bitvector size. This determines the number of pages we group in a 5256 * single hashtable entry. Each hashtable entry is aligned to this 5257 * size within the file. 5258 */ 5259#define DRT_BITVECTOR_PAGES 256 5260 5261/* 5262 * File offset handling. 5263 * 5264 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; 5265 * the correct formula is (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1) 5266 */ 5267#define DRT_ADDRESS_MASK (~((1 << 20) - 1)) 5268#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK) 5269 5270/* 5271 * Hashtable address field handling. 5272 * 5273 * The low-order bits of the hashtable address are used to conserve 5274 * space. 5275 * 5276 * DRT_HASH_COUNT_MASK must be large enough to store the range 5277 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value 5278 * to indicate that the bucket is actually unoccupied. 
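 *
 * worked example (assuming 4K pages): each hashtable entry covers
 *	DRT_BITVECTOR_PAGES * PAGE_SIZE = 256 * 4096 = 1048576 = 1 << 20 bytes,
 * which is why DRT_ADDRESS_MASK is written as ~((1 << 20) - 1) -- it simply
 * clears the low 20 bits of a file offset.  DRT_HASH_COUNT_MASK (0x1ff,
 * i.e. 511) is wide enough to hold any page count from 0 through 256 and
 * still leaves the value 511 itself free to serve as the 'vacant' sentinel
 * used by DRT_HASH_VACATE and DRT_HASH_VACANT below.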
5279 */ 5280#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK) 5281#define DRT_HASH_SET_ADDRESS(scm, i, a) \ 5282 do { \ 5283 (scm)->scm_hashtable[(i)].dhe_control = \ 5284 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \ 5285 } while (0) 5286#define DRT_HASH_COUNT_MASK 0x1ff 5287#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK) 5288#define DRT_HASH_SET_COUNT(scm, i, c) \ 5289 do { \ 5290 (scm)->scm_hashtable[(i)].dhe_control = \ 5291 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \ 5292 } while (0) 5293#define DRT_HASH_CLEAR(scm, i) \ 5294 do { \ 5295 (scm)->scm_hashtable[(i)].dhe_control = 0; \ 5296 } while (0) 5297#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK) 5298#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK) 5299#define DRT_HASH_COPY(oscm, oi, scm, i) \ 5300 do { \ 5301 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \ 5302 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \ 5303 } while(0); 5304 5305 5306/* 5307 * Hash table moduli. 5308 * 5309 * Since the hashtable entry's size is dependent on the size of 5310 * the bitvector, and since the hashtable size is constrained to 5311 * both being prime and fitting within the desired allocation 5312 * size, these values need to be manually determined. 5313 * 5314 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes. 5315 * 5316 * The small hashtable allocation is 1024 bytes, so the modulus is 23. 5317 * The large hashtable allocation is 16384 bytes, so the modulus is 401. 5318 */ 5319#define DRT_HASH_SMALL_MODULUS 23 5320#define DRT_HASH_LARGE_MODULUS 401 5321 5322#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ 5323#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ 5324 5325/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */ 5326 5327/* 5328 * Hashtable bitvector handling. 5329 * 5330 * Bitvector fields are 32 bits long. 5331 */ 5332 5333#define DRT_HASH_SET_BIT(scm, i, bit) \ 5334 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32)) 5335 5336#define DRT_HASH_CLEAR_BIT(scm, i, bit) \ 5337 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32)) 5338 5339#define DRT_HASH_TEST_BIT(scm, i, bit) \ 5340 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32))) 5341 5342#define DRT_BITVECTOR_CLEAR(scm, i) \ 5343 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) 5344 5345#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \ 5346 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \ 5347 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \ 5348 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) 5349 5350 5351 5352/* 5353 * Hashtable entry. 5354 */ 5355struct vfs_drt_hashentry { 5356 u_int64_t dhe_control; 5357 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; 5358}; 5359 5360/* 5361 * Dirty Region Tracking structure. 5362 * 5363 * The hashtable is allocated entirely inside the DRT structure. 5364 * 5365 * The hash is a simple circular prime modulus arrangement, the structure 5366 * is resized from small to large if it overflows.
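 *
 * Sizing, for reference (derived from the definitions above): each
 * vfs_drt_hashentry is
 *	sizeof(u_int64_t) + (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)
 *	    = 8 + 8 * 4 = 40 bytes,
 * so the 1024 byte small allocation holds the 23-entry table
 * (23 * 40 = 920 bytes) and the 16384 byte large allocation holds the
 * 401-entry table (401 * 40 = 16040 bytes); both moduli are prime, as the
 * circular hash requires.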
5367 */ 5368 5369struct vfs_drt_clustermap { 5370 u_int32_t scm_magic; /* sanity/detection */ 5371#define DRT_SCM_MAGIC 0x12020003 5372 u_int32_t scm_modulus; /* current ring size */ 5373 u_int32_t scm_buckets; /* number of occupied buckets */ 5374 u_int32_t scm_lastclean; /* last entry we cleaned */ 5375 u_int32_t scm_iskips; /* number of slot skips */ 5376 5377 struct vfs_drt_hashentry scm_hashtable[0]; 5378}; 5379 5380 5381#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus) 5382#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus) 5383 5384/* 5385 * Debugging codes and arguments. 5386 */ 5387#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */ 5388#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */ 5389#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */ 5390#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */ 5391#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length, 5392 * dirty */ 5393 /* 0, setcount */ 5394 /* 1 (clean, no map) */ 5395 /* 2 (map alloc fail) */ 5396 /* 3, resid (partial) */ 5397#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87)) 5398#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets, 5399 * lastclean, iskips */ 5400 5401 5402static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp); 5403static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap); 5404static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap, 5405 u_int64_t offset, int *indexp); 5406static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, 5407 u_int64_t offset, 5408 int *indexp, 5409 int recursed); 5410static kern_return_t vfs_drt_do_mark_pages( 5411 void **cmapp, 5412 u_int64_t offset, 5413 u_int length, 5414 u_int *setcountp, 5415 int dirty); 5416static void vfs_drt_trace( 5417 struct vfs_drt_clustermap *cmap, 5418 int code, 5419 int arg1, 5420 int arg2, 5421 int arg3, 5422 int arg4); 5423 5424 5425/* 5426 * Allocate and initialise a sparse cluster map. 5427 * 5428 * Will allocate a new map, resize or compact an existing map. 5429 * 5430 * XXX we should probably have at least one intermediate map size, 5431 * as the 1:16 ratio seems a bit drastic. 5432 */ 5433static kern_return_t 5434vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) 5435{ 5436 struct vfs_drt_clustermap *cmap, *ocmap; 5437 kern_return_t kret; 5438 u_int64_t offset; 5439 u_int32_t i; 5440 int nsize, active_buckets, index, copycount; 5441 5442 ocmap = NULL; 5443 if (cmapp != NULL) 5444 ocmap = *cmapp; 5445 5446 /* 5447 * Decide on the size of the new map. 5448 */ 5449 if (ocmap == NULL) { 5450 nsize = DRT_HASH_SMALL_MODULUS; 5451 } else { 5452 /* count the number of active buckets in the old map */ 5453 active_buckets = 0; 5454 for (i = 0; i < ocmap->scm_modulus; i++) { 5455 if (!DRT_HASH_VACANT(ocmap, i) && 5456 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) 5457 active_buckets++; 5458 } 5459 /* 5460 * If we're currently using the small allocation, check to 5461 * see whether we should grow to the large one. 5462 */ 5463 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { 5464 /* if the ring is nearly full */ 5465 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) { 5466 nsize = DRT_HASH_LARGE_MODULUS; 5467 } else { 5468 nsize = DRT_HASH_SMALL_MODULUS; 5469 } 5470 } else { 5471 /* already using the large modulus */ 5472 nsize = DRT_HASH_LARGE_MODULUS; 5473 /* 5474 * If the ring is completely full, there's 5475 * nothing useful for us to do. 
Behave as 5476 * though we had compacted into the new 5477 * array and return. 5478 */ 5479 if (active_buckets >= DRT_HASH_LARGE_MODULUS) 5480 return(KERN_SUCCESS); 5481 } 5482 } 5483 5484 /* 5485 * Allocate and initialise the new map. 5486 */ 5487 5488 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, 5489 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 5490 if (kret != KERN_SUCCESS) 5491 return(kret); 5492 cmap->scm_magic = DRT_SCM_MAGIC; 5493 cmap->scm_modulus = nsize; 5494 cmap->scm_buckets = 0; 5495 cmap->scm_lastclean = 0; 5496 cmap->scm_iskips = 0; 5497 for (i = 0; i < cmap->scm_modulus; i++) { 5498 DRT_HASH_CLEAR(cmap, i); 5499 DRT_HASH_VACATE(cmap, i); 5500 DRT_BITVECTOR_CLEAR(cmap, i); 5501 } 5502 5503 /* 5504 * If there's an old map, re-hash entries from it into the new map. 5505 */ 5506 copycount = 0; 5507 if (ocmap != NULL) { 5508 for (i = 0; i < ocmap->scm_modulus; i++) { 5509 /* skip empty buckets */ 5510 if (DRT_HASH_VACANT(ocmap, i) || 5511 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) 5512 continue; 5513 /* get new index */ 5514 offset = DRT_HASH_GET_ADDRESS(ocmap, i); 5515 kret = vfs_drt_get_index(&cmap, offset, &index, 1); 5516 if (kret != KERN_SUCCESS) { 5517 /* XXX need to bail out gracefully here */ 5518 panic("vfs_drt: new cluster map mysteriously too small"); 5519 index = 0; 5520 } 5521 /* copy */ 5522 DRT_HASH_COPY(ocmap, i, cmap, index); 5523 copycount++; 5524 } 5525 } 5526 5527 /* log what we've done */ 5528 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0); 5529 5530 /* 5531 * It's important to ensure that *cmapp always points to 5532 * a valid map, so we must overwrite it before freeing 5533 * the old map. 5534 */ 5535 *cmapp = cmap; 5536 if (ocmap != NULL) { 5537 /* emit stats into trace buffer */ 5538 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA, 5539 ocmap->scm_modulus, 5540 ocmap->scm_buckets, 5541 ocmap->scm_lastclean, 5542 ocmap->scm_iskips); 5543 5544 vfs_drt_free_map(ocmap); 5545 } 5546 return(KERN_SUCCESS); 5547} 5548 5549 5550/* 5551 * Free a sparse cluster map. 5552 */ 5553static kern_return_t 5554vfs_drt_free_map(struct vfs_drt_clustermap *cmap) 5555{ 5556 kmem_free(kernel_map, (vm_offset_t)cmap, 5557 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 5558 return(KERN_SUCCESS); 5559} 5560 5561 5562/* 5563 * Find the hashtable slot currently occupied by an entry for the supplied offset. 5564 */ 5565static kern_return_t 5566vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp) 5567{ 5568 int index; 5569 u_int32_t i; 5570 5571 offset = DRT_ALIGN_ADDRESS(offset); 5572 index = DRT_HASH(cmap, offset); 5573 5574 /* traverse the hashtable */ 5575 for (i = 0; i < cmap->scm_modulus; i++) { 5576 5577 /* 5578 * If the slot is vacant, we can stop. 5579 */ 5580 if (DRT_HASH_VACANT(cmap, index)) 5581 break; 5582 5583 /* 5584 * If the address matches our offset, we have success. 5585 */ 5586 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) { 5587 *indexp = index; 5588 return(KERN_SUCCESS); 5589 } 5590 5591 /* 5592 * Move to the next slot, try again. 5593 */ 5594 index = DRT_HASH_NEXT(cmap, index); 5595 } 5596 /* 5597 * It's not there. 5598 */ 5599 return(KERN_FAILURE); 5600} 5601 5602/* 5603 * Find the hashtable slot for the supplied offset. If we haven't allocated 5604 * one yet, allocate one and populate the address field. 
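 * (Allocation is a linear probe: the aligned offset is hashed with
 * DRT_HASH, then the ring is walked with DRT_HASH_NEXT looking for a
 * vacant or zero-count slot, bumping scm_iskips along the way; if the
 * whole ring is scanned without finding one, the map is grown or compacted
 * via vfs_drt_alloc_map and the insert is retried exactly once.)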
 * Note that it will not have a nonzero page count and thus will still
 * technically be free, so in the case where we are called to clean pages,
 * the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	u_int32_t	index;
	u_int32_t	i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}

/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
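		 *
		 * As a hypothetical worked example (assuming 4KB pages and a
		 * DRT_BITVECTOR_PAGES value of 256, i.e. a 1MB span per hash
		 * entry): an offset 12KB into the entry's span gives
		 * pgoff = 3, and a remaining length of 2MB yields
		 * pgcount = min(512, 253) = 253, so only the pages that fall
		 * inside this entry are touched and the loop advances to the
		 * next entry for the remainder.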
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirtying/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}

/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

#if 0
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
#endif

/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.
 * Private storage will be released if there are no more dirty pages left in
 * the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	u_int32_t	j;
	int		index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything: the hashtable is empty, so emit stats
	 * into the trace buffer and then free it.
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	    cmap->scm_modulus,
	    cmap->scm_buckets,
	    cmap->scm_lastclean,
	    cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}


static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		    cmap->scm_modulus,
		    cmap->scm_buckets,
		    cmap->scm_lastclean,
		    cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}



/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	      __unused int arg1, __unused int arg2, __unused int arg3,
	      __unused int arg4)
{
}
#endif

#if 0
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
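 * The summary count kept in each hash entry is expected to equal the number
 * of bits set in that entry's bitvector; a mismatch means the count and the
 * bitvector were updated inconsistently.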
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d, index = %d\n", bits_on, index);
	}
}
#endif
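
#if 0
/*
 * Illustrative sketch only, not part of the code above and never called:
 * a minimal example of how a caller might drive the vfs_drt public
 * interface, assuming it may block for map allocation.  The function name
 * and the push_dirty_cluster() helper mentioned in the comment are
 * hypothetical; the real callers earlier in this file keep their own
 * opaque map pointer in their write-behind state.
 */
static void
vfs_drt_example_usage(void)
{
	void	*scmap = NULL;		/* opaque map, owned by vfs_drt_* */
	off_t	offset;
	u_int	length, new_dirty;

	/* mark two page-aligned, PAGE_SIZE-multiple ranges dirty */
	(void) vfs_drt_mark_pages(&scmap, 0, 4 * PAGE_SIZE, &new_dirty);
	(void) vfs_drt_mark_pages(&scmap, (off_t)16 * PAGE_SIZE, 2 * PAGE_SIZE, &new_dirty);

	/*
	 * Drain the map: each call returns one contiguous run of dirty
	 * pages and marks it clean; once nothing dirty remains the map is
	 * freed, *cmapp is set to NULL and KERN_FAILURE is returned.
	 */
	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* push_dirty_cluster(vp, offset, length);  -- hypothetical */
	}

	/* alternatively, discard any remaining state explicitly */
	if (scmap != NULL)
		(void) vfs_drt_control(&scmap, 0);
}
#endif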