/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>
#include <kern/task.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <sys/sdt.h>

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif


#define CL_READ		0x01
#define CL_WRITE	0x02
#define CL_ASYNC	0x04
#define CL_COMMIT	0x08
#define CL_PAGEOUT	0x10
#define CL_AGE		0x20
#define CL_NOZERO	0x40
#define CL_PAGEIN	0x80
#define CL_DEV_MEMORY	0x100
#define CL_PRESERVE	0x200
#define CL_THROTTLE	0x400
#define CL_KEEPCACHED	0x800
#define CL_DIRECT_IO	0x1000
#define CL_PASSIVE	0x2000
#define CL_IOSTREAMING	0x4000
#define CL_CLOSE	0x8000
#define CL_ENCRYPTED	0x10000
#define CL_RAW_ENCRYPTED	0x20000
#define CL_NOCACHE	0x40000

#define MAX_VECTOR_UPL_ELEMENTS	8
#define MAX_VECTOR_UPL_SIZE	((2 * MAX_UPL_SIZE) * PAGE_SIZE)

extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);

struct clios {
	lck_mtx_t io_mtxp;
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};

static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_transaction_mtxp;


#define	IO_UNKNOWN	0
#define	IO_DIRECT	1
#define IO_CONTIG	2
#define IO_COPY		3

#define	PUSH_DELAY	0x01
#define PUSH_ALL	0x02
#define	PUSH_SYNC	0x04


static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
			     int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
			      off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
				int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
				int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void	cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int	cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg);

static void	sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);


/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;


/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	(MAX_UPL_SIZE * PAGE_SIZE)
#define MAX_VECTS		16
#define MIN_DIRECT_WRITE_SIZE	(4 * PAGE_SIZE)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#define PREFETCH		3
#define PREFETCH_SSD		1
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3);		/* maximum number of pages to prefetch */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use for a speculative read-ahead on SSDs */


#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))

int	ignore_is_ssd = 0;
int	speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT	0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");


void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");
}


uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch (type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > MAX_UPL_SIZE) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
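
/*
 * Worked example (illustrative numbers only, assuming a 4KB PAGE_SIZE):
 * a mount reporting mnt_segreadcnt = 32 and mnt_maxreadcnt = 524288
 * yields min(32 * 4096, 524288) = 131072 bytes, which is then raised
 * to the old fixed floor of (MAX_UPL_TRANSFER * PAGE_SIZE) if it falls
 * below it.  Callers simply ask:
 *
 *	uint32_t rd_limit = cluster_max_io_size(mp, CL_READ);
 *	uint32_t wr_limit = cluster_max_io_size(mp, CL_WRITE);
 */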



#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}


static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}


static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t	  io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
			return(0);

		if (io_size == 0)
			return (0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return(1);
	}
	return(0);
}
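
/*
 * Usage sketch for the try-lock contract of cluster_get_rap() above (the
 * caller shown is hypothetical; the cached read path in this file follows
 * the same shape): read-ahead is strictly best-effort, so a NULL return
 * simply means another reader owns the context and we proceed without it.
 *
 *	struct cl_readahead *rap;
 *
 *	if ((rap = cluster_get_rap(vp)) != NULL) {
 *		// we own cl_lockr... safe to grow the read-ahead window
 *		cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
 *		rap->cl_lastr = extent.e_addr;
 *		lck_mtx_unlock(&rap->cl_lockr);
 *	}
 */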

static int
cluster_is_throttled(vnode_t vp)
{
	return (throttle_io_will_be_throttled(-1, vp->v_mount));
}


static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{

	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}


static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in  = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}
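
/*
 * Sketch of the clios handshake between cluster_iostate_wait() above and
 * cluster_iodone() below (illustrative; 'max_inflight' and the wait names
 * are made up): the issuing side bumps io_issued as it queues I/O,
 * cluster_iodone() bumps io_completed, and a streaming caller paces
 * itself by waiting for the in-flight delta to drop to a target:
 *
 *	struct clios iostate;
 *
 *	// hold the in-flight window to max_inflight bytes
 *	cluster_iostate_wait(&iostate, max_inflight, "stream_wait");
 *	cluster_io(vp, upl, upl_offset, f_offset, io_size,
 *		   CL_READ | CL_ASYNC, NULL, &iostate, callback, callback_arg);
 *	...
 *	// drain everything we issued before tearing down
 *	cluster_iostate_wait(&iostate, 0, "stream_drain");
 *
 * The direct read/write paths declared in this file follow this pattern.
 */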

static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	vnode_t	vp;
	struct	clios *iostate;
	boolean_t	transaction_complete = FALSE;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		boolean_t	need_wakeup = FALSE;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		if (bp->b_flags & B_TWANTED) {
			CLR(bp->b_flags, B_TWANTED);
			need_wakeup = TRUE;
		}
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				if (need_wakeup == TRUE)
					wakeup(bp);

				return 0;
			}
			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (need_wakeup == TRUE)
			wakeup(bp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp        = cbp_head;
	vp         = cbp->b_vp;
	upl_offset = cbp->b_uploffset;
	upl        = cbp->b_upl;
	b_flags    = cbp->b_flags;
	real_bp    = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate    = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int	(*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}

	if (b_flags & B_COMMIT_UPL) {

		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}


uint32_t
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
	if (cluster_is_throttled(vp)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}
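
/*
 * Hypothetical caller sketch (names invented): a filesystem can ask
 * whether this vnode is currently subject to lowpri throttling and,
 * if so, clamp the I/O it is about to build:
 *
 *	uint32_t limit = 0;
 *
 *	if (cluster_throttle_io_limit(vp, &limit))
 *		io_size = min(io_size, limit);	// limit == THROTTLE_MAX_IOSIZE
 */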

void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t *pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}


static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}

static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * async callback completion will not normally
		 * generate a wakeup upon I/O completion...
		 * by setting B_TWANTED, we will force a wakeup
		 * to occur as any outstanding I/Os complete...
		 * I/Os already completed will have B_TDONE already
		 * set, so they won't cause us to block...
		 * note that we're actually waiting for the bp to have
		 * completed the callback function... only then
		 * can we safely take back ownership of the bp
		 */
		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			cbp->b_flags |= B_TWANTED;

		lck_mtx_unlock(cl_transaction_mtxp);
	}
	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {

		if (async) {
			while (!ISSET(cbp->b_flags, B_TDONE)) {

				lck_mtx_lock_spin(cl_transaction_mtxp);

				if (!ISSET(cbp->b_flags, B_TDONE)) {
					DTRACE_IO1(wait__start, buf_t, cbp);
					(void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL);
					DTRACE_IO1(wait__done, buf_t, cbp);
				} else
					lck_mtx_unlock(cl_transaction_mtxp);
			}
		} else
			buf_biowait(cbp);
	}
}

static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;
	boolean_t isswapout = FALSE;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;
	cbp = *cbp_head;

	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
		isswapout = TRUE;

	error = cluster_iodone(cbp, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
		else if (isswapout == TRUE)
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}
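
/*
 * Recap of the transaction machinery above, as cluster_io() below uses it
 * (descriptive only):
 *
 *	1. bufs from alloc_io_buf() are chained through b_trans_next, and
 *	   every buf points back at the chain head via b_trans_head.
 *	2. once the chain is complete, cluster_EOT() stamps the tail with
 *	   B_EOT and records any zero_offset on the head.
 *	3. synchronous callers drain via cluster_wait_IO() and
 *	   cluster_complete_transaction(); async callers rely on the last
 *	   cluster_iodone() to commit or abort the UPL range.
 */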

static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	int	max_trans_count;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;
	vm_offset_t	upl_end_offset;
	boolean_t	need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE)
		panic("%s(): Called with real buffer of size %d bytes which "
		      "is greater than the maximum allowed size of "
		      "%d bytes (the system PAGE_SIZE).\n",
		      __FUNCTION__, non_rounded_size, PAGE_SIZE);

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zeros that are sitting beyond the EOF,
		 * so that in case the filesystem doesn't explicitly zero this area
		 * when a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY)
		max_trans_count = 16;

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;
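
	/*
	 * Worked example (assuming 4KB pages, so PAGE_MASK == 0xfff):
	 * a device limit of 0x1f5c3 (128,451) bytes rounds down to
	 * 0x1f000 (126,976), i.e. 31 whole pages; anything under one
	 * page rounds to 0 and is bumped back up to PAGE_SIZE below.
	 */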

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize)
		max_iosize = PAGE_SIZE;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE)
				max_iosize = THROTTLE_MAX_IOSIZE;
			async_throttle = THROTTLE_MAXCNT;
		} else {
			if ( (flags & CL_DEV_MEMORY) )
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				max_cluster_size = MAX_CLUSTER_SIZE(vp);

				if (max_iosize > max_cluster_size)
					max_cluster = max_cluster_size;
				else
					max_cluster = max_iosize;

				if (size < max_cluster)
					max_cluster = size;

				if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
					scale = WRITE_THROTTLE_SSD;
				else
					scale = WRITE_THROTTLE;

				if (flags & CL_CLOSE)
					scale += MAX_CLUSTERS;

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & (CL_IOSTREAMING))
		io_flags |= B_IOSTREAMING;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_DIRECT_IO)
		io_flags |= B_PHYS;
	if (flags & (CL_PRESERVE | CL_KEEPCACHED))
		io_flags |= B_CACHE;
	if (flags & CL_PASSIVE)
		io_flags |= B_PASSIVE;
	if (flags & CL_ENCRYPTED)
		io_flags |= B_ENCRYPTED_IO;
	if (vp->v_flag & VSYSTEM)
		io_flags |= B_META;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up with a page that we can't
		 * complete (the file size wasn't a multiple of PAGE_SIZE
		 * and we're trying to read to the end of the file), so
		 * we'll go ahead and zero out the portion of the page we
		 * can't read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int	io_size_wanted;
		size_t	io_size_tmp;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
			break;

		if (io_size_tmp > io_size_wanted)
			io_size = io_size_wanted;
		else
			io_size = (u_int)io_size_tmp;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_sync_range, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t last_cbp;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released just because all of the current
				 * I/O linked to this transaction has completed...
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				/*
				 * we've got a transaction that
				 * includes the page we're about to push out through vnode_pageout...
				 * find the last bp in the list which will be the one that
				 * includes the head of this page and round its iosize down
				 * to a page boundary...
				 */
				for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
					last_cbp = cbp;

				cbp->b_bcount &= ~PAGE_MASK;

				if (cbp->b_bcount == 0) {
					/*
					 * this buf no longer has any I/O associated with it
					 */
					free_io_buf(cbp);

					if (cbp == cbp_head) {
						/*
						 * the buf we just freed was the only buf in
						 * this transaction... so there's no I/O to do
						 */
						cbp_head = NULL;
					} else {
						/*
						 * remove the buf we just freed from
						 * the transaction list
						 */
						last_cbp->b_trans_next = NULL;
						cbp_tail = last_cbp;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0)
					flags &= ~CL_COMMIT;
				break;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t commit_offset;
			int	bytes_to_zero;
			int	complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl...
 we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO))
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int	pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if it's partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size)
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					else
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if it's partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size)
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				else
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;

				commit_offset = upl_offset & ~PAGE_MASK;
			}
			if ( (flags & CL_COMMIT) && pg_count) {
				ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
						     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && (complete_transaction_now || size == 0)) {
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

				trans_count = 0;
			}
			continue;
		}
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}
		/*
		 * If the transaction is going to reach the maximum number of
		 * desired elements, truncate the i/o to the nearest page so
		 * that the actual i/o is initiated after this buffer is
		 * created and added to the i/o chain.
		 *
		 * I/O directed to physically contiguous memory
		 * doesn't have a requirement to make sure we 'fill' a page
		 */
		if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
		     ((upl_offset + io_size) & PAGE_MASK)) {
			vm_offset_t aligned_ofs;

			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
			/*
			 * If the io_size does not actually finish off even a
			 * single page we have to keep adding buffers to the
			 * transaction despite having reached the desired limit.
			 *
			 * Eventually we get here with the page being finished
			 * off (and exceeded) and then we truncate the size of
			 * this i/o request so that it is page aligned so that
			 * we can finally issue the i/o on the transaction.
			 */
			if (aligned_ofs > upl_offset) {
				io_size = aligned_ofs - upl_offset;
				pg_count--;
			}
		}

		if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			u_int i;

			for (i = 0; i < pg_count; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
					panic("BUSY bp found in cluster_io");
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
				panic("buf_setcallback failed\n");
		}
		cbp->b_cliodone = (void *)callback;
		cbp->b_flags |= io_flags;
		if (flags & CL_NOCACHE)
			cbp->b_attr.ba_flags |= BA_NOCACHE;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset))
			panic("buf_setupl failed\n");

		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate))
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}
		else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;

			if ( (cbp_head->b_real_bp = real_bp) )
				real_bp = (buf_t)NULL;
		}
		*(buf_t *)(&cbp->b_trans_head) = cbp_head;

		trans_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * that we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e. size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if (size == 0) {
			/*
			 * we have no more I/O to issue, so go
			 * finish the final transaction
			 */
			need_EOT = TRUE;
		} else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
			    ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
			/*
			 * I/O directed to physically contiguous memory...
			 * which doesn't have a requirement to make sure we 'fill' a page
			 * or...
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and ...
			 * it's either an ASYNC request or
			 * we've already accumulated more than max_trans_count I/Os into
			 * this transaction, so mark it as complete so that
			 * it can finish asynchronously or via the cluster_complete_transaction
			 * below if the request is synchronous
			 */
			need_EOT = TRUE;
		}
		if (need_EOT == TRUE)
			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

		if (flags & CL_THROTTLE)
			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");

		if ( !(io_flags & B_READ))
			vnode_startwrite(vp);

		if (flags & CL_RAW_ENCRYPTED) {
			/*
			 * User requested raw encrypted bytes.
			 * Twiddle the bit in the ba_flags for the buffer
			 */
			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
		}

		(void) VNOP_STRATEGY(cbp);

		if (need_EOT == TRUE) {
			if ( !(flags & CL_ASYNC))
				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);

			need_EOT = FALSE;
			trans_count = 0;
			cbp_head = NULL;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		if (cbp_head) {
			/*
			 * first wait until all of the outstanding I/O
			 * for this partial transaction has completed
			 */
			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

			/*
			 * Rewind the upl offset to the beginning of the
			 * transaction.
			 */
			upl_offset = cbp_head->b_uploffset;

			for (cbp = cbp_head; cbp;) {
				buf_t	cbp_next;

				size    += cbp->b_bcount;
				io_size += cbp->b_bcount;

				cbp_next = cbp->b_trans_next;
				free_io_buf(cbp);
				cbp = cbp_next;
			}
		}
		if (iostate) {
			int need_wakeup = 0;

			/*
			 * update the error condition for this stream...
			 * since we never really issued the io,
			 * just go ahead and adjust it back
			 */
			lck_mtx_lock_spin(&iostate->io_mtxp);

			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				need_wakeup = 1;
			}
			lck_mtx_unlock(&iostate->io_mtxp);

			if (need_wakeup)
				wakeup((caddr_t)&iostate->io_wanted);
		}
		if (flags & CL_COMMIT) {
			int	upl_flags;

			pg_offset  = upl_offset & PAGE_MASK;
			abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
		}
		if (retval == 0)
			retval = error;
	} else if (cbp_head)
		panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);

	if (real_bp) {
		/*
		 * can get here if we either encountered an error
		 * or we completely zero-filled the request and
		 * no I/O was issued
		 */
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}

#define reset_vector_run_state()						\
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;

static int
vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
		  int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	vector_upl_set_pagelist(vector_upl);

	if (io_flag & CL_READ) {
		if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0))
			io_flag &= ~CL_PRESERVE; /* don't zero fill */
		else
			io_flag |= CL_PRESERVE; /* zero fill */
	}
	return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));

}
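
/*
 * Condensed sketch of how the direct I/O paths later in this file drive
 * vector_cluster_io(), accumulating sub-UPLs until the vector is full
 * (variable names match the locals those paths declare):
 *
 *	if (vector_upl_index == 0)
 *		vector_upl = vector_upl_create(upl_offset);
 *
 *	vector_upl_set_subupl(vector_upl, upl, upl_size);
 *	vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
 *	vector_upl_index++;
 *	vector_upl_size += upl_size;
 *
 *	if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS ||
 *	    vector_upl_size >= MAX_VECTOR_UPL_SIZE) {
 *		error = vector_cluster_io(vp, vector_upl, vector_upl_offset,
 *					  v_upl_uio_offset, vector_upl_iosize,
 *					  io_flag, NULL, &iostate, callback, callback_arg);
 *		reset_vector_run_state();
 *	}
 */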

static int
cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	int	pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
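
/*
 * Window growth in cluster_read_ahead() below, by example (4KB pages):
 * each sequential hit doubles cl_ralen (1, 2, 4, ... pages) until it is
 * capped at max_prefetch / PAGE_SIZE, while cl_maxra remembers the last
 * page already brought in, so a new prefetch is only armed once the
 * reader has consumed most of the outstanding window.
 */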

static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
		   int bflag)
{
	daddr64_t	r_addr;
	off_t		f_offset;
	int		size_of_prefetch;
	u_int		max_prefetch;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

		return;
	}
	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD));

	if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max)
		max_prefetch = (speculative_prefetch_max * PAGE_SIZE);

	if (max_prefetch <= PAGE_SIZE) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
		return;
	}
	if (extent->e_addr < rap->cl_maxra) {
		if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		if (read_size > rap->cl_ralen) {
			if (read_size > max_prefetch / PAGE_SIZE)
				rap->cl_ralen = max_prefetch / PAGE_SIZE;
			else
				rap->cl_ralen = read_size;
		}
		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

		if (size_of_prefetch)
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}


int
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
{
	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);

}


int
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int	io_size;
	int	rounded_size;
	off_t	max_size;
	int	local_flags;

	local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if ((flags & UPL_KEEPCACHED))
		local_flags |= CL_KEEPCACHED;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-out to a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
			   local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
}


int
cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	       int size, off_t filesize, int flags)
{
	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
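
/*
 * Hypothetical filesystem glue (the myfs_* names are invented): a
 * UBC-backed filesystem typically forwards its pagein vnop straight
 * here, supplying the current EOF so the tail page can be handled:
 *
 *	static int
 *	myfs_vnop_pagein(struct vnop_pagein_args *ap)
 *	{
 *		return cluster_pagein(ap->a_vp, ap->a_pl, ap->a_pl_offset,
 *				      ap->a_f_offset, ap->a_size,
 *				      myfs_filesize(ap->a_vp), ap->a_flags);
 *	}
 */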
off_t max_size; 1859 int retval; 1860 int local_flags = 0; 1861 1862 if (upl == NULL || size < 0) 1863 panic("cluster_pagein: NULL upl passed in"); 1864 1865 if ((flags & UPL_IOSYNC) == 0) 1866 local_flags |= CL_ASYNC; 1867 if ((flags & UPL_NOCOMMIT) == 0) 1868 local_flags |= CL_COMMIT; 1869 if (flags & UPL_IOSTREAMING) 1870 local_flags |= CL_IOSTREAMING; 1871 if (flags & UPL_PAGING_ENCRYPTED) 1872 local_flags |= CL_ENCRYPTED; 1873 1874 1875 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, 1876 (int)f_offset, size, (int)filesize, local_flags, 0); 1877 1878 /* 1879 * can't page-in from a negative offset 1880 * or if we're starting beyond the EOF 1881 * or if the file offset isn't page aligned 1882 * or the size requested isn't a multiple of PAGE_SIZE 1883 */ 1884 if (f_offset < 0 || f_offset >= filesize || 1885 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) { 1886 if (local_flags & CL_COMMIT) 1887 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1888 return (EINVAL); 1889 } 1890 max_size = filesize - f_offset; 1891 1892 if (size < max_size) 1893 io_size = size; 1894 else 1895 io_size = max_size; 1896 1897 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 1898 1899 if (size > rounded_size && (local_flags & CL_COMMIT)) 1900 ubc_upl_abort_range(upl, upl_offset + rounded_size, 1901 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1902 1903 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, 1904 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 1905 1906 return (retval); 1907} 1908 1909 1910int 1911cluster_bp(buf_t bp) 1912{ 1913 return cluster_bp_ext(bp, NULL, NULL); 1914} 1915 1916 1917int 1918cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg) 1919{ 1920 off_t f_offset; 1921 int flags; 1922 1923 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START, 1924 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); 1925 1926 if (bp->b_flags & B_READ) 1927 flags = CL_ASYNC | CL_READ; 1928 else 1929 flags = CL_ASYNC; 1930 if (bp->b_flags & B_PASSIVE) 1931 flags |= CL_PASSIVE; 1932 1933 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); 1934 1935 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg)); 1936} 1937 1938 1939 1940int 1941cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags) 1942{ 1943 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL); 1944} 1945 1946 1947int 1948cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, 1949 int xflags, int (*callback)(buf_t, void *), void *callback_arg) 1950{ 1951 user_ssize_t cur_resid; 1952 int retval = 0; 1953 int flags; 1954 int zflags; 1955 int bflag; 1956 int write_type = IO_COPY; 1957 u_int32_t write_length; 1958 1959 flags = xflags; 1960 1961 if (flags & IO_PASSIVE) 1962 bflag = CL_PASSIVE; 1963 else 1964 bflag = 0; 1965 1966 if (vp->v_flag & VNOCACHE_DATA){ 1967 flags |= IO_NOCACHE; 1968 bflag |= CL_NOCACHE; 1969 } 1970 if (uio == NULL) { 1971 /* 1972 * no user data... 
1973 * this call is being made to zero-fill some range in the file 1974 */ 1975 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg); 1976 1977 return(retval); 1978 } 1979 /* 1980 * do a write through the cache if one of the following is true.... 1981 * NOCACHE is not true or NODIRECT is true 1982 * the uio request doesn't target USERSPACE 1983 * otherwise, find out if we want the direct or contig variant for 1984 * the first vector in the uio request 1985 */ 1986 if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) 1987 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 1988 1989 if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) 1990 /* 1991 * must go through the cached variant in this case 1992 */ 1993 write_type = IO_COPY; 1994 1995 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) { 1996 1997 switch (write_type) { 1998 1999 case IO_COPY: 2000 /* 2001 * make sure the uio_resid isn't too big... 2002 * internally, we want to handle all of the I/O in 2003 * chunk sizes that fit in a 32 bit int 2004 */ 2005 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) { 2006 /* 2007 * we're going to have to call cluster_write_copy 2008 * more than once... 2009 * 2010 * only want the last call to cluster_write_copy to 2011 * have the IO_TAILZEROFILL flag set and only the 2012 * first call should have IO_HEADZEROFILL 2013 */ 2014 zflags = flags & ~IO_TAILZEROFILL; 2015 flags &= ~IO_HEADZEROFILL; 2016 2017 write_length = MAX_IO_REQUEST_SIZE; 2018 } else { 2019 /* 2020 * last call to cluster_write_copy 2021 */ 2022 zflags = flags; 2023 2024 write_length = (u_int32_t)cur_resid; 2025 } 2026 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg); 2027 break; 2028 2029 case IO_CONTIG: 2030 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL); 2031 2032 if (flags & IO_HEADZEROFILL) { 2033 /* 2034 * only do this once per request 2035 */ 2036 flags &= ~IO_HEADZEROFILL; 2037 2038 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset, 2039 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 2040 if (retval) 2041 break; 2042 } 2043 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag); 2044 2045 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) { 2046 /* 2047 * we're done with the data from the user specified buffer(s) 2048 * and we've been requested to zero fill at the tail 2049 * treat this as an IO_HEADZEROFILL which doesn't require a uio 2050 * by rearranging the args and passing in IO_HEADZEROFILL 2051 */ 2052 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset, 2053 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 2054 } 2055 break; 2056 2057 case IO_DIRECT: 2058 /* 2059 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL 2060 */ 2061 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg); 2062 break; 2063 2064 case IO_UNKNOWN: 2065 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 2066 break; 2067 } 2068 /* 2069 * in case we end up calling cluster_write_copy (from cluster_write_direct) 2070 * multiple times to service a 
multi-vector request that is not aligned properly 2071 * we need to update the oldEOF so that we 2072 * don't zero-fill the head of a page if we've successfully written 2073 * data to that area... 'cluster_write_copy' will zero-fill the head of a 2074 * page that is beyond the oldEOF if the write is unaligned... we only 2075 * want that to happen for the very first page of the cluster_write, 2076 * NOT the first page of each vector making up a multi-vector write. 2077 */ 2078 if (uio->uio_offset > oldEOF) 2079 oldEOF = uio->uio_offset; 2080 } 2081 return (retval); 2082} 2083 2084 2085static int 2086cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length, 2087 int flags, int (*callback)(buf_t, void *), void *callback_arg) 2088{ 2089 upl_t upl; 2090 upl_page_info_t *pl; 2091 vm_offset_t upl_offset; 2092 vm_offset_t vector_upl_offset = 0; 2093 u_int32_t io_req_size; 2094 u_int32_t offset_in_file; 2095 u_int32_t offset_in_iovbase; 2096 u_int32_t io_size; 2097 int io_flag = 0; 2098 upl_size_t upl_size, vector_upl_size = 0; 2099 vm_size_t upl_needed_size; 2100 mach_msg_type_number_t pages_in_pl; 2101 int upl_flags; 2102 kern_return_t kret; 2103 mach_msg_type_number_t i; 2104 int force_data_sync; 2105 int retval = 0; 2106 int first_IO = 1; 2107 struct clios iostate; 2108 user_addr_t iov_base; 2109 u_int32_t mem_alignment_mask; 2110 u_int32_t devblocksize; 2111 u_int32_t max_io_size; 2112 u_int32_t max_upl_size; 2113 u_int32_t max_vector_size; 2114 boolean_t io_throttled = FALSE; 2115 2116 u_int32_t vector_upl_iosize = 0; 2117 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); 2118 off_t v_upl_uio_offset = 0; 2119 int vector_upl_index=0; 2120 upl_t vector_upl = NULL; 2121 2122 2123 /* 2124 * When we enter this routine, we know 2125 * -- the resid will not exceed iov_len 2126 */ 2127 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, 2128 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 2129 2130 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 2131 2132 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO; 2133 2134 if (flags & IO_PASSIVE) 2135 io_flag |= CL_PASSIVE; 2136 2137 if (flags & IO_NOCACHE) 2138 io_flag |= CL_NOCACHE; 2139 2140 iostate.io_completed = 0; 2141 iostate.io_issued = 0; 2142 iostate.io_error = 0; 2143 iostate.io_wanted = 0; 2144 2145 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 2146 2147 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 2148 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 2149 2150 if (devblocksize == 1) { 2151 /* 2152 * the AFP client advertises a devblocksize of 1 2153 * however, its BLOCKMAP routine maps to physical 2154 * blocks that are PAGE_SIZE in size... 2155 * therefore we can't ask for I/Os that aren't page aligned 2156 * or aren't multiples of PAGE_SIZE in size 2157 * by setting devblocksize to PAGE_SIZE, we re-instate 2158 * the old behavior we had before the mem_alignment_mask 2159 * changes went in... 
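* (illustrative numbers: with devblocksize forced to PAGE_SIZE, a user buffer sitting at, say, address 0x20200 would fail the 'iov_base & (devblocksize - 1)' check below... the whole vector then falls through to wait_for_dwrites and is serviced by the cached copy path instead)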
2160 */ 2161 devblocksize = PAGE_SIZE; 2162 } 2163 2164next_dwrite: 2165 io_req_size = *write_length; 2166 iov_base = uio_curriovbase(uio); 2167 2168 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK; 2169 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 2170 2171 if (offset_in_file || offset_in_iovbase) { 2172 /* 2173 * one of the 2 important offsets is misaligned 2174 * so fire an I/O through the cache for this entire vector 2175 */ 2176 goto wait_for_dwrites; 2177 } 2178 if (iov_base & (devblocksize - 1)) { 2179 /* 2180 * the offset in memory must be on a device block boundary 2181 * so that we can guarantee that we can generate an 2182 * I/O that ends on a page boundary in cluster_io 2183 */ 2184 goto wait_for_dwrites; 2185 } 2186 2187 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { 2188 int throttle_type; 2189 2190 if ( (throttle_type = cluster_is_throttled(vp)) ) { 2191 /* 2192 * we're in the throttle window, at the very least 2193 * we want to limit the size of the I/O we're about 2194 * to issue 2195 */ 2196 if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) { 2197 /* 2198 * we're in the throttle window and at least 1 I/O 2199 * has already been issued by a throttleable thread 2200 * in this window, so return with EAGAIN to indicate 2201 * to the FS issuing the cluster_write call that it 2202 * should now throttle after dropping any locks 2203 */ 2204 throttle_info_update_by_mount(vp->v_mount); 2205 2206 io_throttled = TRUE; 2207 goto wait_for_dwrites; 2208 } 2209 max_vector_size = THROTTLE_MAX_IOSIZE; 2210 max_io_size = THROTTLE_MAX_IOSIZE; 2211 } else { 2212 max_vector_size = MAX_VECTOR_UPL_SIZE; 2213 max_io_size = max_upl_size; 2214 } 2215 2216 if (first_IO) { 2217 cluster_syncup(vp, newEOF, callback, callback_arg); 2218 first_IO = 0; 2219 } 2220 io_size = io_req_size & ~PAGE_MASK; 2221 iov_base = uio_curriovbase(uio); 2222 2223 if (io_size > max_io_size) 2224 io_size = max_io_size; 2225 2226 if(useVectorUPL && (iov_base & PAGE_MASK)) { 2227 /* 2228 * We have an iov_base that's not page-aligned. 2229 * Issue all I/O's that have been collected within 2230 * this Vectored UPL. 2231 */ 2232 if(vector_upl_index) { 2233 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2234 reset_vector_run_state(); 2235 } 2236 2237 /* 2238 * After this point, if we are using the Vector UPL path and the base is 2239 * not page-aligned then the UPL with that base will be the first in the vector UPL. 
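* (sketch with illustrative numbers: given 4KB pages and iov bases 0x10000, 0x24000, 0x31200, the first two are page-aligned and accumulate into one vector UPL... hitting the unaligned 0x31200 forces the collected run to be issued via vector_cluster_io first, so the unaligned UPL always opens a fresh vector)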
2240 */ 2241 } 2242 2243 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 2244 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 2245 2246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, 2247 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 2248 2249 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 2250 pages_in_pl = 0; 2251 upl_size = upl_needed_size; 2252 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2253 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2254 2255 kret = vm_map_get_upl(current_map(), 2256 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2257 &upl_size, 2258 &upl, 2259 NULL, 2260 &pages_in_pl, 2261 &upl_flags, 2262 force_data_sync); 2263 2264 if (kret != KERN_SUCCESS) { 2265 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2266 0, 0, 0, kret, 0); 2267 /* 2268 * failed to get pagelist 2269 * 2270 * we may have already spun some portion of this request 2271 * off as async requests... we need to wait for the I/O 2272 * to complete before returning 2273 */ 2274 goto wait_for_dwrites; 2275 } 2276 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 2277 pages_in_pl = upl_size / PAGE_SIZE; 2278 2279 for (i = 0; i < pages_in_pl; i++) { 2280 if (!upl_valid_page(pl, i)) 2281 break; 2282 } 2283 if (i == pages_in_pl) 2284 break; 2285 2286 /* 2287 * didn't get all the pages back that we 2288 * needed... release this upl and try again 2289 */ 2290 ubc_upl_abort(upl, 0); 2291 } 2292 if (force_data_sync >= 3) { 2293 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2294 i, pages_in_pl, upl_size, kret, 0); 2295 /* 2296 * for some reason, we couldn't acquire a hold on all 2297 * the pages needed in the user's address space 2298 * 2299 * we may have already spun some portion of this request 2300 * off as async requests... we need to wait for the I/O 2301 * to complete before returning 2302 */ 2303 goto wait_for_dwrites; 2304 } 2305 2306 /* 2307 * Consider the possibility that upl_size wasn't satisfied. 2308 */ 2309 if (upl_size < upl_needed_size) { 2310 if (upl_size && upl_offset == 0) 2311 io_size = upl_size; 2312 else 2313 io_size = 0; 2314 } 2315 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2316 (int)upl_offset, upl_size, (int)iov_base, io_size, 0); 2317 2318 if (io_size == 0) { 2319 ubc_upl_abort(upl, 0); 2320 /* 2321 * we may have already spun some portion of this request 2322 * off as async requests... we need to wait for the I/O 2323 * to complete before returning 2324 */ 2325 goto wait_for_dwrites; 2326 } 2327 2328 if(useVectorUPL) { 2329 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); 2330 if(end_off) 2331 issueVectorUPL = 1; 2332 /* 2333 * After this point, if we are using a vector UPL, then 2334 * either all the UPL elements end on a page boundary OR 2335 * this UPL is the last element because it does not end 2336 * on a page boundary. 2337 */ 2338 } 2339 2340 /* 2341 * Now look for pages already in the cache 2342 * and throw them away. 
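* (rationale: a page left valid in the cache over this range could be pushed out later and overwrite the data we're about to write directly, so UPL_ROP_DUMP tosses such pages, dirty or clean)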
2343 * uio->uio_offset is page aligned within the file 2344 * io_size is a multiple of PAGE_SIZE 2345 */ 2346 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL); 2347 2348 /* 2349 * we want to push out these writes asynchronously so that we can overlap 2350 * the preparation of the next I/O 2351 * if there are already too many outstanding writes 2352 * wait until some complete before issuing the next 2353 */ 2354 if (iostate.io_issued > iostate.io_completed) 2355 cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); 2356 2357 if (iostate.io_error) { 2358 /* 2359 * one of the earlier writes we issued ran into a hard error 2360 * don't issue any more writes, clean up the UPL 2361 * that was just created but not used, then 2362 * go wait for all writes that are part of this stream 2363 * to complete before returning the error to the caller 2364 */ 2365 ubc_upl_abort(upl, 0); 2366 2367 goto wait_for_dwrites; 2368 } 2369 2370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, 2371 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); 2372 2373 if(!useVectorUPL) 2374 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, 2375 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2376 2377 else { 2378 if(!vector_upl_index) { 2379 vector_upl = vector_upl_create(upl_offset); 2380 v_upl_uio_offset = uio->uio_offset; 2381 vector_upl_offset = upl_offset; 2382 } 2383 2384 vector_upl_set_subupl(vector_upl,upl,upl_size); 2385 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); 2386 vector_upl_index++; 2387 vector_upl_iosize += io_size; 2388 vector_upl_size += upl_size; 2389 2390 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { 2391 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2392 reset_vector_run_state(); 2393 } 2394 } 2395 2396 /* 2397 * update the uio structure to 2398 * reflect the I/O that we just issued 2399 */ 2400 uio_update(uio, (user_size_t)io_size); 2401 2402 /* 2403 * in case we end up calling through to cluster_write_copy to finish 2404 * the tail of this request, we need to update the oldEOF so that we 2405 * don't zero-fill the head of a page if we've successfully written 2406 * data to that area... 'cluster_write_copy' will zero-fill the head of a 2407 * page that is beyond the oldEOF if the write is unaligned... we only 2408 * want that to happen for the very first page of the cluster_write, 2409 * NOT the first page of each vector making up a multi-vector write.
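* (worked example with illustrative offsets: suppose oldEOF is 0x1000 and the first vector directly writes 0x1000..0x17ff... if oldEOF weren't bumped, an unaligned second vector starting at 0x1800 would make cluster_write_copy treat the head of that page as beyond oldEOF and zero-fill 0x1000..0x17ff, wiping the data the first vector just wrote)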
2410 */ 2411 if (uio->uio_offset > oldEOF) 2412 oldEOF = uio->uio_offset; 2413 2414 io_req_size -= io_size; 2415 2416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, 2417 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0); 2418 2419 } /* end while */ 2420 2421 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) { 2422 2423 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE); 2424 2425 if (retval == 0 && *write_type == IO_DIRECT) { 2426 2427 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE, 2428 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 2429 2430 goto next_dwrite; 2431 } 2432 } 2433 2434wait_for_dwrites: 2435 2436 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { 2437 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2438 reset_vector_run_state(); 2439 } 2440 2441 if (iostate.io_issued > iostate.io_completed) { 2442 /* 2443 * make sure all async writes issued as part of this stream 2444 * have completed before we return 2445 */ 2446 cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); 2447 } 2448 if (iostate.io_error) 2449 retval = iostate.io_error; 2450 2451 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 2452 2453 if (io_throttled == TRUE && retval == 0) 2454 retval = EAGAIN; 2455 2456 if (io_req_size && retval == 0) { 2457 /* 2458 * we couldn't handle the tail of this request in DIRECT mode 2459 * so fire it through the copy path 2460 * 2461 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set 2462 * so we can just pass 0 in for the headOff and tailOff 2463 */ 2464 if (uio->uio_offset > oldEOF) 2465 oldEOF = uio->uio_offset; 2466 2467 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg); 2468 2469 *write_type = IO_UNKNOWN; 2470 } 2471 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, 2472 (int)uio->uio_offset, io_req_size, retval, 4, 0); 2473 2474 return (retval); 2475} 2476 2477 2478static int 2479cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length, 2480 int (*callback)(buf_t, void *), void *callback_arg, int bflag) 2481{ 2482 upl_page_info_t *pl; 2483 addr64_t src_paddr = 0; 2484 upl_t upl[MAX_VECTS]; 2485 vm_offset_t upl_offset; 2486 u_int32_t tail_size = 0; 2487 u_int32_t io_size; 2488 u_int32_t xsize; 2489 upl_size_t upl_size; 2490 vm_size_t upl_needed_size; 2491 mach_msg_type_number_t pages_in_pl; 2492 int upl_flags; 2493 kern_return_t kret; 2494 struct clios iostate; 2495 int error = 0; 2496 int cur_upl = 0; 2497 int num_upl = 0; 2498 int n; 2499 user_addr_t iov_base; 2500 u_int32_t devblocksize; 2501 u_int32_t mem_alignment_mask; 2502 2503 /* 2504 * When we enter this routine, we know 2505 * -- the io_req_size will not exceed iov_len 2506 * -- the target address is physically contiguous 2507 */ 2508 cluster_syncup(vp, newEOF, callback, callback_arg); 2509 2510 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 2511 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 2512 2513 iostate.io_completed = 0; 2514 iostate.io_issued = 0; 2515 iostate.io_error = 0; 2516 iostate.io_wanted = 0; 2517 2518 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 2519 2520next_cwrite: 2521 io_size = *write_length; 2522 2523 iov_base = uio_curriovbase(uio); 2524 2525 upl_offset = (vm_offset_t)((u_int32_t)iov_base & 
PAGE_MASK); 2526 upl_needed_size = upl_offset + io_size; 2527 2528 pages_in_pl = 0; 2529 upl_size = upl_needed_size; 2530 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2531 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2532 2533 kret = vm_map_get_upl(current_map(), 2534 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2535 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 2536 2537 if (kret != KERN_SUCCESS) { 2538 /* 2539 * failed to get pagelist 2540 */ 2541 error = EINVAL; 2542 goto wait_for_cwrites; 2543 } 2544 num_upl++; 2545 2546 /* 2547 * Consider the possibility that upl_size wasn't satisfied. 2548 */ 2549 if (upl_size < upl_needed_size) { 2550 /* 2551 * This is a failure in the physical memory case. 2552 */ 2553 error = EINVAL; 2554 goto wait_for_cwrites; 2555 } 2556 pl = ubc_upl_pageinfo(upl[cur_upl]); 2557 2558 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; 2559 2560 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 2561 u_int32_t head_size; 2562 2563 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 2564 2565 if (head_size > io_size) 2566 head_size = io_size; 2567 2568 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg); 2569 2570 if (error) 2571 goto wait_for_cwrites; 2572 2573 upl_offset += head_size; 2574 src_paddr += head_size; 2575 io_size -= head_size; 2576 2577 iov_base += head_size; 2578 } 2579 if ((u_int32_t)iov_base & mem_alignment_mask) { 2580 /* 2581 * request doesn't set up on a memory boundary 2582 * that the underlying DMA engine can handle... 2583 * return an error instead of going through 2584 * the slow copy path since the intent of this 2585 * path is direct I/O from device memory 2586 */ 2587 error = EINVAL; 2588 goto wait_for_cwrites; 2589 } 2590 2591 tail_size = io_size & (devblocksize - 1); 2592 io_size -= tail_size; 2593 2594 while (io_size && error == 0) { 2595 2596 if (io_size > MAX_IO_CONTIG_SIZE) 2597 xsize = MAX_IO_CONTIG_SIZE; 2598 else 2599 xsize = io_size; 2600 /* 2601 * request asynchronously so that we can overlap 2602 * the preparation of the next I/O... we'll do 2603 * the commit after all the I/O has completed 2604 * since it's all issued against the same UPL 2605 * if there are already too many outstanding writes 2606 * wait until some have completed before issuing the next 2607 */ 2608 if (iostate.io_issued > iostate.io_completed) 2609 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); 2610 2611 if (iostate.io_error) { 2612 /* 2613 * one of the earlier writes we issued ran into a hard error 2614 * don't issue any more writes... 
2615 * go wait for all writes that are part of this stream 2616 * to complete before returning the error to the caller 2617 */ 2618 goto wait_for_cwrites; 2619 } 2620 /* 2621 * issue an asynchronous write to cluster_io 2622 */ 2623 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, 2624 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg); 2625 2626 if (error == 0) { 2627 /* 2628 * The cluster_io write completed successfully, 2629 * update the uio structure 2630 */ 2631 uio_update(uio, (user_size_t)xsize); 2632 2633 upl_offset += xsize; 2634 src_paddr += xsize; 2635 io_size -= xsize; 2636 } 2637 } 2638 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) { 2639 2640 error = cluster_io_type(uio, write_type, write_length, 0); 2641 2642 if (error == 0 && *write_type == IO_CONTIG) { 2643 cur_upl++; 2644 goto next_cwrite; 2645 } 2646 } else 2647 *write_type = IO_UNKNOWN; 2648 2649wait_for_cwrites: 2650 /* 2651 * make sure all async writes that are part of this stream 2652 * have completed before we proceed 2653 */ 2654 if (iostate.io_issued > iostate.io_completed) 2655 cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); 2656 2657 if (iostate.io_error) 2658 error = iostate.io_error; 2659 2660 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 2661 2662 if (error == 0 && tail_size) 2663 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); 2664 2665 for (n = 0; n < num_upl; n++) 2666 /* 2667 * just release our hold on each physically contiguous 2668 * region without changing any state 2669 */ 2670 ubc_upl_abort(upl[n], 0); 2671 2672 return (error); 2673} 2674 2675 2676/* 2677 * need to avoid a race between an msync of a range of pages dirtied via mmap 2678 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's 2679 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd 2680 * 2681 * we should never force-zero-fill pages that are already valid in the cache... 
2682 * the entire page contains valid data (either from disk, zero-filled or dirtied 2683 * via an mmap) so we can only do damage by trying to zero-fill 2684 * 2685 */ 2686static int 2687cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero) 2688{ 2689 int zero_pg_index; 2690 boolean_t need_cluster_zero = TRUE; 2691 2692 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { 2693 2694 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64)); 2695 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64); 2696 2697 if (upl_valid_page(pl, zero_pg_index)) { 2698 /* 2699 * never force zero valid pages - dirty or clean 2700 * we'll leave these in the UPL for cluster_write_copy to deal with 2701 */ 2702 need_cluster_zero = FALSE; 2703 } 2704 } 2705 if (need_cluster_zero == TRUE) 2706 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2707 2708 return (bytes_to_zero); 2709} 2710 2711 2712static int 2713cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff, 2714 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg) 2715{ 2716 upl_page_info_t *pl; 2717 upl_t upl; 2718 vm_offset_t upl_offset = 0; 2719 vm_size_t upl_size; 2720 off_t upl_f_offset; 2721 int pages_in_upl; 2722 int start_offset; 2723 int xfer_resid; 2724 int io_size; 2725 int io_offset; 2726 int bytes_to_zero; 2727 int bytes_to_move; 2728 kern_return_t kret; 2729 int retval = 0; 2730 int io_resid; 2731 long long total_size; 2732 long long zero_cnt; 2733 off_t zero_off; 2734 long long zero_cnt1; 2735 off_t zero_off1; 2736 off_t write_off = 0; 2737 int write_cnt = 0; 2738 boolean_t first_pass = FALSE; 2739 struct cl_extent cl; 2740 struct cl_writebehind *wbp; 2741 int bflag; 2742 u_int max_cluster_pgcount; 2743 u_int max_io_size; 2744 2745 if (uio) { 2746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2747 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0); 2748 2749 io_resid = io_req_size; 2750 } else { 2751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2752 0, 0, (int)oldEOF, (int)newEOF, 0); 2753 2754 io_resid = 0; 2755 } 2756 if (flags & IO_PASSIVE) 2757 bflag = CL_PASSIVE; 2758 else 2759 bflag = 0; 2760 if (flags & IO_NOCACHE) 2761 bflag |= CL_NOCACHE; 2762 2763 zero_cnt = 0; 2764 zero_cnt1 = 0; 2765 zero_off = 0; 2766 zero_off1 = 0; 2767 2768 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 2769 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 2770 2771 if (flags & IO_HEADZEROFILL) { 2772 /* 2773 * some filesystems (HFS is one) don't support unallocated holes within a file... 2774 * so we zero fill the intervening space between the old EOF and the offset 2775 * where the next chunk of real data begins.... ftruncate will also use this 2776 * routine to zero fill to the new EOF when growing a file... 
in this case, the 2777 * uio structure will not be provided 2778 */ 2779 if (uio) { 2780 if (headOff < uio->uio_offset) { 2781 zero_cnt = uio->uio_offset - headOff; 2782 zero_off = headOff; 2783 } 2784 } else if (headOff < newEOF) { 2785 zero_cnt = newEOF - headOff; 2786 zero_off = headOff; 2787 } 2788 } else { 2789 if (uio && uio->uio_offset > oldEOF) { 2790 zero_off = uio->uio_offset & ~PAGE_MASK_64; 2791 2792 if (zero_off >= oldEOF) { 2793 zero_cnt = uio->uio_offset - zero_off; 2794 2795 flags |= IO_HEADZEROFILL; 2796 } 2797 } 2798 } 2799 if (flags & IO_TAILZEROFILL) { 2800 if (uio) { 2801 zero_off1 = uio->uio_offset + io_req_size; 2802 2803 if (zero_off1 < tailOff) 2804 zero_cnt1 = tailOff - zero_off1; 2805 } 2806 } else { 2807 if (uio && newEOF > oldEOF) { 2808 zero_off1 = uio->uio_offset + io_req_size; 2809 2810 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) { 2811 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64); 2812 2813 flags |= IO_TAILZEROFILL; 2814 } 2815 } 2816 } 2817 if (zero_cnt == 0 && uio == (struct uio *) 0) { 2818 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, 2819 retval, 0, 0, 0, 0); 2820 return (0); 2821 } 2822 if (uio) { 2823 write_off = uio->uio_offset; 2824 write_cnt = uio_resid(uio); 2825 /* 2826 * delay updating the sequential write info 2827 * in the control block until we've obtained 2828 * the lock for it 2829 */ 2830 first_pass = TRUE; 2831 } 2832 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { 2833 /* 2834 * for this iteration of the loop, figure out where our starting point is 2835 */ 2836 if (zero_cnt) { 2837 start_offset = (int)(zero_off & PAGE_MASK_64); 2838 upl_f_offset = zero_off - start_offset; 2839 } else if (io_resid) { 2840 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2841 upl_f_offset = uio->uio_offset - start_offset; 2842 } else { 2843 start_offset = (int)(zero_off1 & PAGE_MASK_64); 2844 upl_f_offset = zero_off1 - start_offset; 2845 } 2846 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE, 2847 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0); 2848 2849 if (total_size > max_io_size) 2850 total_size = max_io_size; 2851 2852 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); 2853 2854 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) { 2855 /* 2856 * assumption... total_size <= io_resid 2857 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set 2858 */ 2859 if ((start_offset + total_size) > max_io_size) 2860 total_size = max_io_size - start_offset; 2861 xfer_resid = total_size; 2862 2863 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); 2864 2865 if (retval) 2866 break; 2867 2868 io_resid -= (total_size - xfer_resid); 2869 total_size = xfer_resid; 2870 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2871 upl_f_offset = uio->uio_offset - start_offset; 2872 2873 if (total_size == 0) { 2874 if (start_offset) { 2875 /* 2876 * the write did not finish on a page boundary 2877 * which will leave upl_f_offset pointing to the 2878 * beginning of the last page written instead of 2879 * the page beyond it... bump it in this case 2880 * so that the cluster code records the last page 2881 * written as dirty 2882 */ 2883 upl_f_offset += PAGE_SIZE_64; 2884 } 2885 upl_size = 0; 2886 2887 goto check_cluster; 2888 } 2889 } 2890 /* 2891 * compute the size of the upl needed to encompass 2892 * the requested write... limit each call to cluster_io 2893 * to the maximum UPL size... 
cluster_io will clip if 2894 * this exceeds the maximum io_size for the device; 2895 * make sure to account for 2896 * a starting offset that's not page aligned 2897 */ 2898 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 2899 2900 if (upl_size > max_io_size) 2901 upl_size = max_io_size; 2902 2903 pages_in_upl = upl_size / PAGE_SIZE; 2904 io_size = upl_size - start_offset; 2905 2906 if ((long long)io_size > total_size) 2907 io_size = total_size; 2908 2909 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0); 2910 2911 2912 /* 2913 * Gather the pages from the buffer cache. 2914 * The UPL_WILL_MODIFY flag lets the UPL subsystem know 2915 * that we intend to modify these pages. 2916 */ 2917 kret = ubc_create_upl(vp, 2918 upl_f_offset, 2919 upl_size, 2920 &upl, 2921 &pl, 2922 UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY)); 2923 if (kret != KERN_SUCCESS) 2924 panic("cluster_write_copy: failed to get pagelist"); 2925 2926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, 2927 upl, (int)upl_f_offset, start_offset, 0, 0); 2928 2929 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) { 2930 int read_size; 2931 2932 /* 2933 * we're starting in the middle of the first page of the upl 2934 * and the page isn't currently valid, so we're going to have 2935 * to read it in first... this is a synchronous operation 2936 */ 2937 read_size = PAGE_SIZE; 2938 2939 if ((upl_f_offset + read_size) > oldEOF) 2940 read_size = oldEOF - upl_f_offset; 2941 2942 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, 2943 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 2944 if (retval) { 2945 /* 2946 * we had an error during the read which causes us to abort 2947 * the current cluster_write request... before we do, we need 2948 * to release the rest of the pages in the upl without modifying 2949 * their state and mark the failed page in error 2950 */ 2951 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 2952 2953 if (upl_size > PAGE_SIZE) 2954 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 2955 2956 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2957 upl, 0, 0, retval, 0); 2958 break; 2959 } 2960 } 2961 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) { 2962 /* 2963 * the last offset we're writing to in this upl does not end on a page 2964 * boundary... if it's not beyond the old EOF, then we'll also need to 2965 * pre-read this page in if it isn't already valid 2966 */ 2967 upl_offset = upl_size - PAGE_SIZE; 2968 2969 if ((upl_f_offset + start_offset + io_size) < oldEOF && 2970 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) { 2971 int read_size; 2972 2973 read_size = PAGE_SIZE; 2974 2975 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) 2976 read_size = oldEOF - (upl_f_offset + upl_offset); 2977 2978 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, 2979 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 2980 if (retval) { 2981 /* 2982 * we had an error during the read which causes us to abort 2983 * the current cluster_write request... 
before we do, we 2984 * need to release the rest of the pages in the upl without 2985 * modifying their state and mark the failed page in error 2986 */ 2987 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 2988 2989 if (upl_size > PAGE_SIZE) 2990 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 2991 2992 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2993 upl, 0, 0, retval, 0); 2994 break; 2995 } 2996 } 2997 } 2998 xfer_resid = io_size; 2999 io_offset = start_offset; 3000 3001 while (zero_cnt && xfer_resid) { 3002 3003 if (zero_cnt < (long long)xfer_resid) 3004 bytes_to_zero = zero_cnt; 3005 else 3006 bytes_to_zero = xfer_resid; 3007 3008 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero); 3009 3010 xfer_resid -= bytes_to_zero; 3011 zero_cnt -= bytes_to_zero; 3012 zero_off += bytes_to_zero; 3013 io_offset += bytes_to_zero; 3014 } 3015 if (xfer_resid && io_resid) { 3016 u_int32_t io_requested; 3017 3018 bytes_to_move = min(io_resid, xfer_resid); 3019 io_requested = bytes_to_move; 3020 3021 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested); 3022 3023 if (retval) { 3024 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 3025 3026 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 3027 upl, 0, 0, retval, 0); 3028 } else { 3029 io_resid -= bytes_to_move; 3030 xfer_resid -= bytes_to_move; 3031 io_offset += bytes_to_move; 3032 } 3033 } 3034 while (xfer_resid && zero_cnt1 && retval == 0) { 3035 3036 if (zero_cnt1 < (long long)xfer_resid) 3037 bytes_to_zero = zero_cnt1; 3038 else 3039 bytes_to_zero = xfer_resid; 3040 3041 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero); 3042 3043 xfer_resid -= bytes_to_zero; 3044 zero_cnt1 -= bytes_to_zero; 3045 zero_off1 += bytes_to_zero; 3046 io_offset += bytes_to_zero; 3047 } 3048 if (retval == 0) { 3049 int cl_index; 3050 int ret_cluster_try_push; 3051 3052 io_size += start_offset; 3053 3054 if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) { 3055 /* 3056 * if we're extending the file with this write 3057 * we'll zero fill the rest of the page so that 3058 * if the file gets extended again in such a way as to leave a 3059 * hole starting at this EOF, we'll have zeros in the correct spot 3060 */ 3061 cluster_zero(upl, io_size, upl_size - io_size, NULL); 3062 } 3063 /* 3064 * release the upl now if we hold one since... 3065 * 1) pages in it may be present in the sparse cluster map 3066 * and may span 2 separate buckets there... if they do and 3067 * we happen to have to flush a bucket to make room and it intersects 3068 * this upl, a deadlock may result on page BUSY 3069 * 2) we're delaying the I/O... from this point forward we're just updating 3070 * the cluster state... no need to hold the pages, so commit them 3071 * 3) IO_SYNC is set... 3072 * because we had to ask for a UPL that provides currently non-present pages, the 3073 * UPL has been automatically set to clear the dirty flags (both software and hardware) 3074 * upon committing it... this is not the behavior we want since it's possible for 3075 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. 3076 * we'll pick these pages back up later with the correct behavior specified. 3077 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... 
if a flush 3078 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages 3079 * we hold since the flushing context is holding the cluster lock. 3080 */ 3081 ubc_upl_commit_range(upl, 0, upl_size, 3082 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); 3083check_cluster: 3084 /* 3085 * calculate the last logical block number 3086 * that this delayed I/O encompassed 3087 */ 3088 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); 3089 3090 if (flags & IO_SYNC) { 3091 /* 3092 * if the IO_SYNC flag is set then we need to 3093 * bypass any clusters and immediately issue 3094 * the I/O 3095 */ 3096 goto issue_io; 3097 } 3098 /* 3099 * take the lock to protect our accesses 3100 * to the writebehind and sparse cluster state 3101 */ 3102 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); 3103 3104 if (wbp->cl_scmap) { 3105 3106 if ( !(flags & IO_NOCACHE)) { 3107 /* 3108 * we've fallen into the sparse 3109 * cluster method of delaying dirty pages 3110 */ 3111 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); 3112 3113 lck_mtx_unlock(&wbp->cl_lockw); 3114 3115 continue; 3116 } 3117 /* 3118 * must have done cached writes that fell into 3119 * the sparse cluster mechanism... we've switched 3120 * to uncached writes on the file, so go ahead 3121 * and push whatever's in the sparse map 3122 * and switch back to normal clustering 3123 */ 3124 wbp->cl_number = 0; 3125 3126 sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); 3127 /* 3128 * no clusters of either type present at this point 3129 * so just go directly to start_new_cluster since 3130 * we know we need to delay this I/O since we've 3131 * already released the pages back into the cache 3132 * to avoid the deadlock with sparse_cluster_push 3133 */ 3134 goto start_new_cluster; 3135 } 3136 if (first_pass) { 3137 if (write_off == wbp->cl_last_write) 3138 wbp->cl_seq_written += write_cnt; 3139 else 3140 wbp->cl_seq_written = write_cnt; 3141 3142 wbp->cl_last_write = write_off + write_cnt; 3143 3144 first_pass = FALSE; 3145 } 3146 if (wbp->cl_number == 0) 3147 /* 3148 * no clusters currently present 3149 */ 3150 goto start_new_cluster; 3151 3152 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 3153 /* 3154 * check each cluster that we currently hold 3155 * try to merge some or all of this write into 3156 * one or more of the existing clusters... if 3157 * any portion of the write remains, start a 3158 * new cluster 3159 */ 3160 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) { 3161 /* 3162 * the current write starts at or after the current cluster 3163 */ 3164 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 3165 /* 3166 * we have a write that fits entirely 3167 * within the existing cluster limits 3168 */ 3169 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) 3170 /* 3171 * update our idea of where the cluster ends 3172 */ 3173 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 3174 break; 3175 } 3176 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 3177 /* 3178 * we have a write that starts in the middle of the current cluster 3179 * but extends beyond the cluster's limit... we know this because 3180 * of the previous checks 3181 * we'll extend the current cluster to the max 3182 * and update the b_addr for the current write to reflect that 3183 * the head of it was absorbed into this cluster... 
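* (e.g., with illustrative numbers: a cluster spanning pages [0, 20) and max_cluster_pgcount of 32 sees a write of pages [16, 48)... the cluster is stretched to [0, 32) and the write's b_addr becomes 32, leaving pages [32, 48) as the tail)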
3184 * note that we'll always have a leftover tail in this case since 3185 * full absorption would have occurred in the clause above 3186 */ 3187 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; 3188 3189 cl.b_addr = wbp->cl_clusters[cl_index].e_addr; 3190 } 3191 /* 3192 * we come here for the case where the current write starts 3193 * beyond the limit of the existing cluster or we have a leftover 3194 * tail after a partial absorption 3195 * 3196 * in either case, we'll check the remaining clusters before 3197 * starting a new one 3198 */ 3199 } else { 3200 /* 3201 * the current write starts in front of the cluster we're currently considering 3202 */ 3203 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) { 3204 /* 3205 * we can just merge the new request into 3206 * this cluster and leave it in the cache 3207 * since the resulting cluster is still 3208 * less than the maximum allowable size 3209 */ 3210 wbp->cl_clusters[cl_index].b_addr = cl.b_addr; 3211 3212 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) { 3213 /* 3214 * the current write completely 3215 * envelops the existing cluster and since 3216 * each write is limited to at most max_cluster_pgcount pages 3217 * we can just use the start and last blocknos of the write 3218 * to generate the cluster limits 3219 */ 3220 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 3221 } 3222 break; 3223 } 3224 3225 /* 3226 * if we were to combine this write with the current cluster 3227 * we would exceed the cluster size limit.... so, 3228 * let's see if there's any overlap of the new I/O with 3229 * the cluster we're currently considering... in fact, we'll 3230 * stretch the cluster out to its full limit and see if we 3231 * get an intersection with the current write 3232 * 3233 */ 3234 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { 3235 /* 3236 * the current write extends into the proposed cluster 3237 * clip the length of the current write after first combining its 3238 * tail with the newly shaped cluster 3239 */ 3240 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; 3241 3242 cl.e_addr = wbp->cl_clusters[cl_index].b_addr; 3243 } 3244 /* 3245 * if we get here, there was no way to merge 3246 * any portion of this write with this cluster 3247 * or we could only merge part of it which 3248 * will leave a tail... 3249 * we'll check the remaining clusters before starting a new one 3250 */ 3251 } 3252 } 3253 if (cl_index < wbp->cl_number) 3254 /* 3255 * we found an existing cluster(s) that we 3256 * could entirely merge this I/O into 3257 */ 3258 goto delay_io; 3259 3260 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && 3261 wbp->cl_number == MAX_CLUSTERS && 3262 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { 3263 uint32_t n; 3264 3265 if (vp->v_mount->mnt_kern_flag & MNTK_SSD) 3266 n = WRITE_BEHIND_SSD; 3267 else 3268 n = WRITE_BEHIND; 3269 3270 while (n--) 3271 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); 3272 } 3273 if (wbp->cl_number < MAX_CLUSTERS) { 3274 /* 3275 * we didn't find an existing cluster to 3276 * merge into, but there's room to start 3277 * a new one 3278 */ 3279 goto start_new_cluster; 3280 } 3281 /* 3282 * no existing cluster to merge with and no 3283 * room to start a new one... we'll try 3284 * pushing one of the existing ones... 
if none of 3285 * them are able to be pushed, we'll switch 3286 * to the sparse cluster mechanism 3287 * cluster_try_push updates cl_number to the 3288 * number of remaining clusters... and 3289 * returns the number of currently unused clusters 3290 */ 3291 ret_cluster_try_push = 0; 3292 3293 /* 3294 * if writes are not deferred, call cluster push immediately 3295 */ 3296 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { 3297 3298 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); 3299 } 3300 3301 /* 3302 * execute following regardless of writes being deferred or not 3303 */ 3304 if (ret_cluster_try_push == 0) { 3305 /* 3306 * no more room in the normal cluster mechanism 3307 * so let's switch to the more expansive but expensive 3308 * sparse mechanism.... 3309 */ 3310 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); 3311 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); 3312 3313 lck_mtx_unlock(&wbp->cl_lockw); 3314 3315 continue; 3316 } 3317start_new_cluster: 3318 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; 3319 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; 3320 3321 wbp->cl_clusters[wbp->cl_number].io_flags = 0; 3322 3323 if (flags & IO_NOCACHE) 3324 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE; 3325 3326 if (bflag & CL_PASSIVE) 3327 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE; 3328 3329 wbp->cl_number++; 3330delay_io: 3331 lck_mtx_unlock(&wbp->cl_lockw); 3332 3333 continue; 3334issue_io: 3335 /* 3336 * we don't hold the lock at this point 3337 * 3338 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set 3339 * so that we correctly deal with a change in state of the hardware modify bit... 3340 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force 3341 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also 3342 * responsible for generating the correct sized I/O(s) 3343 */ 3344 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg); 3345 } 3346 } 3347 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0); 3348 3349 return (retval); 3350} 3351 3352 3353 3354int 3355cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags) 3356{ 3357 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL); 3358} 3359 3360 3361int 3362cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg) 3363{ 3364 int retval = 0; 3365 int flags; 3366 user_ssize_t cur_resid; 3367 u_int32_t io_size; 3368 u_int32_t read_length = 0; 3369 int read_type = IO_COPY; 3370 3371 flags = xflags; 3372 3373 if (vp->v_flag & VNOCACHE_DATA) 3374 flags |= IO_NOCACHE; 3375 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) 3376 flags |= IO_RAOFF; 3377 3378 /* 3379 * If we're doing an encrypted IO, then first check to see 3380 * if the IO requested was page aligned. If not, then bail 3381 * out immediately. 3382 */ 3383 if (flags & IO_ENCRYPTED) { 3384 if (read_length & PAGE_MASK) { 3385 retval = EINVAL; 3386 return retval; 3387 } 3388 } 3389 3390 /* 3391 * do a read through the cache if one of the following is true.... 3392 * NOCACHE is not true 3393 * the uio request doesn't target USERSPACE 3394 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. 
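* (in effect the request is steered to the cluster_read_direct path even though the caller never asked for IO_NOCACHE)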
3395 * Reading encrypted data from a CP filesystem should never result in the data touching 3396 * the UBC. 3397 * 3398 * otherwise, find out if we want the direct or contig variant for 3399 * the first vector in the uio request 3400 */ 3401 if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { 3402 3403 boolean_t check_io_type = TRUE; 3404 3405 3406 if (check_io_type) { 3407 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3408 } 3409 } 3410 3411 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { 3412 3413 switch (read_type) { 3414 3415 case IO_COPY: 3416 /* 3417 * make sure the uio_resid isn't too big... 3418 * internally, we want to handle all of the I/O in 3419 * chunk sizes that fit in a 32 bit int 3420 */ 3421 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) 3422 io_size = MAX_IO_REQUEST_SIZE; 3423 else 3424 io_size = (u_int32_t)cur_resid; 3425 3426 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg); 3427 break; 3428 3429 case IO_DIRECT: 3430 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg); 3431 break; 3432 3433 case IO_CONTIG: 3434 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags); 3435 break; 3436 3437 case IO_UNKNOWN: 3438 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3439 break; 3440 } 3441 } 3442 return (retval); 3443} 3444 3445 3446 3447static void 3448cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference) 3449{ 3450 int range; 3451 int abort_flags = UPL_ABORT_FREE_ON_EMPTY; 3452 3453 if ((range = last_pg - start_pg)) { 3454 if (take_reference) 3455 abort_flags |= UPL_ABORT_REFERENCE; 3456 3457 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags); 3458 } 3459} 3460 3461 3462static int 3463cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 3464{ 3465 upl_page_info_t *pl; 3466 upl_t upl; 3467 vm_offset_t upl_offset; 3468 u_int32_t upl_size; 3469 off_t upl_f_offset; 3470 int start_offset; 3471 int start_pg; 3472 int last_pg; 3473 int uio_last = 0; 3474 int pages_in_upl; 3475 off_t max_size; 3476 off_t last_ioread_offset; 3477 off_t last_request_offset; 3478 kern_return_t kret; 3479 int error = 0; 3480 int retval = 0; 3481 u_int32_t size_of_prefetch; 3482 u_int32_t xsize; 3483 u_int32_t io_size; 3484 u_int32_t max_rd_size; 3485 u_int32_t max_io_size; 3486 u_int32_t max_prefetch; 3487 u_int rd_ahead_enabled = 1; 3488 u_int prefetch_enabled = 1; 3489 struct cl_readahead * rap; 3490 struct clios iostate; 3491 struct cl_extent extent; 3492 int bflag; 3493 int take_reference = 1; 3494 int policy = IOPOL_DEFAULT; 3495 boolean_t iolock_inited = FALSE; 3496 3497 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, 3498 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); 3499 3500 if (flags & IO_ENCRYPTED) { 3501 panic ("encrypted blocks will hit UBC!"); 3502 } 3503 3504 policy = throttle_get_io_policy(NULL); 3505 3506 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) 3507 take_reference = 0; 3508 3509 if (flags & IO_PASSIVE) 3510 bflag = CL_PASSIVE; 3511 else 3512 bflag = 0; 3513 3514 if (flags & IO_NOCACHE) 3515 bflag |= CL_NOCACHE; 3516 3517 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 3518 max_prefetch = MAX_PREFETCH(vp, 
max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); 3519 max_rd_size = max_prefetch; 3520 3521 last_request_offset = uio->uio_offset + io_req_size; 3522 3523 if (last_request_offset > filesize) 3524 last_request_offset = filesize; 3525 3526 if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) { 3527 rd_ahead_enabled = 0; 3528 rap = NULL; 3529 } else { 3530 if (cluster_is_throttled(vp)) { 3531 /* 3532 * we're in the throttle window, at the very least 3533 * we want to limit the size of the I/O we're about 3534 * to issue 3535 */ 3536 rd_ahead_enabled = 0; 3537 prefetch_enabled = 0; 3538 3539 max_rd_size = THROTTLE_MAX_IOSIZE; 3540 } 3541 if ((rap = cluster_get_rap(vp)) == NULL) 3542 rd_ahead_enabled = 0; 3543 else { 3544 extent.b_addr = uio->uio_offset / PAGE_SIZE_64; 3545 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; 3546 } 3547 } 3548 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) { 3549 /* 3550 * determine if we already have a read-ahead in the pipe courtesy of the 3551 * last read system call that was issued... 3552 * if so, pick up its extent to determine where we should start 3553 * with respect to any read-ahead that might be necessary to 3554 * garner all the data needed to complete this read system call 3555 */ 3556 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64; 3557 3558 if (last_ioread_offset < uio->uio_offset) 3559 last_ioread_offset = (off_t)0; 3560 else if (last_ioread_offset > last_request_offset) 3561 last_ioread_offset = last_request_offset; 3562 } else 3563 last_ioread_offset = (off_t)0; 3564 3565 while (io_req_size && uio->uio_offset < filesize && retval == 0) { 3566 3567 max_size = filesize - uio->uio_offset; 3568 3569 if ((off_t)(io_req_size) < max_size) 3570 io_size = io_req_size; 3571 else 3572 io_size = max_size; 3573 3574 if (!(flags & IO_NOCACHE)) { 3575 3576 while (io_size) { 3577 u_int32_t io_resid; 3578 u_int32_t io_requested; 3579 3580 /* 3581 * if we keep finding the pages we need already in the cache, then 3582 * don't bother to call cluster_read_prefetch since it costs CPU cycles 3583 * to determine that we have all the pages we need... once we miss in 3584 * the cache and have issued an I/O, then we'll assume that we're likely 3585 * to continue to miss in the cache and it's to our advantage to try and prefetch 3586 */ 3587 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) { 3588 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) { 3589 /* 3590 * we've already issued I/O for this request and 3591 * there's still work to do and 3592 * our prefetch stream is running dry, so issue a 3593 * pre-fetch I/O... 
the I/O latency will overlap 3594 * with the copying of the data 3595 */ 3596 if (size_of_prefetch > max_rd_size) 3597 size_of_prefetch = max_rd_size; 3598 3599 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); 3600 3601 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); 3602 3603 if (last_ioread_offset > last_request_offset) 3604 last_ioread_offset = last_request_offset; 3605 } 3606 } 3607 /* 3608 * limit the size of the copy we're about to do so that 3609 * we can notice that our I/O pipe is running dry and 3610 * get the next I/O issued before it does go dry 3611 */ 3612 if (last_ioread_offset && io_size > (max_io_size / 4)) 3613 io_resid = (max_io_size / 4); 3614 else 3615 io_resid = io_size; 3616 3617 io_requested = io_resid; 3618 3619 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); 3620 3621 xsize = io_requested - io_resid; 3622 3623 io_size -= xsize; 3624 io_req_size -= xsize; 3625 3626 if (retval || io_resid) 3627 /* 3628 * if we run into a real error or 3629 * a page that is not in the cache 3630 * we need to leave streaming mode 3631 */ 3632 break; 3633 3634 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) { 3635 /* 3636 * we've already finished the I/O for this read request 3637 * let's see if we should do a read-ahead 3638 */ 3639 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3640 } 3641 } 3642 if (retval) 3643 break; 3644 if (io_size == 0) { 3645 if (rap != NULL) { 3646 if (extent.e_addr < rap->cl_lastr) 3647 rap->cl_maxra = 0; 3648 rap->cl_lastr = extent.e_addr; 3649 } 3650 break; 3651 } 3652 /* 3653 * recompute max_size since cluster_copy_ubc_data_internal 3654 * may have advanced uio->uio_offset 3655 */ 3656 max_size = filesize - uio->uio_offset; 3657 } 3658 3659 iostate.io_completed = 0; 3660 iostate.io_issued = 0; 3661 iostate.io_error = 0; 3662 iostate.io_wanted = 0; 3663 3664 if ( (flags & IO_RETURN_ON_THROTTLE) ) { 3665 if (cluster_is_throttled(vp) == THROTTLE_NOW) { 3666 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { 3667 /* 3668 * we're in the throttle window and at least 1 I/O 3669 * has already been issued by a throttleable thread 3670 * in this window, so return with EAGAIN to indicate 3671 * to the FS issuing the cluster_read call that it 3672 * should now throttle after dropping any locks 3673 */ 3674 throttle_info_update_by_mount(vp->v_mount); 3675 3676 retval = EAGAIN; 3677 break; 3678 } 3679 } 3680 } 3681 3682 /* 3683 * compute the size of the upl needed to encompass 3684 * the requested read... limit each call to cluster_io 3685 * to the maximum UPL size... 
cluster_io will clip if 3686 * this exceeds the maximum io_size for the device, 3687 * make sure to account for 3688 * a starting offset that's not page aligned 3689 */ 3690 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 3691 upl_f_offset = uio->uio_offset - (off_t)start_offset; 3692 3693 if (io_size > max_rd_size) 3694 io_size = max_rd_size; 3695 3696 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 3697 3698 if (flags & IO_NOCACHE) { 3699 if (upl_size > max_io_size) 3700 upl_size = max_io_size; 3701 } else { 3702 if (upl_size > max_io_size / 4) 3703 upl_size = max_io_size / 4; 3704 } 3705 pages_in_upl = upl_size / PAGE_SIZE; 3706 3707 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START, 3708 upl, (int)upl_f_offset, upl_size, start_offset, 0); 3709 3710 kret = ubc_create_upl(vp, 3711 upl_f_offset, 3712 upl_size, 3713 &upl, 3714 &pl, 3715 UPL_FILE_IO | UPL_SET_LITE); 3716 if (kret != KERN_SUCCESS) 3717 panic("cluster_read_copy: failed to get pagelist"); 3718 3719 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END, 3720 upl, (int)upl_f_offset, upl_size, start_offset, 0); 3721 3722 /* 3723 * scan from the beginning of the upl looking for the first 3724 * non-valid page.... this will become the first page in 3725 * the request we're going to make to 'cluster_io'... if all 3726 * of the pages are valid, we won't call through to 'cluster_io' 3727 */ 3728 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) { 3729 if (!upl_valid_page(pl, start_pg)) 3730 break; 3731 } 3732 3733 /* 3734 * scan from the starting invalid page looking for a valid 3735 * page before the end of the upl is reached, if we 3736 * find one, then it will be the last page of the request to 3737 * 'cluster_io' 3738 */ 3739 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 3740 if (upl_valid_page(pl, last_pg)) 3741 break; 3742 } 3743 3744 if (start_pg < last_pg) { 3745 /* 3746 * we found a range of 'invalid' pages that must be filled 3747 * if the last page in this range is the last page of the file 3748 * we may have to clip the size of it to keep from reading past 3749 * the end of the last physical block associated with the file 3750 */ 3751 if (iolock_inited == FALSE) { 3752 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 3753 3754 iolock_inited = TRUE; 3755 } 3756 upl_offset = start_pg * PAGE_SIZE; 3757 io_size = (last_pg - start_pg) * PAGE_SIZE; 3758 3759 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) 3760 io_size = filesize - (upl_f_offset + upl_offset); 3761 3762 /* 3763 * issue an asynchronous read to cluster_io 3764 */ 3765 3766 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, 3767 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); 3768 3769 if (rap) { 3770 if (extent.e_addr < rap->cl_maxra) { 3771 /* 3772 * we've just issued a read for a block that should have been 3773 * in the cache courtesy of the read-ahead engine... something 3774 * has gone wrong with the pipeline, so reset the read-ahead 3775 * logic which will cause us to restart from scratch 3776 */ 3777 rap->cl_maxra = 0; 3778 } 3779 } 3780 } 3781 if (error == 0) { 3782 /* 3783 * if the read completed successfully, or there was no I/O request 3784 * issued, then copy the data into user land via 'cluster_copy_upl_data' 3785 * we'll first add on any 'valid' 3786 * pages that were present in the upl when we acquired it.
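 * (illustrative walk-through, not from the original source: assuming 4K
 * pages, if this upl held 8 pages, pages 0..1 were valid when we got it,
 * the read just issued filled pages 2..5, and pages 6..7 also happened
 * to be valid, the scan below advances uio_last to 8 so that the copy
 * can hand back all 8 pages worth of data in a single pass)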
3787 */ 3788 u_int val_size; 3789 3790 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) { 3791 if (!upl_valid_page(pl, uio_last)) 3792 break; 3793 } 3794 if (uio_last < pages_in_upl) { 3795 /* 3796 * there were some invalid pages beyond the valid pages 3797 * that we didn't issue an I/O for, just release them 3798 * unchanged now, so that any prefetch/read-ahead can 3799 * include them 3800 */ 3801 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE, 3802 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); 3803 } 3804 3805 /* 3806 * compute size to transfer this round, if io_req_size is 3807 * still non-zero after this attempt, we'll loop around and 3808 * set up for another I/O. 3809 */ 3810 val_size = (uio_last * PAGE_SIZE) - start_offset; 3811 3812 if (val_size > max_size) 3813 val_size = max_size; 3814 3815 if (val_size > io_req_size) 3816 val_size = io_req_size; 3817 3818 if ((uio->uio_offset + val_size) > last_ioread_offset) 3819 last_ioread_offset = uio->uio_offset + val_size; 3820 3821 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) { 3822 3823 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) { 3824 /* 3825 * if there's still I/O left to do for this request, and... 3826 * we're not in hard throttle mode, and... 3827 * we're close to using up the previous prefetch, then issue a 3828 * new pre-fetch I/O... the I/O latency will overlap 3829 * with the copying of the data 3830 */ 3831 if (size_of_prefetch > max_rd_size) 3832 size_of_prefetch = max_rd_size; 3833 3834 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); 3835 3836 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); 3837 3838 if (last_ioread_offset > last_request_offset) 3839 last_ioread_offset = last_request_offset; 3840 } 3841 3842 } else if ((uio->uio_offset + val_size) == last_request_offset) { 3843 /* 3844 * this transfer will finish this request, so...
3845 * let's try to read ahead if we're in 3846 * a sequential access pattern and we haven't 3847 * explicitly disabled it 3848 */ 3849 if (rd_ahead_enabled) 3850 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3851 3852 if (rap != NULL) { 3853 if (extent.e_addr < rap->cl_lastr) 3854 rap->cl_maxra = 0; 3855 rap->cl_lastr = extent.e_addr; 3856 } 3857 } 3858 if (iostate.io_issued > iostate.io_completed) 3859 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3860 3861 if (iostate.io_error) 3862 error = iostate.io_error; 3863 else { 3864 u_int32_t io_requested; 3865 3866 io_requested = val_size; 3867 3868 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested); 3869 3870 io_req_size -= (val_size - io_requested); 3871 } 3872 } else { 3873 if (iostate.io_issued > iostate.io_completed) 3874 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3875 } 3876 if (start_pg < last_pg) { 3877 /* 3878 * compute the range of pages that we actually issued an I/O for 3879 * and either commit them as valid if the I/O succeeded 3880 * or abort them if the I/O failed or we're not supposed to 3881 * keep them in the cache 3882 */ 3883 io_size = (last_pg - start_pg) * PAGE_SIZE; 3884 3885 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0); 3886 3887 if (error || (flags & IO_NOCACHE)) 3888 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size, 3889 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 3890 else { 3891 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY; 3892 3893 if (take_reference) 3894 commit_flags |= UPL_COMMIT_INACTIVATE; 3895 else 3896 commit_flags |= UPL_COMMIT_SPECULATE; 3897 3898 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags); 3899 } 3900 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0); 3901 } 3902 if ((last_pg - start_pg) < pages_in_upl) { 3903 /* 3904 * the set of pages that we issued an I/O for did not encompass 3905 * the entire upl... so just release these without modifying 3906 * their state 3907 */ 3908 if (error) 3909 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 3910 else { 3911 3912 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, 3913 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0); 3914 3915 /* 3916 * handle any valid pages at the beginning of 3917 * the upl... release these appropriately 3918 */ 3919 cluster_read_upl_release(upl, 0, start_pg, take_reference); 3920 3921 /* 3922 * handle any valid pages immediately after the 3923 * pages we issued I/O for... ... 
release these appropriately 3924 */ 3925 cluster_read_upl_release(upl, last_pg, uio_last, take_reference); 3926 3927 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0); 3928 } 3929 } 3930 if (retval == 0) 3931 retval = error; 3932 3933 if (io_req_size) { 3934 if (cluster_is_throttled(vp)) { 3935 /* 3936 * we're in the throttle window, at the very least 3937 * we want to limit the size of the I/O we're about 3938 * to issue 3939 */ 3940 rd_ahead_enabled = 0; 3941 prefetch_enabled = 0; 3942 max_rd_size = THROTTLE_MAX_IOSIZE; 3943 } else { 3944 if (max_rd_size == THROTTLE_MAX_IOSIZE) { 3945 /* 3946 * coming out of throttled state 3947 */ 3948 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) { 3949 if (rap != NULL) 3950 rd_ahead_enabled = 1; 3951 prefetch_enabled = 1; 3952 } 3953 max_rd_size = max_prefetch; 3954 last_ioread_offset = 0; 3955 } 3956 } 3957 } 3958 } 3959 if (iolock_inited == TRUE) { 3960 if (iostate.io_issued > iostate.io_completed) { 3961 /* 3962 * cluster_io returned an error after it 3963 * had already issued some I/O. we need 3964 * to wait for that I/O to complete before 3965 * we can destroy the iostate mutex... 3966 * 'retval' already contains the early error 3967 * so no need to pick it up from iostate.io_error 3968 */ 3969 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3970 } 3971 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 3972 } 3973 if (rap != NULL) { 3974 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 3975 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); 3976 3977 lck_mtx_unlock(&rap->cl_lockr); 3978 } else { 3979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 3980 (int)uio->uio_offset, io_req_size, 0, retval, 0); 3981 } 3982 3983 return (retval); 3984} 3985 3986 3987static int 3988cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 3989 int flags, int (*callback)(buf_t, void *), void *callback_arg) 3990{ 3991 upl_t upl; 3992 upl_page_info_t *pl; 3993 off_t max_io_size; 3994 vm_offset_t upl_offset, vector_upl_offset = 0; 3995 upl_size_t upl_size, vector_upl_size = 0; 3996 vm_size_t upl_needed_size; 3997 unsigned int pages_in_pl; 3998 int upl_flags; 3999 kern_return_t kret; 4000 unsigned int i; 4001 int force_data_sync; 4002 int retval = 0; 4003 int no_zero_fill = 0; 4004 int io_flag = 0; 4005 int misaligned = 0; 4006 struct clios iostate; 4007 user_addr_t iov_base; 4008 u_int32_t io_req_size; 4009 u_int32_t offset_in_file; 4010 u_int32_t offset_in_iovbase; 4011 u_int32_t io_size; 4012 u_int32_t io_min; 4013 u_int32_t xsize; 4014 u_int32_t devblocksize; 4015 u_int32_t mem_alignment_mask; 4016 u_int32_t max_upl_size; 4017 u_int32_t max_rd_size; 4018 u_int32_t max_rd_ahead; 4019 u_int32_t max_vector_size; 4020 boolean_t strict_uncached_IO = FALSE; 4021 boolean_t io_throttled = FALSE; 4022 4023 u_int32_t vector_upl_iosize = 0; 4024 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); 4025 off_t v_upl_uio_offset = 0; 4026 int vector_upl_index=0; 4027 upl_t vector_upl = NULL; 4028 4029 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, 4030 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 4031 4032 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); 4033 4034 max_rd_size = max_upl_size; 4035 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); 4036 4037 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; 4038 4039 if (flags & IO_PASSIVE) 4040 io_flag |= CL_PASSIVE; 4041 4042 if (flags & 
IO_ENCRYPTED) { 4043 io_flag |= CL_RAW_ENCRYPTED; 4044 } 4045 4046 if (flags & IO_NOCACHE) { 4047 io_flag |= CL_NOCACHE; 4048 } 4049 4050 iostate.io_completed = 0; 4051 iostate.io_issued = 0; 4052 iostate.io_error = 0; 4053 iostate.io_wanted = 0; 4054 4055 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 4056 4057 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 4058 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 4059 4060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 4061 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0); 4062 4063 if (devblocksize == 1) { 4064 /* 4065 * the AFP client advertises a devblocksize of 1 4066 * however, its BLOCKMAP routine maps to physical 4067 * blocks that are PAGE_SIZE in size... 4068 * therefore we can't ask for I/Os that aren't page aligned 4069 * or aren't multiples of PAGE_SIZE in size 4070 * by setting devblocksize to PAGE_SIZE, we re-instate 4071 * the old behavior we had before the mem_alignment_mask 4072 * changes went in... 4073 */ 4074 devblocksize = PAGE_SIZE; 4075 } 4076 4077 strict_uncached_IO = ubc_strict_uncached_IO(vp); 4078 4079next_dread: 4080 io_req_size = *read_length; 4081 iov_base = uio_curriovbase(uio); 4082 4083 max_io_size = filesize - uio->uio_offset; 4084 4085 if ((off_t)io_req_size > max_io_size) 4086 io_req_size = max_io_size; 4087 4088 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1); 4089 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 4090 4091 if (offset_in_file || offset_in_iovbase) { 4092 /* 4093 * one of the 2 important offsets is misaligned 4094 * so fire an I/O through the cache for this entire vector 4095 */ 4096 misaligned = 1; 4097 } 4098 if (iov_base & (devblocksize - 1)) { 4099 /* 4100 * the offset in memory must be on a device block boundary 4101 * so that we can guarantee that we can generate an 4102 * I/O that ends on a page boundary in cluster_io 4103 */ 4104 misaligned = 1; 4105 } 4106 4107 /* 4108 * The user must request IO in aligned chunks. If the 4109 * offset into the file is bad, or the userland pointer 4110 * is non-aligned, then we cannot service the encrypted IO request. 4111 */ 4112 if ((flags & IO_ENCRYPTED) && (misaligned)) { 4113 retval = EINVAL; 4114 } 4115 4116 /* 4117 * When we get to this point, we know... 4118 * -- the offset into the file is on a devblocksize boundary 4119 */ 4120 4121 while (io_req_size && retval == 0) { 4122 u_int32_t io_start; 4123 4124 if (cluster_is_throttled(vp)) { 4125 /* 4126 * we're in the throttle window, at the very least 4127 * we want to limit the size of the I/O we're about 4128 * to issue 4129 */ 4130 max_rd_size = THROTTLE_MAX_IOSIZE; 4131 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; 4132 max_vector_size = THROTTLE_MAX_IOSIZE; 4133 } else { 4134 max_rd_size = max_upl_size; 4135 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); 4136 max_vector_size = MAX_VECTOR_UPL_SIZE; 4137 } 4138 io_start = io_size = io_req_size; 4139 4140 /* 4141 * First look for pages already in the cache 4142 * and move them to user space. But only do this 4143 * check if we are not retrieving encrypted data directly 4144 * from the filesystem; those blocks should never 4145 * be in the UBC. 
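 * (hedged example of the flow here, not in the original: given an
 * io_req_size of 64K whose first 16K is already resident in the UBC,
 * the copy below comes back with io_size == 48K, so xsize == 16K gets
 * charged against io_req_size and only the uncached 48K remains for
 * the direct path)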
4146 * 4147 * cluster_copy_ubc_data returns the resid 4148 * in io_size 4149 */ 4150 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { 4151 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); 4152 } 4153 /* 4154 * calculate the number of bytes actually copied 4155 * starting size - residual 4156 */ 4157 xsize = io_start - io_size; 4158 4159 io_req_size -= xsize; 4160 4161 if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) { 4162 /* 4163 * We found something in the cache or we have an iov_base that's not 4164 * page-aligned. 4165 * 4166 * Issue all I/O's that have been collected within this Vectored UPL. 4167 */ 4168 if(vector_upl_index) { 4169 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4170 reset_vector_run_state(); 4171 } 4172 4173 if(xsize) 4174 useVectorUPL = 0; 4175 4176 /* 4177 * After this point, if we are using the Vector UPL path and the base is 4178 * not page-aligned then the UPL with that base will be the first in the vector UPL. 4179 */ 4180 } 4181 4182 /* 4183 * check to see if we are finished with this request. 4184 * 4185 * If we satisfied this IO already, then io_req_size will be 0. 4186 * Otherwise, see if the IO was mis-aligned and needs to go through 4187 * the UBC to deal with the 'tail'. 4188 * 4189 */ 4190 if (io_req_size == 0 || (misaligned)) { 4191 /* 4192 * see if there's another uio vector to 4193 * process that's of type IO_DIRECT 4194 * 4195 * break out of while loop to get there 4196 */ 4197 break; 4198 } 4199 /* 4200 * assume the request ends on a device block boundary 4201 */ 4202 io_min = devblocksize; 4203 4204 /* 4205 * we can handle I/O's in multiples of the device block size 4206 * however, if io_size isn't a multiple of devblocksize we 4207 * want to clip it back to the nearest page boundary since 4208 * we are going to have to go through cluster_read_copy to 4209 * deal with the 'overhang'... by clipping it to a PAGE_SIZE 4210 * multiple, we avoid asking the drive for the same physical 4211 * blocks twice.. once for the partial page at the end of the 4212 * request and a 2nd time for the page we read into the cache 4213 * (which overlaps the end of the direct read) in order to 4214 * get at the overhang bytes 4215 */ 4216 if (io_size & (devblocksize - 1)) { 4217 if (flags & IO_ENCRYPTED) { 4218 /* 4219 * Normally, we'd round down to the previous page boundary to 4220 * let the UBC manage the zero-filling of the file past the EOF. 4221 * But if we're doing encrypted IO, we can't let any of 4222 * the data hit the UBC. This means we have to do the full 4223 * IO to the upper block boundary of the device block that 4224 * contains the EOF. The user will be responsible for not 4225 * interpreting data PAST the EOF in its buffer. 4226 * 4227 * So just bump the IO back up to a multiple of devblocksize 4228 */ 4229 io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); 4230 io_min = io_size; 4231 } 4232 else { 4233 /* 4234 * Clip the request to the previous page size boundary 4235 * since request does NOT end on a device block boundary 4236 */ 4237 io_size &= ~PAGE_MASK; 4238 io_min = PAGE_SIZE; 4239 } 4240 4241 } 4242 if (retval || io_size < io_min) { 4243 /* 4244 * either an error or we only have the tail left to 4245 * complete via the copy path... 4246 * we may have already spun some portion of this request 4247 * off as async requests... 
we need to wait for the I/O 4248 * to complete before returning 4249 */ 4250 goto wait_for_dreads; 4251 } 4252 4253 /* 4254 * Don't re-check the UBC data if we are looking for uncached IO 4255 * or asking for encrypted blocks. 4256 */ 4257 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { 4258 4259 if ((xsize = io_size) > max_rd_size) 4260 xsize = max_rd_size; 4261 4262 io_size = 0; 4263 4264 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); 4265 4266 if (io_size == 0) { 4267 /* 4268 * a page must have just come into the cache 4269 * since the first page in this range is no 4270 * longer absent, go back and re-evaluate 4271 */ 4272 continue; 4273 } 4274 } 4275 if ( (flags & IO_RETURN_ON_THROTTLE) ) { 4276 if (cluster_is_throttled(vp) == THROTTLE_NOW) { 4277 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { 4278 /* 4279 * we're in the throttle window and at least 1 I/O 4280 * has already been issued by a throttleable thread 4281 * in this window, so return with EAGAIN to indicate 4282 * to the FS issuing the cluster_read call that it 4283 * should now throttle after dropping any locks 4284 */ 4285 throttle_info_update_by_mount(vp->v_mount); 4286 4287 io_throttled = TRUE; 4288 goto wait_for_dreads; 4289 } 4290 } 4291 } 4292 if (io_size > max_rd_size) 4293 io_size = max_rd_size; 4294 4295 iov_base = uio_curriovbase(uio); 4296 4297 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 4298 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 4299 4300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, 4301 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 4302 4303 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) 4304 no_zero_fill = 1; 4305 else 4306 no_zero_fill = 0; 4307 4308 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 4309 pages_in_pl = 0; 4310 upl_size = upl_needed_size; 4311 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 4312 4313 if (no_zero_fill) 4314 upl_flags |= UPL_NOZEROFILL; 4315 if (force_data_sync) 4316 upl_flags |= UPL_FORCE_DATA_SYNC; 4317 4318 kret = vm_map_create_upl(current_map(), 4319 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4320 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); 4321 4322 if (kret != KERN_SUCCESS) { 4323 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4324 (int)upl_offset, upl_size, io_size, kret, 0); 4325 /* 4326 * failed to get pagelist 4327 * 4328 * we may have already spun some portion of this request 4329 * off as async requests... we need to wait for the I/O 4330 * to complete before returning 4331 */ 4332 goto wait_for_dreads; 4333 } 4334 pages_in_pl = upl_size / PAGE_SIZE; 4335 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 4336 4337 for (i = 0; i < pages_in_pl; i++) { 4338 if (!upl_page_present(pl, i)) 4339 break; 4340 } 4341 if (i == pages_in_pl) 4342 break; 4343 4344 ubc_upl_abort(upl, 0); 4345 } 4346 if (force_data_sync >= 3) { 4347 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4348 (int)upl_offset, upl_size, io_size, kret, 0); 4349 4350 goto wait_for_dreads; 4351 } 4352 /* 4353 * Consider the possibility that upl_size wasn't satisfied. 
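 * (for example: if we asked to wire 256K of the user's buffer and only
 * the first 64K came back, a page-aligned start (upl_offset == 0) still
 * lets us issue a 64K direct read... a shortfall on a buffer that begins
 * mid-page is unusable here, so io_size gets forced to 0 and we go wait
 * for the reads already in flight)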
4354 */ 4355 if (upl_size < upl_needed_size) { 4356 if (upl_size && upl_offset == 0) 4357 io_size = upl_size; 4358 else 4359 io_size = 0; 4360 } 4361 if (io_size == 0) { 4362 ubc_upl_abort(upl, 0); 4363 goto wait_for_dreads; 4364 } 4365 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4366 (int)upl_offset, upl_size, io_size, kret, 0); 4367 4368 if(useVectorUPL) { 4369 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); 4370 if(end_off) 4371 issueVectorUPL = 1; 4372 /* 4373 * After this point, if we are using a vector UPL, then 4374 * either all the UPL elements end on a page boundary OR 4375 * this UPL is the last element because it does not end 4376 * on a page boundary. 4377 */ 4378 } 4379 4380 /* 4381 * request asynchronously so that we can overlap 4382 * the preparation of the next I/O 4383 * if there are already too many outstanding reads 4384 * wait until some have completed before issuing the next read 4385 */ 4386 if (iostate.io_issued > iostate.io_completed) 4387 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); 4388 4389 if (iostate.io_error) { 4390 /* 4391 * one of the earlier reads we issued ran into a hard error 4392 * don't issue any more reads, cleanup the UPL 4393 * that was just created but not used, then 4394 * go wait for any other reads to complete before 4395 * returning the error to the caller 4396 */ 4397 ubc_upl_abort(upl, 0); 4398 4399 goto wait_for_dreads; 4400 } 4401 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, 4402 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); 4403 4404 4405 if(!useVectorUPL) { 4406 if (no_zero_fill) 4407 io_flag &= ~CL_PRESERVE; 4408 else 4409 io_flag |= CL_PRESERVE; 4410 4411 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4412 4413 } else { 4414 4415 if(!vector_upl_index) { 4416 vector_upl = vector_upl_create(upl_offset); 4417 v_upl_uio_offset = uio->uio_offset; 4418 vector_upl_offset = upl_offset; 4419 } 4420 4421 vector_upl_set_subupl(vector_upl,upl, upl_size); 4422 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); 4423 vector_upl_index++; 4424 vector_upl_size += upl_size; 4425 vector_upl_iosize += io_size; 4426 4427 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { 4428 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4429 reset_vector_run_state(); 4430 } 4431 } 4432 /* 4433 * update the uio structure 4434 */ 4435 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { 4436 uio_update(uio, (user_size_t)max_io_size); 4437 } 4438 else { 4439 uio_update(uio, (user_size_t)io_size); 4440 } 4441 /* 4442 * Under normal circumstances, the io_size should not be 4443 * bigger than the io_req_size, but we may have had to round up 4444 * to the end of the page in the encrypted IO case. In that case only, 4445 * ensure that we only decrement io_req_size to 0. 
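 * (worked example, assuming a 4K devblocksize: with only 1000 bytes of
 * the request left at EOF, the encrypted case above rounded io_size up
 * to 4096... io_req_size is unsigned, so subtracting 4096 from 1000
 * would wrap, hence the explicit clamp to 0 below)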
4446 */ 4447 if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { 4448 io_req_size = 0; 4449 } 4450 else { 4451 io_req_size -= io_size; 4452 } 4453 4454 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, 4455 upl, (int)uio->uio_offset, io_req_size, retval, 0); 4456 4457 } /* end while */ 4458 4459 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) { 4460 4461 retval = cluster_io_type(uio, read_type, read_length, 0); 4462 4463 if (retval == 0 && *read_type == IO_DIRECT) { 4464 4465 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 4466 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 4467 4468 goto next_dread; 4469 } 4470 } 4471 4472wait_for_dreads: 4473 4474 if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { 4475 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4476 reset_vector_run_state(); 4477 } 4478 /* 4479 * make sure all async reads that are part of this stream 4480 * have completed before we return 4481 */ 4482 if (iostate.io_issued > iostate.io_completed) 4483 cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); 4484 4485 if (iostate.io_error) 4486 retval = iostate.io_error; 4487 4488 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 4489 4490 if (io_throttled == TRUE && retval == 0) 4491 retval = EAGAIN; 4492 4493 if (io_req_size && retval == 0) { 4494 /* 4495 * we couldn't handle the tail of this request in DIRECT mode 4496 * so fire it through the copy path 4497 */ 4498 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg); 4499 4500 *read_type = IO_UNKNOWN; 4501 } 4502 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, 4503 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0); 4504 4505 return (retval); 4506} 4507 4508 4509static int 4510cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 4511 int (*callback)(buf_t, void *), void *callback_arg, int flags) 4512{ 4513 upl_page_info_t *pl; 4514 upl_t upl[MAX_VECTS]; 4515 vm_offset_t upl_offset; 4516 addr64_t dst_paddr = 0; 4517 user_addr_t iov_base; 4518 off_t max_size; 4519 upl_size_t upl_size; 4520 vm_size_t upl_needed_size; 4521 mach_msg_type_number_t pages_in_pl; 4522 int upl_flags; 4523 kern_return_t kret; 4524 struct clios iostate; 4525 int error= 0; 4526 int cur_upl = 0; 4527 int num_upl = 0; 4528 int n; 4529 u_int32_t xsize; 4530 u_int32_t io_size; 4531 u_int32_t devblocksize; 4532 u_int32_t mem_alignment_mask; 4533 u_int32_t tail_size = 0; 4534 int bflag; 4535 4536 if (flags & IO_PASSIVE) 4537 bflag = CL_PASSIVE; 4538 else 4539 bflag = 0; 4540 4541 if (flags & IO_NOCACHE) 4542 bflag |= CL_NOCACHE; 4543 4544 /* 4545 * When we enter this routine, we know 4546 * -- the read_length will not exceed the current iov_len 4547 * -- the target address is physically contiguous for read_length 4548 */ 4549 cluster_syncup(vp, filesize, callback, callback_arg); 4550 4551 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 4552 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 4553 4554 iostate.io_completed = 0; 4555 iostate.io_issued = 0; 4556 iostate.io_error = 0; 4557 iostate.io_wanted = 0; 4558 4559 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 4560 4561next_cread: 4562 io_size = *read_length; 4563 4564 max_size = filesize - uio->uio_offset; 4565 4566 if (io_size > max_size) 4567 io_size = 
max_size; 4568 4569 iov_base = uio_curriovbase(uio); 4570 4571 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 4572 upl_needed_size = upl_offset + io_size; 4573 4574 pages_in_pl = 0; 4575 upl_size = upl_needed_size; 4576 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 4577 4578 4579 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START, 4580 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0); 4581 4582 kret = vm_map_get_upl(current_map(), 4583 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4584 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 4585 4586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END, 4587 (int)upl_offset, upl_size, io_size, kret, 0); 4588 4589 if (kret != KERN_SUCCESS) { 4590 /* 4591 * failed to get pagelist 4592 */ 4593 error = EINVAL; 4594 goto wait_for_creads; 4595 } 4596 num_upl++; 4597 4598 if (upl_size < upl_needed_size) { 4599 /* 4600 * The upl_size wasn't satisfied. 4601 */ 4602 error = EINVAL; 4603 goto wait_for_creads; 4604 } 4605 pl = ubc_upl_pageinfo(upl[cur_upl]); 4606 4607 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; 4608 4609 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 4610 u_int32_t head_size; 4611 4612 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 4613 4614 if (head_size > io_size) 4615 head_size = io_size; 4616 4617 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg); 4618 4619 if (error) 4620 goto wait_for_creads; 4621 4622 upl_offset += head_size; 4623 dst_paddr += head_size; 4624 io_size -= head_size; 4625 4626 iov_base += head_size; 4627 } 4628 if ((u_int32_t)iov_base & mem_alignment_mask) { 4629 /* 4630 * request doesn't set up on a memory boundary 4631 * the underlying DMA engine can handle... 4632 * return an error instead of going through 4633 * the slow copy path since the intent of this 4634 * path is direct I/O to device memory 4635 */ 4636 error = EINVAL; 4637 goto wait_for_creads; 4638 } 4639 4640 tail_size = io_size & (devblocksize - 1); 4641 4642 io_size -= tail_size; 4643 4644 while (io_size && error == 0) { 4645 4646 if (io_size > MAX_IO_CONTIG_SIZE) 4647 xsize = MAX_IO_CONTIG_SIZE; 4648 else 4649 xsize = io_size; 4650 /* 4651 * request asynchronously so that we can overlap 4652 * the preparation of the next I/O... we'll do 4653 * the commit after all the I/O has completed 4654 * since its all issued against the same UPL 4655 * if there are already too many outstanding reads 4656 * wait until some have completed before issuing the next 4657 */ 4658 if (iostate.io_issued > iostate.io_completed) 4659 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); 4660 4661 if (iostate.io_error) { 4662 /* 4663 * one of the earlier reads we issued ran into a hard error 4664 * don't issue any more reads... 
4665 * go wait for any other reads to complete before 4666 * returning the error to the caller 4667 */ 4668 goto wait_for_creads; 4669 } 4670 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize, 4671 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag, 4672 (buf_t)NULL, &iostate, callback, callback_arg); 4673 /* 4674 * The cluster_io read was issued successfully, 4675 * update the uio structure 4676 */ 4677 if (error == 0) { 4678 uio_update(uio, (user_size_t)xsize); 4679 4680 dst_paddr += xsize; 4681 upl_offset += xsize; 4682 io_size -= xsize; 4683 } 4684 } 4685 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) { 4686 4687 error = cluster_io_type(uio, read_type, read_length, 0); 4688 4689 if (error == 0 && *read_type == IO_CONTIG) { 4690 cur_upl++; 4691 goto next_cread; 4692 } 4693 } else 4694 *read_type = IO_UNKNOWN; 4695 4696wait_for_creads: 4697 /* 4698 * make sure all async reads that are part of this stream 4699 * have completed before we proceed 4700 */ 4701 if (iostate.io_issued > iostate.io_completed) 4702 cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); 4703 4704 if (iostate.io_error) 4705 error = iostate.io_error; 4706 4707 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 4708 4709 if (error == 0 && tail_size) 4710 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); 4711 4712 for (n = 0; n < num_upl; n++) 4713 /* 4714 * just release our hold on each physically contiguous 4715 * region without changing any state 4716 */ 4717 ubc_upl_abort(upl[n], 0); 4718 4719 return (error); 4720} 4721 4722 4723static int 4724cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length) 4725{ 4726 user_size_t iov_len; 4727 user_addr_t iov_base = 0; 4728 upl_t upl; 4729 upl_size_t upl_size; 4730 int upl_flags; 4731 int retval = 0; 4732 4733 /* 4734 * skip over any empty vectors 4735 */ 4736 uio_update(uio, (user_size_t)0); 4737 4738 iov_len = uio_curriovlen(uio); 4739 4740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0); 4741 4742 if (iov_len) { 4743 iov_base = uio_curriovbase(uio); 4744 /* 4745 * make sure the size of the vector isn't too big...
4746 * internally, we want to handle all of the I/O in 4747 * chunk sizes that fit in a 32 bit int 4748 */ 4749 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) 4750 upl_size = MAX_IO_REQUEST_SIZE; 4751 else 4752 upl_size = (u_int32_t)iov_len; 4753 4754 upl_flags = UPL_QUERY_OBJECT_TYPE; 4755 4756 if ((vm_map_get_upl(current_map(), 4757 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4758 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { 4759 /* 4760 * the user app must have passed in an invalid address 4761 */ 4762 retval = EFAULT; 4763 } 4764 if (upl_size == 0) 4765 retval = EFAULT; 4766 4767 *io_length = upl_size; 4768 4769 if (upl_flags & UPL_PHYS_CONTIG) 4770 *io_type = IO_CONTIG; 4771 else if (iov_len >= min_length) 4772 *io_type = IO_DIRECT; 4773 else 4774 *io_type = IO_COPY; 4775 } else { 4776 /* 4777 * nothing left to do for this uio 4778 */ 4779 *io_length = 0; 4780 *io_type = IO_UNKNOWN; 4781 } 4782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0); 4783 4784 return (retval); 4785} 4786 4787 4788/* 4789 * generate advisory I/O's in the largest chunks possible 4790 * the completed pages will be released into the VM cache 4791 */ 4792int 4793advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid) 4794{ 4795 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE); 4796} 4797 4798int 4799advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag) 4800{ 4801 upl_page_info_t *pl; 4802 upl_t upl; 4803 vm_offset_t upl_offset; 4804 int upl_size; 4805 off_t upl_f_offset; 4806 int start_offset; 4807 int start_pg; 4808 int last_pg; 4809 int pages_in_upl; 4810 off_t max_size; 4811 int io_size; 4812 kern_return_t kret; 4813 int retval = 0; 4814 int issued_io; 4815 int skip_range; 4816 uint32_t max_io_size; 4817 4818 4819 if ( !UBCINFOEXISTS(vp)) 4820 return(EINVAL); 4821 4822 if (resid < 0) 4823 return(EINVAL); 4824 4825 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 4826 4827 if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { 4828 if (max_io_size > speculative_prefetch_max_iosize) 4829 max_io_size = speculative_prefetch_max_iosize; 4830 } 4831 4832 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, 4833 (int)f_offset, resid, (int)filesize, 0, 0); 4834 4835 while (resid && f_offset < filesize && retval == 0) { 4836 /* 4837 * compute the size of the upl needed to encompass 4838 * the requested read... limit each call to cluster_io 4839 * to the maximum UPL size... 
cluster_io will clip if 4840 * this exceeds the maximum io_size for the device, 4841 * make sure to account for 4842 * a starting offset that's not page aligned 4843 */ 4844 start_offset = (int)(f_offset & PAGE_MASK_64); 4845 upl_f_offset = f_offset - (off_t)start_offset; 4846 max_size = filesize - f_offset; 4847 4848 if (resid < max_size) 4849 io_size = resid; 4850 else 4851 io_size = max_size; 4852 4853 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 4854 if ((uint32_t)upl_size > max_io_size) 4855 upl_size = max_io_size; 4856 4857 skip_range = 0; 4858 /* 4859 * return the number of contiguously present pages in the cache 4860 * starting at upl_f_offset within the file 4861 */ 4862 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range); 4863 4864 if (skip_range) { 4865 /* 4866 * skip over pages already present in the cache 4867 */ 4868 io_size = skip_range - start_offset; 4869 4870 f_offset += io_size; 4871 resid -= io_size; 4872 4873 if (skip_range == upl_size) 4874 continue; 4875 /* 4876 * have to issue some real I/O 4877 * at this point, we know it's starting on a page boundary 4878 * because we've skipped over at least the first page in the request 4879 */ 4880 start_offset = 0; 4881 upl_f_offset += skip_range; 4882 upl_size -= skip_range; 4883 } 4884 pages_in_upl = upl_size / PAGE_SIZE; 4885 4886 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START, 4887 upl, (int)upl_f_offset, upl_size, start_offset, 0); 4888 4889 kret = ubc_create_upl(vp, 4890 upl_f_offset, 4891 upl_size, 4892 &upl, 4893 &pl, 4894 UPL_RET_ONLY_ABSENT | UPL_SET_LITE); 4895 if (kret != KERN_SUCCESS) 4896 return(retval); 4897 issued_io = 0; 4898 4899 /* 4900 * before we start marching forward, we must make sure we end on 4901 * a present page, otherwise we will be working with a freed 4902 * upl 4903 */ 4904 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 4905 if (upl_page_present(pl, last_pg)) 4906 break; 4907 } 4908 pages_in_upl = last_pg + 1; 4909 4910 4911 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END, 4912 upl, (int)upl_f_offset, upl_size, start_offset, 0); 4913 4914 4915 for (last_pg = 0; last_pg < pages_in_upl; ) { 4916 /* 4917 * scan from the beginning of the upl looking for the first 4918 * page that is present.... this will become the first page in 4919 * the request we're going to make to 'cluster_io'... 
if all 4920 * of the pages are absent, we won't call through to 'cluster_io' 4921 */ 4922 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 4923 if (upl_page_present(pl, start_pg)) 4924 break; 4925 } 4926 4927 /* 4928 * scan from the starting present page looking for an absent 4929 * page before the end of the upl is reached, if we 4930 * find one, then it will terminate the range of pages being 4931 * presented to 'cluster_io' 4932 */ 4933 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 4934 if (!upl_page_present(pl, last_pg)) 4935 break; 4936 } 4937 4938 if (last_pg > start_pg) { 4939 /* 4940 * we found a range of pages that must be filled 4941 * if the last page in this range is the last page of the file 4942 * we may have to clip the size of it to keep from reading past 4943 * the end of the last physical block associated with the file 4944 */ 4945 upl_offset = start_pg * PAGE_SIZE; 4946 io_size = (last_pg - start_pg) * PAGE_SIZE; 4947 4948 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) 4949 io_size = filesize - (upl_f_offset + upl_offset); 4950 4951 /* 4952 * issue an asynchronous read to cluster_io 4953 */ 4954 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 4955 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 4956 4957 issued_io = 1; 4958 } 4959 } 4960 if (issued_io == 0) 4961 ubc_upl_abort(upl, 0); 4962 4963 io_size = upl_size - start_offset; 4964 4965 if (io_size > resid) 4966 io_size = resid; 4967 f_offset += io_size; 4968 resid -= io_size; 4969 } 4970 4971 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END, 4972 (int)f_offset, resid, retval, 0, 0); 4973 4974 return(retval); 4975} 4976 4977 4978int 4979cluster_push(vnode_t vp, int flags) 4980{ 4981 return cluster_push_ext(vp, flags, NULL, NULL); 4982} 4983 4984 4985int 4986cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg) 4987{ 4988 int retval; 4989 int my_sparse_wait = 0; 4990 struct cl_writebehind *wbp; 4991 4992 if ( !UBCINFOEXISTS(vp)) { 4993 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0); 4994 return (0); 4995 } 4996 /* return if deferred write is set */ 4997 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) { 4998 return (0); 4999 } 5000 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { 5001 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); 5002 return (0); 5003 } 5004 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) { 5005 lck_mtx_unlock(&wbp->cl_lockw); 5006 5007 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); 5008 return(0); 5009 } 5010 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, 5011 wbp->cl_scmap, wbp->cl_number, flags, 0, 0); 5012 5013 /* 5014 * if we have an fsync in progress, we don't want to allow any additional 5015 * sync/fsync/close(s) to occur until it finishes. 
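 * (concretely: a second fsync arriving while cl_sparse_wait is set will
 * msleep on &wbp->cl_sparse_wait in the loop below until the current
 * owner clears the flag and issues the wakeup)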
5016 * note that it's possible for writes to continue to occur to this file 5017 * while we're waiting and also once the fsync starts to clean if we're 5018 * in the sparse map case 5019 */ 5020 while (wbp->cl_sparse_wait) { 5021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0); 5022 5023 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); 5024 5025 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0); 5026 } 5027 if (flags & IO_SYNC) { 5028 my_sparse_wait = 1; 5029 wbp->cl_sparse_wait = 1; 5030 5031 /* 5032 * this is an fsync (or equivalent)... we must wait for any existing async 5033 * cleaning operations to complete before we evaluate the current state 5034 * and finish cleaning... this ensures that all writes issued before this 5035 * fsync actually get cleaned to the disk before this fsync returns 5036 */ 5037 while (wbp->cl_sparse_pushes) { 5038 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0); 5039 5040 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); 5041 5042 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0); 5043 } 5044 } 5045 if (wbp->cl_scmap) { 5046 void *scmap; 5047 5048 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) { 5049 5050 scmap = wbp->cl_scmap; 5051 wbp->cl_scmap = NULL; 5052 5053 wbp->cl_sparse_pushes++; 5054 5055 lck_mtx_unlock(&wbp->cl_lockw); 5056 5057 sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); 5058 5059 lck_mtx_lock(&wbp->cl_lockw); 5060 5061 wbp->cl_sparse_pushes--; 5062 5063 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) 5064 wakeup((caddr_t)&wbp->cl_sparse_pushes); 5065 } else { 5066 sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); 5067 } 5068 retval = 1; 5069 } else { 5070 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); 5071 } 5072 lck_mtx_unlock(&wbp->cl_lockw); 5073 5074 if (flags & IO_SYNC) 5075 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push"); 5076 5077 if (my_sparse_wait) { 5078 /* 5079 * I'm the owner of the serialization token 5080 * clear it and wake up anyone that is waiting 5081 * for me to finish 5082 */ 5083 lck_mtx_lock(&wbp->cl_lockw); 5084 5085 wbp->cl_sparse_wait = 0; 5086 wakeup((caddr_t)&wbp->cl_sparse_wait); 5087 5088 lck_mtx_unlock(&wbp->cl_lockw); 5089 } 5090 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, 5091 wbp->cl_scmap, wbp->cl_number, retval, 0, 0); 5092 5093 return (retval); 5094} 5095 5096 5097__private_extern__ void 5098cluster_release(struct ubc_info *ubc) 5099{ 5100 struct cl_writebehind *wbp; 5101 struct cl_readahead *rap; 5102 5103 if ((wbp = ubc->cl_wbehind)) { 5104 5105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0); 5106 5107 if (wbp->cl_scmap) 5108 vfs_drt_control(&(wbp->cl_scmap), 0); 5109 } else { 5110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0); 5111 } 5112 5113 rap = ubc->cl_rahead; 5114 5115 if (wbp != NULL) { 5116 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); 5117 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); 5118 } 5119 if ((rap = ubc->cl_rahead)) { 5120 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp); 5121 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD); 5122 } 5123 ubc->cl_rahead = NULL; 5124 ubc->cl_wbehind = NULL; 5125 5126 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
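 /*
  * note: rap and wbp are recorded by value only in the trace point
  * above... any structures they referenced were already released
  * via FREE_ZONE
  */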
5127} 5128 5129 5130static int 5131cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) 5132{ 5133 int cl_index; 5134 int cl_index1; 5135 int min_index; 5136 int cl_len; 5137 int cl_pushed = 0; 5138 struct cl_wextent l_clusters[MAX_CLUSTERS]; 5139 u_int max_cluster_pgcount; 5140 5141 5142 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 5143 /* 5144 * the write behind context exists and has 5145 * already been locked... 5146 */ 5147 if (wbp->cl_number == 0) 5148 /* 5149 * no clusters to push 5150 * return number of empty slots 5151 */ 5152 return (MAX_CLUSTERS); 5153 5154 /* 5155 * make a local 'sorted' copy of the clusters 5156 * and clear wbp->cl_number so that new clusters can 5157 * be developed 5158 */ 5159 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 5160 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) { 5161 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) 5162 continue; 5163 if (min_index == -1) 5164 min_index = cl_index1; 5165 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) 5166 min_index = cl_index1; 5167 } 5168 if (min_index == -1) 5169 break; 5170 5171 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr; 5172 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr; 5173 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags; 5174 5175 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr; 5176 } 5177 wbp->cl_number = 0; 5178 5179 cl_len = cl_index; 5180 5181 if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) { 5182 int i; 5183 5184 /* 5185 * determine if we appear to be writing the file sequentially 5186 * if not, by returning without having pushed any clusters 5187 * we will cause this vnode to be pushed into the sparse cluster mechanism 5188 * used for managing more random I/O patterns 5189 * 5190 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them... 5191 * that's why we're in try_push with PUSH_DELAY... 5192 * 5193 * check to make sure that all the clusters except the last one are 'full'... and that each cluster 5194 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above 5195 * so we can just make a simple pass through, up to, but not including the last one... 5196 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they 5197 * are sequential 5198 * 5199 * we let the last one be partial as long as it was adjacent to the previous one... 5200 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out 5201 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world... 5202 */ 5203 for (i = 0; i < MAX_CLUSTERS - 1; i++) { 5204 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) 5205 goto dont_try; 5206 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr) 5207 goto dont_try; 5208 } 5209 } 5210 for (cl_index = 0; cl_index < cl_len; cl_index++) { 5211 int flags; 5212 struct cl_extent cl; 5213 5214 flags = io_flags & (IO_PASSIVE|IO_CLOSE); 5215 5216 /* 5217 * try to push each cluster in turn... 
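 * (each local cluster still carries the io_flags it was developed with,
 * so a cluster built up by non-cached or passive writers is pushed with
 * IO_NOCACHE/IO_PASSIVE below even when the current push request carries
 * neither flag)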
5218 */ 5219 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) 5220 flags |= IO_NOCACHE; 5221 5222 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) 5223 flags |= IO_PASSIVE; 5224 5225 if (push_flag & PUSH_SYNC) 5226 flags |= IO_SYNC; 5227 5228 cl.b_addr = l_clusters[cl_index].b_addr; 5229 cl.e_addr = l_clusters[cl_index].e_addr; 5230 5231 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); 5232 5233 l_clusters[cl_index].b_addr = 0; 5234 l_clusters[cl_index].e_addr = 0; 5235 5236 cl_pushed++; 5237 5238 if ( !(push_flag & PUSH_ALL) ) 5239 break; 5240 } 5241dont_try: 5242 if (cl_len > cl_pushed) { 5243 /* 5244 * we didn't push all of the clusters, so 5245 * lets try to merge them back in to the vnode 5246 */ 5247 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) { 5248 /* 5249 * we picked up some new clusters while we were trying to 5250 * push the old ones... this can happen because I've dropped 5251 * the vnode lock... the sum of the 5252 * leftovers plus the new cluster count exceeds our ability 5253 * to represent them, so switch to the sparse cluster mechanism 5254 * 5255 * collect the active public clusters... 5256 */ 5257 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 5258 5259 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) { 5260 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 5261 continue; 5262 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 5263 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 5264 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 5265 5266 cl_index1++; 5267 } 5268 /* 5269 * update the cluster count 5270 */ 5271 wbp->cl_number = cl_index1; 5272 5273 /* 5274 * and collect the original clusters that were moved into the 5275 * local storage for sorting purposes 5276 */ 5277 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 5278 5279 } else { 5280 /* 5281 * we've got room to merge the leftovers back in 5282 * just append them starting at the next 'hole' 5283 * represented by wbp->cl_number 5284 */ 5285 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) { 5286 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 5287 continue; 5288 5289 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 5290 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 5291 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 5292 5293 cl_index1++; 5294 } 5295 /* 5296 * update the cluster count 5297 */ 5298 wbp->cl_number = cl_index1; 5299 } 5300 } 5301 return (MAX_CLUSTERS - wbp->cl_number); 5302} 5303 5304 5305 5306static int 5307cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5308{ 5309 upl_page_info_t *pl; 5310 upl_t upl; 5311 vm_offset_t upl_offset; 5312 int upl_size; 5313 off_t upl_f_offset; 5314 int pages_in_upl; 5315 int start_pg; 5316 int last_pg; 5317 int io_size; 5318 int io_flags; 5319 int upl_flags; 5320 int bflag; 5321 int size; 5322 int error = 0; 5323 int retval; 5324 kern_return_t kret; 5325 5326 if (flags & IO_PASSIVE) 5327 bflag = CL_PASSIVE; 5328 else 5329 bflag = 0; 5330 5331 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, 5332 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); 5333 5334 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) { 5335 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0); 5336 5337 return (0); 5338 } 5339 upl_size = 
pages_in_upl * PAGE_SIZE; 5340 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 5341 5342 if (upl_f_offset + upl_size >= EOF) { 5343 5344 if (upl_f_offset >= EOF) { 5345 /* 5346 * must have truncated the file and missed 5347 * clearing a dangling cluster (i.e. it's completely 5348 * beyond the new EOF 5349 */ 5350 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0); 5351 5352 return(0); 5353 } 5354 size = EOF - upl_f_offset; 5355 5356 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 5357 pages_in_upl = upl_size / PAGE_SIZE; 5358 } else 5359 size = upl_size; 5360 5361 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0); 5362 5363 /* 5364 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior 5365 * 5366 * - only pages that are currently dirty are returned... these are the ones we need to clean 5367 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set 5368 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page 5369 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if 5370 * someone dirties this page while the I/O is in progress, we don't lose track of the new state 5371 * 5372 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard) 5373 */ 5374 5375 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) 5376 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED; 5377 else 5378 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE; 5379 5380 kret = ubc_create_upl(vp, 5381 upl_f_offset, 5382 upl_size, 5383 &upl, 5384 &pl, 5385 upl_flags); 5386 if (kret != KERN_SUCCESS) 5387 panic("cluster_push: failed to get pagelist"); 5388 5389 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0); 5390 5391 /* 5392 * since we only asked for the dirty pages back 5393 * it's possible that we may only get a few or even none, so... 5394 * before we start marching forward, we must make sure we know 5395 * where the last present page is in the UPL, otherwise we could 5396 * end up working with a freed upl due to the FREE_ON_EMPTY semantics 5397 * employed by commit_range and abort_range. 5398 */ 5399 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 5400 if (upl_page_present(pl, last_pg)) 5401 break; 5402 } 5403 pages_in_upl = last_pg + 1; 5404 5405 if (pages_in_upl == 0) { 5406 ubc_upl_abort(upl, 0); 5407 5408 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0); 5409 return(0); 5410 } 5411 5412 for (last_pg = 0; last_pg < pages_in_upl; ) { 5413 /* 5414 * find the next dirty page in the UPL 5415 * this will become the first page in the 5416 * next I/O to generate 5417 */ 5418 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 5419 if (upl_dirty_page(pl, start_pg)) 5420 break; 5421 if (upl_page_present(pl, start_pg)) 5422 /* 5423 * RET_ONLY_DIRTY will return non-dirty 'precious' pages 5424 * just release these unchanged since we're not going 5425 * to steal them or change their state 5426 */ 5427 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); 5428 } 5429 if (start_pg >= pages_in_upl) 5430 /* 5431 * done... 
no more dirty pages to push 5432 */ 5433 break; 5434 if (start_pg > last_pg) 5435 /* 5436 * skipped over some non-dirty pages 5437 */ 5438 size -= ((start_pg - last_pg) * PAGE_SIZE); 5439 5440 /* 5441 * find a range of dirty pages to write 5442 */ 5443 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 5444 if (!upl_dirty_page(pl, last_pg)) 5445 break; 5446 } 5447 upl_offset = start_pg * PAGE_SIZE; 5448 5449 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE); 5450 5451 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag; 5452 5453 if ( !(flags & IO_SYNC)) 5454 io_flags |= CL_ASYNC; 5455 5456 if (flags & IO_CLOSE) 5457 io_flags |= CL_CLOSE; 5458 5459 if (flags & IO_NOCACHE) 5460 io_flags |= CL_NOCACHE; 5461 5462 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 5463 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5464 5465 if (error == 0 && retval) 5466 error = retval; 5467 5468 size -= io_size; 5469 } 5470 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0); 5471 5472 return(error); 5473} 5474 5475 5476/* 5477 * sparse_cluster_switch is called with the write behind lock held 5478 */ 5479static void 5480sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 5481{ 5482 int cl_index; 5483 5484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0); 5485 5486 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 5487 int flags; 5488 struct cl_extent cl; 5489 5490 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) { 5491 5492 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) { 5493 if (flags & UPL_POP_DIRTY) { 5494 cl.e_addr = cl.b_addr + 1; 5495 5496 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg); 5497 } 5498 } 5499 } 5500 } 5501 wbp->cl_number = 0; 5502 5503 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0); 5504} 5505 5506 5507/* 5508 * sparse_cluster_push must be called with the write-behind lock held if the scmap is 5509 * still associated with the write-behind context... 
however, if the scmap has been disassociated 5510 * from the write-behind context (the cluster_push case), the wb lock is not held 5511 */ 5512static void 5513sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) 5514{ 5515 struct cl_extent cl; 5516 off_t offset; 5517 u_int length; 5518 5519 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0); 5520 5521 if (push_flag & PUSH_ALL) 5522 vfs_drt_control(scmap, 1); 5523 5524 for (;;) { 5525 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) 5526 break; 5527 5528 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); 5529 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); 5530 5531 cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); 5532 5533 if ( !(push_flag & PUSH_ALL) ) 5534 break; 5535 } 5536 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); 5537} 5538 5539 5540/* 5541 * sparse_cluster_add is called with the write behind lock held 5542 */ 5543static void 5544sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 5545{ 5546 u_int new_dirty; 5547 u_int length; 5548 off_t offset; 5549 5550 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0); 5551 5552 offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 5553 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE; 5554 5555 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) { 5556 /* 5557 * no room left in the map 5558 * only a partial update was done 5559 * push out some pages and try again 5560 */ 5561 sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); 5562 5563 offset += (new_dirty * PAGE_SIZE_64); 5564 length -= (new_dirty * PAGE_SIZE); 5565 } 5566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); 5567} 5568 5569 5570static int 5571cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5572{ 5573 upl_page_info_t *pl; 5574 upl_t upl; 5575 addr64_t ubc_paddr; 5576 kern_return_t kret; 5577 int error = 0; 5578 int did_read = 0; 5579 int abort_flags; 5580 int upl_flags; 5581 int bflag; 5582 5583 if (flags & IO_PASSIVE) 5584 bflag = CL_PASSIVE; 5585 else 5586 bflag = 0; 5587 5588 if (flags & IO_NOCACHE) 5589 bflag |= CL_NOCACHE; 5590 5591 upl_flags = UPL_SET_LITE; 5592 5593 if ( !(flags & CL_READ) ) { 5594 /* 5595 * "write" operation: let the UPL subsystem know 5596 * that we intend to modify the buffer cache pages 5597 * we're gathering. 5598 */ 5599 upl_flags |= UPL_WILL_MODIFY; 5600 } else { 5601 /* 5602 * indicate that there is no need to pull the 5603 * mapping for this page... we're only going 5604 * to read from it, not modify it. 
5605 */ 5606 upl_flags |= UPL_FILE_IO; 5607 } 5608 kret = ubc_create_upl(vp, 5609 uio->uio_offset & ~PAGE_MASK_64, 5610 PAGE_SIZE, 5611 &upl, 5612 &pl, 5613 upl_flags); 5614 5615 if (kret != KERN_SUCCESS) 5616 return(EINVAL); 5617 5618 if (!upl_valid_page(pl, 0)) { 5619 /* 5620 * issue a synchronous read to cluster_io 5621 */ 5622 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5623 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5624 if (error) { 5625 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 5626 5627 return(error); 5628 } 5629 did_read = 1; 5630 } 5631 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); 5632 5633/* 5634 * NOTE: There is no prototype for the following in BSD. It, and the definitions 5635 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in 5636 * osfmk/ppc/mappings.h. They are not included here because there appears to be no 5637 * way to do so without exporting them to kexts as well. 5638 */ 5639 if (flags & CL_READ) 5640// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */ 5641 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */ 5642 else 5643// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */ 5644 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */ 5645 5646 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) { 5647 /* 5648 * issue a synchronous write to cluster_io 5649 */ 5650 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5651 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5652 } 5653 if (error == 0) 5654 uio_update(uio, (user_size_t)xsize); 5655 5656 if (did_read) 5657 abort_flags = UPL_ABORT_FREE_ON_EMPTY; 5658 else 5659 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; 5660 5661 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags); 5662 5663 return (error); 5664} 5665 5666 5667 5668int 5669cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) 5670{ 5671 int pg_offset; 5672 int pg_index; 5673 int csize; 5674 int segflg; 5675 int retval = 0; 5676 int xsize; 5677 upl_page_info_t *pl; 5678 5679 xsize = *io_resid; 5680 5681 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5682 (int)uio->uio_offset, upl_offset, xsize, 0, 0); 5683 5684 segflg = uio->uio_segflg; 5685 5686 switch(segflg) { 5687 5688 case UIO_USERSPACE32: 5689 case UIO_USERISPACE32: 5690 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5691 break; 5692 5693 case UIO_USERSPACE: 5694 case UIO_USERISPACE: 5695 uio->uio_segflg = UIO_PHYS_USERSPACE; 5696 break; 5697 5698 case UIO_USERSPACE64: 5699 case UIO_USERISPACE64: 5700 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5701 break; 5702 5703 case UIO_SYSSPACE: 5704 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5705 break; 5706 5707 } 5708 pl = ubc_upl_pageinfo(upl); 5709 5710 pg_index = upl_offset / PAGE_SIZE; 5711 pg_offset = upl_offset & PAGE_MASK; 5712 csize = min(PAGE_SIZE - pg_offset, xsize); 5713 5714 while (xsize && retval == 0) { 5715 addr64_t paddr; 5716 5717 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset; 5718 5719 retval = uiomove64(paddr, csize, uio); 5720 5721 pg_index += 1; 5722 pg_offset = 0; 5723 xsize -= csize; 
5724 csize = min(PAGE_SIZE, xsize); 5725 } 5726 *io_resid = xsize; 5727 5728 uio->uio_segflg = segflg; 5729 5730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5731 (int)uio->uio_offset, xsize, retval, segflg, 0); 5732 5733 return (retval); 5734} 5735 5736 5737int 5738cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty) 5739{ 5740 5741 return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1)); 5742} 5743 5744 5745static int 5746cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference) 5747{ 5748 int segflg; 5749 int io_size; 5750 int xsize; 5751 int start_offset; 5752 int retval = 0; 5753 memory_object_control_t control; 5754 5755 io_size = *io_resid; 5756 5757 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5758 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); 5759 5760 control = ubc_getobject(vp, UBC_FLAGS_NONE); 5761 5762 if (control == MEMORY_OBJECT_CONTROL_NULL) { 5763 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5764 (int)uio->uio_offset, io_size, retval, 3, 0); 5765 5766 return(0); 5767 } 5768 segflg = uio->uio_segflg; 5769 5770 switch(segflg) { 5771 5772 case UIO_USERSPACE32: 5773 case UIO_USERISPACE32: 5774 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5775 break; 5776 5777 case UIO_USERSPACE64: 5778 case UIO_USERISPACE64: 5779 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5780 break; 5781 5782 case UIO_USERSPACE: 5783 case UIO_USERISPACE: 5784 uio->uio_segflg = UIO_PHYS_USERSPACE; 5785 break; 5786 5787 case UIO_SYSSPACE: 5788 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5789 break; 5790 } 5791 5792 if ( (io_size = *io_resid) ) { 5793 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 5794 xsize = uio_resid(uio); 5795 5796 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio, 5797 start_offset, io_size, mark_dirty, take_reference); 5798 xsize -= uio_resid(uio); 5799 io_size -= xsize; 5800 } 5801 uio->uio_segflg = segflg; 5802 *io_resid = io_size; 5803 5804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5805 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0); 5806 5807 return(retval); 5808} 5809 5810 5811int 5812is_file_clean(vnode_t vp, off_t filesize) 5813{ 5814 off_t f_offset; 5815 int flags; 5816 int total_dirty = 0; 5817 5818 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) { 5819 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) { 5820 if (flags & UPL_POP_DIRTY) { 5821 total_dirty++; 5822 } 5823 } 5824 } 5825 if (total_dirty) 5826 return(EINVAL); 5827 5828 return (0); 5829} 5830 5831 5832 5833/* 5834 * Dirty region tracking/clustering mechanism. 5835 * 5836 * This code (vfs_drt_*) provides a mechanism for tracking and clustering 5837 * dirty regions within a larger space (file). It is primarily intended to 5838 * support clustering in large files with many dirty areas. 5839 * 5840 * The implementation assumes that the dirty regions are pages. 5841 * 5842 * To represent dirty pages within the file, we store bit vectors in a 5843 * variable-size circular hash. 5844 */ 5845 5846/* 5847 * Bitvector size. This determines the number of pages we group in a 5848 * single hashtable entry. Each hashtable entry is aligned to this 5849 * size within the file. 5850 */ 5851#define DRT_BITVECTOR_PAGES 256 5852 5853/* 5854 * File offset handling. 
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)								\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control =						\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)								\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control =						\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)									\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control = 0;					\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)								\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);						\
	} while (0)


/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED	(1024LL * 1024LL * 1024LL)	/* 1GiB */

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */

/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
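/*
 * The decomposition below is illustrative only: a disabled sketch (the
 * helper name is invented for illustration) showing how a file offset
 * splits into the aligned entry address and the page slot within that
 * entry's bitvector, assuming PAGE_SIZE == 4096 so that
 * DRT_BITVECTOR_PAGES * PAGE_SIZE == (1 << 20), i.e. 1 MiB per entry.
 */
#if 0
static void
vfs_drt_example_decompose(u_int64_t offset)
{
	u_int64_t	entry_base;	/* address as stored in dhe_control */
	int		bit;		/* slot in the entry's bitvector */

	entry_base = DRT_ALIGN_ADDRESS(offset);		/* offset & ~((1 << 20) - 1) */
	bit = (int)((offset - entry_base) / PAGE_SIZE);	/* 0..DRT_BITVECTOR_PAGES-1 */

	/*
	 * e.g. offset 0x123456: entry_base == 0x100000 and bit == 35 (0x23).
	 * The entry itself lands in bucket DRT_HASH(cmap, entry_base), i.e.
	 * entry_base % scm_modulus, with collisions probed linearly via
	 * DRT_HASH_NEXT().
	 */
}
#endif

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.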
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))


/*
 * Hashtable entry.
 */
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};

/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement; the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)

/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
							    * dirty */
							   /* 0, setcount */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */


static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	u_int64_t offset,
	int *indexp,
	int recursed);
static kern_return_t	vfs_drt_do_mark_pages(
	void		**cmapp,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty);
static void		vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);
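/*
 * The sketch below is illustrative only (disabled; the helper name is
 * invented): it unpacks the dhe_control word, which keeps the aligned
 * file address in its high bits and the count of set bits in its low
 * bits, with the all-ones count DRT_HASH_COUNT_MASK serving as the
 * "vacant" sentinel.
 */
#if 0
static void
vfs_drt_example_unpack(struct vfs_drt_clustermap *cmap, int i)
{
	u_int64_t	addr;
	u_int32_t	count;

	addr  = DRT_HASH_GET_ADDRESS(cmap, i);	/* dhe_control & ~((1 << 20) - 1) */
	count = DRT_HASH_GET_COUNT(cmap, i);	/* dhe_control & 0x1ff */

	if (DRT_HASH_VACANT(cmap, i)) {
		/* count == DRT_HASH_COUNT_MASK... never occupied, probes stop here */
		return;
	}
	/* otherwise 'count' pages are currently marked dirty in the region at 'addr' */
	(void)addr;
	(void)count;
}
#endif

/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.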
6032 */ 6033static kern_return_t 6034vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) 6035{ 6036 struct vfs_drt_clustermap *cmap, *ocmap; 6037 kern_return_t kret; 6038 u_int64_t offset; 6039 u_int32_t i; 6040 int nsize, active_buckets, index, copycount; 6041 6042 ocmap = NULL; 6043 if (cmapp != NULL) 6044 ocmap = *cmapp; 6045 6046 /* 6047 * Decide on the size of the new map. 6048 */ 6049 if (ocmap == NULL) { 6050 nsize = DRT_HASH_SMALL_MODULUS; 6051 } else { 6052 /* count the number of active buckets in the old map */ 6053 active_buckets = 0; 6054 for (i = 0; i < ocmap->scm_modulus; i++) { 6055 if (!DRT_HASH_VACANT(ocmap, i) && 6056 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) 6057 active_buckets++; 6058 } 6059 /* 6060 * If we're currently using the small allocation, check to 6061 * see whether we should grow to the large one. 6062 */ 6063 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { 6064 /* 6065 * If the ring is nearly full and we are allowed to 6066 * use the large modulus, upgrade. 6067 */ 6068 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) && 6069 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) { 6070 nsize = DRT_HASH_LARGE_MODULUS; 6071 } else { 6072 nsize = DRT_HASH_SMALL_MODULUS; 6073 } 6074 } else { 6075 /* already using the large modulus */ 6076 nsize = DRT_HASH_LARGE_MODULUS; 6077 /* 6078 * If the ring is completely full, there's 6079 * nothing useful for us to do. Behave as 6080 * though we had compacted into the new 6081 * array and return. 6082 */ 6083 if (active_buckets >= DRT_HASH_LARGE_MODULUS) 6084 return(KERN_SUCCESS); 6085 } 6086 } 6087 6088 /* 6089 * Allocate and initialise the new map. 6090 */ 6091 6092 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, 6093 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 6094 if (kret != KERN_SUCCESS) 6095 return(kret); 6096 cmap->scm_magic = DRT_SCM_MAGIC; 6097 cmap->scm_modulus = nsize; 6098 cmap->scm_buckets = 0; 6099 cmap->scm_lastclean = 0; 6100 cmap->scm_iskips = 0; 6101 for (i = 0; i < cmap->scm_modulus; i++) { 6102 DRT_HASH_CLEAR(cmap, i); 6103 DRT_HASH_VACATE(cmap, i); 6104 DRT_BITVECTOR_CLEAR(cmap, i); 6105 } 6106 6107 /* 6108 * If there's an old map, re-hash entries from it into the new map. 6109 */ 6110 copycount = 0; 6111 if (ocmap != NULL) { 6112 for (i = 0; i < ocmap->scm_modulus; i++) { 6113 /* skip empty buckets */ 6114 if (DRT_HASH_VACANT(ocmap, i) || 6115 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) 6116 continue; 6117 /* get new index */ 6118 offset = DRT_HASH_GET_ADDRESS(ocmap, i); 6119 kret = vfs_drt_get_index(&cmap, offset, &index, 1); 6120 if (kret != KERN_SUCCESS) { 6121 /* XXX need to bail out gracefully here */ 6122 panic("vfs_drt: new cluster map mysteriously too small"); 6123 index = 0; 6124 } 6125 /* copy */ 6126 DRT_HASH_COPY(ocmap, i, cmap, index); 6127 copycount++; 6128 } 6129 } 6130 6131 /* log what we've done */ 6132 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0); 6133 6134 /* 6135 * It's important to ensure that *cmapp always points to 6136 * a valid map, so we must overwrite it before freeing 6137 * the old map. 6138 */ 6139 *cmapp = cmap; 6140 if (ocmap != NULL) { 6141 /* emit stats into trace buffer */ 6142 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA, 6143 ocmap->scm_modulus, 6144 ocmap->scm_buckets, 6145 ocmap->scm_lastclean, 6146 ocmap->scm_iskips); 6147 6148 vfs_drt_free_map(ocmap); 6149 } 6150 return(KERN_SUCCESS); 6151} 6152 6153 6154/* 6155 * Free a sparse cluster map. 
6156 */ 6157static kern_return_t 6158vfs_drt_free_map(struct vfs_drt_clustermap *cmap) 6159{ 6160 kmem_free(kernel_map, (vm_offset_t)cmap, 6161 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 6162 return(KERN_SUCCESS); 6163} 6164 6165 6166/* 6167 * Find the hashtable slot currently occupied by an entry for the supplied offset. 6168 */ 6169static kern_return_t 6170vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp) 6171{ 6172 int index; 6173 u_int32_t i; 6174 6175 offset = DRT_ALIGN_ADDRESS(offset); 6176 index = DRT_HASH(cmap, offset); 6177 6178 /* traverse the hashtable */ 6179 for (i = 0; i < cmap->scm_modulus; i++) { 6180 6181 /* 6182 * If the slot is vacant, we can stop. 6183 */ 6184 if (DRT_HASH_VACANT(cmap, index)) 6185 break; 6186 6187 /* 6188 * If the address matches our offset, we have success. 6189 */ 6190 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) { 6191 *indexp = index; 6192 return(KERN_SUCCESS); 6193 } 6194 6195 /* 6196 * Move to the next slot, try again. 6197 */ 6198 index = DRT_HASH_NEXT(cmap, index); 6199 } 6200 /* 6201 * It's not there. 6202 */ 6203 return(KERN_FAILURE); 6204} 6205 6206/* 6207 * Find the hashtable slot for the supplied offset. If we haven't allocated 6208 * one yet, allocate one and populate the address field. Note that it will 6209 * not have a nonzero page count and thus will still technically be free, so 6210 * in the case where we are called to clean pages, the slot will remain free. 6211 */ 6212static kern_return_t 6213vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed) 6214{ 6215 struct vfs_drt_clustermap *cmap; 6216 kern_return_t kret; 6217 u_int32_t index; 6218 u_int32_t i; 6219 6220 cmap = *cmapp; 6221 6222 /* look for an existing entry */ 6223 kret = vfs_drt_search_index(cmap, offset, indexp); 6224 if (kret == KERN_SUCCESS) 6225 return(kret); 6226 6227 /* need to allocate an entry */ 6228 offset = DRT_ALIGN_ADDRESS(offset); 6229 index = DRT_HASH(cmap, offset); 6230 6231 /* scan from the index forwards looking for a vacant slot */ 6232 for (i = 0; i < cmap->scm_modulus; i++) { 6233 /* slot vacant? */ 6234 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) { 6235 cmap->scm_buckets++; 6236 if (index < cmap->scm_lastclean) 6237 cmap->scm_lastclean = index; 6238 DRT_HASH_SET_ADDRESS(cmap, index, offset); 6239 DRT_HASH_SET_COUNT(cmap, index, 0); 6240 DRT_BITVECTOR_CLEAR(cmap, index); 6241 *indexp = index; 6242 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0); 6243 return(KERN_SUCCESS); 6244 } 6245 cmap->scm_iskips += i; 6246 index = DRT_HASH_NEXT(cmap, index); 6247 } 6248 6249 /* 6250 * We haven't found a vacant slot, so the map is full. If we're not 6251 * already recursed, try reallocating/compacting it. 6252 */ 6253 if (recursed) 6254 return(KERN_FAILURE); 6255 kret = vfs_drt_alloc_map(cmapp); 6256 if (kret == KERN_SUCCESS) { 6257 /* now try to insert again */ 6258 kret = vfs_drt_get_index(cmapp, offset, indexp, 1); 6259 } 6260 return(kret); 6261} 6262 6263/* 6264 * Implementation of set dirty/clean. 6265 * 6266 * In the 'clean' case, not finding a map is OK. 
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirtying or clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
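/*
 * Worked example of the splitting arithmetic above (illustrative only,
 * assuming PAGE_SIZE == 4096): marking offset 0xFF000 for length 0x3000
 * first lands in the entry based at 0 with pgoff == 255, so pgcount ==
 * min(3, 256 - 255) == 1; the loop then advances to offset 0x100000 with
 * 0x2000 remaining, which falls in the next entry with pgoff == 0 and
 * pgcount == 2.
 */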
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

#if 0
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
#endif

/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by vfs_drt_mark_pages.  Note that this
 *	must be NULL or a value set by vfs_drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	u_int32_t	j;
	int		index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything...
hashtable is empty 6476 * emit stats into trace buffer and 6477 * then free it 6478 */ 6479 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, 6480 cmap->scm_modulus, 6481 cmap->scm_buckets, 6482 cmap->scm_lastclean, 6483 cmap->scm_iskips); 6484 6485 vfs_drt_free_map(cmap); 6486 *cmapp = NULL; 6487 6488 return(KERN_FAILURE); 6489} 6490 6491 6492static kern_return_t 6493vfs_drt_control(void **cmapp, int op_type) 6494{ 6495 struct vfs_drt_clustermap *cmap; 6496 6497 /* sanity */ 6498 if ((cmapp == NULL) || (*cmapp == NULL)) 6499 return(KERN_FAILURE); 6500 cmap = *cmapp; 6501 6502 switch (op_type) { 6503 case 0: 6504 /* emit stats into trace buffer */ 6505 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, 6506 cmap->scm_modulus, 6507 cmap->scm_buckets, 6508 cmap->scm_lastclean, 6509 cmap->scm_iskips); 6510 6511 vfs_drt_free_map(cmap); 6512 *cmapp = NULL; 6513 break; 6514 6515 case 1: 6516 cmap->scm_lastclean = 0; 6517 break; 6518 } 6519 return(KERN_SUCCESS); 6520} 6521 6522 6523 6524/* 6525 * Emit a summary of the state of the clustermap into the trace buffer 6526 * along with some caller-provided data. 6527 */ 6528#if KDEBUG 6529static void 6530vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4) 6531{ 6532 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0); 6533} 6534#else 6535static void 6536vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code, 6537 __unused int arg1, __unused int arg2, __unused int arg3, 6538 __unused int arg4) 6539{ 6540} 6541#endif 6542 6543#if 0 6544/* 6545 * Perform basic sanity check on the hash entry summary count 6546 * vs. the actual bits set in the entry. 6547 */ 6548static void 6549vfs_drt_sanity(struct vfs_drt_clustermap *cmap) 6550{ 6551 int index, i; 6552 int bits_on; 6553 6554 for (index = 0; index < cmap->scm_modulus; index++) { 6555 if (DRT_HASH_VACANT(cmap, index)) 6556 continue; 6557 6558 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) { 6559 if (DRT_HASH_TEST_BIT(cmap, index, i)) 6560 bits_on++; 6561 } 6562 if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) 6563 panic("bits_on = %d, index = %d\n", bits_on, index); 6564 } 6565} 6566#endif 6567
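/*
 * Illustrative sketch only (disabled; the function name is invented): a
 * minimal end-to-end use of the vfs_drt_* interfaces above, mirroring
 * what sparse_cluster_add() and sparse_cluster_push() do with the map.
 */
#if 0
static void
vfs_drt_example_usage(void)
{
	void	*scmap = NULL;	/* managed entirely by the vfs_drt_* calls */
	off_t	offset;
	u_int	length;
	u_int	new_dirty;

	/* record two page-aligned dirty pages starting at offset 0 */
	vfs_drt_mark_pages(&scmap, 0, 2 * PAGE_SIZE, &new_dirty);

	/* drain the dirty clusters; the map frees itself once it is empty */
	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* a real caller would clean the pages here, e.g. via cluster_push_now() */
	}
	/* scmap is NULL again at this point */
}
#endif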