/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>
#include <kern/task.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <sys/sdt.h>

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif


#define CL_READ		0x01
#define CL_WRITE	0x02
#define CL_ASYNC	0x04
#define CL_COMMIT	0x08
#define CL_PAGEOUT	0x10
#define CL_AGE		0x20
#define CL_NOZERO	0x40
#define CL_PAGEIN	0x80
#define CL_DEV_MEMORY	0x100
#define CL_PRESERVE	0x200
#define CL_THROTTLE	0x400
#define CL_KEEPCACHED	0x800
#define CL_DIRECT_IO	0x1000
#define CL_PASSIVE	0x2000
#define CL_IOSTREAMING	0x4000
#define CL_CLOSE	0x8000
#define	CL_ENCRYPTED	0x10000
#define CL_RAW_ENCRYPTED	0x20000
#define CL_NOCACHE	0x40000

#define MAX_VECTOR_UPL_ELEMENTS	8
#define MAX_VECTOR_UPL_SIZE	(2 * MAX_UPL_SIZE) * PAGE_SIZE

extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);

struct clios {
	lck_mtx_t io_mtxp;
	u_int  io_completed;       /* amount of io that has currently completed */
	u_int  io_issued;          /* amount of io that was successfully issued */
	int    io_error;           /* error code of first error encountered */
	int    io_wanted;          /* someone is sleeping waiting for a change in state */
};

static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_transaction_mtxp;


#define	IO_UNKNOWN	0
#define	IO_DIRECT	1
#define IO_CONTIG	2
#define IO_COPY		3

#define	PUSH_DELAY	0x01
#define PUSH_ALL	0x02
#define PUSH_SYNC	0x04


static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags);
static int cluster_hard_throttle_on(vnode_t vp, uint32_t);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);
static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
			     int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
			      off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
				int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
				int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void	cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int	cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg);

static void	sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);


/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;


/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	(MAX_UPL_SIZE * PAGE_SIZE)
#define MAX_VECTS		16
#define MIN_DIRECT_WRITE_SIZE	(4 * PAGE_SIZE)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#if CONFIG_EMBEDDED
#define PREFETCH		1
#define PREFETCH_SSD		1
uint32_t speculative_prefetch_max = 512;			/* maximum number of pages to use for a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use for a speculative read-ahead */
#else
#define PREFETCH		3
#define PREFETCH_SSD		1
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3);
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use for a speculative read-ahead on SSDs */
#endif


#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))

int	ignore_is_ssd = 0;
int	speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT	0
#define HARD_THROTTLE_MAX_IOSIZE (128 * 1024)
#define LEGACY_HARD_THROTTLE_MAX_IOSIZE (512 * 1024)

extern int32_t throttle_legacy_process_count;
int hard_throttle_on_root = 0;
uint32_t hard_throttle_max_iosize = HARD_THROTTLE_MAX_IOSIZE;
uint32_t legacy_hard_throttle_max_iosize = LEGACY_HARD_THROTTLE_MAX_IOSIZE;
struct timeval priority_IO_timestamp_for_root;
#if CONFIG_EMBEDDED
#define THROTTLE_MAX_IOSIZE (hard_throttle_max_iosize)
#else
#define THROTTLE_MAX_IOSIZE (throttle_legacy_process_count == 0 ? hard_throttle_max_iosize : legacy_hard_throttle_max_iosize)
#endif


SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &hard_throttle_max_iosize, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_hard_throttle_max_iosize, 0, "");


void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");
}


uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch(type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > MAX_UPL_SIZE) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
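
/*
 * Hypothetical usage sketch (not part of this file): a filesystem that
 * wants to size its own transfers to match the cluster layer could do
 * something like
 *
 *	uint32_t max_xfer = cluster_max_io_size(vp->v_mount, CL_READ);
 *	xfer_size = min(xfer_size, max_xfer);
 *
 * The returned value is page-aligned and never smaller than the legacy
 * MAX_UPL_TRANSFER * PAGE_SIZE limit.
 */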



#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */

static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info *ubc;
	struct cl_writebehind *wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}


static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}


static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t	io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
			return(0);

		if (io_size == 0)
			return (0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return(1);
	}
	return(0);
}

static int
cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle)
{
	int throttle_type = 0;

	if ( (throttle_type = throttle_io_will_be_throttled(-1, vp->v_mount)) )
		return(throttle_type);

	if (hard_throttle && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
		static struct timeval hard_throttle_maxelapsed = { 0, 100000 };
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}


static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{

	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}


static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in  = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && (error != ENXIO))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}

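/*
 * cluster_iodone is the biodone callback for each buf in a cluster
 * transaction.  A transaction is a chain of buf_t's linked through
 * b_trans_next with the last one marked B_EOT; the chain is only
 * processed (UPL commit/abort, real_bp completion, iostate accounting)
 * once every member has B_TDONE set.
 */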
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	struct	clios *iostate;
	boolean_t	transaction_complete = FALSE;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		boolean_t	need_wakeup = FALSE;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		if (bp->b_flags & B_TWANTED) {
			CLR(bp->b_flags, B_TWANTED);
			need_wakeup = TRUE;
		}
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				if (need_wakeup == TRUE)
					wakeup(bp);

				return 0;
			}
			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (need_wakeup == TRUE)
			wakeup(bp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp        = cbp_head;
	upl_offset = cbp->b_uploffset;
	upl        = cbp->b_upl;
	b_flags    = cbp->b_flags;
	real_bp    = cbp->b_real_bp;
	zero_offset= cbp->b_validend;
	iostate    = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int	(*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}

	if (b_flags & B_COMMIT_UPL) {

		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}


uint32_t
cluster_hard_throttle_limit(vnode_t vp, uint32_t *limit, uint32_t hard_throttle)
{
	if (cluster_hard_throttle_on(vp, hard_throttle)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}


void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t *pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}


static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}

static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * async callback completion will not normally
		 * generate a wakeup upon I/O completion...
		 * by setting B_TWANTED, we will force a wakeup
		 * to occur as any outstanding I/Os complete...
		 * I/Os already completed will have B_TDONE already
		 * set and we won't cause us to block
		 * note that we're actually waiting for the bp to have
		 * completed the callback function... only then
		 * can we safely take back ownership of the bp
		 */
		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			cbp->b_flags |= B_TWANTED;

		lck_mtx_unlock(cl_transaction_mtxp);
	}
	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {

		if (async) {
			while (!ISSET(cbp->b_flags, B_TDONE)) {

				lck_mtx_lock_spin(cl_transaction_mtxp);

				if (!ISSET(cbp->b_flags, B_TDONE)) {
					DTRACE_IO1(wait__start, buf_t, cbp);
					(void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL);
					DTRACE_IO1(wait__done, buf_t, cbp);
				} else
					lck_mtx_unlock(cl_transaction_mtxp);
			}
		} else
			buf_biowait(cbp);
	}
}

static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;

	error = cluster_iodone(*cbp_head, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}

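/*
 * cluster_io is the common I/O engine for the cluster layer: it carves
 * the [upl_offset, upl_offset + non_rounded_size) range into device-sized
 * chunks via VNOP_BLOCKMAP, builds buf_t transactions, and issues them
 * through VNOP_STRATEGY.  The CL_* flags select read vs. write, paging
 * vs. regular I/O, throttling, and UPL commit behavior.
 */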
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	int	max_trans_count;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;
	vm_offset_t	upl_end_offset;
	boolean_t	need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE)
		panic("%s(): Called with real buffer of size %d bytes which "
		      "is greater than the maximum allowed size of "
		      "%d bytes (the system PAGE_SIZE).\n",
		      __FUNCTION__, non_rounded_size, PAGE_SIZE);

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY)
		max_trans_count = 16;

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize)
		max_iosize = PAGE_SIZE;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp, 1)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE)
				max_iosize = THROTTLE_MAX_IOSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else {
			if ( (flags & CL_DEV_MEMORY) )
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				max_cluster_size = MAX_CLUSTER_SIZE(vp);

				if (max_iosize > max_cluster_size)
					max_cluster = max_cluster_size;
				else
					max_cluster = max_iosize;

				if (size < max_cluster)
					max_cluster = size;

				if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
					scale = WRITE_THROTTLE_SSD;
				else
					scale = WRITE_THROTTLE;

				if (flags & CL_CLOSE)
					scale += MAX_CLUSTERS;

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & (CL_IOSTREAMING))
		io_flags |= B_IOSTREAMING;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_DIRECT_IO)
		io_flags |= B_PHYS;
	if (flags & (CL_PRESERVE | CL_KEEPCACHED))
		io_flags |= B_CACHE;
	if (flags & CL_PASSIVE)
		io_flags |= B_PASSIVE;
	if (flags & CL_ENCRYPTED)
		io_flags |= B_ENCRYPTED_IO;
	if (vp->v_flag & VSYSTEM)
		io_flags |= B_META;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int	io_size_wanted;
		size_t	io_size_tmp;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
			break;

		if (io_size_tmp > io_size_wanted)
			io_size = io_size_wanted;
		else
			io_size = (u_int)io_size_tmp;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_sync_range, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t last_cbp;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released just because all of the current
				 * I/O linked to this transaction has completed...
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				/*
				 * we've got a transaction that
				 * includes the page we're about to push out through vnode_pageout...
				 * find the last bp in the list which will be the one that
				 * includes the head of this page and round its iosize down
				 * to a page boundary...
				 */
				for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
					last_cbp = cbp;

				cbp->b_bcount &= ~PAGE_MASK;

				if (cbp->b_bcount == 0) {
					/*
					 * this buf no longer has any I/O associated with it
					 */
					free_io_buf(cbp);

					if (cbp == cbp_head) {
						/*
						 * the buf we just freed was the only buf in
						 * this transaction... so there's no I/O to do
						 */
						cbp_head = NULL;
					} else {
						/*
						 * remove the buf we just freed from
						 * the transaction list
						 */
						last_cbp->b_trans_next = NULL;
						cbp_tail = last_cbp;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0)
					flags &= ~CL_COMMIT;
				break;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t	commit_offset;
			int	bytes_to_zero;
			int	complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO))
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int	pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if it's partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size)
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					else
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if it's partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size)
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				else
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;

				commit_offset = upl_offset & ~PAGE_MASK;
			}
			if ( (flags & CL_COMMIT) && pg_count) {
				ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
						     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && (complete_transaction_now || size == 0)) {
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

				trans_count = 0;
			}
			continue;
		}
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}
		/*
		 * If the transaction is going to reach the maximum number of
		 * desired elements, truncate the i/o to the nearest page so
		 * that the actual i/o is initiated after this buffer is
		 * created and added to the i/o chain.
		 *
		 * I/O directed to physically contiguous memory
		 * doesn't have a requirement to make sure we 'fill' a page
		 */
		if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
		    ((upl_offset + io_size) & PAGE_MASK)) {
			vm_offset_t aligned_ofs;

			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
			/*
			 * If the io_size does not actually finish off even a
			 * single page we have to keep adding buffers to the
			 * transaction despite having reached the desired limit.
			 *
			 * Eventually we get here with the page being finished
			 * off (and exceeded) and then we truncate the size of
			 * this i/o request so that it is page aligned so that
			 * we can finally issue the i/o on the transaction.
			 */
			if (aligned_ofs > upl_offset) {
				io_size = aligned_ofs - upl_offset;
				pg_count--;
			}
		}

		if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			u_int i;

			for (i = 0; i < pg_count; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
					panic("BUSY bp found in cluster_io");
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
				panic("buf_setcallback failed\n");
		}
		cbp->b_cliodone = (void *)callback;
		cbp->b_flags |= io_flags;
		if (flags & CL_NOCACHE)
			cbp->b_attr.ba_flags |= BA_NOCACHE;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset))
			panic("buf_setupl failed\n");

		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate))
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}
		else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;

			if ( (cbp_head->b_real_bp = real_bp) )
				real_bp = (buf_t)NULL;
		}
		*(buf_t *)(&cbp->b_trans_head) = cbp_head;

		trans_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * that we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e. size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if (size == 0) {
			/*
			 * we have no more I/O to issue, so go
			 * finish the final transaction
			 */
			need_EOT = TRUE;
		} else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
			    ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
			/*
			 * I/O directed to physically contiguous memory...
			 * which doesn't have a requirement to make sure we 'fill' a page
			 * or...
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and ...
			 * it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction so mark it as complete so that
			 * it can finish asynchronously or via the cluster_complete_transaction
			 * below if the request is synchronous
			 */
			need_EOT = TRUE;
		}
		if (need_EOT == TRUE)
			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

		if (flags & CL_THROTTLE)
			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");

		if ( !(io_flags & B_READ))
			vnode_startwrite(vp);

		if (flags & CL_RAW_ENCRYPTED) {
			/*
			 * User requested raw encrypted bytes.
			 * Twiddle the bit in the ba_flags for the buffer
			 */
			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
		}

		(void) VNOP_STRATEGY(cbp);

		if (need_EOT == TRUE) {
			if ( !(flags & CL_ASYNC))
				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);

			need_EOT = FALSE;
			trans_count = 0;
			cbp_head = NULL;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		if (cbp_head) {
			/*
			 * first wait until all of the outstanding I/O
			 * for this partial transaction has completed
			 */
			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

			/*
			 * Rewind the upl offset to the beginning of the
			 * transaction.
			 */
			upl_offset = cbp_head->b_uploffset;

			for (cbp = cbp_head; cbp;) {
				buf_t	cbp_next;

				size += cbp->b_bcount;
				io_size += cbp->b_bcount;

				cbp_next = cbp->b_trans_next;
				free_io_buf(cbp);
				cbp = cbp_next;
			}
		}
		if (iostate) {
			int need_wakeup = 0;

			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			lck_mtx_lock_spin(&iostate->io_mtxp);

			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				need_wakeup = 1;
			}
			lck_mtx_unlock(&iostate->io_mtxp);

			if (need_wakeup)
				wakeup((caddr_t)&iostate->io_wanted);
		}
		if (flags & CL_COMMIT) {
			int	upl_flags;

			pg_offset  = upl_offset & PAGE_MASK;
			abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
		}
		if (retval == 0)
			retval = error;
	} else if (cbp_head)
		panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);

	if (real_bp) {
		/*
		 * can get here if we either encountered an error
		 * or we completely zero-filled the request and
		 * no I/O was issued
		 */
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}

#define reset_vector_run_state()						\
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;

static int
vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
	   int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	vector_upl_set_pagelist(vector_upl);

	if(io_flag & CL_READ) {
		if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0))
			io_flag &= ~CL_PRESERVE; /*don't zero fill*/
		else
			io_flag |= CL_PRESERVE; /*zero fill*/
	}
	return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));

}

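/*
 * Speculative read-ahead support: cluster_read_prefetch issues an
 * advisory read for up to 'size' bytes (clamped to EOF) and returns the
 * number of pages it asked for; cluster_read_ahead uses the per-vnode
 * cl_readahead state to decide whether the current read extends a
 * sequential pattern and, if so, how far ahead to prefetch.
 */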
static int
cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	int	pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}


static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
		   int bflag)
{
	daddr64_t	r_addr;
	off_t		f_offset;
	int		size_of_prefetch;
	u_int		max_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

		return;
	}
	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD));

	if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max)
		max_prefetch = (speculative_prefetch_max * PAGE_SIZE);

	if (max_prefetch <= PAGE_SIZE) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
		return;
	}
	if (extent->e_addr < rap->cl_maxra) {
		if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		if (read_size > rap->cl_ralen) {
			if (read_size > max_prefetch / PAGE_SIZE)
				rap->cl_ralen = max_prefetch / PAGE_SIZE;
			else
				rap->cl_ralen = read_size;
		}
		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

		if (size_of_prefetch)
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}


int
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
{
	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);

}


int
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int	io_size;
	int	rounded_size;
	off_t	max_size;
	int	local_flags;

	local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if ((flags & UPL_KEEPCACHED))
		local_flags |= CL_KEEPCACHED;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-out from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					UPL_ABORT_FREE_ON_EMPTY);
	}
	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
			   local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
}
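
/*
 * Hypothetical usage sketch (not part of this file): a filesystem's
 * VNOP_PAGEIN handler typically forwards its UPL straight to the
 * cluster layer, e.g.
 *
 *	return cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *	                      ap->a_size, filesize, ap->a_flags);
 *
 * where 'filesize' is the current EOF known to the filesystem.
 */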
off_t max_size; 1887 int retval; 1888 int local_flags = 0; 1889 1890 if (upl == NULL || size < 0) 1891 panic("cluster_pagein: NULL upl passed in"); 1892 1893 if ((flags & UPL_IOSYNC) == 0) 1894 local_flags |= CL_ASYNC; 1895 if ((flags & UPL_NOCOMMIT) == 0) 1896 local_flags |= CL_COMMIT; 1897 if (flags & UPL_IOSTREAMING) 1898 local_flags |= CL_IOSTREAMING; 1899 if (flags & UPL_PAGING_ENCRYPTED) 1900 local_flags |= CL_ENCRYPTED; 1901 1902 1903 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, 1904 (int)f_offset, size, (int)filesize, local_flags, 0); 1905 1906 /* 1907 * can't page-in from a negative offset 1908 * or if we're starting beyond the EOF 1909 * or if the file offset isn't page aligned 1910 * or the size requested isn't a multiple of PAGE_SIZE 1911 */ 1912 if (f_offset < 0 || f_offset >= filesize || 1913 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) { 1914 if (local_flags & CL_COMMIT) 1915 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1916 return (EINVAL); 1917 } 1918 max_size = filesize - f_offset; 1919 1920 if (size < max_size) 1921 io_size = size; 1922 else 1923 io_size = max_size; 1924 1925 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 1926 1927 if (size > rounded_size && (local_flags & CL_COMMIT)) 1928 ubc_upl_abort_range(upl, upl_offset + rounded_size, 1929 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1930 1931 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, 1932 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 1933 1934 return (retval); 1935} 1936 1937 1938int 1939cluster_bp(buf_t bp) 1940{ 1941 return cluster_bp_ext(bp, NULL, NULL); 1942} 1943 1944 1945int 1946cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg) 1947{ 1948 off_t f_offset; 1949 int flags; 1950 1951 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START, 1952 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); 1953 1954 if (bp->b_flags & B_READ) 1955 flags = CL_ASYNC | CL_READ; 1956 else 1957 flags = CL_ASYNC; 1958 if (bp->b_flags & B_PASSIVE) 1959 flags |= CL_PASSIVE; 1960 1961 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); 1962 1963 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg)); 1964} 1965 1966 1967 1968int 1969cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags) 1970{ 1971 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL); 1972} 1973 1974 1975int 1976cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, 1977 int xflags, int (*callback)(buf_t, void *), void *callback_arg) 1978{ 1979 user_ssize_t cur_resid; 1980 int retval = 0; 1981 int flags; 1982 int zflags; 1983 int bflag; 1984 int write_type = IO_COPY; 1985 u_int32_t write_length; 1986 1987 flags = xflags; 1988 1989 if (flags & IO_PASSIVE) 1990 bflag = CL_PASSIVE; 1991 else 1992 bflag = 0; 1993 1994 if (vp->v_flag & VNOCACHE_DATA){ 1995 flags |= IO_NOCACHE; 1996 bflag |= CL_NOCACHE; 1997 } 1998 if (uio == NULL) { 1999 /* 2000 * no user data... 
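* (the caller passed a NULL uio, as ftruncate does when growing a file, relying on the zero-fill logic in cluster_write_copy)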
2001 * this call is being made to zero-fill some range in the file 2002 */ 2003 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg); 2004 2005 return(retval); 2006 } 2007 /* 2008 * do a write through the cache if one of the following is true.... 2009 * NOCACHE is not true or NODIRECT is true 2010 * the uio request doesn't target USERSPACE 2011 * otherwise, find out if we want the direct or contig variant for 2012 * the first vector in the uio request 2013 */ 2014 if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) 2015 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 2016 2017 if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) 2018 /* 2019 * must go through the cached variant in this case 2020 */ 2021 write_type = IO_COPY; 2022 2023 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) { 2024 2025 switch (write_type) { 2026 2027 case IO_COPY: 2028 /* 2029 * make sure the uio_resid isn't too big... 2030 * internally, we want to handle all of the I/O in 2031 * chunk sizes that fit in a 32 bit int 2032 */ 2033 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) { 2034 /* 2035 * we're going to have to call cluster_write_copy 2036 * more than once... 2037 * 2038 * only want the last call to cluster_write_copy to 2039 * have the IO_TAILZEROFILL flag set and only the 2040 * first call should have IO_HEADZEROFILL 2041 */ 2042 zflags = flags & ~IO_TAILZEROFILL; 2043 flags &= ~IO_HEADZEROFILL; 2044 2045 write_length = MAX_IO_REQUEST_SIZE; 2046 } else { 2047 /* 2048 * last call to cluster_write_copy 2049 */ 2050 zflags = flags; 2051 2052 write_length = (u_int32_t)cur_resid; 2053 } 2054 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg); 2055 break; 2056 2057 case IO_CONTIG: 2058 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL); 2059 2060 if (flags & IO_HEADZEROFILL) { 2061 /* 2062 * only do this once per request 2063 */ 2064 flags &= ~IO_HEADZEROFILL; 2065 2066 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset, 2067 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 2068 if (retval) 2069 break; 2070 } 2071 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag); 2072 2073 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) { 2074 /* 2075 * we're done with the data from the user specified buffer(s) 2076 * and we've been requested to zero fill at the tail 2077 * treat this as an IO_HEADZEROFILL which doesn't require a uio 2078 * by rearranging the args and passing in IO_HEADZEROFILL 2079 */ 2080 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset, 2081 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 2082 } 2083 break; 2084 2085 case IO_DIRECT: 2086 /* 2087 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL 2088 */ 2089 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg); 2090 break; 2091 2092 case IO_UNKNOWN: 2093 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 2094 break; 2095 } 2096 /* 2097 * in case we end up calling cluster_write_copy (from cluster_write_direct) 2098 * multiple times to service a 
multi-vector request that is not aligned properly 2099 * we need to update the oldEOF so that we 2100 * don't zero-fill the head of a page if we've successfully written 2101 * data to that area... 'cluster_write_copy' will zero-fill the head of a 2102 * page that is beyond the oldEOF if the write is unaligned... we only 2103 * want that to happen for the very first page of the cluster_write, 2104 * NOT the first page of each vector making up a multi-vector write. 2105 */ 2106 if (uio->uio_offset > oldEOF) 2107 oldEOF = uio->uio_offset; 2108 } 2109 return (retval); 2110} 2111 2112 2113static int 2114cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length, 2115 int flags, int (*callback)(buf_t, void *), void *callback_arg) 2116{ 2117 upl_t upl; 2118 upl_page_info_t *pl; 2119 vm_offset_t upl_offset; 2120 vm_offset_t vector_upl_offset = 0; 2121 u_int32_t io_req_size; 2122 u_int32_t offset_in_file; 2123 u_int32_t offset_in_iovbase; 2124 u_int32_t io_size; 2125 int io_flag = 0; 2126 upl_size_t upl_size, vector_upl_size = 0; 2127 vm_size_t upl_needed_size; 2128 mach_msg_type_number_t pages_in_pl; 2129 int upl_flags; 2130 kern_return_t kret; 2131 mach_msg_type_number_t i; 2132 int force_data_sync; 2133 int retval = 0; 2134 int first_IO = 1; 2135 struct clios iostate; 2136 user_addr_t iov_base; 2137 u_int32_t mem_alignment_mask; 2138 u_int32_t devblocksize; 2139 u_int32_t max_io_size; 2140 u_int32_t max_upl_size; 2141 u_int32_t max_vector_size; 2142 boolean_t io_throttled = FALSE; 2143 2144 u_int32_t vector_upl_iosize = 0; 2145 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); 2146 off_t v_upl_uio_offset = 0; 2147 int vector_upl_index=0; 2148 upl_t vector_upl = NULL; 2149 2150 2151 /* 2152 * When we enter this routine, we know 2153 * -- the resid will not exceed iov_len 2154 */ 2155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, 2156 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 2157 2158 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 2159 2160 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO; 2161 2162 if (flags & IO_PASSIVE) 2163 io_flag |= CL_PASSIVE; 2164 2165 if (flags & IO_NOCACHE) 2166 io_flag |= CL_NOCACHE; 2167 2168 iostate.io_completed = 0; 2169 iostate.io_issued = 0; 2170 iostate.io_error = 0; 2171 iostate.io_wanted = 0; 2172 2173 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 2174 2175 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 2176 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 2177 2178 if (devblocksize == 1) { 2179 /* 2180 * the AFP client advertises a devblocksize of 1 2181 * however, its BLOCKMAP routine maps to physical 2182 * blocks that are PAGE_SIZE in size... 2183 * therefore we can't ask for I/Os that aren't page aligned 2184 * or aren't multiples of PAGE_SIZE in size 2185 * by setting devblocksize to PAGE_SIZE, we re-instate 2186 * the old behavior we had before the mem_alignment_mask 2187 * changes went in... 
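* (with devblocksize forced up to PAGE_SIZE, the iov_base alignment check below effectively requires the user buffer to start on a page boundary before we'll take the direct path)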
2188 */ 2189 devblocksize = PAGE_SIZE; 2190 } 2191 2192next_dwrite: 2193 io_req_size = *write_length; 2194 iov_base = uio_curriovbase(uio); 2195 2196 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK; 2197 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 2198 2199 if (offset_in_file || offset_in_iovbase) { 2200 /* 2201 * one of the 2 important offsets is misaligned 2202 * so fire an I/O through the cache for this entire vector 2203 */ 2204 goto wait_for_dwrites; 2205 } 2206 if (iov_base & (devblocksize - 1)) { 2207 /* 2208 * the offset in memory must be on a device block boundary 2209 * so that we can guarantee that we can generate an 2210 * I/O that ends on a page boundary in cluster_io 2211 */ 2212 goto wait_for_dwrites; 2213 } 2214 2215 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { 2216 int throttle_type; 2217 2218 if ( (throttle_type = cluster_hard_throttle_on(vp, 1)) ) { 2219 /* 2220 * we're in the throttle window, at the very least 2221 * we want to limit the size of the I/O we're about 2222 * to issue 2223 */ 2224 if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == 2) { 2225 /* 2226 * we're in the throttle window and at least 1 I/O 2227 * has already been issued by a throttleable thread 2228 * in this window, so return with EAGAIN to indicate 2229 * to the FS issuing the cluster_write call that it 2230 * should now throttle after dropping any locks 2231 */ 2232 throttle_info_update_by_mount(vp->v_mount); 2233 2234 io_throttled = TRUE; 2235 goto wait_for_dwrites; 2236 } 2237 max_vector_size = THROTTLE_MAX_IOSIZE; 2238 max_io_size = THROTTLE_MAX_IOSIZE; 2239 } else { 2240 max_vector_size = MAX_VECTOR_UPL_SIZE; 2241 max_io_size = max_upl_size; 2242 } 2243 2244 if (first_IO) { 2245 cluster_syncup(vp, newEOF, callback, callback_arg); 2246 first_IO = 0; 2247 } 2248 io_size = io_req_size & ~PAGE_MASK; 2249 iov_base = uio_curriovbase(uio); 2250 2251 if (io_size > max_io_size) 2252 io_size = max_io_size; 2253 2254 if(useVectorUPL && (iov_base & PAGE_MASK)) { 2255 /* 2256 * We have an iov_base that's not page-aligned. 2257 * Issue all I/O's that have been collected within 2258 * this Vectored UPL. 2259 */ 2260 if(vector_upl_index) { 2261 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2262 reset_vector_run_state(); 2263 } 2264 2265 /* 2266 * After this point, if we are using the Vector UPL path and the base is 2267 * not page-aligned then the UPL with that base will be the first in the vector UPL. 
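* (any partially accumulated vector was just issued and reset above, so a UPL with a misaligned base can only ever start a fresh vector rather than land in the middle of one)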
2268 */ 2269 } 2270 2271 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 2272 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 2273 2274 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, 2275 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 2276 2277 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 2278 pages_in_pl = 0; 2279 upl_size = upl_needed_size; 2280 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2281 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2282 2283 kret = vm_map_get_upl(current_map(), 2284 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2285 &upl_size, 2286 &upl, 2287 NULL, 2288 &pages_in_pl, 2289 &upl_flags, 2290 force_data_sync); 2291 2292 if (kret != KERN_SUCCESS) { 2293 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2294 0, 0, 0, kret, 0); 2295 /* 2296 * failed to get pagelist 2297 * 2298 * we may have already spun some portion of this request 2299 * off as async requests... we need to wait for the I/O 2300 * to complete before returning 2301 */ 2302 goto wait_for_dwrites; 2303 } 2304 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 2305 pages_in_pl = upl_size / PAGE_SIZE; 2306 2307 for (i = 0; i < pages_in_pl; i++) { 2308 if (!upl_valid_page(pl, i)) 2309 break; 2310 } 2311 if (i == pages_in_pl) 2312 break; 2313 2314 /* 2315 * didn't get all the pages back that we 2316 * needed... release this upl and try again 2317 */ 2318 ubc_upl_abort(upl, 0); 2319 } 2320 if (force_data_sync >= 3) { 2321 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2322 i, pages_in_pl, upl_size, kret, 0); 2323 /* 2324 * for some reason, we couldn't acquire a hold on all 2325 * the pages needed in the user's address space 2326 * 2327 * we may have already spun some portion of this request 2328 * off as async requests... we need to wait for the I/O 2329 * to complete before returning 2330 */ 2331 goto wait_for_dwrites; 2332 } 2333 2334 /* 2335 * Consider the possibility that upl_size wasn't satisfied. 2336 */ 2337 if (upl_size < upl_needed_size) { 2338 if (upl_size && upl_offset == 0) 2339 io_size = upl_size; 2340 else 2341 io_size = 0; 2342 } 2343 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2344 (int)upl_offset, upl_size, (int)iov_base, io_size, 0); 2345 2346 if (io_size == 0) { 2347 ubc_upl_abort(upl, 0); 2348 /* 2349 * we may have already spun some portion of this request 2350 * off as async requests... we need to wait for the I/O 2351 * to complete before returning 2352 */ 2353 goto wait_for_dwrites; 2354 } 2355 2356 if(useVectorUPL) { 2357 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); 2358 if(end_off) 2359 issueVectorUPL = 1; 2360 /* 2361 * After this point, if we are using a vector UPL, then 2362 * either all the UPL elements end on a page boundary OR 2363 * this UPL is the last element because it does not end 2364 * on a page boundary. 2365 */ 2366 } 2367 2368 /* 2369 * Now look for pages already in the cache 2370 * and throw them away. 
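* (any copies left behind by earlier cached reads of this range would otherwise go stale once the direct write below completes)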
uio->uio_offset is page aligned within the file 2372 * io_size is a multiple of PAGE_SIZE 2373 */ 2374 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL); 2375 2376 /* 2377 * we want to push out these writes asynchronously so that we can overlap 2378 * the preparation of the next I/O 2379 * if there are already too many outstanding writes 2380 * wait until some complete before issuing the next 2381 */ 2382 if (iostate.io_issued > iostate.io_completed) 2383 cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); 2384 2385 if (iostate.io_error) { 2386 /* 2387 * one of the earlier writes we issued ran into a hard error 2388 * don't issue any more writes, clean up the UPL 2389 * that was just created but not used, then 2390 * go wait for all writes that are part of this stream 2391 * to complete before returning the error to the caller 2392 */ 2393 ubc_upl_abort(upl, 0); 2394 2395 goto wait_for_dwrites; 2396 } 2397 2398 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, 2399 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); 2400 2401 if(!useVectorUPL) 2402 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, 2403 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2404 2405 else { 2406 if(!vector_upl_index) { 2407 vector_upl = vector_upl_create(upl_offset); 2408 v_upl_uio_offset = uio->uio_offset; 2409 vector_upl_offset = upl_offset; 2410 } 2411 2412 vector_upl_set_subupl(vector_upl,upl,upl_size); 2413 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); 2414 vector_upl_index++; 2415 vector_upl_iosize += io_size; 2416 vector_upl_size += upl_size; 2417 2418 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { 2419 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2420 reset_vector_run_state(); 2421 } 2422 } 2423 2424 /* 2425 * update the uio structure to 2426 * reflect the I/O that we just issued 2427 */ 2428 uio_update(uio, (user_size_t)io_size); 2429 2430 /* 2431 * in case we end up calling through to cluster_write_copy to finish 2432 * the tail of this request, we need to update the oldEOF so that we 2433 * don't zero-fill the head of a page if we've successfully written 2434 * data to that area... 'cluster_write_copy' will zero-fill the head of a 2435 * page that is beyond the oldEOF if the write is unaligned... we only 2436 * want that to happen for the very first page of the cluster_write, 2437 * NOT the first page of each vector making up a multi-vector write.
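* (in other words, once real data has gone out past the old EOF we must stop treating that area as a hole, otherwise a later unaligned vector could zero-fill over bytes that were just written)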
2438 */ 2439 if (uio->uio_offset > oldEOF) 2440 oldEOF = uio->uio_offset; 2441 2442 io_req_size -= io_size; 2443 2444 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, 2445 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0); 2446 2447 } /* end while */ 2448 2449 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) { 2450 2451 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE); 2452 2453 if (retval == 0 && *write_type == IO_DIRECT) { 2454 2455 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE, 2456 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 2457 2458 goto next_dwrite; 2459 } 2460 } 2461 2462wait_for_dwrites: 2463 2464 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { 2465 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2466 reset_vector_run_state(); 2467 } 2468 2469 if (iostate.io_issued > iostate.io_completed) { 2470 /* 2471 * make sure all async writes issued as part of this stream 2472 * have completed before we return 2473 */ 2474 cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); 2475 } 2476 if (iostate.io_error) 2477 retval = iostate.io_error; 2478 2479 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 2480 2481 if (io_throttled == TRUE && retval == 0) 2482 retval = EAGAIN; 2483 2484 if (io_req_size && retval == 0) { 2485 /* 2486 * we couldn't handle the tail of this request in DIRECT mode 2487 * so fire it through the copy path 2488 * 2489 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set 2490 * so we can just pass 0 in for the headOff and tailOff 2491 */ 2492 if (uio->uio_offset > oldEOF) 2493 oldEOF = uio->uio_offset; 2494 2495 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg); 2496 2497 *write_type = IO_UNKNOWN; 2498 } 2499 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, 2500 (int)uio->uio_offset, io_req_size, retval, 4, 0); 2501 2502 return (retval); 2503} 2504 2505 2506static int 2507cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length, 2508 int (*callback)(buf_t, void *), void *callback_arg, int bflag) 2509{ 2510 upl_page_info_t *pl; 2511 addr64_t src_paddr = 0; 2512 upl_t upl[MAX_VECTS]; 2513 vm_offset_t upl_offset; 2514 u_int32_t tail_size = 0; 2515 u_int32_t io_size; 2516 u_int32_t xsize; 2517 upl_size_t upl_size; 2518 vm_size_t upl_needed_size; 2519 mach_msg_type_number_t pages_in_pl; 2520 int upl_flags; 2521 kern_return_t kret; 2522 struct clios iostate; 2523 int error = 0; 2524 int cur_upl = 0; 2525 int num_upl = 0; 2526 int n; 2527 user_addr_t iov_base; 2528 u_int32_t devblocksize; 2529 u_int32_t mem_alignment_mask; 2530 2531 /* 2532 * When we enter this routine, we know 2533 * -- the io_req_size will not exceed iov_len 2534 * -- the target address is physically contiguous 2535 */ 2536 cluster_syncup(vp, newEOF, callback, callback_arg); 2537 2538 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 2539 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 2540 2541 iostate.io_completed = 0; 2542 iostate.io_issued = 0; 2543 iostate.io_error = 0; 2544 iostate.io_wanted = 0; 2545 2546 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 2547 2548next_cwrite: 2549 io_size = *write_length; 2550 2551 iov_base = uio_curriovbase(uio); 2552 2553 upl_offset = (vm_offset_t)((u_int32_t)iov_base & 
PAGE_MASK); 2554 upl_needed_size = upl_offset + io_size; 2555 2556 pages_in_pl = 0; 2557 upl_size = upl_needed_size; 2558 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2559 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2560 2561 kret = vm_map_get_upl(current_map(), 2562 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2563 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 2564 2565 if (kret != KERN_SUCCESS) { 2566 /* 2567 * failed to get pagelist 2568 */ 2569 error = EINVAL; 2570 goto wait_for_cwrites; 2571 } 2572 num_upl++; 2573 2574 /* 2575 * Consider the possibility that upl_size wasn't satisfied. 2576 */ 2577 if (upl_size < upl_needed_size) { 2578 /* 2579 * This is a failure in the physical memory case. 2580 */ 2581 error = EINVAL; 2582 goto wait_for_cwrites; 2583 } 2584 pl = ubc_upl_pageinfo(upl[cur_upl]); 2585 2586 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; 2587 2588 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 2589 u_int32_t head_size; 2590 2591 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 2592 2593 if (head_size > io_size) 2594 head_size = io_size; 2595 2596 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg); 2597 2598 if (error) 2599 goto wait_for_cwrites; 2600 2601 upl_offset += head_size; 2602 src_paddr += head_size; 2603 io_size -= head_size; 2604 2605 iov_base += head_size; 2606 } 2607 if ((u_int32_t)iov_base & mem_alignment_mask) { 2608 /* 2609 * request doesn't set up on a memory boundary 2610 * the underlying DMA engine can handle... 2611 * return an error instead of going through 2612 * the slow copy path since the intent of this 2613 * path is direct I/O from device memory 2614 */ 2615 error = EINVAL; 2616 goto wait_for_cwrites; 2617 } 2618 2619 tail_size = io_size & (devblocksize - 1); 2620 io_size -= tail_size; 2621 2622 while (io_size && error == 0) { 2623 2624 if (io_size > MAX_IO_CONTIG_SIZE) 2625 xsize = MAX_IO_CONTIG_SIZE; 2626 else 2627 xsize = io_size; 2628 /* 2629 * request asynchronously so that we can overlap 2630 * the preparation of the next I/O... we'll do 2631 * the commit after all the I/O has completed 2632 * since its all issued against the same UPL 2633 * if there are already too many outstanding writes 2634 * wait until some have completed before issuing the next 2635 */ 2636 if (iostate.io_issued > iostate.io_completed) 2637 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); 2638 2639 if (iostate.io_error) { 2640 /* 2641 * one of the earlier writes we issued ran into a hard error 2642 * don't issue any more writes... 
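* (iostate.io_error latches the first error seen on this I/O stream)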
2643 * go wait for all writes that are part of this stream 2644 * to complete before returning the error to the caller 2645 */ 2646 goto wait_for_cwrites; 2647 } 2648 /* 2649 * issue an asynchronous write to cluster_io 2650 */ 2651 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, 2652 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg); 2653 2654 if (error == 0) { 2655 /* 2656 * The cluster_io write completed successfully, 2657 * update the uio structure 2658 */ 2659 uio_update(uio, (user_size_t)xsize); 2660 2661 upl_offset += xsize; 2662 src_paddr += xsize; 2663 io_size -= xsize; 2664 } 2665 } 2666 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) { 2667 2668 error = cluster_io_type(uio, write_type, write_length, 0); 2669 2670 if (error == 0 && *write_type == IO_CONTIG) { 2671 cur_upl++; 2672 goto next_cwrite; 2673 } 2674 } else 2675 *write_type = IO_UNKNOWN; 2676 2677wait_for_cwrites: 2678 /* 2679 * make sure all async writes that are part of this stream 2680 * have completed before we proceed 2681 */ 2682 if (iostate.io_issued > iostate.io_completed) 2683 cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); 2684 2685 if (iostate.io_error) 2686 error = iostate.io_error; 2687 2688 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 2689 2690 if (error == 0 && tail_size) 2691 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); 2692 2693 for (n = 0; n < num_upl; n++) 2694 /* 2695 * just release our hold on each physically contiguous 2696 * region without changing any state 2697 */ 2698 ubc_upl_abort(upl[n], 0); 2699 2700 return (error); 2701} 2702 2703 2704/* 2705 * need to avoid a race between an msync of a range of pages dirtied via mmap 2706 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's 2707 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd 2708 * 2709 * we should never force-zero-fill pages that are already valid in the cache... 
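* (hence the upl_valid_page() check in cluster_zero_range() below)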
2710 * the entire page contains valid data (either from disk, zero-filled or dirtied 2711 * via an mmap) so we can only do damage by trying to zero-fill 2712 * 2713 */ 2714static int 2715cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero) 2716{ 2717 int zero_pg_index; 2718 boolean_t need_cluster_zero = TRUE; 2719 2720 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { 2721 2722 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64)); 2723 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64); 2724 2725 if (upl_valid_page(pl, zero_pg_index)) { 2726 /* 2727 * never force zero valid pages - dirty or clean 2728 * we'll leave these in the UPL for cluster_write_copy to deal with 2729 */ 2730 need_cluster_zero = FALSE; 2731 } 2732 } 2733 if (need_cluster_zero == TRUE) 2734 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2735 2736 return (bytes_to_zero); 2737} 2738 2739 2740static int 2741cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff, 2742 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg) 2743{ 2744 upl_page_info_t *pl; 2745 upl_t upl; 2746 vm_offset_t upl_offset = 0; 2747 vm_size_t upl_size; 2748 off_t upl_f_offset; 2749 int pages_in_upl; 2750 int start_offset; 2751 int xfer_resid; 2752 int io_size; 2753 int io_offset; 2754 int bytes_to_zero; 2755 int bytes_to_move; 2756 kern_return_t kret; 2757 int retval = 0; 2758 int io_resid; 2759 long long total_size; 2760 long long zero_cnt; 2761 off_t zero_off; 2762 long long zero_cnt1; 2763 off_t zero_off1; 2764 off_t write_off = 0; 2765 int write_cnt = 0; 2766 boolean_t first_pass = FALSE; 2767 struct cl_extent cl; 2768 struct cl_writebehind *wbp; 2769 int bflag; 2770 u_int max_cluster_pgcount; 2771 u_int max_io_size; 2772 2773 if (uio) { 2774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2775 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0); 2776 2777 io_resid = io_req_size; 2778 } else { 2779 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2780 0, 0, (int)oldEOF, (int)newEOF, 0); 2781 2782 io_resid = 0; 2783 } 2784 if (flags & IO_PASSIVE) 2785 bflag = CL_PASSIVE; 2786 else 2787 bflag = 0; 2788 if (flags & IO_NOCACHE) 2789 bflag |= CL_NOCACHE; 2790 2791 zero_cnt = 0; 2792 zero_cnt1 = 0; 2793 zero_off = 0; 2794 zero_off1 = 0; 2795 2796 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 2797 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 2798 2799 if (flags & IO_HEADZEROFILL) { 2800 /* 2801 * some filesystems (HFS is one) don't support unallocated holes within a file... 2802 * so we zero fill the intervening space between the old EOF and the offset 2803 * where the next chunk of real data begins.... ftruncate will also use this 2804 * routine to zero fill to the new EOF when growing a file... 
in this case, the 2805 * uio structure will not be provided 2806 */ 2807 if (uio) { 2808 if (headOff < uio->uio_offset) { 2809 zero_cnt = uio->uio_offset - headOff; 2810 zero_off = headOff; 2811 } 2812 } else if (headOff < newEOF) { 2813 zero_cnt = newEOF - headOff; 2814 zero_off = headOff; 2815 } 2816 } else { 2817 if (uio && uio->uio_offset > oldEOF) { 2818 zero_off = uio->uio_offset & ~PAGE_MASK_64; 2819 2820 if (zero_off >= oldEOF) { 2821 zero_cnt = uio->uio_offset - zero_off; 2822 2823 flags |= IO_HEADZEROFILL; 2824 } 2825 } 2826 } 2827 if (flags & IO_TAILZEROFILL) { 2828 if (uio) { 2829 zero_off1 = uio->uio_offset + io_req_size; 2830 2831 if (zero_off1 < tailOff) 2832 zero_cnt1 = tailOff - zero_off1; 2833 } 2834 } else { 2835 if (uio && newEOF > oldEOF) { 2836 zero_off1 = uio->uio_offset + io_req_size; 2837 2838 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) { 2839 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64); 2840 2841 flags |= IO_TAILZEROFILL; 2842 } 2843 } 2844 } 2845 if (zero_cnt == 0 && uio == (struct uio *) 0) { 2846 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, 2847 retval, 0, 0, 0, 0); 2848 return (0); 2849 } 2850 if (uio) { 2851 write_off = uio->uio_offset; 2852 write_cnt = uio_resid(uio); 2853 /* 2854 * delay updating the sequential write info 2855 * in the control block until we've obtained 2856 * the lock for it 2857 */ 2858 first_pass = TRUE; 2859 } 2860 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { 2861 /* 2862 * for this iteration of the loop, figure out where our starting point is 2863 */ 2864 if (zero_cnt) { 2865 start_offset = (int)(zero_off & PAGE_MASK_64); 2866 upl_f_offset = zero_off - start_offset; 2867 } else if (io_resid) { 2868 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2869 upl_f_offset = uio->uio_offset - start_offset; 2870 } else { 2871 start_offset = (int)(zero_off1 & PAGE_MASK_64); 2872 upl_f_offset = zero_off1 - start_offset; 2873 } 2874 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE, 2875 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0); 2876 2877 if (total_size > max_io_size) 2878 total_size = max_io_size; 2879 2880 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); 2881 2882 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) { 2883 /* 2884 * assumption... total_size <= io_resid 2885 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set 2886 */ 2887 if ((start_offset + total_size) > max_io_size) 2888 total_size = max_io_size - start_offset; 2889 xfer_resid = total_size; 2890 2891 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); 2892 2893 if (retval) 2894 break; 2895 2896 io_resid -= (total_size - xfer_resid); 2897 total_size = xfer_resid; 2898 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2899 upl_f_offset = uio->uio_offset - start_offset; 2900 2901 if (total_size == 0) { 2902 if (start_offset) { 2903 /* 2904 * the write did not finish on a page boundary 2905 * which will leave upl_f_offset pointing to the 2906 * beginning of the last page written instead of 2907 * the page beyond it... bump it in this case 2908 * so that the cluster code records the last page 2909 * written as dirty 2910 */ 2911 upl_f_offset += PAGE_SIZE_64; 2912 } 2913 upl_size = 0; 2914 2915 goto check_cluster; 2916 } 2917 } 2918 /* 2919 * compute the size of the upl needed to encompass 2920 * the requested write... limit each call to cluster_io 2921 * to the maximum UPL size... 
cluster_io will clip if 2922 * this exceeds the maximum io_size for the device, 2923 * make sure to account for 2924 * a starting offset that's not page aligned 2925 */ 2926 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 2927 2928 if (upl_size > max_io_size) 2929 upl_size = max_io_size; 2930 2931 pages_in_upl = upl_size / PAGE_SIZE; 2932 io_size = upl_size - start_offset; 2933 2934 if ((long long)io_size > total_size) 2935 io_size = total_size; 2936 2937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0); 2938 2939 2940 /* 2941 * Gather the pages from the buffer cache. 2942 * The UPL_WILL_MODIFY flag lets the UPL subsystem know 2943 * that we intend to modify these pages. 2944 */ 2945 kret = ubc_create_upl(vp, 2946 upl_f_offset, 2947 upl_size, 2948 &upl, 2949 &pl, 2950 UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY)); 2951 if (kret != KERN_SUCCESS) 2952 panic("cluster_write_copy: failed to get pagelist"); 2953 2954 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, 2955 upl, (int)upl_f_offset, start_offset, 0, 0); 2956 2957 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) { 2958 int read_size; 2959 2960 /* 2961 * we're starting in the middle of the first page of the upl 2962 * and the page isn't currently valid, so we're going to have 2963 * to read it in first... this is a synchronous operation 2964 */ 2965 read_size = PAGE_SIZE; 2966 2967 if ((upl_f_offset + read_size) > oldEOF) 2968 read_size = oldEOF - upl_f_offset; 2969 2970 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, 2971 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 2972 if (retval) { 2973 /* 2974 * we had an error during the read which causes us to abort 2975 * the current cluster_write request... before we do, we need 2976 * to release the rest of the pages in the upl without modifying 2977 * their state and mark the failed page in error 2978 */ 2979 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 2980 2981 if (upl_size > PAGE_SIZE) 2982 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 2983 2984 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2985 upl, 0, 0, retval, 0); 2986 break; 2987 } 2988 } 2989 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) { 2990 /* 2991 * the last offset we're writing to in this upl does not end on a page 2992 * boundary... if it's not beyond the old EOF, then we'll also need to 2993 * pre-read this page in if it isn't already valid 2994 */ 2995 upl_offset = upl_size - PAGE_SIZE; 2996 2997 if ((upl_f_offset + start_offset + io_size) < oldEOF && 2998 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) { 2999 int read_size; 3000 3001 read_size = PAGE_SIZE; 3002 3003 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) 3004 read_size = oldEOF - (upl_f_offset + upl_offset); 3005 3006 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, 3007 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 3008 if (retval) { 3009 /* 3010 * we had an error during the read which causes us to abort 3011 * the current cluster_write request...
before we do, we 3012 * need to release the rest of the pages in the upl without 3013 * modifying their state and mark the failed page in error 3014 */ 3015 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 3016 3017 if (upl_size > PAGE_SIZE) 3018 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 3019 3020 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 3021 upl, 0, 0, retval, 0); 3022 break; 3023 } 3024 } 3025 } 3026 xfer_resid = io_size; 3027 io_offset = start_offset; 3028 3029 while (zero_cnt && xfer_resid) { 3030 3031 if (zero_cnt < (long long)xfer_resid) 3032 bytes_to_zero = zero_cnt; 3033 else 3034 bytes_to_zero = xfer_resid; 3035 3036 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero); 3037 3038 xfer_resid -= bytes_to_zero; 3039 zero_cnt -= bytes_to_zero; 3040 zero_off += bytes_to_zero; 3041 io_offset += bytes_to_zero; 3042 } 3043 if (xfer_resid && io_resid) { 3044 u_int32_t io_requested; 3045 3046 bytes_to_move = min(io_resid, xfer_resid); 3047 io_requested = bytes_to_move; 3048 3049 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested); 3050 3051 if (retval) { 3052 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 3053 3054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 3055 upl, 0, 0, retval, 0); 3056 } else { 3057 io_resid -= bytes_to_move; 3058 xfer_resid -= bytes_to_move; 3059 io_offset += bytes_to_move; 3060 } 3061 } 3062 while (xfer_resid && zero_cnt1 && retval == 0) { 3063 3064 if (zero_cnt1 < (long long)xfer_resid) 3065 bytes_to_zero = zero_cnt1; 3066 else 3067 bytes_to_zero = xfer_resid; 3068 3069 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero); 3070 3071 xfer_resid -= bytes_to_zero; 3072 zero_cnt1 -= bytes_to_zero; 3073 zero_off1 += bytes_to_zero; 3074 io_offset += bytes_to_zero; 3075 } 3076 if (retval == 0) { 3077 int cl_index; 3078 int ret_cluster_try_push; 3079 3080 io_size += start_offset; 3081 3082 if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) { 3083 /* 3084 * if we're extending the file with this write 3085 * we'll zero fill the rest of the page so that 3086 * if the file gets extended again in such a way as to leave a 3087 * hole starting at this EOF, we'll have zeros in the correct spot 3088 */ 3089 cluster_zero(upl, io_size, upl_size - io_size, NULL); 3090 } 3091 /* 3092 * release the upl now if we hold one since... 3093 * 1) pages in it may be present in the sparse cluster map 3094 * and may span 2 separate buckets there... if they do and 3095 * we happen to have to flush a bucket to make room and it intersects 3096 * this upl, a deadlock may result on page BUSY 3097 * 2) we're delaying the I/O... from this point forward we're just updating 3098 * the cluster state... no need to hold the pages, so commit them 3099 * 3) IO_SYNC is set... 3100 * because we had to ask for a UPL that provides currently non-present pages, the 3101 * UPL has been automatically set to clear the dirty flags (both software and hardware) 3102 * upon committing it... this is not the behavior we want since it's possible for 3103 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. 3104 * we'll pick these pages back up later with the correct behavior specified. 3105 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock...
if a flush 3106 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages 3107 * we hold since the flushing context is holding the cluster lock. 3108 */ 3109 ubc_upl_commit_range(upl, 0, upl_size, 3110 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); 3111check_cluster: 3112 /* 3113 * calculate the last logical block number 3114 * that this delayed I/O encompassed 3115 */ 3116 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); 3117 3118 if (flags & IO_SYNC) { 3119 /* 3120 * if the IO_SYNC flag is set than we need to 3121 * bypass any clusters and immediately issue 3122 * the I/O 3123 */ 3124 goto issue_io; 3125 } 3126 /* 3127 * take the lock to protect our accesses 3128 * of the writebehind and sparse cluster state 3129 */ 3130 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); 3131 3132 if (wbp->cl_scmap) { 3133 3134 if ( !(flags & IO_NOCACHE)) { 3135 /* 3136 * we've fallen into the sparse 3137 * cluster method of delaying dirty pages 3138 */ 3139 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); 3140 3141 lck_mtx_unlock(&wbp->cl_lockw); 3142 3143 continue; 3144 } 3145 /* 3146 * must have done cached writes that fell into 3147 * the sparse cluster mechanism... we've switched 3148 * to uncached writes on the file, so go ahead 3149 * and push whatever's in the sparse map 3150 * and switch back to normal clustering 3151 */ 3152 wbp->cl_number = 0; 3153 3154 sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); 3155 /* 3156 * no clusters of either type present at this point 3157 * so just go directly to start_new_cluster since 3158 * we know we need to delay this I/O since we've 3159 * already released the pages back into the cache 3160 * to avoid the deadlock with sparse_cluster_push 3161 */ 3162 goto start_new_cluster; 3163 } 3164 if (first_pass) { 3165 if (write_off == wbp->cl_last_write) 3166 wbp->cl_seq_written += write_cnt; 3167 else 3168 wbp->cl_seq_written = write_cnt; 3169 3170 wbp->cl_last_write = write_off + write_cnt; 3171 3172 first_pass = FALSE; 3173 } 3174 if (wbp->cl_number == 0) 3175 /* 3176 * no clusters currently present 3177 */ 3178 goto start_new_cluster; 3179 3180 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 3181 /* 3182 * check each cluster that we currently hold 3183 * try to merge some or all of this write into 3184 * one or more of the existing clusters... if 3185 * any portion of the write remains, start a 3186 * new cluster 3187 */ 3188 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) { 3189 /* 3190 * the current write starts at or after the current cluster 3191 */ 3192 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 3193 /* 3194 * we have a write that fits entirely 3195 * within the existing cluster limits 3196 */ 3197 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) 3198 /* 3199 * update our idea of where the cluster ends 3200 */ 3201 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 3202 break; 3203 } 3204 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 3205 /* 3206 * we have a write that starts in the middle of the current cluster 3207 * but extends beyond the cluster's limit... we know this because 3208 * of the previous checks 3209 * we'll extend the current cluster to the max 3210 * and update the b_addr for the current write to reflect that 3211 * the head of it was absorbed into this cluster... 
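* (e.g. if max_cluster_pgcount were 32 and the cluster currently spans pages [0, 30) while the write covers [20, 40), the cluster grows to [0, 32), the write's b_addr becomes 32, and pages [32, 40) are left over to place)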
3212 * note that we'll always have a leftover tail in this case since 3213 * full absorption would have occurred in the clause above 3214 */ 3215 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; 3216 3217 cl.b_addr = wbp->cl_clusters[cl_index].e_addr; 3218 } 3219 /* 3220 * we come here for the case where the current write starts 3221 * beyond the limit of the existing cluster or we have a leftover 3222 * tail after a partial absorption 3223 * 3224 * in either case, we'll check the remaining clusters before 3225 * starting a new one 3226 */ 3227 } else { 3228 /* 3229 * the current write starts in front of the cluster we're currently considering 3230 */ 3231 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) { 3232 /* 3233 * we can just merge the new request into 3234 * this cluster and leave it in the cache 3235 * since the resulting cluster is still 3236 * less than the maximum allowable size 3237 */ 3238 wbp->cl_clusters[cl_index].b_addr = cl.b_addr; 3239 3240 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) { 3241 /* 3242 * the current write completely 3243 * envelops the existing cluster and since 3244 * each write is limited to at most max_cluster_pgcount pages 3245 * we can just use the start and last blocknos of the write 3246 * to generate the cluster limits 3247 */ 3248 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 3249 } 3250 break; 3251 } 3252 3253 /* 3254 * if we were to combine this write with the current cluster 3255 * we would exceed the cluster size limit.... so, 3256 * let's see if there's any overlap of the new I/O with 3257 * the cluster we're currently considering... in fact, we'll 3258 * stretch the cluster out to its full limit and see if we 3259 * get an intersection with the current write 3260 * 3261 */ 3262 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { 3263 /* 3264 * the current write extends into the proposed cluster 3265 * clip the length of the current write after first combining its 3266 * tail with the newly shaped cluster 3267 */ 3268 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; 3269 3270 cl.e_addr = wbp->cl_clusters[cl_index].b_addr; 3271 } 3272 /* 3273 * if we get here, there was no way to merge 3274 * any portion of this write with this cluster 3275 * or we could only merge part of it which 3276 * will leave a tail... 3277 * we'll check the remaining clusters before starting a new one 3278 */ 3279 } 3280 } 3281 if (cl_index < wbp->cl_number) 3282 /* 3283 * we found an existing cluster(s) that we 3284 * could entirely merge this I/O into 3285 */ 3286 goto delay_io; 3287 3288 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && 3289 wbp->cl_number == MAX_CLUSTERS && 3290 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { 3291 uint32_t n; 3292 3293 if (vp->v_mount->mnt_kern_flag & MNTK_SSD) 3294 n = WRITE_BEHIND_SSD; 3295 else 3296 n = WRITE_BEHIND; 3297 3298 while (n--) 3299 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); 3300 } 3301 if (wbp->cl_number < MAX_CLUSTERS) { 3302 /* 3303 * we didn't find an existing cluster to 3304 * merge into, but there's room to start 3305 * a new one 3306 */ 3307 goto start_new_cluster; 3308 } 3309 /* 3310 * no existing cluster to merge with and no 3311 * room to start a new one... we'll try 3312 * pushing one of the existing ones...
if none of 3313 * them are able to be pushed, we'll switch 3314 * to the sparse cluster mechanism 3315 * cluster_try_push updates cl_number to the 3316 * number of remaining clusters... and 3317 * returns the number of currently unused clusters 3318 */ 3319 ret_cluster_try_push = 0; 3320 3321 /* 3322 * if writes are not deferred, call cluster push immediately 3323 */ 3324 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { 3325 3326 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); 3327 } 3328 3329 /* 3330 * execute following regardless of writes being deferred or not 3331 */ 3332 if (ret_cluster_try_push == 0) { 3333 /* 3334 * no more room in the normal cluster mechanism 3335 * so let's switch to the more expansive but expensive 3336 * sparse mechanism.... 3337 */ 3338 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); 3339 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); 3340 3341 lck_mtx_unlock(&wbp->cl_lockw); 3342 3343 continue; 3344 } 3345start_new_cluster: 3346 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; 3347 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; 3348 3349 wbp->cl_clusters[wbp->cl_number].io_flags = 0; 3350 3351 if (flags & IO_NOCACHE) 3352 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE; 3353 3354 if (bflag & CL_PASSIVE) 3355 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE; 3356 3357 wbp->cl_number++; 3358delay_io: 3359 lck_mtx_unlock(&wbp->cl_lockw); 3360 3361 continue; 3362issue_io: 3363 /* 3364 * we don't hold the lock at this point 3365 * 3366 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set 3367 * so that we correctly deal with a change in state of the hardware modify bit... 3368 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force 3369 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also 3370 * responsible for generating the correct sized I/O(s) 3371 */ 3372 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg); 3373 } 3374 } 3375 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0); 3376 3377 return (retval); 3378} 3379 3380 3381 3382int 3383cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags) 3384{ 3385 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL); 3386} 3387 3388 3389int 3390cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg) 3391{ 3392 int retval = 0; 3393 int flags; 3394 user_ssize_t cur_resid; 3395 u_int32_t io_size; 3396 u_int32_t read_length = 0; 3397 int read_type = IO_COPY; 3398 3399 flags = xflags; 3400 3401 if (vp->v_flag & VNOCACHE_DATA) 3402 flags |= IO_NOCACHE; 3403 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) 3404 flags |= IO_RAOFF; 3405 3406 /* 3407 * If we're doing an encrypted IO, then first check to see 3408 * if the IO requested was page aligned. If not, then bail 3409 * out immediately. 3410 */ 3411 if (flags & IO_ENCRYPTED) { 3412 if (read_length & PAGE_MASK) { 3413 retval = EINVAL; 3414 return retval; 3415 } 3416 } 3417 3418 /* 3419 * do a read through the cache if one of the following is true.... 3420 * NOCACHE is not true 3421 * the uio request doesn't target USERSPACE 3422 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. 
3423 * Reading encrypted data from a CP filesystem should never result in the data touching 3424 * the UBC. 3425 * 3426 * otherwise, find out if we want the direct or contig variant for 3427 * the first vector in the uio request 3428 */ 3429 if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { 3430 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3431 } 3432 3433 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { 3434 3435 switch (read_type) { 3436 3437 case IO_COPY: 3438 /* 3439 * make sure the uio_resid isn't too big... 3440 * internally, we want to handle all of the I/O in 3441 * chunk sizes that fit in a 32 bit int 3442 */ 3443 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) 3444 io_size = MAX_IO_REQUEST_SIZE; 3445 else 3446 io_size = (u_int32_t)cur_resid; 3447 3448 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg); 3449 break; 3450 3451 case IO_DIRECT: 3452 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg); 3453 break; 3454 3455 case IO_CONTIG: 3456 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags); 3457 break; 3458 3459 case IO_UNKNOWN: 3460 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3461 break; 3462 } 3463 } 3464 return (retval); 3465} 3466 3467 3468 3469static void 3470cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference) 3471{ 3472 int range; 3473 int abort_flags = UPL_ABORT_FREE_ON_EMPTY; 3474 3475 if ((range = last_pg - start_pg)) { 3476 if (take_reference) 3477 abort_flags |= UPL_ABORT_REFERENCE; 3478 3479 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags); 3480 } 3481} 3482 3483 3484static int 3485cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 3486{ 3487 upl_page_info_t *pl; 3488 upl_t upl; 3489 vm_offset_t upl_offset; 3490 u_int32_t upl_size; 3491 off_t upl_f_offset; 3492 int start_offset; 3493 int start_pg; 3494 int last_pg; 3495 int uio_last = 0; 3496 int pages_in_upl; 3497 off_t max_size; 3498 off_t last_ioread_offset; 3499 off_t last_request_offset; 3500 kern_return_t kret; 3501 int error = 0; 3502 int retval = 0; 3503 u_int32_t size_of_prefetch; 3504 u_int32_t xsize; 3505 u_int32_t io_size; 3506 u_int32_t max_rd_size; 3507 u_int32_t max_io_size; 3508 u_int32_t max_prefetch; 3509 u_int rd_ahead_enabled = 1; 3510 u_int prefetch_enabled = 1; 3511 struct cl_readahead * rap; 3512 struct clios iostate; 3513 struct cl_extent extent; 3514 int bflag; 3515 int take_reference = 1; 3516 int policy = IOPOL_DEFAULT; 3517 boolean_t iolock_inited = FALSE; 3518 3519 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, 3520 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); 3521 3522 if (flags & IO_ENCRYPTED) { 3523 panic ("encrypted blocks will hit UBC!"); 3524 } 3525 3526 policy = proc_get_task_selfdiskacc(); 3527 3528 if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY || (flags & IO_NOCACHE)) 3529 take_reference = 0; 3530 3531 if (flags & IO_PASSIVE) 3532 bflag = CL_PASSIVE; 3533 else 3534 bflag = 0; 3535 3536 if (flags & IO_NOCACHE) 3537 bflag |= CL_NOCACHE; 3538 3539 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 3540 max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); 3541 max_rd_size = max_prefetch; 3542 3543 
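/* max_rd_size starts out at the full prefetch limit computed above; it gets clamped down to THROTTLE_MAX_IOSIZE below if we find ourselves inside the throttle window */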
last_request_offset = uio->uio_offset + io_req_size; 3544 3545 if (last_request_offset > filesize) 3546 last_request_offset = filesize; 3547 3548 if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) { 3549 rd_ahead_enabled = 0; 3550 rap = NULL; 3551 } else { 3552 if (cluster_hard_throttle_on(vp, 1)) { 3553 /* 3554 * we're in the throttle window, at the very least 3555 * we want to limit the size of the I/O we're about 3556 * to issue 3557 */ 3558 rd_ahead_enabled = 0; 3559 prefetch_enabled = 0; 3560 3561 max_rd_size = THROTTLE_MAX_IOSIZE; 3562 } 3563 if ((rap = cluster_get_rap(vp)) == NULL) 3564 rd_ahead_enabled = 0; 3565 else { 3566 extent.b_addr = uio->uio_offset / PAGE_SIZE_64; 3567 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; 3568 } 3569 } 3570 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) { 3571 /* 3572 * determine if we already have a read-ahead in the pipe courtesy of the 3573 * last read system call that was issued... 3574 * if so, pick up its extent to determine where we should start 3575 * with respect to any read-ahead that might be necessary to 3576 * garner all the data needed to complete this read system call 3577 */ 3578 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64; 3579 3580 if (last_ioread_offset < uio->uio_offset) 3581 last_ioread_offset = (off_t)0; 3582 else if (last_ioread_offset > last_request_offset) 3583 last_ioread_offset = last_request_offset; 3584 } else 3585 last_ioread_offset = (off_t)0; 3586 3587 while (io_req_size && uio->uio_offset < filesize && retval == 0) { 3588 3589 max_size = filesize - uio->uio_offset; 3590 3591 if ((off_t)(io_req_size) < max_size) 3592 io_size = io_req_size; 3593 else 3594 io_size = max_size; 3595 3596 if (!(flags & IO_NOCACHE)) { 3597 3598 while (io_size) { 3599 u_int32_t io_resid; 3600 u_int32_t io_requested; 3601 3602 /* 3603 * if we keep finding the pages we need already in the cache, then 3604 * don't bother to call cluster_read_prefetch since it costs CPU cycles 3605 * to determine that we have all the pages we need... once we miss in 3606 * the cache and have issued an I/O, then we'll assume that we're likely 3607 * to continue to miss in the cache and it's to our advantage to try and prefetch 3608 */ 3609 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) { 3610 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) { 3611 /* 3612 * we've already issued I/O for this request and 3613 * there's still work to do and 3614 * our prefetch stream is running dry, so issue a 3615 * pre-fetch I/O...
the I/O latency will overlap 3616 * with the copying of the data 3617 */ 3618 if (size_of_prefetch > max_rd_size) 3619 size_of_prefetch = max_rd_size; 3620 3621 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); 3622 3623 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); 3624 3625 if (last_ioread_offset > last_request_offset) 3626 last_ioread_offset = last_request_offset; 3627 } 3628 } 3629 /* 3630 * limit the size of the copy we're about to do so that 3631 * we can notice that our I/O pipe is running dry and 3632 * get the next I/O issued before it does go dry 3633 */ 3634 if (last_ioread_offset && io_size > (max_io_size / 4)) 3635 io_resid = (max_io_size / 4); 3636 else 3637 io_resid = io_size; 3638 3639 io_requested = io_resid; 3640 3641 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); 3642 3643 xsize = io_requested - io_resid; 3644 3645 io_size -= xsize; 3646 io_req_size -= xsize; 3647 3648 if (retval || io_resid) 3649 /* 3650 * if we run into a real error or 3651 * a page that is not in the cache 3652 * we need to leave streaming mode 3653 */ 3654 break; 3655 3656 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) { 3657 /* 3658 * we're already finished the I/O for this read request 3659 * let's see if we should do a read-ahead 3660 */ 3661 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3662 } 3663 } 3664 if (retval) 3665 break; 3666 if (io_size == 0) { 3667 if (rap != NULL) { 3668 if (extent.e_addr < rap->cl_lastr) 3669 rap->cl_maxra = 0; 3670 rap->cl_lastr = extent.e_addr; 3671 } 3672 break; 3673 } 3674 /* 3675 * recompute max_size since cluster_copy_ubc_data_internal 3676 * may have advanced uio->uio_offset 3677 */ 3678 max_size = filesize - uio->uio_offset; 3679 } 3680 3681 iostate.io_completed = 0; 3682 iostate.io_issued = 0; 3683 iostate.io_error = 0; 3684 iostate.io_wanted = 0; 3685 3686 if ( (flags & IO_RETURN_ON_THROTTLE) ) { 3687 if (cluster_hard_throttle_on(vp, 0) == 2) { 3688 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { 3689 /* 3690 * we're in the throttle window and at least 1 I/O 3691 * has already been issued by a throttleable thread 3692 * in this window, so return with EAGAIN to indicate 3693 * to the FS issuing the cluster_read call that it 3694 * should now throttle after dropping any locks 3695 */ 3696 throttle_info_update_by_mount(vp->v_mount); 3697 3698 retval = EAGAIN; 3699 break; 3700 } 3701 } 3702 } 3703 3704 /* 3705 * compute the size of the upl needed to encompass 3706 * the requested read... limit each call to cluster_io 3707 * to the maximum UPL size... 
cluster_io will clip if 3708 * this exceeds the maximum io_size for the device, 3709 * make sure to account for 3710 * a starting offset that's not page aligned 3711 */ 3712 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 3713 upl_f_offset = uio->uio_offset - (off_t)start_offset; 3714 3715 if (io_size > max_rd_size) 3716 io_size = max_rd_size; 3717 3718 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 3719 3720 if (flags & IO_NOCACHE) { 3721 if (upl_size > max_io_size) 3722 upl_size = max_io_size; 3723 } else { 3724 if (upl_size > max_io_size / 4) 3725 upl_size = max_io_size / 4; 3726 } 3727 pages_in_upl = upl_size / PAGE_SIZE; 3728 3729 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START, 3730 upl, (int)upl_f_offset, upl_size, start_offset, 0); 3731 3732 kret = ubc_create_upl(vp, 3733 upl_f_offset, 3734 upl_size, 3735 &upl, 3736 &pl, 3737 UPL_FILE_IO | UPL_SET_LITE); 3738 if (kret != KERN_SUCCESS) 3739 panic("cluster_read_copy: failed to get pagelist"); 3740 3741 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END, 3742 upl, (int)upl_f_offset, upl_size, start_offset, 0); 3743 3744 /* 3745 * scan from the beginning of the upl looking for the first 3746 * non-valid page.... this will become the first page in 3747 * the request we're going to make to 'cluster_io'... if all 3748 * of the pages are valid, we won't call through to 'cluster_io' 3749 */ 3750 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) { 3751 if (!upl_valid_page(pl, start_pg)) 3752 break; 3753 } 3754 3755 /* 3756 * scan from the starting invalid page looking for a valid 3757 * page before the end of the upl is reached, if we 3758 * find one, then it will be the last page of the request to 3759 * 'cluster_io' 3760 */ 3761 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 3762 if (upl_valid_page(pl, last_pg)) 3763 break; 3764 } 3765 3766 if (start_pg < last_pg) { 3767 /* 3768 * we found a range of 'invalid' pages that must be filled 3769 * if the last page in this range is the last page of the file 3770 * we may have to clip the size of it to keep from reading past 3771 * the end of the last physical block associated with the file 3772 */ 3773 if (iolock_inited == FALSE) { 3774 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 3775 3776 iolock_inited = TRUE; 3777 } 3778 upl_offset = start_pg * PAGE_SIZE; 3779 io_size = (last_pg - start_pg) * PAGE_SIZE; 3780 3781 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) 3782 io_size = filesize - (upl_f_offset + upl_offset); 3783 3784 /* 3785 * issue an asynchronous read to cluster_io 3786 */ 3787 3788 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, 3789 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); 3790 3791 if (rap) { 3792 if (extent.e_addr < rap->cl_maxra) { 3793 /* 3794 * we've just issued a read for a block that should have been 3795 * in the cache courtesy of the read-ahead engine... something 3796 * has gone wrong with the pipeline, so reset the read-ahead 3797 * logic which will cause us to restart from scratch 3798 */ 3799 rap->cl_maxra = 0; 3800 } 3801 } 3802 } 3803 if (error == 0) { 3804 /* 3805 * if the read completed successfully, or there was no I/O request 3806 * issued, than copy the data into user land via 'cluster_upl_copy_data' 3807 * we'll first add on any 'valid' 3808 * pages that were present in the upl when we acquired it. 
3809 */ 3810 u_int val_size; 3811 3812 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) { 3813 if (!upl_valid_page(pl, uio_last)) 3814 break; 3815 } 3816 if (uio_last < pages_in_upl) { 3817 /* 3818 * there were some invalid pages beyond the valid pages 3819 * that we didn't issue an I/O for, just release them 3820 * unchanged now, so that any prefetch/readahed can 3821 * include them 3822 */ 3823 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE, 3824 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); 3825 } 3826 3827 /* 3828 * compute size to transfer this round, if io_req_size is 3829 * still non-zero after this attempt, we'll loop around and 3830 * set up for another I/O. 3831 */ 3832 val_size = (uio_last * PAGE_SIZE) - start_offset; 3833 3834 if (val_size > max_size) 3835 val_size = max_size; 3836 3837 if (val_size > io_req_size) 3838 val_size = io_req_size; 3839 3840 if ((uio->uio_offset + val_size) > last_ioread_offset) 3841 last_ioread_offset = uio->uio_offset + val_size; 3842 3843 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) { 3844 3845 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) { 3846 /* 3847 * if there's still I/O left to do for this request, and... 3848 * we're not in hard throttle mode, and... 3849 * we're close to using up the previous prefetch, then issue a 3850 * new pre-fetch I/O... the I/O latency will overlap 3851 * with the copying of the data 3852 */ 3853 if (size_of_prefetch > max_rd_size) 3854 size_of_prefetch = max_rd_size; 3855 3856 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); 3857 3858 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); 3859 3860 if (last_ioread_offset > last_request_offset) 3861 last_ioread_offset = last_request_offset; 3862 } 3863 3864 } else if ((uio->uio_offset + val_size) == last_request_offset) { 3865 /* 3866 * this transfer will finish this request, so... 
3867 * let's try to read ahead if we're in 3868 * a sequential access pattern and we haven't 3869 * explicitly disabled it 3870 */ 3871 if (rd_ahead_enabled) 3872 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3873 3874 if (rap != NULL) { 3875 if (extent.e_addr < rap->cl_lastr) 3876 rap->cl_maxra = 0; 3877 rap->cl_lastr = extent.e_addr; 3878 } 3879 } 3880 if (iostate.io_issued > iostate.io_completed) 3881 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3882 3883 if (iostate.io_error) 3884 error = iostate.io_error; 3885 else { 3886 u_int32_t io_requested; 3887 3888 io_requested = val_size; 3889 3890 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested); 3891 3892 io_req_size -= (val_size - io_requested); 3893 } 3894 } else { 3895 if (iostate.io_issued > iostate.io_completed) 3896 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3897 } 3898 if (start_pg < last_pg) { 3899 /* 3900 * compute the range of pages that we actually issued an I/O for 3901 * and either commit them as valid if the I/O succeeded 3902 * or abort them if the I/O failed or we're not supposed to 3903 * keep them in the cache 3904 */ 3905 io_size = (last_pg - start_pg) * PAGE_SIZE; 3906 3907 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0); 3908 3909 if (error || (flags & IO_NOCACHE)) 3910 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size, 3911 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 3912 else { 3913 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY; 3914 3915 if (take_reference) 3916 commit_flags |= UPL_COMMIT_INACTIVATE; 3917 else 3918 commit_flags |= UPL_COMMIT_SPECULATE; 3919 3920 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags); 3921 } 3922 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0); 3923 } 3924 if ((last_pg - start_pg) < pages_in_upl) { 3925 /* 3926 * the set of pages that we issued an I/O for did not encompass 3927 * the entire upl... so just release these without modifying 3928 * their state 3929 */ 3930 if (error) 3931 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 3932 else { 3933 3934 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, 3935 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0); 3936 3937 /* 3938 * handle any valid pages at the beginning of 3939 * the upl... release these appropriately 3940 */ 3941 cluster_read_upl_release(upl, 0, start_pg, take_reference); 3942 3943 /* 3944 * handle any valid pages immediately after the 3945 * pages we issued I/O for... ... 
release these appropriately 3946 */ 3947 cluster_read_upl_release(upl, last_pg, uio_last, take_reference); 3948 3949 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0); 3950 } 3951 } 3952 if (retval == 0) 3953 retval = error; 3954 3955 if (io_req_size) { 3956 if (cluster_hard_throttle_on(vp, 1)) { 3957 /* 3958 * we're in the throttle window, at the very least 3959 * we want to limit the size of the I/O we're about 3960 * to issue 3961 */ 3962 rd_ahead_enabled = 0; 3963 prefetch_enabled = 0; 3964 max_rd_size = THROTTLE_MAX_IOSIZE; 3965 } else { 3966 if (max_rd_size == THROTTLE_MAX_IOSIZE) { 3967 /* 3968 * coming out of throttled state 3969 */ 3970 if (policy != IOPOL_THROTTLE && policy != IOPOL_UTILITY) { 3971 if (rap != NULL) 3972 rd_ahead_enabled = 1; 3973 prefetch_enabled = 1; 3974 } 3975 max_rd_size = max_prefetch; 3976 last_ioread_offset = 0; 3977 } 3978 } 3979 } 3980 } 3981 if (iolock_inited == TRUE) { 3982 if (iostate.io_issued > iostate.io_completed) { 3983 /* 3984 * cluster_io returned an error after it 3985 * had already issued some I/O. we need 3986 * to wait for that I/O to complete before 3987 * we can destroy the iostate mutex... 3988 * 'retval' already contains the early error 3989 * so no need to pick it up from iostate.io_error 3990 */ 3991 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3992 } 3993 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 3994 } 3995 if (rap != NULL) { 3996 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 3997 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); 3998 3999 lck_mtx_unlock(&rap->cl_lockr); 4000 } else { 4001 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 4002 (int)uio->uio_offset, io_req_size, 0, retval, 0); 4003 } 4004 4005 return (retval); 4006} 4007 4008 4009static int 4010cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 4011 int flags, int (*callback)(buf_t, void *), void *callback_arg) 4012{ 4013 upl_t upl; 4014 upl_page_info_t *pl; 4015 off_t max_io_size; 4016 vm_offset_t upl_offset, vector_upl_offset = 0; 4017 upl_size_t upl_size, vector_upl_size = 0; 4018 vm_size_t upl_needed_size; 4019 unsigned int pages_in_pl; 4020 int upl_flags; 4021 kern_return_t kret; 4022 unsigned int i; 4023 int force_data_sync; 4024 int retval = 0; 4025 int no_zero_fill = 0; 4026 int io_flag = 0; 4027 int misaligned = 0; 4028 struct clios iostate; 4029 user_addr_t iov_base; 4030 u_int32_t io_req_size; 4031 u_int32_t offset_in_file; 4032 u_int32_t offset_in_iovbase; 4033 u_int32_t io_size; 4034 u_int32_t io_min; 4035 u_int32_t xsize; 4036 u_int32_t devblocksize; 4037 u_int32_t mem_alignment_mask; 4038 u_int32_t max_upl_size; 4039 u_int32_t max_rd_size; 4040 u_int32_t max_rd_ahead; 4041 u_int32_t max_vector_size; 4042 boolean_t strict_uncached_IO = FALSE; 4043 boolean_t io_throttled = FALSE; 4044 4045 u_int32_t vector_upl_iosize = 0; 4046 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); 4047 off_t v_upl_uio_offset = 0; 4048 int vector_upl_index=0; 4049 upl_t vector_upl = NULL; 4050 4051 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, 4052 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 4053 4054 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); 4055 4056 max_rd_size = max_upl_size; 4057 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); 4058 4059 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; 4060 4061 if (flags & IO_PASSIVE) 4062 io_flag |= CL_PASSIVE; 4063 4064 if (flags & 
IO_ENCRYPTED) { 4065 io_flag |= CL_RAW_ENCRYPTED; 4066 } 4067 4068 if (flags & IO_NOCACHE) { 4069 io_flag |= CL_NOCACHE; 4070 } 4071 4072 iostate.io_completed = 0; 4073 iostate.io_issued = 0; 4074 iostate.io_error = 0; 4075 iostate.io_wanted = 0; 4076 4077 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 4078 4079 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 4080 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 4081 4082 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 4083 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0); 4084 4085 if (devblocksize == 1) { 4086 /* 4087 * the AFP client advertises a devblocksize of 1 4088 * however, its BLOCKMAP routine maps to physical 4089 * blocks that are PAGE_SIZE in size... 4090 * therefore we can't ask for I/Os that aren't page aligned 4091 * or aren't multiples of PAGE_SIZE in size 4092 * by setting devblocksize to PAGE_SIZE, we re-instate 4093 * the old behavior we had before the mem_alignment_mask 4094 * changes went in... 4095 */ 4096 devblocksize = PAGE_SIZE; 4097 } 4098 4099 strict_uncached_IO = ubc_strict_uncached_IO(vp); 4100 4101next_dread: 4102 io_req_size = *read_length; 4103 iov_base = uio_curriovbase(uio); 4104 4105 max_io_size = filesize - uio->uio_offset; 4106 4107 if ((off_t)io_req_size > max_io_size) 4108 io_req_size = max_io_size; 4109 4110 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1); 4111 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 4112 4113 if (offset_in_file || offset_in_iovbase) { 4114 /* 4115 * one of the 2 important offsets is misaligned 4116 * so fire an I/O through the cache for this entire vector 4117 */ 4118 misaligned = 1; 4119 } 4120 if (iov_base & (devblocksize - 1)) { 4121 /* 4122 * the offset in memory must be on a device block boundary 4123 * so that we can guarantee that we can generate an 4124 * I/O that ends on a page boundary in cluster_io 4125 */ 4126 misaligned = 1; 4127 } 4128 4129 /* 4130 * The user must request IO in aligned chunks. If the 4131 * offset into the file is bad, or the userland pointer 4132 * is non-aligned, then we cannot service the encrypted IO request. 4133 */ 4134 if ((flags & IO_ENCRYPTED) && (misaligned)) { 4135 retval = EINVAL; 4136 } 4137 4138 /* 4139 * When we get to this point, we know... 4140 * -- the offset into the file is on a devblocksize boundary 4141 */ 4142 4143 while (io_req_size && retval == 0) { 4144 u_int32_t io_start; 4145 4146 if (cluster_hard_throttle_on(vp, 1)) { 4147 /* 4148 * we're in the throttle window, at the very least 4149 * we want to limit the size of the I/O we're about 4150 * to issue 4151 */ 4152 max_rd_size = THROTTLE_MAX_IOSIZE; 4153 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; 4154 max_vector_size = THROTTLE_MAX_IOSIZE; 4155 } else { 4156 max_rd_size = max_upl_size; 4157 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); 4158 max_vector_size = MAX_VECTOR_UPL_SIZE; 4159 } 4160 io_start = io_size = io_req_size; 4161 4162 /* 4163 * First look for pages already in the cache 4164 * and move them to user space. But only do this 4165 * check if we are not retrieving encrypted data directly 4166 * from the filesystem; those blocks should never 4167 * be in the UBC. 
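 * (worked example, for illustration: with a 64KB request whose first 16KB happens to be resident, the copy below moves 16KB to user space, the residual handed back is 48KB, and only that remainder is considered for the direct read)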
4168 * 4169 * cluster_copy_ubc_data returns the resid 4170 * in io_size 4171 */ 4172 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { 4173 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); 4174 } 4175 /* 4176 * calculate the number of bytes actually copied 4177 * starting size - residual 4178 */ 4179 xsize = io_start - io_size; 4180 4181 io_req_size -= xsize; 4182 4183 if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) { 4184 /* 4185 * We found something in the cache or we have an iov_base that's not 4186 * page-aligned. 4187 * 4188 * Issue all I/O's that have been collected within this Vectored UPL. 4189 */ 4190 if(vector_upl_index) { 4191 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4192 reset_vector_run_state(); 4193 } 4194 4195 if(xsize) 4196 useVectorUPL = 0; 4197 4198 /* 4199 * After this point, if we are using the Vector UPL path and the base is 4200 * not page-aligned then the UPL with that base will be the first in the vector UPL. 4201 */ 4202 } 4203 4204 /* 4205 * check to see if we are finished with this request. 4206 * 4207 * If we satisfied this IO already, then io_req_size will be 0. 4208 * Otherwise, see if the IO was mis-aligned and needs to go through 4209 * the UBC to deal with the 'tail'. 4210 * 4211 */ 4212 if (io_req_size == 0 || (misaligned)) { 4213 /* 4214 * see if there's another uio vector to 4215 * process that's of type IO_DIRECT 4216 * 4217 * break out of while loop to get there 4218 */ 4219 break; 4220 } 4221 /* 4222 * assume the request ends on a device block boundary 4223 */ 4224 io_min = devblocksize; 4225 4226 /* 4227 * we can handle I/O's in multiples of the device block size 4228 * however, if io_size isn't a multiple of devblocksize we 4229 * want to clip it back to the nearest page boundary since 4230 * we are going to have to go through cluster_read_copy to 4231 * deal with the 'overhang'... by clipping it to a PAGE_SIZE 4232 * multiple, we avoid asking the drive for the same physical 4233 * blocks twice.. once for the partial page at the end of the 4234 * request and a 2nd time for the page we read into the cache 4235 * (which overlaps the end of the direct read) in order to 4236 * get at the overhang bytes 4237 */ 4238 if (io_size & (devblocksize - 1)) { 4239 if (flags & IO_ENCRYPTED) { 4240 /* 4241 * Normally, we'd round down to the previous page boundary to 4242 * let the UBC manage the zero-filling of the file past the EOF. 4243 * But if we're doing encrypted IO, we can't let any of 4244 * the data hit the UBC. This means we have to do the full 4245 * IO to the upper block boundary of the device block that 4246 * contains the EOF. The user will be responsible for not 4247 * interpreting data PAST the EOF in its buffer. 4248 * 4249 * So just bump the IO back up to a multiple of devblocksize 4250 */ 4251 io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); 4252 io_min = io_size; 4253 } 4254 else { 4255 /* 4256 * Clip the request to the previous page size boundary 4257 * since request does NOT end on a device block boundary 4258 */ 4259 io_size &= ~PAGE_MASK; 4260 io_min = PAGE_SIZE; 4261 } 4262 4263 } 4264 if (retval || io_size < io_min) { 4265 /* 4266 * either an error or we only have the tail left to 4267 * complete via the copy path... 4268 * we may have already spun some portion of this request 4269 * off as async requests... 
we need to wait for the I/O 4270 * to complete before returning 4271 */ 4272 goto wait_for_dreads; 4273 } 4274 4275 /* 4276 * Don't re-check the UBC data if we are looking for uncached IO 4277 * or asking for encrypted blocks. 4278 */ 4279 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { 4280 4281 if ((xsize = io_size) > max_rd_size) 4282 xsize = max_rd_size; 4283 4284 io_size = 0; 4285 4286 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); 4287 4288 if (io_size == 0) { 4289 /* 4290 * a page must have just come into the cache 4291 * since the first page in this range is no 4292 * longer absent, go back and re-evaluate 4293 */ 4294 continue; 4295 } 4296 } 4297 if ( (flags & IO_RETURN_ON_THROTTLE) ) { 4298 if (cluster_hard_throttle_on(vp, 0) == 2) { 4299 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { 4300 /* 4301 * we're in the throttle window and at least 1 I/O 4302 * has already been issued by a throttleable thread 4303 * in this window, so return with EAGAIN to indicate 4304 * to the FS issuing the cluster_read call that it 4305 * should now throttle after dropping any locks 4306 */ 4307 throttle_info_update_by_mount(vp->v_mount); 4308 4309 io_throttled = TRUE; 4310 goto wait_for_dreads; 4311 } 4312 } 4313 } 4314 if (io_size > max_rd_size) 4315 io_size = max_rd_size; 4316 4317 iov_base = uio_curriovbase(uio); 4318 4319 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 4320 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 4321 4322 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, 4323 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 4324 4325 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) 4326 no_zero_fill = 1; 4327 else 4328 no_zero_fill = 0; 4329 4330 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 4331 pages_in_pl = 0; 4332 upl_size = upl_needed_size; 4333 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 4334 4335 if (no_zero_fill) 4336 upl_flags |= UPL_NOZEROFILL; 4337 if (force_data_sync) 4338 upl_flags |= UPL_FORCE_DATA_SYNC; 4339 4340 kret = vm_map_create_upl(current_map(), 4341 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4342 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); 4343 4344 if (kret != KERN_SUCCESS) { 4345 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4346 (int)upl_offset, upl_size, io_size, kret, 0); 4347 /* 4348 * failed to get pagelist 4349 * 4350 * we may have already spun some portion of this request 4351 * off as async requests... we need to wait for the I/O 4352 * to complete before returning 4353 */ 4354 goto wait_for_dreads; 4355 } 4356 pages_in_pl = upl_size / PAGE_SIZE; 4357 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 4358 4359 for (i = 0; i < pages_in_pl; i++) { 4360 if (!upl_page_present(pl, i)) 4361 break; 4362 } 4363 if (i == pages_in_pl) 4364 break; 4365 4366 ubc_upl_abort(upl, 0); 4367 } 4368 if (force_data_sync >= 3) { 4369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4370 (int)upl_offset, upl_size, io_size, kret, 0); 4371 4372 goto wait_for_dreads; 4373 } 4374 /* 4375 * Consider the possibility that upl_size wasn't satisfied. 
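 * if we got back fewer pages than we asked for, the shortened UPL is still usable when the user buffer began on a page boundary (upl_offset == 0)... in that case we clip io_size to what we were given, otherwise we treat it as zero, abort the upl below, and fall out to wait_for_dreads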
4376 */ 4377 if (upl_size < upl_needed_size) { 4378 if (upl_size && upl_offset == 0) 4379 io_size = upl_size; 4380 else 4381 io_size = 0; 4382 } 4383 if (io_size == 0) { 4384 ubc_upl_abort(upl, 0); 4385 goto wait_for_dreads; 4386 } 4387 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4388 (int)upl_offset, upl_size, io_size, kret, 0); 4389 4390 if(useVectorUPL) { 4391 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); 4392 if(end_off) 4393 issueVectorUPL = 1; 4394 /* 4395 * After this point, if we are using a vector UPL, then 4396 * either all the UPL elements end on a page boundary OR 4397 * this UPL is the last element because it does not end 4398 * on a page boundary. 4399 */ 4400 } 4401 4402 /* 4403 * request asynchronously so that we can overlap 4404 * the preparation of the next I/O 4405 * if there are already too many outstanding reads 4406 * wait until some have completed before issuing the next read 4407 */ 4408 if (iostate.io_issued > iostate.io_completed) 4409 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); 4410 4411 if (iostate.io_error) { 4412 /* 4413 * one of the earlier reads we issued ran into a hard error 4414 * don't issue any more reads, cleanup the UPL 4415 * that was just created but not used, then 4416 * go wait for any other reads to complete before 4417 * returning the error to the caller 4418 */ 4419 ubc_upl_abort(upl, 0); 4420 4421 goto wait_for_dreads; 4422 } 4423 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, 4424 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); 4425 4426 4427 if(!useVectorUPL) { 4428 if (no_zero_fill) 4429 io_flag &= ~CL_PRESERVE; 4430 else 4431 io_flag |= CL_PRESERVE; 4432 4433 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4434 4435 } else { 4436 4437 if(!vector_upl_index) { 4438 vector_upl = vector_upl_create(upl_offset); 4439 v_upl_uio_offset = uio->uio_offset; 4440 vector_upl_offset = upl_offset; 4441 } 4442 4443 vector_upl_set_subupl(vector_upl,upl, upl_size); 4444 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); 4445 vector_upl_index++; 4446 vector_upl_size += upl_size; 4447 vector_upl_iosize += io_size; 4448 4449 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { 4450 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4451 reset_vector_run_state(); 4452 } 4453 } 4454 /* 4455 * update the uio structure 4456 */ 4457 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { 4458 uio_update(uio, (user_size_t)max_io_size); 4459 } 4460 else { 4461 uio_update(uio, (user_size_t)io_size); 4462 } 4463 /* 4464 * Under normal circumstances, the io_size should not be 4465 * bigger than the io_req_size, but we may have had to round up 4466 * to the end of the page in the encrypted IO case. In that case only, 4467 * ensure that we only decrement io_req_size to 0. 
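 * e.g. with a 4KB devblocksize and only 100 bytes of the request left at EOF, the encrypted path above rounds io_size up to 4KB... without this clamp the unsigned io_req_size would wrap when io_size was subtracted from it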
4468 */ 4469 if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { 4470 io_req_size = 0; 4471 } 4472 else { 4473 io_req_size -= io_size; 4474 } 4475 4476 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, 4477 upl, (int)uio->uio_offset, io_req_size, retval, 0); 4478 4479 } /* end while */ 4480 4481 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) { 4482 4483 retval = cluster_io_type(uio, read_type, read_length, 0); 4484 4485 if (retval == 0 && *read_type == IO_DIRECT) { 4486 4487 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 4488 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 4489 4490 goto next_dread; 4491 } 4492 } 4493 4494wait_for_dreads: 4495 4496 if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { 4497 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4498 reset_vector_run_state(); 4499 } 4500 /* 4501 * make sure all async reads that are part of this stream 4502 * have completed before we return 4503 */ 4504 if (iostate.io_issued > iostate.io_completed) 4505 cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); 4506 4507 if (iostate.io_error) 4508 retval = iostate.io_error; 4509 4510 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 4511 4512 if (io_throttled == TRUE && retval == 0) 4513 retval = EAGAIN; 4514 4515 if (io_req_size && retval == 0) { 4516 /* 4517 * we couldn't handle the tail of this request in DIRECT mode 4518 * so fire it through the copy path 4519 */ 4520 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg); 4521 4522 *read_type = IO_UNKNOWN; 4523 } 4524 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, 4525 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0); 4526 4527 return (retval); 4528} 4529 4530 4531static int 4532cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 4533 int (*callback)(buf_t, void *), void *callback_arg, int flags) 4534{ 4535 upl_page_info_t *pl; 4536 upl_t upl[MAX_VECTS]; 4537 vm_offset_t upl_offset; 4538 addr64_t dst_paddr = 0; 4539 user_addr_t iov_base; 4540 off_t max_size; 4541 upl_size_t upl_size; 4542 vm_size_t upl_needed_size; 4543 mach_msg_type_number_t pages_in_pl; 4544 int upl_flags; 4545 kern_return_t kret; 4546 struct clios iostate; 4547 int error= 0; 4548 int cur_upl = 0; 4549 int num_upl = 0; 4550 int n; 4551 u_int32_t xsize; 4552 u_int32_t io_size; 4553 u_int32_t devblocksize; 4554 u_int32_t mem_alignment_mask; 4555 u_int32_t tail_size = 0; 4556 int bflag; 4557 4558 if (flags & IO_PASSIVE) 4559 bflag = CL_PASSIVE; 4560 else 4561 bflag = 0; 4562 4563 if (flags & IO_NOCACHE) 4564 bflag |= CL_NOCACHE; 4565 4566 /* 4567 * When we enter this routine, we know 4568 * -- the read_length will not exceed the current iov_len 4569 * -- the target address is physically contiguous for read_length 4570 */ 4571 cluster_syncup(vp, filesize, callback, callback_arg); 4572 4573 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 4574 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 4575 4576 iostate.io_completed = 0; 4577 iostate.io_issued = 0; 4578 iostate.io_error = 0; 4579 iostate.io_wanted = 0; 4580 4581 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 4582 4583next_cread: 4584 io_size = *read_length; 4585 4586 max_size = filesize - uio->uio_offset; 4587 4588 if (io_size > max_size) 4589 io_size = 
max_size; 4590 4591 iov_base = uio_curriovbase(uio); 4592 4593 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 4594 upl_needed_size = upl_offset + io_size; 4595 4596 pages_in_pl = 0; 4597 upl_size = upl_needed_size; 4598 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 4599 4600 4601 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START, 4602 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0); 4603 4604 kret = vm_map_get_upl(current_map(), 4605 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4606 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 4607 4608 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END, 4609 (int)upl_offset, upl_size, io_size, kret, 0); 4610 4611 if (kret != KERN_SUCCESS) { 4612 /* 4613 * failed to get pagelist 4614 */ 4615 error = EINVAL; 4616 goto wait_for_creads; 4617 } 4618 num_upl++; 4619 4620 if (upl_size < upl_needed_size) { 4621 /* 4622 * The upl_size wasn't satisfied. 4623 */ 4624 error = EINVAL; 4625 goto wait_for_creads; 4626 } 4627 pl = ubc_upl_pageinfo(upl[cur_upl]); 4628 4629 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; 4630 4631 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 4632 u_int32_t head_size; 4633 4634 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 4635 4636 if (head_size > io_size) 4637 head_size = io_size; 4638 4639 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg); 4640 4641 if (error) 4642 goto wait_for_creads; 4643 4644 upl_offset += head_size; 4645 dst_paddr += head_size; 4646 io_size -= head_size; 4647 4648 iov_base += head_size; 4649 } 4650 if ((u_int32_t)iov_base & mem_alignment_mask) { 4651 /* 4652 * request doesn't set up on a memory boundary 4653 * the underlying DMA engine can handle... 4654 * return an error instead of going through 4655 * the slow copy path since the intent of this 4656 * path is direct I/O to device memory 4657 */ 4658 error = EINVAL; 4659 goto wait_for_creads; 4660 } 4661 4662 tail_size = io_size & (devblocksize - 1); 4663 4664 io_size -= tail_size; 4665 4666 while (io_size && error == 0) { 4667 4668 if (io_size > MAX_IO_CONTIG_SIZE) 4669 xsize = MAX_IO_CONTIG_SIZE; 4670 else 4671 xsize = io_size; 4672 /* 4673 * request asynchronously so that we can overlap 4674 * the preparation of the next I/O... we'll do 4675 * the commit after all the I/O has completed 4676 * since its all issued against the same UPL 4677 * if there are already too many outstanding reads 4678 * wait until some have completed before issuing the next 4679 */ 4680 if (iostate.io_issued > iostate.io_completed) 4681 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); 4682 4683 if (iostate.io_error) { 4684 /* 4685 * one of the earlier reads we issued ran into a hard error 4686 * don't issue any more reads... 
4687 * go wait for any other reads to complete before 4688 * returning the error to the caller 4689 */ 4690 goto wait_for_creads; 4691 } 4692 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize, 4693 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag, 4694 (buf_t)NULL, &iostate, callback, callback_arg); 4695 /* 4696 * The cluster_io read was issued successfully, 4697 * update the uio structure 4698 */ 4699 if (error == 0) { 4700 uio_update(uio, (user_size_t)xsize); 4701 4702 dst_paddr += xsize; 4703 upl_offset += xsize; 4704 io_size -= xsize; 4705 } 4706 } 4707 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) { 4708 4709 error = cluster_io_type(uio, read_type, read_length, 0); 4710 4711 if (error == 0 && *read_type == IO_CONTIG) { 4712 cur_upl++; 4713 goto next_cread; 4714 } 4715 } else 4716 *read_type = IO_UNKNOWN; 4717 4718wait_for_creads: 4719 /* 4720 * make sure all async reads that are part of this stream 4721 * have completed before we proceed 4722 */ 4723 if (iostate.io_issued > iostate.io_completed) 4724 cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); 4725 4726 if (iostate.io_error) 4727 error = iostate.io_error; 4728 4729 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 4730 4731 if (error == 0 && tail_size) 4732 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); 4733 4734 for (n = 0; n < num_upl; n++) 4735 /* 4736 * just release our hold on each physically contiguous 4737 * region without changing any state 4738 */ 4739 ubc_upl_abort(upl[n], 0); 4740 4741 return (error); 4742} 4743 4744 4745static int 4746cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length) 4747{ 4748 user_size_t iov_len; 4749 user_addr_t iov_base = 0; 4750 upl_t upl; 4751 upl_size_t upl_size; 4752 int upl_flags; 4753 int retval = 0; 4754 4755 /* 4756 * skip over any emtpy vectors 4757 */ 4758 uio_update(uio, (user_size_t)0); 4759 4760 iov_len = uio_curriovlen(uio); 4761 4762 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0); 4763 4764 if (iov_len) { 4765 iov_base = uio_curriovbase(uio); 4766 /* 4767 * make sure the size of the vector isn't too big... 
4768 * internally, we want to handle all of the I/O in 4769 * chunk sizes that fit in a 32 bit int 4770 */ 4771 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) 4772 upl_size = MAX_IO_REQUEST_SIZE; 4773 else 4774 upl_size = (u_int32_t)iov_len; 4775 4776 upl_flags = UPL_QUERY_OBJECT_TYPE; 4777 4778 if ((vm_map_get_upl(current_map(), 4779 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4780 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { 4781 /* 4782 * the user app must have passed in an invalid address 4783 */ 4784 retval = EFAULT; 4785 } 4786 if (upl_size == 0) 4787 retval = EFAULT; 4788 4789 *io_length = upl_size; 4790 4791 if (upl_flags & UPL_PHYS_CONTIG) 4792 *io_type = IO_CONTIG; 4793 else if (iov_len >= min_length) 4794 *io_type = IO_DIRECT; 4795 else 4796 *io_type = IO_COPY; 4797 } else { 4798 /* 4799 * nothing left to do for this uio 4800 */ 4801 *io_length = 0; 4802 *io_type = IO_UNKNOWN; 4803 } 4804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0); 4805 4806 return (retval); 4807} 4808 4809 4810/* 4811 * generate advisory I/O's in the largest chunks possible 4812 * the completed pages will be released into the VM cache 4813 */ 4814int 4815advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid) 4816{ 4817 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE); 4818} 4819 4820int 4821advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag) 4822{ 4823 upl_page_info_t *pl; 4824 upl_t upl; 4825 vm_offset_t upl_offset; 4826 int upl_size; 4827 off_t upl_f_offset; 4828 int start_offset; 4829 int start_pg; 4830 int last_pg; 4831 int pages_in_upl; 4832 off_t max_size; 4833 int io_size; 4834 kern_return_t kret; 4835 int retval = 0; 4836 int issued_io; 4837 int skip_range; 4838 uint32_t max_io_size; 4839 4840 4841 if ( !UBCINFOEXISTS(vp)) 4842 return(EINVAL); 4843 4844 if (resid < 0) 4845 return(EINVAL); 4846 4847 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 4848 4849#if CONFIG_EMBEDDED 4850 if (max_io_size > speculative_prefetch_max_iosize) 4851 max_io_size = speculative_prefetch_max_iosize; 4852#else 4853 if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { 4854 if (max_io_size > speculative_prefetch_max_iosize) 4855 max_io_size = speculative_prefetch_max_iosize; 4856 } 4857#endif 4858 4859 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, 4860 (int)f_offset, resid, (int)filesize, 0, 0); 4861 4862 while (resid && f_offset < filesize && retval == 0) { 4863 /* 4864 * compute the size of the upl needed to encompass 4865 * the requested read... limit each call to cluster_io 4866 * to the maximum UPL size... 
cluster_io will clip if 4867 * this exceeds the maximum io_size for the device, 4868 * make sure to account for 4869 * a starting offset that's not page aligned 4870 */ 4871 start_offset = (int)(f_offset & PAGE_MASK_64); 4872 upl_f_offset = f_offset - (off_t)start_offset; 4873 max_size = filesize - f_offset; 4874 4875 if (resid < max_size) 4876 io_size = resid; 4877 else 4878 io_size = max_size; 4879 4880 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 4881 if ((uint32_t)upl_size > max_io_size) 4882 upl_size = max_io_size; 4883 4884 skip_range = 0; 4885 /* 4886 * return the number of contiguously present pages in the cache 4887 * starting at upl_f_offset within the file 4888 */ 4889 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range); 4890 4891 if (skip_range) { 4892 /* 4893 * skip over pages already present in the cache 4894 */ 4895 io_size = skip_range - start_offset; 4896 4897 f_offset += io_size; 4898 resid -= io_size; 4899 4900 if (skip_range == upl_size) 4901 continue; 4902 /* 4903 * have to issue some real I/O 4904 * at this point, we know it's starting on a page boundary 4905 * because we've skipped over at least the first page in the request 4906 */ 4907 start_offset = 0; 4908 upl_f_offset += skip_range; 4909 upl_size -= skip_range; 4910 } 4911 pages_in_upl = upl_size / PAGE_SIZE; 4912 4913 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START, 4914 upl, (int)upl_f_offset, upl_size, start_offset, 0); 4915 4916 kret = ubc_create_upl(vp, 4917 upl_f_offset, 4918 upl_size, 4919 &upl, 4920 &pl, 4921 UPL_RET_ONLY_ABSENT | UPL_SET_LITE); 4922 if (kret != KERN_SUCCESS) 4923 return(retval); 4924 issued_io = 0; 4925 4926 /* 4927 * before we start marching forward, we must make sure we end on 4928 * a present page, otherwise we will be working with a freed 4929 * upl 4930 */ 4931 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 4932 if (upl_page_present(pl, last_pg)) 4933 break; 4934 } 4935 pages_in_upl = last_pg + 1; 4936 4937 4938 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END, 4939 upl, (int)upl_f_offset, upl_size, start_offset, 0); 4940 4941 4942 for (last_pg = 0; last_pg < pages_in_upl; ) { 4943 /* 4944 * scan from the beginning of the upl looking for the first 4945 * page that is present.... this will become the first page in 4946 * the request we're going to make to 'cluster_io'... 
if all 4947 * of the pages are absent, we won't call through to 'cluster_io' 4948 */ 4949 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 4950 if (upl_page_present(pl, start_pg)) 4951 break; 4952 } 4953 4954 /* 4955 * scan from the starting present page looking for an absent 4956 * page before the end of the upl is reached, if we 4957 * find one, then it will terminate the range of pages being 4958 * presented to 'cluster_io' 4959 */ 4960 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 4961 if (!upl_page_present(pl, last_pg)) 4962 break; 4963 } 4964 4965 if (last_pg > start_pg) { 4966 /* 4967 * we found a range of pages that must be filled 4968 * if the last page in this range is the last page of the file 4969 * we may have to clip the size of it to keep from reading past 4970 * the end of the last physical block associated with the file 4971 */ 4972 upl_offset = start_pg * PAGE_SIZE; 4973 io_size = (last_pg - start_pg) * PAGE_SIZE; 4974 4975 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) 4976 io_size = filesize - (upl_f_offset + upl_offset); 4977 4978 /* 4979 * issue an asynchronous read to cluster_io 4980 */ 4981 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 4982 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 4983 4984 issued_io = 1; 4985 } 4986 } 4987 if (issued_io == 0) 4988 ubc_upl_abort(upl, 0); 4989 4990 io_size = upl_size - start_offset; 4991 4992 if (io_size > resid) 4993 io_size = resid; 4994 f_offset += io_size; 4995 resid -= io_size; 4996 } 4997 4998 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END, 4999 (int)f_offset, resid, retval, 0, 0); 5000 5001 return(retval); 5002} 5003 5004 5005int 5006cluster_push(vnode_t vp, int flags) 5007{ 5008 return cluster_push_ext(vp, flags, NULL, NULL); 5009} 5010 5011 5012int 5013cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5014{ 5015 int retval; 5016 int my_sparse_wait = 0; 5017 struct cl_writebehind *wbp; 5018 5019 if ( !UBCINFOEXISTS(vp)) { 5020 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0); 5021 return (0); 5022 } 5023 /* return if deferred write is set */ 5024 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) { 5025 return (0); 5026 } 5027 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { 5028 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); 5029 return (0); 5030 } 5031 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) { 5032 lck_mtx_unlock(&wbp->cl_lockw); 5033 5034 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); 5035 return(0); 5036 } 5037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, 5038 wbp->cl_scmap, wbp->cl_number, flags, 0, 0); 5039 5040 /* 5041 * if we have an fsync in progress, we don't want to allow any additional 5042 * sync/fsync/close(s) to occur until it finishes. 
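 * that serialization is built on the cl_sparse_wait flag and the cl_sparse_pushes count, both protected by cl_lockw... an IO_SYNC caller takes ownership of cl_sparse_wait below, and later arrivals msleep on it until the owner clears it and issues the wakeup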
5043 * note that it's possible for writes to continue to occur to this file 5044 * while we're waiting and also once the fsync starts to clean if we're 5045 * in the sparse map case 5046 */ 5047 while (wbp->cl_sparse_wait) { 5048 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0); 5049 5050 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); 5051 5052 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0); 5053 } 5054 if (flags & IO_SYNC) { 5055 my_sparse_wait = 1; 5056 wbp->cl_sparse_wait = 1; 5057 5058 /* 5059 * this is an fsync (or equivalent)... we must wait for any existing async 5060 * cleaning operations to complete before we evaluate the current state 5061 * and finish cleaning... this ensures that all writes issued before this 5062 * fsync actually get cleaned to the disk before this fsync returns 5063 */ 5064 while (wbp->cl_sparse_pushes) { 5065 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0); 5066 5067 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); 5068 5069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0); 5070 } 5071 } 5072 if (wbp->cl_scmap) { 5073 void *scmap; 5074 5075 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) { 5076 5077 scmap = wbp->cl_scmap; 5078 wbp->cl_scmap = NULL; 5079 5080 wbp->cl_sparse_pushes++; 5081 5082 lck_mtx_unlock(&wbp->cl_lockw); 5083 5084 sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); 5085 5086 lck_mtx_lock(&wbp->cl_lockw); 5087 5088 wbp->cl_sparse_pushes--; 5089 5090 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) 5091 wakeup((caddr_t)&wbp->cl_sparse_pushes); 5092 } else { 5093 sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); 5094 } 5095 retval = 1; 5096 } else { 5097 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); 5098 } 5099 lck_mtx_unlock(&wbp->cl_lockw); 5100 5101 if (flags & IO_SYNC) 5102 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push"); 5103 5104 if (my_sparse_wait) { 5105 /* 5106 * I'm the owner of the serialization token 5107 * clear it and wakeup anyone that is waiting 5108 * for me to finish 5109 */ 5110 lck_mtx_lock(&wbp->cl_lockw); 5111 5112 wbp->cl_sparse_wait = 0; 5113 wakeup((caddr_t)&wbp->cl_sparse_wait); 5114 5115 lck_mtx_unlock(&wbp->cl_lockw); 5116 } 5117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, 5118 wbp->cl_scmap, wbp->cl_number, retval, 0, 0); 5119 5120 return (retval); 5121} 5122 5123 5124__private_extern__ void 5125cluster_release(struct ubc_info *ubc) 5126{ 5127 struct cl_writebehind *wbp; 5128 struct cl_readahead *rap; 5129 5130 if ((wbp = ubc->cl_wbehind)) { 5131 5132 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0); 5133 5134 if (wbp->cl_scmap) 5135 vfs_drt_control(&(wbp->cl_scmap), 0); 5136 } else { 5137 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0); 5138 } 5139 5140 rap = ubc->cl_rahead; 5141 5142 if (wbp != NULL) { 5143 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); 5144 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); 5145 } 5146 if ((rap = ubc->cl_rahead)) { 5147 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp); 5148 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD); 5149 } 5150 ubc->cl_rahead = NULL; 5151 ubc->cl_wbehind = NULL; 5152 5153 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | 
DBG_FUNC_END, ubc, rap, wbp, 0, 0); 5154} 5155 5156 5157static int 5158cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) 5159{ 5160 int cl_index; 5161 int cl_index1; 5162 int min_index; 5163 int cl_len; 5164 int cl_pushed = 0; 5165 struct cl_wextent l_clusters[MAX_CLUSTERS]; 5166 u_int max_cluster_pgcount; 5167 5168 5169 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 5170 /* 5171 * the write behind context exists and has 5172 * already been locked... 5173 */ 5174 if (wbp->cl_number == 0) 5175 /* 5176 * no clusters to push 5177 * return number of empty slots 5178 */ 5179 return (MAX_CLUSTERS); 5180 5181 /* 5182 * make a local 'sorted' copy of the clusters 5183 * and clear wbp->cl_number so that new clusters can 5184 * be developed 5185 */ 5186 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 5187 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) { 5188 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) 5189 continue; 5190 if (min_index == -1) 5191 min_index = cl_index1; 5192 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) 5193 min_index = cl_index1; 5194 } 5195 if (min_index == -1) 5196 break; 5197 5198 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr; 5199 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr; 5200 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags; 5201 5202 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr; 5203 } 5204 wbp->cl_number = 0; 5205 5206 cl_len = cl_index; 5207 5208 if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) { 5209 int i; 5210 5211 /* 5212 * determine if we appear to be writing the file sequentially 5213 * if not, by returning without having pushed any clusters 5214 * we will cause this vnode to be pushed into the sparse cluster mechanism 5215 * used for managing more random I/O patterns 5216 * 5217 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them... 5218 * that's why we're in try_push with PUSH_DELAY... 5219 * 5220 * check to make sure that all the clusters except the last one are 'full'... and that each cluster 5221 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above 5222 * so we can just make a simple pass through, up to, but not including the last one... 5223 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they 5224 * are sequential 5225 * 5226 * we let the last one be partial as long as it was adjacent to the previous one... 5227 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out 5228 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world... 5229 */ 5230 for (i = 0; i < MAX_CLUSTERS - 1; i++) { 5231 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) 5232 goto dont_try; 5233 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr) 5234 goto dont_try; 5235 } 5236 } 5237 for (cl_index = 0; cl_index < cl_len; cl_index++) { 5238 int flags; 5239 struct cl_extent cl; 5240 5241 flags = io_flags & (IO_PASSIVE|IO_CLOSE); 5242 5243 /* 5244 * try to push each cluster in turn... 
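 * note that unless the caller asked for PUSH_ALL we break out of this loop after pushing a single cluster... anything left over is merged back into the write-behind context (or switched to the sparse cluster mechanism) below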
5245 */ 5246 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) 5247 flags |= IO_NOCACHE; 5248 5249 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) 5250 flags |= IO_PASSIVE; 5251 5252 if (push_flag & PUSH_SYNC) 5253 flags |= IO_SYNC; 5254 5255 cl.b_addr = l_clusters[cl_index].b_addr; 5256 cl.e_addr = l_clusters[cl_index].e_addr; 5257 5258 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); 5259 5260 l_clusters[cl_index].b_addr = 0; 5261 l_clusters[cl_index].e_addr = 0; 5262 5263 cl_pushed++; 5264 5265 if ( !(push_flag & PUSH_ALL) ) 5266 break; 5267 } 5268dont_try: 5269 if (cl_len > cl_pushed) { 5270 /* 5271 * we didn't push all of the clusters, so 5272 * lets try to merge them back in to the vnode 5273 */ 5274 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) { 5275 /* 5276 * we picked up some new clusters while we were trying to 5277 * push the old ones... this can happen because I've dropped 5278 * the vnode lock... the sum of the 5279 * leftovers plus the new cluster count exceeds our ability 5280 * to represent them, so switch to the sparse cluster mechanism 5281 * 5282 * collect the active public clusters... 5283 */ 5284 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 5285 5286 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) { 5287 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 5288 continue; 5289 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 5290 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 5291 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 5292 5293 cl_index1++; 5294 } 5295 /* 5296 * update the cluster count 5297 */ 5298 wbp->cl_number = cl_index1; 5299 5300 /* 5301 * and collect the original clusters that were moved into the 5302 * local storage for sorting purposes 5303 */ 5304 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 5305 5306 } else { 5307 /* 5308 * we've got room to merge the leftovers back in 5309 * just append them starting at the next 'hole' 5310 * represented by wbp->cl_number 5311 */ 5312 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) { 5313 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 5314 continue; 5315 5316 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 5317 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 5318 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 5319 5320 cl_index1++; 5321 } 5322 /* 5323 * update the cluster count 5324 */ 5325 wbp->cl_number = cl_index1; 5326 } 5327 } 5328 return (MAX_CLUSTERS - wbp->cl_number); 5329} 5330 5331 5332 5333static int 5334cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5335{ 5336 upl_page_info_t *pl; 5337 upl_t upl; 5338 vm_offset_t upl_offset; 5339 int upl_size; 5340 off_t upl_f_offset; 5341 int pages_in_upl; 5342 int start_pg; 5343 int last_pg; 5344 int io_size; 5345 int io_flags; 5346 int upl_flags; 5347 int bflag; 5348 int size; 5349 int error = 0; 5350 int retval; 5351 kern_return_t kret; 5352 5353 if (flags & IO_PASSIVE) 5354 bflag = CL_PASSIVE; 5355 else 5356 bflag = 0; 5357 5358 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, 5359 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); 5360 5361 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) { 5362 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0); 5363 5364 return (0); 5365 } 5366 upl_size = 
pages_in_upl * PAGE_SIZE; 5367 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 5368 5369 if (upl_f_offset + upl_size >= EOF) { 5370 5371 if (upl_f_offset >= EOF) { 5372 /* 5373 * must have truncated the file and missed 5374 * clearing a dangling cluster (i.e. it's completely 5375 * beyond the new EOF 5376 */ 5377 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0); 5378 5379 return(0); 5380 } 5381 size = EOF - upl_f_offset; 5382 5383 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 5384 pages_in_upl = upl_size / PAGE_SIZE; 5385 } else 5386 size = upl_size; 5387 5388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0); 5389 5390 /* 5391 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior 5392 * 5393 * - only pages that are currently dirty are returned... these are the ones we need to clean 5394 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set 5395 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page 5396 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if 5397 * someone dirties this page while the I/O is in progress, we don't lose track of the new state 5398 * 5399 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard) 5400 */ 5401 5402 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) 5403 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED; 5404 else 5405 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE; 5406 5407 kret = ubc_create_upl(vp, 5408 upl_f_offset, 5409 upl_size, 5410 &upl, 5411 &pl, 5412 upl_flags); 5413 if (kret != KERN_SUCCESS) 5414 panic("cluster_push: failed to get pagelist"); 5415 5416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0); 5417 5418 /* 5419 * since we only asked for the dirty pages back 5420 * it's possible that we may only get a few or even none, so... 5421 * before we start marching forward, we must make sure we know 5422 * where the last present page is in the UPL, otherwise we could 5423 * end up working with a freed upl due to the FREE_ON_EMPTY semantics 5424 * employed by commit_range and abort_range. 5425 */ 5426 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 5427 if (upl_page_present(pl, last_pg)) 5428 break; 5429 } 5430 pages_in_upl = last_pg + 1; 5431 5432 if (pages_in_upl == 0) { 5433 ubc_upl_abort(upl, 0); 5434 5435 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0); 5436 return(0); 5437 } 5438 5439 for (last_pg = 0; last_pg < pages_in_upl; ) { 5440 /* 5441 * find the next dirty page in the UPL 5442 * this will become the first page in the 5443 * next I/O to generate 5444 */ 5445 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 5446 if (upl_dirty_page(pl, start_pg)) 5447 break; 5448 if (upl_page_present(pl, start_pg)) 5449 /* 5450 * RET_ONLY_DIRTY will return non-dirty 'precious' pages 5451 * just release these unchanged since we're not going 5452 * to steal them or change their state 5453 */ 5454 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); 5455 } 5456 if (start_pg >= pages_in_upl) 5457 /* 5458 * done... 
no more dirty pages to push 5459 */ 5460 break; 5461 if (start_pg > last_pg) 5462 /* 5463 * skipped over some non-dirty pages 5464 */ 5465 size -= ((start_pg - last_pg) * PAGE_SIZE); 5466 5467 /* 5468 * find a range of dirty pages to write 5469 */ 5470 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 5471 if (!upl_dirty_page(pl, last_pg)) 5472 break; 5473 } 5474 upl_offset = start_pg * PAGE_SIZE; 5475 5476 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE); 5477 5478 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag; 5479 5480 if ( !(flags & IO_SYNC)) 5481 io_flags |= CL_ASYNC; 5482 5483 if (flags & IO_CLOSE) 5484 io_flags |= CL_CLOSE; 5485 5486 if (flags & IO_NOCACHE) 5487 io_flags |= CL_NOCACHE; 5488 5489 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 5490 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5491 5492 if (error == 0 && retval) 5493 error = retval; 5494 5495 size -= io_size; 5496 } 5497 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0); 5498 5499 return(error); 5500} 5501 5502 5503/* 5504 * sparse_cluster_switch is called with the write behind lock held 5505 */ 5506static void 5507sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 5508{ 5509 int cl_index; 5510 5511 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0); 5512 5513 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 5514 int flags; 5515 struct cl_extent cl; 5516 5517 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) { 5518 5519 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) { 5520 if (flags & UPL_POP_DIRTY) { 5521 cl.e_addr = cl.b_addr + 1; 5522 5523 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg); 5524 } 5525 } 5526 } 5527 } 5528 wbp->cl_number = 0; 5529 5530 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0); 5531} 5532 5533 5534/* 5535 * sparse_cluster_push must be called with the write-behind lock held if the scmap is 5536 * still associated with the write-behind context... 
however, if the scmap has been disassociated 5537 * from the write-behind context (the cluster_push case), the wb lock is not held 5538 */ 5539static void 5540sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) 5541{ 5542 struct cl_extent cl; 5543 off_t offset; 5544 u_int length; 5545 5546 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0); 5547 5548 if (push_flag & PUSH_ALL) 5549 vfs_drt_control(scmap, 1); 5550 5551 for (;;) { 5552 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) 5553 break; 5554 5555 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); 5556 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); 5557 5558 cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); 5559 5560 if ( !(push_flag & PUSH_ALL) ) 5561 break; 5562 } 5563 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); 5564} 5565 5566 5567/* 5568 * sparse_cluster_add is called with the write behind lock held 5569 */ 5570static void 5571sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 5572{ 5573 u_int new_dirty; 5574 u_int length; 5575 off_t offset; 5576 5577 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0); 5578 5579 offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 5580 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE; 5581 5582 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) { 5583 /* 5584 * no room left in the map 5585 * only a partial update was done 5586 * push out some pages and try again 5587 */ 5588 sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); 5589 5590 offset += (new_dirty * PAGE_SIZE_64); 5591 length -= (new_dirty * PAGE_SIZE); 5592 } 5593 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); 5594} 5595 5596 5597static int 5598cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5599{ 5600 upl_page_info_t *pl; 5601 upl_t upl; 5602 addr64_t ubc_paddr; 5603 kern_return_t kret; 5604 int error = 0; 5605 int did_read = 0; 5606 int abort_flags; 5607 int upl_flags; 5608 int bflag; 5609 5610 if (flags & IO_PASSIVE) 5611 bflag = CL_PASSIVE; 5612 else 5613 bflag = 0; 5614 5615 if (flags & IO_NOCACHE) 5616 bflag |= CL_NOCACHE; 5617 5618 upl_flags = UPL_SET_LITE; 5619 5620 if ( !(flags & CL_READ) ) { 5621 /* 5622 * "write" operation: let the UPL subsystem know 5623 * that we intend to modify the buffer cache pages 5624 * we're gathering. 5625 */ 5626 upl_flags |= UPL_WILL_MODIFY; 5627 } else { 5628 /* 5629 * indicate that there is no need to pull the 5630 * mapping for this page... we're only going 5631 * to read from it, not modify it. 
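 * (the cached page only serves as the source of the copypv() copy below)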
5632 */ 5633 upl_flags |= UPL_FILE_IO; 5634 } 5635 kret = ubc_create_upl(vp, 5636 uio->uio_offset & ~PAGE_MASK_64, 5637 PAGE_SIZE, 5638 &upl, 5639 &pl, 5640 upl_flags); 5641 5642 if (kret != KERN_SUCCESS) 5643 return(EINVAL); 5644 5645 if (!upl_valid_page(pl, 0)) { 5646 /* 5647 * issue a synchronous read to cluster_io 5648 */ 5649 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5650 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5651 if (error) { 5652 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 5653 5654 return(error); 5655 } 5656 did_read = 1; 5657 } 5658 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); 5659 5660/* 5661 * NOTE: There is no prototype for the following in BSD. It, and the definitions 5662 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in 5663 * osfmk/ppc/mappings.h. They are not included here because there appears to be no 5664 * way to do so without exporting them to kexts as well. 5665 */ 5666 if (flags & CL_READ) 5667// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */ 5668 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */ 5669 else 5670// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */ 5671 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */ 5672 5673 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) { 5674 /* 5675 * issue a synchronous write to cluster_io 5676 */ 5677 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5678 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5679 } 5680 if (error == 0) 5681 uio_update(uio, (user_size_t)xsize); 5682 5683 if (did_read) 5684 abort_flags = UPL_ABORT_FREE_ON_EMPTY; 5685 else 5686 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; 5687 5688 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags); 5689 5690 return (error); 5691} 5692 5693 5694 5695int 5696cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) 5697{ 5698 int pg_offset; 5699 int pg_index; 5700 int csize; 5701 int segflg; 5702 int retval = 0; 5703 int xsize; 5704 upl_page_info_t *pl; 5705 5706 xsize = *io_resid; 5707 5708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5709 (int)uio->uio_offset, upl_offset, xsize, 0, 0); 5710 5711 segflg = uio->uio_segflg; 5712 5713 switch(segflg) { 5714 5715 case UIO_USERSPACE32: 5716 case UIO_USERISPACE32: 5717 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5718 break; 5719 5720 case UIO_USERSPACE: 5721 case UIO_USERISPACE: 5722 uio->uio_segflg = UIO_PHYS_USERSPACE; 5723 break; 5724 5725 case UIO_USERSPACE64: 5726 case UIO_USERISPACE64: 5727 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5728 break; 5729 5730 case UIO_SYSSPACE: 5731 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5732 break; 5733 5734 } 5735 pl = ubc_upl_pageinfo(upl); 5736 5737 pg_index = upl_offset / PAGE_SIZE; 5738 pg_offset = upl_offset & PAGE_MASK; 5739 csize = min(PAGE_SIZE - pg_offset, xsize); 5740 5741 while (xsize && retval == 0) { 5742 addr64_t paddr; 5743 5744 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset; 5745 5746 retval = uiomove64(paddr, csize, uio); 5747 5748 pg_index += 1; 5749 pg_offset = 0; 5750 xsize -= csize; 
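 /* every chunk after the first starts on a page boundary, so the next copy is either a full page or whatever remains */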
5751 csize = min(PAGE_SIZE, xsize); 5752 } 5753 *io_resid = xsize; 5754 5755 uio->uio_segflg = segflg; 5756 5757 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5758 (int)uio->uio_offset, xsize, retval, segflg, 0); 5759 5760 return (retval); 5761} 5762 5763 5764int 5765cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty) 5766{ 5767 5768 return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1)); 5769} 5770 5771 5772static int 5773cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference) 5774{ 5775 int segflg; 5776 int io_size; 5777 int xsize; 5778 int start_offset; 5779 int retval = 0; 5780 memory_object_control_t control; 5781 5782 io_size = *io_resid; 5783 5784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5785 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); 5786 5787 control = ubc_getobject(vp, UBC_FLAGS_NONE); 5788 5789 if (control == MEMORY_OBJECT_CONTROL_NULL) { 5790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5791 (int)uio->uio_offset, io_size, retval, 3, 0); 5792 5793 return(0); 5794 } 5795 segflg = uio->uio_segflg; 5796 5797 switch(segflg) { 5798 5799 case UIO_USERSPACE32: 5800 case UIO_USERISPACE32: 5801 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5802 break; 5803 5804 case UIO_USERSPACE64: 5805 case UIO_USERISPACE64: 5806 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5807 break; 5808 5809 case UIO_USERSPACE: 5810 case UIO_USERISPACE: 5811 uio->uio_segflg = UIO_PHYS_USERSPACE; 5812 break; 5813 5814 case UIO_SYSSPACE: 5815 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5816 break; 5817 } 5818 5819 if ( (io_size = *io_resid) ) { 5820 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 5821 xsize = uio_resid(uio); 5822 5823 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio, 5824 start_offset, io_size, mark_dirty, take_reference); 5825 xsize -= uio_resid(uio); 5826 io_size -= xsize; 5827 } 5828 uio->uio_segflg = segflg; 5829 *io_resid = io_size; 5830 5831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5832 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0); 5833 5834 return(retval); 5835} 5836 5837 5838int 5839is_file_clean(vnode_t vp, off_t filesize) 5840{ 5841 off_t f_offset; 5842 int flags; 5843 int total_dirty = 0; 5844 5845 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) { 5846 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) { 5847 if (flags & UPL_POP_DIRTY) { 5848 total_dirty++; 5849 } 5850 } 5851 } 5852 if (total_dirty) 5853 return(EINVAL); 5854 5855 return (0); 5856} 5857 5858 5859 5860/* 5861 * Dirty region tracking/clustering mechanism. 5862 * 5863 * This code (vfs_drt_*) provides a mechanism for tracking and clustering 5864 * dirty regions within a larger space (file). It is primarily intended to 5865 * support clustering in large files with many dirty areas. 5866 * 5867 * The implementation assumes that the dirty regions are pages. 5868 * 5869 * To represent dirty pages within the file, we store bit vectors in a 5870 * variable-size circular hash. 5871 */ 5872 5873/* 5874 * Bitvector size. This determines the number of pages we group in a 5875 * single hashtable entry. Each hashtable entry is aligned to this 5876 * size within the file. 5877 */ 5878#define DRT_BITVECTOR_PAGES 256 5879 5880/* 5881 * File offset handling. 
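 * Offsets are aligned down to the start of the DRT_BITVECTOR_PAGES-page region covered by a single hashtable entry (1MiB with 4K pages).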
5882 * 5883 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; 5884 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1) 5885 */ 5886#define DRT_ADDRESS_MASK (~((1 << 20) - 1)) 5887#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK) 5888 5889/* 5890 * Hashtable address field handling. 5891 * 5892 * The low-order bits of the hashtable address field hold the 5893 * entry's page count, to conserve space. 5894 * 5895 * DRT_HASH_COUNT_MASK must be large enough to store the range 5896 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value 5897 * to indicate that the bucket is actually unoccupied. 5898 */ 5899#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK) 5900#define DRT_HASH_SET_ADDRESS(scm, i, a) \ 5901 do { \ 5902 (scm)->scm_hashtable[(i)].dhe_control = \ 5903 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \ 5904 } while (0) 5905#define DRT_HASH_COUNT_MASK 0x1ff 5906#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK) 5907#define DRT_HASH_SET_COUNT(scm, i, c) \ 5908 do { \ 5909 (scm)->scm_hashtable[(i)].dhe_control = \ 5910 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \ 5911 } while (0) 5912#define DRT_HASH_CLEAR(scm, i) \ 5913 do { \ 5914 (scm)->scm_hashtable[(i)].dhe_control = 0; \ 5915 } while (0) 5916#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK) 5917#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK) 5918#define DRT_HASH_COPY(oscm, oi, scm, i) \ 5919 do { \ 5920 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \ 5921 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \ 5922 } while(0); 5923 5924 5925/* 5926 * Hash table moduli. 5927 * 5928 * Since the hashtable entry's size is dependent on the size of 5929 * the bitvector, and since the hashtable size is constrained to 5930 * both being prime and fitting within the desired allocation 5931 * size, these values need to be manually determined. 5932 * 5933 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes (an 8-byte control word plus a 32-byte bitvector). 5934 * 5935 * The small hashtable allocation is 1024 bytes, so the modulus is 23. 5936 * The large hashtable allocation is 16384 bytes, so the modulus is 401. 5937 */ 5938#define DRT_HASH_SMALL_MODULUS 23 5939#define DRT_HASH_LARGE_MODULUS 401 5940 5941/* 5942 * Physical memory required before the large hash modulus is permitted. 5943 * 5944 * On small memory systems, the large hash modulus can lead to physical 5945 * memory starvation, so we avoid using it there. 5946 */ 5947#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */ 5948 5949#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ 5950#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ 5951 5952/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */ 5953 5954/* 5955 * Hashtable bitvector handling. 5956 * 5957 * Bitvector fields are 32 bits long. 
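 * Page n of an entry's region maps to word n / 32, bit n % 32 of dhe_bitvector; e.g. page 40 is bit 8 of dhe_bitvector[1].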
5958 */ 5959 5960#define DRT_HASH_SET_BIT(scm, i, bit) \ 5961 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32)) 5962 5963#define DRT_HASH_CLEAR_BIT(scm, i, bit) \ 5964 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32)) 5965 5966#define DRT_HASH_TEST_BIT(scm, i, bit) \ 5967 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32))) 5968 5969#define DRT_BITVECTOR_CLEAR(scm, i) \ 5970 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) 5971 5972#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \ 5973 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \ 5974 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \ 5975 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) 5976 5977 5978 5979/* 5980 * Hashtable entry. 5981 */ 5982struct vfs_drt_hashentry { 5983 u_int64_t dhe_control; 5984 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; 5985}; 5986 5987/* 5988 * Dirty Region Tracking structure. 5989 * 5990 * The hashtable is allocated entirely inside the DRT structure. 5991 * 5992 * The hash is a simple circular prime modulus arrangement, the structure 5993 * is resized from small to large if it overflows. 5994 */ 5995 5996struct vfs_drt_clustermap { 5997 u_int32_t scm_magic; /* sanity/detection */ 5998#define DRT_SCM_MAGIC 0x12020003 5999 u_int32_t scm_modulus; /* current ring size */ 6000 u_int32_t scm_buckets; /* number of occupied buckets */ 6001 u_int32_t scm_lastclean; /* last entry we cleaned */ 6002 u_int32_t scm_iskips; /* number of slot skips */ 6003 6004 struct vfs_drt_hashentry scm_hashtable[0]; 6005}; 6006 6007 6008#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus) 6009#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus) 6010 6011/* 6012 * Debugging codes and arguments. 6013 */ 6014#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */ 6015#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */ 6016#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */ 6017#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */ 6018#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length, 6019 * dirty */ 6020 /* 0, setcount */ 6021 /* 1 (clean, no map) */ 6022 /* 2 (map alloc fail) */ 6023 /* 3, resid (partial) */ 6024#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87)) 6025#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets, 6026 * lastclean, iskips */ 6027 6028 6029static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp); 6030static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap); 6031static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap, 6032 u_int64_t offset, int *indexp); 6033static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, 6034 u_int64_t offset, 6035 int *indexp, 6036 int recursed); 6037static kern_return_t vfs_drt_do_mark_pages( 6038 void **cmapp, 6039 u_int64_t offset, 6040 u_int length, 6041 u_int *setcountp, 6042 int dirty); 6043static void vfs_drt_trace( 6044 struct vfs_drt_clustermap *cmap, 6045 int code, 6046 int arg1, 6047 int arg2, 6048 int arg3, 6049 int arg4); 6050 6051 6052/* 6053 * Allocate and initialise a sparse cluster map. 6054 * 6055 * Will allocate a new map, resize or compact an existing map. 6056 * 6057 * XXX we should probably have at least one intermediate map size, 6058 * as the 1:16 ratio seems a bit drastic. 
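 * The small-to-large upgrade only happens when the small ring has at least DRT_HASH_SMALL_MODULUS - 4 active buckets and max_mem is at least DRT_HASH_LARGE_MEMORY_REQUIRED.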
6059 */ 6060static kern_return_t 6061vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) 6062{ 6063 struct vfs_drt_clustermap *cmap, *ocmap; 6064 kern_return_t kret; 6065 u_int64_t offset; 6066 u_int32_t i; 6067 int nsize, active_buckets, index, copycount; 6068 6069 ocmap = NULL; 6070 if (cmapp != NULL) 6071 ocmap = *cmapp; 6072 6073 /* 6074 * Decide on the size of the new map. 6075 */ 6076 if (ocmap == NULL) { 6077 nsize = DRT_HASH_SMALL_MODULUS; 6078 } else { 6079 /* count the number of active buckets in the old map */ 6080 active_buckets = 0; 6081 for (i = 0; i < ocmap->scm_modulus; i++) { 6082 if (!DRT_HASH_VACANT(ocmap, i) && 6083 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) 6084 active_buckets++; 6085 } 6086 /* 6087 * If we're currently using the small allocation, check to 6088 * see whether we should grow to the large one. 6089 */ 6090 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { 6091 /* 6092 * If the ring is nearly full and we are allowed to 6093 * use the large modulus, upgrade. 6094 */ 6095 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) && 6096 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) { 6097 nsize = DRT_HASH_LARGE_MODULUS; 6098 } else { 6099 nsize = DRT_HASH_SMALL_MODULUS; 6100 } 6101 } else { 6102 /* already using the large modulus */ 6103 nsize = DRT_HASH_LARGE_MODULUS; 6104 /* 6105 * If the ring is completely full, there's 6106 * nothing useful for us to do. Behave as 6107 * though we had compacted into the new 6108 * array and return. 6109 */ 6110 if (active_buckets >= DRT_HASH_LARGE_MODULUS) 6111 return(KERN_SUCCESS); 6112 } 6113 } 6114 6115 /* 6116 * Allocate and initialise the new map. 6117 */ 6118 6119 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, 6120 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 6121 if (kret != KERN_SUCCESS) 6122 return(kret); 6123 cmap->scm_magic = DRT_SCM_MAGIC; 6124 cmap->scm_modulus = nsize; 6125 cmap->scm_buckets = 0; 6126 cmap->scm_lastclean = 0; 6127 cmap->scm_iskips = 0; 6128 for (i = 0; i < cmap->scm_modulus; i++) { 6129 DRT_HASH_CLEAR(cmap, i); 6130 DRT_HASH_VACATE(cmap, i); 6131 DRT_BITVECTOR_CLEAR(cmap, i); 6132 } 6133 6134 /* 6135 * If there's an old map, re-hash entries from it into the new map. 6136 */ 6137 copycount = 0; 6138 if (ocmap != NULL) { 6139 for (i = 0; i < ocmap->scm_modulus; i++) { 6140 /* skip empty buckets */ 6141 if (DRT_HASH_VACANT(ocmap, i) || 6142 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) 6143 continue; 6144 /* get new index */ 6145 offset = DRT_HASH_GET_ADDRESS(ocmap, i); 6146 kret = vfs_drt_get_index(&cmap, offset, &index, 1); 6147 if (kret != KERN_SUCCESS) { 6148 /* XXX need to bail out gracefully here */ 6149 panic("vfs_drt: new cluster map mysteriously too small"); 6150 index = 0; 6151 } 6152 /* copy */ 6153 DRT_HASH_COPY(ocmap, i, cmap, index); 6154 copycount++; 6155 } 6156 } 6157 6158 /* log what we've done */ 6159 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0); 6160 6161 /* 6162 * It's important to ensure that *cmapp always points to 6163 * a valid map, so we must overwrite it before freeing 6164 * the old map. 6165 */ 6166 *cmapp = cmap; 6167 if (ocmap != NULL) { 6168 /* emit stats into trace buffer */ 6169 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA, 6170 ocmap->scm_modulus, 6171 ocmap->scm_buckets, 6172 ocmap->scm_lastclean, 6173 ocmap->scm_iskips); 6174 6175 vfs_drt_free_map(ocmap); 6176 } 6177 return(KERN_SUCCESS); 6178} 6179 6180 6181/* 6182 * Free a sparse cluster map. 
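 * The map's modulus determines which fixed-size allocation is returned.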
6183 */ 6184static kern_return_t 6185vfs_drt_free_map(struct vfs_drt_clustermap *cmap) 6186{ 6187 kmem_free(kernel_map, (vm_offset_t)cmap, 6188 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 6189 return(KERN_SUCCESS); 6190} 6191 6192 6193/* 6194 * Find the hashtable slot currently occupied by an entry for the supplied offset. 6195 */ 6196static kern_return_t 6197vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp) 6198{ 6199 int index; 6200 u_int32_t i; 6201 6202 offset = DRT_ALIGN_ADDRESS(offset); 6203 index = DRT_HASH(cmap, offset); 6204 6205 /* traverse the hashtable */ 6206 for (i = 0; i < cmap->scm_modulus; i++) { 6207 6208 /* 6209 * If the slot is vacant, we can stop. 6210 */ 6211 if (DRT_HASH_VACANT(cmap, index)) 6212 break; 6213 6214 /* 6215 * If the address matches our offset, we have success. 6216 */ 6217 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) { 6218 *indexp = index; 6219 return(KERN_SUCCESS); 6220 } 6221 6222 /* 6223 * Move to the next slot, try again. 6224 */ 6225 index = DRT_HASH_NEXT(cmap, index); 6226 } 6227 /* 6228 * It's not there. 6229 */ 6230 return(KERN_FAILURE); 6231} 6232 6233/* 6234 * Find the hashtable slot for the supplied offset. If we haven't allocated 6235 * one yet, allocate one and populate the address field. Note that it will 6236 * not have a nonzero page count and thus will still technically be free, so 6237 * in the case where we are called to clean pages, the slot will remain free. 6238 */ 6239static kern_return_t 6240vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed) 6241{ 6242 struct vfs_drt_clustermap *cmap; 6243 kern_return_t kret; 6244 u_int32_t index; 6245 u_int32_t i; 6246 6247 cmap = *cmapp; 6248 6249 /* look for an existing entry */ 6250 kret = vfs_drt_search_index(cmap, offset, indexp); 6251 if (kret == KERN_SUCCESS) 6252 return(kret); 6253 6254 /* need to allocate an entry */ 6255 offset = DRT_ALIGN_ADDRESS(offset); 6256 index = DRT_HASH(cmap, offset); 6257 6258 /* scan from the index forwards looking for a vacant slot */ 6259 for (i = 0; i < cmap->scm_modulus; i++) { 6260 /* slot vacant? */ 6261 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) { 6262 cmap->scm_buckets++; 6263 if (index < cmap->scm_lastclean) 6264 cmap->scm_lastclean = index; 6265 DRT_HASH_SET_ADDRESS(cmap, index, offset); 6266 DRT_HASH_SET_COUNT(cmap, index, 0); 6267 DRT_BITVECTOR_CLEAR(cmap, index); 6268 *indexp = index; 6269 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0); 6270 return(KERN_SUCCESS); 6271 } 6272 cmap->scm_iskips += i; 6273 index = DRT_HASH_NEXT(cmap, index); 6274 } 6275 6276 /* 6277 * We haven't found a vacant slot, so the map is full. If we're not 6278 * already recursed, try reallocating/compacting it. 6279 */ 6280 if (recursed) 6281 return(KERN_FAILURE); 6282 kret = vfs_drt_alloc_map(cmapp); 6283 if (kret == KERN_SUCCESS) { 6284 /* now try to insert again */ 6285 kret = vfs_drt_get_index(cmapp, offset, indexp, 1); 6286 } 6287 return(kret); 6288} 6289 6290/* 6291 * Implementation of set dirty/clean. 6292 * 6293 * In the 'clean' case, not finding a map is OK. 
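 * setcountp, when supplied, returns the number of pages whose dirty state actually changed.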
6294 */ 6295static kern_return_t 6296vfs_drt_do_mark_pages( 6297 void **private, 6298 u_int64_t offset, 6299 u_int length, 6300 u_int *setcountp, 6301 int dirty) 6302{ 6303 struct vfs_drt_clustermap *cmap, **cmapp; 6304 kern_return_t kret; 6305 int i, index, pgoff, pgcount, setcount, ecount; 6306 6307 cmapp = (struct vfs_drt_clustermap **)private; 6308 cmap = *cmapp; 6309 6310 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0); 6311 6312 if (setcountp != NULL) 6313 *setcountp = 0; 6314 6315 /* allocate a cluster map if we don't already have one */ 6316 if (cmap == NULL) { 6317 /* no cluster map, nothing to clean */ 6318 if (!dirty) { 6319 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0); 6320 return(KERN_SUCCESS); 6321 } 6322 kret = vfs_drt_alloc_map(cmapp); 6323 if (kret != KERN_SUCCESS) { 6324 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0); 6325 return(kret); 6326 } 6327 } 6328 setcount = 0; 6329 6330 /* 6331 * Iterate over the length of the region. 6332 */ 6333 while (length > 0) { 6334 /* 6335 * Get the hashtable index for this offset. 6336 * 6337 * XXX this will add blank entries if we are clearing a range 6338 * that hasn't been dirtied. 6339 */ 6340 kret = vfs_drt_get_index(cmapp, offset, &index, 0); 6341 cmap = *cmapp; /* may have changed! */ 6342 /* this may be a partial-success return */ 6343 if (kret != KERN_SUCCESS) { 6344 if (setcountp != NULL) 6345 *setcountp = setcount; 6346 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0); 6347 6348 return(kret); 6349 } 6350 6351 /* 6352 * Work out how many pages we're modifying in this 6353 * hashtable entry. 6354 */ 6355 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE; 6356 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff)); 6357 6358 /* 6359 * Iterate over pages, dirty/clearing as we go. 6360 */ 6361 ecount = DRT_HASH_GET_COUNT(cmap, index); 6362 for (i = 0; i < pgcount; i++) { 6363 if (dirty) { 6364 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { 6365 DRT_HASH_SET_BIT(cmap, index, pgoff + i); 6366 ecount++; 6367 setcount++; 6368 } 6369 } else { 6370 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { 6371 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i); 6372 ecount--; 6373 setcount++; 6374 } 6375 } 6376 } 6377 DRT_HASH_SET_COUNT(cmap, index, ecount); 6378 6379 offset += pgcount * PAGE_SIZE; 6380 length -= pgcount * PAGE_SIZE; 6381 } 6382 if (setcountp != NULL) 6383 *setcountp = setcount; 6384 6385 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0); 6386 6387 return(KERN_SUCCESS); 6388} 6389 6390/* 6391 * Mark a set of pages as dirty/clean. 6392 * 6393 * This is a public interface. 6394 * 6395 * cmapp 6396 * Pointer to storage suitable for holding a pointer. Note that 6397 * this must either be NULL or a value set by this function. 6398 * 6399 * size 6400 * Current file size in bytes. 6401 * 6402 * offset 6403 * Offset of the first page to be marked as dirty, in bytes. Must be 6404 * page-aligned. 6405 * 6406 * length 6407 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE. 6408 * 6409 * setcountp 6410 * Number of pages newly marked dirty by this call (optional). 6411 * 6412 * Returns KERN_SUCCESS if all the pages were successfully marked. 
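 * On a partial failure (the map ran out of room), setcountp still reflects the pages marked before the failure.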
6413 */ 6414static kern_return_t 6415vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp) 6416{ 6417 /* XXX size unused, drop from interface */ 6418 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1)); 6419} 6420 6421#if 0 6422static kern_return_t 6423vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length) 6424{ 6425 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0)); 6426} 6427#endif 6428 6429/* 6430 * Get a cluster of dirty pages. 6431 * 6432 * This is a public interface. 6433 * 6434 * cmapp 6435 * Pointer to storage managed by drt_mark_pages. Note that this must 6436 * be NULL or a value set by drt_mark_pages. 6437 * 6438 * offsetp 6439 * Returns the byte offset into the file of the first page in the cluster. 6440 * 6441 * lengthp 6442 * Returns the length in bytes of the cluster of dirty pages. 6443 * 6444 * Returns KERN_SUCCESS if a cluster was found. If KERN_FAILURE is returned, there 6445 * are no dirty pages meeting the minimum size criteria. Private storage will 6446 * be released if there are no more dirty pages left in the map. 6447 * 6448 */ 6449static kern_return_t 6450vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp) 6451{ 6452 struct vfs_drt_clustermap *cmap; 6453 u_int64_t offset; 6454 u_int length; 6455 u_int32_t j; 6456 int index, i, fs, ls; 6457 6458 /* sanity */ 6459 if ((cmapp == NULL) || (*cmapp == NULL)) 6460 return(KERN_FAILURE); 6461 cmap = *cmapp; 6462 6463 /* walk the hashtable */ 6464 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) { 6465 index = DRT_HASH(cmap, offset); 6466 6467 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) 6468 continue; 6469 6470 /* scan the bitfield for a string of bits */ 6471 fs = -1; 6472 6473 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) { 6474 if (DRT_HASH_TEST_BIT(cmap, index, i)) { 6475 fs = i; 6476 break; 6477 } 6478 } 6479 if (fs == -1) { 6480 /* didn't find any bits set */ 6481 panic("vfs_drt: entry summary count > 0 but no bits set in map"); 6482 } 6483 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) { 6484 if (!DRT_HASH_TEST_BIT(cmap, index, i)) 6485 break; 6486 } 6487 6488 /* compute offset and length, mark pages clean */ 6489 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs); 6490 length = ls * PAGE_SIZE; 6491 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0); 6492 cmap->scm_lastclean = index; 6493 6494 /* return successful */ 6495 *offsetp = (off_t)offset; 6496 *lengthp = length; 6497 6498 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0); 6499 return(KERN_SUCCESS); 6500 } 6501 /* 6502 * We didn't find anything... 
hashtable is empty 6503 * emit stats into trace buffer and 6504 * then free it 6505 */ 6506 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, 6507 cmap->scm_modulus, 6508 cmap->scm_buckets, 6509 cmap->scm_lastclean, 6510 cmap->scm_iskips); 6511 6512 vfs_drt_free_map(cmap); 6513 *cmapp = NULL; 6514 6515 return(KERN_FAILURE); 6516} 6517 6518 6519static kern_return_t 6520vfs_drt_control(void **cmapp, int op_type) 6521{ 6522 struct vfs_drt_clustermap *cmap; 6523 6524 /* sanity */ 6525 if ((cmapp == NULL) || (*cmapp == NULL)) 6526 return(KERN_FAILURE); 6527 cmap = *cmapp; 6528 6529 switch (op_type) { 6530 case 0: 6531 /* emit stats into trace buffer */ 6532 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, 6533 cmap->scm_modulus, 6534 cmap->scm_buckets, 6535 cmap->scm_lastclean, 6536 cmap->scm_iskips); 6537 6538 vfs_drt_free_map(cmap); 6539 *cmapp = NULL; 6540 break; 6541 6542 case 1: 6543 cmap->scm_lastclean = 0; 6544 break; 6545 } 6546 return(KERN_SUCCESS); 6547} 6548 6549 6550 6551/* 6552 * Emit a summary of the state of the clustermap into the trace buffer 6553 * along with some caller-provided data. 6554 */ 6555#if KDEBUG 6556static void 6557vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4) 6558{ 6559 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0); 6560} 6561#else 6562static void 6563vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code, 6564 __unused int arg1, __unused int arg2, __unused int arg3, 6565 __unused int arg4) 6566{ 6567} 6568#endif 6569 6570#if 0 6571/* 6572 * Perform basic sanity check on the hash entry summary count 6573 * vs. the actual bits set in the entry. 6574 */ 6575static void 6576vfs_drt_sanity(struct vfs_drt_clustermap *cmap) 6577{ 6578 int index, i; 6579 int bits_on; 6580 6581 for (index = 0; index < cmap->scm_modulus; index++) { 6582 if (DRT_HASH_VACANT(cmap, index)) 6583 continue; 6584 6585 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) { 6586 if (DRT_HASH_TEST_BIT(cmap, index, i)) 6587 bits_on++; 6588 } 6589 if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) 6590 panic("bits_on = %d, index = %d\n", bits_on, index); 6591 } 6592} 6593#endif 6594