1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ 29/* 30 * Copyright (c) 1993 31 * The Regents of the University of California. All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 3. All advertising materials mentioning features or use of this software 42 * must display the following acknowledgement: 43 * This product includes software developed by the University of 44 * California, Berkeley and its contributors. 45 * 4. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 
60 * 61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95 62 */ 63 64#include <sys/param.h> 65#include <sys/proc_internal.h> 66#include <sys/buf_internal.h> 67#include <sys/mount_internal.h> 68#include <sys/vnode_internal.h> 69#include <sys/trace.h> 70#include <sys/malloc.h> 71#include <sys/time.h> 72#include <sys/kernel.h> 73#include <sys/resourcevar.h> 74#include <miscfs/specfs/specdev.h> 75#include <sys/uio_internal.h> 76#include <libkern/libkern.h> 77#include <machine/machine_routines.h> 78 79#include <sys/ubc_internal.h> 80#include <vm/vnode_pager.h> 81 82#include <mach/mach_types.h> 83#include <mach/memory_object_types.h> 84#include <mach/vm_map.h> 85#include <mach/upl.h> 86#include <kern/task.h> 87 88#include <vm/vm_kern.h> 89#include <vm/vm_map.h> 90#include <vm/vm_pageout.h> 91#include <vm/vm_fault.h> 92 93#include <sys/kdebug.h> 94#include <libkern/OSAtomic.h> 95 96#include <sys/sdt.h> 97 98#if 0 99#undef KERNEL_DEBUG 100#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT 101#endif 102 103 104#define CL_READ 0x01 105#define CL_WRITE 0x02 106#define CL_ASYNC 0x04 107#define CL_COMMIT 0x08 108#define CL_PAGEOUT 0x10 109#define CL_AGE 0x20 110#define CL_NOZERO 0x40 111#define CL_PAGEIN 0x80 112#define CL_DEV_MEMORY 0x100 113#define CL_PRESERVE 0x200 114#define CL_THROTTLE 0x400 115#define CL_KEEPCACHED 0x800 116#define CL_DIRECT_IO 0x1000 117#define CL_PASSIVE 0x2000 118#define CL_IOSTREAMING 0x4000 119#define CL_CLOSE 0x8000 120#define CL_ENCRYPTED 0x10000 121#define CL_RAW_ENCRYPTED 0x20000 122#define CL_NOCACHE 0x40000 123 124#define MAX_VECTOR_UPL_ELEMENTS 8 125#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES) 126 127extern upl_t vector_upl_create(vm_offset_t); 128extern boolean_t vector_upl_is_valid(upl_t); 129extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t); 130extern void vector_upl_set_pagelist(upl_t); 131extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t); 132 133struct clios { 134 lck_mtx_t io_mtxp; 135 u_int io_completed; /* amount of io that has currently completed */ 136 u_int io_issued; /* amount of io that was successfully issued */ 137 int io_error; /* error code of first error encountered */ 138 int io_wanted; /* someone is sleeping waiting for a change in state */ 139}; 140 141static lck_grp_t *cl_mtx_grp; 142static lck_attr_t *cl_mtx_attr; 143static lck_grp_attr_t *cl_mtx_grp_attr; 144static lck_mtx_t *cl_transaction_mtxp; 145 146 147#define IO_UNKNOWN 0 148#define IO_DIRECT 1 149#define IO_CONTIG 2 150#define IO_COPY 3 151 152#define PUSH_DELAY 0x01 153#define PUSH_ALL 0x02 154#define PUSH_SYNC 0x04 155 156 157static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset); 158static void cluster_wait_IO(buf_t cbp_head, int async); 159static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait); 160 161static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length); 162 163static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size, 164 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg); 165static int cluster_iodone(buf_t bp, void *callback_arg); 166static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp); 167static int cluster_is_throttled(vnode_t vp); 168 169static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name); 170 171static void cluster_syncup(vnode_t vp, off_t 
newEOF, int (*)(buf_t, void *), void *callback_arg, int flags); 172 173static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); 174static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference); 175 176static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, 177 int (*)(buf_t, void *), void *callback_arg); 178static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 179 int flags, int (*)(buf_t, void *), void *callback_arg); 180static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 181 int (*)(buf_t, void *), void *callback_arg, int flags); 182 183static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, 184 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg); 185static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, 186 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg); 187static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, 188 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag); 189 190static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg); 191 192static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag); 193static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag); 194 195static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg); 196 197static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg); 198 199static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg); 200static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg); 201static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); 202 203static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp); 204static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp); 205static kern_return_t vfs_drt_control(void **cmapp, int op_type); 206 207 208/* 209 * For throttled IO to check whether 210 * a block is cached by the boot cache 211 * and thus it can avoid delaying the IO. 212 * 213 * bootcache_contains_block is initially 214 * NULL. The BootCache will set it while 215 * the cache is active and clear it when 216 * the cache is jettisoned. 217 * 218 * Returns 0 if the block is not 219 * contained in the cache, 1 if it is 220 * contained. 221 * 222 * The function pointer remains valid 223 * after the cache has been evicted even 224 * if bootcache_contains_block has been 225 * cleared. 
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;


/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	MAX_UPL_SIZE_BYTES
#define MAX_VECTS		16
#define MIN_DIRECT_WRITE_SIZE	(4 * PAGE_SIZE)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#define PREFETCH		3
#define PREFETCH_SSD		2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);	/* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use in a speculative read-ahead on SSDs */


#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))

int	ignore_is_ssd = 0;
int	speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT	0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE	(throttle_max_iosize)

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");


void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");
}


uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch(type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = MAX_UPL_TRANSFER_BYTES;
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
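/*
 * The clamping above, in short: take the smaller of the device's
 * segment-count limit and its byte-count limit, never exceed the largest
 * UPL we can build, and never drop below the old fixed transfer size.
 * A minimal worked example follows; the device limits are made-up values,
 * not taken from any real driver.
 */
#if 0
	segcnt = 64;					/* hypothetical: 64 scatter/gather segments */
	maxcnt = 1024 * 1024;				/* hypothetical: 1MB max transfer */
	max_io_size = min(segcnt * PAGE_SIZE, maxcnt);	/* 256KB with 4KB pages */
	/* then raised to MAX_UPL_TRANSFER_BYTES if smaller, or trimmed to a page multiple */
#endif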




#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}
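/*
 * A caller-side sketch (illustrative only; 'extent', 'filesize', 'callback',
 * 'callback_arg' and 'bflag' are assumed locals of a read path) of how the
 * try-lock contract above is consumed: a NULL return simply means another
 * reader owns the context, and the read proceeds without read-ahead.
 */
#if 0
	struct cl_readahead *rap;

	if ((rap = cluster_get_rap(vp)) != NULL) {
		/* we hold cl_lockr, so the read-ahead window can be updated safely */
		cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);

		lck_mtx_unlock(&rap->cl_lockr);
	}
#endif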


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */

static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}


static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}


static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t	  io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
			return(0);

		if (io_size == 0)
			return (0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return(1);
	}
	return(0);
}


static int
cluster_is_throttled(vnode_t vp)
{
	return (throttle_io_will_be_throttled(-1, vp->v_mount));
}


static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{

	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}
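/*
 * A usage sketch (not taken verbatim from a caller in this file; 'error' is
 * an assumed local) of the io_issued/io_completed accounting: cluster_io()
 * bumps io_issued as it queues buffers, cluster_iodone() bumps io_completed
 * and wakes the waiter, so a target of 0 drains everything that was issued.
 */
#if 0
	struct clios iostate;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;
	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	/* ... issue one or more async cluster_io() calls passing &iostate ... */

	cluster_iostate_wait(&iostate, 0, "example_drain");	/* wait for all of it */

	if (iostate.io_error)
		error = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
#endif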


static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
	int upl_abort_code = 0;
	int page_in = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}


static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	vnode_t	vp;
	struct	clios *iostate;
	boolean_t	transaction_complete = FALSE;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		boolean_t	need_wakeup = FALSE;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		if (bp->b_flags & B_TWANTED) {
			CLR(bp->b_flags, B_TWANTED);
			need_wakeup = TRUE;
		}
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				if (need_wakeup == TRUE)
					wakeup(bp);

				return 0;
			}
			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (need_wakeup == TRUE)
			wakeup(bp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error = 0;
	total_size = 0;
	total_resid = 0;

	cbp = cbp_head;
	vp = cbp->b_vp;
	upl_offset = cbp->b_uploffset;
	upl = cbp->b_upl;
	b_flags = cbp->b_flags;
	real_bp = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int	(*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}

	if (b_flags & B_COMMIT_UPL) {

		pg_offset = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}


uint32_t
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
	if (cluster_is_throttled(vp)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}
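/*
 * A caller-side sketch (assumed filesystem code, not part of this file;
 * 'io_size' is an assumed local) of how cluster_throttle_io_limit() is
 * typically consulted: when the vnode is currently throttled, individual
 * I/Os are capped at the returned limit.
 */
#if 0
	uint32_t max_throttle_io;

	if (cluster_throttle_io_limit(vp, &max_throttle_io)) {
		/* throttled: max_throttle_io == THROTTLE_MAX_IOSIZE (128KB by default) */
		if (io_size > max_throttle_io)
			io_size = max_throttle_io;
	}
#endif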


void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t *pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
				zero_cnt = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}


static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}

static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * async callback completion will not normally
		 * generate a wakeup upon I/O completion...
		 * by setting B_TWANTED, we will force a wakeup
		 * to occur as any outstanding I/Os complete...
		 * I/Os already completed will have B_TDONE already
		 * set and won't cause us to block
		 * note that we're actually waiting for the bp to have
		 * completed the callback function... only then
		 * can we safely take back ownership of the bp
		 */
		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			cbp->b_flags |= B_TWANTED;

		lck_mtx_unlock(cl_transaction_mtxp);
	}
	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {

		if (async) {
			while (!ISSET(cbp->b_flags, B_TDONE)) {

				lck_mtx_lock_spin(cl_transaction_mtxp);

				if (!ISSET(cbp->b_flags, B_TDONE)) {
					DTRACE_IO1(wait__start, buf_t, cbp);
					(void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL);
					DTRACE_IO1(wait__done, buf_t, cbp);
				} else
					lck_mtx_unlock(cl_transaction_mtxp);
			}
		} else
			buf_biowait(cbp);
	}
}

static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;
	boolean_t isswapout = FALSE;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;
	cbp = *cbp_head;

	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
		isswapout = TRUE;

	error = cluster_iodone(cbp, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
		else if (isswapout == TRUE)
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}


static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, 
void *), void *callback_arg) 886{ 887 buf_t cbp; 888 u_int size; 889 u_int io_size; 890 int io_flags; 891 int bmap_flags; 892 int error = 0; 893 int retval = 0; 894 buf_t cbp_head = NULL; 895 buf_t cbp_tail = NULL; 896 int trans_count = 0; 897 int max_trans_count; 898 u_int pg_count; 899 int pg_offset; 900 u_int max_iosize; 901 u_int max_vectors; 902 int priv; 903 int zero_offset = 0; 904 int async_throttle = 0; 905 mount_t mp; 906 vm_offset_t upl_end_offset; 907 boolean_t need_EOT = FALSE; 908 909 /* 910 * we currently don't support buffers larger than a page 911 */ 912 if (real_bp && non_rounded_size > PAGE_SIZE) 913 panic("%s(): Called with real buffer of size %d bytes which " 914 "is greater than the maximum allowed size of " 915 "%d bytes (the system PAGE_SIZE).\n", 916 __FUNCTION__, non_rounded_size, PAGE_SIZE); 917 918 mp = vp->v_mount; 919 920 /* 921 * we don't want to do any funny rounding of the size for IO requests 922 * coming through the DIRECT or CONTIGUOUS paths... those pages don't 923 * belong to us... we can't extend (nor do we need to) the I/O to fill 924 * out a page 925 */ 926 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) { 927 /* 928 * round the requested size up so that this I/O ends on a 929 * page boundary in case this is a 'write'... if the filesystem 930 * has blocks allocated to back the page beyond the EOF, we want to 931 * make sure to write out the zero's that are sitting beyond the EOF 932 * so that in case the filesystem doesn't explicitly zero this area 933 * if a hole is created via a lseek/write beyond the current EOF, 934 * it will return zeros when it's read back from the disk. If the 935 * physical allocation doesn't extend for the whole page, we'll 936 * only write/read from the disk up to the end of this allocation 937 * via the extent info returned from the VNOP_BLOCKMAP call. 938 */ 939 pg_offset = upl_offset & PAGE_MASK; 940 941 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset; 942 } else { 943 /* 944 * anyone advertising a blocksize of 1 byte probably 945 * can't deal with us rounding up the request size 946 * AFP is one such filesystem/device 947 */ 948 size = non_rounded_size; 949 } 950 upl_end_offset = upl_offset + size; 951 952 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0); 953 954 /* 955 * Set the maximum transaction size to the maximum desired number of 956 * buffers. 957 */ 958 max_trans_count = 8; 959 if (flags & CL_DEV_MEMORY) 960 max_trans_count = 16; 961 962 if (flags & CL_READ) { 963 io_flags = B_READ; 964 bmap_flags = VNODE_READ; 965 966 max_iosize = mp->mnt_maxreadcnt; 967 max_vectors = mp->mnt_segreadcnt; 968 } else { 969 io_flags = B_WRITE; 970 bmap_flags = VNODE_WRITE; 971 972 max_iosize = mp->mnt_maxwritecnt; 973 max_vectors = mp->mnt_segwritecnt; 974 } 975 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0); 976 977 /* 978 * make sure the maximum iosize is a 979 * multiple of the page size 980 */ 981 max_iosize &= ~PAGE_MASK; 982 983 /* 984 * Ensure the maximum iosize is sensible. 
985 */ 986 if (!max_iosize) 987 max_iosize = PAGE_SIZE; 988 989 if (flags & CL_THROTTLE) { 990 if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) { 991 if (max_iosize > THROTTLE_MAX_IOSIZE) 992 max_iosize = THROTTLE_MAX_IOSIZE; 993 async_throttle = THROTTLE_MAXCNT; 994 } else { 995 if ( (flags & CL_DEV_MEMORY) ) 996 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE); 997 else { 998 u_int max_cluster; 999 u_int max_cluster_size; 1000 u_int scale; 1001 1002 max_cluster_size = MAX_CLUSTER_SIZE(vp); 1003 1004 if (max_iosize > max_cluster_size) 1005 max_cluster = max_cluster_size; 1006 else 1007 max_cluster = max_iosize; 1008 1009 if (size < max_cluster) 1010 max_cluster = size; 1011 1012 if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) 1013 scale = WRITE_THROTTLE_SSD; 1014 else 1015 scale = WRITE_THROTTLE; 1016 1017 if (flags & CL_CLOSE) 1018 scale += MAX_CLUSTERS; 1019 1020 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1); 1021 } 1022 } 1023 } 1024 if (flags & CL_AGE) 1025 io_flags |= B_AGE; 1026 if (flags & (CL_PAGEIN | CL_PAGEOUT)) 1027 io_flags |= B_PAGEIO; 1028 if (flags & (CL_IOSTREAMING)) 1029 io_flags |= B_IOSTREAMING; 1030 if (flags & CL_COMMIT) 1031 io_flags |= B_COMMIT_UPL; 1032 if (flags & CL_DIRECT_IO) 1033 io_flags |= B_PHYS; 1034 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) 1035 io_flags |= B_CACHE; 1036 if (flags & CL_PASSIVE) 1037 io_flags |= B_PASSIVE; 1038 if (flags & CL_ENCRYPTED) 1039 io_flags |= B_ENCRYPTED_IO; 1040 if (vp->v_flag & VSYSTEM) 1041 io_flags |= B_META; 1042 1043 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) { 1044 /* 1045 * then we are going to end up 1046 * with a page that we can't complete (the file size wasn't a multiple 1047 * of PAGE_SIZE and we're trying to read to the end of the file 1048 * so we'll go ahead and zero out the portion of the page we can't 1049 * read in from the file 1050 */ 1051 zero_offset = upl_offset + non_rounded_size; 1052 } 1053 while (size) { 1054 daddr64_t blkno; 1055 daddr64_t lblkno; 1056 u_int io_size_wanted; 1057 size_t io_size_tmp; 1058 1059 if (size > max_iosize) 1060 io_size = max_iosize; 1061 else 1062 io_size = size; 1063 1064 io_size_wanted = io_size; 1065 io_size_tmp = (size_t)io_size; 1066 1067 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) 1068 break; 1069 1070 if (io_size_tmp > io_size_wanted) 1071 io_size = io_size_wanted; 1072 else 1073 io_size = (u_int)io_size_tmp; 1074 1075 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) 1076 real_bp->b_blkno = blkno; 1077 1078 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE, 1079 (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0); 1080 1081 if (io_size == 0) { 1082 /* 1083 * vnop_blockmap didn't return an error... however, it did 1084 * return an extent size of 0 which means we can't 1085 * make forward progress on this I/O... 
a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_msync, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t last_cbp;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released just because all of the current
				 * I/O linked to this transaction has completed...
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				/*
				 * we've got a transaction that
				 * includes the page we're about to push out through vnode_pageout...
				 * find the last bp in the list which will be the one that
				 * includes the head of this page and round its iosize down
				 * to a page boundary...
				 */
				for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
					last_cbp = cbp;

				cbp->b_bcount &= ~PAGE_MASK;

				if (cbp->b_bcount == 0) {
					/*
					 * this buf no longer has any I/O associated with it
					 */
					free_io_buf(cbp);

					if (cbp == cbp_head) {
						/*
						 * the buf we just freed was the only buf in
						 * this transaction... so there's no I/O to do
						 */
						cbp_head = NULL;
					} else {
						/*
						 * remove the buf we just freed from
						 * the transaction list
						 */
						last_cbp->b_trans_next = NULL;
						cbp_tail = last_cbp;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0)
					flags &= ~CL_COMMIT;
				break;
			}
			continue;
		}
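		/*
		 * A worked example (hypothetical offsets, assuming PAGE_SIZE == 4096)
		 * of the hole-handling arithmetic just above: after pushing the single
		 * page through vnode_pageout(), the request is advanced to the next
		 * page boundary.
		 */
#if 0
		f_offset = 0x2600;			/* current file offset, mid-page */
		e_offset = round_page_64(f_offset + 1);	/* 0x3000 */
		io_size = e_offset - f_offset;		/* 0xA00: the rest of this page is consumed */
#endif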
		lblkno = (daddr64_t)(f_offset / 0x1000);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t  commit_offset;
			int bytes_to_zero;
			int complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO))
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int	pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if it's partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size)
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					else
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if it's partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size)
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				else
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;

				commit_offset = upl_offset & ~PAGE_MASK;
			}
			if ( (flags & CL_COMMIT) && pg_count) {
				ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
						     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
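			/*
			 * A worked example (hypothetical offsets, PAGE_SIZE == 4096 assumed)
			 * of the partial-first-page handoff above when an I/O chain is
			 * already pending:
			 */
#if 0
			upl_offset = 0x2600;					/* zero fill starts mid-page */
			commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;	/* 0x3000 */
			pg_resid = commit_offset - upl_offset;			/* 0xA00, committed later by the pending I/O's completion */
			/* only the fully zero'd pages from 0x3000 on are committed here */
#endif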
			upl_offset += io_size;
			f_offset += io_size;
			size -= io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && (complete_transaction_now || size == 0)) {
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				cluster_EOT(cbp_head, cbp_tail, size == 0 ? 
zero_offset : 0); 1362 1363 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0); 1364 1365 trans_count = 0; 1366 } 1367 continue; 1368 } 1369 if (pg_count > max_vectors) { 1370 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) { 1371 io_size = PAGE_SIZE - pg_offset; 1372 pg_count = 1; 1373 } else { 1374 io_size -= (pg_count - max_vectors) * PAGE_SIZE; 1375 pg_count = max_vectors; 1376 } 1377 } 1378 /* 1379 * If the transaction is going to reach the maximum number of 1380 * desired elements, truncate the i/o to the nearest page so 1381 * that the actual i/o is initiated after this buffer is 1382 * created and added to the i/o chain. 1383 * 1384 * I/O directed to physically contiguous memory 1385 * doesn't have a requirement to make sure we 'fill' a page 1386 */ 1387 if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count && 1388 ((upl_offset + io_size) & PAGE_MASK)) { 1389 vm_offset_t aligned_ofs; 1390 1391 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK; 1392 /* 1393 * If the io_size does not actually finish off even a 1394 * single page we have to keep adding buffers to the 1395 * transaction despite having reached the desired limit. 1396 * 1397 * Eventually we get here with the page being finished 1398 * off (and exceeded) and then we truncate the size of 1399 * this i/o request so that it is page aligned so that 1400 * we can finally issue the i/o on the transaction. 1401 */ 1402 if (aligned_ofs > upl_offset) { 1403 io_size = aligned_ofs - upl_offset; 1404 pg_count--; 1405 } 1406 } 1407 1408 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) 1409 /* 1410 * if we're not targeting a virtual device i.e. a disk image 1411 * it's safe to dip into the reserve pool since real devices 1412 * can complete this I/O request without requiring additional 1413 * bufs from the alloc_io_buf pool 1414 */ 1415 priv = 1; 1416 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT)) 1417 /* 1418 * Throttle the speculative IO 1419 */ 1420 priv = 0; 1421 else 1422 priv = 1; 1423 1424 cbp = alloc_io_buf(vp, priv); 1425 1426 if (flags & CL_PAGEOUT) { 1427 u_int i; 1428 1429 for (i = 0; i < pg_count; i++) { 1430 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) 1431 panic("BUSY bp found in cluster_io"); 1432 } 1433 } 1434 if (flags & CL_ASYNC) { 1435 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) 1436 panic("buf_setcallback failed\n"); 1437 } 1438 cbp->b_cliodone = (void *)callback; 1439 cbp->b_flags |= io_flags; 1440 if (flags & CL_NOCACHE) 1441 cbp->b_attr.ba_flags |= BA_NOCACHE; 1442 1443 cbp->b_lblkno = lblkno; 1444 cbp->b_blkno = blkno; 1445 cbp->b_bcount = io_size; 1446 1447 if (buf_setupl(cbp, upl, upl_offset)) 1448 panic("buf_setupl failed\n"); 1449#if CONFIG_IOSCHED 1450 upl_set_blkno(upl, upl_offset, io_size, blkno); 1451#endif 1452 cbp->b_trans_next = (buf_t)NULL; 1453 1454 if ((cbp->b_iostate = (void *)iostate)) 1455 /* 1456 * caller wants to track the state of this 1457 * io... 
bump the amount issued against this stream 1458 */ 1459 iostate->io_issued += io_size; 1460 1461 if (flags & CL_READ) { 1462 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, 1463 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); 1464 } 1465 else { 1466 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE, 1467 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); 1468 } 1469 1470 if (cbp_head) { 1471 cbp_tail->b_trans_next = cbp; 1472 cbp_tail = cbp; 1473 } else { 1474 cbp_head = cbp; 1475 cbp_tail = cbp; 1476 1477 if ( (cbp_head->b_real_bp = real_bp) ) 1478 real_bp = (buf_t)NULL; 1479 } 1480 *(buf_t *)(&cbp->b_trans_head) = cbp_head; 1481 1482 trans_count++; 1483 1484 upl_offset += io_size; 1485 f_offset += io_size; 1486 size -= io_size; 1487 /* 1488 * keep track of how much of the original request 1489 * that we've actually completed... non_rounded_size 1490 * may go negative due to us rounding the request 1491 * to a page size multiple (i.e. size > non_rounded_size) 1492 */ 1493 non_rounded_size -= io_size; 1494 1495 if (non_rounded_size <= 0) { 1496 /* 1497 * we've transferred all of the data in the original 1498 * request, but we were unable to complete the tail 1499 * of the last page because the file didn't have 1500 * an allocation to back that portion... this is ok. 1501 */ 1502 size = 0; 1503 } 1504 if (size == 0) { 1505 /* 1506 * we have no more I/O to issue, so go 1507 * finish the final transaction 1508 */ 1509 need_EOT = TRUE; 1510 } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) && 1511 ((flags & CL_ASYNC) || trans_count > max_trans_count) ) { 1512 /* 1513 * I/O directed to physically contiguous memory... 1514 * which doesn't have a requirement to make sure we 'fill' a page 1515 * or... 1516 * the current I/O we've prepared fully 1517 * completes the last page in this request 1518 * and ... 1519 * it's either an ASYNC request or 1520 * we've already accumulated more than 8 I/O's into 1521 * this transaction so mark it as complete so that 1522 * it can finish asynchronously or via the cluster_complete_transaction 1523 * below if the request is synchronous 1524 */ 1525 need_EOT = TRUE; 1526 } 1527 if (need_EOT == TRUE) 1528 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0); 1529 1530 if (flags & CL_THROTTLE) 1531 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io"); 1532 1533 if ( !(io_flags & B_READ)) 1534 vnode_startwrite(vp); 1535 1536 if (flags & CL_RAW_ENCRYPTED) { 1537 /* 1538 * User requested raw encrypted bytes. 1539 * Twiddle the bit in the ba_flags for the buffer 1540 */ 1541 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO; 1542 } 1543 1544 (void) VNOP_STRATEGY(cbp); 1545 1546 if (need_EOT == TRUE) { 1547 if ( !(flags & CL_ASYNC)) 1548 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1); 1549 1550 need_EOT = FALSE; 1551 trans_count = 0; 1552 cbp_head = NULL; 1553 } 1554 } 1555 if (error) { 1556 int abort_size; 1557 1558 io_size = 0; 1559 1560 if (cbp_head) { 1561 /* 1562 * first wait until all of the outstanding I/O 1563 * for this partial transaction has completed 1564 */ 1565 cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); 1566 1567 /* 1568 * Rewind the upl offset to the beginning of the 1569 * transaction. 
1570 */ 1571 upl_offset = cbp_head->b_uploffset; 1572 1573 for (cbp = cbp_head; cbp;) { 1574 buf_t cbp_next; 1575 1576 size += cbp->b_bcount; 1577 io_size += cbp->b_bcount; 1578 1579 cbp_next = cbp->b_trans_next; 1580 free_io_buf(cbp); 1581 cbp = cbp_next; 1582 } 1583 } 1584 if (iostate) { 1585 int need_wakeup = 0; 1586 1587 /* 1588 * update the error condition for this stream 1589 * since we never really issued the io 1590 * just go ahead and adjust it back 1591 */ 1592 lck_mtx_lock_spin(&iostate->io_mtxp); 1593 1594 if (iostate->io_error == 0) 1595 iostate->io_error = error; 1596 iostate->io_issued -= io_size; 1597 1598 if (iostate->io_wanted) { 1599 /* 1600 * someone is waiting for the state of 1601 * this io stream to change 1602 */ 1603 iostate->io_wanted = 0; 1604 need_wakeup = 1; 1605 } 1606 lck_mtx_unlock(&iostate->io_mtxp); 1607 1608 if (need_wakeup) 1609 wakeup((caddr_t)&iostate->io_wanted); 1610 } 1611 if (flags & CL_COMMIT) { 1612 int upl_flags; 1613 1614 pg_offset = upl_offset & PAGE_MASK; 1615 abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK; 1616 1617 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp); 1618 1619 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE, 1620 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0); 1621 } 1622 if (retval == 0) 1623 retval = error; 1624 } else if (cbp_head) 1625 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__); 1626 1627 if (real_bp) { 1628 /* 1629 * can get here if we either encountered an error 1630 * or we completely zero-filled the request and 1631 * no I/O was issued 1632 */ 1633 if (error) { 1634 real_bp->b_flags |= B_ERROR; 1635 real_bp->b_error = error; 1636 } 1637 buf_biodone(real_bp); 1638 } 1639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0); 1640 1641 return (retval); 1642} 1643 1644#define reset_vector_run_state() \ 1645 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0; 1646 1647static int 1648vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize, 1649 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg) 1650{ 1651 vector_upl_set_pagelist(vector_upl); 1652 1653 if(io_flag & CL_READ) { 1654 if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0)) 1655 io_flag &= ~CL_PRESERVE; /*don't zero fill*/ 1656 else 1657 io_flag |= CL_PRESERVE; /*zero fill*/ 1658 } 1659 return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg)); 1660 1661} 1662 1663static int 1664cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag) 1665{ 1666 int pages_in_prefetch; 1667 1668 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START, 1669 (int)f_offset, size, (int)filesize, 0, 0); 1670 1671 if (f_offset >= filesize) { 1672 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END, 1673 (int)f_offset, 0, 0, 0, 0); 1674 return(0); 1675 } 1676 if ((off_t)size > (filesize - f_offset)) 1677 size = filesize - f_offset; 1678 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE; 1679 1680 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag); 1681 1682 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END, 1683 (int)f_offset + size, pages_in_prefetch, 0, 1, 
0); 1684 1685 return (pages_in_prefetch); 1686} 1687 1688 1689 1690static void 1691cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg, 1692 int bflag) 1693{ 1694 daddr64_t r_addr; 1695 off_t f_offset; 1696 int size_of_prefetch; 1697 u_int max_prefetch; 1698 1699 1700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START, 1701 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0); 1702 1703 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) { 1704 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1705 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0); 1706 return; 1707 } 1708 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) { 1709 rap->cl_ralen = 0; 1710 rap->cl_maxra = 0; 1711 1712 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1713 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0); 1714 1715 return; 1716 } 1717 max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); 1718 1719 if (max_prefetch > speculative_prefetch_max) 1720 max_prefetch = speculative_prefetch_max; 1721 1722 if (max_prefetch <= PAGE_SIZE) { 1723 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1724 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); 1725 return; 1726 } 1727 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) { 1728 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) { 1729 1730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1731 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0); 1732 return; 1733 } 1734 } 1735 r_addr = max(extent->e_addr, rap->cl_maxra) + 1; 1736 f_offset = (off_t)(r_addr * PAGE_SIZE_64); 1737 1738 size_of_prefetch = 0; 1739 1740 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch); 1741 1742 if (size_of_prefetch) { 1743 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1744 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0); 1745 return; 1746 } 1747 if (f_offset < filesize) { 1748 daddr64_t read_size; 1749 1750 rap->cl_ralen = rap->cl_ralen ? 
min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1; 1751 1752 read_size = (extent->e_addr + 1) - extent->b_addr; 1753 1754 if (read_size > rap->cl_ralen) { 1755 if (read_size > max_prefetch / PAGE_SIZE) 1756 rap->cl_ralen = max_prefetch / PAGE_SIZE; 1757 else 1758 rap->cl_ralen = read_size; 1759 } 1760 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag); 1761 1762 if (size_of_prefetch) 1763 rap->cl_maxra = (r_addr + size_of_prefetch) - 1; 1764 } 1765 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, 1766 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0); 1767} 1768 1769 1770int 1771cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, 1772 int size, off_t filesize, int flags) 1773{ 1774 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); 1775 1776} 1777 1778 1779int 1780cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, 1781 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 1782{ 1783 int io_size; 1784 int rounded_size; 1785 off_t max_size; 1786 int local_flags; 1787 1788 local_flags = CL_PAGEOUT | CL_THROTTLE; 1789 1790 if ((flags & UPL_IOSYNC) == 0) 1791 local_flags |= CL_ASYNC; 1792 if ((flags & UPL_NOCOMMIT) == 0) 1793 local_flags |= CL_COMMIT; 1794 if ((flags & UPL_KEEPCACHED)) 1795 local_flags |= CL_KEEPCACHED; 1796 if (flags & UPL_PAGING_ENCRYPTED) 1797 local_flags |= CL_ENCRYPTED; 1798 1799 1800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, 1801 (int)f_offset, size, (int)filesize, local_flags, 0); 1802 1803 /* 1804 * If they didn't specify any I/O, then we are done... 1805 * we can't issue an abort because we don't know how 1806 * big the upl really is 1807 */ 1808 if (size <= 0) 1809 return (EINVAL); 1810 1811 if (vp->v_mount->mnt_flag & MNT_RDONLY) { 1812 if (local_flags & CL_COMMIT) 1813 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY); 1814 return (EROFS); 1815 } 1816 /* 1817 * can't page-in from a negative offset 1818 * or if we're starting beyond the EOF 1819 * or if the file offset isn't page aligned 1820 * or the size requested isn't a multiple of PAGE_SIZE 1821 */ 1822 if (f_offset < 0 || f_offset >= filesize || 1823 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) { 1824 if (local_flags & CL_COMMIT) 1825 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY); 1826 return (EINVAL); 1827 } 1828 max_size = filesize - f_offset; 1829 1830 if (size < max_size) 1831 io_size = size; 1832 else 1833 io_size = max_size; 1834 1835 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 1836 1837 if (size > rounded_size) { 1838 if (local_flags & CL_COMMIT) 1839 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size, 1840 UPL_ABORT_FREE_ON_EMPTY); 1841 } 1842 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, 1843 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg)); 1844} 1845 1846 1847int 1848cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, 1849 int size, off_t filesize, int flags) 1850{ 1851 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); 1852} 1853 1854 1855int 1856cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, 1857 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 1858{ 1859 u_int io_size; 1860 int rounded_size; 1861 
off_t max_size; 1862 int retval; 1863 int local_flags = 0; 1864 1865 if (upl == NULL || size < 0) 1866 panic("cluster_pagein: NULL upl passed in"); 1867 1868 if ((flags & UPL_IOSYNC) == 0) 1869 local_flags |= CL_ASYNC; 1870 if ((flags & UPL_NOCOMMIT) == 0) 1871 local_flags |= CL_COMMIT; 1872 if (flags & UPL_IOSTREAMING) 1873 local_flags |= CL_IOSTREAMING; 1874 if (flags & UPL_PAGING_ENCRYPTED) 1875 local_flags |= CL_ENCRYPTED; 1876 1877 1878 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, 1879 (int)f_offset, size, (int)filesize, local_flags, 0); 1880 1881 /* 1882 * can't page-in from a negative offset 1883 * or if we're starting beyond the EOF 1884 * or if the file offset isn't page aligned 1885 * or the size requested isn't a multiple of PAGE_SIZE 1886 */ 1887 if (f_offset < 0 || f_offset >= filesize || 1888 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) { 1889 if (local_flags & CL_COMMIT) 1890 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1891 return (EINVAL); 1892 } 1893 max_size = filesize - f_offset; 1894 1895 if (size < max_size) 1896 io_size = size; 1897 else 1898 io_size = max_size; 1899 1900 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 1901 1902 if (size > rounded_size && (local_flags & CL_COMMIT)) 1903 ubc_upl_abort_range(upl, upl_offset + rounded_size, 1904 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); 1905 1906 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, 1907 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 1908 1909 return (retval); 1910} 1911 1912 1913int 1914cluster_bp(buf_t bp) 1915{ 1916 return cluster_bp_ext(bp, NULL, NULL); 1917} 1918 1919 1920int 1921cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg) 1922{ 1923 off_t f_offset; 1924 int flags; 1925 1926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START, 1927 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); 1928 1929 if (bp->b_flags & B_READ) 1930 flags = CL_ASYNC | CL_READ; 1931 else 1932 flags = CL_ASYNC; 1933 if (bp->b_flags & B_PASSIVE) 1934 flags |= CL_PASSIVE; 1935 1936 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); 1937 1938 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg)); 1939} 1940 1941 1942 1943int 1944cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags) 1945{ 1946 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL); 1947} 1948 1949 1950int 1951cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, 1952 int xflags, int (*callback)(buf_t, void *), void *callback_arg) 1953{ 1954 user_ssize_t cur_resid; 1955 int retval = 0; 1956 int flags; 1957 int zflags; 1958 int bflag; 1959 int write_type = IO_COPY; 1960 u_int32_t write_length; 1961 1962 flags = xflags; 1963 1964 if (flags & IO_PASSIVE) 1965 bflag = CL_PASSIVE; 1966 else 1967 bflag = 0; 1968 1969 if (vp->v_flag & VNOCACHE_DATA){ 1970 flags |= IO_NOCACHE; 1971 bflag |= CL_NOCACHE; 1972 } 1973 if (uio == NULL) { 1974 /* 1975 * no user data... 
1976 * this call is being made to zero-fill some range in the file 1977 */ 1978 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg); 1979 1980 return(retval); 1981 } 1982 /* 1983 * do a write through the cache if one of the following is true.... 1984 * NOCACHE is not true or NODIRECT is true 1985 * the uio request doesn't target USERSPACE 1986 * otherwise, find out if we want the direct or contig variant for 1987 * the first vector in the uio request 1988 */ 1989 if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) 1990 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 1991 1992 if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) 1993 /* 1994 * must go through the cached variant in this case 1995 */ 1996 write_type = IO_COPY; 1997 1998 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) { 1999 2000 switch (write_type) { 2001 2002 case IO_COPY: 2003 /* 2004 * make sure the uio_resid isn't too big... 2005 * internally, we want to handle all of the I/O in 2006 * chunk sizes that fit in a 32 bit int 2007 */ 2008 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) { 2009 /* 2010 * we're going to have to call cluster_write_copy 2011 * more than once... 2012 * 2013 * only want the last call to cluster_write_copy to 2014 * have the IO_TAILZEROFILL flag set and only the 2015 * first call should have IO_HEADZEROFILL 2016 */ 2017 zflags = flags & ~IO_TAILZEROFILL; 2018 flags &= ~IO_HEADZEROFILL; 2019 2020 write_length = MAX_IO_REQUEST_SIZE; 2021 } else { 2022 /* 2023 * last call to cluster_write_copy 2024 */ 2025 zflags = flags; 2026 2027 write_length = (u_int32_t)cur_resid; 2028 } 2029 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg); 2030 break; 2031 2032 case IO_CONTIG: 2033 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL); 2034 2035 if (flags & IO_HEADZEROFILL) { 2036 /* 2037 * only do this once per request 2038 */ 2039 flags &= ~IO_HEADZEROFILL; 2040 2041 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset, 2042 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 2043 if (retval) 2044 break; 2045 } 2046 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag); 2047 2048 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) { 2049 /* 2050 * we're done with the data from the user specified buffer(s) 2051 * and we've been requested to zero fill at the tail 2052 * treat this as an IO_HEADZEROFILL which doesn't require a uio 2053 * by rearranging the args and passing in IO_HEADZEROFILL 2054 */ 2055 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset, 2056 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); 2057 } 2058 break; 2059 2060 case IO_DIRECT: 2061 /* 2062 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL 2063 */ 2064 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg); 2065 break; 2066 2067 case IO_UNKNOWN: 2068 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); 2069 break; 2070 } 2071 /* 2072 * in case we end up calling cluster_write_copy (from cluster_write_direct) 2073 * multiple times to service a 
multi-vector request that is not aligned properly 2074 * we need to update the oldEOF so that we 2075 * don't zero-fill the head of a page if we've successfully written 2076 * data to that area... 'cluster_write_copy' will zero-fill the head of a 2077 * page that is beyond the oldEOF if the write is unaligned... we only 2078 * want that to happen for the very first page of the cluster_write, 2079 * NOT the first page of each vector making up a multi-vector write. 2080 */ 2081 if (uio->uio_offset > oldEOF) 2082 oldEOF = uio->uio_offset; 2083 } 2084 return (retval); 2085} 2086 2087 2088static int 2089cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length, 2090 int flags, int (*callback)(buf_t, void *), void *callback_arg) 2091{ 2092 upl_t upl; 2093 upl_page_info_t *pl; 2094 vm_offset_t upl_offset; 2095 vm_offset_t vector_upl_offset = 0; 2096 u_int32_t io_req_size; 2097 u_int32_t offset_in_file; 2098 u_int32_t offset_in_iovbase; 2099 u_int32_t io_size; 2100 int io_flag = 0; 2101 upl_size_t upl_size, vector_upl_size = 0; 2102 vm_size_t upl_needed_size; 2103 mach_msg_type_number_t pages_in_pl; 2104 int upl_flags; 2105 kern_return_t kret; 2106 mach_msg_type_number_t i; 2107 int force_data_sync; 2108 int retval = 0; 2109 int first_IO = 1; 2110 struct clios iostate; 2111 user_addr_t iov_base; 2112 u_int32_t mem_alignment_mask; 2113 u_int32_t devblocksize; 2114 u_int32_t max_io_size; 2115 u_int32_t max_upl_size; 2116 u_int32_t max_vector_size; 2117 boolean_t io_throttled = FALSE; 2118 2119 u_int32_t vector_upl_iosize = 0; 2120 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); 2121 off_t v_upl_uio_offset = 0; 2122 int vector_upl_index=0; 2123 upl_t vector_upl = NULL; 2124 2125 2126 /* 2127 * When we enter this routine, we know 2128 * -- the resid will not exceed iov_len 2129 */ 2130 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, 2131 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 2132 2133 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 2134 2135 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO; 2136 2137 if (flags & IO_PASSIVE) 2138 io_flag |= CL_PASSIVE; 2139 2140 if (flags & IO_NOCACHE) 2141 io_flag |= CL_NOCACHE; 2142 2143 if (flags & IO_SKIP_ENCRYPTION) 2144 io_flag |= CL_ENCRYPTED; 2145 2146 iostate.io_completed = 0; 2147 iostate.io_issued = 0; 2148 iostate.io_error = 0; 2149 iostate.io_wanted = 0; 2150 2151 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 2152 2153 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 2154 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 2155 2156 if (devblocksize == 1) { 2157 /* 2158 * the AFP client advertises a devblocksize of 1 2159 * however, its BLOCKMAP routine maps to physical 2160 * blocks that are PAGE_SIZE in size... 2161 * therefore we can't ask for I/Os that aren't page aligned 2162 * or aren't multiples of PAGE_SIZE in size 2163 * by setting devblocksize to PAGE_SIZE, we re-instate 2164 * the old behavior we had before the mem_alignment_mask 2165 * changes went in... 
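 * (in effect, once devblocksize is treated as PAGE_SIZE, the (iov_base & (devblocksize - 1)) check below only lets page-aligned user buffers take the direct path... anything that fails it drops to wait_for_dwrites and gets serviced by the cached copy path instead)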
2166 */ 2167 devblocksize = PAGE_SIZE; 2168 } 2169 2170next_dwrite: 2171 io_req_size = *write_length; 2172 iov_base = uio_curriovbase(uio); 2173 2174 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK; 2175 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 2176 2177 if (offset_in_file || offset_in_iovbase) { 2178 /* 2179 * one of the 2 important offsets is misaligned 2180 * so fire an I/O through the cache for this entire vector 2181 */ 2182 goto wait_for_dwrites; 2183 } 2184 if (iov_base & (devblocksize - 1)) { 2185 /* 2186 * the offset in memory must be on a device block boundary 2187 * so that we can guarantee that we can generate an 2188 * I/O that ends on a page boundary in cluster_io 2189 */ 2190 goto wait_for_dwrites; 2191 } 2192 2193 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { 2194 int throttle_type; 2195 2196 if ( (throttle_type = cluster_is_throttled(vp)) ) { 2197 /* 2198 * we're in the throttle window, at the very least 2199 * we want to limit the size of the I/O we're about 2200 * to issue 2201 */ 2202 if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) { 2203 /* 2204 * we're in the throttle window and at least 1 I/O 2205 * has already been issued by a throttleable thread 2206 * in this window, so return with EAGAIN to indicate 2207 * to the FS issuing the cluster_write call that it 2208 * should now throttle after dropping any locks 2209 */ 2210 throttle_info_update_by_mount(vp->v_mount); 2211 2212 io_throttled = TRUE; 2213 goto wait_for_dwrites; 2214 } 2215 max_vector_size = THROTTLE_MAX_IOSIZE; 2216 max_io_size = THROTTLE_MAX_IOSIZE; 2217 } else { 2218 max_vector_size = MAX_VECTOR_UPL_SIZE; 2219 max_io_size = max_upl_size; 2220 } 2221 2222 if (first_IO) { 2223 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0); 2224 first_IO = 0; 2225 } 2226 io_size = io_req_size & ~PAGE_MASK; 2227 iov_base = uio_curriovbase(uio); 2228 2229 if (io_size > max_io_size) 2230 io_size = max_io_size; 2231 2232 if(useVectorUPL && (iov_base & PAGE_MASK)) { 2233 /* 2234 * We have an iov_base that's not page-aligned. 2235 * Issue all I/O's that have been collected within 2236 * this Vectored UPL. 2237 */ 2238 if(vector_upl_index) { 2239 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2240 reset_vector_run_state(); 2241 } 2242 2243 /* 2244 * After this point, if we are using the Vector UPL path and the base is 2245 * not page-aligned then the UPL with that base will be the first in the vector UPL. 
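 * (the I/Os collected so far were just issued above via vector_cluster_io and the run state reset, so a UPL with an unaligned base always begins a fresh vector)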
2246 */ 2247 } 2248 2249 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 2250 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 2251 2252 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, 2253 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 2254 2255 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 2256 pages_in_pl = 0; 2257 upl_size = upl_needed_size; 2258 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2259 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2260 2261 kret = vm_map_get_upl(current_map(), 2262 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2263 &upl_size, 2264 &upl, 2265 NULL, 2266 &pages_in_pl, 2267 &upl_flags, 2268 force_data_sync); 2269 2270 if (kret != KERN_SUCCESS) { 2271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2272 0, 0, 0, kret, 0); 2273 /* 2274 * failed to get pagelist 2275 * 2276 * we may have already spun some portion of this request 2277 * off as async requests... we need to wait for the I/O 2278 * to complete before returning 2279 */ 2280 goto wait_for_dwrites; 2281 } 2282 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 2283 pages_in_pl = upl_size / PAGE_SIZE; 2284 2285 for (i = 0; i < pages_in_pl; i++) { 2286 if (!upl_valid_page(pl, i)) 2287 break; 2288 } 2289 if (i == pages_in_pl) 2290 break; 2291 2292 /* 2293 * didn't get all the pages back that we 2294 * needed... release this upl and try again 2295 */ 2296 ubc_upl_abort(upl, 0); 2297 } 2298 if (force_data_sync >= 3) { 2299 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2300 i, pages_in_pl, upl_size, kret, 0); 2301 /* 2302 * for some reason, we couldn't acquire a hold on all 2303 * the pages needed in the user's address space 2304 * 2305 * we may have already spun some portion of this request 2306 * off as async requests... we need to wait for the I/O 2307 * to complete before returning 2308 */ 2309 goto wait_for_dwrites; 2310 } 2311 2312 /* 2313 * Consider the possibility that upl_size wasn't satisfied. 2314 */ 2315 if (upl_size < upl_needed_size) { 2316 if (upl_size && upl_offset == 0) 2317 io_size = upl_size; 2318 else 2319 io_size = 0; 2320 } 2321 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, 2322 (int)upl_offset, upl_size, (int)iov_base, io_size, 0); 2323 2324 if (io_size == 0) { 2325 ubc_upl_abort(upl, 0); 2326 /* 2327 * we may have already spun some portion of this request 2328 * off as async requests... we need to wait for the I/O 2329 * to complete before returning 2330 */ 2331 goto wait_for_dwrites; 2332 } 2333 2334 if(useVectorUPL) { 2335 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); 2336 if(end_off) 2337 issueVectorUPL = 1; 2338 /* 2339 * After this point, if we are using a vector UPL, then 2340 * either all the UPL elements end on a page boundary OR 2341 * this UPL is the last element because it does not end 2342 * on a page boundary. 2343 */ 2344 } 2345 2346 /* 2347 * Now look for pages already in the cache 2348 * and throw them away. 
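 * (UPL_ROP_DUMP discards any resident pages in the range... since this write bypasses the cache, leaving them in place would let stale cached data shadow what we're about to write directly to disk)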
2349 * uio->uio_offset is page aligned within the file 2350 * io_size is a multiple of PAGE_SIZE 2351 */ 2352 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL); 2353 2354 /* 2355 * we want push out these writes asynchronously so that we can overlap 2356 * the preparation of the next I/O 2357 * if there are already too many outstanding writes 2358 * wait until some complete before issuing the next 2359 */ 2360 cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); 2361 2362 if (iostate.io_error) { 2363 /* 2364 * one of the earlier writes we issued ran into a hard error 2365 * don't issue any more writes, cleanup the UPL 2366 * that was just created but not used, then 2367 * go wait for all writes that are part of this stream 2368 * to complete before returning the error to the caller 2369 */ 2370 ubc_upl_abort(upl, 0); 2371 2372 goto wait_for_dwrites; 2373 } 2374 2375 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, 2376 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); 2377 2378 if(!useVectorUPL) 2379 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, 2380 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2381 2382 else { 2383 if(!vector_upl_index) { 2384 vector_upl = vector_upl_create(upl_offset); 2385 v_upl_uio_offset = uio->uio_offset; 2386 vector_upl_offset = upl_offset; 2387 } 2388 2389 vector_upl_set_subupl(vector_upl,upl,upl_size); 2390 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); 2391 vector_upl_index++; 2392 vector_upl_iosize += io_size; 2393 vector_upl_size += upl_size; 2394 2395 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { 2396 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2397 reset_vector_run_state(); 2398 } 2399 } 2400 2401 /* 2402 * update the uio structure to 2403 * reflect the I/O that we just issued 2404 */ 2405 uio_update(uio, (user_size_t)io_size); 2406 2407 /* 2408 * in case we end up calling through to cluster_write_copy to finish 2409 * the tail of this request, we need to update the oldEOF so that we 2410 * don't zero-fill the head of a page if we've successfully written 2411 * data to that area... 'cluster_write_copy' will zero-fill the head of a 2412 * page that is beyond the oldEOF if the write is unaligned... we only 2413 * want that to happen for the very first page of the cluster_write, 2414 * NOT the first page of each vector making up a multi-vector write. 
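 * e.g. if a vector ends part way into a page we've just written, bumping oldEOF below keeps a later pass through cluster_write_copy from treating the head of that page as beyond EOF and zero-filling over data we already wrote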
2415 */ 2416 if (uio->uio_offset > oldEOF) 2417 oldEOF = uio->uio_offset; 2418 2419 io_req_size -= io_size; 2420 2421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, 2422 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0); 2423 2424 } /* end while */ 2425 2426 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) { 2427 2428 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE); 2429 2430 if (retval == 0 && *write_type == IO_DIRECT) { 2431 2432 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE, 2433 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); 2434 2435 goto next_dwrite; 2436 } 2437 } 2438 2439wait_for_dwrites: 2440 2441 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { 2442 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 2443 reset_vector_run_state(); 2444 } 2445 /* 2446 * make sure all async writes issued as part of this stream 2447 * have completed before we return 2448 */ 2449 cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); 2450 2451 if (iostate.io_error) 2452 retval = iostate.io_error; 2453 2454 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 2455 2456 if (io_throttled == TRUE && retval == 0) 2457 retval = EAGAIN; 2458 2459 if (io_req_size && retval == 0) { 2460 /* 2461 * we couldn't handle the tail of this request in DIRECT mode 2462 * so fire it through the copy path 2463 * 2464 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set 2465 * so we can just pass 0 in for the headOff and tailOff 2466 */ 2467 if (uio->uio_offset > oldEOF) 2468 oldEOF = uio->uio_offset; 2469 2470 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg); 2471 2472 *write_type = IO_UNKNOWN; 2473 } 2474 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, 2475 (int)uio->uio_offset, io_req_size, retval, 4, 0); 2476 2477 return (retval); 2478} 2479 2480 2481static int 2482cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length, 2483 int (*callback)(buf_t, void *), void *callback_arg, int bflag) 2484{ 2485 upl_page_info_t *pl; 2486 addr64_t src_paddr = 0; 2487 upl_t upl[MAX_VECTS]; 2488 vm_offset_t upl_offset; 2489 u_int32_t tail_size = 0; 2490 u_int32_t io_size; 2491 u_int32_t xsize; 2492 upl_size_t upl_size; 2493 vm_size_t upl_needed_size; 2494 mach_msg_type_number_t pages_in_pl; 2495 int upl_flags; 2496 kern_return_t kret; 2497 struct clios iostate; 2498 int error = 0; 2499 int cur_upl = 0; 2500 int num_upl = 0; 2501 int n; 2502 user_addr_t iov_base; 2503 u_int32_t devblocksize; 2504 u_int32_t mem_alignment_mask; 2505 2506 /* 2507 * When we enter this routine, we know 2508 * -- the io_req_size will not exceed iov_len 2509 * -- the target address is physically contiguous 2510 */ 2511 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? 
PUSH_SYNC : 0); 2512 2513 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 2514 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 2515 2516 iostate.io_completed = 0; 2517 iostate.io_issued = 0; 2518 iostate.io_error = 0; 2519 iostate.io_wanted = 0; 2520 2521 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 2522 2523next_cwrite: 2524 io_size = *write_length; 2525 2526 iov_base = uio_curriovbase(uio); 2527 2528 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 2529 upl_needed_size = upl_offset + io_size; 2530 2531 pages_in_pl = 0; 2532 upl_size = upl_needed_size; 2533 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | 2534 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 2535 2536 kret = vm_map_get_upl(current_map(), 2537 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 2538 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 2539 2540 if (kret != KERN_SUCCESS) { 2541 /* 2542 * failed to get pagelist 2543 */ 2544 error = EINVAL; 2545 goto wait_for_cwrites; 2546 } 2547 num_upl++; 2548 2549 /* 2550 * Consider the possibility that upl_size wasn't satisfied. 2551 */ 2552 if (upl_size < upl_needed_size) { 2553 /* 2554 * This is a failure in the physical memory case. 2555 */ 2556 error = EINVAL; 2557 goto wait_for_cwrites; 2558 } 2559 pl = ubc_upl_pageinfo(upl[cur_upl]); 2560 2561 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; 2562 2563 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 2564 u_int32_t head_size; 2565 2566 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 2567 2568 if (head_size > io_size) 2569 head_size = io_size; 2570 2571 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg); 2572 2573 if (error) 2574 goto wait_for_cwrites; 2575 2576 upl_offset += head_size; 2577 src_paddr += head_size; 2578 io_size -= head_size; 2579 2580 iov_base += head_size; 2581 } 2582 if ((u_int32_t)iov_base & mem_alignment_mask) { 2583 /* 2584 * request doesn't set up on a memory boundary 2585 * the underlying DMA engine can handle... 2586 * return an error instead of going through 2587 * the slow copy path since the intent of this 2588 * path is direct I/O from device memory 2589 */ 2590 error = EINVAL; 2591 goto wait_for_cwrites; 2592 } 2593 2594 tail_size = io_size & (devblocksize - 1); 2595 io_size -= tail_size; 2596 2597 while (io_size && error == 0) { 2598 2599 if (io_size > MAX_IO_CONTIG_SIZE) 2600 xsize = MAX_IO_CONTIG_SIZE; 2601 else 2602 xsize = io_size; 2603 /* 2604 * request asynchronously so that we can overlap 2605 * the preparation of the next I/O... we'll do 2606 * the commit after all the I/O has completed 2607 * since its all issued against the same UPL 2608 * if there are already too many outstanding writes 2609 * wait until some have completed before issuing the next 2610 */ 2611 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); 2612 2613 if (iostate.io_error) { 2614 /* 2615 * one of the earlier writes we issued ran into a hard error 2616 * don't issue any more writes... 
2617 * go wait for all writes that are part of this stream 2618 * to complete before returning the error to the caller 2619 */ 2620 goto wait_for_cwrites; 2621 } 2622 /* 2623 * issue an asynchronous write to cluster_io 2624 */ 2625 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, 2626 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg); 2627 2628 if (error == 0) { 2629 /* 2630 * The cluster_io write completed successfully, 2631 * update the uio structure 2632 */ 2633 uio_update(uio, (user_size_t)xsize); 2634 2635 upl_offset += xsize; 2636 src_paddr += xsize; 2637 io_size -= xsize; 2638 } 2639 } 2640 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) { 2641 2642 error = cluster_io_type(uio, write_type, write_length, 0); 2643 2644 if (error == 0 && *write_type == IO_CONTIG) { 2645 cur_upl++; 2646 goto next_cwrite; 2647 } 2648 } else 2649 *write_type = IO_UNKNOWN; 2650 2651wait_for_cwrites: 2652 /* 2653 * make sure all async writes that are part of this stream 2654 * have completed before we proceed 2655 */ 2656 cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); 2657 2658 if (iostate.io_error) 2659 error = iostate.io_error; 2660 2661 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 2662 2663 if (error == 0 && tail_size) 2664 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); 2665 2666 for (n = 0; n < num_upl; n++) 2667 /* 2668 * just release our hold on each physically contiguous 2669 * region without changing any state 2670 */ 2671 ubc_upl_abort(upl[n], 0); 2672 2673 return (error); 2674} 2675 2676 2677/* 2678 * need to avoid a race between an msync of a range of pages dirtied via mmap 2679 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's 2680 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd 2681 * 2682 * we should never force-zero-fill pages that are already valid in the cache... 
2683 * the entire page contains valid data (either from disk, zero-filled or dirtied 2684 * via an mmap) so we can only do damage by trying to zero-fill 2685 * 2686 */ 2687static int 2688cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero) 2689{ 2690 int zero_pg_index; 2691 boolean_t need_cluster_zero = TRUE; 2692 2693 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { 2694 2695 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64)); 2696 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64); 2697 2698 if (upl_valid_page(pl, zero_pg_index)) { 2699 /* 2700 * never force zero valid pages - dirty or clean 2701 * we'll leave these in the UPL for cluster_write_copy to deal with 2702 */ 2703 need_cluster_zero = FALSE; 2704 } 2705 } 2706 if (need_cluster_zero == TRUE) 2707 cluster_zero(upl, io_offset, bytes_to_zero, NULL); 2708 2709 return (bytes_to_zero); 2710} 2711 2712 2713static int 2714cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff, 2715 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg) 2716{ 2717 upl_page_info_t *pl; 2718 upl_t upl; 2719 vm_offset_t upl_offset = 0; 2720 vm_size_t upl_size; 2721 off_t upl_f_offset; 2722 int pages_in_upl; 2723 int start_offset; 2724 int xfer_resid; 2725 int io_size; 2726 int io_offset; 2727 int bytes_to_zero; 2728 int bytes_to_move; 2729 kern_return_t kret; 2730 int retval = 0; 2731 int io_resid; 2732 long long total_size; 2733 long long zero_cnt; 2734 off_t zero_off; 2735 long long zero_cnt1; 2736 off_t zero_off1; 2737 off_t write_off = 0; 2738 int write_cnt = 0; 2739 boolean_t first_pass = FALSE; 2740 struct cl_extent cl; 2741 struct cl_writebehind *wbp; 2742 int bflag; 2743 u_int max_cluster_pgcount; 2744 u_int max_io_size; 2745 2746 if (uio) { 2747 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2748 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0); 2749 2750 io_resid = io_req_size; 2751 } else { 2752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, 2753 0, 0, (int)oldEOF, (int)newEOF, 0); 2754 2755 io_resid = 0; 2756 } 2757 if (flags & IO_PASSIVE) 2758 bflag = CL_PASSIVE; 2759 else 2760 bflag = 0; 2761 if (flags & IO_NOCACHE) 2762 bflag |= CL_NOCACHE; 2763 2764 if (flags & IO_SKIP_ENCRYPTION) 2765 bflag |= CL_ENCRYPTED; 2766 2767 zero_cnt = 0; 2768 zero_cnt1 = 0; 2769 zero_off = 0; 2770 zero_off1 = 0; 2771 2772 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 2773 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE); 2774 2775 if (flags & IO_HEADZEROFILL) { 2776 /* 2777 * some filesystems (HFS is one) don't support unallocated holes within a file... 2778 * so we zero fill the intervening space between the old EOF and the offset 2779 * where the next chunk of real data begins.... ftruncate will also use this 2780 * routine to zero fill to the new EOF when growing a file... 
in this case, the 2781 * uio structure will not be provided 2782 */ 2783 if (uio) { 2784 if (headOff < uio->uio_offset) { 2785 zero_cnt = uio->uio_offset - headOff; 2786 zero_off = headOff; 2787 } 2788 } else if (headOff < newEOF) { 2789 zero_cnt = newEOF - headOff; 2790 zero_off = headOff; 2791 } 2792 } else { 2793 if (uio && uio->uio_offset > oldEOF) { 2794 zero_off = uio->uio_offset & ~PAGE_MASK_64; 2795 2796 if (zero_off >= oldEOF) { 2797 zero_cnt = uio->uio_offset - zero_off; 2798 2799 flags |= IO_HEADZEROFILL; 2800 } 2801 } 2802 } 2803 if (flags & IO_TAILZEROFILL) { 2804 if (uio) { 2805 zero_off1 = uio->uio_offset + io_req_size; 2806 2807 if (zero_off1 < tailOff) 2808 zero_cnt1 = tailOff - zero_off1; 2809 } 2810 } else { 2811 if (uio && newEOF > oldEOF) { 2812 zero_off1 = uio->uio_offset + io_req_size; 2813 2814 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) { 2815 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64); 2816 2817 flags |= IO_TAILZEROFILL; 2818 } 2819 } 2820 } 2821 if (zero_cnt == 0 && uio == (struct uio *) 0) { 2822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, 2823 retval, 0, 0, 0, 0); 2824 return (0); 2825 } 2826 if (uio) { 2827 write_off = uio->uio_offset; 2828 write_cnt = uio_resid(uio); 2829 /* 2830 * delay updating the sequential write info 2831 * in the control block until we've obtained 2832 * the lock for it 2833 */ 2834 first_pass = TRUE; 2835 } 2836 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { 2837 /* 2838 * for this iteration of the loop, figure out where our starting point is 2839 */ 2840 if (zero_cnt) { 2841 start_offset = (int)(zero_off & PAGE_MASK_64); 2842 upl_f_offset = zero_off - start_offset; 2843 } else if (io_resid) { 2844 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2845 upl_f_offset = uio->uio_offset - start_offset; 2846 } else { 2847 start_offset = (int)(zero_off1 & PAGE_MASK_64); 2848 upl_f_offset = zero_off1 - start_offset; 2849 } 2850 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE, 2851 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0); 2852 2853 if (total_size > max_io_size) 2854 total_size = max_io_size; 2855 2856 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); 2857 2858 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) { 2859 /* 2860 * assumption... total_size <= io_resid 2861 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set 2862 */ 2863 if ((start_offset + total_size) > max_io_size) 2864 total_size = max_io_size - start_offset; 2865 xfer_resid = total_size; 2866 2867 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); 2868 2869 if (retval) 2870 break; 2871 2872 io_resid -= (total_size - xfer_resid); 2873 total_size = xfer_resid; 2874 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 2875 upl_f_offset = uio->uio_offset - start_offset; 2876 2877 if (total_size == 0) { 2878 if (start_offset) { 2879 /* 2880 * the write did not finish on a page boundary 2881 * which will leave upl_f_offset pointing to the 2882 * beginning of the last page written instead of 2883 * the page beyond it... bump it in this case 2884 * so that the cluster code records the last page 2885 * written as dirty 2886 */ 2887 upl_f_offset += PAGE_SIZE_64; 2888 } 2889 upl_size = 0; 2890 2891 goto check_cluster; 2892 } 2893 } 2894 /* 2895 * compute the size of the upl needed to encompass 2896 * the requested write... limit each call to cluster_io 2897 * to the maximum UPL size... 
cluster_io will clip if 2898 * this exceeds the maximum io_size for the device, 2899 * make sure to account for 2900 * a starting offset that's not page aligned 2901 */ 2902 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 2903 2904 if (upl_size > max_io_size) 2905 upl_size = max_io_size; 2906 2907 pages_in_upl = upl_size / PAGE_SIZE; 2908 io_size = upl_size - start_offset; 2909 2910 if ((long long)io_size > total_size) 2911 io_size = total_size; 2912 2913 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0); 2914 2915 2916 /* 2917 * Gather the pages from the buffer cache. 2918 * The UPL_WILL_MODIFY flag lets the UPL subsystem know 2919 * that we intend to modify these pages. 2920 */ 2921 kret = ubc_create_upl(vp, 2922 upl_f_offset, 2923 upl_size, 2924 &upl, 2925 &pl, 2926 UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY)); 2927 if (kret != KERN_SUCCESS) 2928 panic("cluster_write_copy: failed to get pagelist"); 2929 2930 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, 2931 upl, (int)upl_f_offset, start_offset, 0, 0); 2932 2933 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) { 2934 int read_size; 2935 2936 /* 2937 * we're starting in the middle of the first page of the upl 2938 * and the page isn't currently valid, so we're going to have 2939 * to read it in first... this is a synchronous operation 2940 */ 2941 read_size = PAGE_SIZE; 2942 2943 if ((upl_f_offset + read_size) > oldEOF) 2944 read_size = oldEOF - upl_f_offset; 2945 2946 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, 2947 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 2948 if (retval) { 2949 /* 2950 * we had an error during the read which causes us to abort 2951 * the current cluster_write request... before we do, we need 2952 * to release the rest of the pages in the upl without modifying 2953 * there state and mark the failed page in error 2954 */ 2955 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 2956 2957 if (upl_size > PAGE_SIZE) 2958 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 2959 2960 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2961 upl, 0, 0, retval, 0); 2962 break; 2963 } 2964 } 2965 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) { 2966 /* 2967 * the last offset we're writing to in this upl does not end on a page 2968 * boundary... if it's not beyond the old EOF, then we'll also need to 2969 * pre-read this page in if it isn't already valid 2970 */ 2971 upl_offset = upl_size - PAGE_SIZE; 2972 2973 if ((upl_f_offset + start_offset + io_size) < oldEOF && 2974 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) { 2975 int read_size; 2976 2977 read_size = PAGE_SIZE; 2978 2979 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) 2980 read_size = oldEOF - (upl_f_offset + upl_offset); 2981 2982 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, 2983 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 2984 if (retval) { 2985 /* 2986 * we had an error during the read which causes us to abort 2987 * the current cluster_write request... 
before we do, we 2988 * need to release the rest of the pages in the upl without 2989 * modifying there state and mark the failed page in error 2990 */ 2991 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); 2992 2993 if (upl_size > PAGE_SIZE) 2994 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 2995 2996 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 2997 upl, 0, 0, retval, 0); 2998 break; 2999 } 3000 } 3001 } 3002 xfer_resid = io_size; 3003 io_offset = start_offset; 3004 3005 while (zero_cnt && xfer_resid) { 3006 3007 if (zero_cnt < (long long)xfer_resid) 3008 bytes_to_zero = zero_cnt; 3009 else 3010 bytes_to_zero = xfer_resid; 3011 3012 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero); 3013 3014 xfer_resid -= bytes_to_zero; 3015 zero_cnt -= bytes_to_zero; 3016 zero_off += bytes_to_zero; 3017 io_offset += bytes_to_zero; 3018 } 3019 if (xfer_resid && io_resid) { 3020 u_int32_t io_requested; 3021 3022 bytes_to_move = min(io_resid, xfer_resid); 3023 io_requested = bytes_to_move; 3024 3025 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested); 3026 3027 if (retval) { 3028 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 3029 3030 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, 3031 upl, 0, 0, retval, 0); 3032 } else { 3033 io_resid -= bytes_to_move; 3034 xfer_resid -= bytes_to_move; 3035 io_offset += bytes_to_move; 3036 } 3037 } 3038 while (xfer_resid && zero_cnt1 && retval == 0) { 3039 3040 if (zero_cnt1 < (long long)xfer_resid) 3041 bytes_to_zero = zero_cnt1; 3042 else 3043 bytes_to_zero = xfer_resid; 3044 3045 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero); 3046 3047 xfer_resid -= bytes_to_zero; 3048 zero_cnt1 -= bytes_to_zero; 3049 zero_off1 += bytes_to_zero; 3050 io_offset += bytes_to_zero; 3051 } 3052 if (retval == 0) { 3053 int cl_index; 3054 int ret_cluster_try_push; 3055 3056 io_size += start_offset; 3057 3058 if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) { 3059 /* 3060 * if we're extending the file with this write 3061 * we'll zero fill the rest of the page so that 3062 * if the file gets extended again in such a way as to leave a 3063 * hole starting at this EOF, we'll have zero's in the correct spot 3064 */ 3065 cluster_zero(upl, io_size, upl_size - io_size, NULL); 3066 } 3067 /* 3068 * release the upl now if we hold one since... 3069 * 1) pages in it may be present in the sparse cluster map 3070 * and may span 2 separate buckets there... if they do and 3071 * we happen to have to flush a bucket to make room and it intersects 3072 * this upl, a deadlock may result on page BUSY 3073 * 2) we're delaying the I/O... from this point forward we're just updating 3074 * the cluster state... no need to hold the pages, so commit them 3075 * 3) IO_SYNC is set... 3076 * because we had to ask for a UPL that provides currenty non-present pages, the 3077 * UPL has been automatically set to clear the dirty flags (both software and hardware) 3078 * upon committing it... this is not the behavior we want since it's possible for 3079 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. 3080 * we'll pick these pages back up later with the correct behavior specified. 3081 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... 
if a flush 3082 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages 3083 * we hold since the flushing context is holding the cluster lock. 3084 */ 3085 ubc_upl_commit_range(upl, 0, upl_size, 3086 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); 3087check_cluster: 3088 /* 3089 * calculate the last logical block number 3090 * that this delayed I/O encompassed 3091 */ 3092 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); 3093 3094 if (flags & IO_SYNC) { 3095 /* 3096 * if the IO_SYNC flag is set than we need to 3097 * bypass any clusters and immediately issue 3098 * the I/O 3099 */ 3100 goto issue_io; 3101 } 3102 /* 3103 * take the lock to protect our accesses 3104 * of the writebehind and sparse cluster state 3105 */ 3106 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); 3107 3108 if (wbp->cl_scmap) { 3109 3110 if ( !(flags & IO_NOCACHE)) { 3111 /* 3112 * we've fallen into the sparse 3113 * cluster method of delaying dirty pages 3114 */ 3115 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); 3116 3117 lck_mtx_unlock(&wbp->cl_lockw); 3118 3119 continue; 3120 } 3121 /* 3122 * must have done cached writes that fell into 3123 * the sparse cluster mechanism... we've switched 3124 * to uncached writes on the file, so go ahead 3125 * and push whatever's in the sparse map 3126 * and switch back to normal clustering 3127 */ 3128 wbp->cl_number = 0; 3129 3130 sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); 3131 /* 3132 * no clusters of either type present at this point 3133 * so just go directly to start_new_cluster since 3134 * we know we need to delay this I/O since we've 3135 * already released the pages back into the cache 3136 * to avoid the deadlock with sparse_cluster_push 3137 */ 3138 goto start_new_cluster; 3139 } 3140 if (first_pass) { 3141 if (write_off == wbp->cl_last_write) 3142 wbp->cl_seq_written += write_cnt; 3143 else 3144 wbp->cl_seq_written = write_cnt; 3145 3146 wbp->cl_last_write = write_off + write_cnt; 3147 3148 first_pass = FALSE; 3149 } 3150 if (wbp->cl_number == 0) 3151 /* 3152 * no clusters currently present 3153 */ 3154 goto start_new_cluster; 3155 3156 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 3157 /* 3158 * check each cluster that we currently hold 3159 * try to merge some or all of this write into 3160 * one or more of the existing clusters... if 3161 * any portion of the write remains, start a 3162 * new cluster 3163 */ 3164 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) { 3165 /* 3166 * the current write starts at or after the current cluster 3167 */ 3168 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 3169 /* 3170 * we have a write that fits entirely 3171 * within the existing cluster limits 3172 */ 3173 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) 3174 /* 3175 * update our idea of where the cluster ends 3176 */ 3177 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 3178 break; 3179 } 3180 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { 3181 /* 3182 * we have a write that starts in the middle of the current cluster 3183 * but extends beyond the cluster's limit... we know this because 3184 * of the previous checks 3185 * we'll extend the current cluster to the max 3186 * and update the b_addr for the current write to reflect that 3187 * the head of it was absorbed into this cluster... 
3188 * note that we'll always have a leftover tail in this case since 3189 * full absorption would have occurred in the clause above 3190 */ 3191 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; 3192 3193 cl.b_addr = wbp->cl_clusters[cl_index].e_addr; 3194 } 3195 /* 3196 * we come here for the case where the current write starts 3197 * beyond the limit of the existing cluster or we have a leftover 3198 * tail after a partial absorption 3199 * 3200 * in either case, we'll check the remaining clusters before 3201 * starting a new one 3202 */ 3203 } else { 3204 /* 3205 * the current write starts in front of the cluster we're currently considering 3206 */ 3207 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) { 3208 /* 3209 * we can just merge the new request into 3210 * this cluster and leave it in the cache 3211 * since the resulting cluster is still 3212 * less than the maximum allowable size 3213 */ 3214 wbp->cl_clusters[cl_index].b_addr = cl.b_addr; 3215 3216 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) { 3217 /* 3218 * the current write completely 3219 * envelops the existing cluster and since 3220 * each write is limited to at most max_cluster_pgcount pages 3221 * we can just use the start and last blocknos of the write 3222 * to generate the cluster limits 3223 */ 3224 wbp->cl_clusters[cl_index].e_addr = cl.e_addr; 3225 } 3226 break; 3227 } 3228 3229 /* 3230 * if we were to combine this write with the current cluster 3231 * we would exceed the cluster size limit.... so, 3232 * let's see if there's any overlap of the new I/O with 3233 * the cluster we're currently considering... in fact, we'll 3234 * stretch the cluster out to its full limit and see if we 3235 * get an intersection with the current write 3236 * 3237 */ 3238 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { 3239 /* 3240 * the current write extends into the proposed cluster 3241 * clip the length of the current write after first combining its 3242 * tail with the newly shaped cluster 3243 */ 3244 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; 3245 3246 cl.e_addr = wbp->cl_clusters[cl_index].b_addr; 3247 } 3248 /* 3249 * if we get here, there was no way to merge 3250 * any portion of this write with this cluster 3251 * or we could only merge part of it which 3252 * will leave a tail... 3253 * we'll check the remaining clusters before starting a new one 3254 */ 3255 } 3256 } 3257 if (cl_index < wbp->cl_number) 3258 /* 3259 * we found an existing cluster(s) that we 3260 * could entirely merge this I/O into 3261 */ 3262 goto delay_io; 3263 3264 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && 3265 wbp->cl_number == MAX_CLUSTERS && 3266 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { 3267 uint32_t n; 3268 3269 if (vp->v_mount->mnt_kern_flag & MNTK_SSD) 3270 n = WRITE_BEHIND_SSD; 3271 else 3272 n = WRITE_BEHIND; 3273 3274 while (n--) 3275 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); 3276 } 3277 if (wbp->cl_number < MAX_CLUSTERS) { 3278 /* 3279 * we didn't find an existing cluster to 3280 * merge into, but there's room to start 3281 * a new one 3282 */ 3283 goto start_new_cluster; 3284 } 3285 /* 3286 * no existing cluster to merge with and no 3287 * room to start a new one... we'll try 3288 * pushing one of the existing ones... 
if none of 3289 * them are able to be pushed, we'll switch 3290 * to the sparse cluster mechanism 3291 * cluster_try_push updates cl_number to the 3292 * number of remaining clusters... and 3293 * returns the number of currently unused clusters 3294 */ 3295 ret_cluster_try_push = 0; 3296 3297 /* 3298 * if writes are not deferred, call cluster push immediately 3299 */ 3300 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { 3301 3302 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); 3303 } 3304 3305 /* 3306 * execute following regardless of writes being deferred or not 3307 */ 3308 if (ret_cluster_try_push == 0) { 3309 /* 3310 * no more room in the normal cluster mechanism 3311 * so let's switch to the more expansive but expensive 3312 * sparse mechanism.... 3313 */ 3314 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); 3315 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); 3316 3317 lck_mtx_unlock(&wbp->cl_lockw); 3318 3319 continue; 3320 } 3321start_new_cluster: 3322 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; 3323 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; 3324 3325 wbp->cl_clusters[wbp->cl_number].io_flags = 0; 3326 3327 if (flags & IO_NOCACHE) 3328 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE; 3329 3330 if (bflag & CL_PASSIVE) 3331 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE; 3332 3333 wbp->cl_number++; 3334delay_io: 3335 lck_mtx_unlock(&wbp->cl_lockw); 3336 3337 continue; 3338issue_io: 3339 /* 3340 * we don't hold the lock at this point 3341 * 3342 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set 3343 * so that we correctly deal with a change in state of the hardware modify bit... 3344 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force 3345 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also 3346 * responsible for generating the correct sized I/O(s) 3347 */ 3348 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg); 3349 } 3350 } 3351 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0); 3352 3353 return (retval); 3354} 3355 3356 3357 3358int 3359cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags) 3360{ 3361 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL); 3362} 3363 3364 3365int 3366cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg) 3367{ 3368 int retval = 0; 3369 int flags; 3370 user_ssize_t cur_resid; 3371 u_int32_t io_size; 3372 u_int32_t read_length = 0; 3373 int read_type = IO_COPY; 3374 3375 flags = xflags; 3376 3377 if (vp->v_flag & VNOCACHE_DATA) 3378 flags |= IO_NOCACHE; 3379 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) 3380 flags |= IO_RAOFF; 3381 3382 if (flags & IO_SKIP_ENCRYPTION) 3383 flags |= IO_ENCRYPTED; 3384 /* 3385 * If we're doing an encrypted IO, then first check to see 3386 * if the IO requested was page aligned. If not, then bail 3387 * out immediately. 3388 */ 3389 if (flags & IO_ENCRYPTED) { 3390 if (read_length & PAGE_MASK) { 3391 retval = EINVAL; 3392 return retval; 3393 } 3394 } 3395 3396 /* 3397 * do a read through the cache if one of the following is true.... 
3398 * NOCACHE is not true 3399 * the uio request doesn't target USERSPACE 3400 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. 3401 * Reading encrypted data from a CP filesystem should never result in the data touching 3402 * the UBC. 3403 * 3404 * otherwise, find out if we want the direct or contig variant for 3405 * the first vector in the uio request 3406 */ 3407 if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) { 3408 3409 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3410 } 3411 3412 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { 3413 3414 switch (read_type) { 3415 3416 case IO_COPY: 3417 /* 3418 * make sure the uio_resid isn't too big... 3419 * internally, we want to handle all of the I/O in 3420 * chunk sizes that fit in a 32 bit int 3421 */ 3422 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) 3423 io_size = MAX_IO_REQUEST_SIZE; 3424 else 3425 io_size = (u_int32_t)cur_resid; 3426 3427 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg); 3428 break; 3429 3430 case IO_DIRECT: 3431 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg); 3432 break; 3433 3434 case IO_CONTIG: 3435 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags); 3436 break; 3437 3438 case IO_UNKNOWN: 3439 retval = cluster_io_type(uio, &read_type, &read_length, 0); 3440 break; 3441 } 3442 } 3443 return (retval); 3444} 3445 3446 3447 3448static void 3449cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference) 3450{ 3451 int range; 3452 int abort_flags = UPL_ABORT_FREE_ON_EMPTY; 3453 3454 if ((range = last_pg - start_pg)) { 3455 if (take_reference) 3456 abort_flags |= UPL_ABORT_REFERENCE; 3457 3458 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags); 3459 } 3460} 3461 3462 3463static int 3464cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 3465{ 3466 upl_page_info_t *pl; 3467 upl_t upl; 3468 vm_offset_t upl_offset; 3469 u_int32_t upl_size; 3470 off_t upl_f_offset; 3471 int start_offset; 3472 int start_pg; 3473 int last_pg; 3474 int uio_last = 0; 3475 int pages_in_upl; 3476 off_t max_size; 3477 off_t last_ioread_offset; 3478 off_t last_request_offset; 3479 kern_return_t kret; 3480 int error = 0; 3481 int retval = 0; 3482 u_int32_t size_of_prefetch; 3483 u_int32_t xsize; 3484 u_int32_t io_size; 3485 u_int32_t max_rd_size; 3486 u_int32_t max_io_size; 3487 u_int32_t max_prefetch; 3488 u_int rd_ahead_enabled = 1; 3489 u_int prefetch_enabled = 1; 3490 struct cl_readahead * rap; 3491 struct clios iostate; 3492 struct cl_extent extent; 3493 int bflag; 3494 int take_reference = 1; 3495 int policy = IOPOL_DEFAULT; 3496 boolean_t iolock_inited = FALSE; 3497 3498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, 3499 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); 3500 3501 if (flags & IO_ENCRYPTED) { 3502 panic ("encrypted blocks will hit UBC!"); 3503 } 3504 3505 policy = throttle_get_io_policy(NULL); 3506 3507 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) 3508 take_reference = 0; 3509 3510 if (flags & IO_PASSIVE) 3511 bflag = CL_PASSIVE; 3512 else 3513 bflag = 0; 3514 3515 if (flags & IO_NOCACHE) 3516 bflag |= CL_NOCACHE; 3517 3518 if 
(flags & IO_SKIP_ENCRYPTION) 3519 bflag |= CL_ENCRYPTED; 3520 3521 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 3522 max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); 3523 max_rd_size = max_prefetch; 3524 3525 last_request_offset = uio->uio_offset + io_req_size; 3526 3527 if (last_request_offset > filesize) 3528 last_request_offset = filesize; 3529 3530 if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) { 3531 rd_ahead_enabled = 0; 3532 rap = NULL; 3533 } else { 3534 if (cluster_is_throttled(vp)) { 3535 /* 3536 * we're in the throttle window, at the very least 3537 * we want to limit the size of the I/O we're about 3538 * to issue 3539 */ 3540 rd_ahead_enabled = 0; 3541 prefetch_enabled = 0; 3542 3543 max_rd_size = THROTTLE_MAX_IOSIZE; 3544 } 3545 if ((rap = cluster_get_rap(vp)) == NULL) 3546 rd_ahead_enabled = 0; 3547 else { 3548 extent.b_addr = uio->uio_offset / PAGE_SIZE_64; 3549 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; 3550 } 3551 } 3552 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) { 3553 /* 3554 * determine if we already have a read-ahead in the pipe courtesy of the 3555 * last read systemcall that was issued... 3556 * if so, pick up it's extent to determine where we should start 3557 * with respect to any read-ahead that might be necessary to 3558 * garner all the data needed to complete this read systemcall 3559 */ 3560 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64; 3561 3562 if (last_ioread_offset < uio->uio_offset) 3563 last_ioread_offset = (off_t)0; 3564 else if (last_ioread_offset > last_request_offset) 3565 last_ioread_offset = last_request_offset; 3566 } else 3567 last_ioread_offset = (off_t)0; 3568 3569 while (io_req_size && uio->uio_offset < filesize && retval == 0) { 3570 3571 max_size = filesize - uio->uio_offset; 3572 3573 if ((off_t)(io_req_size) < max_size) 3574 io_size = io_req_size; 3575 else 3576 io_size = max_size; 3577 3578 if (!(flags & IO_NOCACHE)) { 3579 3580 while (io_size) { 3581 u_int32_t io_resid; 3582 u_int32_t io_requested; 3583 3584 /* 3585 * if we keep finding the pages we need already in the cache, then 3586 * don't bother to call cluster_read_prefetch since it costs CPU cycles 3587 * to determine that we have all the pages we need... once we miss in 3588 * the cache and have issued an I/O, than we'll assume that we're likely 3589 * to continue to miss in the cache and it's to our advantage to try and prefetch 3590 */ 3591 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) { 3592 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) { 3593 /* 3594 * we've already issued I/O for this request and 3595 * there's still work to do and 3596 * our prefetch stream is running dry, so issue a 3597 * pre-fetch I/O... 
the I/O latency will overlap 3598 * with the copying of the data 3599 */ 3600 if (size_of_prefetch > max_rd_size) 3601 size_of_prefetch = max_rd_size; 3602 3603 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); 3604 3605 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); 3606 3607 if (last_ioread_offset > last_request_offset) 3608 last_ioread_offset = last_request_offset; 3609 } 3610 } 3611 /* 3612 * limit the size of the copy we're about to do so that 3613 * we can notice that our I/O pipe is running dry and 3614 * get the next I/O issued before it does go dry 3615 */ 3616 if (last_ioread_offset && io_size > (max_io_size / 4)) 3617 io_resid = (max_io_size / 4); 3618 else 3619 io_resid = io_size; 3620 3621 io_requested = io_resid; 3622 3623 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); 3624 3625 xsize = io_requested - io_resid; 3626 3627 io_size -= xsize; 3628 io_req_size -= xsize; 3629 3630 if (retval || io_resid) 3631 /* 3632 * if we run into a real error or 3633 * a page that is not in the cache 3634 * we need to leave streaming mode 3635 */ 3636 break; 3637 3638 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) { 3639 /* 3640 * we're already finished the I/O for this read request 3641 * let's see if we should do a read-ahead 3642 */ 3643 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3644 } 3645 } 3646 if (retval) 3647 break; 3648 if (io_size == 0) { 3649 if (rap != NULL) { 3650 if (extent.e_addr < rap->cl_lastr) 3651 rap->cl_maxra = 0; 3652 rap->cl_lastr = extent.e_addr; 3653 } 3654 break; 3655 } 3656 /* 3657 * recompute max_size since cluster_copy_ubc_data_internal 3658 * may have advanced uio->uio_offset 3659 */ 3660 max_size = filesize - uio->uio_offset; 3661 } 3662 3663 iostate.io_completed = 0; 3664 iostate.io_issued = 0; 3665 iostate.io_error = 0; 3666 iostate.io_wanted = 0; 3667 3668 if ( (flags & IO_RETURN_ON_THROTTLE) ) { 3669 if (cluster_is_throttled(vp) == THROTTLE_NOW) { 3670 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { 3671 /* 3672 * we're in the throttle window and at least 1 I/O 3673 * has already been issued by a throttleable thread 3674 * in this window, so return with EAGAIN to indicate 3675 * to the FS issuing the cluster_read call that it 3676 * should now throttle after dropping any locks 3677 */ 3678 throttle_info_update_by_mount(vp->v_mount); 3679 3680 retval = EAGAIN; 3681 break; 3682 } 3683 } 3684 } 3685 3686 /* 3687 * compute the size of the upl needed to encompass 3688 * the requested read... limit each call to cluster_io 3689 * to the maximum UPL size... 
cluster_io will clip if 3690 * this exceeds the maximum io_size for the device, 3691 * make sure to account for 3692 * a starting offset that's not page aligned 3693 */ 3694 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 3695 upl_f_offset = uio->uio_offset - (off_t)start_offset; 3696 3697 if (io_size > max_rd_size) 3698 io_size = max_rd_size; 3699 3700 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 3701 3702 if (flags & IO_NOCACHE) { 3703 if (upl_size > max_io_size) 3704 upl_size = max_io_size; 3705 } else { 3706 if (upl_size > max_io_size / 4) { 3707 upl_size = max_io_size / 4; 3708 upl_size &= ~PAGE_MASK; 3709 3710 if (upl_size == 0) 3711 upl_size = PAGE_SIZE; 3712 } 3713 } 3714 pages_in_upl = upl_size / PAGE_SIZE; 3715 3716 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START, 3717 upl, (int)upl_f_offset, upl_size, start_offset, 0); 3718 3719 kret = ubc_create_upl(vp, 3720 upl_f_offset, 3721 upl_size, 3722 &upl, 3723 &pl, 3724 UPL_FILE_IO | UPL_SET_LITE); 3725 if (kret != KERN_SUCCESS) 3726 panic("cluster_read_copy: failed to get pagelist"); 3727 3728 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END, 3729 upl, (int)upl_f_offset, upl_size, start_offset, 0); 3730 3731 /* 3732 * scan from the beginning of the upl looking for the first 3733 * non-valid page.... this will become the first page in 3734 * the request we're going to make to 'cluster_io'... if all 3735 * of the pages are valid, we won't call through to 'cluster_io' 3736 */ 3737 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) { 3738 if (!upl_valid_page(pl, start_pg)) 3739 break; 3740 } 3741 3742 /* 3743 * scan from the starting invalid page looking for a valid 3744 * page before the end of the upl is reached, if we 3745 * find one, then it will be the last page of the request to 3746 * 'cluster_io' 3747 */ 3748 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 3749 if (upl_valid_page(pl, last_pg)) 3750 break; 3751 } 3752 3753 if (start_pg < last_pg) { 3754 /* 3755 * we found a range of 'invalid' pages that must be filled 3756 * if the last page in this range is the last page of the file 3757 * we may have to clip the size of it to keep from reading past 3758 * the end of the last physical block associated with the file 3759 */ 3760 if (iolock_inited == FALSE) { 3761 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 3762 3763 iolock_inited = TRUE; 3764 } 3765 upl_offset = start_pg * PAGE_SIZE; 3766 io_size = (last_pg - start_pg) * PAGE_SIZE; 3767 3768 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) 3769 io_size = filesize - (upl_f_offset + upl_offset); 3770 3771 /* 3772 * issue an asynchronous read to cluster_io 3773 */ 3774 3775 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, 3776 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); 3777 3778 if (rap) { 3779 if (extent.e_addr < rap->cl_maxra) { 3780 /* 3781 * we've just issued a read for a block that should have been 3782 * in the cache courtesy of the read-ahead engine... something 3783 * has gone wrong with the pipeline, so reset the read-ahead 3784 * logic which will cause us to restart from scratch 3785 */ 3786 rap->cl_maxra = 0; 3787 } 3788 } 3789 } 3790 if (error == 0) { 3791 /* 3792 * if the read completed successfully, or there was no I/O request 3793 * issued, than copy the data into user land via 'cluster_upl_copy_data' 3794 * we'll first add on any 'valid' 3795 * pages that were present in the upl when we acquired it. 
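 * (annotation: 'uio_last' below is advanced past any pages that were already
 * valid beyond the range we read, and 'val_size' is then clamped to both
 * max_size and io_req_size so the copy never runs past EOF or past the
 * caller's request)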
             */
            u_int val_size;

            for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
                if (!upl_valid_page(pl, uio_last))
                    break;
            }
            if (uio_last < pages_in_upl) {
                /*
                 * there were some invalid pages beyond the valid pages
                 * that we didn't issue an I/O for, just release them
                 * unchanged now, so that any prefetch/readahead can
                 * include them
                 */
                ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
                    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
            }

            /*
             * compute size to transfer this round; if io_req_size is
             * still non-zero after this attempt, we'll loop around and
             * set up for another I/O.
             */
            val_size = (uio_last * PAGE_SIZE) - start_offset;

            if (val_size > max_size)
                val_size = max_size;

            if (val_size > io_req_size)
                val_size = io_req_size;

            if ((uio->uio_offset + val_size) > last_ioread_offset)
                last_ioread_offset = uio->uio_offset + val_size;

            if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {

                if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
                    /*
                     * if there's still I/O left to do for this request, and...
                     * we're not in hard throttle mode, and...
                     * we're close to using up the previous prefetch, then issue a
                     * new pre-fetch I/O... the I/O latency will overlap
                     * with the copying of the data
                     */
                    if (size_of_prefetch > max_rd_size)
                        size_of_prefetch = max_rd_size;

                    size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

                    last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

                    if (last_ioread_offset > last_request_offset)
                        last_ioread_offset = last_request_offset;
                }

            } else if ((uio->uio_offset + val_size) == last_request_offset) {
                /*
                 * this transfer will finish this request, so...
3854 * let's try to read ahead if we're in 3855 * a sequential access pattern and we haven't 3856 * explicitly disabled it 3857 */ 3858 if (rd_ahead_enabled) 3859 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); 3860 3861 if (rap != NULL) { 3862 if (extent.e_addr < rap->cl_lastr) 3863 rap->cl_maxra = 0; 3864 rap->cl_lastr = extent.e_addr; 3865 } 3866 } 3867 if (iolock_inited == TRUE) 3868 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3869 3870 if (iostate.io_error) 3871 error = iostate.io_error; 3872 else { 3873 u_int32_t io_requested; 3874 3875 io_requested = val_size; 3876 3877 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested); 3878 3879 io_req_size -= (val_size - io_requested); 3880 } 3881 } else { 3882 if (iolock_inited == TRUE) 3883 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3884 } 3885 if (start_pg < last_pg) { 3886 /* 3887 * compute the range of pages that we actually issued an I/O for 3888 * and either commit them as valid if the I/O succeeded 3889 * or abort them if the I/O failed or we're not supposed to 3890 * keep them in the cache 3891 */ 3892 io_size = (last_pg - start_pg) * PAGE_SIZE; 3893 3894 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0); 3895 3896 if (error || (flags & IO_NOCACHE)) 3897 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size, 3898 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 3899 else { 3900 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY; 3901 3902 if (take_reference) 3903 commit_flags |= UPL_COMMIT_INACTIVATE; 3904 else 3905 commit_flags |= UPL_COMMIT_SPECULATE; 3906 3907 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags); 3908 } 3909 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0); 3910 } 3911 if ((last_pg - start_pg) < pages_in_upl) { 3912 /* 3913 * the set of pages that we issued an I/O for did not encompass 3914 * the entire upl... so just release these without modifying 3915 * their state 3916 */ 3917 if (error) 3918 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); 3919 else { 3920 3921 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, 3922 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0); 3923 3924 /* 3925 * handle any valid pages at the beginning of 3926 * the upl... release these appropriately 3927 */ 3928 cluster_read_upl_release(upl, 0, start_pg, take_reference); 3929 3930 /* 3931 * handle any valid pages immediately after the 3932 * pages we issued I/O for... ... 
release these appropriately 3933 */ 3934 cluster_read_upl_release(upl, last_pg, uio_last, take_reference); 3935 3936 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0); 3937 } 3938 } 3939 if (retval == 0) 3940 retval = error; 3941 3942 if (io_req_size) { 3943 if (cluster_is_throttled(vp)) { 3944 /* 3945 * we're in the throttle window, at the very least 3946 * we want to limit the size of the I/O we're about 3947 * to issue 3948 */ 3949 rd_ahead_enabled = 0; 3950 prefetch_enabled = 0; 3951 max_rd_size = THROTTLE_MAX_IOSIZE; 3952 } else { 3953 if (max_rd_size == THROTTLE_MAX_IOSIZE) { 3954 /* 3955 * coming out of throttled state 3956 */ 3957 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) { 3958 if (rap != NULL) 3959 rd_ahead_enabled = 1; 3960 prefetch_enabled = 1; 3961 } 3962 max_rd_size = max_prefetch; 3963 last_ioread_offset = 0; 3964 } 3965 } 3966 } 3967 } 3968 if (iolock_inited == TRUE) { 3969 /* 3970 * cluster_io returned an error after it 3971 * had already issued some I/O. we need 3972 * to wait for that I/O to complete before 3973 * we can destroy the iostate mutex... 3974 * 'retval' already contains the early error 3975 * so no need to pick it up from iostate.io_error 3976 */ 3977 cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); 3978 3979 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 3980 } 3981 if (rap != NULL) { 3982 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 3983 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); 3984 3985 lck_mtx_unlock(&rap->cl_lockr); 3986 } else { 3987 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, 3988 (int)uio->uio_offset, io_req_size, 0, retval, 0); 3989 } 3990 3991 return (retval); 3992} 3993 3994static int 3995cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 3996 int flags, int (*callback)(buf_t, void *), void *callback_arg) 3997{ 3998 upl_t upl; 3999 upl_page_info_t *pl; 4000 off_t max_io_size; 4001 vm_offset_t upl_offset, vector_upl_offset = 0; 4002 upl_size_t upl_size, vector_upl_size = 0; 4003 vm_size_t upl_needed_size; 4004 unsigned int pages_in_pl; 4005 int upl_flags; 4006 kern_return_t kret; 4007 unsigned int i; 4008 int force_data_sync; 4009 int retval = 0; 4010 int no_zero_fill = 0; 4011 int io_flag = 0; 4012 int misaligned = 0; 4013 struct clios iostate; 4014 user_addr_t iov_base; 4015 u_int32_t io_req_size; 4016 u_int32_t offset_in_file; 4017 u_int32_t offset_in_iovbase; 4018 u_int32_t io_size; 4019 u_int32_t io_min; 4020 u_int32_t xsize; 4021 u_int32_t devblocksize; 4022 u_int32_t mem_alignment_mask; 4023 u_int32_t max_upl_size; 4024 u_int32_t max_rd_size; 4025 u_int32_t max_rd_ahead; 4026 u_int32_t max_vector_size; 4027 boolean_t strict_uncached_IO = FALSE; 4028 boolean_t io_throttled = FALSE; 4029 4030 u_int32_t vector_upl_iosize = 0; 4031 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); 4032 off_t v_upl_uio_offset = 0; 4033 int vector_upl_index=0; 4034 upl_t vector_upl = NULL; 4035 4036 user_addr_t orig_iov_base = 0; 4037 user_addr_t last_iov_base = 0; 4038 user_addr_t next_iov_base = 0; 4039 4040 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, 4041 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 4042 4043 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); 4044 4045 max_rd_size = max_upl_size; 4046 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); 4047 4048 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; 4049 4050 if (flags & 
IO_PASSIVE) 4051 io_flag |= CL_PASSIVE; 4052 4053 if (flags & IO_ENCRYPTED) { 4054 io_flag |= CL_RAW_ENCRYPTED; 4055 } 4056 4057 if (flags & IO_NOCACHE) { 4058 io_flag |= CL_NOCACHE; 4059 } 4060 4061 if (flags & IO_SKIP_ENCRYPTION) 4062 io_flag |= CL_ENCRYPTED; 4063 4064 iostate.io_completed = 0; 4065 iostate.io_issued = 0; 4066 iostate.io_error = 0; 4067 iostate.io_wanted = 0; 4068 4069 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 4070 4071 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 4072 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; 4073 4074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 4075 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0); 4076 4077 if (devblocksize == 1) { 4078 /* 4079 * the AFP client advertises a devblocksize of 1 4080 * however, its BLOCKMAP routine maps to physical 4081 * blocks that are PAGE_SIZE in size... 4082 * therefore we can't ask for I/Os that aren't page aligned 4083 * or aren't multiples of PAGE_SIZE in size 4084 * by setting devblocksize to PAGE_SIZE, we re-instate 4085 * the old behavior we had before the mem_alignment_mask 4086 * changes went in... 4087 */ 4088 devblocksize = PAGE_SIZE; 4089 } 4090 4091 strict_uncached_IO = ubc_strict_uncached_IO(vp); 4092 4093 orig_iov_base = uio_curriovbase(uio); 4094 last_iov_base = orig_iov_base; 4095 4096next_dread: 4097 io_req_size = *read_length; 4098 iov_base = uio_curriovbase(uio); 4099 4100 max_io_size = filesize - uio->uio_offset; 4101 4102 if ((off_t)io_req_size > max_io_size) 4103 io_req_size = max_io_size; 4104 4105 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1); 4106 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; 4107 4108 if (offset_in_file || offset_in_iovbase) { 4109 /* 4110 * one of the 2 important offsets is misaligned 4111 * so fire an I/O through the cache for this entire vector 4112 */ 4113 misaligned = 1; 4114 } 4115 if (iov_base & (devblocksize - 1)) { 4116 /* 4117 * the offset in memory must be on a device block boundary 4118 * so that we can guarantee that we can generate an 4119 * I/O that ends on a page boundary in cluster_io 4120 */ 4121 misaligned = 1; 4122 } 4123 4124 /* 4125 * The user must request IO in aligned chunks. If the 4126 * offset into the file is bad, or the userland pointer 4127 * is non-aligned, then we cannot service the encrypted IO request. 4128 */ 4129 if ((flags & IO_ENCRYPTED) && (misaligned)) { 4130 retval = EINVAL; 4131 } 4132 4133 /* 4134 * When we get to this point, we know... 4135 * -- the offset into the file is on a devblocksize boundary 4136 */ 4137 4138 while (io_req_size && retval == 0) { 4139 u_int32_t io_start; 4140 4141 if (cluster_is_throttled(vp)) { 4142 /* 4143 * we're in the throttle window, at the very least 4144 * we want to limit the size of the I/O we're about 4145 * to issue 4146 */ 4147 max_rd_size = THROTTLE_MAX_IOSIZE; 4148 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; 4149 max_vector_size = THROTTLE_MAX_IOSIZE; 4150 } else { 4151 max_rd_size = max_upl_size; 4152 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); 4153 max_vector_size = MAX_VECTOR_UPL_SIZE; 4154 } 4155 io_start = io_size = io_req_size; 4156 4157 /* 4158 * First look for pages already in the cache 4159 * and move them to user space. But only do this 4160 * check if we are not retrieving encrypted data directly 4161 * from the filesystem; those blocks should never 4162 * be in the UBC. 
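 * (annotation/example: io_size goes in holding the bytes we'd like to satisfy
 * and comes back holding the bytes that could not be copied from the cache;
 * e.g. if io_start is 64KB and 16KB of it is found in the UBC, io_size comes
 * back as 48KB and xsize below works out to 16KB)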
4163 * 4164 * cluster_copy_ubc_data returns the resid 4165 * in io_size 4166 */ 4167 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { 4168 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); 4169 } 4170 /* 4171 * calculate the number of bytes actually copied 4172 * starting size - residual 4173 */ 4174 xsize = io_start - io_size; 4175 4176 io_req_size -= xsize; 4177 4178 if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) { 4179 /* 4180 * We found something in the cache or we have an iov_base that's not 4181 * page-aligned. 4182 * 4183 * Issue all I/O's that have been collected within this Vectored UPL. 4184 */ 4185 if(vector_upl_index) { 4186 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4187 reset_vector_run_state(); 4188 } 4189 4190 if(xsize) 4191 useVectorUPL = 0; 4192 4193 /* 4194 * After this point, if we are using the Vector UPL path and the base is 4195 * not page-aligned then the UPL with that base will be the first in the vector UPL. 4196 */ 4197 } 4198 4199 /* 4200 * check to see if we are finished with this request. 4201 * 4202 * If we satisfied this IO already, then io_req_size will be 0. 4203 * Otherwise, see if the IO was mis-aligned and needs to go through 4204 * the UBC to deal with the 'tail'. 4205 * 4206 */ 4207 if (io_req_size == 0 || (misaligned)) { 4208 /* 4209 * see if there's another uio vector to 4210 * process that's of type IO_DIRECT 4211 * 4212 * break out of while loop to get there 4213 */ 4214 break; 4215 } 4216 /* 4217 * assume the request ends on a device block boundary 4218 */ 4219 io_min = devblocksize; 4220 4221 /* 4222 * we can handle I/O's in multiples of the device block size 4223 * however, if io_size isn't a multiple of devblocksize we 4224 * want to clip it back to the nearest page boundary since 4225 * we are going to have to go through cluster_read_copy to 4226 * deal with the 'overhang'... by clipping it to a PAGE_SIZE 4227 * multiple, we avoid asking the drive for the same physical 4228 * blocks twice.. once for the partial page at the end of the 4229 * request and a 2nd time for the page we read into the cache 4230 * (which overlaps the end of the direct read) in order to 4231 * get at the overhang bytes 4232 */ 4233 if (io_size & (devblocksize - 1)) { 4234 if (flags & IO_ENCRYPTED) { 4235 /* 4236 * Normally, we'd round down to the previous page boundary to 4237 * let the UBC manage the zero-filling of the file past the EOF. 4238 * But if we're doing encrypted IO, we can't let any of 4239 * the data hit the UBC. This means we have to do the full 4240 * IO to the upper block boundary of the device block that 4241 * contains the EOF. The user will be responsible for not 4242 * interpreting data PAST the EOF in its buffer. 4243 * 4244 * So just bump the IO back up to a multiple of devblocksize 4245 */ 4246 io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); 4247 io_min = io_size; 4248 } 4249 else { 4250 /* 4251 * Clip the request to the previous page size boundary 4252 * since request does NOT end on a device block boundary 4253 */ 4254 io_size &= ~PAGE_MASK; 4255 io_min = PAGE_SIZE; 4256 } 4257 4258 } 4259 if (retval || io_size < io_min) { 4260 /* 4261 * either an error or we only have the tail left to 4262 * complete via the copy path... 4263 * we may have already spun some portion of this request 4264 * off as async requests... 
we need to wait for the I/O 4265 * to complete before returning 4266 */ 4267 goto wait_for_dreads; 4268 } 4269 4270 /* 4271 * Don't re-check the UBC data if we are looking for uncached IO 4272 * or asking for encrypted blocks. 4273 */ 4274 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { 4275 4276 if ((xsize = io_size) > max_rd_size) 4277 xsize = max_rd_size; 4278 4279 io_size = 0; 4280 4281 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); 4282 4283 if (io_size == 0) { 4284 /* 4285 * a page must have just come into the cache 4286 * since the first page in this range is no 4287 * longer absent, go back and re-evaluate 4288 */ 4289 continue; 4290 } 4291 } 4292 if ( (flags & IO_RETURN_ON_THROTTLE) ) { 4293 if (cluster_is_throttled(vp) == THROTTLE_NOW) { 4294 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { 4295 /* 4296 * we're in the throttle window and at least 1 I/O 4297 * has already been issued by a throttleable thread 4298 * in this window, so return with EAGAIN to indicate 4299 * to the FS issuing the cluster_read call that it 4300 * should now throttle after dropping any locks 4301 */ 4302 throttle_info_update_by_mount(vp->v_mount); 4303 4304 io_throttled = TRUE; 4305 goto wait_for_dreads; 4306 } 4307 } 4308 } 4309 if (io_size > max_rd_size) 4310 io_size = max_rd_size; 4311 4312 iov_base = uio_curriovbase(uio); 4313 4314 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 4315 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; 4316 4317 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, 4318 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); 4319 4320 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) 4321 no_zero_fill = 1; 4322 else 4323 no_zero_fill = 0; 4324 4325 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { 4326 pages_in_pl = 0; 4327 upl_size = upl_needed_size; 4328 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 4329 4330 if (no_zero_fill) 4331 upl_flags |= UPL_NOZEROFILL; 4332 if (force_data_sync) 4333 upl_flags |= UPL_FORCE_DATA_SYNC; 4334 4335 kret = vm_map_create_upl(current_map(), 4336 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4337 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); 4338 4339 if (kret != KERN_SUCCESS) { 4340 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4341 (int)upl_offset, upl_size, io_size, kret, 0); 4342 /* 4343 * failed to get pagelist 4344 * 4345 * we may have already spun some portion of this request 4346 * off as async requests... we need to wait for the I/O 4347 * to complete before returning 4348 */ 4349 goto wait_for_dreads; 4350 } 4351 pages_in_pl = upl_size / PAGE_SIZE; 4352 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 4353 4354 for (i = 0; i < pages_in_pl; i++) { 4355 if (!upl_page_present(pl, i)) 4356 break; 4357 } 4358 if (i == pages_in_pl) 4359 break; 4360 4361 ubc_upl_abort(upl, 0); 4362 } 4363 if (force_data_sync >= 3) { 4364 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4365 (int)upl_offset, upl_size, io_size, kret, 0); 4366 4367 goto wait_for_dreads; 4368 } 4369 /* 4370 * Consider the possibility that upl_size wasn't satisfied. 
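 * (annotation: if only a leading portion of the request was wired and the
 * buffer started on a page boundary, we simply shorten io_size to what was
 * wired; otherwise io_size drops to 0, the upl is aborted, and we fall
 * through to wait_for_dreads so the remainder is handled by the copy path)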
4371 */ 4372 if (upl_size < upl_needed_size) { 4373 if (upl_size && upl_offset == 0) 4374 io_size = upl_size; 4375 else 4376 io_size = 0; 4377 } 4378 if (io_size == 0) { 4379 ubc_upl_abort(upl, 0); 4380 goto wait_for_dreads; 4381 } 4382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, 4383 (int)upl_offset, upl_size, io_size, kret, 0); 4384 4385 if(useVectorUPL) { 4386 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); 4387 if(end_off) 4388 issueVectorUPL = 1; 4389 /* 4390 * After this point, if we are using a vector UPL, then 4391 * either all the UPL elements end on a page boundary OR 4392 * this UPL is the last element because it does not end 4393 * on a page boundary. 4394 */ 4395 } 4396 4397 /* 4398 * request asynchronously so that we can overlap 4399 * the preparation of the next I/O 4400 * if there are already too many outstanding reads 4401 * wait until some have completed before issuing the next read 4402 */ 4403 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); 4404 4405 if (iostate.io_error) { 4406 /* 4407 * one of the earlier reads we issued ran into a hard error 4408 * don't issue any more reads, cleanup the UPL 4409 * that was just created but not used, then 4410 * go wait for any other reads to complete before 4411 * returning the error to the caller 4412 */ 4413 ubc_upl_abort(upl, 0); 4414 4415 goto wait_for_dreads; 4416 } 4417 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, 4418 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); 4419 4420 4421 if(!useVectorUPL) { 4422 if (no_zero_fill) 4423 io_flag &= ~CL_PRESERVE; 4424 else 4425 io_flag |= CL_PRESERVE; 4426 4427 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4428 4429 } else { 4430 4431 if(!vector_upl_index) { 4432 vector_upl = vector_upl_create(upl_offset); 4433 v_upl_uio_offset = uio->uio_offset; 4434 vector_upl_offset = upl_offset; 4435 } 4436 4437 vector_upl_set_subupl(vector_upl,upl, upl_size); 4438 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); 4439 vector_upl_index++; 4440 vector_upl_size += upl_size; 4441 vector_upl_iosize += io_size; 4442 4443 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { 4444 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4445 reset_vector_run_state(); 4446 } 4447 } 4448 last_iov_base = iov_base + io_size; 4449 4450 /* 4451 * update the uio structure 4452 */ 4453 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { 4454 uio_update(uio, (user_size_t)max_io_size); 4455 } 4456 else { 4457 uio_update(uio, (user_size_t)io_size); 4458 } 4459 /* 4460 * Under normal circumstances, the io_size should not be 4461 * bigger than the io_req_size, but we may have had to round up 4462 * to the end of the page in the encrypted IO case. In that case only, 4463 * ensure that we only decrement io_req_size to 0. 
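 * (example: with a 4KB devblocksize and a 1KB encrypted tail, io_size was
 * rounded up to 4KB and now exceeds io_req_size; rather than letting the
 * subtraction go negative we simply force io_req_size to 0)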
4464 */ 4465 if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { 4466 io_req_size = 0; 4467 } 4468 else { 4469 io_req_size -= io_size; 4470 } 4471 4472 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, 4473 upl, (int)uio->uio_offset, io_req_size, retval, 0); 4474 4475 } /* end while */ 4476 4477 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) { 4478 4479 retval = cluster_io_type(uio, read_type, read_length, 0); 4480 4481 if (retval == 0 && *read_type == IO_DIRECT) { 4482 4483 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, 4484 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); 4485 4486 goto next_dread; 4487 } 4488 } 4489 4490wait_for_dreads: 4491 4492 if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { 4493 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); 4494 reset_vector_run_state(); 4495 } 4496 /* 4497 * make sure all async reads that are part of this stream 4498 * have completed before we return 4499 */ 4500 cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); 4501 4502 if (iostate.io_error) 4503 retval = iostate.io_error; 4504 4505 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 4506 4507 if (io_throttled == TRUE && retval == 0) 4508 retval = EAGAIN; 4509 4510 for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) { 4511 /* 4512 * This is specifically done for pmap accounting purposes. 4513 * vm_pre_fault() will call vm_fault() to enter the page into 4514 * the pmap if there isn't _a_ physical page for that VA already. 4515 */ 4516 vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK)); 4517 } 4518 4519 if (io_req_size && retval == 0) { 4520 /* 4521 * we couldn't handle the tail of this request in DIRECT mode 4522 * so fire it through the copy path 4523 */ 4524 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg); 4525 4526 *read_type = IO_UNKNOWN; 4527 } 4528 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, 4529 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0); 4530 4531 return (retval); 4532} 4533 4534 4535static int 4536cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, 4537 int (*callback)(buf_t, void *), void *callback_arg, int flags) 4538{ 4539 upl_page_info_t *pl; 4540 upl_t upl[MAX_VECTS]; 4541 vm_offset_t upl_offset; 4542 addr64_t dst_paddr = 0; 4543 user_addr_t iov_base; 4544 off_t max_size; 4545 upl_size_t upl_size; 4546 vm_size_t upl_needed_size; 4547 mach_msg_type_number_t pages_in_pl; 4548 int upl_flags; 4549 kern_return_t kret; 4550 struct clios iostate; 4551 int error= 0; 4552 int cur_upl = 0; 4553 int num_upl = 0; 4554 int n; 4555 u_int32_t xsize; 4556 u_int32_t io_size; 4557 u_int32_t devblocksize; 4558 u_int32_t mem_alignment_mask; 4559 u_int32_t tail_size = 0; 4560 int bflag; 4561 4562 if (flags & IO_PASSIVE) 4563 bflag = CL_PASSIVE; 4564 else 4565 bflag = 0; 4566 4567 if (flags & IO_NOCACHE) 4568 bflag |= CL_NOCACHE; 4569 4570 /* 4571 * When we enter this routine, we know 4572 * -- the read_length will not exceed the current iov_len 4573 * -- the target address is physically contiguous for read_length 4574 */ 4575 cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC); 4576 4577 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; 4578 mem_alignment_mask = 
(u_int32_t)vp->v_mount->mnt_alignmentmask; 4579 4580 iostate.io_completed = 0; 4581 iostate.io_issued = 0; 4582 iostate.io_error = 0; 4583 iostate.io_wanted = 0; 4584 4585 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); 4586 4587next_cread: 4588 io_size = *read_length; 4589 4590 max_size = filesize - uio->uio_offset; 4591 4592 if (io_size > max_size) 4593 io_size = max_size; 4594 4595 iov_base = uio_curriovbase(uio); 4596 4597 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); 4598 upl_needed_size = upl_offset + io_size; 4599 4600 pages_in_pl = 0; 4601 upl_size = upl_needed_size; 4602 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; 4603 4604 4605 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START, 4606 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0); 4607 4608 kret = vm_map_get_upl(current_map(), 4609 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4610 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); 4611 4612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END, 4613 (int)upl_offset, upl_size, io_size, kret, 0); 4614 4615 if (kret != KERN_SUCCESS) { 4616 /* 4617 * failed to get pagelist 4618 */ 4619 error = EINVAL; 4620 goto wait_for_creads; 4621 } 4622 num_upl++; 4623 4624 if (upl_size < upl_needed_size) { 4625 /* 4626 * The upl_size wasn't satisfied. 4627 */ 4628 error = EINVAL; 4629 goto wait_for_creads; 4630 } 4631 pl = ubc_upl_pageinfo(upl[cur_upl]); 4632 4633 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; 4634 4635 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { 4636 u_int32_t head_size; 4637 4638 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); 4639 4640 if (head_size > io_size) 4641 head_size = io_size; 4642 4643 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg); 4644 4645 if (error) 4646 goto wait_for_creads; 4647 4648 upl_offset += head_size; 4649 dst_paddr += head_size; 4650 io_size -= head_size; 4651 4652 iov_base += head_size; 4653 } 4654 if ((u_int32_t)iov_base & mem_alignment_mask) { 4655 /* 4656 * request doesn't set up on a memory boundary 4657 * the underlying DMA engine can handle... 4658 * return an error instead of going through 4659 * the slow copy path since the intent of this 4660 * path is direct I/O to device memory 4661 */ 4662 error = EINVAL; 4663 goto wait_for_creads; 4664 } 4665 4666 tail_size = io_size & (devblocksize - 1); 4667 4668 io_size -= tail_size; 4669 4670 while (io_size && error == 0) { 4671 4672 if (io_size > MAX_IO_CONTIG_SIZE) 4673 xsize = MAX_IO_CONTIG_SIZE; 4674 else 4675 xsize = io_size; 4676 /* 4677 * request asynchronously so that we can overlap 4678 * the preparation of the next I/O... we'll do 4679 * the commit after all the I/O has completed 4680 * since its all issued against the same UPL 4681 * if there are already too many outstanding reads 4682 * wait until some have completed before issuing the next 4683 */ 4684 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); 4685 4686 if (iostate.io_error) { 4687 /* 4688 * one of the earlier reads we issued ran into a hard error 4689 * don't issue any more reads... 
4690 * go wait for any other reads to complete before 4691 * returning the error to the caller 4692 */ 4693 goto wait_for_creads; 4694 } 4695 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize, 4696 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag, 4697 (buf_t)NULL, &iostate, callback, callback_arg); 4698 /* 4699 * The cluster_io read was issued successfully, 4700 * update the uio structure 4701 */ 4702 if (error == 0) { 4703 uio_update(uio, (user_size_t)xsize); 4704 4705 dst_paddr += xsize; 4706 upl_offset += xsize; 4707 io_size -= xsize; 4708 } 4709 } 4710 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) { 4711 4712 error = cluster_io_type(uio, read_type, read_length, 0); 4713 4714 if (error == 0 && *read_type == IO_CONTIG) { 4715 cur_upl++; 4716 goto next_cread; 4717 } 4718 } else 4719 *read_type = IO_UNKNOWN; 4720 4721wait_for_creads: 4722 /* 4723 * make sure all async reads that are part of this stream 4724 * have completed before we proceed 4725 */ 4726 cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); 4727 4728 if (iostate.io_error) 4729 error = iostate.io_error; 4730 4731 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); 4732 4733 if (error == 0 && tail_size) 4734 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); 4735 4736 for (n = 0; n < num_upl; n++) 4737 /* 4738 * just release our hold on each physically contiguous 4739 * region without changing any state 4740 */ 4741 ubc_upl_abort(upl[n], 0); 4742 4743 return (error); 4744} 4745 4746 4747static int 4748cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length) 4749{ 4750 user_size_t iov_len; 4751 user_addr_t iov_base = 0; 4752 upl_t upl; 4753 upl_size_t upl_size; 4754 int upl_flags; 4755 int retval = 0; 4756 4757 /* 4758 * skip over any emtpy vectors 4759 */ 4760 uio_update(uio, (user_size_t)0); 4761 4762 iov_len = uio_curriovlen(uio); 4763 4764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0); 4765 4766 if (iov_len) { 4767 iov_base = uio_curriovbase(uio); 4768 /* 4769 * make sure the size of the vector isn't too big... 
4770 * internally, we want to handle all of the I/O in 4771 * chunk sizes that fit in a 32 bit int 4772 */ 4773 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) 4774 upl_size = MAX_IO_REQUEST_SIZE; 4775 else 4776 upl_size = (u_int32_t)iov_len; 4777 4778 upl_flags = UPL_QUERY_OBJECT_TYPE; 4779 4780 if ((vm_map_get_upl(current_map(), 4781 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), 4782 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { 4783 /* 4784 * the user app must have passed in an invalid address 4785 */ 4786 retval = EFAULT; 4787 } 4788 if (upl_size == 0) 4789 retval = EFAULT; 4790 4791 *io_length = upl_size; 4792 4793 if (upl_flags & UPL_PHYS_CONTIG) 4794 *io_type = IO_CONTIG; 4795 else if (iov_len >= min_length) 4796 *io_type = IO_DIRECT; 4797 else 4798 *io_type = IO_COPY; 4799 } else { 4800 /* 4801 * nothing left to do for this uio 4802 */ 4803 *io_length = 0; 4804 *io_type = IO_UNKNOWN; 4805 } 4806 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0); 4807 4808 return (retval); 4809} 4810 4811 4812/* 4813 * generate advisory I/O's in the largest chunks possible 4814 * the completed pages will be released into the VM cache 4815 */ 4816int 4817advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid) 4818{ 4819 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE); 4820} 4821 4822int 4823advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag) 4824{ 4825 upl_page_info_t *pl; 4826 upl_t upl; 4827 vm_offset_t upl_offset; 4828 int upl_size; 4829 off_t upl_f_offset; 4830 int start_offset; 4831 int start_pg; 4832 int last_pg; 4833 int pages_in_upl; 4834 off_t max_size; 4835 int io_size; 4836 kern_return_t kret; 4837 int retval = 0; 4838 int issued_io; 4839 int skip_range; 4840 uint32_t max_io_size; 4841 4842 4843 if ( !UBCINFOEXISTS(vp)) 4844 return(EINVAL); 4845 4846 if (resid < 0) 4847 return(EINVAL); 4848 4849 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); 4850 4851 if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { 4852 if (max_io_size > speculative_prefetch_max_iosize) 4853 max_io_size = speculative_prefetch_max_iosize; 4854 } 4855 4856 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, 4857 (int)f_offset, resid, (int)filesize, 0, 0); 4858 4859 while (resid && f_offset < filesize && retval == 0) { 4860 /* 4861 * compute the size of the upl needed to encompass 4862 * the requested read... limit each call to cluster_io 4863 * to the maximum UPL size... 
cluster_io will clip if 4864 * this exceeds the maximum io_size for the device, 4865 * make sure to account for 4866 * a starting offset that's not page aligned 4867 */ 4868 start_offset = (int)(f_offset & PAGE_MASK_64); 4869 upl_f_offset = f_offset - (off_t)start_offset; 4870 max_size = filesize - f_offset; 4871 4872 if (resid < max_size) 4873 io_size = resid; 4874 else 4875 io_size = max_size; 4876 4877 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 4878 if ((uint32_t)upl_size > max_io_size) 4879 upl_size = max_io_size; 4880 4881 skip_range = 0; 4882 /* 4883 * return the number of contiguously present pages in the cache 4884 * starting at upl_f_offset within the file 4885 */ 4886 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range); 4887 4888 if (skip_range) { 4889 /* 4890 * skip over pages already present in the cache 4891 */ 4892 io_size = skip_range - start_offset; 4893 4894 f_offset += io_size; 4895 resid -= io_size; 4896 4897 if (skip_range == upl_size) 4898 continue; 4899 /* 4900 * have to issue some real I/O 4901 * at this point, we know it's starting on a page boundary 4902 * because we've skipped over at least the first page in the request 4903 */ 4904 start_offset = 0; 4905 upl_f_offset += skip_range; 4906 upl_size -= skip_range; 4907 } 4908 pages_in_upl = upl_size / PAGE_SIZE; 4909 4910 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START, 4911 upl, (int)upl_f_offset, upl_size, start_offset, 0); 4912 4913 kret = ubc_create_upl(vp, 4914 upl_f_offset, 4915 upl_size, 4916 &upl, 4917 &pl, 4918 UPL_RET_ONLY_ABSENT | UPL_SET_LITE); 4919 if (kret != KERN_SUCCESS) 4920 return(retval); 4921 issued_io = 0; 4922 4923 /* 4924 * before we start marching forward, we must make sure we end on 4925 * a present page, otherwise we will be working with a freed 4926 * upl 4927 */ 4928 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 4929 if (upl_page_present(pl, last_pg)) 4930 break; 4931 } 4932 pages_in_upl = last_pg + 1; 4933 4934 4935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END, 4936 upl, (int)upl_f_offset, upl_size, start_offset, 0); 4937 4938 4939 for (last_pg = 0; last_pg < pages_in_upl; ) { 4940 /* 4941 * scan from the beginning of the upl looking for the first 4942 * page that is present.... this will become the first page in 4943 * the request we're going to make to 'cluster_io'... 
if all 4944 * of the pages are absent, we won't call through to 'cluster_io' 4945 */ 4946 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 4947 if (upl_page_present(pl, start_pg)) 4948 break; 4949 } 4950 4951 /* 4952 * scan from the starting present page looking for an absent 4953 * page before the end of the upl is reached, if we 4954 * find one, then it will terminate the range of pages being 4955 * presented to 'cluster_io' 4956 */ 4957 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 4958 if (!upl_page_present(pl, last_pg)) 4959 break; 4960 } 4961 4962 if (last_pg > start_pg) { 4963 /* 4964 * we found a range of pages that must be filled 4965 * if the last page in this range is the last page of the file 4966 * we may have to clip the size of it to keep from reading past 4967 * the end of the last physical block associated with the file 4968 */ 4969 upl_offset = start_pg * PAGE_SIZE; 4970 io_size = (last_pg - start_pg) * PAGE_SIZE; 4971 4972 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) 4973 io_size = filesize - (upl_f_offset + upl_offset); 4974 4975 /* 4976 * issue an asynchronous read to cluster_io 4977 */ 4978 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 4979 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 4980 4981 issued_io = 1; 4982 } 4983 } 4984 if (issued_io == 0) 4985 ubc_upl_abort(upl, 0); 4986 4987 io_size = upl_size - start_offset; 4988 4989 if (io_size > resid) 4990 io_size = resid; 4991 f_offset += io_size; 4992 resid -= io_size; 4993 } 4994 4995 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END, 4996 (int)f_offset, resid, retval, 0, 0); 4997 4998 return(retval); 4999} 5000 5001 5002int 5003cluster_push(vnode_t vp, int flags) 5004{ 5005 return cluster_push_ext(vp, flags, NULL, NULL); 5006} 5007 5008 5009int 5010cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5011{ 5012 int retval; 5013 int my_sparse_wait = 0; 5014 struct cl_writebehind *wbp; 5015 5016 if ( !UBCINFOEXISTS(vp)) { 5017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0); 5018 return (0); 5019 } 5020 /* return if deferred write is set */ 5021 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) { 5022 return (0); 5023 } 5024 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { 5025 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); 5026 return (0); 5027 } 5028 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) { 5029 lck_mtx_unlock(&wbp->cl_lockw); 5030 5031 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); 5032 return(0); 5033 } 5034 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, 5035 wbp->cl_scmap, wbp->cl_number, flags, 0, 0); 5036 5037 /* 5038 * if we have an fsync in progress, we don't want to allow any additional 5039 * sync/fsync/close(s) to occur until it finishes. 
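 * (annotation: cl_sparse_wait is the serialization token an IO_SYNC caller
 * takes below, and cl_sparse_pushes counts async sparse pushes still in
 * flight; a synchronous push waits for both to drain before it evaluates
 * the current state)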
5040 * note that its possible for writes to continue to occur to this file 5041 * while we're waiting and also once the fsync starts to clean if we're 5042 * in the sparse map case 5043 */ 5044 while (wbp->cl_sparse_wait) { 5045 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0); 5046 5047 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); 5048 5049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0); 5050 } 5051 if (flags & IO_SYNC) { 5052 my_sparse_wait = 1; 5053 wbp->cl_sparse_wait = 1; 5054 5055 /* 5056 * this is an fsync (or equivalent)... we must wait for any existing async 5057 * cleaning operations to complete before we evaulate the current state 5058 * and finish cleaning... this insures that all writes issued before this 5059 * fsync actually get cleaned to the disk before this fsync returns 5060 */ 5061 while (wbp->cl_sparse_pushes) { 5062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0); 5063 5064 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); 5065 5066 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0); 5067 } 5068 } 5069 if (wbp->cl_scmap) { 5070 void *scmap; 5071 5072 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) { 5073 5074 scmap = wbp->cl_scmap; 5075 wbp->cl_scmap = NULL; 5076 5077 wbp->cl_sparse_pushes++; 5078 5079 lck_mtx_unlock(&wbp->cl_lockw); 5080 5081 sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); 5082 5083 lck_mtx_lock(&wbp->cl_lockw); 5084 5085 wbp->cl_sparse_pushes--; 5086 5087 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) 5088 wakeup((caddr_t)&wbp->cl_sparse_pushes); 5089 } else { 5090 sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); 5091 } 5092 retval = 1; 5093 } else { 5094 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); 5095 } 5096 lck_mtx_unlock(&wbp->cl_lockw); 5097 5098 if (flags & IO_SYNC) 5099 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push"); 5100 5101 if (my_sparse_wait) { 5102 /* 5103 * I'm the owner of the serialization token 5104 * clear it and wakeup anyone that is waiting 5105 * for me to finish 5106 */ 5107 lck_mtx_lock(&wbp->cl_lockw); 5108 5109 wbp->cl_sparse_wait = 0; 5110 wakeup((caddr_t)&wbp->cl_sparse_wait); 5111 5112 lck_mtx_unlock(&wbp->cl_lockw); 5113 } 5114 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, 5115 wbp->cl_scmap, wbp->cl_number, retval, 0, 0); 5116 5117 return (retval); 5118} 5119 5120 5121__private_extern__ void 5122cluster_release(struct ubc_info *ubc) 5123{ 5124 struct cl_writebehind *wbp; 5125 struct cl_readahead *rap; 5126 5127 if ((wbp = ubc->cl_wbehind)) { 5128 5129 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0); 5130 5131 if (wbp->cl_scmap) 5132 vfs_drt_control(&(wbp->cl_scmap), 0); 5133 } else { 5134 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0); 5135 } 5136 5137 rap = ubc->cl_rahead; 5138 5139 if (wbp != NULL) { 5140 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); 5141 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); 5142 } 5143 if ((rap = ubc->cl_rahead)) { 5144 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp); 5145 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD); 5146 } 5147 ubc->cl_rahead = NULL; 5148 ubc->cl_wbehind = NULL; 5149 5150 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0); 
5151} 5152 5153 5154static int 5155cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) 5156{ 5157 int cl_index; 5158 int cl_index1; 5159 int min_index; 5160 int cl_len; 5161 int cl_pushed = 0; 5162 struct cl_wextent l_clusters[MAX_CLUSTERS]; 5163 u_int max_cluster_pgcount; 5164 5165 5166 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; 5167 /* 5168 * the write behind context exists and has 5169 * already been locked... 5170 */ 5171 if (wbp->cl_number == 0) 5172 /* 5173 * no clusters to push 5174 * return number of empty slots 5175 */ 5176 return (MAX_CLUSTERS); 5177 5178 /* 5179 * make a local 'sorted' copy of the clusters 5180 * and clear wbp->cl_number so that new clusters can 5181 * be developed 5182 */ 5183 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 5184 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) { 5185 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) 5186 continue; 5187 if (min_index == -1) 5188 min_index = cl_index1; 5189 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) 5190 min_index = cl_index1; 5191 } 5192 if (min_index == -1) 5193 break; 5194 5195 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr; 5196 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr; 5197 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags; 5198 5199 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr; 5200 } 5201 wbp->cl_number = 0; 5202 5203 cl_len = cl_index; 5204 5205 if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) { 5206 int i; 5207 5208 /* 5209 * determine if we appear to be writing the file sequentially 5210 * if not, by returning without having pushed any clusters 5211 * we will cause this vnode to be pushed into the sparse cluster mechanism 5212 * used for managing more random I/O patterns 5213 * 5214 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them... 5215 * that's why we're in try_push with PUSH_DELAY... 5216 * 5217 * check to make sure that all the clusters except the last one are 'full'... and that each cluster 5218 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above 5219 * so we can just make a simple pass through, up to, but not including the last one... 5220 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they 5221 * are sequential 5222 * 5223 * we let the last one be partial as long as it was adjacent to the previous one... 5224 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out 5225 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world... 5226 */ 5227 for (i = 0; i < MAX_CLUSTERS - 1; i++) { 5228 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) 5229 goto dont_try; 5230 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr) 5231 goto dont_try; 5232 } 5233 } 5234 for (cl_index = 0; cl_index < cl_len; cl_index++) { 5235 int flags; 5236 struct cl_extent cl; 5237 5238 flags = io_flags & (IO_PASSIVE|IO_CLOSE); 5239 5240 /* 5241 * try to push each cluster in turn... 
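 * (annotation: per-cluster CLW_IONOCACHE/CLW_IOPASSIVE flags are translated
 * back into IO_NOCACHE/IO_PASSIVE, PUSH_SYNC forces IO_SYNC, and unless
 * PUSH_ALL was requested we stop after pushing a single cluster)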
5242 */ 5243 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) 5244 flags |= IO_NOCACHE; 5245 5246 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) 5247 flags |= IO_PASSIVE; 5248 5249 if (push_flag & PUSH_SYNC) 5250 flags |= IO_SYNC; 5251 5252 cl.b_addr = l_clusters[cl_index].b_addr; 5253 cl.e_addr = l_clusters[cl_index].e_addr; 5254 5255 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); 5256 5257 l_clusters[cl_index].b_addr = 0; 5258 l_clusters[cl_index].e_addr = 0; 5259 5260 cl_pushed++; 5261 5262 if ( !(push_flag & PUSH_ALL) ) 5263 break; 5264 } 5265dont_try: 5266 if (cl_len > cl_pushed) { 5267 /* 5268 * we didn't push all of the clusters, so 5269 * lets try to merge them back in to the vnode 5270 */ 5271 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) { 5272 /* 5273 * we picked up some new clusters while we were trying to 5274 * push the old ones... this can happen because I've dropped 5275 * the vnode lock... the sum of the 5276 * leftovers plus the new cluster count exceeds our ability 5277 * to represent them, so switch to the sparse cluster mechanism 5278 * 5279 * collect the active public clusters... 5280 */ 5281 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 5282 5283 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) { 5284 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 5285 continue; 5286 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 5287 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 5288 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 5289 5290 cl_index1++; 5291 } 5292 /* 5293 * update the cluster count 5294 */ 5295 wbp->cl_number = cl_index1; 5296 5297 /* 5298 * and collect the original clusters that were moved into the 5299 * local storage for sorting purposes 5300 */ 5301 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); 5302 5303 } else { 5304 /* 5305 * we've got room to merge the leftovers back in 5306 * just append them starting at the next 'hole' 5307 * represented by wbp->cl_number 5308 */ 5309 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) { 5310 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) 5311 continue; 5312 5313 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; 5314 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; 5315 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; 5316 5317 cl_index1++; 5318 } 5319 /* 5320 * update the cluster count 5321 */ 5322 wbp->cl_number = cl_index1; 5323 } 5324 } 5325 return (MAX_CLUSTERS - wbp->cl_number); 5326} 5327 5328 5329 5330static int 5331cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5332{ 5333 upl_page_info_t *pl; 5334 upl_t upl; 5335 vm_offset_t upl_offset; 5336 int upl_size; 5337 off_t upl_f_offset; 5338 int pages_in_upl; 5339 int start_pg; 5340 int last_pg; 5341 int io_size; 5342 int io_flags; 5343 int upl_flags; 5344 int bflag; 5345 int size; 5346 int error = 0; 5347 int retval; 5348 kern_return_t kret; 5349 5350 if (flags & IO_PASSIVE) 5351 bflag = CL_PASSIVE; 5352 else 5353 bflag = 0; 5354 5355 if (flags & IO_SKIP_ENCRYPTION) 5356 bflag |= CL_ENCRYPTED; 5357 5358 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, 5359 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); 5360 5361 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) { 5362 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | 
DBG_FUNC_END, 1, 0, 0, 0, 0); 5363 5364 return (0); 5365 } 5366 upl_size = pages_in_upl * PAGE_SIZE; 5367 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 5368 5369 if (upl_f_offset + upl_size >= EOF) { 5370 5371 if (upl_f_offset >= EOF) { 5372 /* 5373 * must have truncated the file and missed 5374 * clearing a dangling cluster (i.e. it's completely 5375 * beyond the new EOF 5376 */ 5377 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0); 5378 5379 return(0); 5380 } 5381 size = EOF - upl_f_offset; 5382 5383 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK; 5384 pages_in_upl = upl_size / PAGE_SIZE; 5385 } else 5386 size = upl_size; 5387 5388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0); 5389 5390 /* 5391 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior 5392 * 5393 * - only pages that are currently dirty are returned... these are the ones we need to clean 5394 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set 5395 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page 5396 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if 5397 * someone dirties this page while the I/O is in progress, we don't lose track of the new state 5398 * 5399 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard) 5400 */ 5401 5402 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) 5403 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED; 5404 else 5405 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE; 5406 5407 kret = ubc_create_upl(vp, 5408 upl_f_offset, 5409 upl_size, 5410 &upl, 5411 &pl, 5412 upl_flags); 5413 if (kret != KERN_SUCCESS) 5414 panic("cluster_push: failed to get pagelist"); 5415 5416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0); 5417 5418 /* 5419 * since we only asked for the dirty pages back 5420 * it's possible that we may only get a few or even none, so... 5421 * before we start marching forward, we must make sure we know 5422 * where the last present page is in the UPL, otherwise we could 5423 * end up working with a freed upl due to the FREE_ON_EMPTY semantics 5424 * employed by commit_range and abort_range. 5425 */ 5426 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { 5427 if (upl_page_present(pl, last_pg)) 5428 break; 5429 } 5430 pages_in_upl = last_pg + 1; 5431 5432 if (pages_in_upl == 0) { 5433 ubc_upl_abort(upl, 0); 5434 5435 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0); 5436 return(0); 5437 } 5438 5439 for (last_pg = 0; last_pg < pages_in_upl; ) { 5440 /* 5441 * find the next dirty page in the UPL 5442 * this will become the first page in the 5443 * next I/O to generate 5444 */ 5445 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { 5446 if (upl_dirty_page(pl, start_pg)) 5447 break; 5448 if (upl_page_present(pl, start_pg)) 5449 /* 5450 * RET_ONLY_DIRTY will return non-dirty 'precious' pages 5451 * just release these unchanged since we're not going 5452 * to steal them or change their state 5453 */ 5454 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); 5455 } 5456 if (start_pg >= pages_in_upl) 5457 /* 5458 * done... 
no more dirty pages to push 5459 */ 5460 break; 5461 if (start_pg > last_pg) 5462 /* 5463 * skipped over some non-dirty pages 5464 */ 5465 size -= ((start_pg - last_pg) * PAGE_SIZE); 5466 5467 /* 5468 * find a range of dirty pages to write 5469 */ 5470 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { 5471 if (!upl_dirty_page(pl, last_pg)) 5472 break; 5473 } 5474 upl_offset = start_pg * PAGE_SIZE; 5475 5476 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE); 5477 5478 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag; 5479 5480 if ( !(flags & IO_SYNC)) 5481 io_flags |= CL_ASYNC; 5482 5483 if (flags & IO_CLOSE) 5484 io_flags |= CL_CLOSE; 5485 5486 if (flags & IO_NOCACHE) 5487 io_flags |= CL_NOCACHE; 5488 5489 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, 5490 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5491 5492 if (error == 0 && retval) 5493 error = retval; 5494 5495 size -= io_size; 5496 } 5497 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0); 5498 5499 return(error); 5500} 5501 5502 5503/* 5504 * sparse_cluster_switch is called with the write behind lock held 5505 */ 5506static void 5507sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 5508{ 5509 int cl_index; 5510 5511 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0); 5512 5513 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { 5514 int flags; 5515 struct cl_extent cl; 5516 5517 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) { 5518 5519 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) { 5520 if (flags & UPL_POP_DIRTY) { 5521 cl.e_addr = cl.b_addr + 1; 5522 5523 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg); 5524 } 5525 } 5526 } 5527 } 5528 wbp->cl_number = 0; 5529 5530 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0); 5531} 5532 5533 5534/* 5535 * sparse_cluster_push must be called with the write-behind lock held if the scmap is 5536 * still associated with the write-behind context... 
however, if the scmap has been disassociated 5537 * from the write-behind context (the cluster_push case), the wb lock is not held 5538 */ 5539static void 5540sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) 5541{ 5542 struct cl_extent cl; 5543 off_t offset; 5544 u_int length; 5545 5546 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0); 5547 5548 if (push_flag & PUSH_ALL) 5549 vfs_drt_control(scmap, 1); 5550 5551 for (;;) { 5552 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) 5553 break; 5554 5555 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); 5556 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); 5557 5558 cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); 5559 5560 if ( !(push_flag & PUSH_ALL) ) 5561 break; 5562 } 5563 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); 5564} 5565 5566 5567/* 5568 * sparse_cluster_add is called with the write behind lock held 5569 */ 5570static void 5571sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) 5572{ 5573 u_int new_dirty; 5574 u_int length; 5575 off_t offset; 5576 5577 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0); 5578 5579 offset = (off_t)(cl->b_addr * PAGE_SIZE_64); 5580 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE; 5581 5582 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) { 5583 /* 5584 * no room left in the map 5585 * only a partial update was done 5586 * push out some pages and try again 5587 */ 5588 sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); 5589 5590 offset += (new_dirty * PAGE_SIZE_64); 5591 length -= (new_dirty * PAGE_SIZE); 5592 } 5593 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); 5594} 5595 5596 5597static int 5598cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg) 5599{ 5600 upl_page_info_t *pl; 5601 upl_t upl; 5602 addr64_t ubc_paddr; 5603 kern_return_t kret; 5604 int error = 0; 5605 int did_read = 0; 5606 int abort_flags; 5607 int upl_flags; 5608 int bflag; 5609 5610 if (flags & IO_PASSIVE) 5611 bflag = CL_PASSIVE; 5612 else 5613 bflag = 0; 5614 5615 if (flags & IO_NOCACHE) 5616 bflag |= CL_NOCACHE; 5617 5618 upl_flags = UPL_SET_LITE; 5619 5620 if ( !(flags & CL_READ) ) { 5621 /* 5622 * "write" operation: let the UPL subsystem know 5623 * that we intend to modify the buffer cache pages 5624 * we're gathering. 5625 */ 5626 upl_flags |= UPL_WILL_MODIFY; 5627 } else { 5628 /* 5629 * indicate that there is no need to pull the 5630 * mapping for this page... we're only going 5631 * to read from it, not modify it. 
5632 */ 5633 upl_flags |= UPL_FILE_IO; 5634 } 5635 kret = ubc_create_upl(vp, 5636 uio->uio_offset & ~PAGE_MASK_64, 5637 PAGE_SIZE, 5638 &upl, 5639 &pl, 5640 upl_flags); 5641 5642 if (kret != KERN_SUCCESS) 5643 return(EINVAL); 5644 5645 if (!upl_valid_page(pl, 0)) { 5646 /* 5647 * issue a synchronous read to cluster_io 5648 */ 5649 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5650 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5651 if (error) { 5652 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); 5653 5654 return(error); 5655 } 5656 did_read = 1; 5657 } 5658 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); 5659 5660/* 5661 * NOTE: There is no prototype for the following in BSD. It, and the definitions 5662 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in 5663 * osfmk/ppc/mappings.h. They are not included here because there appears to be no 5664 * way to do so without exporting them to kexts as well. 5665 */ 5666 if (flags & CL_READ) 5667// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */ 5668 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */ 5669 else 5670// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */ 5671 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */ 5672 5673 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) { 5674 /* 5675 * issue a synchronous write to cluster_io 5676 */ 5677 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, 5678 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); 5679 } 5680 if (error == 0) 5681 uio_update(uio, (user_size_t)xsize); 5682 5683 if (did_read) 5684 abort_flags = UPL_ABORT_FREE_ON_EMPTY; 5685 else 5686 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; 5687 5688 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags); 5689 5690 return (error); 5691} 5692 5693 5694 5695int 5696cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) 5697{ 5698 int pg_offset; 5699 int pg_index; 5700 int csize; 5701 int segflg; 5702 int retval = 0; 5703 int xsize; 5704 upl_page_info_t *pl; 5705 5706 xsize = *io_resid; 5707 5708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5709 (int)uio->uio_offset, upl_offset, xsize, 0, 0); 5710 5711 segflg = uio->uio_segflg; 5712 5713 switch(segflg) { 5714 5715 case UIO_USERSPACE32: 5716 case UIO_USERISPACE32: 5717 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5718 break; 5719 5720 case UIO_USERSPACE: 5721 case UIO_USERISPACE: 5722 uio->uio_segflg = UIO_PHYS_USERSPACE; 5723 break; 5724 5725 case UIO_USERSPACE64: 5726 case UIO_USERISPACE64: 5727 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5728 break; 5729 5730 case UIO_SYSSPACE: 5731 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5732 break; 5733 5734 } 5735 pl = ubc_upl_pageinfo(upl); 5736 5737 pg_index = upl_offset / PAGE_SIZE; 5738 pg_offset = upl_offset & PAGE_MASK; 5739 csize = min(PAGE_SIZE - pg_offset, xsize); 5740 5741 while (xsize && retval == 0) { 5742 addr64_t paddr; 5743 5744 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset; 5745 5746 retval = uiomove64(paddr, csize, uio); 5747 5748 pg_index += 1; 5749 pg_offset = 0; 5750 
xsize -= csize; 5751 csize = min(PAGE_SIZE, xsize); 5752 } 5753 *io_resid = xsize; 5754 5755 uio->uio_segflg = segflg; 5756 5757 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5758 (int)uio->uio_offset, xsize, retval, segflg, 0); 5759 5760 return (retval); 5761} 5762 5763 5764int 5765cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty) 5766{ 5767 5768 return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1)); 5769} 5770 5771 5772static int 5773cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference) 5774{ 5775 int segflg; 5776 int io_size; 5777 int xsize; 5778 int start_offset; 5779 int retval = 0; 5780 memory_object_control_t control; 5781 5782 io_size = *io_resid; 5783 5784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, 5785 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); 5786 5787 control = ubc_getobject(vp, UBC_FLAGS_NONE); 5788 5789 if (control == MEMORY_OBJECT_CONTROL_NULL) { 5790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5791 (int)uio->uio_offset, io_size, retval, 3, 0); 5792 5793 return(0); 5794 } 5795 segflg = uio->uio_segflg; 5796 5797 switch(segflg) { 5798 5799 case UIO_USERSPACE32: 5800 case UIO_USERISPACE32: 5801 uio->uio_segflg = UIO_PHYS_USERSPACE32; 5802 break; 5803 5804 case UIO_USERSPACE64: 5805 case UIO_USERISPACE64: 5806 uio->uio_segflg = UIO_PHYS_USERSPACE64; 5807 break; 5808 5809 case UIO_USERSPACE: 5810 case UIO_USERISPACE: 5811 uio->uio_segflg = UIO_PHYS_USERSPACE; 5812 break; 5813 5814 case UIO_SYSSPACE: 5815 uio->uio_segflg = UIO_PHYS_SYSSPACE; 5816 break; 5817 } 5818 5819 if ( (io_size = *io_resid) ) { 5820 start_offset = (int)(uio->uio_offset & PAGE_MASK_64); 5821 xsize = uio_resid(uio); 5822 5823 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio, 5824 start_offset, io_size, mark_dirty, take_reference); 5825 xsize -= uio_resid(uio); 5826 io_size -= xsize; 5827 } 5828 uio->uio_segflg = segflg; 5829 *io_resid = io_size; 5830 5831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, 5832 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0); 5833 5834 return(retval); 5835} 5836 5837 5838int 5839is_file_clean(vnode_t vp, off_t filesize) 5840{ 5841 off_t f_offset; 5842 int flags; 5843 int total_dirty = 0; 5844 5845 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) { 5846 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) { 5847 if (flags & UPL_POP_DIRTY) { 5848 total_dirty++; 5849 } 5850 } 5851 } 5852 if (total_dirty) 5853 return(EINVAL); 5854 5855 return (0); 5856} 5857 5858 5859 5860/* 5861 * Dirty region tracking/clustering mechanism. 5862 * 5863 * This code (vfs_drt_*) provides a mechanism for tracking and clustering 5864 * dirty regions within a larger space (file). It is primarily intended to 5865 * support clustering in large files with many dirty areas. 5866 * 5867 * The implementation assumes that the dirty regions are pages. 5868 * 5869 * To represent dirty pages within the file, we store bit vectors in a 5870 * variable-size circular hash. 5871 */ 5872 5873/* 5874 * Bitvector size. This determines the number of pages we group in a 5875 * single hashtable entry. Each hashtable entry is aligned to this 5876 * size within the file. 5877 */ 5878#define DRT_BITVECTOR_PAGES 256 5879 5880/* 5881 * File offset handling. 
5882 * 5883 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; 5884 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1) 5885 */ 5886#define DRT_ADDRESS_MASK (~((1 << 20) - 1)) 5887#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK) 5888 5889/* 5890 * Hashtable address field handling. 5891 * 5892 * The low-order bits of the hashtable address are used to conserve 5893 * space. 5894 * 5895 * DRT_HASH_COUNT_MASK must be large enough to store the range 5896 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value 5897 * to indicate that the bucket is actually unoccupied. 5898 */ 5899#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK) 5900#define DRT_HASH_SET_ADDRESS(scm, i, a) \ 5901 do { \ 5902 (scm)->scm_hashtable[(i)].dhe_control = \ 5903 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \ 5904 } while (0) 5905#define DRT_HASH_COUNT_MASK 0x1ff 5906#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK) 5907#define DRT_HASH_SET_COUNT(scm, i, c) \ 5908 do { \ 5909 (scm)->scm_hashtable[(i)].dhe_control = \ 5910 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \ 5911 } while (0) 5912#define DRT_HASH_CLEAR(scm, i) \ 5913 do { \ 5914 (scm)->scm_hashtable[(i)].dhe_control = 0; \ 5915 } while (0) 5916#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK) 5917#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK) 5918#define DRT_HASH_COPY(oscm, oi, scm, i) \ 5919 do { \ 5920 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \ 5921 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \ 5922 } while(0); 5923 5924 5925/* 5926 * Hash table moduli. 5927 * 5928 * Since the hashtable entry's size is dependent on the size of 5929 * the bitvector, and since the hashtable size is constrained to 5930 * both being prime and fitting within the desired allocation 5931 * size, these values need to be manually determined. 5932 * 5933 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes. 5934 * 5935 * The small hashtable allocation is 1024 bytes, so the modulus is 23. 5936 * The large hashtable allocation is 16384 bytes, so the modulus is 401. 5937 */ 5938#define DRT_HASH_SMALL_MODULUS 23 5939#define DRT_HASH_LARGE_MODULUS 401 5940 5941/* 5942 * Physical memory required before the large hash modulus is permitted. 5943 * 5944 * On small memory systems, the large hash modulus can lead to physical 5945 * memory starvation, so we avoid using it there. 5946 */ 5947#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */ 5948 5949#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ 5950#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ 5951 5952/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */ 5953 5954/* 5955 * Hashtable bitvector handling. 5956 * 5957 * Bitvector fields are 32 bits long. 
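 *
 * As a worked example of the macros below (illustrative, not part of the
 * original comment): page index 37 within an entry lives in
 * dhe_bitvector[37 / 32] == dhe_bitvector[1], at bit position 37 % 32 == 5,
 * so DRT_HASH_SET_BIT ORs (1 << 5) into that word and DRT_HASH_TEST_BIT
 * masks it back out.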
5958 */ 5959 5960#define DRT_HASH_SET_BIT(scm, i, bit) \ 5961 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32)) 5962 5963#define DRT_HASH_CLEAR_BIT(scm, i, bit) \ 5964 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32)) 5965 5966#define DRT_HASH_TEST_BIT(scm, i, bit) \ 5967 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32))) 5968 5969#define DRT_BITVECTOR_CLEAR(scm, i) \ 5970 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) 5971 5972#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \ 5973 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \ 5974 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \ 5975 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) 5976 5977 5978 5979/* 5980 * Hashtable entry. 5981 */ 5982struct vfs_drt_hashentry { 5983 u_int64_t dhe_control; 5984 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; 5985}; 5986 5987/* 5988 * Dirty Region Tracking structure. 5989 * 5990 * The hashtable is allocated entirely inside the DRT structure. 5991 * 5992 * The hash is a simple circular prime modulus arrangement, the structure 5993 * is resized from small to large if it overflows. 5994 */ 5995 5996struct vfs_drt_clustermap { 5997 u_int32_t scm_magic; /* sanity/detection */ 5998#define DRT_SCM_MAGIC 0x12020003 5999 u_int32_t scm_modulus; /* current ring size */ 6000 u_int32_t scm_buckets; /* number of occupied buckets */ 6001 u_int32_t scm_lastclean; /* last entry we cleaned */ 6002 u_int32_t scm_iskips; /* number of slot skips */ 6003 6004 struct vfs_drt_hashentry scm_hashtable[0]; 6005}; 6006 6007 6008#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus) 6009#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus) 6010 6011/* 6012 * Debugging codes and arguments. 6013 */ 6014#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */ 6015#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */ 6016#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */ 6017#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */ 6018#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length, 6019 * dirty */ 6020 /* 0, setcount */ 6021 /* 1 (clean, no map) */ 6022 /* 2 (map alloc fail) */ 6023 /* 3, resid (partial) */ 6024#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87)) 6025#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets, 6026 * lastclean, iskips */ 6027 6028 6029static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp); 6030static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap); 6031static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap, 6032 u_int64_t offset, int *indexp); 6033static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, 6034 u_int64_t offset, 6035 int *indexp, 6036 int recursed); 6037static kern_return_t vfs_drt_do_mark_pages( 6038 void **cmapp, 6039 u_int64_t offset, 6040 u_int length, 6041 u_int *setcountp, 6042 int dirty); 6043static void vfs_drt_trace( 6044 struct vfs_drt_clustermap *cmap, 6045 int code, 6046 int arg1, 6047 int arg2, 6048 int arg3, 6049 int arg4); 6050 6051 6052/* 6053 * Allocate and initialise a sparse cluster map. 6054 * 6055 * Will allocate a new map, resize or compact an existing map. 6056 * 6057 * XXX we should probably have at least one intermediate map size, 6058 * as the 1:16 ratio seems a bit drastic. 
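 *
 * Concretely, given the constants above: a new map starts at
 * DRT_HASH_SMALL_MODULUS (23) buckets; on reallocation it upgrades to
 * DRT_HASH_LARGE_MODULUS (401) buckets only when more than
 * (DRT_HASH_SMALL_MODULUS - 5) buckets are active and the system has at
 * least DRT_HASH_LARGE_MEMORY_REQUIRED of physical memory, otherwise the
 * existing entries are simply re-hashed into a map of the same size.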
6059 */ 6060static kern_return_t 6061vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) 6062{ 6063 struct vfs_drt_clustermap *cmap, *ocmap; 6064 kern_return_t kret; 6065 u_int64_t offset; 6066 u_int32_t i; 6067 int nsize, active_buckets, index, copycount; 6068 6069 ocmap = NULL; 6070 if (cmapp != NULL) 6071 ocmap = *cmapp; 6072 6073 /* 6074 * Decide on the size of the new map. 6075 */ 6076 if (ocmap == NULL) { 6077 nsize = DRT_HASH_SMALL_MODULUS; 6078 } else { 6079 /* count the number of active buckets in the old map */ 6080 active_buckets = 0; 6081 for (i = 0; i < ocmap->scm_modulus; i++) { 6082 if (!DRT_HASH_VACANT(ocmap, i) && 6083 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) 6084 active_buckets++; 6085 } 6086 /* 6087 * If we're currently using the small allocation, check to 6088 * see whether we should grow to the large one. 6089 */ 6090 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { 6091 /* 6092 * If the ring is nearly full and we are allowed to 6093 * use the large modulus, upgrade. 6094 */ 6095 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) && 6096 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) { 6097 nsize = DRT_HASH_LARGE_MODULUS; 6098 } else { 6099 nsize = DRT_HASH_SMALL_MODULUS; 6100 } 6101 } else { 6102 /* already using the large modulus */ 6103 nsize = DRT_HASH_LARGE_MODULUS; 6104 /* 6105 * If the ring is completely full, there's 6106 * nothing useful for us to do. Behave as 6107 * though we had compacted into the new 6108 * array and return. 6109 */ 6110 if (active_buckets >= DRT_HASH_LARGE_MODULUS) 6111 return(KERN_SUCCESS); 6112 } 6113 } 6114 6115 /* 6116 * Allocate and initialise the new map. 6117 */ 6118 6119 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, 6120 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 6121 if (kret != KERN_SUCCESS) 6122 return(kret); 6123 cmap->scm_magic = DRT_SCM_MAGIC; 6124 cmap->scm_modulus = nsize; 6125 cmap->scm_buckets = 0; 6126 cmap->scm_lastclean = 0; 6127 cmap->scm_iskips = 0; 6128 for (i = 0; i < cmap->scm_modulus; i++) { 6129 DRT_HASH_CLEAR(cmap, i); 6130 DRT_HASH_VACATE(cmap, i); 6131 DRT_BITVECTOR_CLEAR(cmap, i); 6132 } 6133 6134 /* 6135 * If there's an old map, re-hash entries from it into the new map. 6136 */ 6137 copycount = 0; 6138 if (ocmap != NULL) { 6139 for (i = 0; i < ocmap->scm_modulus; i++) { 6140 /* skip empty buckets */ 6141 if (DRT_HASH_VACANT(ocmap, i) || 6142 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) 6143 continue; 6144 /* get new index */ 6145 offset = DRT_HASH_GET_ADDRESS(ocmap, i); 6146 kret = vfs_drt_get_index(&cmap, offset, &index, 1); 6147 if (kret != KERN_SUCCESS) { 6148 /* XXX need to bail out gracefully here */ 6149 panic("vfs_drt: new cluster map mysteriously too small"); 6150 index = 0; 6151 } 6152 /* copy */ 6153 DRT_HASH_COPY(ocmap, i, cmap, index); 6154 copycount++; 6155 } 6156 } 6157 6158 /* log what we've done */ 6159 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0); 6160 6161 /* 6162 * It's important to ensure that *cmapp always points to 6163 * a valid map, so we must overwrite it before freeing 6164 * the old map. 6165 */ 6166 *cmapp = cmap; 6167 if (ocmap != NULL) { 6168 /* emit stats into trace buffer */ 6169 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA, 6170 ocmap->scm_modulus, 6171 ocmap->scm_buckets, 6172 ocmap->scm_lastclean, 6173 ocmap->scm_iskips); 6174 6175 vfs_drt_free_map(ocmap); 6176 } 6177 return(KERN_SUCCESS); 6178} 6179 6180 6181/* 6182 * Free a sparse cluster map. 
6183 */ 6184static kern_return_t 6185vfs_drt_free_map(struct vfs_drt_clustermap *cmap) 6186{ 6187 kmem_free(kernel_map, (vm_offset_t)cmap, 6188 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); 6189 return(KERN_SUCCESS); 6190} 6191 6192 6193/* 6194 * Find the hashtable slot currently occupied by an entry for the supplied offset. 6195 */ 6196static kern_return_t 6197vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp) 6198{ 6199 int index; 6200 u_int32_t i; 6201 6202 offset = DRT_ALIGN_ADDRESS(offset); 6203 index = DRT_HASH(cmap, offset); 6204 6205 /* traverse the hashtable */ 6206 for (i = 0; i < cmap->scm_modulus; i++) { 6207 6208 /* 6209 * If the slot is vacant, we can stop. 6210 */ 6211 if (DRT_HASH_VACANT(cmap, index)) 6212 break; 6213 6214 /* 6215 * If the address matches our offset, we have success. 6216 */ 6217 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) { 6218 *indexp = index; 6219 return(KERN_SUCCESS); 6220 } 6221 6222 /* 6223 * Move to the next slot, try again. 6224 */ 6225 index = DRT_HASH_NEXT(cmap, index); 6226 } 6227 /* 6228 * It's not there. 6229 */ 6230 return(KERN_FAILURE); 6231} 6232 6233/* 6234 * Find the hashtable slot for the supplied offset. If we haven't allocated 6235 * one yet, allocate one and populate the address field. Note that it will 6236 * not have a nonzero page count and thus will still technically be free, so 6237 * in the case where we are called to clean pages, the slot will remain free. 6238 */ 6239static kern_return_t 6240vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed) 6241{ 6242 struct vfs_drt_clustermap *cmap; 6243 kern_return_t kret; 6244 u_int32_t index; 6245 u_int32_t i; 6246 6247 cmap = *cmapp; 6248 6249 /* look for an existing entry */ 6250 kret = vfs_drt_search_index(cmap, offset, indexp); 6251 if (kret == KERN_SUCCESS) 6252 return(kret); 6253 6254 /* need to allocate an entry */ 6255 offset = DRT_ALIGN_ADDRESS(offset); 6256 index = DRT_HASH(cmap, offset); 6257 6258 /* scan from the index forwards looking for a vacant slot */ 6259 for (i = 0; i < cmap->scm_modulus; i++) { 6260 /* slot vacant? */ 6261 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) { 6262 cmap->scm_buckets++; 6263 if (index < cmap->scm_lastclean) 6264 cmap->scm_lastclean = index; 6265 DRT_HASH_SET_ADDRESS(cmap, index, offset); 6266 DRT_HASH_SET_COUNT(cmap, index, 0); 6267 DRT_BITVECTOR_CLEAR(cmap, index); 6268 *indexp = index; 6269 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0); 6270 return(KERN_SUCCESS); 6271 } 6272 cmap->scm_iskips += i; 6273 index = DRT_HASH_NEXT(cmap, index); 6274 } 6275 6276 /* 6277 * We haven't found a vacant slot, so the map is full. If we're not 6278 * already recursed, try reallocating/compacting it. 6279 */ 6280 if (recursed) 6281 return(KERN_FAILURE); 6282 kret = vfs_drt_alloc_map(cmapp); 6283 if (kret == KERN_SUCCESS) { 6284 /* now try to insert again */ 6285 kret = vfs_drt_get_index(cmapp, offset, indexp, 1); 6286 } 6287 return(kret); 6288} 6289 6290/* 6291 * Implementation of set dirty/clean. 6292 * 6293 * In the 'clean' case, not finding a map is OK. 
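 *
 * The region is consumed DRT_BITVECTOR_PAGES pages at a time: each pass
 * looks up (or allocates) the hashtable entry covering the current offset,
 * sets or clears the per-page bits for the pages that fall within that
 * entry, updates the entry's summary count, and then advances offset and
 * length to the next chunk.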
6294 */ 6295static kern_return_t 6296vfs_drt_do_mark_pages( 6297 void **private, 6298 u_int64_t offset, 6299 u_int length, 6300 u_int *setcountp, 6301 int dirty) 6302{ 6303 struct vfs_drt_clustermap *cmap, **cmapp; 6304 kern_return_t kret; 6305 int i, index, pgoff, pgcount, setcount, ecount; 6306 6307 cmapp = (struct vfs_drt_clustermap **)private; 6308 cmap = *cmapp; 6309 6310 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0); 6311 6312 if (setcountp != NULL) 6313 *setcountp = 0; 6314 6315 /* allocate a cluster map if we don't already have one */ 6316 if (cmap == NULL) { 6317 /* no cluster map, nothing to clean */ 6318 if (!dirty) { 6319 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0); 6320 return(KERN_SUCCESS); 6321 } 6322 kret = vfs_drt_alloc_map(cmapp); 6323 if (kret != KERN_SUCCESS) { 6324 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0); 6325 return(kret); 6326 } 6327 } 6328 setcount = 0; 6329 6330 /* 6331 * Iterate over the length of the region. 6332 */ 6333 while (length > 0) { 6334 /* 6335 * Get the hashtable index for this offset. 6336 * 6337 * XXX this will add blank entries if we are clearing a range 6338 * that hasn't been dirtied. 6339 */ 6340 kret = vfs_drt_get_index(cmapp, offset, &index, 0); 6341 cmap = *cmapp; /* may have changed! */ 6342 /* this may be a partial-success return */ 6343 if (kret != KERN_SUCCESS) { 6344 if (setcountp != NULL) 6345 *setcountp = setcount; 6346 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0); 6347 6348 return(kret); 6349 } 6350 6351 /* 6352 * Work out how many pages we're modifying in this 6353 * hashtable entry. 6354 */ 6355 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE; 6356 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff)); 6357 6358 /* 6359 * Iterate over pages, dirty/clearing as we go. 6360 */ 6361 ecount = DRT_HASH_GET_COUNT(cmap, index); 6362 for (i = 0; i < pgcount; i++) { 6363 if (dirty) { 6364 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { 6365 DRT_HASH_SET_BIT(cmap, index, pgoff + i); 6366 ecount++; 6367 setcount++; 6368 } 6369 } else { 6370 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { 6371 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i); 6372 ecount--; 6373 setcount++; 6374 } 6375 } 6376 } 6377 DRT_HASH_SET_COUNT(cmap, index, ecount); 6378 6379 offset += pgcount * PAGE_SIZE; 6380 length -= pgcount * PAGE_SIZE; 6381 } 6382 if (setcountp != NULL) 6383 *setcountp = setcount; 6384 6385 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0); 6386 6387 return(KERN_SUCCESS); 6388} 6389 6390/* 6391 * Mark a set of pages as dirty/clean. 6392 * 6393 * This is a public interface. 6394 * 6395 * cmapp 6396 * Pointer to storage suitable for holding a pointer. Note that 6397 * this must either be NULL or a value set by this function. 6398 * 6399 * size 6400 * Current file size in bytes. 6401 * 6402 * offset 6403 * Offset of the first page to be marked as dirty, in bytes. Must be 6404 * page-aligned. 6405 * 6406 * length 6407 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE. 6408 * 6409 * setcountp 6410 * Number of pages newly marked dirty by this call (optional). 6411 * 6412 * Returns KERN_SUCCESS if all the pages were successfully marked. 
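 *
 * If the map fills up part-way through, the call does not return
 * KERN_SUCCESS, but *setcountp still reports how many pages were marked
 * before space ran out; sparse_cluster_add() above uses that to push some
 * clusters out and then retry the remainder, e.g.:
 *
 *	offset += (new_dirty * PAGE_SIZE_64);
 *	length -= (new_dirty * PAGE_SIZE);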
6413 */ 6414static kern_return_t 6415vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp) 6416{ 6417 /* XXX size unused, drop from interface */ 6418 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1)); 6419} 6420 6421#if 0 6422static kern_return_t 6423vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length) 6424{ 6425 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0)); 6426} 6427#endif 6428 6429/* 6430 * Get a cluster of dirty pages. 6431 * 6432 * This is a public interface. 6433 * 6434 * cmapp 6435 * Pointer to storage managed by vfs_drt_mark_pages. Note that this must 6436 * be NULL or a value set by vfs_drt_mark_pages. 6437 * 6438 * offsetp 6439 * Returns the byte offset into the file of the first page in the cluster. 6440 * 6441 * lengthp 6442 * Returns the length in bytes of the cluster of dirty pages. 6443 * 6444 * Returns success if a cluster was found. If KERN_FAILURE is returned, there 6445 * are no dirty pages meeting the minimum size criteria. Private storage will 6446 * be released if there are no more dirty pages left in the map. 6447 * 6448 */ 6449static kern_return_t 6450vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp) 6451{ 6452 struct vfs_drt_clustermap *cmap; 6453 u_int64_t offset; 6454 u_int length; 6455 u_int32_t j; 6456 int index, i, fs, ls; 6457 6458 /* sanity */ 6459 if ((cmapp == NULL) || (*cmapp == NULL)) 6460 return(KERN_FAILURE); 6461 cmap = *cmapp; 6462 6463 /* walk the hashtable */ 6464 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) { 6465 index = DRT_HASH(cmap, offset); 6466 6467 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) 6468 continue; 6469 6470 /* scan the bitfield for a string of bits */ 6471 fs = -1; 6472 6473 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) { 6474 if (DRT_HASH_TEST_BIT(cmap, index, i)) { 6475 fs = i; 6476 break; 6477 } 6478 } 6479 if (fs == -1) { 6480 /* didn't find any bits set */ 6481 panic("vfs_drt: entry summary count > 0 but no bits set in map"); 6482 } 6483 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) { 6484 if (!DRT_HASH_TEST_BIT(cmap, index, i)) 6485 break; 6486 } 6487 6488 /* compute offset and length, mark pages clean */ 6489 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs); 6490 length = ls * PAGE_SIZE; 6491 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0); 6492 cmap->scm_lastclean = index; 6493 6494 /* return successful */ 6495 *offsetp = (off_t)offset; 6496 *lengthp = length; 6497 6498 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0); 6499 return(KERN_SUCCESS); 6500 } 6501 /* 6502 * We didn't find anything... 
hashtable is empty 6503 * emit stats into trace buffer and 6504 * then free it 6505 */ 6506 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, 6507 cmap->scm_modulus, 6508 cmap->scm_buckets, 6509 cmap->scm_lastclean, 6510 cmap->scm_iskips); 6511 6512 vfs_drt_free_map(cmap); 6513 *cmapp = NULL; 6514 6515 return(KERN_FAILURE); 6516} 6517 6518 6519static kern_return_t 6520vfs_drt_control(void **cmapp, int op_type) 6521{ 6522 struct vfs_drt_clustermap *cmap; 6523 6524 /* sanity */ 6525 if ((cmapp == NULL) || (*cmapp == NULL)) 6526 return(KERN_FAILURE); 6527 cmap = *cmapp; 6528 6529 switch (op_type) { 6530 case 0: 6531 /* emit stats into trace buffer */ 6532 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, 6533 cmap->scm_modulus, 6534 cmap->scm_buckets, 6535 cmap->scm_lastclean, 6536 cmap->scm_iskips); 6537 6538 vfs_drt_free_map(cmap); 6539 *cmapp = NULL; 6540 break; 6541 6542 case 1: 6543 cmap->scm_lastclean = 0; 6544 break; 6545 } 6546 return(KERN_SUCCESS); 6547} 6548 6549 6550 6551/* 6552 * Emit a summary of the state of the clustermap into the trace buffer 6553 * along with some caller-provided data. 6554 */ 6555#if KDEBUG 6556static void 6557vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4) 6558{ 6559 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0); 6560} 6561#else 6562static void 6563vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code, 6564 __unused int arg1, __unused int arg2, __unused int arg3, 6565 __unused int arg4) 6566{ 6567} 6568#endif 6569 6570#if 0 6571/* 6572 * Perform basic sanity check on the hash entry summary count 6573 * vs. the actual bits set in the entry. 6574 */ 6575static void 6576vfs_drt_sanity(struct vfs_drt_clustermap *cmap) 6577{ 6578 int index, i; 6579 int bits_on; 6580 6581 for (index = 0; index < cmap->scm_modulus; index++) { 6582 if (DRT_HASH_VACANT(cmap, index)) 6583 continue; 6584 6585 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) { 6586 if (DRT_HASH_TEST_BIT(cmap, index, i)) 6587 bits_on++; 6588 } 6589 if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) 6590 panic("bits_on = %d, index = %d\n", bits_on, index); 6591 } 6592} 6593#endif 6594
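
#if 0
/*
 * Illustrative sketch of the vfs_drt_* lifecycle (never compiled), combining
 * what sparse_cluster_add() and sparse_cluster_push() do above; the locals
 * here are hypothetical and error handling is elided.
 */
static void
vfs_drt_lifecycle_sketch(void)
{
	void	*scmap = NULL;	/* private storage, managed by vfs_drt_mark_pages */
	u_int	new_dirty;
	off_t	offset;
	u_int	length;

	/*
	 * mark four page-aligned pages at the start of the file dirty...
	 * the map is allocated on first use
	 */
	(void) vfs_drt_mark_pages(&scmap, (off_t)0, 4 * PAGE_SIZE, &new_dirty);

	/*
	 * reset the clean-scan hint, then drain every dirty cluster...
	 * the map frees itself and NULLs scmap once it is empty
	 */
	vfs_drt_control(&scmap, 1);

	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* a real caller would push [offset, offset + length) here */
	}
}
#endif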