/*
 * Boot-time performance cache.
 *
 * We implement a shim layer for a block device which can be advised ahead
 * of time of the likely read pattern for the block device.
 *
 * Armed with a sorted version of this read pattern (the "playlist"), we
 * cluster the reads into a private cache, taking advantage of the
 * sequential I/O pattern. Incoming read requests are then satisfied from
 * the cache (if possible), or by forwarding to the original strategy routine.
 *
 * We also record the pattern of incoming read requests (the "history
 * list"), in order to provide for adaptation to changes in behaviour by
 * the calling layer.
 *
 * The hit rate of the cache is also measured, and caching will be disabled
 * preemptively if the hit rate is too low.
 *
 * The cache is controlled by a sysctl interface. Typical usage is as
 * follows:
 *
 * - Load a sorted read pattern.
 * - Wait for the operation(s) of interest to be performed.
 * - Fetch the new (unsorted) read pattern.
 *
 * Loading a read pattern initialises the cache; upon successful loading
 * the cache is active and readahead is being performed. After fetching the
 * recorded read pattern, the cache remains around until jettisoned due
 * to memory pressure or until the hit rate falls below a desirable level.
 *
 * Limitations:
 *
 * - only one copy of the cache may be active at a time
 */

/*
 * Build options.
 */
/*#define BC_DEBUG*/
/*#define BC_EXTRA_DEBUG*/

/*
 * Emulate readahead, don't actually perform it.
 */
/*#define EMULATE_ONLY*/

/*
 * Don't collect request history.
 */
/*#define NO_HISTORY*/

/*
 * Don't use history cluster chains (seems to be a malloc problem with these
 * enabled)
 */
/*#define STATIC_HISTORY*/

/*
 * Only cache for the volumes on the root disk
 */
#define ROOT_DISK_ONLY

/*
 * Keep a log of read history to aid in debugging.
 */
/*#define READ_HISTORY_BUFFER 1000*/
/*#define READ_HISTORY_ALL*/

/*
 * Ignore the batches on playlist entries.
 */
/*#define IGNORE_BATCH */

/*
 * Tunable parameters.
 */

/*
 * Minimum required physical memory before BootCache will
 * activate. Insufficient physical memory leads to paging
 * activity during boot, which is not repeatable and can't
 * be distinguished from the repeatable read patterns.
 *
 * Size in megabytes.
 */
static int BC_minimum_mem_size = 256;

/*
 * Number of seconds to wait in BC_strategy for readahead to catch up
 * with our request. Tuning issues:
 *  - Waiting too long may hold up the system behind this I/O.
 *  - Bypassing an I/O too early will reduce the hit rate and adversely
 *    affect performance on slow media.
 */
#define BC_READAHEAD_WAIT_DEFAULT	10
#define BC_READAHEAD_WAIT_CDROM		45
static int BC_wait_for_readahead = BC_READAHEAD_WAIT_DEFAULT;

/*
 * Cache hit ratio monitoring.
 *
 * The time overhead of a miss in the cache is approx 1/5000th of the
 * time it takes to satisfy an IO from a HDD. If our hit rate falls
 * below this, we're no longer providing a performance win and the
 * cache will be jettisoned.
 *
 * Make sure to keep the cache around for at least an hour
 * if the user hasn't logged in yet, though.
 */
static int BC_chit_max_num_IOs_since_last_hit = 10000;
static int BC_chit_prelogin_timeout_seconds = 3600;

/*
 * History recording timeout.
115 * 116 * If the history recording is not turned off, the history buffer 117 * will grow to its maximum size. This timeout will kill the history 118 * recording after the specified interval (in hz). 119 * 120 * User DVDs don't issue a "BootCacheControl stop", so they depend 121 * on the history recording timeout. 122 */ 123static int BC_history_timeout = 240 * 100; 124 125/* 126 * Trace macros 127 */ 128#ifndef DBG_BOOTCACHE 129#define DBG_BOOTCACHE 7 130#endif 131 132#define DBG_BC_TAG (1 << 0) 133#define DBG_BC_BATCH (1 << 1) 134 135#define DBG_BC_IO_HIT (1 << 2) 136#define DBG_BC_IO_HIT_STALLED (1 << 3) 137#define DBG_BC_IO_MISS (1 << 4) 138#define DBG_BC_IO_MISS_CUT_THROUGH (1 << 5) 139#define DBG_BC_PLAYBACK_IO (1 << 6) 140 141static int dbg_tag_count = 0; 142 143#ifdef BC_DEBUG 144# define MACH_DEBUG 145# define MACH_ASSERT 1 146# define message(fmt, args...) \ 147do { \ 148 microtime(&debug_currenttime); \ 149 timersub(&debug_currenttime, &debug_starttime, &debug_currenttime); \ 150 lck_mtx_lock(&debug_printlock); \ 151 printf("BootCache: %5d.%03d %24s[%4d]: " fmt "\n", (u_int)(debug_currenttime.tv_sec), (u_int)(debug_currenttime.tv_usec / 1000), __FUNCTION__, __LINE__, ##args); \ 152 lck_mtx_unlock(&debug_printlock); \ 153} while (0) 154 155# define debug(fmt, args...) message("*** " fmt, ##args) 156extern void Debugger(char *); 157#else 158# define debug(fmt, args...) do {} while (0) 159# define message(fmt, args...) printf("BootCache: " fmt "\n" , ##args) 160#endif 161 162#ifdef BC_EXTRA_DEBUG 163# define xdebug(fmt, args...) message("+++ " fmt, ##args) 164#else 165# define xdebug(fmt, args...) do {} while (0) 166#endif 167 168#include <sys/buf.h> 169#include <sys/conf.h> 170#include <sys/fcntl.h> 171#include <sys/kdebug.h> 172#include <sys/kernel.h> 173#include <sys/malloc.h> 174#include <sys/param.h> 175#include <sys/proc.h> 176#include <sys/types.h> 177#include <sys/sysctl.h> 178#include <sys/systm.h> 179#include <sys/vnode.h> 180#include <sys/mount.h> 181#include <sys/ubc.h> 182#include <sys/uio.h> 183 184#include <uuid/uuid.h> 185 186#include <mach/kmod.h> 187#include <mach/mach_types.h> 188#include <mach/vm_param.h> /* for max_mem */ 189 190#include <vm/vm_kern.h> 191 192#include <libkern/libkern.h> 193#include <libkern/OSAtomic.h> 194 195#include <kern/assert.h> 196#include <kern/host.h> 197#include <kern/kalloc.h> 198#include <kern/thread_call.h> 199#include <kern/locks.h> 200 201#include <IOKit/storage/IOMediaBSDClient.h> 202 203#include <pexpert/pexpert.h> 204 205#include "BootCache.h" 206 207#ifdef BC_DEBUG 208static struct timeval debug_starttime; 209static struct timeval debug_currenttime; 210static lck_mtx_t debug_printlock; 211#endif 212 213#define BC_MALLOC(_size, _type, _flags) ((ssize_t)(_size) < 0) ? NULL : _MALLOC((_size), (_type), (_flags)) 214 215/************************************** 216 * Structures for the readahead cache * 217 **************************************/ 218 219/* 220 * A cache extent. 221 * 222 * Each extent represents a contiguous range of disk blocks which are 223 * held in the cache. All values in bytes. 
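 *
 * For example (hypothetical values, purely illustrative): an extent
 * describing 64 KiB of disk data that starts 1 MiB into its volume and is
 * stored 256 KiB into the cache backing store, as part of batch 2, would
 * carry roughly:
 *
 *	ce_diskoffset  = 0x100000	(1 MiB into the device)
 *	ce_length      = 0x10000	(64 KiB of data)
 *	ce_cacheoffset = 0x40000	(256 KiB into the backing store)
 *	ce_batch       = 2
 *	ce_flags       = CE_IODONE	(readahead has filled it)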
224 */ 225struct BC_cache_extent { 226 lck_mtx_t ce_lock; 227 u_int64_t ce_diskoffset; /* physical offset on device */ 228 u_int64_t ce_length; /* data length */ 229 off_t ce_cacheoffset; /* offset into cache backing store */ 230 u_int16_t ce_batch; /* batch number */ 231 u_int16_t ce_flags; /* flags */ 232#define CE_ABORTED (1 << 0) /* extent no longer has data */ 233#define CE_IODONE (1 << 1) /* extent has been read */ 234#define CE_LOWPRI (1 << 2) /* extent is low priority */ 235#define CE_SHARED (1 << 3) /* extent is in the shared cache */ 236 int ce_mount_idx; /* the index of the mount for this extent */ 237 u_int32_t *ce_blockmap; /* track valid blocks for this extent */ 238}; 239 240/* A mount 241 * 242 * The volume to which an extent refers 243 */ 244struct BC_cache_mount { 245 lck_rw_t cm_lock; /* lock guards instance variables, extents have their own locks */ 246 uuid_t cm_uuid; /* this mount's uuid */ 247 int cm_nextents; /* the number of extents that reference this mount */ 248 struct BC_cache_extent **cm_pextents; /* a list of pointers to extents which reference 249 this mount, sorted by block address */ 250 int cm_state; /* this mount's state */ 251#define CM_SETUP (0) /* fields above are valid. Not mounted yet */ 252#define CM_READY (1) /* every field is valid. Mounted and ready to go */ 253#define CM_ABORTED (2) /* mount failed for some reason */ 254 255 /* fields below filled in when we detect that the volume has mounted */ 256 dev_t cm_dev; /* the device for this mount */ 257 struct buf *cm_bp; /* struct buf allocated in the reader thread for this device */ 258 u_int64_t cm_throttle_mask;/* the throttle mask to use in throttle APIs for this mount */ 259 u_int64_t cm_blocksize; /* keep 64-bit for promotion */ 260 u_int64_t cm_devsize; /* range checking */ 261 u_int64_t cm_maxread; /* largest read the dev will support */ 262 struct BC_cache_disk* cm_disk; /* the mount's physical disk */ 263}; 264 265/* A physical disk 266 * 267 * The disk may contain several mounts 268 * and will have one readahead thread 269 */ 270struct BC_cache_disk { 271 lck_mtx_t cd_lock; 272 u_int64_t cd_disk_id; /* The throttle mask, used as an identifier */ 273 int cd_disk_num; /* used for stat batching */ 274 int cd_flags; /* flags */ 275#define CD_HAS_THREAD (1 << 0) /* there is a readahead thread in progress for this disk */ 276#define CD_ISSUE_LOWPRI (1 << 1) /* the readahead thread is reading in low-priority extents */ 277#define CD_IS_SSD (1 << 2) /* this is a solid state disk */ 278 int cd_nmounts; /* The number of mounts that reference this disk */ 279 int cd_batch; /* What playback batch this disk is on */ 280}; 281 282/************************************ 283 * Structures for history recording * 284 ************************************/ 285 286/* see BC_history_entry and BC_history_mount in BootCache.h */ 287 288/* 289 * Wrapper around BC_history_mount so we 290 * can keep track of the device 291 */ 292struct BC_history_mount_device { 293 u_int64_t hmd_disk_id; 294 struct BC_history_mount_device* hmd_next; 295 uint32_t hmd_is_ssd; 296 dev_t hmd_dev; 297 int hmd_blocksize; 298 struct BC_history_mount hmd_mount; 299}; 300 301/* 302 * A history entry list cluster. 
303 */ 304struct BC_history_entry_cluster { 305 struct BC_history_entry_cluster *hec_link; 306 int hec_nentries; /* number of entries in list */ 307 struct BC_history_entry hec_data[0]; 308}; 309 310#ifdef STATIC_HISTORY 311# define BC_HISTORY_ALLOC 128000 312# define BC_HISTORY_ENTRIES \ 313(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) / \ 314sizeof(struct BC_history_entry)) - 1) 315# define BC_HISTORY_MAXCLUSTERS 1 316#else 317# define BC_HISTORY_ALLOC 16000 318# define BC_HISTORY_ENTRIES \ 319(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) / \ 320sizeof(struct BC_history_entry)) - 1) 321# define BC_HISTORY_MAXCLUSTERS ((BC_MAXENTRIES / BC_HISTORY_ENTRIES) + 1) 322#endif 323 324 325/***************************************** 326 * To be able to debug live read history * 327 *****************************************/ 328#ifdef READ_HISTORY_BUFFER 329struct BC_read_history_entry { 330 struct BC_cache_extent *rh_extent; /* extent this read is for */ 331 u_int64_t rh_blkno; /* offset of actual read */ 332 u_int64_t rh_bcount; /* length in bytes */ 333 int rh_result; 334# define BC_RHISTORY_OK 0 335# define BC_RHISTORY_FAIL 1 336}; 337#endif 338 339 340/* 341 * The cache control structure. 342 */ 343struct BC_cache_control { 344 lck_grp_t *c_lckgrp; 345 346 /* the device we are covering */ 347 struct bdevsw *c_devsw; 348 349 /* saved switch table routines */ 350 strategy_fcn_t *c_strategy; 351 open_close_fcn_t *c_close; 352 lck_mtx_t c_handlers_lock; 353 354 u_int64_t c_cachesize; /* total number of bytes in cache */ 355 356 u_int64_t c_root_disk_id; /* the throttle mask of the root disk, used as an id for the physical device */ 357 /* This needs to be updated to handle multiple physical volumes backing a mount */ 358 359 /* 360 * The mounts we're tracking 361 */ 362 int c_nmounts; /* number of mounts in the array below */ 363 struct BC_cache_mount *c_mounts; /* the array of mounts the cache extents refer to */ 364 365 /* 366 * Extents, in optimal read order. 367 * 368 * Each extent tracks the pages it owns in the buffer. 369 * It is assumed that PAGE_SIZE is a multiple of cm_blocksize for each mount. 370 * 371 * Each extent refers to one of the mounts, above, by index into c_mounts 372 */ 373 int c_nextentlists; /* number of extent lists we have */ 374 int *c_nextents; /* number of extents in a given extent list */ 375 struct BC_cache_extent **c_extentlists; /* an array of extent lists in the cache */ 376 377 int c_batch_count; /* number of extent batches */ 378 vnode_t c_vp; 379 uint32_t c_vid; 380 /* rw lock protects the above fields through c_nmounts, but not their contents. 381 * Shared for general use of the fields, 382 * Exclusive for cache initialization or cleanup. 383 * Each mount/extent has its own lock for modifying its contents. 
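 *
 * A typical lookup therefore nests the locks cache -> mount -> extent.
 * As a sketch of the pattern BC_cache_contains_block below uses (error
 * handling omitted):
 *
 *	LOCK_CACHE_R();
 *	cm_idx = BC_find_cache_mount(dev);
 *	cm = BC_cache->c_mounts + cm_idx;
 *	LOCK_MOUNT_R(cm);
 *	pce = BC_find_extent(cm, offset, length, 0, NULL);
 *	UNLOCK_MOUNT_R(cm);
 *	... inspect *pce while still holding the cache lock ...
 *	UNLOCK_CACHE_R();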
384 */ 385 lck_rw_t c_cache_lock; 386 387 388 /* history list, in reverse order */ 389 int c_history_num_clusters; 390 struct BC_history_entry_cluster *c_history; 391 struct BC_history_mount_device *c_history_mounts; 392 uint64_t c_history_size; 393 /* lock protects the above history fields and their contents */ 394 lck_rw_t c_history_lock; 395 396 /* fields below are accessed atomically */ 397 398 /* flags */ 399 UInt32 c_flags; 400#define BC_FLAG_SETUP (1 << 0) /* cache setup properly during mount */ 401#define BC_FLAG_CACHEACTIVE (1 << 1) /* cache is active, owns memory */ 402#define BC_FLAG_HISTORYACTIVE (1 << 2) /* currently recording history */ 403#define BC_FLAG_HTRUNCATED (1 << 3) /* history list truncated */ 404#define BC_FLAG_SHUTDOWN (1 << 4) /* readahead shut down */ 405 SInt32 c_strategycalls; /* count of busy strategy calls in the cache */ 406 407 uint32_t c_readahead_throttles_cutthroughs; 408 409 uint32_t c_num_ios_since_last_hit; /* The number of cache misses since the last hit */ 410 411 uint32_t c_num_reader_threads; /* The number of reader threads active */ 412 lck_mtx_t c_reader_threads_lock; /* protects c_num_reader_threads */ 413 414 /* statistics */ 415 int c_take_detailed_stats; 416 struct BC_statistics c_stats; 417 418 /* time-keeping */ 419 struct timeval c_loadtime; 420 struct timeval c_cache_starttime; 421 struct timeval c_history_starttime; 422 423#ifdef READ_HISTORY_BUFFER 424 int c_rhistory_idx; 425 struct BC_read_history_entry *c_rhistory; 426#endif 427}; 428 429/* 430 * The number of blocks per page; assumes that a page 431 * will always be >= the size of a disk block. 432 */ 433#define CB_PAGEBLOCKS(cm) (PAGE_SIZE / (cm)->cm_blocksize) 434 435/* 436 * Convert block offset to page offset and vice versa. 437 */ 438#define CB_BLOCK_TO_PAGE(cm, block) ((block) / CB_PAGEBLOCKS(cm)) 439#define CB_PAGE_TO_BLOCK(cm, page) ((page) * CB_PAGEBLOCKS(cm)) 440#define CB_BLOCK_TO_BYTE(cm, block) ((block) * (cm)->cm_blocksize) 441#define HB_BLOCK_TO_BYTE(hmd, block) ((block) * (hmd)->hmd_blocksize) 442#define CB_BYTE_TO_BLOCK(cm, byte) ((byte) / (cm)->cm_blocksize) 443 444/* 445 * The size of an addressable field in the block map. 446 * This reflects the u_int32_t type for the blockmap type. 447 */ 448#define CB_MAPFIELDBITS 32 449#define CB_MAPFIELDBYTES 4 450 451/* 452 * The index into an array of addressable fields, and the bit within 453 * the addressable field corresponding to an index in the map. 454 */ 455#define CB_MAPIDX(x) ((x) / CB_MAPFIELDBITS) 456#define CB_MAPBIT(x) (1 << ((x) % CB_MAPFIELDBITS)) 457 458/* 459 * Blockmap management. Must be called with extent lock held. 460 */ 461#define CB_BLOCK_PRESENT(ce, block) \ 462((ce)->ce_blockmap != NULL && ((ce)->ce_blockmap[CB_MAPIDX(block)] & CB_MAPBIT(block))) 463#define CB_MARK_BLOCK_PRESENT(ce, block) \ 464((ce)->ce_blockmap[CB_MAPIDX(block)] |= CB_MAPBIT(block)) 465#define CB_MARK_BLOCK_VACANT(ce, block) \ 466((ce)->ce_blockmap[CB_MAPIDX(block)] &= ~CB_MAPBIT(block)) 467 468/* 469 * Determine whether a given page is vacant (all blocks are gone). 470 * This takes advantage of the expectation that a page's blocks 471 * will never cross the boundary of a field in the blockmap. 
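 *
 * As a worked example (assuming hypothetical 4096-byte pages over 512-byte
 * blocks, so CB_PAGEBLOCKS(cm) == 8): block 21 lives in map word
 * CB_MAPIDX(21) == 0 under bit CB_MAPBIT(21) == (1 << 21), and belongs to
 * page 21 / 8 == 2. That page covers blocks 16..23, so the macro below
 * tests the word against ((1 << 8) - 1) << 16 == 0x00ff0000; the page can
 * be freed once all eight of those bits are clear.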
472 */ 473#define CB_PAGE_VACANT(cm, ce, page) \ 474(!((ce)->ce_blockmap[CB_MAPIDX(CB_PAGE_TO_BLOCK(cm, page))] & \ 475(((1 << CB_PAGEBLOCKS(cm)) - 1) << (CB_PAGE_TO_BLOCK(cm, page) % \ 476CB_MAPFIELDBITS)))) 477 478/* Maximum size of the boot cache */ 479#define BC_MAX_SIZE (max_mem / 2) 480 481/* 482 * Sanity macro, frees and zeroes a pointer if it is not already zeroed. 483 */ 484#define _FREE_ZERO(p, c) \ 485do { \ 486if (p != NULL) { \ 487_FREE(p, c); \ 488p = NULL; \ 489} \ 490} while (0); 491 492/* 493 * Synchronization macros 494 */ 495#define LOCK_HISTORY_R() lck_rw_lock_shared(&BC_cache->c_history_lock) 496#define UNLOCK_HISTORY_R() lck_rw_unlock_shared(&BC_cache->c_history_lock) 497#define LOCK_HISTORY_W() lck_rw_lock_exclusive(&BC_cache->c_history_lock) 498#define UNLOCK_HISTORY_W() lck_rw_unlock_exclusive(&BC_cache->c_history_lock) 499 500#define LOCK_EXTENT(e) lck_mtx_lock(&(e)->ce_lock) 501#define UNLOCK_EXTENT(e) lck_mtx_unlock(&(e)->ce_lock) 502 503/* mount locks should only be held while also holding the cache lock */ 504#define LOCK_MOUNT_R(m) lck_rw_lock_shared(&(m)->cm_lock) 505#define UNLOCK_MOUNT_R(m) lck_rw_unlock_shared(&(m)->cm_lock) 506#define LOCK_MOUNT_W(m) lck_rw_lock_exclusive(&(m)->cm_lock) 507#define UNLOCK_MOUNT_W(m) lck_rw_unlock_exclusive(&(m)->cm_lock) 508#define LOCK_MOUNT_W_TO_R(m) lck_rw_lock_exclusive_to_shared(&(m)->cm_lock) 509#define LOCK_MOUNT_R_TO_W(m) lck_rw_lock_shared_to_exclusive(&(m)->cm_lock) 510 511#define LOCK_DISK(d) lck_mtx_lock(&(d)->cd_lock) 512#define UNLOCK_DISK(d) lck_mtx_unlock(&(d)->cd_lock) 513 514#define LOCK_READERS() lck_mtx_lock(&BC_cache->c_reader_threads_lock) 515#define UNLOCK_READERS() lck_mtx_unlock(&BC_cache->c_reader_threads_lock) 516 517#define LOCK_HANDLERS() lck_mtx_lock(&BC_cache->c_handlers_lock) 518#define UNLOCK_HANDLERS() lck_mtx_unlock(&BC_cache->c_handlers_lock) 519 520#define LOCK_CACHE_R() lck_rw_lock_shared(&BC_cache->c_cache_lock) 521#define UNLOCK_CACHE_R() lck_rw_unlock_shared(&BC_cache->c_cache_lock) 522#define LOCK_CACHE_W() lck_rw_lock_exclusive(&BC_cache->c_cache_lock) 523#define UNLOCK_CACHE_W() lck_rw_unlock_exclusive(&BC_cache->c_cache_lock) 524#define LOCK_CACHE_TRY_W() lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_EXCLUSIVE) 525#define LOCK_CACHE_TRY_R() lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_SHARED) 526#define LOCK_CACHE_W_TO_R() lck_rw_lock_exclusive_to_shared(&BC_cache->c_cache_lock) 527#define LOCK_CACHE_R_TO_W() lck_rw_lock_shared_to_exclusive(&BC_cache->c_cache_lock) 528 529#define BC_ADD_STAT(stat, inc) OSAddAtomic((inc), ((SInt32 *)&BC_cache->c_stats.ss_##stat)) 530 531/* 532 * Only one instance of the cache is supported. 533 */ 534static struct BC_cache_control BC_cache_softc; 535static struct BC_cache_control *BC_cache = &BC_cache_softc; 536 537/* 538 * We support preloaded cache data by checking for a Mach-O 539 * segment/section which can be populated with a playlist by the 540 * linker. This is particularly useful in the CD mastering process, 541 * as reading the playlist from CD is very slow and rebuilding the 542 * kext at mastering time is not practical. 
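 *
 * The lookup this implies is sketched below; the segment/section names are
 * illustrative only, the real ones are whatever the mastering tools emit:
 *
 *	int size;
 *	struct BC_playlist_header *phdr =
 *	    getsectdatafromheader((struct mach_header *)kmod_info.address,
 *	        "__DATA", "playlist", &size);
 *	if (phdr != NULL && size >= sizeof(*phdr)) {
 *		BC_preloaded_playlist = phdr;
 *		BC_preloaded_playlist_size = size;
 *	}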
543 */ 544extern kmod_info_t kmod_info; 545static struct BC_playlist_header *BC_preloaded_playlist; 546static int BC_preloaded_playlist_size; 547 548static int BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length); 549static int BC_find_cache_mount(dev_t dev); 550static struct BC_cache_extent** BC_find_extent(struct BC_cache_mount* cm, u_int64_t offset, u_int64_t length, int contained, int* pnum_extents); 551static int BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length); 552static int BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length); 553static int BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk); 554static void BC_reader_thread(void *param0, wait_result_t param1); 555static int BC_strategy(struct buf *bp); 556static int BC_close(dev_t dev, int flags, int devtype, struct proc *p); 557static int BC_terminate_readahead(void); 558static int BC_terminate_cache(void); 559static int BC_terminate_history(void); 560static void BC_terminate_cache_async(void); 561static void BC_check_handlers(void); 562static void BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t from, 563 uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength); 564static int BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd); 565static void BC_teardown_disk(struct BC_cache_disk *cd); 566static void BC_mount_available(struct BC_cache_mount *cm); 567static int BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm); 568static int BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context); 569static void BC_teardown_mount(struct BC_cache_mount *ce); 570static int BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx); 571static void BC_teardown_extent(struct BC_cache_extent *ce); 572static int BC_copyin_playlist(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries); 573static int BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history); 574static void BC_update_mounts(void); 575static void BC_timeout_history(void *); 576static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex); 577static int BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev); 578static int BC_size_history_mounts(void); 579static int BC_size_history_entries(void); 580static int BC_copyout_history_mounts(user_addr_t uptr); 581static int BC_copyout_history_entries(user_addr_t uptr); 582static void BC_discard_history(void); 583static void BC_auto_start(void); 584static int BC_setup(void); 585 586static int fill_in_bc_cache_mounts(mount_t mount, void* arg); 587static int check_for_new_mount_itr(mount_t mount, void* arg); 588static int extents_status(struct BC_cache_extent **pce, int nextents) ; 589static void wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime) ; 590 591#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET 592static int compare_cache_extents(const void *vfirst, const void *vsecond); 593//FIXME: qsort not available in kexts 594extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *)); 595#endif 596 597/* 598 * Sysctl interface. 
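 *
 * User space drives the cache by writing a BC_command (see BootCache.h) to
 * this node. A rough sketch of the kind of call BootCacheControl makes to
 * kick off playback (opcode name illustrative, error handling omitted):
 *
 *	struct BC_command bc;
 *	memset(&bc, 0, sizeof(bc));
 *	bc.bc_magic  = BC_MAGIC;
 *	bc.bc_opcode = BC_OP_START;
 *	if (sysctlbyname("kern.BootCache", NULL, NULL, &bc, sizeof(bc)) != 0)
 *		err(1, "BC_OP_START failed");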
599 */ 600static int BC_sysctl SYSCTL_HANDLER_ARGS; 601 602SYSCTL_PROC(_kern, OID_AUTO, BootCache, CTLFLAG_RW | CTLFLAG_LOCKED, 603 0, 0, BC_sysctl, 604 "S,BC_command", 605 "BootCache command interface"); 606 607/* 608 * Netboot exports this hook so that we can determine 609 * whether we were netbooted. 610 */ 611extern int netboot_root(void); 612 613/* 614 * bsd_init exports this hook so that we can register for notification 615 * once the root filesystem is mounted. 616 */ 617extern void (* mountroot_post_hook)(void); 618extern void (* unmountroot_pre_hook)(void); 619 620/* 621 * Mach VM-specific functions. 622 */ 623static int BC_alloc_pagebuffer(); 624static void BC_free_pagebuffer(); 625static void BC_free_page(struct BC_cache_extent *ce, int page); 626 627/* Functions not prototyped elsewhere */ 628int ubc_range_op(vnode_t, off_t, off_t, int, int *); /* in sys/ubc_internal.h */ 629 630/* 631 * Mach-o functions. 632 */ 633#define MACH_KERNEL 1 634#include <mach-o/loader.h> 635extern void *getsectdatafromheader(struct mach_header *, char *, char *, int *); 636 637/* 638 * Set or clear a flag atomically and return the previous value of that bit. 639 */ 640static inline UInt32 641BC_set_flag(UInt32 flag) 642{ 643 return flag & OSBitOrAtomic(flag, &BC_cache->c_flags); 644} 645 646static inline UInt32 647BC_clear_flag(UInt32 flag) 648{ 649 return flag & OSBitAndAtomic(~(flag), &BC_cache->c_flags); 650} 651 652/* 653 * Return a user-readable string for a given uuid 654 * 655 * Returns a pointer to a static buffer, which is 656 * racy, so this should only be used for debugging purposes 657 */ 658static inline const char* uuid_string(uuid_t uuid) 659{ 660 /* Racy, but used for debug output so who cares */ 661 static uuid_string_t uuidString; 662 uuid_unparse(uuid, uuidString); 663 return (char*)uuidString; 664} 665 666/* 667 * Test whether a given byte range on disk intersects with an extent's range. 668 * This function assumes the ranges are within the same mount. 
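 *
 * For instance (hypothetical byte values): an extent with
 * ce_diskoffset == 0x1000 and ce_length == 0x2000 covers [0x1000, 0x3000),
 * so a read at offset 0x2c00 of length 0x1000 intersects it
 * (0x2c00 < 0x3000 and 0x3c00 > 0x1000), while a read starting at 0x3000
 * does not.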
 */
static inline int
BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
{
	if ((offset < (ce->ce_diskoffset + ce->ce_length)) &&
		((offset + length) > ce->ce_diskoffset))
		return 1;
	return 0;
}

/*
 * Find one of our cache mounts with the given device, if it has been seen this boot
 *
 * Called with the cache read lock held
 *
 * Returns the index of the mount, or -1 if it is not found
 */
static int
BC_find_cache_mount(dev_t dev)
{
	int i;

	if (dev == nulldev())
		return(-1);

	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (BC_cache->c_mounts == NULL))
		return(-1);


	for (i = 0; i < BC_cache->c_nmounts; i++) {
		if (BC_cache->c_mounts[i].cm_state == CM_READY) {
			if (BC_cache->c_mounts[i].cm_dev == dev) {
				return i;
			}
		}
	}

	return(-1);
}

/* The new throttling implementation needs to be able to check
 * whether an IO is in the boot cache before issuing the IO
 */
static int BC_cache_contains_block(dev_t device, u_int64_t blkno) {
	struct BC_cache_mount * cm = NULL;
	struct BC_cache_extent **pce, *ce;

	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
		return 0;
	}

	LOCK_CACHE_R();

	int cm_idx = BC_find_cache_mount(device);
	if (cm_idx == -1) {
		UNLOCK_CACHE_R();
		return 0;
	}

	cm = BC_cache->c_mounts + cm_idx;

	LOCK_MOUNT_R(cm);

	if (cm->cm_state != CM_READY) {
		UNLOCK_MOUNT_R(cm);
		UNLOCK_CACHE_R();
		return 0;
	}

	u_int64_t disk_offset = CB_BLOCK_TO_BYTE(cm, blkno);
	u_int64_t length = CB_BLOCK_TO_BYTE(cm, 1) /* one block */;

	pce = BC_find_extent(cm, disk_offset, length, 0, NULL);

	if (pce == NULL || *pce == NULL) {
		UNLOCK_MOUNT_R(cm);
		UNLOCK_CACHE_R();
		return 0;
	}

	ce = *pce;

	UNLOCK_MOUNT_R(cm);

	if (ce->ce_flags & CE_ABORTED) {
		UNLOCK_CACHE_R();
		return 0;
	}

	if (ce->ce_flags & CE_LOWPRI &&
		!(ce->ce_flags & CE_IODONE)) {
		UNLOCK_CACHE_R();
		return 0;
	}

	UNLOCK_CACHE_R();
	return 1;
}

/*
 * Find a cache extent containing or intersecting the offset/length.
 *
 * We need to be able to find both containing and intersecting
 * extents. Rather than handle each separately, we take advantage
 * of the fact that any containing extent will also be an intersecting
 * extent.
 *
 * Thus, a search for a containing extent should be a search for an
 * intersecting extent followed by a test for containment.
 *
 * Containment may include multiple extents. The returned extent will
 * be the first of these extents (lowest offset), but the next extents
 * in the array may also be required to contain the requested range.
 * Upon success, *pnum_extents contains the number of extents required
 * to contain the range.
 *
 * Returns a pointer to the matching entry in the mount's extent list
 * or NULL if no match is found.
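 *
 * For example (hypothetical, adjacent extents): given extents covering
 * [0x0000, 0x1000) and [0x1000, 0x3000), a containment search for offset
 * 0x0800 and length 0x1000 returns a pointer to the first extent and sets
 * *pnum_extents to 2, because both extents are needed to cover the range.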
 *
 * Called with the cache mount read lock held
 */
static struct BC_cache_extent **
BC_find_extent(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length, int contained, int *pnum_extents)
{
	struct BC_cache_extent **p, **base, **last, **next, **prev;	/* pointers into the mount's extent index list */
	size_t lim;

	/* invariants */
	assert(length > 0);

	base = cm->cm_pextents;
	last = base + cm->cm_nextents - 1;

	/*
	 * If the cache is inactive, exit.
	 */
	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (base == NULL)) {
		return(NULL);
	}

	/*
	 * Range-check against the cache first.
	 *
	 * Note that we check for possible intersection, rather than
	 * containment.
	 */
	if (((offset + length) <= (*base)->ce_diskoffset) ||
		(offset >= ((*last)->ce_diskoffset + (*last)->ce_length))) {
		return(NULL);
	}

	/*
	 * Perform a binary search for an extent intersecting
	 * the offset and length.
	 *
	 * This code is based on bsearch(3), and the description following
	 * is taken directly from it.
	 *
	 * The code below is a bit sneaky.  After a comparison fails, we
	 * divide the work in half by moving either left or right. If lim
	 * is odd, moving left simply involves halving lim: e.g., when lim
	 * is 5 we look at item 2, so we change lim to 2 so that we will
	 * look at items 0 & 1.  If lim is even, the same applies.  If lim
	 * is odd, moving right again involves halving lim, this time moving
	 * the base up one item past p: e.g., when lim is 5 we change base
	 * to item 3 and make lim 2 so that we will look at items 3 and 4.
	 * If lim is even, however, we have to shrink it by one before
	 * halving: e.g., when lim is 4, we still looked at item 2, so we
	 * have to make lim 3, then halve, obtaining 1, so that we will only
	 * look at item 3.
	 */
	for (lim = cm->cm_nextents; lim != 0; lim >>= 1) {
		p = base + (lim >> 1);

		if (BC_check_intersection((*p), offset, length)) {

			if (contained == 0)
				return(p);

			if (pnum_extents) {
				*pnum_extents = 1;
			}

			/*
			 * We are checking for containment, verify that here.
854 * Extents may be adjacent, so include neighboring 855 * extents in the containment check 856 */ 857 858 /* Check the high bound */ 859 if ((offset + length) > ((*p)->ce_diskoffset + (*p)->ce_length)) { 860 861 /* check higher extents to see if they contain the high end of this range */ 862 prev = p; 863 for (next = prev + 1; 864 next <= last; 865 next++) { 866 867 /* extents should never overlap */ 868 assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset); 869 870 if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) { 871 /* extents are not adjacent */ 872 if (BC_cache->c_take_detailed_stats) { 873 xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length)); 874 } 875 return(NULL); 876 } 877 878 if (pnum_extents) { 879 (*pnum_extents)++; 880 } 881 882 if ((offset + length) <= ((*next)->ce_diskoffset + (*next)->ce_length)) { 883 /* we contain the high end of the range, break so we check the low end below */ 884 break; 885 } 886 887 prev = next; 888 } 889 890 if (next > last) { 891 /* ran off the end of the extent list */ 892 if (BC_cache->c_take_detailed_stats) { 893 xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length)); 894 } 895 return (NULL); 896 } 897 } 898 899 /* Check the low bound */ 900 if (offset < (*p)->ce_diskoffset) { 901 902 /* check lower extents to see if they contain the low end of this range */ 903 next = p; 904 for (prev = next - 1; 905 prev >= base; 906 prev--) { 907 908 /* extents should never overlap */ 909 assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset); 910 911 if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) { 912 /* extents are not adjacent */ 913 if (BC_cache->c_take_detailed_stats) { 914 xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset); 915 } 916 return(NULL); 917 } 918 919 if (pnum_extents) { 920 (*pnum_extents)++; 921 } 922 923 if (offset >= (*prev)->ce_diskoffset) { 924 /* we contain the low end of the range (we checked high end above) */ 925 return(prev); 926 } 927 928 next = prev; 929 } 930 931 assert(prev < base); /* only break condition */ 932 933 if (prev < base) { 934 /* ran off the end of the extent list */ 935 if (BC_cache->c_take_detailed_stats) { 936 xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset); 937 } 938 return (NULL); 939 } 940 } 941 942 return(p); 943 } 944 945 if (offset > (*p)->ce_diskoffset) { /* move right */ 946 base = p + 1; 947 lim--; 948 } /* else move left */ 949 } 950 /* found nothing */ 951 return(NULL); 952} 953 954/* 955 * 
 * Discard data from the cache and free pages which are no longer required.
 *
 * This should be locked against anyone testing for !vacant pages.
 *
 * We return the number of bytes we marked unused. If the value is
 * zero, the offset/length did not overlap this extent.
 *
 * Called with extent lock held.
 */
static int
BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
{
	struct BC_cache_mount *cm;
	u_int64_t estart, eend, dstart, dend, nblk, base, page;
	int i, discarded;

	/* invariants */
	assert(length > 0);
#ifdef BC_DEBUG
	lck_mtx_assert(&ce->ce_lock, LCK_MTX_ASSERT_OWNED);
#endif

	/*
	 * Extent has been terminated: blockmap may no longer be valid.
	 */
	if (ce->ce_flags & CE_ABORTED ||
		ce->ce_length == 0)
		return 0;

	cm = BC_cache->c_mounts + ce->ce_mount_idx;

	/*
	 * Constrain offset and length to fit this extent, return immediately if
	 * there is no overlap.
	 */
	dstart = offset;			/* convert to start/end format */
	dend = offset + length;
	assert(dend > dstart);
	estart = ce->ce_diskoffset;
	eend = ce->ce_diskoffset + ce->ce_length;
	assert(eend > estart);

	if (dend <= estart)			/* check for overlap */
		return(0);
	if (eend <= dstart)
		return(0);

	if (dstart < estart)			/* clip to extent */
		dstart = estart;
	if (dend > eend)
		dend = eend;

	/*
	 * Convert length in bytes to a block count.
	 */
	nblk = CB_BYTE_TO_BLOCK(cm, dend - dstart);

	/*
	 * Mark blocks vacant. For each vacated block, check whether the page
	 * holding it can be freed.
	 *
	 * Note that this could be optimised to reduce the number of
	 * page-freeable checks, but the logic would be more complex.
	 */
	base = CB_BYTE_TO_BLOCK(cm, dstart - ce->ce_diskoffset);
	assert(base >= 0);
	discarded = 0;
	assert((base + nblk - 1) < howmany(ce->ce_length, cm->cm_blocksize));
	for (i = 0; i < nblk; i++) {

		/* this is a no-op if the block is already gone */
		if (CB_BLOCK_PRESENT(ce, base + i)) {

			/* mark the block as vacant */
			CB_MARK_BLOCK_VACANT(ce, base + i);
			discarded++;

			page = CB_BLOCK_TO_PAGE(cm, base + i);
			if (CB_PAGE_VACANT(cm, ce, page))
				BC_free_page(ce, (int) page);
		}
	}
	return(discarded * cm->cm_blocksize);
}

/*
 * Blocks in an extent can be invalidated, e.g. by a write, before the
 * reader thread fills the extent. Search for the next range of viable blocks
 * in the extent, starting from offset 'fromoffset'.
 *
 * Values are returned by out parameters:
 *	'nextpage' takes the page containing the start of the next run, or
 *		-1 if no more runs are to be found.
 *	'nextoffset' takes the offset into that page that the run begins.
 *	'nextlength' takes the length in bytes of the range, capped at maxread.
 *
 * In other words, for this blockmap, if
 *	fromoffset = 3 * blocksize,
 *	1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0
 *	*nextpage = 0
 *	*nextoffset = 5 * blocksize
 *	*nextlength = 9 * blocksize
 *
 * Called with the extent lock held
 */
static void
BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t fromoffset,
					uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength)
{
	int maxblks, i, nextblk = 0;
	int found = 0;

	maxblks = howmany(ce->ce_length, cm->cm_blocksize);
	i = CB_BYTE_TO_BLOCK(cm, fromoffset);
	/* scan for the next range of valid blocks in the extent */
	for (; i < maxblks; i++) {
		if (CB_BLOCK_PRESENT(ce, i)) {
			if (found == 0) {
				nextblk = i;
			}
			found++;
		} else {
			if (found != 0)
				break;
		}
	}

	if (found) {
		/* found a valid range, so convert to (page, offset, length) */
		*nextpage = CB_BLOCK_TO_PAGE(cm, nextblk);
		*nextoffset = CB_BLOCK_TO_BYTE(cm, nextblk) % PAGE_SIZE;
		*nextlength = MIN(CB_BLOCK_TO_BYTE(cm, found), cm->cm_maxread);
	} else {
		*nextpage = -1;
	}

	return;
}

/*
 * Test for the presence of a range of blocks in the cache.
 *
 * Called with the extent lock held.
 */
static int
BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk)
{
	int blk;

	assert(base >= 0);
	assert((base + nblk) <= howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize));

	for (blk = 0; blk < nblk; blk++) {

		if (!CB_BLOCK_PRESENT(ce, base + blk)) {
			return(0);
		}
	}
	return(1);
}

/*
 * Readahead thread.
 */
static void
BC_reader_thread(void *param0, wait_result_t param1)
{
	struct BC_cache_disk *cd = NULL;
	struct BC_cache_mount *cm = NULL;
	struct BC_cache_extent *ce = NULL;
	int count, num_mounts, ce_idx, cel_idx;
	upl_t upl;
	kern_return_t kret;
	struct timeval batchstart, batchend;
	int error = 0;
	int issuing_lowpriority_extents = 0;

	/* We can access the BC_cache_disk passed in freely without locks
	 * since disks are only freed up in terminate cache after it has
	 * guaranteed that no reader threads are running
	 */
	cd = (struct BC_cache_disk*) param0;

	num_mounts = cd->cd_nmounts;

	if (BC_cache->c_flags & BC_FLAG_SHUTDOWN)
		goto out;

	for (;;) {

		if (issuing_lowpriority_extents) {
			debug("disk %d starting low priority batch", cd->cd_disk_num);
		} else {
			debug("disk %d starting batch %d", cd->cd_disk_num, cd->cd_batch);
			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
								  DBG_FUNC_START,
								  cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
		}
		microtime(&batchstart);

		LOCK_CACHE_R();
		for (cel_idx = 0;
			 cel_idx < BC_cache->c_nextentlists;
			 cel_idx++) {

			/* iterate over extents to populate */
			for (ce_idx = 0;
				 ce_idx < BC_cache->c_nextents[cel_idx];
				 ce_idx++) {

				ce = BC_cache->c_extentlists[cel_idx] + ce_idx;

				cm = BC_cache->c_mounts + ce->ce_mount_idx;

				if (cm->cm_state != CM_READY) {
					continue;
				}

				if (cm->cm_disk != cd){
					continue;
				}

				/* Only read extents marked for this batch or earlier.
				 * All low-priority IO is read in one batch, however.
1179 */ 1180 if (ce->ce_batch > cd->cd_batch && !(cd->cd_flags & CD_ISSUE_LOWPRI)) { 1181 continue; 1182 } 1183 1184 /* Check if already done or aborted */ 1185 if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) { 1186 continue; 1187 } 1188 1189 /* Check if this extent is low priority and we're not yet reading in low-priority extents */ 1190 if (!(cd->cd_flags & CD_ISSUE_LOWPRI) && 1191 (ce->ce_flags & CE_LOWPRI)) { 1192 continue; 1193 } 1194 1195 /* Shouldn't happen */ 1196 if((cd->cd_flags & CD_ISSUE_LOWPRI) && 1197 !(ce->ce_flags & CE_LOWPRI)) { 1198 debug("Saw a regular extent while issuing throttled IO"); 1199 } 1200 1201 LOCK_EXTENT(ce); 1202 1203 /* Check if again with the lock */ 1204 if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) { 1205 UNLOCK_EXTENT(ce); 1206 continue; 1207 } 1208 1209 /* loop reading to fill this extent */ 1210 uint32_t fromoffset = 0; 1211 1212 LOCK_MOUNT_R(cm); 1213 1214 if (cm->cm_state != CM_READY) { 1215 /* the mount was aborted. Whoever aborted it handles aborting the extents as well */ 1216 UNLOCK_MOUNT_R(cm); 1217 UNLOCK_EXTENT(ce); 1218 continue; 1219 } 1220 1221 for (;;) { 1222 uint32_t nextpage; 1223 uint32_t nextoffset; 1224 uint32_t nextlength; 1225 1226 /* requested shutdown */ 1227 if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) { 1228 UNLOCK_MOUNT_R(cm); 1229 goto out; 1230 } 1231 1232 1233 /* 1234 * Find the next set of blocks that haven't been invalidated 1235 * for this extent. 1236 */ 1237 BC_next_valid_range(cm, ce, fromoffset, &nextpage, &nextoffset, &nextlength); 1238 /* no more blocks to be read */ 1239 if (nextpage == -1) 1240 break; 1241 1242 /* set up fromoffset to read the next segment of the extent */ 1243 fromoffset = (nextpage * PAGE_SIZE) + nextoffset + nextlength; 1244 1245 kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid); 1246 if (kret != KERN_SUCCESS) { 1247 UNLOCK_MOUNT_R(cm); 1248 message("reader thread: vnode_getwithvid failed - %d\n", kret); 1249 goto out; 1250 } 1251 1252 kret = ubc_create_upl(BC_cache->c_vp, 1253 ce->ce_cacheoffset + (nextpage * PAGE_SIZE), 1254 (int) roundup(nextoffset + nextlength, PAGE_SIZE), 1255 &upl, 1256 NULL, 1257 UPL_SET_LITE|UPL_FILE_IO); 1258 if (kret != KERN_SUCCESS) { 1259 UNLOCK_MOUNT_R(cm); 1260 message("ubc_create_upl returned %d\n", kret); 1261 (void) vnode_put(BC_cache->c_vp); 1262 goto out; 1263 } 1264 1265 /* cm_bp is only accessed from the reader thread, 1266 * and there's only one reader thread for a given mount, 1267 * so we don't need the write lock to modify it */ 1268 1269 /* set buf to fill the requested subset of the upl */ 1270 buf_setblkno(cm->cm_bp, CB_BYTE_TO_BLOCK(cm, ce->ce_diskoffset + nextoffset) + CB_PAGE_TO_BLOCK(cm, nextpage)); 1271 buf_setcount(cm->cm_bp, (unsigned int) nextlength); 1272 buf_setupl(cm->cm_bp, upl, (unsigned int) nextoffset); 1273 buf_setresid(cm->cm_bp, buf_count(cm->cm_bp)); /* ask for residual indication */ 1274 buf_reset(cm->cm_bp, B_READ); 1275 1276 /* If this is regular readahead, make sure any throttled IO are throttled by the readahead thread rdar://8592635 1277 * If this is low-priority readahead, make sure this thread will be throttled appropriately later rdar://8734858 1278 */ 1279 throttle_info_handle_t throttle_info_handle; 1280 if ((error = throttle_info_ref_by_mask(cm->cm_throttle_mask, &throttle_info_handle)) == 0) { 1281 throttle_info_update_by_mask(throttle_info_handle, 0x0); 1282 throttle_info_rel_by_mask(throttle_info_handle); 1283 } else { 1284 debug("Unable to update throttle info for mount %s: %d", uuid_string(cm->cm_uuid), error); 
1285 } 1286 1287 BC_ADD_STAT(initiated_reads, 1); 1288 if (cd->cd_disk_num < STAT_DISKMAX) { 1289 if (ce->ce_flags & CE_LOWPRI) { 1290 BC_ADD_STAT(disk_initiated_reads_lowpri[cd->cd_disk_num], 1); 1291 } else { 1292 BC_ADD_STAT(disk_initiated_reads[cd->cd_disk_num], 1); 1293 } 1294 } 1295 1296 /* give the buf to the underlying strategy routine */ 1297 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, DKIO_READ) | DBG_FUNC_NONE, 1298 (uintptr_t) buf_kernel_addrperm_addr(cm->cm_bp), buf_device(cm->cm_bp), (long)buf_blkno(cm->cm_bp), buf_count(cm->cm_bp), 0); 1299 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_PLAYBACK_IO) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(cm->cm_bp), 0, 0, 0, 0); 1300 1301 BC_cache->c_strategy(cm->cm_bp); 1302 1303 /* wait for the bio to complete */ 1304 buf_biowait(cm->cm_bp); 1305 1306 kret = ubc_upl_commit(upl); 1307 (void) vnode_put(BC_cache->c_vp); 1308 1309 /* 1310 * If the read or commit returned an error, invalidate the bytes 1311 * covered by the read (on a residual, we could avoid invalidating 1312 * bytes that are claimed to be read as a minor optimisation, but we do 1313 * not expect errors as a matter of course). 1314 */ 1315 if (kret != KERN_SUCCESS || buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) { 1316 debug("read error: extent %lu %lu/%lu " 1317 "(error buf %ld/%ld flags %08x resid %d)", 1318 (unsigned long) ce_idx, 1319 (unsigned long)ce->ce_diskoffset, (unsigned long)ce->ce_length, 1320 (long)buf_blkno(cm->cm_bp), (long)buf_count(cm->cm_bp), 1321 buf_flags(cm->cm_bp), buf_resid(cm->cm_bp)); 1322 1323 count = BC_discard_bytes(ce, ce->ce_diskoffset + nextoffset, nextlength); 1324 debug("read error: discarded %d bytes", count); 1325 BC_ADD_STAT(read_errors, 1); 1326 BC_ADD_STAT(read_errors_bytes, count); 1327 } 1328#ifdef READ_HISTORY_BUFFER 1329 if (BC_cache->c_rhistory && BC_cache->c_rhistory_idx < READ_HISTORY_BUFFER) { 1330 BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_extent = ce; 1331 BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_blkno = buf_blkno(cm->cm_bp); 1332 BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_bcount = buf_count(cm->cm_bp); 1333 if (buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) { 1334 BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_FAIL; 1335 BC_cache->c_rhistory_idx++; 1336 } else { 1337# ifdef READ_HISTORY_ALL 1338 BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_OK; 1339 BC_cache->c_rhistory_idx++; 1340# endif 1341 } 1342 } 1343#endif 1344 1345 /* I'd like to throttle the thread here, but I'm holding the extent and cache locks, so it's throttled below */ 1346 } 1347 1348 /* update stats */ 1349 BC_ADD_STAT(read_blocks, (u_int) CB_BYTE_TO_BLOCK(cm, ce->ce_length)); 1350 BC_ADD_STAT(read_bytes, (u_int) ce->ce_length); 1351 if (ce->ce_flags & CE_LOWPRI) { 1352 BC_ADD_STAT(read_bytes_lowpri, (u_int) ce->ce_length); 1353 if (cd->cd_disk_num < STAT_DISKMAX) { 1354 BC_ADD_STAT(batch_bytes_lowpri[cd->cd_disk_num], (u_int) ce->ce_length); 1355 } 1356 } else { 1357 if (cd->cd_disk_num < STAT_DISKMAX) { 1358 BC_ADD_STAT(batch_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], (u_int) ce->ce_length); 1359 if (ce->ce_batch < cd->cd_batch) { 1360 BC_ADD_STAT(batch_late_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], ce->ce_length); 1361 } 1362 } 1363 } 1364 if (ce->ce_flags & CE_SHARED) { 1365 BC_ADD_STAT(shared_bytes, (u_int) ce->ce_length); 1366 } 1367 UNLOCK_MOUNT_R(cm); 1368 1369 /* 1370 * Wake up anyone wanting this extent, as it is now ready for 
1371 * use. 1372 */ 1373 ce->ce_flags |= CE_IODONE; 1374 UNLOCK_EXTENT(ce); 1375 wakeup(ce); 1376 1377 /* Let anyone waiting for the write lock to take hold */ 1378 UNLOCK_CACHE_R(); 1379 1380 /* Take this opportunity of locklessness to throttle the reader thread, if necessary */ 1381 if (issuing_lowpriority_extents) { 1382 throttle_lowpri_io((ce->ce_flags & CE_LOWPRI) ? 1 : 0); 1383 } 1384 1385 LOCK_CACHE_R(); 1386 if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) { 1387 /* New playlist arrived, go back to issuing regular priority extents */ 1388 debug("New extents, interrupting low priority readahead"); 1389 break; 1390 } 1391 1392 if (cel_idx >= BC_cache->c_nextentlists || 1393 ce_idx >= BC_cache->c_nextents[cel_idx]) { 1394 /* cache shrunk while we weren't looking. 1395 * Not a hard error, but we're done with this batch at least 1396 */ 1397 break; 1398 } 1399 } 1400 ce = NULL; 1401 if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) { 1402 /* New playlist arrived, go back to issuing regular priority extents */ 1403 break; 1404 } 1405 } 1406 UNLOCK_CACHE_R(); 1407 1408 microtime(&batchend); 1409 timersub(&batchend, &batchstart, &batchend); 1410 if (cd->cd_disk_num < STAT_DISKMAX) { 1411 if (issuing_lowpriority_extents) { 1412 BC_cache->c_stats.ss_batch_time_lowpri[cd->cd_disk_num] += 1413 (u_int) batchend.tv_sec * 1000 + 1414 (u_int) batchend.tv_usec / 1000; 1415 } else { 1416 /* Measure times for the first several batches separately */ 1417 BC_cache->c_stats.ss_batch_time[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)] += 1418 (u_int) batchend.tv_sec * 1000 + 1419 (u_int) batchend.tv_usec / 1000; 1420 } 1421 } 1422 if (issuing_lowpriority_extents) { 1423 // debug("disk %d low priority batch done", cd->cd_disk_num); 1424 } else { 1425 // debug("disk %d batch %d done", cd->cd_disk_num, cd->cd_batch); 1426 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) | 1427 DBG_FUNC_END, 1428 cd->cd_batch, cd->cd_disk_num, 0, 0, 0); 1429 } 1430 1431 1432 LOCK_DISK(cd); 1433 if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) { 1434 /* New playlist arrived, go back to issuing regular extents */ 1435 issuing_lowpriority_extents = 0; 1436 throttle_set_thread_io_policy(IOPOL_NORMAL); 1437 cd->cd_batch = 0; 1438 } else if (cd->cd_batch + 1 < BC_cache->c_batch_count && !issuing_lowpriority_extents) { 1439 cd->cd_batch++; 1440 } else { 1441 if (num_mounts == cd->cd_nmounts) { 1442 /* no new mounts appeared */ 1443 if ( !(cd->cd_flags & CD_ISSUE_LOWPRI)) { 1444 /* go through the playlist again, issuing any low-priority extents we have */ 1445 cd->cd_flags |= CD_ISSUE_LOWPRI; 1446 throttle_set_thread_io_policy(IOPOL_THROTTLE); 1447 issuing_lowpriority_extents = 1; 1448 } else { 1449 /* we're done */ 1450 cd->cd_flags &= (~CD_HAS_THREAD); 1451 cd->cd_batch = 0; 1452 UNLOCK_DISK(cd); 1453 goto out; 1454 } 1455 } else { 1456 /* New mounts appeared. Run through again */ 1457 cd->cd_batch = 0; 1458 debug("disk %d more mounts detected (%d total)", cd->cd_disk_num, cd->cd_nmounts); 1459 } 1460 } 1461 num_mounts = cd->cd_nmounts; 1462 UNLOCK_DISK(cd); 1463 } 1464 1465out: 1466 /* 1467 * If ce != NULL we have bailed out, but we cannot free memory beyond 1468 * the bailout point as it may have been read in a previous batch and 1469 * there may be a sleeping strategy routine assuming that it will 1470 * remain valid. 
 * Since we are typically killed immediately before a
 * complete cache termination, this does not represent a significant
 * problem.
 *
 * However, to prevent readers blocking waiting for readahead to
 * complete for extents that will never be read, we mark all the
 * extents we have given up on as aborted.
 */
	if (ce != NULL) {
		UNLOCK_EXTENT(ce);

		/* the only errors that get us here are global, so mark the cache as shut down */
		LOCK_DISK(cd);
		BC_set_flag(BC_FLAG_SHUTDOWN);
		cd->cd_flags &= (~CD_HAS_THREAD);
		UNLOCK_DISK(cd);

		int tmpcel_idx, tmpce_idx;
		for (tmpcel_idx = 0;
			 tmpcel_idx < BC_cache->c_nextentlists;
			 tmpcel_idx++) {
			for (tmpce_idx = 0;
				 tmpce_idx < BC_cache->c_nextents[tmpcel_idx];
				 tmpce_idx++) {
				ce = BC_cache->c_extentlists[tmpcel_idx] + tmpce_idx;
				LOCK_EXTENT(ce);
				/* abort any extent that hasn't been satisfied */
				if (!(ce->ce_flags & CE_IODONE)) {
					ce->ce_flags |= CE_ABORTED;
					/* wake up anyone asleep on this extent */
					UNLOCK_EXTENT(ce);
					wakeup(ce);
				} else {
					UNLOCK_EXTENT(ce);
				}
			}
		}
		UNLOCK_CACHE_R();
	}


	debug("disk %d done", cd->cd_disk_num);

	/* wake up someone that might be waiting for this reader thread to exit */
	wakeup(&cd->cd_flags);

	LOCK_READERS();
	/* wake up someone that might be waiting for all the reader threads to exit */
	BC_cache->c_num_reader_threads--;
	if (BC_cache->c_num_reader_threads == 0) {
		wakeup(&BC_cache->c_num_reader_threads);
	}
	UNLOCK_READERS();

}

/*
 * Handle an incoming close request.
 */
static int
BC_close(dev_t dev, int flags, int devtype, struct proc *p)
{
	struct BC_cache_mount *cm;
	int i, cm_idx;

	/* Mark our cache_mount for this device as invalid */

	LOCK_CACHE_R();
	if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {

		cm_idx = BC_find_cache_mount(dev);
		if (-1 != cm_idx) {
			cm = BC_cache->c_mounts + cm_idx;
			debug("Tearing down closing mount %s with dev 0x%x", uuid_string(cm->cm_uuid), dev);
			LOCK_MOUNT_R(cm);
			if (cm->cm_state != CM_ABORTED) {
				/*
				 * Mark all extents as aborted. This will notify any sleepers in the
				 * strategy routine that the extent won't have data for them.
				 *
				 * Note that we shouldn't try to take an extent lock while holding
				 * the exclusive mount lock or we risk deadlock with the reader thread
				 */
				for (i = 0; i < cm->cm_nextents; i++) {
					LOCK_EXTENT(cm->cm_pextents[i]);
					BC_teardown_extent(cm->cm_pextents[i]);
					UNLOCK_EXTENT(cm->cm_pextents[i]);
					wakeup(cm->cm_pextents[i]);
				}
				if (! LOCK_MOUNT_R_TO_W(cm)) {
					/* If there is someone waiting for the exclusive lock on this mount,
					 * we want to grab it before they can so they don't waste time.
					 * If we fail to take the exclusive lock it means someone
					 * else is also trying to upgrade the mount lock. We don't keep any
					 * state, so we can simply take the write lock again.
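					 * (When the upgrade fails, lck_rw_lock_shared_to_exclusive drops the
					 * shared lock entirely rather than keeping it, which is why it is safe
					 * to block in LOCK_MOUNT_W below and why cm_state must be rechecked
					 * once the exclusive lock is obtained.)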
1565 */ 1566 LOCK_MOUNT_W(cm); 1567 if (cm->cm_state == CM_ABORTED) { 1568 // Mount is aborted, no more work necessary 1569 UNLOCK_MOUNT_W(cm); 1570 goto out; 1571 } 1572 } 1573 BC_teardown_mount(cm); 1574 UNLOCK_MOUNT_W(cm); 1575 } else { 1576 UNLOCK_MOUNT_R(cm); 1577 } 1578 } 1579 } 1580 1581out: 1582 UNLOCK_CACHE_R(); 1583 1584 return BC_cache->c_close(dev, flags, devtype, p); 1585} 1586 1587/* 1588 * Returns CE_ABORTED if any of the extents in the array are aborted 1589 * Returns CE_LOWPRI if any of the extents in the array are low priority and not done 1590 * Returns CE_IODONE if all the extents in the array are done 1591 * Returns 0 otherwise 1592 */ 1593static int extents_status(struct BC_cache_extent **pce, int nextents) { 1594 int ce_idx; 1595 int ret = CE_IODONE; 1596 1597 for (ce_idx = 0; 1598 ce_idx < nextents; 1599 ce_idx++) { 1600 1601 if (pce[ce_idx]->ce_flags & CE_ABORTED) { 1602 return CE_ABORTED; 1603 } 1604 1605 if (! (pce[ce_idx]->ce_flags & CE_IODONE)) { 1606 if (pce[ce_idx]->ce_flags & CE_LOWPRI) { 1607 ret = CE_LOWPRI; 1608 } else { 1609 if (ret == CE_IODONE) { 1610 ret = 0; 1611 } 1612 } 1613 } 1614 } 1615 return ret; 1616} 1617 1618/* 1619 * Waits for the given extent to be filled or aborted 1620 * 1621 * Should only be called with the extent lock. 1622 * NO OTHER LOCKS SHOULD BE HELD INCLUDING THE CACHE LOCK 1623 * 1624 * We are guaranteed that the extent won't disappear when we release 1625 * the cache lock by the cleanup code waiting for BC_cache->c_strategycalls 1626 * to reach 0, and we're counted in there. 1627 */ 1628static void wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime) { 1629 struct timeval time; 1630 struct timespec timeout; 1631 microtime(&time); 1632 timersub(&time, 1633 &starttime, 1634 &time); 1635 if (time.tv_sec < BC_wait_for_readahead) { 1636 timeout.tv_sec = BC_wait_for_readahead - time.tv_sec; 1637 if (time.tv_usec == 0) { 1638 timeout.tv_nsec = 0; 1639 } else { 1640 timeout.tv_sec--; 1641 timeout.tv_nsec = NSEC_PER_USEC * (USEC_PER_SEC - time.tv_usec); 1642 } 1643 msleep(ce, &ce->ce_lock, PRIBIO, "BC_strategy", &timeout); 1644 } 1645} 1646 1647/* 1648 * Handle an incoming IO request. 1649 */ 1650static int 1651BC_strategy(struct buf *bp) 1652{ 1653 struct BC_cache_extent *ce = NULL, **pce, **pcontaining_extents = NULL; 1654 int num_extents; 1655 uio_t uio = NULL; 1656 int nbytes, resid, discards = 0; 1657 struct timeval blocked_start_time, blocked_end_time; 1658 daddr64_t blkno; 1659 caddr_t buf_map_p, buf_extent_p; 1660 off_t disk_offset; 1661 kern_return_t kret; 1662 int cm_idx = -1, ce_idx; 1663 dev_t dev; 1664 int32_t bufflags; 1665 int during_cache = 0, during_io = 0, take_detailed_stats = 0, cache_hit = 0; 1666 int is_root_disk, did_block, status; 1667 int is_shared = 0, is_swap = 0; 1668 int should_throttle = 0; 1669 int is_ssd = 0; 1670 int is_stolen = 0; 1671 int unfilled = 0; 1672 vnode_t vp = NULL; 1673 int pid = 0; 1674 int dont_cache = 0; 1675 int throttle_tier = 0; 1676 bufattr_t bap = NULL; 1677 1678 assert(bp != NULL); 1679 1680 blkno = buf_blkno(bp); 1681 nbytes = buf_count(bp); 1682 bufflags = buf_flags(bp); 1683 bap = buf_attr(bp); 1684 dev = buf_device(bp); 1685 vp = buf_vnode(bp); 1686 1687 if (bap) { 1688 throttle_tier = bufattr_throttled(bap); 1689 } 1690 1691 /* 1692 * If the buf doesn't have a vnode for some reason, pretend 1693 * we never saw the request at all. 
1694 */ 1695 if (dev == nulldev()) { 1696 BC_cache->c_strategy(bp); 1697 BC_ADD_STAT(strategy_unknown, 1); 1698 BC_ADD_STAT(strategy_unknown_bytes, nbytes); 1699 return 0; 1700 } 1701 1702 if (vp && vnode_isswap(vp)) { 1703 is_swap = 1; 1704 goto bypass; 1705 } 1706 1707 if (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE) { 1708 pid = proc_selfpid(); 1709 1710 if (vp) { 1711 is_shared = vnode_isdyldsharedcache(vp); 1712 } 1713 } 1714 1715 if (vp && (vnode_israge(vp))) { 1716 dont_cache = 1; 1717 } 1718 1719 /* 1720 * If the cache is not active, bypass the request. We may 1721 * not be able to fill it, but we still want to record it. 1722 */ 1723 if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) { 1724 goto bypass; 1725 } 1726 1727 /* 1728 * In order to prevent the cache cleanup code from racing with us 1729 * when we sleep, we track the number of strategy calls in flight. 1730 */ 1731 OSAddAtomic(1, &BC_cache->c_strategycalls); 1732 LOCK_CACHE_R(); 1733 during_cache = 1; /* we're included in the in_flight count */ 1734 1735 /* Get cache mount asap for use in case we bypass */ 1736 cm_idx = BC_find_cache_mount(dev); 1737 1738 if (cm_idx != -1) { 1739 disk_offset = CB_BLOCK_TO_BYTE(BC_cache->c_mounts + cm_idx, blkno); 1740 } 1741 1742 if (BC_cache->c_take_detailed_stats) { 1743 take_detailed_stats = 1; 1744 BC_ADD_STAT(strategy_calls, 1); 1745 if (throttle_tier) { 1746 BC_ADD_STAT(strategy_throttled, 1); 1747 } 1748#ifdef BC_DEBUG 1749// if (dont_cache) { 1750// char procname[128]; 1751// proc_selfname(procname, sizeof(procname)); 1752// const char* filename = vp ? vnode_getname(vp) : NULL; 1753// debug("not recording %s%s from app %s for file %s (disk block 0x%llx) which is%s throttled", (vp && vnode_israge(vp)) ? "rapid age " : "", (bufflags & B_READ) ? "read" : "write", procname, filename?:"unknown", blkno, (bufflags & B_THROTTLED_IO) ? 
"" : " not"); 1754// if (filename) { 1755// vnode_putname(filename); 1756// } 1757// } 1758#endif 1759 } 1760 1761 /* 1762 * if we haven't seen this mount before, pass it off 1763 * 1764 * If this is a mount we want to cache, we'll kick it off when 1765 * adding to our history 1766 */ 1767 if (cm_idx == -1) { 1768 /* don't have a mount for this IO */ 1769 if (take_detailed_stats) 1770 BC_ADD_STAT(strategy_noncached_mount, 1); 1771 goto bypass; 1772 } 1773 1774 /* Get the info from the mount we need */ 1775 LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1776 1777 if (BC_cache->c_mounts[cm_idx].cm_disk && 1778 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) && 1779 !(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI)) { 1780 during_io = 1; /* for statistics */ 1781 if (take_detailed_stats) { 1782 BC_ADD_STAT(strategy_duringio, 1); 1783 } 1784 } 1785 1786 if (bufflags & B_ENCRYPTED_IO) { 1787 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1788 dont_cache = 1; 1789 goto bypass; 1790 } 1791 1792 if (BC_cache->c_mounts[cm_idx].cm_state != CM_READY) { 1793 /* the mount has been aborted, treat it like a missing mount */ 1794 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1795 cm_idx = -1; 1796 1797 if (take_detailed_stats) 1798 BC_ADD_STAT(strategy_noncached_mount, 1); 1799 goto bypass; 1800 } 1801 1802 /* if it's not a read, pass it off */ 1803 if ( !(bufflags & B_READ)) { 1804 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1805 if (take_detailed_stats) 1806 BC_ADD_STAT(strategy_nonread, 1); 1807 goto bypass; 1808 } 1809 1810 if ((nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize) != 0) { 1811 debug("IO with non-blocksize multiple bytes (%d %% %lld = %lld)", nbytes, BC_cache->c_mounts[cm_idx].cm_blocksize, (nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize)); 1812 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1813 if (take_detailed_stats) 1814 BC_ADD_STAT(strategy_nonblocksize, 1); 1815 goto bypass; 1816 } 1817 1818 if (take_detailed_stats) { 1819 BC_ADD_STAT(extent_lookups, 1); 1820 BC_ADD_STAT(requested_bytes, nbytes); 1821 if (cm_idx < STAT_MOUNTMAX) { 1822 BC_ADD_STAT(requested_bytes_m[cm_idx], nbytes); 1823 } 1824 } 1825 1826 /* 1827 * Look for a cache extent containing this request. 1828 */ 1829 num_extents = 0; 1830 pce = BC_find_extent(BC_cache->c_mounts + cm_idx, disk_offset, nbytes, 1, &num_extents); 1831 if (pce == NULL) { 1832#ifdef BC_EXTRA_DEBUG 1833 if (take_detailed_stats && !dont_cache) { 1834 char procname[128]; 1835 proc_selfname(procname, sizeof(procname)); 1836 const char* filename = vp ? 
vnode_getname(vp) : NULL; 1837 if (num_extents > 0) { 1838 xdebug("Missed IO from app %s for file %s (disk offset 0x%llx) (intersected, though)", procname, filename?:"unknown", blkno); 1839 } else { 1840 xdebug("Missed IO from app %s for file %s (disk offset 0x%llx)", procname, filename?:"unknown", blkno); 1841 } 1842 if (filename) { 1843 vnode_putname(filename); 1844 } 1845 } 1846#endif 1847 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1848 goto bypass; 1849 } 1850 1851 assert(num_extents > 0); 1852 1853 cache_hit = 1; 1854 1855 if (take_detailed_stats) { 1856 BC_ADD_STAT(extent_hits, 1); 1857 if (during_io) 1858 BC_ADD_STAT(hit_duringio, 1); 1859 if (num_extents > 1) 1860 BC_ADD_STAT(hit_multiple, 1); 1861 } 1862 1863 pcontaining_extents = BC_MALLOC(num_extents * sizeof(*pcontaining_extents), M_TEMP, M_WAITOK | M_ZERO); 1864 if (pcontaining_extents == NULL) { 1865 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1866 if (take_detailed_stats) 1867 BC_ADD_STAT(hit_failure, 1); 1868 goto bypass; 1869 } 1870 memcpy(pcontaining_extents, pce, num_extents * sizeof(*pcontaining_extents)); 1871 1872 is_ssd = (BC_cache->c_mounts[cm_idx].cm_disk && 1873 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD)); 1874 1875 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1876 1877 /* if any extent is aborted, bypass immediately */ 1878 status = extents_status(pcontaining_extents, num_extents); 1879 if (status & CE_ABORTED) { 1880 if (take_detailed_stats) 1881 BC_ADD_STAT(hit_aborted, 1); 1882 goto bypass; 1883 } 1884 1885#ifndef EMULATE_ONLY 1886 1887 did_block = 0; 1888 if (! (status & CE_IODONE)) { 1889 1890 if (is_ssd) { 1891 /* Don't delay IOs on SSD since cut-throughs aren't harmful */ 1892 unfilled = 1; 1893 goto bypass; 1894 } 1895 1896 if (status & CE_LOWPRI) { 1897 /* Don't wait for low priority extents */ 1898 if (take_detailed_stats) 1899 BC_ADD_STAT(strategy_unfilled_lowpri, 1); 1900 cache_hit = 0; 1901 goto bypass; 1902 } 1903 1904 /* 1905 * wait for all the extents to finish 1906 */ 1907 microtime(&blocked_start_time); 1908 for (ce_idx = 0; 1909 ce_idx < num_extents; 1910 ce_idx++) { 1911 1912 ce = pcontaining_extents[ce_idx]; 1913 LOCK_EXTENT(ce); 1914 if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) { 1915 did_block = 1; 1916 UNLOCK_CACHE_R(); 1917 /* 1918 * We need to release the cache lock since we will msleep in wait_for_extent. 1919 * If we don't release the cache read lock and someone wants to add a new playlist, 1920 * they go to grab the cache write lock, and everybody stalls for the msleep timeout. 1921 * 1922 * We are guaranteed that the cache won't disappear during this time since 1923 * the cleanup code waits for all the strategy calls (this function) to complete 1924 * by checking BC_cache->c_strategycalls, which we incremented above. 1925 * 1926 * The cache may have grown behind our backs, however. Mount pointers are not valid, 1927 * but mount indexes still are. Extent pointers are still valid since the extent list 1928 * is not modified (this is a requirement to be able to msleep with the extent's lock). 1929 */ 1930 wait_for_extent(ce, blocked_start_time); 1931 1932 /* Cache lock must be grabbed without holding any locks to avoid deadlock rdar://8626772 */ 1933 UNLOCK_EXTENT(ce); 1934 LOCK_CACHE_R(); 1935 LOCK_EXTENT(ce); 1936 1937 if (! 
(ce->ce_flags & (CE_ABORTED | CE_IODONE))) { 1938 /* strategy call timed out waiting for the extent */ 1939 if (take_detailed_stats) 1940 BC_ADD_STAT(strategy_timedout, 1); 1941#ifdef BC_DEBUG 1942 microtime(&blocked_end_time); 1943 timersub(&blocked_end_time, 1944 &blocked_start_time, 1945 &blocked_end_time); 1946 char procname[128]; 1947 proc_selfname(procname, sizeof(procname)); 1948 const char* filename = vp ? vnode_getname(vp) : NULL; 1949 debug("Strategy routine timed out app %s waiting on file %s (disk offset 0x%llx) after %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000); 1950 if (filename) { 1951 vnode_putname(filename); 1952 } 1953#endif 1954 goto bypass; 1955 } 1956 UNLOCK_EXTENT(ce); 1957 ce = NULL; 1958 1959 /* Check every time we sleep so we can avoid more sleeping if unnecessary */ 1960 status = extents_status(pcontaining_extents, num_extents); 1961 if (status & CE_ABORTED) { 1962 if (take_detailed_stats) 1963 BC_ADD_STAT(hit_aborted, 1); 1964 goto bypass; 1965 } else if (status & CE_IODONE) { 1966 break; 1967 } 1968 } else { 1969 UNLOCK_EXTENT(ce); 1970 ce = NULL; 1971 } 1972 } 1973 if (take_detailed_stats && did_block) { 1974 BC_ADD_STAT(strategy_blocked, 1); 1975 microtime(&blocked_end_time); 1976 timersub(&blocked_end_time, 1977 &blocked_start_time, 1978 &blocked_end_time); 1979 int ms_blocked = (blocked_end_time.tv_sec * 1000) + (blocked_end_time.tv_usec / (1000)); 1980#ifdef BC_DEBUG 1981 char procname[128]; 1982 proc_selfname(procname, sizeof(procname)); 1983 const char* filename = vp ? vnode_getname(vp) : NULL; 1984 debug("Strategy routine blocked app %s waiting on file %s (disk offset 0x%llx) for %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000); 1985 if (filename) { 1986 vnode_putname(filename); 1987 } 1988#endif 1989 BC_ADD_STAT(strategy_time_blocked, ms_blocked); 1990 if (BC_cache->c_stats.ss_strategy_time_longest_blocked < ms_blocked) { 1991 BC_cache->c_stats.ss_strategy_time_longest_blocked = ms_blocked; 1992 } 1993 } 1994 } 1995 1996 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (did_block ? DBG_BC_IO_HIT_STALLED : DBG_BC_IO_HIT)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), 0, 0, 0, 0); 1997 1998 /* 1999 * buf_map will do its best to provide access to this 2000 * buffer via a kernel accessible address 2001 * if it fails, we can still fall through. 2002 */ 2003 if (buf_map(bp, &buf_map_p)) { 2004 /* can't map, let someone else worry about it */ 2005 if (take_detailed_stats) 2006 BC_ADD_STAT(hit_failure, 1); 2007 goto bypass; 2008 } 2009 buf_extent_p = buf_map_p; 2010 2011 kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid); 2012 if (kret != KERN_SUCCESS) { 2013 debug("vnode_getwithvid failed - %d", kret); 2014 if (take_detailed_stats) 2015 BC_ADD_STAT(hit_failure, 1); 2016 buf_unmap(bp); 2017 goto bypass; 2018 } 2019 2020 /* 2021 * fill in bp from the extents 2022 */ 2023 for (ce_idx = 0; 2024 ce_idx < num_extents; 2025 ce_idx++) { 2026 u_int64_t nbytes_thisextent; 2027 u_int64_t diskoffset_thisextent; 2028 off_t cacheoffset_thisextent; 2029 2030 ce = pcontaining_extents[ce_idx]; 2031 LOCK_EXTENT(ce); 2032 2033 /* 2034 * Check that the extent wasn't aborted while we were unlocked. 
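 * (An abort here means BC_teardown_extent has already run, setting CE_ABORTED and freeing the
 * extent's blockmap, so the cached data can no longer be trusted and we fall back to the bypass path.)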
2035 */ 2036 if (ce->ce_flags & CE_ABORTED) { 2037 if (take_detailed_stats) 2038 BC_ADD_STAT(hit_aborted, 1); 2039 vnode_put(BC_cache->c_vp); 2040 buf_unmap(bp); 2041 goto bypass; 2042 } 2043 2044 assert(ce->ce_flags & CE_IODONE); 2045 2046 diskoffset_thisextent = MAX(ce->ce_diskoffset, disk_offset); 2047 nbytes_thisextent = MIN(disk_offset + nbytes, ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent; 2048 cacheoffset_thisextent = ce->ce_cacheoffset + (diskoffset_thisextent - ce->ce_diskoffset); 2049 2050 /* range check against extent */ 2051 assert(diskoffset_thisextent < ce->ce_diskoffset + ce->ce_length); 2052 assert(diskoffset_thisextent >= ce->ce_diskoffset); 2053 assert(nbytes_thisextent <= (ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent); 2054 assert(cacheoffset_thisextent < ce->ce_cacheoffset + ce->ce_length); 2055 assert(cacheoffset_thisextent >= ce->ce_cacheoffset); 2056 assert(nbytes_thisextent <= (ce->ce_cacheoffset + ce->ce_length) - cacheoffset_thisextent); 2057 2058 /* range check against buf_t */ 2059 assert(diskoffset_thisextent < disk_offset + nbytes); 2060 assert(diskoffset_thisextent >= disk_offset); 2061 assert(nbytes_thisextent <= (disk_offset + nbytes) - diskoffset_thisextent); 2062 2063 /* check our buf_map pointer */ 2064 assert(buf_extent_p - buf_map_p == diskoffset_thisextent - disk_offset); /* didn't skip any bytes */ 2065 assert(buf_map_p + nbytes >= buf_extent_p + nbytes_thisextent); /* not overflowing the buffer */ 2066 2067 /* Make sure we're including the entire buffer requested */ 2068 assert(ce_idx > 0 || disk_offset == diskoffset_thisextent); /* include the first byte */ 2069 assert(ce_idx < (num_extents - 1) || disk_offset + nbytes == diskoffset_thisextent + nbytes_thisextent); /* include the last byte */ 2070 2071 /* 2072 * Check that the requested blocks are in the buffer. 
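 * (CE_IODONE only means the readahead I/O completed; individual blocks may since have been
 * discarded by BC_discard_bytes, so the blockmap is rechecked before copying out of the cache.)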
2073 */ 2074 if (!BC_blocks_present(ce, 2075 CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, diskoffset_thisextent - ce->ce_diskoffset), 2076 CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, nbytes_thisextent))) { 2077 if (take_detailed_stats) 2078 BC_ADD_STAT(hit_blkmissing, 1); 2079 vnode_put(BC_cache->c_vp); 2080 buf_unmap(bp); 2081 goto bypass; 2082 } 2083 2084 resid = nbytes_thisextent; 2085 uio = uio_create(1, cacheoffset_thisextent, UIO_SYSSPACE, UIO_READ); 2086 if (uio == NULL) { 2087 debug("couldn't allocate uio"); 2088 if (take_detailed_stats) 2089 BC_ADD_STAT(hit_failure, 1); 2090 vnode_put(BC_cache->c_vp); 2091 buf_unmap(bp); 2092 goto bypass; 2093 } 2094 2095 kret = uio_addiov(uio, CAST_USER_ADDR_T(buf_extent_p), nbytes_thisextent); 2096 if (kret != KERN_SUCCESS) { 2097 debug("couldn't add iov to uio - %d", kret); 2098 if (take_detailed_stats) 2099 BC_ADD_STAT(hit_failure, 1); 2100 vnode_put(BC_cache->c_vp); 2101 buf_unmap(bp); 2102 goto bypass; 2103 } 2104 2105 kret = cluster_copy_ubc_data(BC_cache->c_vp, uio, &resid, 0); 2106 if (kret != KERN_SUCCESS) { 2107 debug("couldn't copy ubc data - %d", kret); 2108 if (take_detailed_stats) 2109 BC_ADD_STAT(hit_failure, 1); 2110 vnode_put(BC_cache->c_vp); 2111 buf_unmap(bp); 2112 goto bypass; 2113 } 2114 2115 if (resid != 0) { 2116 /* blocks have been stolen for pageout or contiguous memory */ 2117 if (take_detailed_stats) 2118 BC_ADD_STAT(hit_stolen, 1); 2119 vnode_put(BC_cache->c_vp); 2120 buf_unmap(bp); 2121 is_stolen = 1; 2122 goto bypass; 2123 } 2124 2125 buf_extent_p += nbytes_thisextent; 2126 2127 /* discard blocks we have touched */ 2128 discards += BC_discard_bytes(ce, disk_offset, nbytes_thisextent); 2129 2130 UNLOCK_EXTENT(ce); 2131 ce = NULL; 2132 2133 /* copy was successful */ 2134 uio_free(uio); 2135 uio = NULL; 2136 } 2137 2138 vnode_put(BC_cache->c_vp); 2139 2140 buf_setresid(bp, 0); 2141 2142 /* buf_unmap will take care of all cases */ 2143 buf_unmap(bp); 2144 2145 /* complete the request */ 2146 buf_biodone(bp); 2147 2148 /* can't dereference bp after a buf_biodone has been issued */ 2149 2150#else //ifndef EMULATE_ONLY 2151 (void) kret; 2152 (void) p; 2153 (void) resid; 2154 2155 /* discard blocks we have touched */ 2156 for (ce_idx = 0; 2157 ce_idx < num_extents; 2158 ce_idx++) { 2159 ce = pcontaining_extents[ce_idx]; 2160 LOCK_EXTENT(ce); 2161 discards += BC_discard_bytes(ce, disk_offset, nbytes); 2162 UNLOCK_EXTENT(ce); 2163 ce = NULL; 2164 } 2165 2166 /* send the IO to the disk, emulate hit in statistics */ 2167 BC_cache->c_strategy(bp); 2168 2169#endif //ifndef EMULATE_ONLY else 2170 2171 if (take_detailed_stats) { 2172 BC_ADD_STAT(hit_bytes, discards); 2173 if (cm_idx < STAT_MOUNTMAX) { 2174 BC_ADD_STAT(hit_bytes_m[cm_idx], discards); 2175 } 2176 if (dont_cache) { 2177 BC_ADD_STAT(strategy_hit_nocache, 1); 2178 BC_ADD_STAT(hit_nocache_bytes, discards); 2179 } 2180 if (is_shared) { 2181 BC_ADD_STAT(hit_shared_bytes, discards); 2182 } 2183 } else { 2184 BC_ADD_STAT(hit_bytes_afterhistory, discards); 2185 } 2186 2187 BC_cache->c_num_ios_since_last_hit = 0; 2188 2189 /* we are not busy anymore */ 2190 OSAddAtomic(-1, &BC_cache->c_strategycalls); 2191 UNLOCK_CACHE_R(); 2192 2193 _FREE_ZERO(pcontaining_extents, M_TEMP); 2194 2195 if (! 
dont_cache) { 2196 /* record successful fulfilment (may block) */ 2197 BC_add_history(blkno, nbytes, pid, 1, 0, 0, is_shared, dev); 2198 } 2199 2200 /* 2201 * spec_strategy wants to know if the read has been 2202 * satisfied by the boot cache in order to avoid 2203 * throttling the thread unnecessarily. spec_strategy 2204 * will check for the special return value 0xcafefeed 2205 * to indicate that the read was satisfied by the 2206 * cache. 2207 */ 2208 2209#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed) 2210 return IO_SATISFIED_BY_CACHE; 2211 2212bypass: 2213 if (ce != NULL) 2214 UNLOCK_EXTENT(ce); 2215 if (uio != NULL) 2216 uio_free(uio); 2217 _FREE_ZERO(pcontaining_extents, M_TEMP); 2218 2219 /* pass the request on */ 2220 BC_cache->c_strategy(bp); 2221 2222 /* 2223 * can't dereference bp after c_strategy has been issued 2224 * or else we race with buf_biodone 2225 */ 2226 void *bp_void = (void *)bp; // for use in ktrace 2227 bp = NULL; 2228 2229 /* not really "bypassed" if the cache is not active */ 2230 if (during_cache) { 2231 if (cm_idx != -1) { 2232 LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 2233 if (BC_cache->c_mounts[cm_idx].cm_state == CM_READY) { 2234 discards += BC_handle_discards(BC_cache->c_mounts + cm_idx, disk_offset, nbytes); 2235 } 2236 2237 /* Check if we should throttle this IO */ 2238 if (BC_cache->c_readahead_throttles_cutthroughs && 2239 !is_swap && 2240 BC_cache->c_mounts[cm_idx].cm_disk && 2241 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) && 2242 !(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI) && 2243 !(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD)) { 2244 /* We're currently issuing readahead for this disk. 2245 * Throttle this IO so we don't cut-through the readahead so much. 2246 */ 2247 should_throttle = 1; 2248 } 2249 2250 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 2251 } 2252 if (take_detailed_stats) { 2253 BC_ADD_STAT(strategy_bypassed, 1); 2254 if (during_io) { 2255 BC_ADD_STAT(strategy_bypass_duringio, 1); 2256 } 2257 if (dont_cache) { 2258 BC_ADD_STAT(strategy_bypass_nocache, 1); 2259 BC_ADD_STAT(bypass_nocache_bytes, nbytes); 2260 if (during_io) { 2261 BC_ADD_STAT(strategy_bypass_duringio_nocache, 1); 2262 } 2263 } 2264 if (cache_hit) { 2265 BC_ADD_STAT(error_discards, discards); 2266 } else if (dont_cache) { 2267 BC_ADD_STAT(bypass_nocache_discards, discards); 2268 } else if (bufflags & B_READ) { 2269 BC_ADD_STAT(read_discards, discards); 2270 } else { 2271 BC_ADD_STAT(write_discards, discards); 2272 } 2273 } else { 2274 BC_ADD_STAT(lost_bytes_afterhistory, discards); 2275 } 2276 } 2277 2278 if (during_cache) { 2279 OSAddAtomic(-1, &BC_cache->c_strategycalls); 2280 UNLOCK_CACHE_R(); 2281 } 2282 2283 if (! is_swap) { 2284 if (! dont_cache) { 2285 is_root_disk = BC_add_history(blkno, nbytes, pid, 0, ((bufflags & B_READ) ? 0 : 1), 0, is_shared, dev); 2286 2287 if (take_detailed_stats && during_io && is_root_disk) { 2288 if (cache_hit) { 2289 if (unfilled) { 2290 BC_ADD_STAT(strategy_bypass_duringio_unfilled, 1); 2291 } else { 2292 BC_ADD_STAT(strategy_bypass_duringio_rootdisk_failure, 1); 2293 } 2294 } else if (bufflags & B_READ) { 2295 BC_ADD_STAT(strategy_bypass_duringio_rootdisk_read, 1); 2296 } else { 2297 BC_ADD_STAT(strategy_bypass_duringio_rootdisk_nonread, 1); 2298 } 2299 } 2300 } 2301 } 2302 2303 /* 2304 * Check to make sure that we're not missing the cache 2305 * too often. If we are, we're no longer providing a 2306 * performance win and the best thing would be to give 2307 * up. 
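 * Concretely, c_num_ios_since_last_hit is incremented for every uncached, unthrottled read and
 * reset to zero on each hit; once it reaches BC_chit_max_num_IOs_since_last_hit the cache is
 * jettisoned asynchronously.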
2308 * 2309 * Don't count throttled IOs since those aren't time 2310 * critical. (We don't want to jettison the cache just 2311 * because spotlight is indexing) 2312 */ 2313 2314 /* if this is a read, and we do have an active cache, and the read isn't throttled */ 2315 if (during_cache) { 2316 (void) is_stolen; 2317 if (is_swap /*|| is_stolen*/) { //rdar://10651288&10658086 seeing stolen pages early during boot 2318 if (is_swap) { 2319 debug("detected %s swap file, jettisoning cache", (bufflags & B_READ) ? "read from" : "write to"); 2320 } else { 2321 debug("detected stolen page, jettisoning cache"); 2322 } 2323 //rdar://9858070 Do this asynchronously to avoid deadlocks 2324 BC_terminate_cache_async(); 2325 } else if ((bufflags & B_READ) && 2326 !(throttle_tier)) { 2327 2328 struct timeval current_time; 2329 if (BC_cache->c_stats.ss_history_num_recordings < 2) { 2330 microtime(¤t_time); 2331 timersub(¤t_time, 2332 &BC_cache->c_loadtime, 2333 ¤t_time); 2334 } 2335 /* Don't start counting misses until we've started login or hit our prelogin timeout */ 2336 if (BC_cache->c_stats.ss_history_num_recordings >= 2 || current_time.tv_sec > BC_chit_prelogin_timeout_seconds) { 2337 2338 /* increase number of IOs since last hit */ 2339 OSAddAtomic(1, &BC_cache->c_num_ios_since_last_hit); 2340 2341 if (BC_cache->c_num_ios_since_last_hit >= BC_chit_max_num_IOs_since_last_hit) { 2342 /* 2343 * The hit rate is poor, shut the cache down. 2344 */ 2345 debug("hit rate below threshold (0 hits in the last %u lookups), jettisoning cache", 2346 BC_cache->c_num_ios_since_last_hit); 2347 //rdar://9858070 Do this asynchronously to avoid deadlocks 2348 BC_terminate_cache_async(); 2349 } 2350 } 2351 } 2352 } 2353 2354 if (is_swap && (! (BC_cache->c_flags & BC_FLAG_SHUTDOWN))) { 2355 /* if we're swapping, stop readahead */ 2356 debug("Detected %s swap file. Terminating readahead", (bufflags & B_READ) ? "read from" : "write to"); 2357 BC_set_flag(BC_FLAG_SHUTDOWN); 2358 } 2359 2360 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (should_throttle ? DBG_BC_IO_MISS_CUT_THROUGH : DBG_BC_IO_MISS)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp_void), 0, 0, 0, 0); 2361 2362 if (should_throttle && throttle_tier < IOPOL_THROTTLE) { 2363 /* 2364 * We need to indicate to spec_strategy that we want to 2365 * throttle this IO to avoid cutting through readahead 2366 * too much. spec_strategy will check for the special 2367 * return value 0xcafebeef to indicate that the IO 2368 * should be throttled. 2369 */ 2370 2371 BC_ADD_STAT(strategy_forced_throttled, 1); 2372 2373#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef) 2374 return IO_SHOULD_BE_THROTTLED; 2375 } 2376 2377 return 0; 2378} 2379 2380/* 2381 * Handle a block range that needs to be discarded. 2382 * 2383 * Called with the cache mount read lock held 2384 * 2385 * Returns the number of bytes discarded 2386 */ 2387static int 2388BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length) 2389{ 2390 struct BC_cache_extent **pce, **p; 2391 int count, total_discards; 2392 2393 total_discards = 0; 2394 2395 /* 2396 * Look for an extent that we overlap. 2397 */ 2398 if ((pce = BC_find_extent(cm, offset, length, 0, NULL)) == NULL) 2399 return 0; /* doesn't affect us */ 2400 2401 /* 2402 * Discard bytes in the matched extent. 2403 */ 2404 LOCK_EXTENT(*pce); 2405 count = BC_discard_bytes((*pce), offset, length); 2406 UNLOCK_EXTENT(*pce); 2407 total_discards += count; 2408 2409 /* 2410 * Scan adjacent extents for possible overlap and discard there as well. 
2411 */ 2412 p = pce - 1; 2413 while (p >= cm->cm_pextents && 2414 BC_check_intersection((*p), offset, length)) { 2415 LOCK_EXTENT(*p); 2416 count = BC_discard_bytes((*p), offset, length); 2417 UNLOCK_EXTENT(*p); 2418 if (count == 0) 2419 break; 2420 total_discards += count; 2421 p--; 2422 } 2423 p = pce + 1; 2424 while (p < (cm->cm_pextents + cm->cm_nextents) && 2425 BC_check_intersection((*p), offset, length)) { 2426 LOCK_EXTENT(*p); 2427 count = BC_discard_bytes((*p), offset, length); 2428 UNLOCK_EXTENT(*p); 2429 if (count == 0) 2430 break; 2431 total_discards += count; 2432 p++; 2433 } 2434 2435 return total_discards; 2436} 2437 2438/* 2439 * Shut down readahead operations. 2440 */ 2441static int 2442BC_terminate_readahead(void) 2443{ 2444 int error; 2445 struct timespec timeout; 2446 timeout.tv_sec = 10; 2447 timeout.tv_nsec = 0; 2448 2449 /* 2450 * Signal the readahead thread to terminate, and wait for 2451 * it to comply. If this takes > 10 seconds, give up. 2452 */ 2453 BC_set_flag(BC_FLAG_SHUTDOWN); 2454 2455 /* 2456 * If readahead is still in progress, we have to shut it down 2457 * cleanly. This is an expensive process, but since readahead 2458 * will normally complete long before the reads it tries to serve 2459 * complete, it will typically only be invoked when the playlist 2460 * is out of synch and the cache hitrate falls below the acceptable 2461 * threshold. 2462 * 2463 * Note that if readahead is aborted, the reader thread will mark 2464 * the aborted extents and wake up any strategy callers waiting 2465 * on them, so we don't have to worry about them here. 2466 */ 2467 LOCK_READERS(); 2468 while (BC_cache->c_num_reader_threads > 0) { 2469 debug("terminating active readahead"); 2470 2471 error = msleep(&BC_cache->c_num_reader_threads, &BC_cache->c_reader_threads_lock, PRIBIO, "BC_terminate_readahead", &timeout); 2472 if (error == EWOULDBLOCK) { 2473 UNLOCK_READERS(); 2474 2475 message("timed out waiting for I/O to stop"); 2476 if (BC_cache->c_num_reader_threads == 0) { 2477 debug("but I/O has stopped!"); 2478 } 2479#ifdef BC_DEBUG 2480 Debugger("I/O Kit wedged on us"); 2481#endif 2482 /* 2483 * It might be nice to free all the pages that 2484 * aren't actually referenced by the outstanding 2485 * region, since otherwise we may be camped on a 2486 * very large amount of physical memory. 2487 * 2488 * Ideally though, this will never happen. 2489 */ 2490 return(EBUSY); /* really EWEDGED */ 2491 } 2492 } 2493 UNLOCK_READERS(); 2494 2495 return(0); 2496} 2497 2498static void 2499BC_terminate_cache_thread(void *param0, wait_result_t param1) 2500{ 2501 BC_terminate_cache(); 2502} 2503 2504/* 2505 * Start up an auxilliary thread to stop the cache so we avoid potential deadlocks 2506 */ 2507static void 2508BC_terminate_cache_async(void) 2509{ 2510 if (! (BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) { 2511 return; 2512 } 2513 2514 int error; 2515 thread_t rthread; 2516 2517 debug("Kicking off thread to terminate cache"); 2518 if ((error = kernel_thread_start(BC_terminate_cache_thread, NULL, &rthread)) == KERN_SUCCESS) { 2519 thread_deallocate(rthread); 2520 } else { 2521 message("Unable to start thread to terminate cache"); 2522 } 2523} 2524 2525/* 2526 * Terminate the cache. 2527 * 2528 * This prevents any further requests from being satisfied from the cache 2529 * and releases all the resources owned by it. 
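 * Teardown order: stop readahead, clear BC_FLAG_CACHEACTIVE and restore the device handlers,
 * abort every extent (waking any strategy calls sleeping on them), tear down the mounts, wait for
 * in-flight strategy calls to drain via c_strategycalls, then upgrade to the cache write lock to
 * free the pagebuffer, extent lists and mount array.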
2530 * 2531 * Must be called with no locks held 2532 */ 2533static int 2534BC_terminate_cache(void) 2535{ 2536 int retry, cm_idx, j, ce_idx, cel_idx; 2537 2538 BC_terminate_readahead(); 2539 2540 /* can't shut down if readahead is still active */ 2541 if (BC_cache->c_num_reader_threads > 0) { 2542 debug("cannot terminate cache while readahead is in progress"); 2543 return(EBUSY); 2544 } 2545 2546 LOCK_CACHE_R(); 2547 2548 LOCK_HANDLERS(); 2549 if (!BC_clear_flag(BC_FLAG_CACHEACTIVE)) { 2550 /* can't shut down if we're not active */ 2551 debug("cannot terminate cache when not active"); 2552 UNLOCK_HANDLERS(); 2553 UNLOCK_CACHE_R(); 2554 return(ENXIO); 2555 } 2556 2557 bootcache_contains_block = NULL; 2558 2559 debug("terminating cache..."); 2560 2561 /* if we're no longer recording history also, disconnect our strategy routine */ 2562 BC_check_handlers(); 2563 UNLOCK_HANDLERS(); 2564 2565 /* 2566 * Mark all extents as FREED. This will notify any sleepers in the 2567 * strategy routine that the extent won't have data for them. 2568 */ 2569 for (cel_idx = 0; 2570 cel_idx < BC_cache->c_nextentlists; 2571 cel_idx++) { 2572 for (ce_idx = 0; 2573 ce_idx < BC_cache->c_nextents[cel_idx]; 2574 ce_idx++) { 2575 struct BC_cache_extent *ce = BC_cache->c_extentlists[cel_idx] + ce_idx; 2576 LOCK_EXTENT(ce); 2577 /* 2578 * Track unused bytes 2579 */ 2580 if (ce->ce_blockmap != NULL && BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize != 0) { 2581 for (j = 0; j < howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize); j++) { 2582 if (CB_BLOCK_PRESENT(ce, j)) 2583 BC_ADD_STAT(spurious_bytes, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize); 2584 } 2585 } 2586 2587 BC_teardown_extent(ce); 2588 UNLOCK_EXTENT(ce); 2589 wakeup(ce); 2590 } 2591 } 2592 2593 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 2594 struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx; 2595 LOCK_MOUNT_W(cm); 2596 BC_teardown_mount(cm); 2597 UNLOCK_MOUNT_W(cm); 2598 } 2599 2600 2601 /* 2602 * It is possible that one or more callers are asleep in the 2603 * strategy routine (eg. if readahead was terminated early, 2604 * or if we are called off the timeout). 2605 * Check the count of callers in the strategy code, and sleep 2606 * until there are none left (or we time out here). Note that 2607 * by clearing BC_FLAG_CACHEACTIVE above we prevent any new 2608 * strategy callers from touching the cache, so the count 2609 * must eventually drain to zero. 2610 */ 2611 retry = 0; 2612 while (BC_cache->c_strategycalls > 0) { 2613 tsleep(BC_cache, PRIBIO, "BC_terminate_cache", 10); 2614 if (retry++ > 50) { 2615 message("could not terminate cache, timed out with %d caller%s in BC_strategy", 2616 (int) BC_cache->c_strategycalls, 2617 BC_cache->c_strategycalls == 1 ? "" : "s"); 2618 UNLOCK_CACHE_R(); 2619 return(EBUSY); /* again really EWEDGED */ 2620 } 2621 } 2622 2623 if (! LOCK_CACHE_R_TO_W()) { 2624 /* We shouldn't get here. This is the only LOCK_CACHE_R_TO_W call, 2625 * so this implies someone is terminating the cache in parallel with us, 2626 * but we check for exclusivity by clearing BC_FLAG_CACHEACTIVE. 
2627 */ 2628 message("Unable to upgrade cache lock to free resources"); 2629 return ENXIO; 2630 } 2631 2632 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 2633 struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx; 2634 if (cm->cm_disk != NULL) { 2635 for (j = cm_idx + 1; j < BC_cache->c_nmounts; j++) { 2636 if (BC_cache->c_mounts[j].cm_disk == cm->cm_disk) { 2637 BC_cache->c_mounts[j].cm_disk = NULL; 2638 } 2639 } 2640 BC_teardown_disk(cm->cm_disk); 2641 lck_mtx_destroy(&cm->cm_disk->cd_lock, BC_cache->c_lckgrp); 2642 _FREE_ZERO(cm->cm_disk, M_TEMP); 2643 } 2644 } 2645 2646 BC_free_pagebuffer(); 2647 BC_cache->c_cachesize = 0; 2648 2649 /* free memory held by extents and mounts */ 2650 for (cel_idx = 0; 2651 cel_idx < BC_cache->c_nextentlists; 2652 cel_idx++) { 2653 for (ce_idx = 0; 2654 ce_idx < BC_cache->c_nextents[cel_idx]; 2655 ce_idx++) { 2656 lck_mtx_destroy(&BC_cache->c_extentlists[cel_idx][ce_idx].ce_lock, BC_cache->c_lckgrp); 2657 } 2658 _FREE_ZERO(BC_cache->c_extentlists[cel_idx], M_TEMP); 2659 } 2660 _FREE_ZERO(BC_cache->c_extentlists, M_TEMP); 2661 _FREE_ZERO(BC_cache->c_nextents, M_TEMP); 2662 BC_cache->c_nextentlists = 0; 2663 2664 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 2665 lck_rw_destroy(&BC_cache->c_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp); 2666 } 2667 _FREE_ZERO(BC_cache->c_mounts, M_TEMP); 2668 BC_cache->c_nmounts = 0; 2669 2670 UNLOCK_CACHE_W(); 2671 2672 /* record stop time */ 2673 struct timeval endtime; 2674 microtime(&endtime); 2675 timersub(&endtime, 2676 &BC_cache->c_cache_starttime, 2677 &endtime); 2678 BC_cache->c_stats.ss_cache_time += (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000; 2679 return(0); 2680} 2681 2682/* 2683 * Terminate history recording. 2684 * 2685 * This stops us recording any further history events. 2686 */ 2687static int 2688BC_terminate_history(void) 2689{ 2690 struct BC_history_mount_device *hmd; 2691 LOCK_HANDLERS(); 2692 if (!BC_clear_flag(BC_FLAG_HISTORYACTIVE)) { 2693 /* can't shut down if we're not active */ 2694 debug("cannot terminate history when not active"); 2695 UNLOCK_HANDLERS(); 2696 return(ENXIO); 2697 } 2698 2699 debug("terminating history collection..."); 2700 2701 /* if the cache is no longer active also, disconnect our strategy routine */ 2702 BC_check_handlers(); 2703 UNLOCK_HANDLERS(); 2704 2705 /* 2706 * Kill the timeout handler; we don't want it going off 2707 * now. 2708 */ 2709 untimeout(BC_timeout_history, NULL); 2710 2711 if (BC_cache->c_take_detailed_stats) { 2712 BC_cache->c_stats.ss_history_mount_no_uuid = 0; 2713 for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next) 2714 if (uuid_is_null(hmd->hmd_mount.hm_uuid)) 2715 BC_ADD_STAT(history_mount_no_uuid, 1); 2716 } 2717 2718 /* record stop time */ 2719 if (BC_cache->c_take_detailed_stats) { 2720 struct timeval endtime; 2721 /* record stop time */ 2722 microtime(&endtime); 2723 timersub(&endtime, 2724 &BC_cache->c_history_starttime, 2725 &endtime); 2726 BC_ADD_STAT(history_time, (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000); 2727 } 2728 2729 BC_cache->c_take_detailed_stats = 0; 2730 2731 return(0); 2732} 2733 2734/* 2735 * Check our strategy handlers and make sure they are set/cleared depending on our current state. 2736 * 2737 * Should be called after changing BC_FLAG_CACHEACTIVE or BC_FLAG_HISTORYACTIVE. 2738 * Called with the handlers lock held. 
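 * While either flag is set, the devsw strategy and close entries are pointed at BC_strategy and
 * BC_close; once both flags are clear, the original routines saved in c_strategy/c_close are restored.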
2739 */ 2740static void 2741BC_check_handlers(void) 2742{ 2743 if (BC_cache->c_devsw == NULL || 2744 BC_cache->c_strategy == NULL || 2745 BC_cache->c_close == NULL) { 2746 debug("Cannot check device handlers: cache not setup properly"); 2747 return; 2748 } 2749 2750 /* if the cache or history recording is active, make sure we've captured the appropriate device handler routines */ 2751 if ((BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || 2752 (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) { 2753 2754 if (BC_cache->c_devsw->d_strategy != (strategy_fcn_t*) BC_strategy || 2755 BC_cache->c_devsw->d_close != BC_close) { 2756 2757 debug("connecting handlers..."); 2758 2759 /* connect the strategy and close routines */ 2760 BC_cache->c_devsw->d_strategy = (strategy_fcn_t*) BC_strategy; 2761 BC_cache->c_devsw->d_close = BC_close; 2762 } 2763 } else { 2764 if (BC_cache->c_devsw->d_strategy != BC_cache->c_strategy || 2765 BC_cache->c_devsw->d_close != BC_cache->c_close) { 2766 2767 debug("disconnecting handlers..."); 2768 2769 /* disconnect the strategy and close routines */ 2770 BC_cache->c_devsw->d_strategy = BC_cache->c_strategy; 2771 BC_cache->c_devsw->d_close = BC_cache->c_close; 2772 } 2773 } 2774} 2775 2776/* 2777 * Setup a cache extent with the parameters given by the playlist entry 2778 * 2779 * Returns 0 on success 2780 */ 2781static int 2782BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx) 2783{ 2784 lck_mtx_init(&ce->ce_lock, BC_cache->c_lckgrp, 2785 LCK_ATTR_NULL); 2786 ce->ce_diskoffset = pe->pe_offset; 2787 ce->ce_mount_idx = cache_mount_idx; 2788 ce->ce_length = pe->pe_length; 2789#ifdef IGNORE_BATCH 2790 ce->ce_batch = 0; 2791#else 2792 ce->ce_batch = pe->pe_batch; 2793#endif 2794 ce->ce_batch += batch_adjustment; 2795 if (ce->ce_batch > BC_MAXBATCHES) { 2796 ce->ce_batch = BC_MAXBATCHES; /* cap batch number */ 2797 } 2798 ce->ce_cacheoffset = 0; 2799 ce->ce_blockmap = NULL; 2800 ce->ce_flags = 0; 2801 if (pe->pe_flags & BC_PE_LOWPRIORITY) { 2802 ce->ce_flags |= CE_LOWPRI; 2803 } 2804 if (pe->pe_flags & BC_PE_SHARED) { 2805 ce->ce_flags |= CE_SHARED; 2806 } 2807 2808 /* track highest batch number for this playlist */ 2809 if (ce->ce_batch >= BC_cache->c_batch_count) { 2810 BC_cache->c_batch_count = ce->ce_batch + 1; 2811 // debug("Largest batch is now %d from extent with disk offset %llu", BC_cache->c_batch_count, ce->ce_diskoffset); 2812 } 2813 2814 return 0; 2815} 2816 2817/* 2818 * The blocksize is initialised from the first playlist read, the statistics 2819 * structure, or it can be pre-set by the caller. Once set, only playlists with 2820 * matching sizes can be read/written. 
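 * The BLOCK_ROUNDDOWN/BLOCK_ROUNDUP macros below align byte values to the mount's blocksize;
 * for example, with cm_blocksize == 4096, BLOCK_ROUNDDOWN(cm, 6000) evaluates to 4096 and
 * BLOCK_ROUNDUP(cm, 6000) to 8192.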
2821 */ 2822#define BLOCK_ROUNDDOWN(cm, x) (((x) / (cm)->cm_blocksize) * (cm)->cm_blocksize) 2823#define BLOCK_ROUNDUP(cm, x) ((((x) + ((cm)->cm_blocksize - 1)) / (cm)->cm_blocksize) * (cm)->cm_blocksize) 2824/* 2825 * optional for power-of-two blocksize roundup: 2826 * (((x) + ((cm)->cm_blocksize - 1)) & (~((cm)->cm_blocksize - 1))) 2827 */ 2828 2829/* 2830 * Fill in a cache extent now that its mount has been filled in 2831 * 2832 * Called with the extent lock held 2833 * 2834 * Returns 0 on success 2835 */ 2836static int 2837BC_fill_in_extent(struct BC_cache_extent *ce) 2838{ 2839 int numblocks, roundsize, i; 2840 u_int64_t end; 2841 2842 if (ce->ce_flags & CE_ABORTED || 2843 ce->ce_length == 0) { 2844 return 1; 2845 } 2846 2847 struct BC_cache_mount *cm = BC_cache->c_mounts + ce->ce_mount_idx; 2848 2849 int blocksize = cm->cm_blocksize; 2850 2851 if (0 == blocksize) { 2852 return 1; 2853 } 2854 2855 roundsize = roundup(ce->ce_length, PAGE_SIZE); 2856 2857 /* make sure we're on block boundaries */ 2858 end = ce->ce_diskoffset + roundsize; 2859 ce->ce_diskoffset = BLOCK_ROUNDDOWN(cm, ce->ce_diskoffset); 2860 ce->ce_length = BLOCK_ROUNDUP(cm, end) - ce->ce_diskoffset; 2861 2862 /* make sure we didn't grow our pagebuffer size since is's already been allocated */ 2863 if (roundup(ce->ce_length, PAGE_SIZE) > roundsize) { 2864 debug("Clipped extent %llu by a page", ce->ce_diskoffset); 2865 BC_ADD_STAT(extents_clipped, 1); 2866 ce->ce_length = roundsize; 2867 } 2868 2869 numblocks = howmany(ce->ce_length, blocksize); 2870 2871 ce->ce_blockmap = BC_MALLOC(howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)), 2872 M_TEMP, M_WAITOK | M_ZERO); 2873 if (!ce->ce_blockmap) { 2874 message("can't allocate extent blockmap for %d blocks of %d bytes", numblocks, howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES))); 2875 return 1; 2876 } 2877 2878 for (i = 0; i < howmany(ce->ce_length, blocksize); i++) { 2879 CB_MARK_BLOCK_PRESENT((ce), i); 2880 } 2881 2882 return 0; 2883} 2884 2885static void 2886BC_teardown_extent(struct BC_cache_extent *ce) 2887{ 2888 ce->ce_flags |= CE_ABORTED; 2889 _FREE_ZERO(ce->ce_blockmap, M_TEMP); 2890} 2891 2892/* 2893 * Setup a cache disk. 2894 * Returns 0 on success 2895 */ 2896static int 2897BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd) 2898{ 2899 static int next_disk_num; 2900 lck_mtx_init(&cd->cd_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 2901 cd->cd_disk_id = disk_id; 2902 cd->cd_disk_num = next_disk_num++; 2903 cd->cd_flags = 0; 2904 cd->cd_nmounts = 0; 2905 cd->cd_batch = 0; 2906 2907 if (is_ssd) { 2908 cd->cd_flags |= CD_IS_SSD; 2909 } 2910 2911 debug("Setup disk 0x%llx as disk num %d%s", disk_id, cd->cd_disk_num, (cd->cd_flags & CD_IS_SSD) ? " (ssd)" : ""); 2912 2913 return (0); 2914} 2915 2916/* 2917 * Teardown a cache disk. 2918 */ 2919static void 2920BC_teardown_disk(struct BC_cache_disk *cd) 2921{ 2922 /* Nothing to do */ 2923} 2924 2925 2926/* 2927 * Check if a mount has become available for readahead 2928 * 2929 * If so, make sure the reader thread picks up this mount's IOs. 2930 * If there is no reader thread for the mount's disk, kick one off. 
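 * If the disk already has a reader thread, we only clear CD_ISSUE_LOWPRI so the new extents are
 * issued at regular priority; otherwise a BC_reader_thread is started for the disk, and on any
 * failure the mount's extents are aborted so sleeping strategy calls get woken.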
2931 * 2932 * Called with the cache mount read lock held 2933 */ 2934static void 2935BC_mount_available(struct BC_cache_mount *cm) 2936{ 2937#ifdef EMULATE_ONLY 2938 int i; 2939 for (i = 0; i < cm->cm_nextents; i++) 2940 cm->cm_pextents[i]->ce_flags |= CE_IODONE; 2941#else 2942 int i, error; 2943 thread_t rthread; 2944 if (cm->cm_state != CM_READY) { 2945 /* not ready */ 2946 return; 2947 } 2948 2949 struct BC_cache_disk *cd = cm->cm_disk; 2950 LOCK_DISK(cd); 2951 cd->cd_nmounts++; 2952 LOCK_READERS(); 2953 if (!(BC_cache->c_flags & BC_FLAG_SHUTDOWN)) { 2954 if (cd->cd_flags & CD_HAS_THREAD) { 2955 /* Make sure the thread is not issuing lowpriority IO */ 2956 if (cd->cd_flags & CD_ISSUE_LOWPRI) { 2957 debug("Interrupting low-priority thread for new mount %s", uuid_string(cm->cm_uuid)); 2958 cd->cd_flags &= (~CD_ISSUE_LOWPRI); 2959 /* TODO: Unthrottle the readahead thread here rather than waiting out its current throttle delay */ 2960 } 2961 UNLOCK_READERS(); 2962 UNLOCK_DISK(cd); 2963 return; 2964 } 2965 2966 /* Make sure we issue regular IOs for the new playlist in case we were issuing low-priority previously */ 2967 cd->cd_flags &= (~CD_ISSUE_LOWPRI); 2968 2969 debug("Kicking off reader thread for disk %d for mount %s", cd->cd_disk_num, uuid_string(cm->cm_uuid)); 2970 if ((error = kernel_thread_start(BC_reader_thread, cd, &rthread)) == KERN_SUCCESS) { 2971 thread_deallocate(rthread); 2972 BC_cache->c_num_reader_threads++; 2973 BC_ADD_STAT(readahead_threads, 1); 2974 cd->cd_flags |= CD_HAS_THREAD; 2975 UNLOCK_READERS(); 2976 UNLOCK_DISK(cd); 2977 return; 2978 } 2979 2980 message("Unable to start reader thread for disk %d: %d", cd->cd_disk_num, error); 2981 } 2982 UNLOCK_READERS(); 2983 UNLOCK_DISK(cd); 2984 2985 /* 2986 * Getting here indicates some failure. 2987 * 2988 * Mark all extents as aborted. This will notify any sleepers in the 2989 * strategy routine that the extent won't have data for them. 2990 */ 2991 for (i = 0; i < cm->cm_nextents; i++) { 2992 LOCK_EXTENT(cm->cm_pextents[i]); 2993 BC_teardown_extent(cm->cm_pextents[i]); 2994 UNLOCK_EXTENT(cm->cm_pextents[i]); 2995 wakeup(cm->cm_pextents[i]); 2996 } 2997#endif 2998} 2999 3000/* 3001 * Setup a cache mount from the playlist mount. 3002 * 3003 * Allocates cm_pextents large enough to hold pm->pm_nentries extents, 3004 * but leaves cm_nextents 0 since the array hasn't been initialized. 3005 * 3006 * Returns 0 on success 3007 */ 3008static int 3009BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm) 3010{ 3011 int error = 0; 3012 3013 lck_rw_init(&cm->cm_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 3014 uuid_copy(cm->cm_uuid, pm->pm_uuid); 3015 3016 /* These will be set once we've detected that this volume has been mounted */ 3017 cm->cm_dev = nulldev(); 3018 cm->cm_bp = NULL; 3019 cm->cm_devsize = 0; 3020 cm->cm_maxread = 0; 3021 cm->cm_disk = NULL; 3022 cm->cm_blocksize = 0; 3023 cm->cm_nextents = 0; 3024 3025 /* Allocate the sorted extent array. 3026 * We'll fill it in while we setup the extents. 
*/ 3028 if (pm->pm_nentries == 0) { 3029 message("Playlist included mount with 0 entries"); 3030 error = EINVAL; 3031 goto out; 3032 } 3033 3034 cm->cm_pextents = BC_MALLOC(pm->pm_nentries * sizeof(*cm->cm_pextents), M_TEMP, M_WAITOK | M_ZERO); 3035 if (cm->cm_pextents == NULL) { 3036 message("can't allocate mount's extent array (%d entries)", pm->pm_nentries); 3037 error = ENOMEM; 3038 goto out; 3039 } 3040 3041 cm->cm_state = CM_SETUP; 3042 3043out: 3044 if (error) { 3045 cm->cm_state = CM_ABORTED; 3046 _FREE_ZERO(cm->cm_pextents, M_TEMP); 3047 lck_rw_destroy(&cm->cm_lock, BC_cache->c_lckgrp); 3048 } 3049 return error; 3050} 3051 3052/* 3053 * Fill in the rest of the cache mount given the matching mount structure. 3054 * 3055 * Called with the cache mount write lock held 3056 * 3057 * Returns 0 if the mount was successfully filled in and cm_state will be CM_READY 3058 * Returns non-0 on failure or if the mount was already filled in 3059 */ 3060static int 3061BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context) 3062{ 3063 uint64_t blkcount, max_byte_count, max_segment_count, max_segment_byte_count, max_block_count; 3064 uint32_t blksize, is_ssd; 3065 int error, mount_idx, i; 3066 u_int64_t disk_id; 3067 struct BC_cache_disk *cd; 3068 vnode_t devvp = NULLVP; 3069 3070 error = 0; 3071 3072 if (CM_SETUP != cm->cm_state) { 3073 return EINVAL; 3074 } 3075 3076 cm->cm_throttle_mask = vfs_throttle_mask(mount); 3077 disk_id = cm->cm_throttle_mask; /* using the throttle mask as a way to identify the physical disk */ 3078 /* debug("Got throttle mask %llx for mount %s", cm->cm_throttle_mask, uuid_string(cm->cm_uuid)); */ 3079 3080 devvp = vfs_devvp(mount); 3081 if (devvp == NULLVP) { 3082 message("mount %s does not have a vnode", uuid_string(cm->cm_uuid)); 3083 error = EINVAL; 3084 goto out; 3085 } 3086 3087#ifdef ROOT_DISK_ONLY 3088 if (devvp == rootvp) { 3089 BC_cache->c_root_disk_id = disk_id; 3090 if (BC_cache->c_root_disk_id == 0) { 3091 message("Root disk is 0"); 3092 } else { 3093 debug("Root disk (via cache) is 0x%llx", BC_cache->c_root_disk_id); 3094 } 3095 } else if (0 == BC_cache->c_root_disk_id) { 3096 error = EAGAIN; /* we haven't seen the root mount yet, try again later */ 3097 goto out; 3098 3099 //rdar://11653286 disk image volumes (FileVault 1) are messing with this check, so we're going back to != rather than !( & ) 3100 } else if (BC_cache->c_root_disk_id != disk_id) { 3101 debug("mount %s (disk 0x%llx) is not on the root disk (disk 0x%llx)", uuid_string(cm->cm_uuid), disk_id, BC_cache->c_root_disk_id); 3102 error = ENODEV; 3103 goto out; 3104 } 3105#endif 3106 3107 /* See if we already have a cache_disk for this disk */ 3108 for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) { 3109 if (BC_cache->c_mounts + mount_idx == cm) continue; 3110 3111 cd = BC_cache->c_mounts[mount_idx].cm_disk; 3112 3113 /* 3114 * We're not handling mounts backed by multiple disks as gracefully as we should. 3115 * Previously, this was cd->cd_disk_id == disk_id, so we had a thread per disk combination 3116 * meaning reader threads may span multiple disks and disks may have multiple reader threads. 3117 * We've only ever supported the root disk, however, so this wasn't a problem, it just missed 3118 * cases where you'd have other volumes on one of the disks you're booting from. 3119 * 3120 * Now, since we are checking for cd->cd_disk_id & disk_id, we at least include all mounts that 3121 * are on disks that the root mount uses.
We still only have one reader thread, but we don't support 3122 * playback on composite disks, so that's not a problem yet. See rdar://10081513 3123 * 3124 * This assumes that the root mount (or, at least the mount we care most about) will always appear first 3125 * 3126 */ 3127 if (cd && (cd->cd_disk_id & disk_id)) { 3128 cm->cm_disk = cd; 3129 break; 3130 } 3131 } 3132 3133 /* If we don't already have a cache_disk for this disk, allocate one */ 3134 if (cm->cm_disk == NULL) { 3135 cd = BC_MALLOC(sizeof(*cd), M_TEMP, M_WAITOK | M_ZERO); 3136 if (cd == NULL) { 3137 message("can't allocate memory for cache disk"); 3138 error = ENOMEM; 3139 goto out; 3140 } 3141 3142 if (VNOP_IOCTL(devvp, /* vnode */ 3143 DKIOCISSOLIDSTATE, /* cmd */ 3144 (caddr_t)&is_ssd, /* data */ 3145 0, 3146 context)) /* context */ 3147 { 3148 message("can't determine if disk is a solid state disk for mount %s", uuid_string(cm->cm_uuid)); 3149 is_ssd = 0; 3150 } 3151 3152 if ((error = BC_setup_disk(cd, disk_id, is_ssd)) != 0) { 3153 _FREE_ZERO(cd, M_TEMP); 3154 message("cache disk setup failed: %d", error); 3155 goto out; 3156 } 3157 cm->cm_disk = cd; 3158 } 3159 3160 3161 if (VNOP_IOCTL(devvp, 3162 DKIOCGETBLOCKCOUNT, 3163 (caddr_t)&blkcount, 3164 0, 3165 context) 3166 || blkcount == 0) 3167 { 3168 message("can't determine device size, not checking"); 3169 blkcount = 0; 3170 } 3171 3172 if (VNOP_IOCTL(devvp, 3173 DKIOCGETBLOCKSIZE, 3174 (caddr_t)&blksize, 3175 0, 3176 context) 3177 || blksize == 0) 3178 { 3179 message("can't determine device block size for mount %s, defaulting to 512 bytes", uuid_string(cm->cm_uuid)); 3180 blksize = 512; 3181 } 3182 3183 if (PAGE_SIZE % blksize != 0) { 3184 message("PAGE_SIZE (%d) is not a multiple of block size (%d) for mount %s", PAGE_SIZE, blksize, uuid_string(cm->cm_uuid)); 3185 error = EINVAL; 3186 goto out; 3187 } 3188 3189 cm->cm_blocksize = blksize; 3190 cm->cm_devsize = blksize * blkcount; 3191 3192 /* max read size isn't larger than the max UPL size */ 3193 cm->cm_maxread = ubc_upl_maxbufsize(); 3194 3195 /* maxread = min ( maxread, MAXBYTECOUNTREAD ) */ 3196 if (0 == VNOP_IOCTL(devvp, 3197 DKIOCGETMAXBYTECOUNTREAD, 3198 (caddr_t)&max_byte_count, 3199 0, 3200 context)) { 3201 if (cm->cm_maxread > max_byte_count && max_byte_count > 0) { 3202 cm->cm_maxread = max_byte_count; 3203 debug("MAXBYTECOUNTREAD is %#llx", max_byte_count); 3204 } 3205 } 3206 3207 /* maxread = min ( maxread, MAXBLOCKCOUNTREAD * BLOCKSIZE ) */ 3208 if (0 == VNOP_IOCTL(devvp, 3209 DKIOCGETMAXBLOCKCOUNTREAD, 3210 (caddr_t)&max_block_count, 3211 0, 3212 context)) { 3213 if (cm->cm_maxread > max_block_count * cm->cm_blocksize && max_block_count > 0) { 3214 cm->cm_maxread = max_block_count * cm->cm_blocksize; 3215 debug("MAXBLOCKCOUNTREAD is %#llx, BLOCKSIZE is %#llx, (multiplied %#llx)", max_block_count, cm->cm_blocksize, max_block_count * cm->cm_blocksize, cm->cm_maxread); 3216 } 3217 } 3218 3219 /* maxread = min ( maxread, MAXSEGMENTCOUNTREAD * min (MAXSEGMENTBYTECOUNTREAD, PAGE_SIZE ) ) */ 3220 if (0 == VNOP_IOCTL(devvp, 3221 DKIOCGETMAXSEGMENTCOUNTREAD, 3222 (caddr_t)&max_segment_count, 3223 0, 3224 context)) { 3225 3226 if (max_segment_count > 0) { 3227 3228 if (0 == VNOP_IOCTL(devvp, 3229 DKIOCGETMAXSEGMENTBYTECOUNTREAD, 3230 (caddr_t)&max_segment_byte_count, 3231 0, 3232 context)) { 3233 //rdar://13835534 Limit max_segment_byte_count to PAGE_SIZE because some drives don't handle the spec correctly 3234 if (max_segment_byte_count > PAGE_SIZE || max_segment_byte_count == 0) { 3235 
debug("MAXSEGMENTBYTECOUNTREAD is %#llx, limiting to PAGE_SIZE %#x", max_segment_byte_count, PAGE_SIZE); 3236 max_segment_byte_count = PAGE_SIZE; 3237 } 3238 } else { 3239 debug("Unable to get MAXSEGMENTBYTECOUNTREAD, assuming PAGE_SIZE %#x", PAGE_SIZE); 3240 max_segment_byte_count = PAGE_SIZE; 3241 } 3242 3243 if (cm->cm_maxread > max_segment_count * max_segment_byte_count) { 3244 cm->cm_maxread = max_segment_count * max_segment_byte_count; 3245 debug("MAXSEGMENTCOUNTREAD is %#llx, MAXSEGMENTBYTECOUNTREAD is %#llx, (multiplied %#llx)", max_segment_count, max_segment_byte_count, max_segment_count * max_segment_byte_count); 3246 } 3247 } 3248 } 3249 3250 /* maxread = min ( maxread, MAX_UPL_TRANSFER * PAGE_SIZE ) */ 3251 if (cm->cm_maxread > MAX_UPL_TRANSFER * PAGE_SIZE) { 3252 cm->cm_maxread = MAX_UPL_TRANSFER * PAGE_SIZE; 3253 debug("MAX_UPL_TRANSFER is %#x, PAGE_SIZE is %#x, (multiplied %#x)", MAX_UPL_TRANSFER, PAGE_SIZE, MAX_UPL_TRANSFER * PAGE_SIZE); 3254 } 3255 3256 /* maxread = max ( maxread, MAXPHYS ) */ 3257 if (cm->cm_maxread < MAXPHYS) { 3258 debug("can't determine device read size for mount %s; using default", uuid_string(cm->cm_uuid)); 3259 cm->cm_maxread = MAXPHYS; 3260 } 3261 3262 /* make sure maxread is a multiple of the block size */ 3263 if (cm->cm_maxread % cm->cm_blocksize != 0) { 3264 debug("Mount max IO size (%llu) not a multiple of block size (%llu)", cm->cm_maxread, cm->cm_blocksize); 3265 cm->cm_maxread -= cm->cm_maxread % cm->cm_blocksize; 3266 } 3267 3268 cm->cm_bp = buf_alloc(devvp); 3269 if (cm->cm_bp == NULL) { 3270 message("can't allocate buf"); 3271 error = ENOMEM; 3272 goto out; 3273 } 3274 3275 cm->cm_dev = vnode_specrdev(devvp); 3276 if (cm->cm_dev == nulldev()) { 3277 message("mount %s does not have a device", uuid_string(cm->cm_uuid)); 3278 error = EINVAL; 3279 goto out; 3280 } 3281 3282 for (i = 0; i < cm->cm_nextents; i++) { 3283 LOCK_EXTENT(cm->cm_pextents[i]); 3284 if (0 != BC_fill_in_extent(cm->cm_pextents[i])) { 3285 BC_teardown_extent(cm->cm_pextents[i]); 3286 UNLOCK_EXTENT(cm->cm_pextents[i]); 3287 wakeup(cm->cm_pextents[i]); 3288 } else { 3289 UNLOCK_EXTENT(cm->cm_pextents[i]); 3290 } 3291 } 3292 3293 cm->cm_state = CM_READY; 3294 3295 debug("Found new cache mount %s disk %d, %d extents", uuid_string(cm->cm_uuid), cm->cm_disk->cd_disk_num, cm->cm_nextents); 3296 3297out: 3298 if (error && error != EAGAIN) { 3299 /* 3300 * Mark all extents as aborted. This will notify any sleepers in the 3301 * strategy routine that the extent won't have data for them. 3302 * 3303 * Note that its fine to take the extent lock here since no one will 3304 * have extent lock and try to take the mount lock if the mount isn't CM_READY 3305 */ 3306 for (i = 0; i < cm->cm_nextents; i++) { 3307 LOCK_EXTENT(cm->cm_pextents[i]); 3308 BC_teardown_extent(cm->cm_pextents[i]); 3309 UNLOCK_EXTENT(cm->cm_pextents[i]); 3310 wakeup(cm->cm_pextents[i]); 3311 } 3312 3313 BC_teardown_mount(cm); 3314 } 3315 if (devvp != NULLVP) { 3316 vnode_put(devvp); 3317 } 3318 return error; 3319} 3320 3321/* 3322 * Called with the cache mount write lock held 3323 */ 3324static void 3325BC_teardown_mount(struct BC_cache_mount *cm) 3326{ 3327 if (cm->cm_bp) { 3328 buf_free(cm->cm_bp); 3329 cm->cm_bp = NULL; 3330 } 3331 cm->cm_nextents = 0; 3332 _FREE_ZERO(cm->cm_pextents, M_TEMP); 3333 cm->cm_state = CM_ABORTED; 3334} 3335 3336/* 3337 * Check to see which mounts have been mounted and finish setting them up. 
3338 * 3339 * No extent/mount locks required since we have a write lock on the cache 3340 * 3341 * Called with the cache write lock held 3342 */ 3343static int fill_in_bc_cache_mounts(mount_t mount, void* arg) 3344{ 3345 int mount_idx, error, i; 3346 struct vfs_attr attr; 3347 vfs_context_t context; 3348 int* go_again = (int*) arg; 3349 3350 VFSATTR_INIT(&attr); 3351 VFSATTR_WANTED(&attr, f_uuid); 3352 context = vfs_context_create(NULL); 3353 error = vfs_getattr(mount, &attr, context); 3354 if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) { 3355#ifdef BC_DEBUG 3356 char name[MFSNAMELEN]; 3357 vfs_name(mount, name); 3358 if (strncmp("devfs", name, sizeof("devfs")) != 0) { 3359 debug("Found mount %s for IO without a UUID: error %d", name, error); 3360 } 3361#endif 3362 vfs_context_rele(context); 3363 return VFS_RETURNED; 3364 } else { 3365 // debug("Found mount for IO with UUID %s", uuid_string(attr.f_uuid)); 3366 3367 for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) { 3368 struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx; 3369 int match = 0; 3370 if (CM_SETUP == cm->cm_state) { 3371 3372 if (0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) { 3373 match = 1; 3374 } 3375 3376 /* a null uuid indicates we want to match the root volume, no matter what the uuid (8350414) */ 3377 if ((!match) && uuid_is_null(cm->cm_uuid)) { 3378 vnode_t devvp = vfs_devvp(mount); 3379 if (vnode_specrdev(devvp) == rootdev) { 3380 uuid_copy(cm->cm_uuid, attr.f_uuid); 3381 match = 1; 3382 debug("Found root mount %s", uuid_string(cm->cm_uuid)); 3383 } 3384 vnode_put(devvp); 3385 } 3386 3387 if (match) { 3388 /* Found a matching mount */ 3389 3390 /* Locking here isn't necessary since we're only called while holding the cache write lock 3391 * and no one holds the mount locks without also holding the cache lock 3392 */ 3393 if (BC_fill_in_mount(cm, mount, context) == EAGAIN) { 3394 *go_again = 1; 3395 } 3396 vfs_context_rele(context); 3397 3398 /* Check to see if we have any more mounts to fill in */ 3399 for (i = 0; i < BC_cache->c_nmounts; i++) { 3400 if (CM_SETUP == BC_cache->c_mounts[i].cm_state) { 3401 /* have more to fill in, keep iterating */ 3402 return VFS_RETURNED; 3403 } 3404 } 3405 3406 return VFS_RETURNED_DONE; 3407 } 3408 } 3409 } 3410 } 3411 3412 vfs_context_rele(context); 3413 3414 /* Check to see if we have any more mounts to fill in */ 3415 for (i = 0; i < BC_cache->c_nmounts; i++) { 3416 if (CM_SETUP == BC_cache->c_mounts[i].cm_state) { 3417 /* have more to fill in, keep iterating */ 3418 return VFS_RETURNED; 3419 } 3420 } 3421 3422 debug("No new mounts filled in fill_in_bc_cache_mounts"); 3423 return VFS_RETURNED_DONE; 3424 3425} 3426 3427#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET 3428/* 3429 * Sort the extent list. 3430 */ 3431static int 3432compare_cache_extents(const void *vfirst, const void *vsecond) 3433{ 3434 const struct BC_cache_extent *first, *second; 3435 3436 first = (const struct BC_cache_extent *)vfirst; 3437 second = (const struct BC_cache_extent *)vsecond; 3438 3439 // Sort by volume first, then by logical block address 3440 int mount_comparison = first->ce_mount_idx - second->ce_mount_idx; 3441 if (mount_comparison != 0) 3442 return((mount_comparison < 0) ? -1 : 1); 3443 3444 if (first->ce_diskoffset == second->ce_diskoffset) 3445 return(0); 3446 return((first->ce_diskoffset < second->ce_diskoffset) ? -1 : 1); 3447} 3448#endif 3449 3450/* 3451 * Fetch the playlist from userspace or the Mach-O segment where it 3452 * was preloaded. 
Add it to the cache and readahead in batches after 3453 * the current cache, if any. 3454 * 3455 * Returns 0 on success. In failure cases, the cache is not modified. 3456 * 3457 * Called with the cache write lock held. 3458 */ 3459static int 3460BC_copyin_playlist(size_t mounts_size, user_addr_t mounts_buf, size_t entries_size, user_addr_t entries_buf) 3461{ 3462 int nplaylist_mounts, nplaylist_entries; 3463 struct BC_playlist_mount *playlist_mounts = NULL, *pm; 3464 struct BC_playlist_entry *playlist_entries = NULL, *pe; 3465 struct BC_cache_mount *cm, *old_cm; 3466 struct BC_cache_extent *ce; 3467 3468 int ncache_mounts = 0, ncache_extents = 0, nnew_mounts = 0; 3469 struct BC_cache_mount *cache_mounts = NULL; 3470 struct BC_cache_extent *cache_extents = NULL; 3471 int *pmidx_to_cmidx = NULL; 3472 int *next_old_extent_idx = NULL; 3473 3474 int *cache_nextents = NULL; 3475 struct BC_cache_extent **cache_extents_list = NULL; 3476 3477 int old_batch_count; 3478 u_int64_t old_cache_size; 3479 3480 int error, pm_idx, cm_idx, pe_idx, ce_idx, cel_idx, pm_idx2, max_entries; 3481 off_t p; 3482 u_int64_t size; 3483 int actual, remaining_entries; 3484 3485 int clipped_extents = 0, enveloped_extents = 0; 3486 3487 playlist_mounts = NULL; 3488 playlist_entries = NULL; 3489 3490#ifdef BC_DEBUG 3491 struct timeval start_time; 3492 microtime(&start_time); 3493#endif 3494 3495 old_batch_count = BC_cache->c_batch_count; 3496 old_cache_size = BC_cache->c_cachesize; 3497 3498 nplaylist_mounts = (int) mounts_size / sizeof(*playlist_mounts); 3499 nplaylist_entries = (int) entries_size / sizeof(*playlist_entries); 3500 3501 if (BC_cache->c_stats.ss_total_extents > 0) { 3502 debug("adding %u extents to existing cache of %u extents", nplaylist_entries, BC_cache->c_stats.ss_total_extents); 3503 } else { 3504 debug("setting up first cache of %d extents", nplaylist_entries); 3505 } 3506 3507 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 3508 debug("Mount %s has %d extents", uuid_string(BC_cache->c_mounts[cm_idx].cm_uuid), BC_cache->c_mounts[cm_idx].cm_nextents); 3509 } 3510 3511 /* 3512 * Mount array 3513 */ 3514 if (BC_preloaded_playlist) { 3515 playlist_mounts = CAST_DOWN(struct BC_playlist_mount *, mounts_buf); 3516 } else { 3517 playlist_mounts = BC_MALLOC(mounts_size, M_TEMP, M_WAITOK); 3518 if (playlist_mounts == NULL) { 3519 message("can't allocate unpack mount buffer of %ld bytes", mounts_size); 3520 error = ENOMEM; 3521 goto out; 3522 } 3523 if ((error = copyin(mounts_buf, playlist_mounts, mounts_size)) != 0) { 3524 message("copyin %ld bytes from 0x%llx to %p failed", mounts_size, mounts_buf, playlist_mounts); 3525 goto out; 3526 } 3527 3528 /* if BC_preloaded_playlist is non-NULL, playlist_mounts must be freed */ 3529 } 3530 3531 /* map from the playlist mount index to the corresponding cache mount index */ 3532 pmidx_to_cmidx = BC_MALLOC(nplaylist_mounts * sizeof(*pmidx_to_cmidx), M_TEMP, M_WAITOK); 3533 if (pmidx_to_cmidx == NULL) { 3534 message("can't allocate playlist index to cache index reference array for %d mounts", nplaylist_mounts); 3535 error = ENOMEM; 3536 goto out; 3537 } 3538 3539 /* In order to merge the mount's current (sorted) extent list with the new (sorted) 3540 * extent list from the new playlist we first grow the extent list allocation to fit 3541 * both arrays, then merge the two arrays into the new allocation. 
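 * A minimal sketch of that merge step (illustrative only; the variable names here are placeholders,
 * and the real loop below also clips overlapping ranges and tracks per-mount progress via
 * next_old_extent_idx):
 *
 *   int oi = 0, ni = 0, out = 0;
 *   while (oi < old_nextents && ni < nnew_entries) {
 *       if (old_pextents[oi]->ce_diskoffset <= new_extents[ni].ce_diskoffset)
 *           merged_pextents[out++] = old_pextents[oi++];
 *       else
 *           merged_pextents[out++] = &new_extents[ni++];
 *   }
 *   while (oi < old_nextents) merged_pextents[out++] = old_pextents[oi++];
 *   while (ni < nnew_entries) merged_pextents[out++] = &new_extents[ni++];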
	 */

	if (BC_cache->c_nmounts > 0) {
		/* next_old_extent_idx saves the index of the next unmerged extent in the original mount's extent list */
		next_old_extent_idx = BC_MALLOC(BC_cache->c_nmounts * sizeof(*next_old_extent_idx), M_TEMP, M_WAITOK | M_ZERO);
		if (next_old_extent_idx == NULL) {
			message("can't allocate index array for %d mounts", BC_cache->c_nmounts);
			error = ENOMEM;
			goto out;
		}

		/* 0-filled since all the indices start at 0 */
	}

	nnew_mounts = nplaylist_mounts;

	/* determine how many mounts are new and how many we already have */
	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
		pmidx_to_cmidx[pm_idx] = -1; /* invalid by default */

		for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
			if (0 == uuid_compare(BC_cache->c_mounts[cm_idx].cm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
				/* already have a record of this mount, won't need a new spot for it */
				pmidx_to_cmidx[pm_idx] = cm_idx;
				nnew_mounts--;
				break;
			}
		}

		/* check to make sure there aren't any duplicate mounts within the playlist */
		for (pm_idx2 = pm_idx + 1; pm_idx2 < nplaylist_mounts; pm_idx2++) {
			if (0 == uuid_compare(playlist_mounts[pm_idx2].pm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
				message("Playlist had duplicate mount %s", uuid_string(playlist_mounts[pm_idx2].pm_uuid));
				error = EINVAL;
				goto out;
			}
		}
	}

	/* cache_mounts is the array of mounts that will replace the one currently in the cache */
	cache_mounts = BC_MALLOC((BC_cache->c_nmounts + nnew_mounts) * sizeof(*cache_mounts), M_TEMP, M_WAITOK | M_ZERO);
	if (cache_mounts == NULL) {
		message("can't allocate memory for %d mounts", (BC_cache->c_nmounts + nnew_mounts));
		error = ENOMEM;
		goto out;
	}
	memcpy(cache_mounts, BC_cache->c_mounts, (BC_cache->c_nmounts * sizeof(*BC_cache->c_mounts)));

	/* ncache_mounts is the current number of valid mounts in the mount array */
	ncache_mounts = BC_cache->c_nmounts;

	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
		if (pmidx_to_cmidx[pm_idx] != -1) {
			/* We already have a record for this mount */

			if (playlist_mounts[pm_idx].pm_nentries == 0) {
				message("Playlist included mount with 0 entries");
				error = EINVAL;
				goto out;
			}

			cm_idx = pmidx_to_cmidx[pm_idx];

			assert(cm_idx < BC_cache->c_nmounts);

			/* grow the mount's extent array to fit the new extents (we'll fill in the new extents below) */
			cache_mounts[cm_idx].cm_pextents = BC_MALLOC((BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries) * sizeof(*cache_mounts[cm_idx].cm_pextents), M_TEMP, M_WAITOK | M_ZERO);
			if (cache_mounts[cm_idx].cm_pextents == NULL) {
				message("can't allocate mount's extent array (%d entries)", (BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries));
				error = ENOMEM;
				goto out;
			}

			/* don't free the old extent list yet, the real BC_cache's mount still points to it and we may yet fail */

			/* cm_nextents is the number of valid extents in this mount's extent list.
It starts out as 0 and we fill it in below */ 3618 cache_mounts[cm_idx].cm_nextents = 0; 3619 } else { 3620 /* New mount for the cache */ 3621 3622 if ((error = BC_setup_mount(cache_mounts + ncache_mounts, playlist_mounts + pm_idx)) != 0) { 3623 goto out; 3624 } 3625 3626 pmidx_to_cmidx[pm_idx] = ncache_mounts; 3627 ncache_mounts++; 3628 } 3629 } 3630 3631 /* 3632 * Extent array 3633 * 3634 * The existing extent array cannot move since a strategy routine may be 3635 * in the middle of an msleep with one of the extent locks. So, we allocate 3636 * a new array for the new extents. 3637 * 3638 * Add one extent list to our arrays of extent lists 3639 */ 3640 3641 /* cache_nextents is an array of ints, each indicating the number of extents in the list in cache_extents_list at the same index */ 3642 cache_nextents = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_nextents), M_TEMP, M_WAITOK | M_ZERO); 3643 if (cache_nextents == NULL) { 3644 message("can't allocate memory for %d extent list sizes", nplaylist_entries); 3645 error = ENOMEM; 3646 goto out; 3647 } 3648 memcpy(cache_nextents, BC_cache->c_nextents, BC_cache->c_nextentlists * sizeof(*BC_cache->c_nextents)); 3649 3650 /* cache_extents_list is the array of extent lists. The extent list at the last index is for the new playlist's cache */ 3651 cache_extents_list = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_extentlists), M_TEMP, M_WAITOK | M_ZERO); 3652 if (cache_extents_list == NULL) { 3653 message("can't allocate memory for %d extent lists", nplaylist_entries); 3654 error = ENOMEM; 3655 goto out; 3656 } 3657 memcpy(cache_extents_list, BC_cache->c_extentlists, BC_cache->c_nextentlists * sizeof(*BC_cache->c_extentlists)); 3658 3659 /* The extent list for this new playlist's cache */ 3660 cache_extents = BC_MALLOC(nplaylist_entries * sizeof(*cache_extents), M_TEMP, M_WAITOK | M_ZERO); 3661 if (cache_extents == NULL) { 3662 message("can't allocate memory for %d extents", nplaylist_entries); 3663 error = ENOMEM; 3664 goto out; 3665 } 3666 3667 /* TODO: iterate over our history and remove any blocks we've already seen from the new cache */ 3668 3669 /* Fill in the new extents. 3670 * 3671 * We just tack the new playlist onto the end of any cache we currently have as a new batch. 3672 * 3673 * At this point, we assume that any additional caches should be in additional batches just as 3674 * if there was a single recording with a tag separating the new extents. If we did decide to 3675 * merge, it would still be hard since that would require reordering the extent list and 3676 * BC_strategy assumes that the cache extents never move (see the msleep in wait_for_extent). 3677 * 3678 * We also don't coalesce the new extents into the old (and lower batch) extents. 3679 * This would be a bit of work, we assume we want a new batch anyway, and it's rare that 3680 * a new cache comes in while an old cache is still in readahead. So, we don't bother. 3681 * 3682 * We do clip any overlap with the new cache from the old cache, however. 3683 * 3684 * We don't clip any overlap with the history from the new extents. 3685 * The history list isn't ordered, so we don't have a fast way to compare the ranges. It's also 3686 * expected that the overlap is minimal. If we stopped recording history at some point, we don't 3687 * have this info anyway. So, we don't bother. 
3688 */ 3689 3690 /* size is the size in bytes of the cache, used to allocate our page buffer */ 3691 size = 0; 3692 3693 if (BC_preloaded_playlist) { 3694 /* 3695 * Unpack the static control entry array into the extent array. 3696 */ 3697 playlist_entries = CAST_DOWN(struct BC_playlist_entry *, entries_buf); 3698 } else { 3699 /* 3700 * Since the playlist control entry structure is not the same as the 3701 * extent structure we use, we need to copy the control entries in 3702 * and unpack them. 3703 */ 3704 playlist_entries = BC_MALLOC(BC_PLC_CHUNK * sizeof(*playlist_entries), 3705 M_TEMP, M_WAITOK); 3706 if (playlist_entries == NULL) { 3707 message("can't allocate unpack buffer for %d entries", BC_PLC_CHUNK); 3708 error = ENOMEM; 3709 goto out; 3710 } 3711 } 3712 3713 remaining_entries = nplaylist_entries; 3714 while (remaining_entries > 0) { 3715 3716 if (BC_preloaded_playlist) { 3717 actual = remaining_entries; 3718 } else { 3719 actual = min(remaining_entries, BC_PLC_CHUNK); 3720 if ((error = copyin(entries_buf, playlist_entries, 3721 actual * sizeof(struct BC_playlist_entry))) != 0) { 3722 message("copyin from 0x%llx to %p failed", entries_buf, playlist_entries); 3723 goto out; 3724 } 3725 } 3726 3727 /* unpack into our array */ 3728 for (pe_idx = 0; pe_idx < actual; pe_idx++) { 3729 pe = playlist_entries + pe_idx; 3730 if (pe->pe_length == 0) { 3731 debug("Bad Playlist: entry %d has 0 length", (nplaylist_entries - remaining_entries) + pe_idx); 3732 error = EINVAL; 3733 goto out; 3734 } 3735 pm_idx = pe->pe_mount_idx; 3736 3737 if (pm_idx >= nplaylist_mounts || pm_idx < 0) { 3738 message("Bad playlist: entry %d referenced non-existent mount index %d", (nplaylist_entries - remaining_entries) + pe_idx, pm_idx); 3739 error = EINVAL; 3740 goto out; 3741 } 3742 3743 pm = playlist_mounts + pm_idx; 3744 cm_idx = pmidx_to_cmidx[pm_idx]; 3745 cm = cache_mounts + cm_idx; 3746 ce = cache_extents + ncache_extents; 3747 3748 /* The size of the extent list is the number of playlist entries + the number of old cache extents */ 3749 max_entries = pm->pm_nentries + ((cm_idx < BC_cache->c_nmounts) ? BC_cache->c_mounts[cm_idx].cm_nextents : 0); 3750 3751 if (cm->cm_nextents >= max_entries) { 3752 message("Bad playlist: more entries existed than the mount %s claimed (%d)", uuid_string(pm->pm_uuid), pm->pm_nentries); 3753 error = EINVAL; 3754 goto out; 3755 } 3756 3757 if ((error = BC_setup_extent(ce, pe, old_batch_count, cm_idx)) != 0) { 3758 goto out; 3759 } 3760 3761 /* Merge the new extent with the old extents for this mount. The new extent may get clipped. */ 3762#ifdef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET 3763 if (cm_idx < BC_cache->c_nmounts) { 3764 old_cm = BC_cache->c_mounts + cm_idx; 3765 /* Copy any lower or equal extents from the existing playlist down to the low end of the array */ 3766 while (next_old_extent_idx[cm_idx] < old_cm->cm_nextents && 3767 old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset <= ce->ce_diskoffset) { 3768 cm->cm_pextents[cm->cm_nextents] = old_cm->cm_pextents[next_old_extent_idx[cm_idx]]; 3769 cm->cm_nextents++; 3770 next_old_extent_idx[cm_idx]++; 3771 } 3772 3773 /* check for overlap with the next extent in the old list and clip the new extent */ 3774 if (next_old_extent_idx[cm_idx] < old_cm->cm_nextents) { 3775 3776 /* FIXME: rdar://9153031 If the new extent extends past the end of the old extent, we're losing caching! 
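					 * For example (illustrative): a new extent covering 0x2000-0x5000 clipped against
					 * an old extent covering 0x3000-0x4000 is truncated to 0x2000-0x3000, so the
					 * range 0x4000-0x5000 beyond the old extent ends up cached by neither.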
*/ 3777 if (ce->ce_diskoffset + ce->ce_length > (old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset + old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length)) { 3778 debug("!!! Entry %d (0x%llx:%lld) is getting clipped too much because of previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset, old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length); 3779 } 3780 3781 u_int64_t max_offset = old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset; 3782 if (max_offset < ce->ce_diskoffset + ce->ce_length) { 3783 if (max_offset <= ce->ce_diskoffset) { 3784 ce->ce_length = 0; 3785 } else { 3786 ce->ce_length = (max_offset - ce->ce_diskoffset); 3787 } 3788 } 3789 } 3790 } 3791 3792 /* check for intersection with the next lower extent in the list and clip the new extent */ 3793 if (cm->cm_nextents > 0) { 3794 u_int64_t min_offset = cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset + cm->cm_pextents[cm->cm_nextents - 1]->ce_length; 3795 if (min_offset > ce->ce_diskoffset) { 3796 3797 /* Check if this extent is overlapping with an extent from the same playlist */ 3798 if (cm->cm_pextents[cm->cm_nextents - 1] >= cache_extents && 3799 cm->cm_pextents[cm->cm_nextents - 1] < cache_extents + ncache_extents) { 3800 message("Bad playlist: entry %d (0x%llx:%lld) overlapped with previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset, cm->cm_pextents[cm->cm_nextents - 1]->ce_length); 3801 error = EINVAL; 3802 goto out; 3803 } 3804 3805 if (min_offset >= ce->ce_diskoffset + ce->ce_length) { 3806 ce->ce_length = 0; 3807 } else { 3808 ce->ce_length -= (min_offset - ce->ce_diskoffset); 3809 ce->ce_diskoffset = min_offset; 3810 } 3811 } 3812 } 3813 3814 if (ce->ce_length != pe->pe_length) { 3815 clipped_extents++; 3816 } 3817 if (ce->ce_length == 0) { 3818 /* new extent is entirely captured in extents from the old cache, throw it out */ 3819 enveloped_extents++; 3820 BC_teardown_extent(ce); 3821 lck_mtx_destroy(&ce->ce_lock, BC_cache->c_lckgrp); 3822 3823 /* continue without incrementing ncache_extents so we reuse this extent index */ 3824 continue; 3825 } 3826 size += roundup(ce->ce_length, PAGE_SIZE); 3827#endif 3828 3829 cm->cm_pextents[cm->cm_nextents] = ce; 3830 cm->cm_nextents++; 3831 ncache_extents++; 3832 } 3833 remaining_entries -= actual; 3834 entries_buf += actual * sizeof(struct BC_playlist_entry); 3835 } 3836 3837 /* Fill in any remaining extents from the original cache that are still sitting at the high end of the extent list */ 3838 for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) { 3839 cm_idx = pmidx_to_cmidx[pm_idx]; 3840 cm = cache_mounts + cm_idx; 3841 pm = playlist_mounts + pm_idx; 3842 3843 if (cm->cm_nextents == 0) { 3844 message("Bad playlist: No entries existed for mount %s (claimed %d)", uuid_string(pm->pm_uuid), pm->pm_nentries); 3845 error = EINVAL; 3846 goto out; 3847 } 3848 3849 if (cm_idx < BC_cache->c_nmounts) { 3850 if (next_old_extent_idx[cm_idx] < BC_cache->c_mounts[cm_idx].cm_nextents) { 3851 /* There are more extents in the old extent array, copy them over */ 3852 memcpy(cm->cm_pextents + cm->cm_nextents, BC_cache->c_mounts[cm_idx].cm_pextents + next_old_extent_idx[cm_idx], sizeof(*cm->cm_pextents) * (BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx])); 3853 cm->cm_nextents += 
BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx];
			}
		}

	}

#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
	/* We need an ordered list of extents for each mount, but we were given
	 * the extents in a different ordering, so we need to sort the mount's
	 * extent list here
	 */
	for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
		cm = cache_mounts + cm_idx;

		/* If this is a new mount or a mount with a new extent list, it needs sorting */
		if (cm_idx >= BC_cache->c_nmounts ||
			cm->cm_pextents != BC_cache->c_mounts[cm_idx].cm_pextents) {

			qsort(cm->cm_pextents,
				  cm->cm_nextents,
				  sizeof(*cm->cm_pextents),
				  compare_cache_extents);

			/* Clip the new extents to remove overlap with the old extents */
			for (ce_idx = 0; ce_idx < cm->cm_nextents; ce_idx++) {

				/* Only look at new extents, old extents were already clipped when they were added and can't be clipped further */
				if (cm->cm_pextents[ce_idx] >= cache_extents &&
					cm->cm_pextents[ce_idx] < cache_extents + ncache_extents) {

					ce = cm->cm_pextents[ce_idx]; /* the new extent we may need to clip */
					u_int64_t old_length = ce->ce_length;

					/* Compare against the previous (lower) extent */
					if (ce_idx > 0) {
						u_int64_t min_offset = cm->cm_pextents[ce_idx - 1]->ce_diskoffset + cm->cm_pextents[ce_idx - 1]->ce_length;
						if (min_offset > ce->ce_diskoffset) {
							if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
								ce->ce_length = 0;
							} else {
								ce->ce_length -= (min_offset - ce->ce_diskoffset);
								ce->ce_diskoffset = min_offset;
							}
						}
					}

					/* Compare against the next (higher) extent */
					if (ce_idx < cm->cm_nextents - 1) {
						u_int64_t max_offset = cm->cm_pextents[ce_idx + 1]->ce_diskoffset;
						if (max_offset < ce->ce_diskoffset + ce->ce_length) {

							/* Check if this extent is overlapping with an extent from the same playlist */
							if (cm->cm_pextents[ce_idx + 1] >= cache_extents &&
								cm->cm_pextents[ce_idx + 1] < cache_extents + ncache_extents) {
								message("Bad playlist: entry %d (0x%llx:%lld) overlapped with another entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[ce_idx + 1]->ce_diskoffset, cm->cm_pextents[ce_idx + 1]->ce_length);
								error = EINVAL;
								goto out;
							}

							if (max_offset <= ce->ce_diskoffset) {
								ce->ce_length = 0;
							} else {
								ce->ce_length = (max_offset - ce->ce_diskoffset);
							}
						}
					}

					if (ce->ce_length != old_length) {
						clipped_extents++;
						if (ce->ce_length == 0) {
							/* extent was enveloped, remove this extent from the mount's extent list */
							if ((ce_idx + 1) < cm->cm_nextents) {
								memmove(cm->cm_pextents + ce_idx, cm->cm_pextents + (ce_idx + 1), (cm->cm_nextents - (ce_idx + 1)) * sizeof(*cm->cm_pextents));
							}
							cm->cm_nextents--;
							ce_idx--;
							enveloped_extents++;
						}
					}
				}
			}
		}
	}

	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
		size += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE);
	}

#endif

	if (clipped_extents > 0) {
		debug("%d extents were clipped, %d of which were completely enveloped", clipped_extents, enveloped_extents);
	}

	if (ncache_extents == 0) {
		debug("No extents added, not updating cache");
		error = EALREADY;
		goto out;
	}

	if (!
BC_preloaded_playlist) { 3953 _FREE_ZERO(playlist_entries, M_TEMP); 3954 _FREE_ZERO(playlist_mounts, M_TEMP); 3955 } 3956 3957 if (ncache_extents < nplaylist_entries) { 3958 debug("%d extents added out of %d", ncache_extents, nplaylist_entries); 3959 } 3960 for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) { 3961 debug("Mount %s now has %d extents", uuid_string(cache_mounts[cm_idx].cm_uuid), cache_mounts[cm_idx].cm_nextents); 3962 } 3963 3964 3965 /* 3966 * In order to simplify allocation of the block and page maps, we round 3967 * the size up to a multiple of CB_MAPFIELDBITS pages. This means we 3968 * may waste as many as 31 pages in the worst case. 3969 */ 3970 size = roundup(size, PAGE_SIZE * CB_MAPFIELDBITS); 3971 3972 /* 3973 * Verify that the cache is not larger than 50% of physical memory 3974 * to avoid forcing pageout activity. 3975 * 3976 * max_mem and size are in bytes. All calculations are done in page 3977 * units to avoid overflow/sign issues. 3978 * 3979 * If the cache is too large, we discard it and go to record-only mode. 3980 * This lets us survive the case where the playlist is corrupted or 3981 * manually made too large, but the "real" bootstrap footprint 3982 * would be OK. 3983 */ 3984 BC_cache->c_cachesize += size; 3985 if (BC_cache->c_cachesize > BC_MAX_SIZE) { 3986 message("cache size (%lu bytes) too large for physical memory (%llu bytes), capping at %llu bytes", 3987 (unsigned long)BC_cache->c_cachesize, max_mem, BC_MAX_SIZE); 3988 BC_cache->c_cachesize = BC_MAX_SIZE; 3989 } 3990 3991 /* 3992 * Allocate/grow the pagebuffer. 3993 */ 3994 if (BC_alloc_pagebuffer() != 0) { 3995 message("can't allocate %lld bytes for cache buffer", BC_cache->c_cachesize); 3996 error = ENOMEM; 3997 goto out; 3998 } 3999 4000 /* 4001 * Fix up the extent cache offsets. 
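 * Each new extent is assigned a page-aligned offset into the shared page buffer,
 * starting where the extents of any previously-loaded extent lists end; extents
 * that would extend past the cache size are torn down below.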
4002 */ 4003 p = 0; 4004 if (BC_cache->c_nextentlists > 0) { 4005 /* start at the offset of the end of the previous cache list */ 4006 for (cel_idx = BC_cache->c_nextentlists - 1; cel_idx >= 0; cel_idx--) { 4007 if (BC_cache->c_nextents[cel_idx] > 0) { 4008 struct BC_cache_extent *last_ce = BC_cache->c_extentlists[cel_idx] + BC_cache->c_nextents[cel_idx] - 1; 4009 p = last_ce->ce_cacheoffset + roundup(last_ce->ce_length, PAGE_SIZE); 4010 break; 4011 } 4012 } 4013 } 4014 for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) { 4015 cache_extents[ce_idx].ce_cacheoffset = p; 4016 p += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE); 4017 4018 /* Abort any extents that extend past our cache size */ 4019 if (p >= BC_cache->c_cachesize || cache_extents[ce_idx].ce_length == 0) { 4020 BC_teardown_extent(cache_extents + ce_idx); 4021 } 4022 } 4023 4024 /* We've successfully integrated the new playlist with our cache, copy it into place */ 4025 4026 /* free up old cache's allocations that we're replacing */ 4027 if (BC_cache->c_nmounts > 0) { 4028 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 4029 /* If we have a new extent array for this mount, free the old one */ 4030 if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) { 4031 _FREE_ZERO(BC_cache->c_mounts[cm_idx].cm_pextents, M_TEMP); 4032 } 4033 } 4034 _FREE_ZERO(BC_cache->c_mounts, M_TEMP) 4035 } 4036 BC_cache->c_mounts = cache_mounts; 4037 BC_cache->c_nmounts = ncache_mounts; 4038 4039 if (BC_cache->c_nextentlists > 0) { 4040 _FREE_ZERO(BC_cache->c_extentlists, M_TEMP) 4041 _FREE_ZERO(BC_cache->c_nextents, M_TEMP) 4042 } 4043 4044 cache_nextents[BC_cache->c_nextentlists] = ncache_extents; 4045 cache_extents_list[BC_cache->c_nextentlists] = cache_extents; 4046 4047 BC_cache->c_nextentlists++; 4048 4049 BC_cache->c_nextents = cache_nextents; 4050 BC_cache->c_extentlists = cache_extents_list; 4051 4052 BC_cache->c_stats.ss_total_mounts = ncache_mounts; 4053 BC_cache->c_stats.ss_total_extents += ncache_extents; 4054 4055 /* fill (or tear down) the extents for all the mounts we've already seen */ 4056 for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) { 4057 ce = cache_extents + ce_idx; 4058 if (CM_SETUP != BC_cache->c_mounts[ce->ce_mount_idx].cm_state) { 4059 LOCK_EXTENT(ce); 4060 if (CM_READY == BC_cache->c_mounts[ce->ce_mount_idx].cm_state) { 4061 if (0 == BC_fill_in_extent(ce)) { 4062 UNLOCK_EXTENT(ce); 4063 continue; 4064 } 4065 } 4066 /* either the mount is aborted or fill_in_extent failed */ 4067 BC_teardown_extent(ce); 4068 UNLOCK_EXTENT(ce); 4069 } 4070 } 4071 4072 /* Check to see if we have any more mounts to fill in */ 4073 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 4074 if (CM_SETUP == BC_cache->c_mounts[cm_idx].cm_state) { 4075 /* have at least one mount to fill in, iterate all of them */ 4076 int go_again = 0; 4077 vfs_iterate(0, fill_in_bc_cache_mounts, &go_again); 4078 if (go_again) { 4079 debug("Missing root mount during last iteration, going again"); 4080 vfs_iterate(0, fill_in_bc_cache_mounts, &go_again); 4081 } 4082 break; 4083 } 4084 } 4085 4086 error = 0; 4087 4088 /* all done */ 4089out: 4090 if (! 
BC_preloaded_playlist) {
		_FREE_ZERO(playlist_entries, M_TEMP);
		_FREE_ZERO(playlist_mounts, M_TEMP);
	}
	_FREE_ZERO(pmidx_to_cmidx, M_TEMP);
	_FREE_ZERO(next_old_extent_idx, M_TEMP);
	if (error != 0) {
		if (error != EALREADY) {
			debug("cache setup failed, aborting");
		}

		/* If it was possible to fail after growing the page buffer, we'd need a way to shrink it back here */

		if (BC_cache->c_extentlists == NULL) {
			/* If this is our first cache, clear stats and our page buffer */
			BC_cache->c_stats.ss_total_extents = 0;
			BC_cache->c_stats.ss_total_mounts = 0;
			if (BC_cache->c_vp != NULL) {
				BC_free_pagebuffer();
			}
		}

		BC_cache->c_cachesize = old_cache_size;
		BC_cache->c_batch_count = old_batch_count;

		/* free up new cache's allocations that aren't part of the old cache */
		if (cache_extents) {
			for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
				BC_teardown_extent(cache_extents + ce_idx);
				lck_mtx_destroy(&cache_extents[ce_idx].ce_lock, BC_cache->c_lckgrp);
			}
			_FREE_ZERO(cache_extents, M_TEMP);
		}
		_FREE_ZERO(cache_nextents, M_TEMP);
		_FREE_ZERO(cache_extents_list, M_TEMP);

		if (cache_mounts) {
			for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
				if (cm_idx >= BC_cache->c_nmounts) {
					/* newly allocated mounts, teardown completely */
					BC_teardown_mount(cache_mounts + cm_idx);
					lck_rw_destroy(&cache_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
				} else if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
					/* old mounts with a new extent list */
					_FREE_ZERO(cache_mounts[cm_idx].cm_pextents, M_TEMP);
				}
			}
			_FREE_ZERO(cache_mounts, M_TEMP);
		}
	}

#ifdef BC_DEBUG
	struct timeval end_time;
	microtime(&end_time);
	timersub(&end_time, &start_time, &end_time);
	debug("BC_copyin_playlist took %ldus", end_time.tv_sec * 1000000 + end_time.tv_usec);
#endif

	return(error);
}

/*
 * Initialise the cache. Fetch the playlist from userspace and
 * find and attach to our block device.
 */
static int
BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history)
{
	int error = 0, copyin_error, mount_idx;
	struct timeval current_time;

	/* Everything should be set, or nothing should be set */
	if ((mounts_size == 0 || mounts == 0 || entries_size == 0 || entries == 0) &&
		(mounts_size != 0 || mounts != 0 || entries_size != 0 || entries != 0)) {
		debug("Invalid cache parameters: %ld, %lld, %ld, %lld", mounts_size, mounts, entries_size, entries);
		return(EINVAL);
	}


	if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
		debug("Cache not yet setup");
		return(ENXIO);
	}

	microtime(&current_time);

	/*
	 * Start recording history.
	 *
	 * Only one history recording may be active at a time
	 */
	if (record_history) {
		LOCK_HANDLERS();
		if (BC_set_flag(BC_FLAG_HISTORYACTIVE)) {
			debug("history recording already started, only one playlist will be returned");
			BC_ADD_STAT(history_num_recordings, 1);
			UNLOCK_HANDLERS();
		} else {
			debug("starting history recording");

			/* start the timeout handler */
			timeout(BC_timeout_history, NULL, BC_history_timeout);

			/* always take stats for the first two recordings (boot + login), even if we don't have a playlist */
			if (BC_cache->c_stats.ss_history_num_recordings < 2)
				BC_cache->c_take_detailed_stats = 1;

			BC_cache->c_history_starttime = current_time;
			BC_cache->c_history_size = 0;
			BC_cache->c_history_num_clusters = 0;
			BC_ADD_STAT(history_num_recordings, 1);
			BC_clear_flag(BC_FLAG_HTRUNCATED);

			/*
			 * Take over the strategy routine for our device; we are now
			 * recording read operations.
			 */
			BC_check_handlers();
			UNLOCK_HANDLERS();

			BC_update_mounts();

#ifndef NO_HISTORY
# ifdef STATIC_HISTORY
			/* initialise the history buffer */
			if ((BC_cache->c_history = BC_MALLOC(BC_HISTORY_ALLOC, M_TEMP, M_WAITOK)) == NULL) {
				message("can't allocate %d bytes for static history buffer", BC_HISTORY_ALLOC);
				BC_clear_flag(BC_FLAG_HISTORYACTIVE);
				error = ENOMEM;
				goto out;
			}

			bzero(BC_cache->c_history, BC_HISTORY_ALLOC);
# endif
#endif

		}
	}

	/*
	 * If we have playlist data, fetch it.
	 */
	if (entries_size > 0) {
		LOCK_CACHE_W();

		if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
			debug("cache already shutdown");
			error = ENXIO;
			UNLOCK_CACHE_W();
			goto out;
		}

		/* we're playing back a cache (or at least trying to),
		 * record detailed statistics until we stop recording history
		 */
		BC_cache->c_take_detailed_stats = 1;

		copyin_error = BC_copyin_playlist(mounts_size, mounts, entries_size, entries);
		if (copyin_error != 0 && copyin_error != EALREADY) {
			message("can't load playlist: %d", copyin_error);
			UNLOCK_CACHE_W();
			error = copyin_error;
			goto out;
		}

		/* Always throttle cut-through IOs.
		 * I tried throttling only after login had started, but
		 * then users who log in quickly see a very long delay
		 * after entering their password before apps come up.
		 * I'd prefer to delay until the login screen appears
		 * and provide as fast a login as possible.
		 * rdar://8592401
		 */
		BC_cache->c_readahead_throttles_cutthroughs = 1;

		LOCK_HANDLERS();
		if (BC_set_flag(BC_FLAG_CACHEACTIVE)) {
			/* cache is already running, we're adding to it */
			UNLOCK_HANDLERS();
		} else {
			/* first cache we've created this boot */

#ifdef READ_HISTORY_BUFFER
			/* initialise the read history buffer */
			if (NULL == BC_cache->c_rhistory) {
				if ((BC_cache->c_rhistory = BC_MALLOC(READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry),
													  M_TEMP, M_WAITOK)) == NULL) {
					message("can't allocate %d bytes for read history buffer",
							READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
				}
			}
			if (BC_cache->c_rhistory) {
				bzero(BC_cache->c_rhistory, READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
			}
#endif

			BC_cache->c_cache_starttime = current_time;

			/*
			 * Take over the strategy routine for our device; we are now
			 * filling read operations from the cache.
			 *
			 * This must be done before we kick off reader threads to ensure
			 * that we catch any writes that occur on blocks in our cache.
			 * Otherwise a write could come in unnoticed on a block we've
			 * already read in, and the cache would return stale data when
			 * that block is subsequently requested.
			 */
			BC_check_handlers();
			UNLOCK_HANDLERS();

			bootcache_contains_block = BC_cache_contains_block;
		}

		LOCK_CACHE_W_TO_R();
		if (copyin_error != EALREADY) {
			/*
			 * Start the reader threads.
			 */
			for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
				if (BC_cache->c_mounts[mount_idx].cm_state == CM_READY) {
					LOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
					BC_mount_available(BC_cache->c_mounts + mount_idx);
					UNLOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
				}
			}
		}

		UNLOCK_CACHE_R();

		/* cache is now active */

	}

	if (BC_cache->c_stats.ss_preload_time == 0) {
		timersub(&current_time,
				 &BC_cache->c_loadtime,
				 &current_time);
		BC_cache->c_stats.ss_preload_time = (u_int) current_time.tv_sec * 1000 + (u_int) current_time.tv_usec / 1000;
	}

out:
	return(error);
}

/*
 * Timeout on collection.
 *
 * We haven't heard from the userland component for too long;
 * stop recording history, but keep the current history
 * around in case they show up.
 */
static void
BC_timeout_history(void *junk)
{
	/* stop recording history */
	debug("history recording timed out");
	BC_terminate_history();
}

/*
 * Check for new mounts. Called with no locks held
 */
static int check_for_new_mount_itr(mount_t mount, void* arg) {
	struct vfs_attr attr;
	vfs_context_t context;
	int error, mount_idx;
	int* go_again = (int*) arg;

	struct BC_history_mount_device *hmd = NULL;
	vnode_t devvp = NULLVP;

	LOCK_HISTORY_R();
	if (!
(BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) { 4364 /* don't do anything if we've stopped recording history */ 4365 UNLOCK_HISTORY_R(); 4366 return VFS_RETURNED_DONE; 4367 } 4368 4369#ifdef BC_DEBUG 4370 char name[MFSNAMELEN]; 4371 vfs_name(mount, name); 4372#endif 4373 4374 devvp = vfs_devvp(mount); 4375 if (devvp == NULLVP) { 4376 //debug("Mount %s (%p)'s vnode is NULL", name, mount); 4377 UNLOCK_HISTORY_R(); 4378 goto out; 4379 } 4380 4381 dev_t mount_dev = vnode_specrdev(devvp); 4382 //debug("Mount %s (%p)'s vnode %p's device is 0x%x", name, mount, devvp, mount_dev); 4383 4384 /* 4385 * A new mount appeared, but unfortunately we don't know which one. 4386 * So, for every mount we have to check through our list of history_mounts 4387 * to see if we already have info for it. 4388 */ 4389 hmd = BC_get_history_mount_device(mount_dev, NULL); 4390 if (hmd == NULL || (! uuid_is_null(hmd->hmd_mount.hm_uuid))) { 4391 /* Unable to allocate new space for a mount, or 4392 * we already have info about this mount 4393 */ 4394 UNLOCK_HISTORY_R(); 4395 goto out; 4396 } 4397 4398 /* 4399 * We don't yet have info about this mount. It's new! 4400 */ 4401 4402 hmd->hmd_blocksize = vfs_devblocksize(mount); 4403 4404 VFSATTR_INIT(&attr); 4405 VFSATTR_WANTED(&attr, f_uuid); 4406 context = vfs_context_create(NULL); 4407 error = vfs_getattr(mount, &attr, context); 4408 4409 if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) { 4410 vfs_context_rele(context); 4411#ifdef BC_DEBUG 4412 if (strncmp("devfs", name, sizeof("devfs")) != 0) { 4413 debug("Found mount %s without a UUID: error %d", name, error); 4414 } 4415#endif 4416 UNLOCK_HISTORY_R(); 4417 goto out; 4418 } 4419 4420 hmd->hmd_disk_id = vfs_throttle_mask(mount); 4421 4422 if (VNOP_IOCTL(devvp, /* vnode */ 4423 DKIOCISSOLIDSTATE, /* cmd */ 4424 (caddr_t)&hmd->hmd_is_ssd, /* data */ 4425 0, 4426 context)) /* context */ 4427 { 4428 debug("can't determine if physical disk for history mount %s is an ssd", uuid_string(hmd->hmd_mount.hm_uuid)); 4429 hmd->hmd_is_ssd = 0; 4430 } 4431 4432#ifdef ROOT_DISK_ONLY 4433 if (devvp == rootvp) { 4434 if (BC_cache->c_root_disk_id == 0) { 4435 BC_cache->c_root_disk_id = hmd->hmd_disk_id; 4436 if (BC_cache->c_root_disk_id == 0) { 4437 message("Root disk is 0"); 4438 } else { 4439 debug("Root disk (via history) is 0x%llx", BC_cache->c_root_disk_id); 4440 } 4441 } else if (BC_cache->c_root_disk_id != hmd->hmd_disk_id) { 4442 debug("Root disk 0x%llx doesn't match that found by history 0x%llx", BC_cache->c_root_disk_id, hmd->hmd_disk_id); 4443 } 4444 } 4445#endif 4446 4447 4448 /* Found info for our mount */ 4449 debug("Found new historic mount after %d IOs: %s, dev 0x%x, disk 0x%llx, blk %d%s", 4450 hmd->hmd_mount.hm_nentries, 4451 uuid_string(attr.f_uuid), 4452 hmd->hmd_dev, 4453 hmd->hmd_disk_id, 4454 hmd->hmd_blocksize, 4455 hmd->hmd_is_ssd ? 
", ssd" : ""); 4456 4457 if (hmd->hmd_blocksize == 0) { 4458 hmd->hmd_blocksize = 512; 4459 } 4460 hmd->hmd_mount.hm_nentries = 0; 4461 4462 uuid_copy(hmd->hmd_mount.hm_uuid, attr.f_uuid); 4463 4464 UNLOCK_HISTORY_R(); 4465 4466 if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) { 4467 LOCK_CACHE_R(); 4468 /* Check if we have a playlist mount that matches this mount and set it up */ 4469 for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) { 4470 struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx; 4471 if (CM_SETUP == cm->cm_state && 0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) { 4472 /* Found a matching unfilled mount */ 4473 4474 LOCK_MOUNT_W(cm); 4475 if ((error = BC_fill_in_mount(cm, mount, context)) == 0) { 4476 LOCK_MOUNT_W_TO_R(cm); 4477 /* Kick off another reader thread if this is a new physical disk */ 4478 BC_mount_available(cm); 4479 UNLOCK_MOUNT_R(cm); 4480 } else { 4481 if (error == EAGAIN) { 4482 *go_again = 1; 4483 } 4484 UNLOCK_MOUNT_W(cm); 4485 } 4486 4487 break; 4488 } 4489 } 4490 UNLOCK_CACHE_R(); 4491 } 4492 4493 vfs_context_rele(context); 4494 4495out: 4496 if (devvp != NULLVP) { 4497 vnode_put(devvp); 4498 } 4499 return VFS_RETURNED; 4500} 4501 4502/* 4503 * Check for new or disappearing mounts 4504 */ 4505static void BC_update_mounts(void) { 4506 4507 /* don't do anything if we've stopped recording history */ 4508 if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) { 4509 return; 4510 } 4511 4512 int go_again = 0; 4513 vfs_iterate(0, check_for_new_mount_itr, &go_again); 4514 if (go_again) { 4515 debug("Missing root mount during last iteration, going again"); 4516 vfs_iterate(0, check_for_new_mount_itr, &go_again); 4517 } 4518} 4519 4520/* 4521 * Find the mount that corresponds to this IO, 4522 * or the first empty slot. 4523 * 4524 * Called with the history read lock 4525 */ 4526static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex) { 4527 struct BC_history_mount_device *tmphmd = NULL, **phmd, *ret = NULL; 4528 int mount_idx = 0; 4529 4530 for (phmd = &BC_cache->c_history_mounts; 4531 (*phmd) == NULL || (*phmd)->hmd_dev != dev; 4532 phmd = &(*phmd)->hmd_next) { 4533 4534 if (*phmd == NULL) { 4535 if (tmphmd == NULL) { /* we may have an allocation from previous iterations */ 4536 tmphmd = kalloc(sizeof(struct BC_history_mount_device)); 4537 if (tmphmd == NULL) { 4538 message("could not allocate %lu bytes for history mount device", 4539 sizeof(struct BC_history_mount_device)); 4540 BC_set_flag(BC_FLAG_HTRUNCATED); 4541 goto out; 4542 } 4543 4544 /* claim the new entry */ 4545 tmphmd->hmd_next = NULL; 4546 tmphmd->hmd_mount.hm_nentries = 0; 4547 uuid_clear(tmphmd->hmd_mount.hm_uuid); 4548 tmphmd->hmd_disk_id = 0; 4549 tmphmd->hmd_blocksize = 0; 4550 tmphmd->hmd_dev = dev; 4551 } 4552 4553 if (OSCompareAndSwapPtr(NULL, tmphmd, phmd)) { 4554 tmphmd = NULL; 4555 if (BC_cache->c_take_detailed_stats) { 4556 BC_ADD_STAT(history_mounts, 1); 4557 } 4558 break; 4559 } 4560 4561 /* someone else added a mount before we could, check if its the one we want */ 4562 if ((*phmd)->hmd_dev == dev) break; 4563 4564 } 4565 mount_idx++; 4566 } 4567 4568 if (pindex) 4569 *pindex = mount_idx; 4570 ret = *phmd; 4571out: 4572 if (tmphmd) 4573 kfree(tmphmd, sizeof(struct BC_history_mount_device)); 4574 4575 return ret; 4576} 4577 4578/* 4579 * Record an incoming IO in the history list. 4580 * 4581 * Returns non-0 if this IO was on the root disk. 4582 * 4583 * Note that this function is not reentrant. 
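 * Entries are recorded while holding only the history read lock; slots in the
 * current entry cluster are claimed with atomic operations (see below).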
4584 */ 4585static int 4586BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev) 4587{ 4588 u_int64_t offset; 4589 struct BC_history_entry_cluster *hec, *tmphec = NULL; 4590 struct BC_history_mount_device *hmd = NULL; 4591 struct BC_history_entry *he; 4592 int mount_idx, entry_idx; 4593 int is_root_disk = 0; 4594 4595 /* don't do anything if we've stopped recording history */ 4596 if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) 4597 return 0; 4598 4599 LOCK_HISTORY_R(); 4600 4601 /* check again, with lock */ 4602 if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) 4603 goto out; 4604 4605 /* count IOs during boot (first recording) */ 4606 if (BC_cache->c_take_detailed_stats) { 4607 if (! tag) { 4608 if (! write) { 4609 BC_ADD_STAT(history_reads, 1); 4610 } else { 4611 BC_ADD_STAT(history_writes, 1); 4612 } 4613 } 4614 } 4615 4616 /* don't do anything if the history list has been truncated */ 4617 if (!tag && BC_cache->c_flags & BC_FLAG_HTRUNCATED) 4618 goto out; 4619 4620 /* 4621 * In order to avoid recording a playlist larger than we will 4622 * allow to be played back, if the total byte count for recorded 4623 * reads reaches 50% of the physical memory in the system, we 4624 * start ignoring reads. 4625 * This is preferable to dropping entries from the end of the 4626 * playlist as we copy it in, since it will result in a clean 4627 * cessation of caching at a single point, rather than 4628 * introducing sporadic cache misses throughout the cache's run. 4629 */ 4630 if (!tag && (BC_cache->c_history_size + length) > BC_MAX_SIZE) { 4631 debug("History recording too large, capping at %lluMB", BC_MAX_SIZE / (1024 * 1024)); 4632 BC_set_flag(BC_FLAG_HTRUNCATED); 4633 goto out; 4634 } 4635 4636#ifdef NO_HISTORY 4637 BC_set_flag(BC_FLAG_HTRUNCATED); 4638 debug("history disabled, truncating"); 4639 goto out; 4640#endif 4641 4642 mount_idx = 0; 4643 offset = 0; 4644 if (! tag) { 4645 4646 hmd = BC_get_history_mount_device(dev, &mount_idx); 4647 if (hmd == NULL) goto out; 4648 4649 if (uuid_is_null(hmd->hmd_mount.hm_uuid)) { 4650 /* Couldn't identify the mount 4651 * 4652 * Before the mount has appeared we don't want to 4653 * include any of this device's IOs in the history 4654 * since these IOs will be issued next boot before 4655 * we're able to start this mount's part of the 4656 * playlist. 4657 */ 4658 4659 /* Keep track of the number of IOs we've seen until the mount appears */ 4660 OSIncrementAtomic(&hmd->hmd_mount.hm_nentries); 4661 4662 if (BC_cache->c_take_detailed_stats) { 4663 BC_ADD_STAT(history_unknown, 1); 4664 BC_ADD_STAT(history_unknown_bytes, length); 4665 } 4666 goto out; 4667 } 4668 4669 offset = HB_BLOCK_TO_BYTE(hmd, blkno); 4670 4671 /* Only track IOs on the root disk. 4672 * 4673 * We don't expect IOs on other physical disks 4674 * to be part of the critical path to boot/login 4675 * and we certainly don't expect the other disks 4676 * to be contested enough to make a cache worthwhile 4677 */ 4678 4679 if (hmd->hmd_disk_id != BC_cache->c_root_disk_id) { 4680#ifdef ROOT_DISK_ONLY 4681 goto out; 4682#endif 4683 } else { 4684 is_root_disk = 1; 4685 } 4686 4687 } 4688 4689 /* 4690 * Get the current entry cluster. 
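	 *
	 * Clusters are claimed lock-free: OSAddAtomic reserves an entry slot in the current
	 * cluster and, when that cluster is full (or absent), we race to install a fresh
	 * cluster with OSCompareAndSwapPtr, retrying until a slot is claimed or the history
	 * is truncated.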
4691 */ 4692 while ((hec = BC_cache->c_history) == NULL || 4693 (entry_idx = OSAddAtomic(1, &hec->hec_nentries)) >= BC_HISTORY_ENTRIES) { 4694 if (hec) 4695 OSAddAtomic(-1, &hec->hec_nentries); 4696 4697 /* need a new cluster */ 4698 4699 /* limit the number of clusters we will allocate */ 4700 if (BC_cache->c_history_num_clusters + 1 >= BC_HISTORY_MAXCLUSTERS) { 4701 message("too many history clusters (%d, limit %ld)", 4702 BC_cache->c_history_num_clusters, 4703 (long)BC_HISTORY_MAXCLUSTERS); 4704 BC_set_flag(BC_FLAG_HTRUNCATED); 4705 goto out; 4706 } 4707 4708 if (tmphec == NULL) { /* we may have an allocation from previous iterations */ 4709 tmphec = kalloc(BC_HISTORY_ALLOC); 4710 if (tmphec == NULL) { 4711 message("could not allocate %d bytes for history cluster", 4712 BC_HISTORY_ALLOC); 4713 BC_set_flag(BC_FLAG_HTRUNCATED); 4714 goto out; 4715 } 4716 4717 /* claim the first entry of the new cluster*/ 4718 tmphec->hec_nentries = 1; 4719 tmphec->hec_link = hec; 4720 } 4721 4722 if (OSCompareAndSwapPtr(hec, tmphec, &BC_cache->c_history)) { 4723 hec = tmphec; 4724 entry_idx = 0; 4725 tmphec = NULL; 4726 OSAddAtomic(1, &BC_cache->c_history_num_clusters); 4727 break; 4728 } 4729 } 4730 if (tmphec != NULL) 4731 kfree(tmphec, BC_HISTORY_ALLOC); 4732 4733 /* 4734 * We've claimed entry entry_idx of cluster hec, fill it in. 4735 */ 4736 he = &(hec->hec_data[entry_idx]); 4737 assert(he >= &hec->hec_data[0]); 4738 assert(he < &hec->hec_data[BC_HISTORY_ENTRIES]); 4739 he->he_offset = offset; 4740 he->he_length = length; 4741 he->he_pid = pid; 4742 he->he_mount_idx = mount_idx; 4743 he->he_flags = 0x0; 4744 if (hit) he->he_flags |= BC_HE_HIT; 4745 if (write) he->he_flags |= BC_HE_WRITE; 4746 if (tag) he->he_flags |= BC_HE_TAG; 4747 if (shared) he->he_flags |= BC_HE_SHARED; 4748 if (hmd) 4749 OSIncrementAtomic(&hmd->hmd_mount.hm_nentries); 4750 4751 if (!write) { 4752 OSAddAtomic64(length, &BC_cache->c_history_size); 4753 if (BC_cache->c_take_detailed_stats) { 4754 BC_ADD_STAT(history_bytes, length); 4755 BC_ADD_STAT(history_entries, 1); 4756 } 4757 } 4758out: 4759 UNLOCK_HISTORY_R(); 4760 return is_root_disk; 4761} 4762 4763/* 4764 * Return the size of the history buffer. 4765 */ 4766static int 4767BC_size_history_mounts(void) 4768{ 4769 struct BC_history_mount_device *hmd; 4770 int nmounts; 4771 4772 /* walk the list of mount clusters and count the number of mounts */ 4773 nmounts = 0; 4774 for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next) 4775 nmounts ++; 4776 4777 return(nmounts * sizeof(struct BC_history_mount)); 4778} 4779 4780/* 4781 * Return the size of the history buffer. 4782 */ 4783static int 4784BC_size_history_entries(void) 4785{ 4786 struct BC_history_entry_cluster *hec; 4787 int nentries, cluster_entries; 4788 4789 /* walk the list of clusters and count the number of entries */ 4790 nentries = 0; 4791 for (hec = BC_cache->c_history; hec != NULL; hec = hec->hec_link) { 4792 cluster_entries = hec->hec_nentries; 4793 if (cluster_entries > BC_HISTORY_ENTRIES) { 4794 cluster_entries = BC_HISTORY_ENTRIES; 4795 } 4796 4797 nentries += cluster_entries; 4798 } 4799 4800 return(nentries * sizeof(struct BC_history_entry)); 4801} 4802 4803/* 4804 * Copy out either the history buffer size, or the entire history buffer. 4805 * 4806 * Note that the cache must be shut down before calling this function in 4807 * order to avoid a race around the entry count/buffer size. 
 */
static int
BC_copyout_history_mounts(user_addr_t uptr)
{
	struct BC_history_mount_device *hmd;
	int error;

	assert(uptr != 0);

	/*
	 * Copy out the mounts
	 */
	for (hmd = BC_cache->c_history_mounts;
		 hmd != NULL;
		 hmd = hmd->hmd_next) {

		/* copy the mount out */
		if ((error = copyout(&(hmd->hmd_mount), uptr,
							 sizeof(struct BC_history_mount))) != 0)
			return(error);
		uptr += sizeof(struct BC_history_mount);

	}
	return(0);
}

/*
 * Copy out either the history buffer size, or the entire history buffer.
 *
 * Note that the cache must be shut down before calling this function in
 * order to avoid a race around the entry count/buffer size.
 */
static int
BC_copyout_history_entries(user_addr_t uptr)
{
	struct BC_history_entry_cluster *hec, *ohec;
	int error, cluster_entries;

	assert(uptr != 0);
	assert(BC_cache->c_history);

	/*
	 * Copying the history entries out is a little messy due to the fact that the
	 * clusters are on the list backwards, but we want the entries in order.
	 */
	ohec = NULL;
	if (BC_cache->c_history) {
		for (;;) {
			/* scan the list until we find the last one we haven't copied */
			hec = BC_cache->c_history;
			while (hec->hec_link != ohec)
				hec = hec->hec_link;
			ohec = hec;

			cluster_entries = hec->hec_nentries;
			if (cluster_entries > BC_HISTORY_ENTRIES) {
				cluster_entries = BC_HISTORY_ENTRIES;
			}

			/* copy the cluster out */
			if ((error = copyout(hec->hec_data, uptr,
								 cluster_entries * sizeof(struct BC_history_entry))) != 0)
				return(error);
			uptr += cluster_entries * sizeof(struct BC_history_entry);

			/* if this was the last cluster, all done */
			if (hec == BC_cache->c_history)
				break;
		}
	}
	return(0);
}

/*
 * Discard the history list.
 */
static void
BC_discard_history(void)
{
	struct BC_history_mount_device *hmd;
	struct BC_history_entry_cluster *hec;

	while (BC_cache->c_history_mounts != NULL) {
		hmd = BC_cache->c_history_mounts;
		BC_cache->c_history_mounts = hmd->hmd_next;
		kfree(hmd, sizeof(struct BC_history_mount_device));
	}

	while (BC_cache->c_history != NULL) {
		hec = BC_cache->c_history;
		BC_cache->c_history = hec->hec_link;
		kfree(hec, BC_HISTORY_ALLOC);
	}
}

/*
 * Setup for the root mounted device
 */
static int
BC_setup(void)
{
	unsigned int boot_arg;

	/*
	 * If we have less than the minimum supported physical memory,
	 * bail immediately. With too little memory, the cache will
	 * become polluted with nondeterministic pagein activity, leaving
	 * large amounts of wired memory around which further exacerbates
	 * the problem.
	 */
	if ((max_mem / (1024 * 1024)) < BC_minimum_mem_size) {
		debug("insufficient physical memory (%dMB < %dMB required)",
			  (int) (max_mem / (1024 * 1024)), BC_minimum_mem_size);
		return(ENOMEM);
	}

	/*
	 * Find our block device.
	 *
	 * Note that in the netbooted case, we are loaded but should not
	 * actually run, unless we're explicitly overridden.
	 *
	 * Note also that if the override is set, we'd better have a valid
	 * rootdev...
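	 *
	 * (rootdev and bdevsw are used below to locate the root device's strategy and close
	 * entry points, which the cache takes over while it is active.)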
	 */
	if (netboot_root() &&
		(!PE_parse_boot_argn("BootCacheOverride", &boot_arg, sizeof(boot_arg)) ||
		 (boot_arg != 1))) {	/* not interesting */
		return(ENXIO);
	}

	debug("Major of rootdev is %d", major(rootdev));

	BC_cache->c_devsw = &bdevsw[major(rootdev)];
	assert(BC_cache->c_devsw != NULL);

	/* Make sure we don't clobber the device strategy routine */
	if ((BC_cache->c_devsw->d_strategy == (strategy_fcn_t*)BC_strategy) ||
		(BC_cache->c_devsw->d_close == BC_close)) {
		debug("cache was not terminated properly previously");
		return(ENXIO);
	}

	BC_cache->c_strategy = BC_cache->c_devsw->d_strategy;
	BC_cache->c_close = BC_cache->c_devsw->d_close;
	if (BC_cache->c_strategy == NULL ||
		BC_cache->c_close == NULL) {	/* not configured */
		return(ENXIO);
	}

	BC_set_flag(BC_FLAG_SETUP);

	return(0);
}


/*
 * Called from the root-mount hook.
 */
static void
BC_auto_start(void)
{
	int error;
	user_addr_t pm, pe;

	if ((error = BC_setup()) != 0) {
		debug("BootCache setup failed: %d", error);
		return;
	}

	pm = CAST_USER_ADDR_T((uintptr_t)BC_preloaded_playlist + sizeof(struct BC_playlist_header));
	pe = pm + (BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount));

	/* Initialize the cache.
	 *
	 * auto-start is for DVDs or other read-only media, so don't bother recording history
	 */
	error = BC_init_cache(BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount), pm,
						  BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry), pe,
						  0 /* don't record history */);
	if (error != 0)
		debug("BootCache autostart failed: %d", error);
}

/*
 * Called from the root-unmount hook.
 */
static void
BC_unmount_hook(void)
{
	debug("device unmounted");

	/*
	 * If the cache is running, stop it. If it's already stopped
	 * (it may have stopped itself), that's OK.
	 */
	BC_terminate_cache();
	BC_terminate_history();

}


/*
 * Handle the command sysctl.
 */
static int
BC_sysctl SYSCTL_HANDLER_ARGS
{
	struct BC_command bc;
	int error;

	/* get the command structure and validate */
	if ((error = SYSCTL_IN(req, &bc, sizeof(bc))) != 0) {
		debug("couldn't get command");
		return(error);
	}
	if (bc.bc_magic != BC_MAGIC) {
		debug("bad command magic");
		return(EINVAL);
	}

	switch (bc.bc_opcode) {
	case BC_OP_START:
		debug("BC_OP_START(%d mounts, %d extents)", (int)(bc.bc_data1_size / sizeof(struct BC_playlist_mount)), (int)(bc.bc_data2_size / sizeof(struct BC_playlist_entry)));
		if (BC_preloaded_playlist) {
			error = EINVAL;
			break;
		}
		error = BC_init_cache(bc.bc_data1_size, (user_addr_t)bc.bc_data1, bc.bc_data2_size, (user_addr_t)bc.bc_data2, 1 /* record history */);
		if (error != 0) {
			message("cache init failed");
		}
		break;

	case BC_OP_STOP:
		debug("BC_OP_STOP");

		/*
		 * If we're recording history, stop it. If it's already stopped
		 * (it may have stopped itself), that's OK.
5048 */ 5049 BC_terminate_history(); 5050 5051 /* return the size of the history buffer */ 5052 LOCK_HISTORY_W(); 5053 bc.bc_data1_size = BC_size_history_mounts(); 5054 bc.bc_data2_size = BC_size_history_entries(); 5055 if ((error = SYSCTL_OUT(req, &bc, sizeof(bc))) != 0) 5056 debug("could not return history size"); 5057 UNLOCK_HISTORY_W(); 5058 5059 break; 5060 5061 case BC_OP_MOUNT: 5062 /* debug("BC_OP_MOUNT"); */ 5063 5064 /* 5065 * A notification that mounts have changed. 5066 */ 5067 BC_update_mounts(); 5068 break; 5069 5070 case BC_OP_HISTORY: 5071 debug("BC_OP_HISTORY"); 5072 /* if there's a user buffer, verify the size and copy out */ 5073 LOCK_HISTORY_W(); 5074 if (bc.bc_data1 != 0 && bc.bc_data2 != 0) { 5075 if (bc.bc_data1_size < BC_size_history_mounts() || 5076 bc.bc_data2_size < BC_size_history_entries()) { 5077 debug("supplied history buffer too small"); 5078 error = EINVAL; 5079 UNLOCK_HISTORY_W(); 5080 break; 5081 } 5082 if ((error = BC_copyout_history_mounts(bc.bc_data1)) != 0) { 5083 debug("could not copy out history mounts: %d", error); 5084 } 5085 if ((error = BC_copyout_history_entries(bc.bc_data2)) != 0) { 5086 debug("could not copy out history entries: %d", error); 5087 } 5088 } 5089 /* 5090 * Release the last of the memory we own. 5091 */ 5092 BC_discard_history(); 5093 UNLOCK_HISTORY_W(); 5094 break; 5095 5096 case BC_OP_JETTISON: 5097 debug("BC_OP_JETTISON"); 5098 5099 /* 5100 * Jettison the cache. If it's already jettisoned 5101 * (it may have jettisoned itself), that's OK. 5102 */ 5103 BC_terminate_cache(); 5104 break; 5105 5106 case BC_OP_STATS: 5107 debug("BC_OP_STATS"); 5108 /* check buffer size and copy out */ 5109 if (bc.bc_data1_size != sizeof(BC_cache->c_stats)) { 5110 debug("stats structure wrong size"); 5111 error = ENOMEM; 5112 } else { 5113 BC_cache->c_stats.ss_cache_flags = BC_cache->c_flags; 5114 if ((error = copyout(&BC_cache->c_stats, bc.bc_data1, bc.bc_data1_size)) != 0) 5115 debug("could not copy out statistics"); 5116 } 5117 break; 5118 5119 case BC_OP_TAG: 5120 debug("BC_OP_TAG"); 5121 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_TAG) | 5122 DBG_FUNC_NONE, proc_selfppid(), dbg_tag_count, 5123 0, 0, 0); 5124 BC_add_history(0, 0, 0, 0, 0, 1, 0, 0); 5125 dbg_tag_count++; 5126 break; 5127 5128 case BC_OP_TEST: 5129 debug("BC_OP_TEST"); 5130 5131 if (! (BC_cache->c_flags & BC_FLAG_SETUP)) { 5132 error = ENODEV; 5133 } 5134 break; 5135 5136 default: 5137 error = EINVAL; 5138 } 5139 return(error); 5140} 5141 5142 5143/* 5144 * Initialise the block of pages we use to back our data. 
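 * The cache contents are backed by a UBC-managed vnode (BC_BOOT_BACKINGFILE); we
 * open (or re-acquire) that vnode and size its pager to match the current cache size.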
5145 * 5146 */ 5147static int 5148BC_alloc_pagebuffer() 5149{ 5150 kern_return_t kret; 5151 int ret; 5152 5153 if (BC_cache->c_vp == NULL) { 5154 5155 kret = vnode_open(BC_BOOT_BACKINGFILE, (O_CREAT | O_NOFOLLOW | FREAD), 5156 0400, 0, &BC_cache->c_vp, vfs_context_current()); 5157 if (kret != KERN_SUCCESS) { 5158 debug("vnode_open failed - %d", kret); 5159 return -1; 5160 } 5161 5162 BC_cache->c_vid = vnode_vid(BC_cache->c_vp); 5163 5164 } else { 5165 ret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid); 5166 if (ret != 0) { 5167 debug("vnode_getwithvid failed - %d", ret); 5168 return -1; 5169 } 5170 } 5171 5172 assert(BC_cache->c_cachesize != 0); 5173 kret = ubc_setsize(BC_cache->c_vp, BC_cache->c_cachesize); 5174 if (kret != 1) { 5175 debug("ubc_setsize failed - %d", kret); 5176 (void) vnode_put(BC_cache->c_vp); 5177 return -1; 5178 } 5179 5180 (void) vnode_put(BC_cache->c_vp); 5181 5182 return(0); 5183} 5184 5185/* 5186 * Release all resources associated with our pagebuffer. 5187 */ 5188static void 5189BC_free_pagebuffer(void) 5190{ 5191 kern_return_t kret; 5192 if (BC_cache->c_vp != NULL) { 5193 /* Can handle a vnode without ubc info */ 5194 kret = vnode_getwithref(BC_cache->c_vp); 5195 if (kret != KERN_SUCCESS) { 5196 debug("vnode_getwithref failed - %d", kret); 5197 return; 5198 } 5199 5200 kret = ubc_range_op(BC_cache->c_vp, 0, BC_cache->c_cachesize, 5201 UPL_ROP_DUMP, NULL); 5202 if (kret != KERN_SUCCESS) { 5203 debug("ubc_range_op failed - %d", kret); 5204 } 5205 5206#if 0 5207 /* need this interface to be exported... will eliminate need 5208 * for the ubc_range_op above */ 5209 kret = vnode_delete(BC_BOOT_BACKINGFILE, vfs_context_current()); 5210 if (kret) { 5211 debug("vnode_delete failed - %d", error); 5212 } 5213#endif 5214 kret = vnode_close(BC_cache->c_vp, 0, vfs_context_current()); 5215 if (kret != KERN_SUCCESS) { 5216 debug("vnode_close failed - %d", kret); 5217 } 5218 5219 BC_cache->c_vp = NULL; 5220 } 5221} 5222 5223/* 5224 * Release one page from the pagebuffer. 5225 */ 5226static void 5227BC_free_page(struct BC_cache_extent *ce, int page) 5228{ 5229 off_t vpoffset; 5230 kern_return_t kret; 5231 5232 vpoffset = (page * PAGE_SIZE) + ce->ce_cacheoffset; 5233 kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid); 5234 if (kret != KERN_SUCCESS) { 5235 debug("vnode_getwithvid failed - %d", kret); 5236 return; 5237 } 5238 5239 kret = ubc_range_op(BC_cache->c_vp, vpoffset, vpoffset + PAGE_SIZE, 5240 UPL_ROP_DUMP, NULL); 5241 if (kret != KERN_SUCCESS) { 5242 debug("ubc_range_op failed - %d", kret); 5243 } 5244 5245 (void) vnode_put(BC_cache->c_vp); 5246} 5247 5248void 5249BC_hook(void) 5250{ 5251 int error; 5252 5253 if ((error = BC_setup()) != 0) { 5254 debug("BootCache setup failed: %d", error); 5255 } 5256} 5257 5258/* 5259 * Cache start/stop functions. 
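 *
 * BC_load registers the sysctl interface, initialises the cache locks, and checks for
 * a playlist preloaded into the kext's "BootCache" Mach-O segment; BC_unload tears the
 * cache and history down and unregisters the sysctl.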
5260 */ 5261void 5262BC_load(void) 5263{ 5264 struct mach_header *mh; 5265 long xsize; 5266 int has64bitness, error; 5267 size_t sysctlsize = sizeof(int); 5268 char *plsection = "playlist"; 5269 5270#ifdef BC_DEBUG 5271 microtime(&debug_starttime); 5272#endif 5273 5274 /* register our control interface */ 5275 sysctl_register_oid(&sysctl__kern_BootCache); 5276 5277 /* clear the control structure */ 5278 bzero(BC_cache, sizeof(*BC_cache)); 5279 5280 BC_cache->c_lckgrp = lck_grp_alloc_init("BootCache", LCK_GRP_ATTR_NULL); 5281 lck_rw_init(&BC_cache->c_history_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5282 lck_rw_init(&BC_cache->c_cache_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5283 lck_mtx_init(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5284 lck_mtx_init(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5285#ifdef BC_DEBUG 5286 lck_mtx_init(&debug_printlock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5287#endif 5288 /* 5289 * Find the preload section and its size. If it's not empty 5290 * we have a preloaded playlist, so prepare for an early 5291 * start. 5292 */ 5293 if (kmod_info.info_version != KMOD_INFO_VERSION) { 5294 message("incompatible kmod_info versions"); 5295 return; 5296 } 5297 mh = (struct mach_header *)kmod_info.address; 5298 5299 /* 5300 * If booting on a 32-bit only machine, use the "playlist32" section. 5301 * Otherwise, fall through to the default "playlist" section. 5302 */ 5303 error = sysctlbyname("hw.cpu64bit_capable", &has64bitness, &sysctlsize, 5304 NULL, 0); 5305 if (error == 0 && has64bitness == 0) { 5306 plsection = "playlist32"; 5307 } 5308 BC_preloaded_playlist = getsectdatafromheader(mh, "BootCache", 5309 plsection, &BC_preloaded_playlist_size); 5310 debug("preload section %s: %d @ %p", 5311 plsection, BC_preloaded_playlist_size, BC_preloaded_playlist); 5312 5313 if (BC_preloaded_playlist != NULL) { 5314 if (BC_preloaded_playlist_size < sizeof(struct BC_playlist_header)) { 5315 message("preloaded playlist too small"); 5316 return; 5317 } 5318 if (BC_preloaded_playlist->ph_magic != PH_MAGIC) { 5319 message("preloaded playlist has invalid magic (%x, expected %x)", 5320 BC_preloaded_playlist->ph_magic, PH_MAGIC); 5321 return; 5322 } 5323 xsize = sizeof(struct BC_playlist_header) + 5324 (BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount)) + 5325 (BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry)); 5326 if (xsize > BC_preloaded_playlist_size) { 5327 message("preloaded playlist too small (%d, expected at least %ld)", 5328 BC_preloaded_playlist_size, xsize); 5329 return; 5330 } 5331 BC_wait_for_readahead = BC_READAHEAD_WAIT_CDROM; 5332 mountroot_post_hook = BC_auto_start; 5333 unmountroot_pre_hook = BC_unmount_hook; 5334 5335 debug("waiting for root mount..."); 5336 } else { 5337 /* no preload, normal operation */ 5338 BC_preloaded_playlist = NULL; 5339 mountroot_post_hook = BC_hook; 5340 debug("waiting for playlist..."); 5341 } 5342 5343 microtime(&BC_cache->c_loadtime); 5344} 5345 5346int 5347BC_unload(void) 5348{ 5349 int error; 5350 5351 debug("preparing to unload..."); 5352 unmountroot_pre_hook = NULL; 5353 5354 /* 5355 * If the cache is running, stop it. If it's already stopped 5356 * (it may have stopped itself), that's OK. 
5357 */ 5358 BC_terminate_history(); 5359 if ((error = BC_terminate_cache()) != 0) { 5360 if (error != ENXIO) 5361 goto out; 5362 } 5363 LOCK_HISTORY_W(); 5364 BC_discard_history(); 5365 UNLOCK_HISTORY_W(); 5366 5367 lck_rw_destroy(&BC_cache->c_history_lock, BC_cache->c_lckgrp); 5368 lck_rw_destroy(&BC_cache->c_cache_lock, BC_cache->c_lckgrp); 5369 lck_mtx_destroy(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp); 5370 lck_mtx_destroy(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp); 5371 lck_grp_free(BC_cache->c_lckgrp); 5372 5373 sysctl_unregister_oid(&sysctl__kern_BootCache); 5374 error = 0; 5375 5376out: 5377 return(error); 5378} 5379