1/*
2 * Boot-time performance cache.
3 *
 * We implement a shim layer for a block device that can be advised ahead
 * of time of its likely read pattern.
6 *
7 * Armed with a sorted version of this read pattern (the "playlist"), we
8 * cluster the reads into a private cache, taking advantage of the
9 * sequential I/O pattern. Incoming read requests are then satisfied from
10 * the cache (if possible), or by forwarding to the original strategy routine.
11 *
12 * We also record the pattern of incoming read requests (the "history
13 * list"), in order to provide for adaptation to changes in behaviour by
14 * the calling layer.
15 *
16 * The hit rate of the cache is also measured, and caching will be disabled
17 * preemptively if the hit rate is too low.
18 *
19 * The cache is controlled by a sysctl interface.  Typical usage is as
20 * follows:
21 *
22 * - Load a sorted read pattern.
23 * - Wait for the operation(s) of interest to be performed.
24 * - Fetch the new (unsorted) read pattern.
25 *
 * Loading a read pattern initialises the cache; upon successful loading
 * the cache is active and readahead is being performed.  After fetching the
 * recorded read pattern, the cache remains around until it is jettisoned,
 * either due to memory pressure or because the hit rate falls below a
 * desirable level.
30 *
31 * Limitations:
32 *
33 * - only one copy of the cache may be active at a time
34 */
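
/*
 * As a rough illustration of the usage described above (a sketch only:
 * the real layout of struct BC_command lives in BootCache.h and is not
 * reproduced here), userspace tools drive the cache by passing a
 * BC_command through the kern.BootCache sysctl registered below.
 */
#if 0	/* illustrative userspace sketch, not part of the kext build */
#include <sys/sysctl.h>
#include "BootCache.h"

static int
bc_send_command(struct BC_command *bc, size_t bc_size)
{
	/* newp/newlen deliver the command to BC_sysctl() in the kext */
	return sysctlbyname("kern.BootCache", NULL, NULL, bc, bc_size);
}
#endif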
35
36/*
37 * Build options.
38 */
39/*#define BC_DEBUG*/
40/*#define BC_EXTRA_DEBUG*/
41
42/*
43 * Emulate readahead, don't actually perform it.
44 */
45/*#define EMULATE_ONLY*/
46
47/*
48 * Don't collect request history.
49 */
50/*#define NO_HISTORY*/
51
52/*
53 * Don't use history cluster chains (seems to be a malloc problem with these
54 * enabled)
55 */
56/*#define STATIC_HISTORY*/
57
58/*
59 * Only cache for the volumes on the root disk
60 */
61#define ROOT_DISK_ONLY
62
63/*
64 * Keep a log of read history to aid in debugging.
65 */
66/*#define READ_HISTORY_BUFFER	1000*/
67/*#define READ_HISTORY_ALL*/
68
69/*
70 * Ignore the batches on playlist entries.
71 */
72/*#define IGNORE_BATCH */
73
74/*
75 * Tunable parameters.
76 */
77
78/*
79 * Minimum required physical memory before BootCache will
80 * activate.  Insufficient physical memory leads to paging
81 * activity during boot, which is not repeatable and can't
82 * be distinguished from the repeatable read patterns.
83 *
84 * Size in megabytes.
85 */
86static int BC_minimum_mem_size = 256;
87
88/*
89 * Number of seconds to wait in BC_strategy for readahead to catch up
 * with our request.  Tuning issues:
 *  - Waiting too long may hold up the system behind this I/O.
92 *  - Bypassing an I/O too early will reduce the hit rate and adversely
93 *    affect performance on slow media.
94 */
95#define BC_READAHEAD_WAIT_DEFAULT	10
96#define BC_READAHEAD_WAIT_CDROM		45
97static int BC_wait_for_readahead = BC_READAHEAD_WAIT_DEFAULT;
98
99/*
100 * Cache hit ratio monitoring.
101 *
 * The time overhead of a cache miss is roughly 1/5000th of the time
 * it takes to satisfy an I/O from a rotating disk.  If too many I/Os
 * pass without a cache hit (BC_chit_max_num_IOs_since_last_hit below),
 * we're no longer providing a performance win and the cache will be
 * jettisoned.
106 *
107 * Make sure to keep the cache around for at least an hour
108 * if the user hasn't logged in yet, though.
109 */
110static int BC_chit_max_num_IOs_since_last_hit = 10000;
111static int BC_chit_prelogin_timeout_seconds = 3600;
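
/*
 * A minimal sketch of how these two tunables are intended to combine
 * (not the actual shutdown path, which lives elsewhere in this file):
 * the cache is only jettisoned for a poor hit rate once the miss counter
 * passes its limit, and never within the first hour after the cache
 * started if nobody has logged in yet.  c_num_ios_since_last_hit and
 * c_cache_starttime are fields of the cache control structure defined
 * later; user_logged_in and now are hypothetical inputs.
 */
#if 0	/* illustrative only */
static int
bc_hit_rate_says_jettison(int user_logged_in, struct timeval now)
{
	if (BC_cache->c_num_ios_since_last_hit < (uint32_t)BC_chit_max_num_IOs_since_last_hit)
		return 0;	/* still seeing hits often enough */

	if (!user_logged_in &&
	    (now.tv_sec - BC_cache->c_cache_starttime.tv_sec) < BC_chit_prelogin_timeout_seconds)
		return 0;	/* keep the cache around until login or an hour passes */

	return 1;	/* hit rate too low, jettison the cache */
}
#endif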
112
113/*
114 * History recording timeout.
115 *
116 * If the history recording is not turned off, the history buffer
 * will grow to its maximum size. This timeout stops history recording
 * after the specified interval, expressed in hz ticks (240 seconds for
 * the default below, assuming hz == 100).
119 *
120 * User DVDs don't issue a "BootCacheControl stop", so they depend
121 * on the history recording timeout.
122 */
123static int BC_history_timeout = 240 * 100;
124
125/*
126 * Trace macros
127 */
128#ifndef DBG_BOOTCACHE
129#define DBG_BOOTCACHE	7
130#endif
131
132#define	DBG_BC_TAG					(1 << 0)
133#define	DBG_BC_BATCH				(1 << 1)
134
135#define DBG_BC_IO_HIT				(1 << 2)
136#define DBG_BC_IO_HIT_STALLED		(1 << 3)
137#define DBG_BC_IO_MISS				(1 << 4)
138#define DBG_BC_IO_MISS_CUT_THROUGH	(1 << 5)
139#define DBG_BC_PLAYBACK_IO			(1 << 6)
140
141static int dbg_tag_count = 0;
142
143#ifdef BC_DEBUG
144# define MACH_DEBUG
145# define MACH_ASSERT 1
146# define message(fmt, args...)	\
147do { \
148	microtime(&debug_currenttime); \
149	timersub(&debug_currenttime, &debug_starttime, &debug_currenttime); \
150	lck_mtx_lock(&debug_printlock); \
151	printf("BootCache: %5d.%03d %24s[%4d]: " fmt "\n", (u_int)(debug_currenttime.tv_sec), (u_int)(debug_currenttime.tv_usec / 1000), __FUNCTION__, __LINE__, ##args); \
152	lck_mtx_unlock(&debug_printlock); \
153} while (0)
154
155# define debug(fmt, args...)	message("*** " fmt, ##args)
156extern void Debugger(char *);
157#else
158# define debug(fmt, args...)	do {} while (0)
159# define message(fmt, args...)	printf("BootCache: " fmt "\n" , ##args)
160#endif
161
162#ifdef BC_EXTRA_DEBUG
163# define xdebug(fmt, args...)	message("+++ " fmt, ##args)
164#else
165# define xdebug(fmt, args...)	do {} while (0)
166#endif
167
168#include <sys/buf.h>
169#include <sys/conf.h>
170#include <sys/fcntl.h>
171#include <sys/kdebug.h>
172#include <sys/kernel.h>
173#include <sys/malloc.h>
174#include <sys/param.h>
175#include <sys/proc.h>
176#include <sys/types.h>
177#include <sys/sysctl.h>
178#include <sys/systm.h>
179#include <sys/vnode.h>
180#include <sys/mount.h>
181#include <sys/ubc.h>
182#include <sys/uio.h>
183
184#include <uuid/uuid.h>
185
186#include <mach/kmod.h>
187#include <mach/mach_types.h>
188#include <mach/vm_param.h>	/* for max_mem */
189
190#include <vm/vm_kern.h>
191
192#include <libkern/libkern.h>
193#include <libkern/OSAtomic.h>
194
195#include <kern/assert.h>
196#include <kern/host.h>
197#include <kern/kalloc.h>
198#include <kern/thread_call.h>
199#include <kern/locks.h>
200
201#include <IOKit/storage/IOMediaBSDClient.h>
202
203#include <pexpert/pexpert.h>
204
205#include "BootCache.h"
206
207#ifdef BC_DEBUG
208static struct timeval debug_starttime;
209static struct timeval debug_currenttime;
210static lck_mtx_t debug_printlock;
211#endif
212
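/*
 * Guard against a negative (i.e. wrapped-around) size being handed to
 * _MALLOC: a size whose signed interpretation is negative would be a
 * huge unsigned allocation, so return NULL instead of attempting it.
 */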
#define BC_MALLOC(_size, _type, _flags) (((ssize_t)(_size) < 0) ? NULL : _MALLOC((_size), (_type), (_flags)))
214
215/**************************************
216 * Structures for the readahead cache *
217 **************************************/
218
219/*
220 * A cache extent.
221 *
222 * Each extent represents a contiguous range of disk blocks which are
223 * held in the cache.  All values in bytes.
224 */
225struct BC_cache_extent {
226	lck_mtx_t   ce_lock;
227	u_int64_t   ce_diskoffset;  /* physical offset on device */
228	u_int64_t   ce_length;      /* data length */
229	off_t       ce_cacheoffset;	/* offset into cache backing store */
230	u_int16_t   ce_batch;       /* batch number */
231	u_int16_t   ce_flags;       /* flags */
232#define CE_ABORTED  (1 << 0)    /* extent no longer has data */
233#define CE_IODONE   (1 << 1)    /* extent has been read */
234#define CE_LOWPRI   (1 << 2)    /* extent is low priority */
235#define CE_SHARED   (1 << 3)    /* extent is in the shared cache */
236	int         ce_mount_idx;   /* the index of the mount for this extent */
237	u_int32_t  *ce_blockmap;    /* track valid blocks for this extent */
238};
239
240/* A mount
241 *
242 * The volume to which an extent refers
243 */
244struct BC_cache_mount {
245	lck_rw_t                 cm_lock;      /* lock guards instance variables, extents have their own locks */
246	uuid_t                   cm_uuid;      /* this mount's uuid */
247	int                      cm_nextents;  /* the number of extents that reference this mount */
248	struct BC_cache_extent **cm_pextents;  /* a list of pointers to extents which reference
249											this mount, sorted by block address */
250	int                      cm_state;     /* this mount's state */
251#define CM_SETUP	(0)                    /* fields above are valid. Not mounted yet */
252#define CM_READY	(1)                    /* every field is valid. Mounted and ready to go */
253#define CM_ABORTED	(2)                    /* mount failed for some reason */
254
255	/* fields below filled in when we detect that the volume has mounted */
256	dev_t                    cm_dev;       /* the device for this mount */
257	struct buf              *cm_bp;        /* struct buf allocated in the reader thread for this device */
258	u_int64_t                cm_throttle_mask;/* the throttle mask to use in throttle APIs for this mount */
259	u_int64_t                cm_blocksize; /* keep 64-bit for promotion */
260	u_int64_t                cm_devsize;   /* range checking */
261	u_int64_t                cm_maxread;   /* largest read the dev will support */
262	struct BC_cache_disk*    cm_disk;      /* the mount's physical disk */
263};
264
265/* A physical disk
266 *
267 * The disk may contain several mounts
268 * and will have one readahead thread
269 */
270struct BC_cache_disk {
271	lck_mtx_t cd_lock;
272	u_int64_t cd_disk_id;    /* The throttle mask, used as an identifier */
273	int       cd_disk_num; /* used for stat batching */
274	int       cd_flags;      /* flags */
275#define CD_HAS_THREAD   (1 << 0)  /* there is a readahead thread in progress for this disk */
276#define CD_ISSUE_LOWPRI (1 << 1)  /* the readahead thread is reading in low-priority extents */
277#define CD_IS_SSD       (1 << 2)  /* this is a solid state disk */
278	int       cd_nmounts;    /* The number of mounts that reference this disk */
279	int       cd_batch;      /* What playback batch this disk is on */
280};
281
282/************************************
283 * Structures for history recording *
284 ************************************/
285
286/* see BC_history_entry and BC_history_mount in BootCache.h */
287
288/*
289 * Wrapper around BC_history_mount so we
290 * can keep track of the device
291 */
292struct BC_history_mount_device {
293	u_int64_t                       hmd_disk_id;
294	struct BC_history_mount_device* hmd_next;
295	uint32_t                        hmd_is_ssd;
296	dev_t                           hmd_dev;
297	int                             hmd_blocksize;
298	struct BC_history_mount         hmd_mount;
299};
300
301/*
302 * A history entry list cluster.
303 */
304struct BC_history_entry_cluster {
305	struct BC_history_entry_cluster *hec_link;
306	int                              hec_nentries;	/* number of entries in list */
307	struct BC_history_entry          hec_data[0];
308};
309
310#ifdef STATIC_HISTORY
311# define BC_HISTORY_ALLOC	128000
# define BC_HISTORY_ENTRIES						\
	(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) /	\
	  sizeof(struct BC_history_entry)) - 1)
# define BC_HISTORY_MAXCLUSTERS	1
#else
# define BC_HISTORY_ALLOC	16000
# define BC_HISTORY_ENTRIES						\
	(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) /	\
	  sizeof(struct BC_history_entry)) - 1)
# define BC_HISTORY_MAXCLUSTERS	((BC_MAXENTRIES / BC_HISTORY_ENTRIES) + 1)
322#endif
323
324
325/*****************************************
326 * To be able to debug live read history *
327 *****************************************/
328#ifdef READ_HISTORY_BUFFER
329struct BC_read_history_entry {
330	struct BC_cache_extent	*rh_extent;	/* extent this read is for */
331	u_int64_t		rh_blkno;	/* offset of actual read */
332	u_int64_t		rh_bcount;	/* length in bytes */
333	int			rh_result;
334# define BC_RHISTORY_OK		0
335# define BC_RHISTORY_FAIL	1
336};
337#endif
338
339
340/*
341 * The cache control structure.
342 */
343struct BC_cache_control {
344	lck_grp_t  *c_lckgrp;
345
346	/* the device we are covering */
347	struct bdevsw	*c_devsw;
348
349	/* saved switch table routines */
350	strategy_fcn_t   *c_strategy;
351	open_close_fcn_t *c_close;
352	lck_mtx_t         c_handlers_lock;
353
354	u_int64_t	c_cachesize;		/* total number of bytes in cache */
355
356	u_int64_t   c_root_disk_id;     /* the throttle mask of the root disk, used as an id for the physical device */
357	                                /* This needs to be updated to handle multiple physical volumes backing a mount */
358
359	/*
360	 * The mounts we're tracking
361	 */
362	int                     c_nmounts; /* number of mounts in the array below */
363	struct BC_cache_mount  *c_mounts;  /* the array of mounts the cache extents refer to */
364
365	/*
366	 * Extents, in optimal read order.
367	 *
368	 * Each extent tracks the pages it owns in the buffer.
369	 * It is assumed that PAGE_SIZE is a multiple of cm_blocksize for each mount.
370	 *
371	 * Each extent refers to one of the mounts, above, by index into c_mounts
372	 */
373	int                      c_nextentlists; /* number of extent lists we have */
374	int                     *c_nextents;     /* number of extents in a given extent list */
375	struct BC_cache_extent **c_extentlists;      /* an array of extent lists in the cache */
376
377	int         c_batch_count;		/* number of extent batches */
378	vnode_t     c_vp;
379	uint32_t    c_vid;
380	/* rw lock protects the above fields through c_nmounts, but not their contents.
381	 * Shared for general use of the fields,
382	 * Exclusive for cache initialization or cleanup.
383	 * Each mount/extent has its own lock for modifying its contents.
384	 */
385	lck_rw_t    c_cache_lock;
386
387
388	/* history list, in reverse order */
389	int                              c_history_num_clusters;
390	struct BC_history_entry_cluster *c_history;
391	struct BC_history_mount_device  *c_history_mounts;
392	uint64_t                         c_history_size;
393	/* lock protects the above history fields and their contents */
394	lck_rw_t   c_history_lock;
395
396	/* fields below are accessed atomically */
397
398	/* flags */
399	UInt32		c_flags;
400#define	BC_FLAG_SETUP			(1 << 0)	/* cache setup properly during mount */
401#define	BC_FLAG_CACHEACTIVE		(1 << 1)	/* cache is active, owns memory */
402#define	BC_FLAG_HISTORYACTIVE	(1 << 2)	/* currently recording history */
403#define	BC_FLAG_HTRUNCATED		(1 << 3)	/* history list truncated */
404#define	BC_FLAG_SHUTDOWN		(1 << 4)	/* readahead shut down */
405	SInt32		c_strategycalls;	/* count of busy strategy calls in the cache */
406
407	uint32_t  c_readahead_throttles_cutthroughs;
408
409	uint32_t  c_num_ios_since_last_hit; /* The number of cache misses since the last hit */
410
411	uint32_t  c_num_reader_threads;     /* The number of reader threads active */
412	lck_mtx_t c_reader_threads_lock;    /* protects c_num_reader_threads */
413
414	/* statistics */
415	int c_take_detailed_stats;
416	struct BC_statistics c_stats;
417
418	/* time-keeping */
419	struct timeval c_loadtime;
420	struct timeval c_cache_starttime;
421	struct timeval c_history_starttime;
422
423#ifdef READ_HISTORY_BUFFER
424	int		c_rhistory_idx;
425	struct BC_read_history_entry *c_rhistory;
426#endif
427};
428
429/*
430 * The number of blocks per page; assumes that a page
431 * will always be >= the size of a disk block.
432 */
433#define CB_PAGEBLOCKS(cm)			(PAGE_SIZE / (cm)->cm_blocksize)
434
435/*
436 * Convert block offset to page offset and vice versa.
437 */
438#define CB_BLOCK_TO_PAGE(cm, block)		((block) / CB_PAGEBLOCKS(cm))
439#define CB_PAGE_TO_BLOCK(cm, page)		((page)  * CB_PAGEBLOCKS(cm))
440#define CB_BLOCK_TO_BYTE(cm, block)		((block) * (cm)->cm_blocksize)
441#define HB_BLOCK_TO_BYTE(hmd, block)	((block) * (hmd)->hmd_blocksize)
442#define CB_BYTE_TO_BLOCK(cm, byte)		((byte)  / (cm)->cm_blocksize)
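
/*
 * Worked example (values assumed for illustration): with a 512-byte
 * cm_blocksize and a 4096-byte PAGE_SIZE, CB_PAGEBLOCKS(cm) == 8, so
 * CB_BLOCK_TO_PAGE(cm, 20) == 2, CB_PAGE_TO_BLOCK(cm, 2) == 16 and
 * CB_BLOCK_TO_BYTE(cm, 20) == 10240.
 */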
443
444/*
445 * The size of an addressable field in the block map.
 * This reflects the u_int32_t type used for the blockmap.
447 */
448#define CB_MAPFIELDBITS			32
449#define CB_MAPFIELDBYTES		4
450
451/*
452 * The index into an array of addressable fields, and the bit within
453 * the addressable field corresponding to an index in the map.
454 */
455#define CB_MAPIDX(x)			((x) / CB_MAPFIELDBITS)
456#define CB_MAPBIT(x)			(1 << ((x) % CB_MAPFIELDBITS))
457
458/*
459 * Blockmap management. Must be called with extent lock held.
460 */
#define CB_BLOCK_PRESENT(ce, block) \
	((ce)->ce_blockmap != NULL && ((ce)->ce_blockmap[CB_MAPIDX(block)] &   CB_MAPBIT(block)))
#define CB_MARK_BLOCK_PRESENT(ce, block) \
	((ce)->ce_blockmap[CB_MAPIDX(block)] |=  CB_MAPBIT(block))
#define CB_MARK_BLOCK_VACANT(ce, block) \
	((ce)->ce_blockmap[CB_MAPIDX(block)] &= ~CB_MAPBIT(block))
467
468/*
469 * Determine whether a given page is vacant (all blocks are gone).
470 * This takes advantage of the expectation that a page's blocks
471 * will never cross the boundary of a field in the blockmap.
472 */
#define CB_PAGE_VACANT(cm, ce, page)					\
	(!((ce)->ce_blockmap[CB_MAPIDX(CB_PAGE_TO_BLOCK(cm, page))] &	\
	   (((1 << CB_PAGEBLOCKS(cm)) - 1) << (CB_PAGE_TO_BLOCK(cm, page) %	\
	   CB_MAPFIELDBITS))))
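
/*
 * Worked example (eight 512-byte blocks per 4096-byte page assumed):
 * block 37 lives at CB_MAPIDX(37) == 1, CB_MAPBIT(37) == (1 << 5),
 * i.e. bit 5 of the second u_int32_t in ce_blockmap.  Page 4 covers
 * blocks 32..39, which all fall in that same field, so CB_PAGE_VACANT()
 * can test them with the single mask 0xff << (32 % CB_MAPFIELDBITS).
 */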
477
478/* Maximum size of the boot cache */
479#define BC_MAX_SIZE (max_mem / 2)
480
481/*
482 * Sanity macro, frees and zeroes a pointer if it is not already zeroed.
483 */
#define _FREE_ZERO(p, c)					\
do {								\
	if ((p) != NULL) {					\
		_FREE((p), (c));				\
		(p) = NULL;					\
	}							\
} while (0)
491
492/*
493 * Synchronization macros
494 */
495#define LOCK_HISTORY_R()		lck_rw_lock_shared(&BC_cache->c_history_lock)
496#define UNLOCK_HISTORY_R()		lck_rw_unlock_shared(&BC_cache->c_history_lock)
497#define LOCK_HISTORY_W()		lck_rw_lock_exclusive(&BC_cache->c_history_lock)
498#define UNLOCK_HISTORY_W()		lck_rw_unlock_exclusive(&BC_cache->c_history_lock)
499
500#define LOCK_EXTENT(e)			lck_mtx_lock(&(e)->ce_lock)
501#define UNLOCK_EXTENT(e)		lck_mtx_unlock(&(e)->ce_lock)
502
503/* mount locks should only be held while also holding the cache lock */
504#define LOCK_MOUNT_R(m)			lck_rw_lock_shared(&(m)->cm_lock)
505#define UNLOCK_MOUNT_R(m)		lck_rw_unlock_shared(&(m)->cm_lock)
506#define LOCK_MOUNT_W(m)			lck_rw_lock_exclusive(&(m)->cm_lock)
507#define UNLOCK_MOUNT_W(m)		lck_rw_unlock_exclusive(&(m)->cm_lock)
508#define LOCK_MOUNT_W_TO_R(m)	lck_rw_lock_exclusive_to_shared(&(m)->cm_lock)
509#define LOCK_MOUNT_R_TO_W(m)	lck_rw_lock_shared_to_exclusive(&(m)->cm_lock)
510
511#define LOCK_DISK(d)			lck_mtx_lock(&(d)->cd_lock)
512#define UNLOCK_DISK(d)			lck_mtx_unlock(&(d)->cd_lock)
513
514#define LOCK_READERS()			lck_mtx_lock(&BC_cache->c_reader_threads_lock)
515#define UNLOCK_READERS()		lck_mtx_unlock(&BC_cache->c_reader_threads_lock)
516
517#define LOCK_HANDLERS()			lck_mtx_lock(&BC_cache->c_handlers_lock)
518#define UNLOCK_HANDLERS()		lck_mtx_unlock(&BC_cache->c_handlers_lock)
519
520#define LOCK_CACHE_R()			lck_rw_lock_shared(&BC_cache->c_cache_lock)
521#define UNLOCK_CACHE_R()		lck_rw_unlock_shared(&BC_cache->c_cache_lock)
522#define LOCK_CACHE_W()			lck_rw_lock_exclusive(&BC_cache->c_cache_lock)
523#define UNLOCK_CACHE_W()		lck_rw_unlock_exclusive(&BC_cache->c_cache_lock)
524#define LOCK_CACHE_TRY_W()		lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_EXCLUSIVE)
525#define LOCK_CACHE_TRY_R()		lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_SHARED)
526#define LOCK_CACHE_W_TO_R()		lck_rw_lock_exclusive_to_shared(&BC_cache->c_cache_lock)
527#define LOCK_CACHE_R_TO_W()		lck_rw_lock_shared_to_exclusive(&BC_cache->c_cache_lock)
528
529#define BC_ADD_STAT(stat, inc)	OSAddAtomic((inc), ((SInt32 *)&BC_cache->c_stats.ss_##stat))
530
531/*
532 * Only one instance of the cache is supported.
533 */
534static struct BC_cache_control BC_cache_softc;
535static struct BC_cache_control *BC_cache = &BC_cache_softc;
536
537/*
538 * We support preloaded cache data by checking for a Mach-O
539 * segment/section which can be populated with a playlist by the
540 * linker.  This is particularly useful in the CD mastering process,
541 * as reading the playlist from CD is very slow and rebuilding the
542 * kext at mastering time is not practical.
543 */
544extern kmod_info_t kmod_info;
545static struct BC_playlist_header *BC_preloaded_playlist;
546static int	BC_preloaded_playlist_size;
547
548static int	BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
549static int	BC_find_cache_mount(dev_t dev);
550static struct BC_cache_extent**	BC_find_extent(struct BC_cache_mount* cm, u_int64_t offset, u_int64_t length, int contained, int* pnum_extents);
551static int	BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
552static int	BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length);
553static int	BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk);
554static void	BC_reader_thread(void *param0, wait_result_t param1);
555static int	BC_strategy(struct buf *bp);
556static int	BC_close(dev_t dev, int flags, int devtype, struct proc *p);
557static int	BC_terminate_readahead(void);
558static int	BC_terminate_cache(void);
559static int	BC_terminate_history(void);
560static void	BC_terminate_cache_async(void);
561static void	BC_check_handlers(void);
562static void	BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t from,
563								uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength);
564static int	BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd);
565static void	BC_teardown_disk(struct BC_cache_disk *cd);
566static void	BC_mount_available(struct BC_cache_mount *cm);
567static int	BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm);
568static int	BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context);
569static void	BC_teardown_mount(struct BC_cache_mount *ce);
570static int	BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx);
571static void	BC_teardown_extent(struct BC_cache_extent *ce);
572static int	BC_copyin_playlist(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries);
573static int	BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history);
574static void BC_update_mounts(void);
575static void	BC_timeout_history(void *);
576static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex);
577static int	BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev);
578static int	BC_size_history_mounts(void);
579static int	BC_size_history_entries(void);
580static int	BC_copyout_history_mounts(user_addr_t uptr);
581static int	BC_copyout_history_entries(user_addr_t uptr);
582static void	BC_discard_history(void);
583static void	BC_auto_start(void);
584static int	BC_setup(void);
585
586static int	fill_in_bc_cache_mounts(mount_t mount, void* arg);
587static int	check_for_new_mount_itr(mount_t mount, void* arg);
static int	extents_status(struct BC_cache_extent **pce, int nextents);
static void	wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime);
590
591#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
592static int  compare_cache_extents(const void *vfirst, const void *vsecond);
593//FIXME: qsort not available in kexts
594extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
595#endif
596
597/*
598 * Sysctl interface.
599 */
600static int	BC_sysctl SYSCTL_HANDLER_ARGS;
601
602SYSCTL_PROC(_kern, OID_AUTO, BootCache, CTLFLAG_RW | CTLFLAG_LOCKED,
603			0, 0, BC_sysctl,
604			"S,BC_command",
605			"BootCache command interface");
606
607/*
608 * Netboot exports this hook so that we can determine
609 * whether we were netbooted.
610 */
611extern int	netboot_root(void);
612
613/*
614 * bsd_init exports this hook so that we can register for notification
615 * once the root filesystem is mounted.
616 */
617extern void (* mountroot_post_hook)(void);
618extern void (* unmountroot_pre_hook)(void);
619
620/*
621 * Mach VM-specific functions.
622 */
623static int	BC_alloc_pagebuffer();
624static void	BC_free_pagebuffer();
625static void	BC_free_page(struct BC_cache_extent *ce, int page);
626
627/* Functions not prototyped elsewhere */
628int     	ubc_range_op(vnode_t, off_t, off_t, int, int *); /* in sys/ubc_internal.h */
629
630/*
631 * Mach-o functions.
632 */
633#define MACH_KERNEL 1
634#include <mach-o/loader.h>
635extern void *getsectdatafromheader(struct mach_header *, char *, char *, int *);
636
637/*
638 * Set or clear a flag atomically and return the previous value of that bit.
639 */
640static inline UInt32
641BC_set_flag(UInt32 flag)
642{
643	return flag & OSBitOrAtomic(flag, &BC_cache->c_flags);
644}
645
646static inline UInt32
647BC_clear_flag(UInt32 flag)
648{
649	return flag & OSBitAndAtomic(~(flag), &BC_cache->c_flags);
650}
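
/*
 * Example (a sketch, not taken from the call sites in this file):
 * because the previous value of the bit is returned, a caller can set a
 * flag and learn whether it performed the transition in one atomic step.
 */
#if 0	/* illustrative only */
	if (!BC_set_flag(BC_FLAG_SHUTDOWN)) {
		/* the flag was clear before; this caller shut the cache down */
	}
#endif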
651
652/*
653 * Return a user-readable string for a given uuid
654 *
655 * Returns a pointer to a static buffer, which is
656 * racy, so this should only be used for debugging purposes
657 */
658static inline const char* uuid_string(uuid_t uuid)
659{
660	/* Racy, but used for debug output so who cares */
661	static uuid_string_t uuidString;
662	uuid_unparse(uuid, uuidString);
663	return (char*)uuidString;
664}
665
666/*
667 * Test whether a given byte range on disk intersects with an extent's range.
668 * This function assumes the ranges are within the same mount.
669 */
670static inline int
671BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
672{
673	if ((offset < (ce->ce_diskoffset + ce->ce_length)) &&
674		((offset + length) > ce->ce_diskoffset))
675		return 1;
676	return 0;
677}
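
/*
 * Example: for an extent covering bytes [4096, 8192) of the disk, a
 * request at offset 8000 for 500 bytes intersects (8000 < 8192 and
 * 8500 > 4096), while a request starting at offset 8192 does not.
 */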
678
679/*
680 * Find one of our cache mounts with the given device, if it has been seen this boot
681 *
682 * Called with the cache read lock held
683 *
684 * Returns the index of the mount, or -1 if it is not found
685 */
686static int
687BC_find_cache_mount(dev_t dev)
688{
689	int i;
690
691	if (dev == nulldev())
692		return(-1);
693
694	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (BC_cache->c_mounts == NULL))
695		return(-1);
696
697
698	for (i = 0; i < BC_cache->c_nmounts; i++) {
699		if (BC_cache->c_mounts[i].cm_state == CM_READY) {
700			if (BC_cache->c_mounts[i].cm_dev == dev) {
701				return i;
702			}
703		}
704	}
705
706	return(-1);
707}
708
709/* The new throttling implementation needs to be able to check
710 * whether an IO is in the boot cache before issuing the IO
711 */
712static int BC_cache_contains_block(dev_t device, u_int64_t blkno) {
713	struct BC_cache_mount * cm = NULL;
714	struct BC_cache_extent **pce, *ce;
715
716	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
717		return 0;
718	}
719
720	LOCK_CACHE_R();
721
722	int cm_idx = BC_find_cache_mount(device);
723	if (cm_idx == -1) {
724		UNLOCK_CACHE_R();
725		return 0;
726	}
727
728	cm = BC_cache->c_mounts + cm_idx;
729
730	LOCK_MOUNT_R(cm);
731
732	if (cm->cm_state != CM_READY) {
733		UNLOCK_MOUNT_R(cm);
734		UNLOCK_CACHE_R();
735		return 0;
736	}
737
738	u_int64_t disk_offset = CB_BLOCK_TO_BYTE(cm, blkno);
739	u_int64_t length = CB_BLOCK_TO_BYTE(cm, 1) /* one block */;
740
741	pce = BC_find_extent(cm, disk_offset, length, 0, NULL);
742
743	if (pce == NULL || *pce == NULL) {
744		UNLOCK_MOUNT_R(cm);
745		UNLOCK_CACHE_R();
746		return 0;
747	}
748
749	ce = *pce;
750
751	UNLOCK_MOUNT_R(cm);
752
753	if (ce->ce_flags & CE_ABORTED) {
754		UNLOCK_CACHE_R();
755		return 0;
756	}
757
758	if (ce->ce_flags & CE_LOWPRI &&
759		!(ce->ce_flags & CE_IODONE)) {
760		UNLOCK_CACHE_R();
761		return 0;
762	}
763
764	UNLOCK_CACHE_R();
765	return 1;
766}
767
768/*
769 * Find a cache extent containing or intersecting the offset/length.
770 *
771 * We need to be able to find both containing and intersecting
 * extents.  Rather than handle each separately, we take advantage
773 * of the fact that any containing extent will also be an intersecting
774 * extent.
775 *
776 * Thus, a search for a containing extent should be a search for an
777 * intersecting extent followed by a test for containment.
778 *
779 * Containment may include multiple extents. The returned extent will
780 * be the first of these extents (lowest offset), but the next extents
781 * in the array may also be required to contain the requested range.
782 * Upon success, *pnum_extents contains the number of extents required
783 * to contain the range.
784 *
 * Returns a pointer to the matching entry in the mount's extent list
786 * or NULL if no match is found.
787 *
788 * Called with the cache mount read lock held
789 */
790static struct BC_cache_extent **
791BC_find_extent(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length, int contained, int *pnum_extents)
792{
793	struct BC_cache_extent **p, **base, **last, **next, **prev; /* pointers into the mount's extent index list */
794	size_t lim;
795
796	/* invariants */
797	assert(length > 0);
798
799	base = cm->cm_pextents;
800	last = base + cm->cm_nextents - 1;
801
802	/*
803	 * If the cache is inactive, exit.
804	 */
805	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (base == NULL)) {
806		return(NULL);
807	}
808
809	/*
810	 * Range-check against the cache first.
811	 *
812	 * Note that we check for possible intersection, rather than
813	 * containment.
814	 */
815	if (((offset + length) <= (*base)->ce_diskoffset) ||
816	    (offset >= ((*last)->ce_diskoffset + (*last)->ce_length))) {
817		return(NULL);
818	}
819
820	/*
821	 * Perform a binary search for an extent intersecting
822	 * the offset and length.
823	 *
824	 * This code is based on bsearch(3), and the description following
825	 * is taken directly from it.
826	 *
827	 * The code below is a bit sneaky.  After a comparison fails, we
828	 * divide the work in half by moving either left or right. If lim
829	 * is odd, moving left simply involves halving lim: e.g., when lim
830	 * is 5 we look at item 2, so we change lim to 2 so that we will
831	 * look at items 0 & 1.  If lim is even, the same applies.  If lim
 * is odd, moving right again involves halving lim, this time moving
833	 * the base up one item past p: e.g., when lim is 5 we change base
834	 * to item 3 and make lim 2 so that we will look at items 3 and 4.
835	 * If lim is even, however, we have to shrink it by one before
836	 * halving: e.g., when lim is 4, we still looked at item 2, so we
837	 * have to make lim 3, then halve, obtaining 1, so that we will only
838	 * look at item 3.
839	 */
840	for (lim = cm->cm_nextents; lim != 0; lim >>= 1) {
841		p = base + (lim >> 1);
842
843		if (BC_check_intersection((*p), offset, length)) {
844
845			if (contained == 0)
846				return(p);
847
848			if (pnum_extents) {
849				*pnum_extents = 1;
850			}
851
852			/*
853			 * We are checking for containment, verify that here.
854			 * Extents may be adjacent, so include neighboring
855			 * extents in the containment check
856			 */
857
858			/* Check the high bound */
859			if ((offset + length) > ((*p)->ce_diskoffset + (*p)->ce_length)) {
860
861				/* check higher extents to see if they contain the high end of this range */
862				prev = p;
863				for (next = prev + 1;
864					 next <= last;
865					 next++) {
866
867					/* extents should never overlap */
868					assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);
869
870					if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
871						/* extents are not adjacent */
872						if (BC_cache->c_take_detailed_stats) {
873							xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
874						}
875						return(NULL);
876					}
877
878					if (pnum_extents) {
879						(*pnum_extents)++;
880					}
881
882					if ((offset + length) <= ((*next)->ce_diskoffset + (*next)->ce_length)) {
883						/* we contain the high end of the range, break so we check the low end below */
884						break;
885					}
886
887					prev = next;
888				}
889
890				if (next > last) {
891					/* ran off the end of the extent list */
892					if (BC_cache->c_take_detailed_stats) {
893						xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
894					}
895					return (NULL);
896				}
897			}
898
899			/* Check the low bound */
900			if (offset < (*p)->ce_diskoffset) {
901
902				/* check lower extents to see if they contain the low end of this range */
903				next = p;
904				for (prev = next - 1;
905					 prev >= base;
906					 prev--) {
907
908					/* extents should never overlap */
909					assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);
910
911					if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
912						/* extents are not adjacent */
913						if (BC_cache->c_take_detailed_stats) {
914							xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
915						}
916						return(NULL);
917					}
918
919					if (pnum_extents) {
920						(*pnum_extents)++;
921					}
922
923					if (offset >= (*prev)->ce_diskoffset) {
924						/* we contain the low end of the range (we checked high end above) */
925						return(prev);
926					}
927
928					next = prev;
929				}
930
931				assert(prev < base); /* only break condition */
932
933				if (prev < base) {
934					/* ran off the end of the extent list */
935					if (BC_cache->c_take_detailed_stats) {
936						xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
937					}
938					return (NULL);
939				}
940			}
941
942			return(p);
943		}
944
945		if (offset > (*p)->ce_diskoffset) {	/* move right */
946			base = p + 1;
947			lim--;
948		}				/* else move left */
949	}
950	/* found nothing */
951	return(NULL);
952}
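
/*
 * Example of the containment semantics above (offsets assumed for
 * illustration): if a mount has adjacent extents covering
 * [0x1000, 0x2000) and [0x2000, 0x3000), a contained lookup for offset
 * 0x1800 and length 0x1000 returns a pointer to the first extent and
 * sets *pnum_extents to 2, since both extents are needed to cover the
 * requested range.
 */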
953
954/*
955 * Discard data from the cache and free pages which are no longer required.
956 *
957 * This should be locked against anyone testing for !vacant pages.
958 *
959 * We return the number of bytes we marked unused. If the value is
 * zero, the offset/length did not overlap this extent.
961 *
962 * Called with extent lock held.
963 */
964static int
965BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
966{
967	struct BC_cache_mount *cm;
968	u_int64_t estart, eend, dstart, dend, nblk, base, page;
969	int i, discarded;
970
971	/* invariants */
972	assert(length > 0);
973#ifdef BC_DEBUG
974	lck_mtx_assert(&ce->ce_lock, LCK_MTX_ASSERT_OWNED);
975#endif
976
977	/*
978	 * Extent has been terminated: blockmap may no longer be valid.
979	 */
980	if (ce->ce_flags & CE_ABORTED ||
981		ce->ce_length == 0)
982		return 0;
983
984	cm = BC_cache->c_mounts + ce->ce_mount_idx;
985
986	/*
987	 * Constrain offset and length to fit this extent, return immediately if
988	 * there is no overlap.
989	 */
990	dstart = offset;			/* convert to start/end format */
991	dend = offset + length;
992	assert(dend > dstart);
993	estart = ce->ce_diskoffset;
994	eend = ce->ce_diskoffset + ce->ce_length;
995	assert(eend > estart);
996
997	if (dend <= estart)			/* check for overlap */
998		return(0);
999	if (eend <= dstart)
1000		return(0);
1001
1002	if (dstart < estart)			/* clip to extent */
1003		dstart = estart;
1004	if (dend > eend)
1005		dend = eend;
1006
1007	/*
1008	 * Convert length in bytes to a block count.
1009	 */
1010	nblk = CB_BYTE_TO_BLOCK(cm, dend - dstart);
1011
1012	/*
1013	 * Mark blocks vacant.  For each vacated block, check whether the page
1014	 * holding it can be freed.
1015	 *
1016	 * Note that this could be optimised to reduce the number of
1017	 * page-freeable checks, but the logic would be more complex.
1018	 */
1019	base = CB_BYTE_TO_BLOCK(cm, dstart - ce->ce_diskoffset);
1020	assert(base >= 0);
1021	discarded = 0;
1022	assert((base + nblk - 1) < howmany(ce->ce_length, cm->cm_blocksize));
1023	for (i = 0; i < nblk; i++) {
1024
1025		/* this is a no-op if the block is already gone */
1026		if (CB_BLOCK_PRESENT(ce, base + i)) {
1027
1028			/* mark the block as vacant */
1029			CB_MARK_BLOCK_VACANT(ce, base + i);
1030			discarded++;
1031
1032			page = CB_BLOCK_TO_PAGE(cm, base + i);
1033			if (CB_PAGE_VACANT(cm, ce, page))
1034				BC_free_page(ce, (int) page);
1035		}
1036	}
1037	return(discarded * cm->cm_blocksize);
1038}
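
/*
 * Example (sizes assumed for illustration): with a 512-byte cm_blocksize
 * and an extent covering [0x10000, 0x14000), a discard of 0x2000 bytes
 * at offset 0x13000 is clipped to [0x13000, 0x14000); eight blocks are
 * marked vacant and 4096 is returned, assuming none of those blocks had
 * already been discarded.
 */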
1039
1040/*
1041 * Blocks in an extent can be invalidated, e.g. by a write, before the
1042 * reader thread fills the extent. Search for the next range of viable blocks
1043 * in the extent, starting from offset 'fromoffset'.
1044 *
1045 * Values are returned by out parameters:
1046 * 	'nextpage' takes the page containing the start of the next run, or
1047 * 		-1 if no more runs are to be found.
 * 	'nextoffset' takes the offset into that page at which the run begins.
1049 * 	'nextlength' takes the length in bytes of the range, capped at maxread.
1050 *
 * In other words, for a blockmap that looks like
 *
 * 	1 1 0 0 0 1 1 1   1 1 1 1 1 1 0 0
 *
 * and fromoffset = 3 * blocksize, the next valid run starts at block 5
 * and covers blocks 5 through 13, so (assuming eight blocks per page and
 * no cm_maxread cap):
 * 	*nextpage   = 0
 * 	*nextoffset = 5 * blocksize
 * 	*nextlength = 9 * blocksize
1057 *
1058 * Called with the extent lock held
1059 */
1060static void
1061BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t fromoffset,
1062					uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength)
1063{
1064	int maxblks, i, nextblk = 0;
1065	int found = 0;
1066
1067	maxblks = howmany(ce->ce_length, cm->cm_blocksize);
1068	i = CB_BYTE_TO_BLOCK(cm, fromoffset);
1069	/* scan for the next range of valid blocks in the extent */
1070	for (; i < maxblks; i++) {
1071		if (CB_BLOCK_PRESENT(ce, i)) {
1072			if (found == 0) {
1073				nextblk = i;
1074			}
1075			found++;
1076		} else {
1077			if (found != 0)
1078				break;
1079		}
1080	}
1081
1082	if (found) {
1083		/* found a valid range, so convert to (page, offset, length) */
1084		*nextpage = CB_BLOCK_TO_PAGE(cm, nextblk);
1085		*nextoffset = CB_BLOCK_TO_BYTE(cm, nextblk) % PAGE_SIZE;
1086		*nextlength = MIN(CB_BLOCK_TO_BYTE(cm, found), cm->cm_maxread);
1087	} else {
1088		*nextpage = -1;
1089	}
1090
1091	return;
1092}
1093
1094/*
1095 * Test for the presence of a range of blocks in the cache.
1096 *
1097 * Called with the extent lock held.
1098 */
1099static int
1100BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk)
1101{
1102	int	blk;
1103
1104	assert(base >= 0);
1105	assert((base + nblk) <= howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize));
1106
1107	for (blk = 0; blk < nblk; blk++) {
1108
1109		if (!CB_BLOCK_PRESENT(ce, base + blk)) {
1110			return(0);
1111		}
1112	}
1113	return(1);
1114}
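
/*
 * Example: for a request spanning blocks 24..31 of an extent, base is 24
 * and nblk is 8; a single block that has since been marked vacant (e.g.
 * by an intervening write) makes the whole range count as absent.
 */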
1115
1116/*
1117 * Readahead thread.
1118 */
1119static void
1120BC_reader_thread(void *param0, wait_result_t param1)
1121{
1122	struct BC_cache_disk *cd = NULL;
1123	struct BC_cache_mount *cm = NULL;
1124	struct BC_cache_extent *ce = NULL;
1125	int count, num_mounts, ce_idx, cel_idx;
1126	upl_t	upl;
1127	kern_return_t kret;
1128	struct timeval batchstart, batchend;
1129	int error = 0;
1130	int issuing_lowpriority_extents = 0;
1131
1132	/* We can access the BC_cache_disk passed in freely without locks
1133	 * since disks are only freed up in terminate cache after it has
	 * guaranteed that no reader threads are running
1135	 */
1136	cd = (struct BC_cache_disk*) param0;
1137
1138	num_mounts = cd->cd_nmounts;
1139
1140	if (BC_cache->c_flags & BC_FLAG_SHUTDOWN)
1141		goto out;
1142
1143	for (;;) {
1144
1145		if (issuing_lowpriority_extents) {
1146			debug("disk %d starting low priority batch", cd->cd_disk_num);
1147		} else {
1148			debug("disk %d starting batch %d", cd->cd_disk_num, cd->cd_batch);
1149			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
1150								  DBG_FUNC_START,
1151								  cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
1152		}
1153		microtime(&batchstart);
1154
1155		LOCK_CACHE_R();
1156		for (cel_idx = 0;
1157			 cel_idx < BC_cache->c_nextentlists;
1158			 cel_idx++) {
1159
1160			/* iterate over extents to populate */
1161			for (ce_idx = 0;
1162				 ce_idx < BC_cache->c_nextents[cel_idx];
1163				 ce_idx++) {
1164
1165				ce = BC_cache->c_extentlists[cel_idx] + ce_idx;
1166
1167				cm = BC_cache->c_mounts + ce->ce_mount_idx;
1168
1169				if (cm->cm_state != CM_READY) {
1170					continue;
1171				}
1172
1173				if (cm->cm_disk != cd){
1174					continue;
1175				}
1176
1177				/* Only read extents marked for this batch or earlier.
1178				 * All low-priority IO is read in one batch, however.
1179				 */
1180				if (ce->ce_batch > cd->cd_batch && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1181					continue;
1182				}
1183
1184				/* Check if already done or aborted */
1185				if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
1186					continue;
1187				}
1188
1189				/* Check if this extent is low priority and we're not yet reading in low-priority extents */
1190				if (!(cd->cd_flags & CD_ISSUE_LOWPRI) &&
1191					(ce->ce_flags & CE_LOWPRI)) {
1192					continue;
1193				}
1194
1195				/* Shouldn't happen */
1196				if((cd->cd_flags & CD_ISSUE_LOWPRI) &&
1197				   !(ce->ce_flags & CE_LOWPRI)) {
1198					debug("Saw a regular extent while issuing throttled IO");
1199				}
1200
1201				LOCK_EXTENT(ce);
1202
				/* Check again, now with the lock held */
1204				if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
1205					UNLOCK_EXTENT(ce);
1206					continue;
1207				}
1208
1209				/* loop reading to fill this extent */
1210				uint32_t fromoffset = 0;
1211
1212				LOCK_MOUNT_R(cm);
1213
1214				if (cm->cm_state != CM_READY) {
1215					/* the mount was aborted. Whoever aborted it handles aborting the extents as well */
1216					UNLOCK_MOUNT_R(cm);
1217					UNLOCK_EXTENT(ce);
1218					continue;
1219				}
1220
1221				for (;;) {
1222					uint32_t nextpage;
1223					uint32_t nextoffset;
1224					uint32_t nextlength;
1225
1226					/* requested shutdown */
1227					if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
1228						UNLOCK_MOUNT_R(cm);
1229						goto out;
1230					}
1231
1232
1233					/*
1234					 * Find the next set of blocks that haven't been invalidated
1235					 * for this extent.
1236					 */
1237					BC_next_valid_range(cm, ce, fromoffset, &nextpage, &nextoffset, &nextlength);
1238					/* no more blocks to be read */
1239					if (nextpage == -1)
1240						break;
1241
1242					/* set up fromoffset to read the next segment of the extent */
1243					fromoffset = (nextpage * PAGE_SIZE) + nextoffset + nextlength;
1244
1245					kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
1246					if (kret != KERN_SUCCESS) {
1247						UNLOCK_MOUNT_R(cm);
1248						message("reader thread: vnode_getwithvid failed - %d\n", kret);
1249						goto out;
1250					}
1251
1252					kret = ubc_create_upl(BC_cache->c_vp,
1253										  ce->ce_cacheoffset + (nextpage * PAGE_SIZE),
1254										  (int) roundup(nextoffset + nextlength, PAGE_SIZE),
1255										  &upl,
1256										  NULL,
1257										  UPL_SET_LITE|UPL_FILE_IO);
1258					if (kret != KERN_SUCCESS) {
1259						UNLOCK_MOUNT_R(cm);
1260						message("ubc_create_upl returned %d\n", kret);
1261						(void) vnode_put(BC_cache->c_vp);
1262						goto out;
1263					}
1264
1265					/* cm_bp is only accessed from the reader thread,
1266					 * and there's only one reader thread for a given mount,
1267					 * so we don't need the write lock to modify it */
1268
1269					/* set buf to fill the requested subset of the upl */
1270					buf_setblkno(cm->cm_bp, CB_BYTE_TO_BLOCK(cm, ce->ce_diskoffset + nextoffset) + CB_PAGE_TO_BLOCK(cm, nextpage));
1271					buf_setcount(cm->cm_bp, (unsigned int) nextlength);
1272					buf_setupl(cm->cm_bp, upl, (unsigned int) nextoffset);
1273					buf_setresid(cm->cm_bp, buf_count(cm->cm_bp));	/* ask for residual indication */
1274					buf_reset(cm->cm_bp, B_READ);
1275
					/* If this is regular readahead, make sure any throttled IOs are throttled by the readahead thread rdar://8592635
1277					 * If this is low-priority readahead, make sure this thread will be throttled appropriately later rdar://8734858
1278					 */
1279					throttle_info_handle_t throttle_info_handle;
1280					if ((error = throttle_info_ref_by_mask(cm->cm_throttle_mask, &throttle_info_handle)) == 0) {
1281						throttle_info_update_by_mask(throttle_info_handle, 0x0);
1282						throttle_info_rel_by_mask(throttle_info_handle);
1283					} else {
1284						debug("Unable to update throttle info for mount %s: %d", uuid_string(cm->cm_uuid), error);
1285					}
1286
1287					BC_ADD_STAT(initiated_reads, 1);
1288					if (cd->cd_disk_num < STAT_DISKMAX) {
1289						if (ce->ce_flags & CE_LOWPRI) {
1290							BC_ADD_STAT(disk_initiated_reads_lowpri[cd->cd_disk_num], 1);
1291						} else {
1292							BC_ADD_STAT(disk_initiated_reads[cd->cd_disk_num], 1);
1293						}
1294					}
1295
1296					/* give the buf to the underlying strategy routine */
1297					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, DKIO_READ) | DBG_FUNC_NONE,
1298										  (uintptr_t) buf_kernel_addrperm_addr(cm->cm_bp), buf_device(cm->cm_bp), (long)buf_blkno(cm->cm_bp), buf_count(cm->cm_bp), 0);
1299					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_PLAYBACK_IO) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(cm->cm_bp), 0, 0, 0, 0);
1300
1301					BC_cache->c_strategy(cm->cm_bp);
1302
1303					/* wait for the bio to complete */
1304					buf_biowait(cm->cm_bp);
1305
1306					kret = ubc_upl_commit(upl);
1307					(void) vnode_put(BC_cache->c_vp);
1308
1309					/*
1310					 * If the read or commit returned an error, invalidate the bytes
1311					 * covered by the read (on a residual, we could avoid invalidating
1312					 * bytes that are claimed to be read as a minor optimisation, but we do
1313					 * not expect errors as a matter of course).
1314					 */
1315					if (kret != KERN_SUCCESS || buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
1316						debug("read error: extent %lu %lu/%lu "
1317							  "(error buf %ld/%ld flags %08x resid %d)",
1318							  (unsigned long) ce_idx,
1319							  (unsigned long)ce->ce_diskoffset, (unsigned long)ce->ce_length,
1320							  (long)buf_blkno(cm->cm_bp), (long)buf_count(cm->cm_bp),
1321							  buf_flags(cm->cm_bp), buf_resid(cm->cm_bp));
1322
1323						count = BC_discard_bytes(ce, ce->ce_diskoffset + nextoffset, nextlength);
1324						debug("read error: discarded %d bytes", count);
1325						BC_ADD_STAT(read_errors, 1);
1326						BC_ADD_STAT(read_errors_bytes, count);
1327					}
1328#ifdef READ_HISTORY_BUFFER
1329					if (BC_cache->c_rhistory && BC_cache->c_rhistory_idx < READ_HISTORY_BUFFER) {
1330						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_extent = ce;
1331						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_blkno = buf_blkno(cm->cm_bp);
1332						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_bcount = buf_count(cm->cm_bp);
1333						if (buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
1334							BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_FAIL;
1335							BC_cache->c_rhistory_idx++;
1336						} else {
1337# ifdef READ_HISTORY_ALL
1338							BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_OK;
1339							BC_cache->c_rhistory_idx++;
1340# endif
1341						}
1342					}
1343#endif
1344
1345					/* I'd like to throttle the thread here, but I'm holding the extent and cache locks, so it's throttled below */
1346				}
1347
1348				/* update stats */
1349				BC_ADD_STAT(read_blocks, (u_int) CB_BYTE_TO_BLOCK(cm, ce->ce_length));
1350				BC_ADD_STAT(read_bytes,  (u_int) ce->ce_length);
1351				if (ce->ce_flags & CE_LOWPRI) {
1352					BC_ADD_STAT(read_bytes_lowpri,  (u_int) ce->ce_length);
1353					if (cd->cd_disk_num < STAT_DISKMAX) {
1354						BC_ADD_STAT(batch_bytes_lowpri[cd->cd_disk_num],  (u_int) ce->ce_length);
1355					}
1356				} else {
1357					if (cd->cd_disk_num < STAT_DISKMAX) {
1358						BC_ADD_STAT(batch_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)],  (u_int) ce->ce_length);
1359						if (ce->ce_batch < cd->cd_batch) {
1360							BC_ADD_STAT(batch_late_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], ce->ce_length);
1361						}
1362					}
1363				}
1364				if (ce->ce_flags & CE_SHARED) {
1365					BC_ADD_STAT(shared_bytes, (u_int) ce->ce_length);
1366				}
1367				UNLOCK_MOUNT_R(cm);
1368
1369				/*
1370				 * Wake up anyone wanting this extent, as it is now ready for
1371				 * use.
1372				 */
1373				ce->ce_flags |= CE_IODONE;
1374				UNLOCK_EXTENT(ce);
1375				wakeup(ce);
1376
1377				/* Let anyone waiting for the write lock to take hold */
1378				UNLOCK_CACHE_R();
1379
1380				/* Take this opportunity of locklessness to throttle the reader thread, if necessary */
1381				if (issuing_lowpriority_extents) {
1382					throttle_lowpri_io((ce->ce_flags & CE_LOWPRI) ? 1 : 0);
1383				}
1384
1385				LOCK_CACHE_R();
1386				if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1387					/* New playlist arrived, go back to issuing regular priority extents */
1388					debug("New extents, interrupting low priority readahead");
1389					break;
1390				}
1391
1392				if (cel_idx >= BC_cache->c_nextentlists ||
1393					ce_idx  >= BC_cache->c_nextents[cel_idx]) {
1394					/* cache shrunk while we weren't looking.
1395					 * Not a hard error, but we're done with this batch at least
1396					 */
1397					break;
1398				}
1399			}
1400			ce = NULL;
1401			if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1402				/* New playlist arrived, go back to issuing regular priority extents */
1403				break;
1404			}
1405		}
1406		UNLOCK_CACHE_R();
1407
1408		microtime(&batchend);
1409		timersub(&batchend, &batchstart, &batchend);
1410		if (cd->cd_disk_num < STAT_DISKMAX) {
1411			if (issuing_lowpriority_extents) {
1412				BC_cache->c_stats.ss_batch_time_lowpri[cd->cd_disk_num] +=
1413				(u_int) batchend.tv_sec * 1000 +
1414				(u_int) batchend.tv_usec / 1000;
1415			} else {
1416				/* Measure times for the first several batches separately */
1417				BC_cache->c_stats.ss_batch_time[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)] +=
1418				(u_int) batchend.tv_sec * 1000 +
1419				(u_int) batchend.tv_usec / 1000;
1420			}
1421		}
1422		if (issuing_lowpriority_extents) {
1423			// debug("disk %d low priority batch done", cd->cd_disk_num);
1424		} else {
1425			// debug("disk %d batch %d done", cd->cd_disk_num, cd->cd_batch);
1426			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
1427								  DBG_FUNC_END,
1428								  cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
1429		}
1430
1431
1432		LOCK_DISK(cd);
1433		if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1434			/* New playlist arrived, go back to issuing regular extents */
1435			issuing_lowpriority_extents = 0;
1436			throttle_set_thread_io_policy(IOPOL_NORMAL);
1437			cd->cd_batch = 0;
1438		} else if (cd->cd_batch + 1 < BC_cache->c_batch_count && !issuing_lowpriority_extents) {
1439			cd->cd_batch++;
1440		} else {
1441			if (num_mounts == cd->cd_nmounts) {
1442				/* no new mounts appeared */
1443				if ( !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1444					/* go through the playlist again, issuing any low-priority extents we have */
1445					cd->cd_flags |= CD_ISSUE_LOWPRI;
1446					throttle_set_thread_io_policy(IOPOL_THROTTLE);
1447					issuing_lowpriority_extents = 1;
1448				} else {
1449					/* we're done */
1450					cd->cd_flags &= (~CD_HAS_THREAD);
1451					cd->cd_batch = 0;
1452					UNLOCK_DISK(cd);
1453					goto out;
1454				}
1455			} else {
1456				/* New mounts appeared. Run through again */
1457				cd->cd_batch = 0;
1458				debug("disk %d more mounts detected (%d total)", cd->cd_disk_num, cd->cd_nmounts);
1459			}
1460		}
1461		num_mounts = cd->cd_nmounts;
1462		UNLOCK_DISK(cd);
1463	}
1464
1465out:
1466	/*
1467	 * If ce != NULL we have bailed out, but we cannot free memory beyond
1468	 * the bailout point as it may have been read in a previous batch and
1469	 * there may be a sleeping strategy routine assuming that it will
1470	 * remain valid.  Since we are typically killed immediately before a
1471	 * complete cache termination, this does not represent a significant
1472	 * problem.
1473	 *
1474	 * However, to prevent readers blocking waiting for readahead to
1475	 * complete for extents that will never be read, we mark all the
1476	 * extents we have given up on as aborted.
1477	 */
1478	if (ce != NULL) {
1479		UNLOCK_EXTENT(ce);
1480
1481		/* the only errors that get us here are global, so mark the cache as shut down */
1482		LOCK_DISK(cd);
1483		BC_set_flag(BC_FLAG_SHUTDOWN);
1484		cd->cd_flags &= (~CD_HAS_THREAD);
1485		UNLOCK_DISK(cd);
1486
1487		int tmpcel_idx, tmpce_idx;
1488		for (tmpcel_idx = 0;
1489			 tmpcel_idx < BC_cache->c_nextentlists;
1490			 tmpcel_idx++) {
1491			for (tmpce_idx = 0;
1492				 tmpce_idx < BC_cache->c_nextents[tmpcel_idx];
1493				 tmpce_idx++) {
1494				ce = BC_cache->c_extentlists[tmpcel_idx] + tmpce_idx;
1495				LOCK_EXTENT(ce);
1496				/* abort any extent that hasn't been satisfied */
1497				if (!(ce->ce_flags & CE_IODONE)) {
1498					ce->ce_flags |= CE_ABORTED;
1499					/* wake up anyone asleep on this extent */
1500					UNLOCK_EXTENT(ce);
1501					wakeup(ce);
1502				} else {
1503					UNLOCK_EXTENT(ce);
1504				}
1505			}
1506		}
1507		UNLOCK_CACHE_R();
1508	}
1509
1510
1511	debug("disk %d done", cd->cd_disk_num);
1512
1513	/* wake up someone that might be waiting for this reader thread to exit */
1514	wakeup(&cd->cd_flags);
1515
1516	LOCK_READERS();
1517	/* wake up someone that might be waiting for all the reader threads to exit */
1518	BC_cache->c_num_reader_threads--;
1519	if (BC_cache->c_num_reader_threads == 0) {
1520		wakeup(&BC_cache->c_num_reader_threads);
1521	}
1522	UNLOCK_READERS();
1523
1524}
1525
1526/*
1527 * Handle an incoming close request.
1528 */
1529static int
1530BC_close(dev_t dev, int flags, int devtype, struct proc *p)
1531{
1532	struct BC_cache_mount *cm;
1533	int i, cm_idx;
1534
1535	/* Mark our cache_mount for this device as invalid */
1536
1537	LOCK_CACHE_R();
1538	if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {
1539
1540		cm_idx = BC_find_cache_mount(dev);
1541		if (-1 != cm_idx) {
1542			cm = BC_cache->c_mounts + cm_idx;
1543			debug("Tearing down closing mount %s with dev 0x%x", uuid_string(cm->cm_uuid), dev);
1544			LOCK_MOUNT_R(cm);
1545			if (cm->cm_state != CM_ABORTED) {
1546				/*
1547				 * Mark all extents as aborted. This will notify any sleepers in the
1548				 * strategy routine that the extent won't have data for them.
1549				 *
1550				 * Note that we shouldn't try to take an extent lock while holding
1551				 * the exclusive mount lock or we risk deadlock with the reader thread
1552				 */
1553				for (i = 0; i < cm->cm_nextents; i++) {
1554					LOCK_EXTENT(cm->cm_pextents[i]);
1555					BC_teardown_extent(cm->cm_pextents[i]);
1556					UNLOCK_EXTENT(cm->cm_pextents[i]);
1557					wakeup(cm->cm_pextents[i]);
1558				}
1559				if (! LOCK_MOUNT_R_TO_W(cm)) {
1560					/* If there is someone waiting for the exclusive lock on this mount,
1561					 * we want to grab it before they can so they don't waste time.
1562					 * If we fail to take the exclusive lock it means someone
					 * else is also trying to upgrade the mount lock. We don't keep any
1564					 * state, so we can simply take the write lock again.
1565					 */
1566					LOCK_MOUNT_W(cm);
1567					if (cm->cm_state == CM_ABORTED) {
1568						// Mount is aborted, no more work necessary
1569						UNLOCK_MOUNT_W(cm);
1570						goto out;
1571					}
1572				}
1573				BC_teardown_mount(cm);
1574				UNLOCK_MOUNT_W(cm);
1575			} else {
1576				UNLOCK_MOUNT_R(cm);
1577			}
1578		}
1579	}
1580
1581out:
1582	UNLOCK_CACHE_R();
1583
1584	return BC_cache->c_close(dev, flags, devtype, p);
1585}
1586
1587/*
1588 * Returns CE_ABORTED if any of the extents in the array are aborted
1589 * Returns CE_LOWPRI  if any of the extents in the array are low priority and not done
1590 * Returns CE_IODONE  if all the extents in the array are done
1591 * Returns 0 otherwise
1592 */
1593static int extents_status(struct BC_cache_extent **pce, int nextents) {
1594	int ce_idx;
1595	int ret = CE_IODONE;
1596
1597	for (ce_idx = 0;
1598		 ce_idx < nextents;
1599		 ce_idx++) {
1600
1601		if (pce[ce_idx]->ce_flags & CE_ABORTED) {
1602			return CE_ABORTED;
1603		}
1604
1605		if (! (pce[ce_idx]->ce_flags & CE_IODONE)) {
1606			if (pce[ce_idx]->ce_flags & CE_LOWPRI) {
1607				ret = CE_LOWPRI;
1608			} else {
1609				if (ret == CE_IODONE) {
1610					ret = 0;
1611				}
1612			}
1613		}
1614	}
1615	return ret;
1616}
1617
1618/*
1619 * Waits for the given extent to be filled or aborted
1620 *
 * Should only be called with the extent lock held.
1622 * NO OTHER LOCKS SHOULD BE HELD INCLUDING THE CACHE LOCK
1623 *
 * We are guaranteed that the extent won't disappear while the cache lock is
 * released, because the cleanup code waits for BC_cache->c_strategycalls to
 * reach 0 before freeing the cache, and this call is counted there.
1627 */
1628static void wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime) {
1629	struct timeval time;
1630	struct timespec timeout;
1631	microtime(&time);
1632	timersub(&time,
1633			 &starttime,
1634			 &time);
1635	if (time.tv_sec < BC_wait_for_readahead) {
1636		timeout.tv_sec = BC_wait_for_readahead - time.tv_sec;
1637		if (time.tv_usec == 0) {
1638			timeout.tv_nsec = 0;
1639		} else {
1640			timeout.tv_sec--;
1641			timeout.tv_nsec = NSEC_PER_USEC * (USEC_PER_SEC - time.tv_usec);
1642		}
1643		msleep(ce, &ce->ce_lock, PRIBIO, "BC_strategy", &timeout);
1644	}
1645}
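
/*
 * Worked example of the timeout arithmetic above (illustrative only): if
 * 2.3 seconds have elapsed since starttime and BC_wait_for_readahead is 10,
 * we want to sleep for the remaining 7.7 seconds.  The code computes
 * timeout.tv_sec = 10 - 2 = 8; because time.tv_usec (300000) is non-zero it
 * decrements that to 7 and sets
 * timeout.tv_nsec = NSEC_PER_USEC * (USEC_PER_SEC - 300000) = 0.7 seconds,
 * so msleep() is given the expected 7.7 second timeout.
 */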
1646
1647/*
1648 * Handle an incoming IO request.
1649 */
1650static int
1651BC_strategy(struct buf *bp)
1652{
1653	struct BC_cache_extent *ce = NULL, **pce, **pcontaining_extents = NULL;
1654	int num_extents;
1655	uio_t uio = NULL;
1656	int nbytes, resid, discards = 0;
1657	struct timeval blocked_start_time, blocked_end_time;
1658	daddr64_t blkno;
1659	caddr_t buf_map_p, buf_extent_p;
1660	off_t disk_offset;
1661	kern_return_t kret;
1662	int cm_idx = -1, ce_idx;
1663	dev_t dev;
1664	int32_t bufflags;
1665	int during_cache = 0, during_io = 0, take_detailed_stats = 0, cache_hit = 0;
1666	int is_root_disk, did_block, status;
1667	int is_shared = 0, is_swap = 0;
1668	int should_throttle = 0;
1669	int is_ssd = 0;
1670	int is_stolen = 0;
1671	int unfilled = 0;
1672	vnode_t vp = NULL;
1673	int pid = 0;
1674	int dont_cache = 0;
1675	int throttle_tier = 0;
1676	bufattr_t bap = NULL;
1677
1678	assert(bp != NULL);
1679
1680	blkno = buf_blkno(bp);
1681	nbytes = buf_count(bp);
1682	bufflags = buf_flags(bp);
1683	bap = buf_attr(bp);
1684	dev = buf_device(bp);
1685	vp = buf_vnode(bp);
1686
1687	if (bap) {
1688		throttle_tier = bufattr_throttled(bap);
1689	}
1690
1691	/*
	 * If the buf doesn't have a device for some reason, pretend
	 * we never saw the request at all.
1694	 */
1695	if (dev == nulldev()) {
1696		BC_cache->c_strategy(bp);
1697		BC_ADD_STAT(strategy_unknown, 1);
1698		BC_ADD_STAT(strategy_unknown_bytes, nbytes);
1699		return 0;
1700	}
1701
1702	if (vp && vnode_isswap(vp)) {
1703		is_swap = 1;
1704		goto bypass;
1705	}
1706
1707	if (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE) {
1708		pid = proc_selfpid();
1709
1710		if (vp) {
1711			is_shared = vnode_isdyldsharedcache(vp);
		}
1713	}
1714
1715	if (vp && (vnode_israge(vp))) {
1716		dont_cache = 1;
1717	}
1718
1719	/*
1720	 * If the cache is not active, bypass the request.  We may
1721	 * not be able to fill it, but we still want to record it.
1722	 */
1723	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
1724		goto bypass;
1725	}
1726
1727	/*
1728	 * In order to prevent the cache cleanup code from racing with us
1729	 * when we sleep, we track the number of strategy calls in flight.
1730	 */
1731	OSAddAtomic(1, &BC_cache->c_strategycalls);
1732	LOCK_CACHE_R();
1733	during_cache = 1; /* we're included in the in_flight count */
1734
1735	/* Get cache mount asap for use in case we bypass */
1736	cm_idx = BC_find_cache_mount(dev);
1737
1738	if (cm_idx != -1) {
1739		disk_offset = CB_BLOCK_TO_BYTE(BC_cache->c_mounts + cm_idx, blkno);
1740	}
1741
1742	if (BC_cache->c_take_detailed_stats) {
1743		take_detailed_stats = 1;
1744		BC_ADD_STAT(strategy_calls, 1);
1745		if (throttle_tier) {
1746			BC_ADD_STAT(strategy_throttled, 1);
1747		}
1748#ifdef BC_DEBUG
1749//		if (dont_cache) {
1750//			char procname[128];
1751//			proc_selfname(procname, sizeof(procname));
1752//			const char* filename = vp ? vnode_getname(vp) : NULL;
1753//			debug("not recording %s%s from app %s for file %s (disk block 0x%llx) which is%s throttled", (vp && vnode_israge(vp)) ? "rapid age " : "", (bufflags & B_READ) ? "read" : "write", procname, filename?:"unknown", blkno, (bufflags & B_THROTTLED_IO) ? "" : " not");
1754//			if (filename) {
1755//				vnode_putname(filename);
1756//			}
1757//		}
1758#endif
1759	}
1760
1761	/*
1762	 * if we haven't seen this mount before, pass it off
1763	 *
1764	 * If this is a mount we want to cache, we'll kick it off when
1765	 * adding to our history
1766	 */
1767	if (cm_idx == -1) {
1768		/* don't have a mount for this IO */
1769		if (take_detailed_stats)
1770			BC_ADD_STAT(strategy_noncached_mount, 1);
1771		goto bypass;
1772	}
1773
1774	/* Get the info from the mount we need */
1775	LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1776
1777	if (BC_cache->c_mounts[cm_idx].cm_disk &&
1778		(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) &&
1779		!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI)) {
1780		during_io = 1; /* for statistics */
1781		if (take_detailed_stats) {
1782			BC_ADD_STAT(strategy_duringio, 1);
1783		}
1784	}
1785
1786	if (bufflags & B_ENCRYPTED_IO) {
1787		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1788		dont_cache = 1;
1789		goto bypass;
1790	}
1791
1792	if (BC_cache->c_mounts[cm_idx].cm_state != CM_READY) {
1793		/* the mount has been aborted, treat it like a missing mount */
1794		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1795		cm_idx = -1;
1796
1797		if (take_detailed_stats)
1798			BC_ADD_STAT(strategy_noncached_mount, 1);
1799		goto bypass;
1800	}
1801
1802	/* if it's not a read, pass it off */
1803	if ( !(bufflags & B_READ)) {
1804		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1805		if (take_detailed_stats)
1806			BC_ADD_STAT(strategy_nonread, 1);
1807		goto bypass;
1808	}
1809
1810	if ((nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize) != 0) {
1811		debug("IO with non-blocksize multiple bytes (%d %% %lld = %lld)", nbytes, BC_cache->c_mounts[cm_idx].cm_blocksize, (nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize));
1812		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1813		if (take_detailed_stats)
1814			BC_ADD_STAT(strategy_nonblocksize, 1);
1815		goto bypass;
1816	}
1817
1818	if (take_detailed_stats) {
1819		BC_ADD_STAT(extent_lookups, 1);
1820		BC_ADD_STAT(requested_bytes, nbytes);
1821		if (cm_idx < STAT_MOUNTMAX) {
1822			BC_ADD_STAT(requested_bytes_m[cm_idx], nbytes);
1823		}
1824	}
1825
1826	/*
1827	 * Look for a cache extent containing this request.
1828	 */
1829	num_extents = 0;
1830	pce = BC_find_extent(BC_cache->c_mounts + cm_idx, disk_offset, nbytes, 1, &num_extents);
1831	if (pce == NULL) {
1832#ifdef BC_EXTRA_DEBUG
1833		if (take_detailed_stats && !dont_cache) {
1834			char procname[128];
1835			proc_selfname(procname, sizeof(procname));
1836			const char* filename = vp ? vnode_getname(vp) : NULL;
1837			if (num_extents > 0) {
				xdebug("Missed IO from app %s for file %s (disk block 0x%llx) (intersected, though)", procname, filename?:"unknown", blkno);
			} else {
				xdebug("Missed IO from app %s for file %s (disk block 0x%llx)", procname, filename?:"unknown", blkno);
1841			}
1842			if (filename) {
1843				vnode_putname(filename);
1844			}
1845		}
1846#endif
1847		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1848		goto bypass;
1849	}
1850
1851	assert(num_extents > 0);
1852
1853	cache_hit = 1;
1854
1855	if (take_detailed_stats) {
1856		BC_ADD_STAT(extent_hits, 1);
1857		if (during_io)
1858			BC_ADD_STAT(hit_duringio, 1);
1859		if (num_extents > 1)
1860			BC_ADD_STAT(hit_multiple, 1);
1861	}
1862
1863	pcontaining_extents = BC_MALLOC(num_extents * sizeof(*pcontaining_extents), M_TEMP, M_WAITOK | M_ZERO);
1864	if (pcontaining_extents == NULL) {
1865		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1866		if (take_detailed_stats)
1867			BC_ADD_STAT(hit_failure, 1);
1868		goto bypass;
1869	}
1870	memcpy(pcontaining_extents, pce, num_extents * sizeof(*pcontaining_extents));
1871
1872	is_ssd = (BC_cache->c_mounts[cm_idx].cm_disk &&
1873			 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD));
1874
1875	UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1876
1877	/* if any extent is aborted, bypass immediately */
1878	status = extents_status(pcontaining_extents, num_extents);
1879	if (status & CE_ABORTED) {
1880		if (take_detailed_stats)
1881			BC_ADD_STAT(hit_aborted, 1);
1882		goto bypass;
1883	}
1884
1885#ifndef EMULATE_ONLY
1886
1887	did_block = 0;
1888	if (! (status & CE_IODONE)) {
1889
1890		if (is_ssd) {
1891			/* Don't delay IOs on SSD since cut-throughs aren't harmful */
1892			unfilled = 1;
1893			goto bypass;
1894		}
1895
1896		if (status & CE_LOWPRI) {
1897			/* Don't wait for low priority extents */
1898			if (take_detailed_stats)
1899				BC_ADD_STAT(strategy_unfilled_lowpri, 1);
1900			cache_hit = 0;
1901			goto bypass;
1902		}
1903
1904		/*
1905		 * wait for all the extents to finish
1906		 */
1907		microtime(&blocked_start_time);
1908		for (ce_idx = 0;
1909			 ce_idx < num_extents;
1910			 ce_idx++) {
1911
1912			ce = pcontaining_extents[ce_idx];
1913			LOCK_EXTENT(ce);
1914			if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) {
1915				did_block = 1;
1916				UNLOCK_CACHE_R();
1917				/*
1918				 * We need to release the cache lock since we will msleep in wait_for_extent.
1919				 * If we don't release the cache read lock and someone wants to add a new playlist,
1920				 * they go to grab the cache write lock, and everybody stalls for the msleep timeout.
1921				 *
1922				 * We are guaranteed that the cache won't disappear during this time since
1923				 * the cleanup code waits for all the strategy calls (this function) to complete
1924				 * by checking BC_cache->c_strategycalls, which we incremented above.
1925				 *
1926				 * The cache may have grown behind our backs, however. Mount pointers are not valid,
1927				 * but mount indexes still are. Extent pointers are still valid since the extent list
1928				 * is not modified (this is a requirement to be able to msleep with the extent's lock).
1929				 */
1930				wait_for_extent(ce, blocked_start_time);
1931
1932				/* Cache lock must be grabbed without holding any locks to avoid deadlock rdar://8626772 */
1933				UNLOCK_EXTENT(ce);
1934				LOCK_CACHE_R();
1935				LOCK_EXTENT(ce);
1936
1937				if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) {
1938					/* strategy call timed out waiting for the extent */
1939					if (take_detailed_stats)
1940						BC_ADD_STAT(strategy_timedout, 1);
1941#ifdef BC_DEBUG
1942					microtime(&blocked_end_time);
1943					timersub(&blocked_end_time,
1944							 &blocked_start_time,
1945							 &blocked_end_time);
1946					char procname[128];
1947					proc_selfname(procname, sizeof(procname));
1948					const char* filename = vp ? vnode_getname(vp) : NULL;
1949					debug("Strategy routine timed out app %s waiting on file %s (disk offset 0x%llx) after %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000);
1950					if (filename) {
1951						vnode_putname(filename);
1952					}
1953#endif
1954					goto bypass;
1955				}
1956				UNLOCK_EXTENT(ce);
1957				ce = NULL;
1958
1959				/* Check every time we sleep so we can avoid more sleeping if unnecessary */
1960				status = extents_status(pcontaining_extents, num_extents);
1961				if (status & CE_ABORTED) {
1962					if (take_detailed_stats)
1963						BC_ADD_STAT(hit_aborted, 1);
1964					goto bypass;
1965				} else if (status & CE_IODONE) {
1966					break;
1967				}
1968			} else {
1969				UNLOCK_EXTENT(ce);
1970				ce = NULL;
1971			}
1972		}
1973		if (take_detailed_stats && did_block) {
1974			BC_ADD_STAT(strategy_blocked, 1);
1975			microtime(&blocked_end_time);
1976			timersub(&blocked_end_time,
1977					 &blocked_start_time,
1978					 &blocked_end_time);
1979			int ms_blocked = (blocked_end_time.tv_sec * 1000) + (blocked_end_time.tv_usec / (1000));
1980#ifdef BC_DEBUG
1981			char procname[128];
1982			proc_selfname(procname, sizeof(procname));
1983			const char* filename = vp ? vnode_getname(vp) : NULL;
1984			debug("Strategy routine blocked app %s waiting on file %s (disk offset 0x%llx) for %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000);
1985			if (filename) {
1986				vnode_putname(filename);
1987			}
1988#endif
1989			BC_ADD_STAT(strategy_time_blocked, ms_blocked);
1990			if (BC_cache->c_stats.ss_strategy_time_longest_blocked < ms_blocked) {
1991				BC_cache->c_stats.ss_strategy_time_longest_blocked = ms_blocked;
1992			}
1993		}
1994	}
1995
1996	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (did_block ? DBG_BC_IO_HIT_STALLED : DBG_BC_IO_HIT)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), 0, 0, 0, 0);
1997
1998	/*
1999	 * buf_map will do its best to provide access to this
2000	 * buffer via a kernel accessible address
2001	 * if it fails, we can still fall through.
2002	 */
2003	if (buf_map(bp, &buf_map_p)) {
2004		/* can't map, let someone else worry about it */
2005		if (take_detailed_stats)
2006			BC_ADD_STAT(hit_failure, 1);
2007		goto bypass;
2008	}
2009	buf_extent_p = buf_map_p;
2010
2011	kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
2012	if (kret != KERN_SUCCESS) {
2013		debug("vnode_getwithvid failed - %d", kret);
2014		if (take_detailed_stats)
2015			BC_ADD_STAT(hit_failure, 1);
2016		buf_unmap(bp);
2017		goto bypass;
2018	}
2019
2020	/*
2021	 * fill in bp from the extents
2022	 */
2023	for (ce_idx = 0;
2024		 ce_idx < num_extents;
2025		 ce_idx++) {
2026		u_int64_t nbytes_thisextent;
2027		u_int64_t diskoffset_thisextent;
2028		off_t     cacheoffset_thisextent;
2029
2030		ce = pcontaining_extents[ce_idx];
2031		LOCK_EXTENT(ce);
2032
2033		/*
2034		 * Check that the extent wasn't aborted while we were unlocked.
2035		 */
2036		if (ce->ce_flags & CE_ABORTED) {
2037			if (take_detailed_stats)
2038				BC_ADD_STAT(hit_aborted, 1);
2039			vnode_put(BC_cache->c_vp);
2040			buf_unmap(bp);
2041			goto bypass;
2042		}
2043
2044		assert(ce->ce_flags & CE_IODONE);
2045
2046		diskoffset_thisextent  = MAX(ce->ce_diskoffset, disk_offset);
2047		nbytes_thisextent      = MIN(disk_offset + nbytes, ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent;
2048		cacheoffset_thisextent = ce->ce_cacheoffset + (diskoffset_thisextent - ce->ce_diskoffset);
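
		/*
		 * Illustrative example of the clipping arithmetic above (not from the
		 * original code): suppose this is a non-first extent (ce_idx > 0) for a
		 * request at disk_offset 0x1000 with nbytes 0x3000, and the extent has
		 * ce_diskoffset 0x2000, ce_length 0x4000, ce_cacheoffset 0x9000:
		 *   diskoffset_thisextent  = MAX(0x2000, 0x1000)          = 0x2000
		 *   nbytes_thisextent      = MIN(0x4000, 0x6000) - 0x2000 = 0x2000
		 *   cacheoffset_thisextent = 0x9000 + (0x2000 - 0x2000)   = 0x9000
		 * i.e. we copy only the portion of the request this extent covers,
		 * starting at the matching offset within the cache buffer.
		 */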
2049
2050		/* range check against extent */
2051		assert(diskoffset_thisextent  <  ce->ce_diskoffset + ce->ce_length);
2052		assert(diskoffset_thisextent  >= ce->ce_diskoffset);
2053		assert(nbytes_thisextent      <= (ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent);
2054		assert(cacheoffset_thisextent <  ce->ce_cacheoffset + ce->ce_length);
2055		assert(cacheoffset_thisextent >= ce->ce_cacheoffset);
2056		assert(nbytes_thisextent      <= (ce->ce_cacheoffset + ce->ce_length) - cacheoffset_thisextent);
2057
2058		/* range check against buf_t */
2059		assert(diskoffset_thisextent    <  disk_offset + nbytes);
2060		assert(diskoffset_thisextent    >= disk_offset);
2061		assert(nbytes_thisextent        <= (disk_offset + nbytes) - diskoffset_thisextent);
2062
2063		/* check our buf_map pointer */
2064		assert(buf_extent_p - buf_map_p == diskoffset_thisextent - disk_offset); /* didn't skip any bytes */
2065		assert(buf_map_p + nbytes       >= buf_extent_p + nbytes_thisextent); /* not overflowing the buffer */
2066
2067		/* Make sure we're including the entire buffer requested */
2068		assert(ce_idx > 0 || disk_offset == diskoffset_thisextent); /* include the first byte */
2069		assert(ce_idx < (num_extents - 1) || disk_offset + nbytes == diskoffset_thisextent + nbytes_thisextent); /* include the last byte */
2070
2071		/*
2072		 * Check that the requested blocks are in the buffer.
2073		 */
2074		if (!BC_blocks_present(ce,
2075							   CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, diskoffset_thisextent - ce->ce_diskoffset),
2076							   CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, nbytes_thisextent))) {
2077			if (take_detailed_stats)
2078				BC_ADD_STAT(hit_blkmissing, 1);
2079			vnode_put(BC_cache->c_vp);
2080			buf_unmap(bp);
2081			goto bypass;
2082		}
2083
2084		resid = nbytes_thisextent;
2085		uio = uio_create(1, cacheoffset_thisextent, UIO_SYSSPACE, UIO_READ);
2086		if (uio == NULL) {
2087			debug("couldn't allocate uio");
2088			if (take_detailed_stats)
2089				BC_ADD_STAT(hit_failure, 1);
2090			vnode_put(BC_cache->c_vp);
2091			buf_unmap(bp);
2092			goto bypass;
2093		}
2094
2095		kret = uio_addiov(uio, CAST_USER_ADDR_T(buf_extent_p), nbytes_thisextent);
2096		if (kret != KERN_SUCCESS) {
2097			debug("couldn't add iov to uio - %d", kret);
2098			if (take_detailed_stats)
2099				BC_ADD_STAT(hit_failure, 1);
2100			vnode_put(BC_cache->c_vp);
2101			buf_unmap(bp);
2102			goto bypass;
2103		}
2104
2105		kret = cluster_copy_ubc_data(BC_cache->c_vp, uio, &resid, 0);
2106		if (kret != KERN_SUCCESS) {
2107			debug("couldn't copy ubc data - %d", kret);
2108			if (take_detailed_stats)
2109				BC_ADD_STAT(hit_failure, 1);
2110			vnode_put(BC_cache->c_vp);
2111			buf_unmap(bp);
2112			goto bypass;
2113		}
2114
2115		if (resid != 0) {
2116			/* blocks have been stolen for pageout or contiguous memory */
2117			if (take_detailed_stats)
2118				BC_ADD_STAT(hit_stolen, 1);
2119			vnode_put(BC_cache->c_vp);
2120			buf_unmap(bp);
2121			is_stolen = 1;
2122			goto bypass;
2123		}
2124
2125		buf_extent_p += nbytes_thisextent;
2126
2127		/* discard blocks we have touched */
2128		discards += BC_discard_bytes(ce, disk_offset, nbytes_thisextent);
2129
2130		UNLOCK_EXTENT(ce);
2131		ce = NULL;
2132
2133		/* copy was successful */
2134		uio_free(uio);
2135		uio = NULL;
2136	}
2137
2138	vnode_put(BC_cache->c_vp);
2139
2140	buf_setresid(bp, 0);
2141
2142	/* buf_unmap will take care of all cases */
2143	buf_unmap(bp);
2144
2145	/* complete the request */
2146	buf_biodone(bp);
2147
2148	/* can't dereference bp after a buf_biodone has been issued */
2149
2150#else //ifndef EMULATE_ONLY
2151	(void) kret;
2153	(void) resid;
2154
2155	/* discard blocks we have touched */
2156	for (ce_idx = 0;
2157		 ce_idx < num_extents;
2158		 ce_idx++) {
2159		ce = pcontaining_extents[ce_idx];
2160		LOCK_EXTENT(ce);
2161		discards += BC_discard_bytes(ce, disk_offset, nbytes);
2162		UNLOCK_EXTENT(ce);
2163		ce = NULL;
2164	}
2165
2166	/* send the IO to the disk, emulate hit in statistics */
2167	BC_cache->c_strategy(bp);
2168
2169#endif //ifndef EMULATE_ONLY else
2170
2171	if (take_detailed_stats) {
2172		BC_ADD_STAT(hit_bytes, discards);
2173		if (cm_idx < STAT_MOUNTMAX) {
2174			BC_ADD_STAT(hit_bytes_m[cm_idx], discards);
2175		}
2176		if (dont_cache) {
2177			BC_ADD_STAT(strategy_hit_nocache, 1);
2178			BC_ADD_STAT(hit_nocache_bytes, discards);
2179		}
2180		if (is_shared) {
2181			BC_ADD_STAT(hit_shared_bytes, discards);
2182		}
2183	} else {
2184		BC_ADD_STAT(hit_bytes_afterhistory, discards);
2185	}
2186
2187	BC_cache->c_num_ios_since_last_hit = 0;
2188
2189	/* we are not busy anymore */
2190	OSAddAtomic(-1, &BC_cache->c_strategycalls);
2191	UNLOCK_CACHE_R();
2192
2193	_FREE_ZERO(pcontaining_extents, M_TEMP);
2194
2195	if (! dont_cache) {
2196		/* record successful fulfilment (may block) */
2197		BC_add_history(blkno, nbytes, pid, 1, 0, 0, is_shared, dev);
2198	}
2199
2200	/*
2201	 * spec_strategy wants to know if the read has been
2202	 * satisfied by the boot cache in order to avoid
2203	 * throttling the thread unnecessarily. spec_strategy
2204	 * will check for the special return value 0xcafefeed
2205	 * to indicate that the read was satisfied by the
2206	 * cache.
2207	 */
2208
2209#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
2210	return IO_SATISFIED_BY_CACHE;
2211
2212bypass:
2213	if (ce != NULL)
2214		UNLOCK_EXTENT(ce);
2215	if (uio != NULL)
2216		uio_free(uio);
2217	_FREE_ZERO(pcontaining_extents, M_TEMP);
2218
2219	/* pass the request on */
2220	BC_cache->c_strategy(bp);
2221
2222	/*
2223	 * can't dereference bp after c_strategy has been issued
2224	 * or else we race with buf_biodone
2225	 */
2226	void *bp_void = (void *)bp; // for use in ktrace
2227	bp = NULL;
2228
2229	/* not really "bypassed" if the cache is not active */
2230	if (during_cache) {
2231		if (cm_idx != -1) {
2232			LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
2233			if (BC_cache->c_mounts[cm_idx].cm_state == CM_READY) {
2234				discards += BC_handle_discards(BC_cache->c_mounts + cm_idx, disk_offset, nbytes);
2235			}
2236
2237			/* Check if we should throttle this IO */
2238			if (BC_cache->c_readahead_throttles_cutthroughs &&
2239				!is_swap &&
2240				BC_cache->c_mounts[cm_idx].cm_disk &&
2241				(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) &&
2242				!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI) &&
2243				!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD)) {
2244				/* We're currently issuing readahead for this disk.
2245				 * Throttle this IO so we don't cut-through the readahead so much.
2246				 */
2247				should_throttle = 1;
2248			}
2249
2250			UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
2251		}
2252		if (take_detailed_stats) {
2253			BC_ADD_STAT(strategy_bypassed, 1);
2254			if (during_io) {
2255				BC_ADD_STAT(strategy_bypass_duringio, 1);
2256			}
2257			if (dont_cache) {
2258				BC_ADD_STAT(strategy_bypass_nocache, 1);
2259				BC_ADD_STAT(bypass_nocache_bytes, nbytes);
2260				if (during_io) {
2261					BC_ADD_STAT(strategy_bypass_duringio_nocache, 1);
2262				}
2263			}
2264			if (cache_hit) {
2265				BC_ADD_STAT(error_discards, discards);
2266			} else if (dont_cache) {
2267				BC_ADD_STAT(bypass_nocache_discards, discards);
2268			} else if (bufflags & B_READ) {
2269				BC_ADD_STAT(read_discards, discards);
2270			} else {
2271				BC_ADD_STAT(write_discards, discards);
2272			}
2273		} else {
2274			BC_ADD_STAT(lost_bytes_afterhistory, discards);
2275		}
2276	}
2277
2278	if (during_cache) {
2279		OSAddAtomic(-1, &BC_cache->c_strategycalls);
2280		UNLOCK_CACHE_R();
2281	}
2282
2283	if (! is_swap) {
2284		if (! dont_cache) {
2285			is_root_disk = BC_add_history(blkno, nbytes, pid, 0, ((bufflags & B_READ) ? 0 : 1), 0, is_shared, dev);
2286
2287			if (take_detailed_stats && during_io && is_root_disk) {
2288				if (cache_hit) {
2289					if (unfilled) {
2290						BC_ADD_STAT(strategy_bypass_duringio_unfilled, 1);
2291					} else {
2292						BC_ADD_STAT(strategy_bypass_duringio_rootdisk_failure, 1);
2293					}
2294				} else if (bufflags & B_READ) {
2295					BC_ADD_STAT(strategy_bypass_duringio_rootdisk_read, 1);
2296				} else {
2297					BC_ADD_STAT(strategy_bypass_duringio_rootdisk_nonread, 1);
2298				}
2299			}
2300		}
2301	}
2302
2303	/*
2304	 * Check to make sure that we're not missing the cache
2305	 * too often.  If we are, we're no longer providing a
2306	 * performance win and the best thing would be to give
2307	 * up.
2308	 *
2309	 * Don't count throttled IOs since those aren't time
2310	 * critical. (We don't want to jettison the cache just
2311	 * because spotlight is indexing)
2312	 */
2313
2314	/* if this is a read, and we do have an active cache, and the read isn't throttled */
2315	if (during_cache) {
2316		(void) is_stolen;
2317		if (is_swap /*|| is_stolen*/) {  //rdar://10651288&10658086 seeing stolen pages early during boot
2318			if (is_swap) {
2319				debug("detected %s swap file, jettisoning cache", (bufflags & B_READ) ? "read from" : "write to");
2320			} else {
2321				debug("detected stolen page, jettisoning cache");
2322			}
2323			//rdar://9858070 Do this asynchronously to avoid deadlocks
2324			BC_terminate_cache_async();
2325		} else if ((bufflags & B_READ) &&
2326				   !(throttle_tier)) {
2327
2328			struct timeval current_time;
2329			if (BC_cache->c_stats.ss_history_num_recordings < 2) {
2330				microtime(&current_time);
2331				timersub(&current_time,
2332						 &BC_cache->c_loadtime,
2333						 &current_time);
2334			}
2335			/* Don't start counting misses until we've started login or hit our prelogin timeout */
2336			if (BC_cache->c_stats.ss_history_num_recordings >= 2 || current_time.tv_sec > BC_chit_prelogin_timeout_seconds) {
2337
2338				/* increase number of IOs since last hit */
2339				OSAddAtomic(1, &BC_cache->c_num_ios_since_last_hit);
2340
2341				if (BC_cache->c_num_ios_since_last_hit >= BC_chit_max_num_IOs_since_last_hit) {
2342					/*
2343					 * The hit rate is poor, shut the cache down.
2344					 */
2345					debug("hit rate below threshold (0 hits in the last %u lookups), jettisoning cache",
2346						  BC_cache->c_num_ios_since_last_hit);
2347					//rdar://9858070 Do this asynchronously to avoid deadlocks
2348					BC_terminate_cache_async();
2349				}
2350			}
2351		}
2352	}
2353
2354	if (is_swap && (! (BC_cache->c_flags & BC_FLAG_SHUTDOWN))) {
2355		/* if we're swapping, stop readahead */
2356		debug("Detected %s swap file. Terminating readahead", (bufflags & B_READ) ? "read from" : "write to");
2357		BC_set_flag(BC_FLAG_SHUTDOWN);
2358	}
2359
2360	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (should_throttle ? DBG_BC_IO_MISS_CUT_THROUGH : DBG_BC_IO_MISS)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp_void), 0, 0, 0, 0);
2361
2362	if (should_throttle && throttle_tier < IOPOL_THROTTLE) {
2363		/*
2364		 * We need to indicate to spec_strategy that we want to
2365		 * throttle this IO to avoid cutting through readahead
2366		 * too much. spec_strategy will check for the special
2367		 * return value 0xcafebeef to indicate that the IO
2368		 * should be throttled.
2369		 */
2370
2371		BC_ADD_STAT(strategy_forced_throttled, 1);
2372
2373#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
2374		return IO_SHOULD_BE_THROTTLED;
2375	}
2376
2377	return 0;
2378}
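
/*
 * A minimal sketch (assumed, not part of this file) of how the special
 * return values above might be consumed by the layer that calls into this
 * strategy routine; the variable name strategy_ret is hypothetical and the
 * constants mirror the #defines in BC_strategy:
 *
 *	int strategy_ret = BC_strategy(bp);
 *	if (strategy_ret == (int)0xcafefeed) {
 *		// satisfied from the cache: skip I/O throttling for this thread
 *	} else if (strategy_ret == (int)0xcafebeef) {
 *		// missed and cut through readahead: throttle this thread
 *	}
 *	// any other value: the request was forwarded or completed normally
 */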
2379
2380/*
2381 * Handle a block range that needs to be discarded.
2382 *
2383 * Called with the cache mount read lock held
2384 *
2385 * Returns the number of bytes discarded
2386 */
2387static int
2388BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length)
2389{
2390	struct BC_cache_extent **pce, **p;
2391	int count, total_discards;
2392
2393	total_discards = 0;
2394
2395	/*
2396	 * Look for an extent that we overlap.
2397	 */
2398	if ((pce = BC_find_extent(cm, offset, length, 0, NULL)) == NULL)
2399		return 0;		/* doesn't affect us */
2400
2401	/*
2402	 * Discard bytes in the matched extent.
2403	 */
2404	LOCK_EXTENT(*pce);
2405	count = BC_discard_bytes((*pce), offset, length);
2406	UNLOCK_EXTENT(*pce);
2407	total_discards += count;
2408
2409	/*
2410	 * Scan adjacent extents for possible overlap and discard there as well.
2411	 */
2412	p = pce - 1;
2413	while (p >= cm->cm_pextents &&
2414		   BC_check_intersection((*p), offset, length)) {
2415		LOCK_EXTENT(*p);
2416		count = BC_discard_bytes((*p), offset, length);
2417		UNLOCK_EXTENT(*p);
2418		if (count == 0)
2419			break;
2420		total_discards += count;
2421		p--;
2422	}
2423	p = pce + 1;
2424	while (p < (cm->cm_pextents + cm->cm_nextents) &&
2425		   BC_check_intersection((*p), offset, length)) {
2426		LOCK_EXTENT(*p);
2427		count = BC_discard_bytes((*p), offset, length);
2428		UNLOCK_EXTENT(*p);
2429		if (count == 0)
2430			break;
2431		total_discards += count;
2432		p++;
2433	}
2434
2435	return total_discards;
2436}
2437
2438/*
2439 * Shut down readahead operations.
2440 */
2441static int
2442BC_terminate_readahead(void)
2443{
2444	int error;
2445	struct timespec timeout;
2446	timeout.tv_sec = 10;
2447	timeout.tv_nsec = 0;
2448
2449	/*
2450	 * Signal the readahead thread to terminate, and wait for
2451	 * it to comply.  If this takes > 10 seconds, give up.
2452	 */
2453	BC_set_flag(BC_FLAG_SHUTDOWN);
2454
2455	/*
2456	 * If readahead is still in progress, we have to shut it down
2457	 * cleanly.  This is an expensive process, but since readahead
2458	 * will normally complete long before the reads it tries to serve
2459	 * complete, it will typically only be invoked when the playlist
	 * is out of sync and the cache hit rate falls below the acceptable
2461	 * threshold.
2462	 *
2463	 * Note that if readahead is aborted, the reader thread will mark
2464	 * the aborted extents and wake up any strategy callers waiting
2465	 * on them, so we don't have to worry about them here.
2466	 */
2467	LOCK_READERS();
2468	while (BC_cache->c_num_reader_threads > 0) {
2469		debug("terminating active readahead");
2470
2471		error = msleep(&BC_cache->c_num_reader_threads, &BC_cache->c_reader_threads_lock, PRIBIO, "BC_terminate_readahead", &timeout);
2472		if (error == EWOULDBLOCK) {
2473			UNLOCK_READERS();
2474
2475			message("timed out waiting for I/O to stop");
2476			if (BC_cache->c_num_reader_threads == 0) {
2477				debug("but I/O has stopped!");
2478			}
2479#ifdef BC_DEBUG
2480 			Debugger("I/O Kit wedged on us");
2481#endif
2482			/*
2483			 * It might be nice to free all the pages that
2484			 * aren't actually referenced by the outstanding
2485			 * region, since otherwise we may be camped on a
2486			 * very large amount of physical memory.
2487			 *
2488			 * Ideally though, this will never happen.
2489			 */
2490			return(EBUSY);	/* really EWEDGED */
2491		}
2492	}
2493	UNLOCK_READERS();
2494
2495	return(0);
2496}
2497
2498static void
2499BC_terminate_cache_thread(void *param0, wait_result_t param1)
2500{
2501	BC_terminate_cache();
2502}
2503
2504/*
 * Start up an auxiliary thread to stop the cache so we avoid potential deadlocks
2506 */
2507static void
2508BC_terminate_cache_async(void)
2509{
2510	if (! (BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
2511		return;
2512	}
2513
2514	int error;
2515	thread_t rthread;
2516
2517	debug("Kicking off thread to terminate cache");
2518	if ((error = kernel_thread_start(BC_terminate_cache_thread, NULL, &rthread)) == KERN_SUCCESS) {
2519		thread_deallocate(rthread);
2520	} else {
2521		message("Unable to start thread to terminate cache");
2522	}
2523}
2524
2525/*
2526 * Terminate the cache.
2527 *
2528 * This prevents any further requests from being satisfied from the cache
2529 * and releases all the resources owned by it.
2530 *
2531 * Must be called with no locks held
2532 */
2533static int
2534BC_terminate_cache(void)
2535{
2536	int retry, cm_idx, j, ce_idx, cel_idx;
2537
2538	BC_terminate_readahead();
2539
2540	/* can't shut down if readahead is still active */
2541	if (BC_cache->c_num_reader_threads > 0) {
2542		debug("cannot terminate cache while readahead is in progress");
2543		return(EBUSY);
2544	}
2545
2546	LOCK_CACHE_R();
2547
2548	LOCK_HANDLERS();
2549	if (!BC_clear_flag(BC_FLAG_CACHEACTIVE)) {
2550		/* can't shut down if we're not active */
2551		debug("cannot terminate cache when not active");
2552		UNLOCK_HANDLERS();
2553		UNLOCK_CACHE_R();
2554		return(ENXIO);
2555	}
2556
2557	bootcache_contains_block = NULL;
2558
2559	debug("terminating cache...");
2560
	/* if we're also no longer recording history, disconnect our strategy routine */
2562	BC_check_handlers();
2563	UNLOCK_HANDLERS();
2564
2565	/*
	 * Mark all extents as aborted. This will notify any sleepers in the
2567	 * strategy routine that the extent won't have data for them.
2568	 */
2569	for (cel_idx = 0;
2570		 cel_idx < BC_cache->c_nextentlists;
2571		 cel_idx++) {
2572		for (ce_idx = 0;
2573			 ce_idx < BC_cache->c_nextents[cel_idx];
2574			 ce_idx++) {
2575			struct BC_cache_extent *ce = BC_cache->c_extentlists[cel_idx] + ce_idx;
2576			LOCK_EXTENT(ce);
2577			/*
2578			 * Track unused bytes
2579			 */
2580			if (ce->ce_blockmap != NULL && BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize != 0) {
2581				for (j = 0; j < howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize); j++) {
2582					if (CB_BLOCK_PRESENT(ce, j))
2583						BC_ADD_STAT(spurious_bytes, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize);
2584				}
2585			}
2586
2587			BC_teardown_extent(ce);
2588			UNLOCK_EXTENT(ce);
2589			wakeup(ce);
2590		}
2591	}
2592
2593	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
2594		struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx;
2595		LOCK_MOUNT_W(cm);
2596		BC_teardown_mount(cm);
2597		UNLOCK_MOUNT_W(cm);
2598	}
2599
2600
2601	/*
2602	 * It is possible that one or more callers are asleep in the
2603	 * strategy routine (eg. if readahead was terminated early,
2604	 * or if we are called off the timeout).
2605	 * Check the count of callers in the strategy code, and sleep
2606	 * until there are none left (or we time out here).  Note that
2607	 * by clearing BC_FLAG_CACHEACTIVE above we prevent any new
2608	 * strategy callers from touching the cache, so the count
2609	 * must eventually drain to zero.
2610	 */
2611	retry = 0;
2612	while (BC_cache->c_strategycalls > 0) {
2613		tsleep(BC_cache, PRIBIO, "BC_terminate_cache", 10);
2614		if (retry++ > 50) {
2615			message("could not terminate cache, timed out with %d caller%s in BC_strategy",
2616					(int) BC_cache->c_strategycalls,
2617					BC_cache->c_strategycalls == 1 ? "" : "s");
2618			UNLOCK_CACHE_R();
2619			return(EBUSY);	/* again really EWEDGED */
2620		}
2621	}
2622
2623	if (! LOCK_CACHE_R_TO_W()) {
2624		/* We shouldn't get here. This is the only LOCK_CACHE_R_TO_W call,
2625		 * so this implies someone is terminating the cache in parallel with us,
		 * but we guarantee exclusivity by clearing BC_FLAG_CACHEACTIVE above.
2627		 */
2628		message("Unable to upgrade cache lock to free resources");
2629		return ENXIO;
2630	}
2631
2632	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
2633		struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx;
2634		if (cm->cm_disk != NULL) {
2635			for (j = cm_idx + 1; j < BC_cache->c_nmounts; j++) {
2636				if (BC_cache->c_mounts[j].cm_disk == cm->cm_disk) {
2637					BC_cache->c_mounts[j].cm_disk = NULL;
2638				}
2639			}
2640			BC_teardown_disk(cm->cm_disk);
2641			lck_mtx_destroy(&cm->cm_disk->cd_lock, BC_cache->c_lckgrp);
2642			_FREE_ZERO(cm->cm_disk, M_TEMP);
2643		}
2644	}
2645
2646	BC_free_pagebuffer();
2647	BC_cache->c_cachesize = 0;
2648
2649	/* free memory held by extents and mounts */
2650	for (cel_idx = 0;
2651		 cel_idx < BC_cache->c_nextentlists;
2652		 cel_idx++) {
2653		for (ce_idx = 0;
2654			 ce_idx < BC_cache->c_nextents[cel_idx];
2655			 ce_idx++) {
2656			lck_mtx_destroy(&BC_cache->c_extentlists[cel_idx][ce_idx].ce_lock, BC_cache->c_lckgrp);
2657		}
2658		_FREE_ZERO(BC_cache->c_extentlists[cel_idx], M_TEMP);
2659	}
2660	_FREE_ZERO(BC_cache->c_extentlists, M_TEMP);
2661	_FREE_ZERO(BC_cache->c_nextents, M_TEMP);
2662	BC_cache->c_nextentlists = 0;
2663
2664	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
2665		lck_rw_destroy(&BC_cache->c_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
2666	}
2667	_FREE_ZERO(BC_cache->c_mounts, M_TEMP);
2668	BC_cache->c_nmounts = 0;
2669
2670	UNLOCK_CACHE_W();
2671
2672	/* record stop time */
2673	struct timeval endtime;
2674	microtime(&endtime);
2675	timersub(&endtime,
2676			 &BC_cache->c_cache_starttime,
2677			 &endtime);
2678	BC_cache->c_stats.ss_cache_time += (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000;
2679	return(0);
2680}
2681
2682/*
2683 * Terminate history recording.
2684 *
2685 * This stops us recording any further history events.
2686 */
2687static int
2688BC_terminate_history(void)
2689{
2690	struct BC_history_mount_device  *hmd;
2691	LOCK_HANDLERS();
2692	if (!BC_clear_flag(BC_FLAG_HISTORYACTIVE)) {
2693		/* can't shut down if we're not active */
2694		debug("cannot terminate history when not active");
2695		UNLOCK_HANDLERS();
2696		return(ENXIO);
2697	}
2698
2699	debug("terminating history collection...");
2700
	/* if the cache is also no longer active, disconnect our strategy routine */
2702	BC_check_handlers();
2703	UNLOCK_HANDLERS();
2704
2705	/*
2706	 * Kill the timeout handler; we don't want it going off
2707	 * now.
2708	 */
2709	untimeout(BC_timeout_history, NULL);
2710
2711	if (BC_cache->c_take_detailed_stats) {
2712		BC_cache->c_stats.ss_history_mount_no_uuid = 0;
2713		for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next)
2714			if (uuid_is_null(hmd->hmd_mount.hm_uuid))
2715				BC_ADD_STAT(history_mount_no_uuid, 1);
2716	}
2717
2718	/* record stop time */
2719	if (BC_cache->c_take_detailed_stats) {
2720		struct timeval endtime;
2721		/* record stop time */
2722		microtime(&endtime);
2723		timersub(&endtime,
2724				 &BC_cache->c_history_starttime,
2725				 &endtime);
2726		BC_ADD_STAT(history_time, (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000);
2727	}
2728
2729	BC_cache->c_take_detailed_stats = 0;
2730
2731	return(0);
2732}
2733
2734/*
2735 * Check our strategy handlers and make sure they are set/cleared depending on our current state.
2736 *
2737 * Should be called after changing BC_FLAG_CACHEACTIVE or BC_FLAG_HISTORYACTIVE.
2738 * Called with the handlers lock held.
2739 */
2740static void
2741BC_check_handlers(void)
2742{
2743	if (BC_cache->c_devsw == NULL ||
2744		BC_cache->c_strategy == NULL ||
2745		BC_cache->c_close == NULL) {
		debug("Cannot check device handlers: cache not set up properly");
2747		return;
2748	}
2749
2750	/* if the cache or history recording is active, make sure we've captured the appropriate device handler routines */
2751	if ((BC_cache->c_flags & BC_FLAG_CACHEACTIVE) ||
2752		(BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
2753
2754		if (BC_cache->c_devsw->d_strategy != (strategy_fcn_t*) BC_strategy ||
2755			BC_cache->c_devsw->d_close    != BC_close) {
2756
2757			debug("connecting handlers...");
2758
2759			/* connect the strategy and close routines */
2760			BC_cache->c_devsw->d_strategy = (strategy_fcn_t*) BC_strategy;
2761			BC_cache->c_devsw->d_close    = BC_close;
2762		}
2763	} else {
2764		if (BC_cache->c_devsw->d_strategy != BC_cache->c_strategy ||
2765			BC_cache->c_devsw->d_close    != BC_cache->c_close) {
2766
2767			debug("disconnecting handlers...");
2768
2769			/* disconnect the strategy and close routines */
2770			BC_cache->c_devsw->d_strategy = BC_cache->c_strategy;
2771			BC_cache->c_devsw->d_close    = BC_cache->c_close;
2772		}
2773	}
2774}
2775
2776/*
2777 * Setup a cache extent with the parameters given by the playlist entry
2778 *
2779 * Returns 0 on success
2780 */
2781static int
2782BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx)
2783{
2784	lck_mtx_init(&ce->ce_lock, BC_cache->c_lckgrp,
2785				 LCK_ATTR_NULL);
2786	ce->ce_diskoffset = pe->pe_offset;
2787	ce->ce_mount_idx = cache_mount_idx;
2788	ce->ce_length = pe->pe_length;
2789#ifdef IGNORE_BATCH
2790	ce->ce_batch = 0;
2791#else
2792	ce->ce_batch = pe->pe_batch;
2793#endif
2794	ce->ce_batch += batch_adjustment;
2795	if (ce->ce_batch > BC_MAXBATCHES) {
2796		ce->ce_batch = BC_MAXBATCHES; /* cap batch number */
2797	}
2798	ce->ce_cacheoffset = 0;
2799	ce->ce_blockmap = NULL;
2800	ce->ce_flags = 0;
2801	if (pe->pe_flags & BC_PE_LOWPRIORITY) {
2802		ce->ce_flags |= CE_LOWPRI;
2803	}
2804	if (pe->pe_flags & BC_PE_SHARED) {
2805		ce->ce_flags |= CE_SHARED;
2806	}
2807
2808	/* track highest batch number for this playlist */
2809	if (ce->ce_batch >= BC_cache->c_batch_count) {
2810		BC_cache->c_batch_count = ce->ce_batch + 1;
2811		// debug("Largest batch is now %d from extent with disk offset %llu", BC_cache->c_batch_count, ce->ce_diskoffset);
2812	}
2813
2814	return 0;
2815}
2816
2817/*
2818 * The blocksize is initialised from the first playlist read, the statistics
2819 * structure, or it can be pre-set by the caller.  Once set, only playlists with
2820 * matching sizes can be read/written.
2821 */
2822#define	BLOCK_ROUNDDOWN(cm, x)	(((x) / (cm)->cm_blocksize) * (cm)->cm_blocksize)
2823#define BLOCK_ROUNDUP(cm, x)	((((x) + ((cm)->cm_blocksize - 1)) / (cm)->cm_blocksize) * (cm)->cm_blocksize)
2824/*
2825 * optional for power-of-two blocksize roundup:
2826 * (((x) + ((cm)->cm_blocksize - 1)) & (~((cm)->cm_blocksize - 1)))
2827 */
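
/*
 * Worked example (illustrative) of the macros above: with cm_blocksize = 4096,
 *   BLOCK_ROUNDDOWN(cm, 10000) = (10000 / 4096) * 4096          = 8192
 *   BLOCK_ROUNDUP(cm, 10000)   = ((10000 + 4095) / 4096) * 4096 = 12288
 * so an arbitrary byte range is widened to whole device blocks, which is how
 * BC_fill_in_extent below aligns ce_diskoffset and ce_length.
 */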
2828
2829/*
2830 * Fill in a cache extent now that its mount has been filled in
2831 *
2832 * Called with the extent lock held
2833 *
2834 * Returns 0 on success
2835 */
2836static int
2837BC_fill_in_extent(struct BC_cache_extent *ce)
2838{
2839	int numblocks, roundsize, i;
2840	u_int64_t end;
2841
2842	if (ce->ce_flags & CE_ABORTED ||
2843		ce->ce_length == 0) {
2844		return 1;
2845	}
2846
2847	struct BC_cache_mount *cm = BC_cache->c_mounts + ce->ce_mount_idx;
2848
2849	int blocksize = cm->cm_blocksize;
2850
2851	if (0 == blocksize) {
2852		return 1;
2853	}
2854
2855	roundsize = roundup(ce->ce_length, PAGE_SIZE);
2856
2857	/* make sure we're on block boundaries */
2858	end = ce->ce_diskoffset + roundsize;
2859	ce->ce_diskoffset = BLOCK_ROUNDDOWN(cm, ce->ce_diskoffset);
2860	ce->ce_length = BLOCK_ROUNDUP(cm, end) - ce->ce_diskoffset;
2861
	/* make sure we didn't grow our pagebuffer size since it's already been allocated */
2863	if (roundup(ce->ce_length, PAGE_SIZE) > roundsize) {
		debug("Clipped extent %llu by a page", ce->ce_diskoffset);
2865		BC_ADD_STAT(extents_clipped, 1);
2866		ce->ce_length = roundsize;
2867	}
2868
2869	numblocks = howmany(ce->ce_length, blocksize);
2870
2871	ce->ce_blockmap = BC_MALLOC(howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)),
2872							  M_TEMP, M_WAITOK | M_ZERO);
2873	if (!ce->ce_blockmap) {
2874		message("can't allocate extent blockmap for %d blocks of %d bytes", numblocks, howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)));
2875		return 1;
2876	}
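
	/*
	 * Sizing note (assuming CB_MAPFIELDBITS is the number of bits per map
	 * field and CB_MAPFIELDBYTES its size in bytes): their ratio is
	 * bits-per-byte, so the allocation above is howmany(numblocks, 8) bytes,
	 * i.e. one blockmap bit per cache block, rounded up to a whole byte.
	 */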
2877
2878	for (i = 0; i < howmany(ce->ce_length, blocksize); i++) {
2879		CB_MARK_BLOCK_PRESENT((ce), i);
2880	}
2881
2882	return 0;
2883}
2884
2885static void
2886BC_teardown_extent(struct BC_cache_extent *ce)
2887{
2888	ce->ce_flags |= CE_ABORTED;
2889	_FREE_ZERO(ce->ce_blockmap, M_TEMP);
2890}
2891
2892/*
2893 * Setup a cache disk.
2894 * Returns 0 on success
2895 */
2896static int
2897BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd)
2898{
2899	static int next_disk_num;
2900	lck_mtx_init(&cd->cd_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
2901	cd->cd_disk_id = disk_id;
2902	cd->cd_disk_num = next_disk_num++;
2903	cd->cd_flags = 0;
2904	cd->cd_nmounts = 0;
2905	cd->cd_batch = 0;
2906
2907	if (is_ssd) {
2908		cd->cd_flags |= CD_IS_SSD;
2909	}
2910
2911	debug("Setup disk 0x%llx as disk num %d%s", disk_id, cd->cd_disk_num, (cd->cd_flags & CD_IS_SSD) ? " (ssd)" : "");
2912
2913	return (0);
2914}
2915
2916/*
2917 * Teardown a cache disk.
2918 */
2919static void
2920BC_teardown_disk(struct BC_cache_disk *cd)
2921{
2922	/* Nothing to do */
2923}
2924
2925
2926/*
2927 * Check if a mount has become available for readahead
2928 *
2929 * If so, make sure the reader thread picks up this mount's IOs.
2930 * If there is no reader thread for the mount's disk, kick one off.
2931 *
2932 * Called with the cache mount read lock held
2933 */
2934static void
2935BC_mount_available(struct BC_cache_mount *cm)
2936{
2937#ifdef EMULATE_ONLY
2938	int i;
2939	for (i = 0; i < cm->cm_nextents; i++)
2940			cm->cm_pextents[i]->ce_flags |= CE_IODONE;
2941#else
2942	int i, error;
2943	thread_t rthread;
2944	if (cm->cm_state != CM_READY) {
2945		/* not ready */
2946		return;
2947	}
2948
2949	struct BC_cache_disk *cd = cm->cm_disk;
2950	LOCK_DISK(cd);
2951	cd->cd_nmounts++;
2952	LOCK_READERS();
2953	if (!(BC_cache->c_flags & BC_FLAG_SHUTDOWN)) {
2954		if (cd->cd_flags & CD_HAS_THREAD) {
			/* Make sure the thread is not issuing low-priority IO */
2956			if (cd->cd_flags & CD_ISSUE_LOWPRI) {
2957				debug("Interrupting low-priority thread for new mount %s", uuid_string(cm->cm_uuid));
2958				cd->cd_flags &= (~CD_ISSUE_LOWPRI);
2959				/* TODO: Unthrottle the readahead thread here rather than waiting out its current throttle delay */
2960			}
2961			UNLOCK_READERS();
2962			UNLOCK_DISK(cd);
2963			return;
2964		}
2965
2966		/* Make sure we issue regular IOs for the new playlist in case we were issuing low-priority previously */
2967		cd->cd_flags &= (~CD_ISSUE_LOWPRI);
2968
2969		debug("Kicking off reader thread for disk %d for mount %s", cd->cd_disk_num, uuid_string(cm->cm_uuid));
2970		if ((error = kernel_thread_start(BC_reader_thread, cd, &rthread)) == KERN_SUCCESS) {
2971			thread_deallocate(rthread);
2972			BC_cache->c_num_reader_threads++;
2973			BC_ADD_STAT(readahead_threads, 1);
2974			cd->cd_flags |= CD_HAS_THREAD;
2975			UNLOCK_READERS();
2976			UNLOCK_DISK(cd);
2977			return;
2978		}
2979
2980		message("Unable to start reader thread for disk %d: %d", cd->cd_disk_num, error);
2981	}
2982	UNLOCK_READERS();
2983	UNLOCK_DISK(cd);
2984
2985	/*
2986	 * Getting here indicates some failure.
2987	 *
2988	 * Mark all extents as aborted. This will notify any sleepers in the
2989	 * strategy routine that the extent won't have data for them.
2990	 */
2991	for (i = 0; i < cm->cm_nextents; i++) {
2992		LOCK_EXTENT(cm->cm_pextents[i]);
2993		BC_teardown_extent(cm->cm_pextents[i]);
2994		UNLOCK_EXTENT(cm->cm_pextents[i]);
2995		wakeup(cm->cm_pextents[i]);
2996	}
2997#endif
2998}
2999
3000/*
3001 * Setup a cache mount from the playlist mount.
3002 *
3003 * Allocates cm_pextents large enough to hold pm->pm_nentries extents,
3004 * but leaves cm_nextents 0 since the array hasn't been initialized.
3005 *
3006 * Returns 0 on success
3007 */
3008static int
3009BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm)
3010{
3011	int error = 0;
3012
3013	lck_rw_init(&cm->cm_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
3014	uuid_copy(cm->cm_uuid, pm->pm_uuid);
3015
3016	/* These will be set once we've detected that this volume has been mounted */
3017	cm->cm_dev = nulldev();
3018	cm->cm_bp = NULL;
3019	cm->cm_devsize = 0;
3020	cm->cm_maxread = 0;
3021	cm->cm_disk = NULL;
3022	cm->cm_blocksize = 0;
3023	cm->cm_nextents = 0;
3024
3025	/* Allocate the sorted extent array.
3026	 * We'll fill it in while we setup the extents.
3027	 */
3028	if (pm->pm_nentries == 0) {
		message("Playlist included mount with 0 entries");
3030		error = EINVAL;
3031		goto out;
3032	}
3033
3034	cm->cm_pextents = BC_MALLOC(pm->pm_nentries * sizeof(*cm->cm_pextents), M_TEMP, M_WAITOK | M_ZERO);
3035	if (cm->cm_pextents == NULL) {
3036		message("can't allocate mount's extent array (%d entries)", pm->pm_nentries);
3037		error = ENOMEM;
3038		goto out;
3039	}
3040
3041	cm->cm_state = CM_SETUP;
3042
3043out:
3044	if (error) {
3045		cm->cm_state = CM_ABORTED;
3046		_FREE_ZERO(cm->cm_pextents, M_TEMP);
3047		lck_rw_destroy(&cm->cm_lock, BC_cache->c_lckgrp);
3048	}
3049	return error;
3050}
3051
3052/*
3053 * Fill in the rest of the cache mount given the matching mount structure.
3054 *
3055 * Called with the cache mount write lock held
3056 *
3057 * Returns 0 if the mount was successfully filled in and cm_state will be CM_READY
 * Returns non-0 on failure or if the mount was already filled in
3059 */
3060static int
3061BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context)
3062{
3063	uint64_t blkcount, max_byte_count, max_segment_count, max_segment_byte_count, max_block_count;
3064	uint32_t blksize, is_ssd;
3065	int error, mount_idx, i;
3066	u_int64_t disk_id;
3067	struct BC_cache_disk *cd;
3068	vnode_t devvp = NULLVP;
3069
3070	error = 0;
3071
3072	if (CM_SETUP != cm->cm_state) {
3073		return EINVAL;
3074	}
3075
3076	cm->cm_throttle_mask = vfs_throttle_mask(mount);
3077	disk_id = cm->cm_throttle_mask; /* using the throttle mask as a way to identify the physical disk */
3078	/* debug("Got throttle mask %llx for mount %s", cm->cm_throttle_mask, uuid_string(cm->cm_uuid)); */
3079
3080	devvp = vfs_devvp(mount);
3081	if (devvp == NULLVP) {
3082		message("mount %s does not have a vnode", uuid_string(cm->cm_uuid));
3083		error = EINVAL;
3084		goto out;
3085	}
3086
3087#ifdef ROOT_DISK_ONLY
3088	if (devvp == rootvp) {
3089		BC_cache->c_root_disk_id = disk_id;
3090		if (BC_cache->c_root_disk_id == 0) {
3091			message("Root disk is 0");
3092		} else {
3093			debug("Root disk (via cache) is 0x%llx", BC_cache->c_root_disk_id);
3094		}
3095	} else if (0 == BC_cache->c_root_disk_id) {
3096		error = EAGAIN; /* we haven't seen the root mount yet, try again later */
3097		goto out;
3098
3099	//rdar://11653286 disk image volumes (FileVault 1) are messing with this check, so we're going back to != rather than !( & )
3100	} else if (BC_cache->c_root_disk_id != disk_id) {
3101		debug("mount %s (disk 0x%llx) is not on the root disk (disk 0x%llx)", uuid_string(cm->cm_uuid), disk_id, BC_cache->c_root_disk_id);
3102		error = ENODEV;
3103		goto out;
3104	}
3105#endif
3106
3107	/* See if we already have a cache_disk for this disk */
3108	for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
3109		if (BC_cache->c_mounts + mount_idx == cm) continue;
3110
3111		cd = BC_cache->c_mounts[mount_idx].cm_disk;
3112
3113		/*
		 * We're not handling mounts backed by multiple disks as gracefully as we should.
3115		 * Previously, this was cd->cd_disk_id == disk_id, so we had a thread per disk combination
3116		 * meaning reader threads may span multiple disks and disks may have multiple reader threads.
3117		 * We've only ever supported the root disk, however, so this wasn't a problem, it just missed
3118		 * cases where you'd have other volumes on one of the disks you're booting from.
3119		 *
3120		 * Now, since we are checking for cd->cd_disk_id & disk_id, we at least include all mounts that
3121		 * are on disks that the root mount uses. We still only have one reader thread, but we don't support
3122		 * playback on composite disks, so that's not a problem yet. See rdar://10081513
3123		 *
3124		 * This assumes that the root mount (or, at least the mount we care most about) will always appear first
3125		 *
3126		 */
3127		if (cd && (cd->cd_disk_id & disk_id)) {
3128			cm->cm_disk = cd;
3129			break;
3130		}
3131	}
3132
3133	/* If we don't already have a cache_disk for this disk, allocate one */
3134	if (cm->cm_disk == NULL) {
3135		cd = BC_MALLOC(sizeof(*cd), M_TEMP, M_WAITOK | M_ZERO);
3136		if (cd == NULL) {
3137			message("can't allocate memory for cache disk");
3138			error = ENOMEM;
3139			goto out;
3140		}
3141
3142		if (VNOP_IOCTL(devvp,       /* vnode */
3143					   DKIOCISSOLIDSTATE,    /* cmd */
3144					   (caddr_t)&is_ssd, /* data */
3145					   0,
3146					   context))           /* context */
3147		{
3148			message("can't determine if disk is a solid state disk for mount %s", uuid_string(cm->cm_uuid));
3149			is_ssd = 0;
3150		}
3151
3152		if ((error = BC_setup_disk(cd, disk_id, is_ssd)) != 0) {
3153			_FREE_ZERO(cd, M_TEMP);
3154			message("cache disk setup failed: %d", error);
3155			goto out;
3156		}
3157		cm->cm_disk = cd;
3158	}
3159
3160
3161	if (VNOP_IOCTL(devvp,
3162				   DKIOCGETBLOCKCOUNT,
3163				   (caddr_t)&blkcount,
3164				   0,
3165				   context)
3166		|| blkcount == 0)
3167	{
3168		message("can't determine device size, not checking");
3169		blkcount = 0;
3170	}
3171
3172	if (VNOP_IOCTL(devvp,
3173				   DKIOCGETBLOCKSIZE,
3174				   (caddr_t)&blksize,
3175				   0,
3176				   context)
3177		|| blksize == 0)
3178	{
3179		message("can't determine device block size for mount %s, defaulting to 512 bytes", uuid_string(cm->cm_uuid));
3180		blksize = 512;
3181	}
3182
3183	if (PAGE_SIZE % blksize != 0) {
3184		message("PAGE_SIZE (%d) is not a multiple of block size (%d) for mount %s", PAGE_SIZE, blksize, uuid_string(cm->cm_uuid));
3185		error = EINVAL;
3186		goto out;
3187	}
3188
3189	cm->cm_blocksize = blksize;
3190	cm->cm_devsize = blksize * blkcount;
3191
3192	/* max read size isn't larger than the max UPL size */
3193	cm->cm_maxread = ubc_upl_maxbufsize();
3194
3195	/* maxread = min ( maxread, MAXBYTECOUNTREAD ) */
3196	if (0 == VNOP_IOCTL(devvp,
3197						DKIOCGETMAXBYTECOUNTREAD,
3198						(caddr_t)&max_byte_count,
3199						0,
3200						context)) {
3201		if (cm->cm_maxread > max_byte_count && max_byte_count > 0) {
3202			cm->cm_maxread = max_byte_count;
3203			debug("MAXBYTECOUNTREAD is %#llx", max_byte_count);
3204		}
3205	}
3206
3207	/* maxread = min ( maxread, MAXBLOCKCOUNTREAD *  BLOCKSIZE ) */
3208	if (0 == VNOP_IOCTL(devvp,
3209						DKIOCGETMAXBLOCKCOUNTREAD,
3210						(caddr_t)&max_block_count,
3211						0,
3212						context)) {
3213		if (cm->cm_maxread > max_block_count * cm->cm_blocksize && max_block_count > 0) {
3214			cm->cm_maxread = max_block_count * cm->cm_blocksize;
			debug("MAXBLOCKCOUNTREAD is %#llx, BLOCKSIZE is %#llx, (multiplied %#llx)", max_block_count, cm->cm_blocksize, max_block_count * cm->cm_blocksize);
3216		}
3217	}
3218
3219	/* maxread = min ( maxread, MAXSEGMENTCOUNTREAD * min (MAXSEGMENTBYTECOUNTREAD, PAGE_SIZE ) ) */
3220	if (0 == VNOP_IOCTL(devvp,
3221						DKIOCGETMAXSEGMENTCOUNTREAD,
3222						(caddr_t)&max_segment_count,
3223						0,
3224						context)) {
3225
3226		if (max_segment_count > 0) {
3227
3228			if (0 == VNOP_IOCTL(devvp,
3229								DKIOCGETMAXSEGMENTBYTECOUNTREAD,
3230								(caddr_t)&max_segment_byte_count,
3231								0,
3232								context)) {
3233				//rdar://13835534 Limit max_segment_byte_count to PAGE_SIZE because some drives don't handle the spec correctly
3234				if (max_segment_byte_count > PAGE_SIZE || max_segment_byte_count == 0) {
3235					debug("MAXSEGMENTBYTECOUNTREAD is %#llx, limiting to PAGE_SIZE %#x", max_segment_byte_count, PAGE_SIZE);
3236					max_segment_byte_count = PAGE_SIZE;
3237				}
3238			} else {
3239				debug("Unable to get MAXSEGMENTBYTECOUNTREAD, assuming PAGE_SIZE %#x", PAGE_SIZE);
3240				max_segment_byte_count = PAGE_SIZE;
3241			}
3242
3243			if (cm->cm_maxread > max_segment_count * max_segment_byte_count) {
3244				cm->cm_maxread = max_segment_count * max_segment_byte_count;
3245				debug("MAXSEGMENTCOUNTREAD is %#llx, MAXSEGMENTBYTECOUNTREAD is %#llx, (multiplied %#llx)", max_segment_count, max_segment_byte_count, max_segment_count * max_segment_byte_count);
3246			}
3247		}
3248	}
3249
3250	/* maxread = min ( maxread, MAX_UPL_TRANSFER * PAGE_SIZE ) */
3251	if (cm->cm_maxread > MAX_UPL_TRANSFER * PAGE_SIZE) {
3252		cm->cm_maxread = MAX_UPL_TRANSFER * PAGE_SIZE;
3253		debug("MAX_UPL_TRANSFER is %#x, PAGE_SIZE is %#x, (multiplied %#x)", MAX_UPL_TRANSFER, PAGE_SIZE, MAX_UPL_TRANSFER * PAGE_SIZE);
3254	}
3255
3256	/* maxread = max ( maxread, MAXPHYS ) */
3257	if (cm->cm_maxread < MAXPHYS) {
3258		debug("can't determine device read size for mount %s; using default", uuid_string(cm->cm_uuid));
3259		cm->cm_maxread = MAXPHYS;
3260	}
3261
3262	/* make sure maxread is a multiple of the block size */
3263	if (cm->cm_maxread % cm->cm_blocksize != 0) {
		debug("Mount max IO size (%llu) not a multiple of block size (%llu)", cm->cm_maxread, cm->cm_blocksize);
		cm->cm_maxread -= cm->cm_maxread % cm->cm_blocksize;
	}
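
	/*
	 * Net effect of the clamps above (summary only, no additional logic):
	 *   cm_maxread = min(ubc_upl_maxbufsize(),
	 *                    MAXBYTECOUNTREAD,
	 *                    MAXBLOCKCOUNTREAD * blocksize,
	 *                    MAXSEGMENTCOUNTREAD * min(MAXSEGMENTBYTECOUNTREAD, PAGE_SIZE),
	 *                    MAX_UPL_TRANSFER * PAGE_SIZE)
	 * where any zero/unreported limit is ignored, then raised to at least
	 * MAXPHYS and finally rounded down to a multiple of cm_blocksize.
	 */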
3267
3268	cm->cm_bp = buf_alloc(devvp);
3269	if (cm->cm_bp == NULL) {
3270		message("can't allocate buf");
3271		error = ENOMEM;
3272		goto out;
3273	}
3274
3275	cm->cm_dev = vnode_specrdev(devvp);
3276	if (cm->cm_dev == nulldev()) {
3277		message("mount %s does not have a device", uuid_string(cm->cm_uuid));
3278		error = EINVAL;
3279		goto out;
3280	}
3281
3282	for (i = 0; i < cm->cm_nextents; i++) {
3283		LOCK_EXTENT(cm->cm_pextents[i]);
3284		if (0 != BC_fill_in_extent(cm->cm_pextents[i])) {
3285			BC_teardown_extent(cm->cm_pextents[i]);
3286			UNLOCK_EXTENT(cm->cm_pextents[i]);
3287			wakeup(cm->cm_pextents[i]);
3288		} else {
3289			UNLOCK_EXTENT(cm->cm_pextents[i]);
3290		}
3291	}
3292
3293	cm->cm_state = CM_READY;
3294
3295	debug("Found new cache mount %s disk %d, %d extents", uuid_string(cm->cm_uuid), cm->cm_disk->cd_disk_num, cm->cm_nextents);
3296
3297out:
3298	if (error && error != EAGAIN) {
3299		/*
3300		 * Mark all extents as aborted. This will notify any sleepers in the
3301		 * strategy routine that the extent won't have data for them.
3302		 *
		 * Note that it's fine to take the extent lock here since no one will
		 * hold an extent lock and try to take the mount lock while the mount isn't CM_READY
3305		 */
3306		for (i = 0; i < cm->cm_nextents; i++) {
3307			LOCK_EXTENT(cm->cm_pextents[i]);
3308			BC_teardown_extent(cm->cm_pextents[i]);
3309			UNLOCK_EXTENT(cm->cm_pextents[i]);
3310			wakeup(cm->cm_pextents[i]);
3311		}
3312
3313		BC_teardown_mount(cm);
3314	}
3315	if (devvp != NULLVP) {
3316		vnode_put(devvp);
3317	}
3318	return error;
3319}
3320
3321/*
3322 * Called with the cache mount write lock held
3323 */
3324static void
3325BC_teardown_mount(struct BC_cache_mount *cm)
3326{
3327	if (cm->cm_bp) {
3328		buf_free(cm->cm_bp);
3329		cm->cm_bp = NULL;
3330	}
3331	cm->cm_nextents = 0;
3332	_FREE_ZERO(cm->cm_pextents, M_TEMP);
3333	cm->cm_state = CM_ABORTED;
3334}
3335
3336/*
3337 * Check to see which mounts have been mounted and finish setting them up.
3338 *
3339 * No extent/mount locks required since we have a write lock on the cache
3340 *
3341 * Called with the cache write lock held
3342 */
3343static int fill_in_bc_cache_mounts(mount_t mount, void* arg)
3344{
3345	int mount_idx, error, i;
3346	struct vfs_attr attr;
3347	vfs_context_t context;
3348	int* go_again = (int*) arg;
3349
3350	VFSATTR_INIT(&attr);
3351	VFSATTR_WANTED(&attr, f_uuid);
3352	context = vfs_context_create(NULL);
3353	error = vfs_getattr(mount, &attr, context);
3354	if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) {
3355#ifdef BC_DEBUG
3356		char name[MFSNAMELEN];
3357		vfs_name(mount, name);
3358		if (strncmp("devfs", name, sizeof("devfs")) != 0) {
3359			debug("Found mount %s for IO without a UUID: error %d", name, error);
3360		}
3361#endif
3362		vfs_context_rele(context);
3363		return VFS_RETURNED;
3364	} else {
3365		// debug("Found mount for IO with UUID %s", uuid_string(attr.f_uuid));
3366
3367		for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
3368			struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx;
3369			int match = 0;
3370			if (CM_SETUP == cm->cm_state) {
3371
3372				if (0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) {
3373					match = 1;
3374				}
3375
3376				/* a null uuid indicates we want to match the root volume, no matter what the uuid (8350414) */
3377				if ((!match) && uuid_is_null(cm->cm_uuid)) {
3378					vnode_t devvp = vfs_devvp(mount);
3379					if (vnode_specrdev(devvp) == rootdev) {
3380						uuid_copy(cm->cm_uuid, attr.f_uuid);
3381						match = 1;
3382						debug("Found root mount %s", uuid_string(cm->cm_uuid));
3383					}
3384					vnode_put(devvp);
3385				}
3386
3387				if (match) {
3388					/* Found a matching mount */
3389
3390					/* Locking here isn't necessary since we're only called while holding the cache write lock
3391					 * and no one holds the mount locks without also holding the cache lock
3392					 */
3393					if (BC_fill_in_mount(cm, mount, context) == EAGAIN) {
3394						*go_again = 1;
3395					}
3396					vfs_context_rele(context);
3397
3398					/* Check to see if we have any more mounts to fill in */
3399					for (i = 0; i < BC_cache->c_nmounts; i++) {
3400						if (CM_SETUP == BC_cache->c_mounts[i].cm_state) {
3401							/* have more to fill in, keep iterating */
3402							return VFS_RETURNED;
3403						}
3404					}
3405
3406					return VFS_RETURNED_DONE;
3407				}
3408			}
3409		}
3410	}
3411
3412	vfs_context_rele(context);
3413
3414	/* Check to see if we have any more mounts to fill in */
3415	for (i = 0; i < BC_cache->c_nmounts; i++) {
3416		if (CM_SETUP == BC_cache->c_mounts[i].cm_state) {
3417			/* have more to fill in, keep iterating */
3418			return VFS_RETURNED;
3419		}
3420	}
3421
3422	debug("No new mounts filled in by fill_in_bc_cache_mounts");
3423	return VFS_RETURNED_DONE;
3424
3425}
3426
3427#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
3428/*
3429 * Sort the extent list.
3430 */
3431static int
3432compare_cache_extents(const void *vfirst, const void *vsecond)
3433{
3434	const struct BC_cache_extent *first, *second;
3435
3436	first  = (const struct BC_cache_extent *)vfirst;
3437	second = (const struct BC_cache_extent *)vsecond;
3438
3439	// Sort by volume first, then by logical block address
3440	int mount_comparison = first->ce_mount_idx - second->ce_mount_idx;
3441	if (mount_comparison != 0)
3442		return((mount_comparison < 0) ? -1 : 1);
3443
3444	if (first->ce_diskoffset == second->ce_diskoffset)
3445		return(0);
3446	return((first->ce_diskoffset < second->ce_diskoffset) ? -1 : 1);
3447}
3448#endif
3449
3450/*
3451 * Fetch the playlist from userspace or the Mach-O segment where it
3452 * was preloaded. Add it to the cache and readahead in batches after
3453 * the current cache, if any.
3454 *
3455 * Returns 0 on success. In failure cases, the cache is not modified.
3456 *
3457 * Called with the cache write lock held.
3458 */
3459static int
3460BC_copyin_playlist(size_t mounts_size, user_addr_t mounts_buf, size_t entries_size, user_addr_t entries_buf)
3461{
3462	int nplaylist_mounts, nplaylist_entries;
3463	struct BC_playlist_mount *playlist_mounts = NULL, *pm;
3464	struct BC_playlist_entry *playlist_entries = NULL, *pe;
3465	struct BC_cache_mount  *cm, *old_cm;
3466	struct BC_cache_extent *ce;
3467
3468	int ncache_mounts = 0, ncache_extents = 0, nnew_mounts = 0;
3469	struct BC_cache_mount  *cache_mounts = NULL;
3470	struct BC_cache_extent *cache_extents = NULL;
3471	int *pmidx_to_cmidx = NULL;
3472	int *next_old_extent_idx = NULL;
3473
3474	int *cache_nextents = NULL;
3475	struct BC_cache_extent **cache_extents_list = NULL;
3476
3477	int old_batch_count;
3478	u_int64_t old_cache_size;
3479
3480	int error, pm_idx, cm_idx, pe_idx, ce_idx, cel_idx, pm_idx2, max_entries;
3481	off_t p;
3482	u_int64_t size;
3483	int actual, remaining_entries;
3484
3485	int clipped_extents = 0, enveloped_extents = 0;
3486
3487	playlist_mounts = NULL;
3488	playlist_entries = NULL;
3489
3490#ifdef BC_DEBUG
3491	struct timeval start_time;
3492	microtime(&start_time);
3493#endif
3494
3495	old_batch_count = BC_cache->c_batch_count;
3496	old_cache_size = BC_cache->c_cachesize;
3497
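	/* derive element counts from the buffer sizes passed in from userspace */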
3498	nplaylist_mounts = (int) mounts_size / sizeof(*playlist_mounts);
3499	nplaylist_entries = (int) entries_size / sizeof(*playlist_entries);
3500
3501	if (BC_cache->c_stats.ss_total_extents > 0) {
3502		debug("adding %u extents to existing cache of %u extents", nplaylist_entries, BC_cache->c_stats.ss_total_extents);
3503	} else {
3504		debug("setting up first cache of %d extents", nplaylist_entries);
3505	}
3506
3507	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
3508		debug("Mount %s has %d extents", uuid_string(BC_cache->c_mounts[cm_idx].cm_uuid), BC_cache->c_mounts[cm_idx].cm_nextents);
3509	}
3510
3511	/*
3512	 * Mount array
3513	 */
3514	if (BC_preloaded_playlist) {
3515		playlist_mounts = CAST_DOWN(struct BC_playlist_mount *, mounts_buf);
3516	} else {
3517		playlist_mounts = BC_MALLOC(mounts_size, M_TEMP, M_WAITOK);
3518		if (playlist_mounts == NULL) {
3519			message("can't allocate unpack mount buffer of %ld bytes", mounts_size);
3520			error = ENOMEM;
3521			goto out;
3522		}
3523		if ((error = copyin(mounts_buf, playlist_mounts, mounts_size)) != 0) {
3524			message("copyin %ld bytes from 0x%llx to %p failed", mounts_size, mounts_buf, playlist_mounts);
3525			goto out;
3526		}
3527
3528		/* when BC_preloaded_playlist is NULL, playlist_mounts was allocated above and must be freed in the out path */
3529	}
3530
3531	/* map from the playlist mount index to the corresponding cache mount index */
3532	pmidx_to_cmidx = BC_MALLOC(nplaylist_mounts * sizeof(*pmidx_to_cmidx), M_TEMP, M_WAITOK);
3533	if (pmidx_to_cmidx == NULL) {
3534		message("can't allocate playlist index to cache index reference array for %d mounts", nplaylist_mounts);
3535		error = ENOMEM;
3536		goto out;
3537	}
3538
3539	/* In order to merge the mount's current (sorted) extent list with the new (sorted)
3540	 * extent list from the new playlist we first grow the extent list allocation to fit
3541	 * both arrays, then merge the two arrays into the new allocation.
3542	 */
3543
3544	if (BC_cache->c_nmounts > 0) {
3545		/* next_old_extent_idx saves the index of the next unmerged extent in the original mount's extent list */
3546		next_old_extent_idx = BC_MALLOC(BC_cache->c_nmounts * sizeof(*next_old_extent_idx), M_TEMP, M_WAITOK | M_ZERO);
3547		if (next_old_extent_idx == NULL) {
3548			message("can't allocate index array for %d mounts", BC_cache->c_nmounts);
3549			error = ENOMEM;
3550			goto out;
3551		}
3552
3553		/* zero-filled since all the indices start at 0 */
3554	}
3555
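	/* Start by assuming every playlist mount is new; decrement below for each one we already track */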
3556	nnew_mounts = nplaylist_mounts;
3557
3558	/* determine how many mounts are new and how many we already have */
3559	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
3560		pmidx_to_cmidx[pm_idx] = -1; /* invalid by default */
3561
3562		for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
3563			if (0 == uuid_compare(BC_cache->c_mounts[cm_idx].cm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
3564				/* already have a record of this mount, won't need a new spot for it */
3565				pmidx_to_cmidx[pm_idx] = cm_idx;
3566				nnew_mounts--;
3567				break;
3568			}
3569		}
3570
3571		/* check to make sure there aren't any duplicate mounts within the playlist */
3572		for (pm_idx2 = pm_idx + 1; pm_idx2 < nplaylist_mounts; pm_idx2++) {
3573			if (0 == uuid_compare(playlist_mounts[pm_idx2].pm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
3574				message("Playlist had duplicate mount %s", uuid_string(playlist_mounts[pm_idx2].pm_uuid));
3575				error = EINVAL;
3576				goto out;
3577			}
3578		}
3579	}
3580
3581	/* cache_mounts is the array of mounts that will replace the one currently in the cache */
3582	cache_mounts = BC_MALLOC((BC_cache->c_nmounts + nnew_mounts) * sizeof(*cache_mounts), M_TEMP, M_WAITOK | M_ZERO);
3583	if (cache_mounts == NULL) {
3584		message("can't allocate memory for %d mounts", (BC_cache->c_nmounts + nnew_mounts));
3585		error = ENOMEM;
3586		goto out;
3587	}
3588	memcpy(cache_mounts, BC_cache->c_mounts, (BC_cache->c_nmounts * sizeof(*BC_cache->c_mounts)));
3589
3590	/* ncache_mounts is the current number of valid mounts in the mount array */
3591	ncache_mounts = BC_cache->c_nmounts;
3592
3593	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
3594		if (pmidx_to_cmidx[pm_idx] != -1) {
3595			/* We already have a record for this mount */
3596
3597			if (playlist_mounts[pm_idx].pm_nentries == 0) {
3598				message("Playlist included mount with 0 entries");
3599				error = EINVAL;
3600				goto out;
3601			}
3602
3603			cm_idx = pmidx_to_cmidx[pm_idx];
3604
3605			assert(cm_idx < BC_cache->c_nmounts);
3606
3607			/* grow the mount's extent array to fit the new extents (we'll fill in the new extents below) */
3608			cache_mounts[cm_idx].cm_pextents = BC_MALLOC((BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries) * sizeof(*cache_mounts[cm_idx].cm_pextents), M_TEMP, M_WAITOK | M_ZERO);
3609			if (cache_mounts[cm_idx].cm_pextents == NULL) {
3610				message("can't allocate mount's extent array (%d entries)", (BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries));
3611				error = ENOMEM;
3612				goto out;
3613			}
3614
3615			/* don't free the old extent list yet, the real BC_cache's mount still points to it and we may yet fail */
3616
3617			/* cm_nextents is the number of valid extents in this mount's extent list. It starts out as 0 and we fill it in below */
3618			cache_mounts[cm_idx].cm_nextents = 0;
3619		} else {
3620			/* New mount for the cache */
3621
3622			if ((error = BC_setup_mount(cache_mounts + ncache_mounts, playlist_mounts + pm_idx)) != 0) {
3623				goto out;
3624			}
3625
3626			pmidx_to_cmidx[pm_idx] = ncache_mounts;
3627			ncache_mounts++;
3628		}
3629	}
3630
3631	/*
3632	 * Extent array
3633	 *
3634	 * The existing extent array cannot move since a strategy routine may be
3635	 * in the middle of an msleep with one of the extent locks. So, we allocate
3636	 * a new array for the new extents.
3637	 *
3638	 * Add one extent list to our arrays of extent lists
3639	 */
3640
3641	/* cache_nextents is an array of ints, each indicating the number of extents in the list in cache_extents_list at the same index */
3642	cache_nextents = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_nextents), M_TEMP, M_WAITOK | M_ZERO);
3643	if (cache_nextents == NULL) {
3644		message("can't allocate memory for %d extent list sizes", BC_cache->c_nextentlists + 1);
3645		error = ENOMEM;
3646		goto out;
3647	}
3648	memcpy(cache_nextents, BC_cache->c_nextents, BC_cache->c_nextentlists * sizeof(*BC_cache->c_nextents));
3649
3650	/* cache_extents_list is the array of extent lists. The extent list at the last index is for the new playlist's cache */
3651	cache_extents_list  = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_extentlists),  M_TEMP, M_WAITOK | M_ZERO);
3652	if (cache_extents_list == NULL) {
3653		message("can't allocate memory for %d extent lists", BC_cache->c_nextentlists + 1);
3654		error = ENOMEM;
3655		goto out;
3656	}
3657	memcpy(cache_extents_list,  BC_cache->c_extentlists,  BC_cache->c_nextentlists * sizeof(*BC_cache->c_extentlists));
3658
3659	/* The extent list for this new playlist's cache */
3660	cache_extents = BC_MALLOC(nplaylist_entries * sizeof(*cache_extents), M_TEMP, M_WAITOK | M_ZERO);
3661	if (cache_extents == NULL) {
3662		message("can't allocate memory for %d extents", nplaylist_entries);
3663		error = ENOMEM;
3664		goto out;
3665	}
3666
3667	/* TODO: iterate over our history and remove any blocks we've already seen from the new cache */
3668
3669	/* Fill in the new extents.
3670	 *
3671	 * We just tack the new playlist onto the end of any cache we currently have as a new batch.
3672	 *
3673	 * At this point, we assume that any additional caches should be in additional batches just as
3674	 * if there was a single recording with a tag separating the new extents. If we did decide to
3675	 * merge, it would still be hard since that would require reordering the extent list and
3676	 * BC_strategy assumes that the cache extents never move (see the msleep in wait_for_extent).
3677	 *
3678	 * We also don't coalesce the new extents into the old (and lower batch) extents.
3679	 * This would be a bit of work, we assume we want a new batch anyway, and it's rare that
3680	 * a new cache comes in while an old cache is still in readahead. So, we don't bother.
3681	 *
3682	 * We do clip any overlap with the new cache from the old cache, however.
3683	 *
3684	 * We don't clip any overlap with the history from the new extents.
3685	 * The history list isn't ordered, so we don't have a fast way to compare the ranges. It's also
3686	 * expected that the overlap is minimal. If we stopped recording history at some point, we don't
3687	 * have this info anyway. So, we don't bother.
3688	 */
3689
3690	/* size is the size in bytes of the cache, used to allocate our page buffer */
3691	size = 0;
3692
3693	if (BC_preloaded_playlist) {
3694		/*
3695		 * Unpack the static control entry array into the extent array.
3696		 */
3697		playlist_entries = CAST_DOWN(struct BC_playlist_entry *, entries_buf);
3698	} else {
3699		/*
3700		 * Since the playlist control entry structure is not the same as the
3701		 * extent structure we use, we need to copy the control entries in
3702		 * and unpack them.
3703		 */
3704		playlist_entries = BC_MALLOC(BC_PLC_CHUNK * sizeof(*playlist_entries),
3705								   M_TEMP, M_WAITOK);
3706		if (playlist_entries == NULL) {
3707			message("can't allocate unpack buffer for %d entries", BC_PLC_CHUNK);
3708			error = ENOMEM;
3709			goto out;
3710		}
3711	}
3712
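	/* Process the playlist entries in chunks of BC_PLC_CHUNK when copying in from
	 * userspace, or in a single pass when they come from the preloaded playlist.
	 */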
3713	remaining_entries = nplaylist_entries;
3714	while (remaining_entries > 0) {
3715
3716		if (BC_preloaded_playlist) {
3717			actual = remaining_entries;
3718		} else {
3719			actual = min(remaining_entries, BC_PLC_CHUNK);
3720			if ((error = copyin(entries_buf, playlist_entries,
3721								actual * sizeof(struct BC_playlist_entry))) != 0) {
3722				message("copyin from 0x%llx to %p failed", entries_buf, playlist_entries);
3723				goto out;
3724			}
3725		}
3726
3727		/* unpack into our array */
3728		for (pe_idx = 0; pe_idx < actual; pe_idx++) {
3729			pe = playlist_entries + pe_idx;
3730			if (pe->pe_length == 0) {
3731				debug("Bad Playlist: entry %d has 0 length", (nplaylist_entries - remaining_entries) + pe_idx);
3732				error = EINVAL;
3733				goto out;
3734			}
3735			pm_idx = pe->pe_mount_idx;
3736
3737			if (pm_idx >= nplaylist_mounts || pm_idx < 0) {
3738				message("Bad playlist: entry %d referenced non-existent mount index %d", (nplaylist_entries - remaining_entries) + pe_idx, pm_idx);
3739				error = EINVAL;
3740				goto out;
3741			}
3742
3743			pm = playlist_mounts + pm_idx;
3744			cm_idx = pmidx_to_cmidx[pm_idx];
3745			cm = cache_mounts + cm_idx;
3746			ce = cache_extents + ncache_extents;
3747
3748			/* The size of the extent list is the number of playlist entries + the number of old cache extents */
3749			max_entries = pm->pm_nentries + ((cm_idx < BC_cache->c_nmounts) ? BC_cache->c_mounts[cm_idx].cm_nextents : 0);
3750
3751			if (cm->cm_nextents >= max_entries) {
3752				message("Bad playlist: more entries existed than the mount %s claimed (%d)", uuid_string(pm->pm_uuid), pm->pm_nentries);
3753				error = EINVAL;
3754				goto out;
3755			}
3756
3757			if ((error = BC_setup_extent(ce, pe, old_batch_count, cm_idx)) != 0) {
3758				goto out;
3759			}
3760
3761			/* Merge the new extent with the old extents for this mount. The new extent may get clipped. */
3762#ifdef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
3763			if (cm_idx < BC_cache->c_nmounts) {
3764				old_cm = BC_cache->c_mounts + cm_idx;
3765				/* Copy any lower or equal extents from the existing playlist down to the low end of the array */
3766				while (next_old_extent_idx[cm_idx] < old_cm->cm_nextents &&
3767					   old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset <= ce->ce_diskoffset) {
3768					cm->cm_pextents[cm->cm_nextents] = old_cm->cm_pextents[next_old_extent_idx[cm_idx]];
3769					cm->cm_nextents++;
3770					next_old_extent_idx[cm_idx]++;
3771				}
3772
3773				/* check for overlap with the next extent in the old list and clip the new extent */
3774				if (next_old_extent_idx[cm_idx] < old_cm->cm_nextents) {
3775
3776					/* FIXME: rdar://9153031 If the new extent extends past the end of the old extent, we're losing caching! */
3777					if (ce->ce_diskoffset + ce->ce_length > (old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset + old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length)) {
3778						debug("!!! Entry %d (0x%llx:%lld) is getting clipped too much because of previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset, old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length);
3779					}
3780
3781					u_int64_t max_offset = old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset;
3782					if (max_offset < ce->ce_diskoffset + ce->ce_length) {
3783						if (max_offset <= ce->ce_diskoffset) {
3784							ce->ce_length = 0;
3785						} else {
3786							ce->ce_length = (max_offset - ce->ce_diskoffset);
3787						}
3788					}
3789				}
3790			}
3791
3792			/* check for intersection with the next lower extent in the list and clip the new extent */
3793			if (cm->cm_nextents > 0) {
3794				u_int64_t min_offset = cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset + cm->cm_pextents[cm->cm_nextents - 1]->ce_length;
3795				if (min_offset > ce->ce_diskoffset) {
3796
3797					/* Check if this extent is overlapping with an extent from the same playlist */
3798					if (cm->cm_pextents[cm->cm_nextents - 1] >= cache_extents &&
3799						cm->cm_pextents[cm->cm_nextents - 1] < cache_extents + ncache_extents) {
3800						message("Bad playlist: entry %d (0x%llx:%lld) overlapped with previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset, cm->cm_pextents[cm->cm_nextents - 1]->ce_length);
3801						error = EINVAL;
3802						goto out;
3803					}
3804
3805					if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
3806						ce->ce_length = 0;
3807					} else {
3808						ce->ce_length -= (min_offset - ce->ce_diskoffset);
3809						ce->ce_diskoffset = min_offset;
3810					}
3811				}
3812			}
3813
3814			if (ce->ce_length != pe->pe_length) {
3815				clipped_extents++;
3816			}
3817			if (ce->ce_length == 0) {
3818				/* new extent is entirely captured in extents from the old cache, throw it out */
3819				enveloped_extents++;
3820				BC_teardown_extent(ce);
3821				lck_mtx_destroy(&ce->ce_lock, BC_cache->c_lckgrp);
3822
3823				/* continue without incrementing ncache_extents so we reuse this extent index */
3824				continue;
3825			}
3826			size += roundup(ce->ce_length, PAGE_SIZE);
3827#endif
3828
3829			cm->cm_pextents[cm->cm_nextents] = ce;
3830			cm->cm_nextents++;
3831			ncache_extents++;
3832		}
3833		remaining_entries -= actual;
3834		entries_buf += actual * sizeof(struct BC_playlist_entry);
3835	}
3836
3837	/* Fill in any remaining extents from the original cache that are still sitting at the high end of the extent list */
3838	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
3839		cm_idx = pmidx_to_cmidx[pm_idx];
3840		cm = cache_mounts + cm_idx;
3841		pm = playlist_mounts + pm_idx;
3842
3843		if (cm->cm_nextents == 0) {
3844			message("Bad playlist: No entries existed for mount %s (claimed %d)", uuid_string(pm->pm_uuid), pm->pm_nentries);
3845			error = EINVAL;
3846			goto out;
3847		}
3848
3849		if (cm_idx < BC_cache->c_nmounts) {
3850			if (next_old_extent_idx[cm_idx] < BC_cache->c_mounts[cm_idx].cm_nextents) {
3851				/* There are more extents in the old extent array, copy them over */
3852				memcpy(cm->cm_pextents + cm->cm_nextents, BC_cache->c_mounts[cm_idx].cm_pextents + next_old_extent_idx[cm_idx], sizeof(*cm->cm_pextents) * (BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx]));
3853				cm->cm_nextents += BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx];
3854			}
3855		}
3856
3857	}
3858
3859#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
3860	/* We need an ordered list of extents for each mount, but we were given
3861	 * the extents in a different ordering, so we need to sort the mount's
3862	 * extent list here
3863	 */
3864	for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
3865		cm = cache_mounts + cm_idx;
3866
3867		/* If this is a new mount or a mount with a new extent list, it needs sorting */
3868		if (cm_idx >= BC_cache->c_nmounts ||
3869			cm->cm_pextents != BC_cache->c_mounts[cm_idx].cm_pextents) {
3870
3871			qsort(cm->cm_pextents,
3872				  cm->cm_nextents,
3873				  sizeof(*cm->cm_pextents),
3874				  compare_cache_extents);
3875
3876			/* Clip the new extents to remove overlap with the old extents */
3877			for (ce_idx = 0; ce_idx < cm->cm_nextents; ce_idx++) {
3878
3879				/* Only look at new extents, old extents were already clipped when they were added and can't be clipped further */
3880				if (cm->cm_pextents[ce_idx] >= cache_extents &&
3881					cm->cm_pextents[ce_idx] < cache_extents + ncache_extents) {
3882					ce = cm->cm_pextents[ce_idx];	/* operate on the extent at this index */
3883					u_int64_t old_length = ce->ce_length;
3884
3885					/* Compare against the previous (lower) extent */
3886					if (ce_idx > 0) {
3887						u_int64_t min_offset = cm->cm_pextents[ce_idx - 1]->ce_diskoffset + cm->cm_pextents[ce_idx - 1]->ce_length;
3888						if (min_offset > ce->ce_diskoffset) {
3889							if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
3890								ce->ce_length = 0;
3891							} else {
3892								ce->ce_length -= (min_offset - ce->ce_diskoffset);
3893								ce->ce_diskoffset = min_offset;
3894							}
3895						}
3896					}
3897
3898					/* Compare against the next (higher) extent */
3899					if (ce_idx < cm->cm_nextents - 1) {
3900						u_int64_t max_offset = cm->cm_pextents[ce_idx + 1]->ce_diskoffset;
3901						if (max_offset < ce->ce_diskoffset + ce->ce_length) {
3902
3903							/* Check if this extent is overlapping with an extent from the same playlist */
3904							if (cm->cm_pextents[ce_idx + 1] >= cache_extents &&
3905								cm->cm_pextents[ce_idx + 1] < cache_extents + ncache_extents) {
3906								message("Bad playlist: entry %d (0x%llx:%lld) overlapped with following entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[ce_idx + 1]->ce_diskoffset, cm->cm_pextents[ce_idx + 1]->ce_length);
3907								error = EINVAL;
3908								goto out;
3909							}
3910
3911							if (max_offset <= ce->ce_diskoffset) {
3912								ce->ce_length = 0;
3913							} else {
3914								ce->ce_length = (max_offset - ce->ce_diskoffset);
3915							}
3916						}
3917					}
3918
3919					if (ce->ce_length != old_length) {
3920						clipped_extents++;
3921						if (ce->ce_length == 0) {
3922							/* extent was enveloped, remove this extent from the mount's extent list */
3923							if ((ce_idx + 1) < cm->cm_nextents) {
3924								memmove(cm->cm_pextents + ce_idx, cm->cm_pextents + (ce_idx + 1), (cm->cm_nextents - (ce_idx + 1)) * sizeof(*cm->cm_pextents));
3925							}
3926							cm->cm_nextents--;
3927							ce_idx--;
3928							enveloped_extents++;
3929						}
3930					}
3931				}
3932			}
3933		}
3934	}
3935
3936	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
3937		size += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE);
3938	}
3939
3940#endif
3941
3942	if (clipped_extents > 0) {
3943		debug("%d extents were clipped, %d of which were completely enveloped", clipped_extents, enveloped_extents);
3944	}
3945
3946	if (ncache_extents == 0) {
3947		debug("No extents added, not updating cache");
3948		error = EALREADY;
3949		goto out;
3950	}
3951
3952	if (! BC_preloaded_playlist) {
3953		_FREE_ZERO(playlist_entries, M_TEMP);
3954		_FREE_ZERO(playlist_mounts, M_TEMP);
3955	}
3956
3957	if (ncache_extents < nplaylist_entries) {
3958		debug("%d extents added out of %d", ncache_extents, nplaylist_entries);
3959	}
3960	for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
3961		debug("Mount %s now has %d extents", uuid_string(cache_mounts[cm_idx].cm_uuid), cache_mounts[cm_idx].cm_nextents);
3962	}
3963
3964
3965	/*
3966	 * In order to simplify allocation of the block and page maps, we round
3967	 * the size up to a multiple of CB_MAPFIELDBITS pages.  This means we
3968	 * may waste as many as 31 pages in the worst case.
3969	 */
3970	size = roundup(size, PAGE_SIZE * CB_MAPFIELDBITS);
3971
3972	/*
3973	 * Verify that the cache is not larger than 50% of physical memory
3974	 * to avoid forcing pageout activity.
3975	 *
3976	 * max_mem and size are in bytes.  All calculations are done in page
3977	 * units to avoid overflow/sign issues.
3978	 *
3979	 * If the cache is too large, we cap it at BC_MAX_SIZE rather than
3980	 * discarding it outright.  This lets us survive the case where the
3981	 * playlist is corrupted or manually made too large, but the "real"
3982	 * bootstrap footprint would be OK.
3983	 */
3984	BC_cache->c_cachesize += size;
3985	if (BC_cache->c_cachesize > BC_MAX_SIZE) {
3986		message("cache size (%lu bytes) too large for physical memory (%llu bytes), capping at %llu bytes",
3987				(unsigned long)BC_cache->c_cachesize, max_mem, BC_MAX_SIZE);
3988		BC_cache->c_cachesize = BC_MAX_SIZE;
3989	}
3990
3991	/*
3992	 * Allocate/grow the pagebuffer.
3993	 */
3994	if (BC_alloc_pagebuffer() != 0) {
3995		message("can't allocate %lld bytes for cache buffer", BC_cache->c_cachesize);
3996		error = ENOMEM;
3997		goto out;
3998	}
3999
4000	/*
4001	 * Fix up the extent cache offsets.
4002	 */
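	/* p is the byte offset within the cache's page buffer at which the next extent's data will be stored */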
4003	p = 0;
4004	if (BC_cache->c_nextentlists > 0) {
4005		/* start at the offset of the end of the previous cache list */
4006		for (cel_idx = BC_cache->c_nextentlists - 1; cel_idx >= 0; cel_idx--) {
4007			if (BC_cache->c_nextents[cel_idx] > 0) {
4008				struct BC_cache_extent *last_ce = BC_cache->c_extentlists[cel_idx] + BC_cache->c_nextents[cel_idx] - 1;
4009				p = last_ce->ce_cacheoffset + roundup(last_ce->ce_length, PAGE_SIZE);
4010				break;
4011			}
4012		}
4013	}
4014	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
4015		cache_extents[ce_idx].ce_cacheoffset = p;
4016		p += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE);
4017
4018		/* Abort any extents that extend past our cache size */
4019		if (p >= BC_cache->c_cachesize || cache_extents[ce_idx].ce_length == 0) {
4020			BC_teardown_extent(cache_extents + ce_idx);
4021		}
4022	}
4023
4024	/* We've successfully integrated the new playlist with our cache, copy it into place */
4025
4026	/* free up old cache's allocations that we're replacing */
4027	if (BC_cache->c_nmounts > 0) {
4028		for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
4029			/* If we have a new extent array for this mount, free the old one */
4030			if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
4031				_FREE_ZERO(BC_cache->c_mounts[cm_idx].cm_pextents, M_TEMP);
4032			}
4033		}
4034		_FREE_ZERO(BC_cache->c_mounts, M_TEMP)
4035	}
4036	BC_cache->c_mounts = cache_mounts;
4037	BC_cache->c_nmounts = ncache_mounts;
4038
4039	if (BC_cache->c_nextentlists > 0) {
4040		_FREE_ZERO(BC_cache->c_extentlists, M_TEMP)
4041		_FREE_ZERO(BC_cache->c_nextents, M_TEMP)
4042	}
4043
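	/* Append the new extent list, and its length, as the next slot in the extent-list arrays */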
4044	cache_nextents[BC_cache->c_nextentlists] = ncache_extents;
4045	cache_extents_list[BC_cache->c_nextentlists] = cache_extents;
4046
4047	BC_cache->c_nextentlists++;
4048
4049	BC_cache->c_nextents = cache_nextents;
4050	BC_cache->c_extentlists = cache_extents_list;
4051
4052	BC_cache->c_stats.ss_total_mounts = ncache_mounts;
4053	BC_cache->c_stats.ss_total_extents += ncache_extents;
4054
4055	/* fill (or tear down) the extents for all the mounts we've already seen */
4056	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
4057		ce = cache_extents + ce_idx;
4058		if (CM_SETUP != BC_cache->c_mounts[ce->ce_mount_idx].cm_state) {
4059			LOCK_EXTENT(ce);
4060			if (CM_READY == BC_cache->c_mounts[ce->ce_mount_idx].cm_state) {
4061				if (0 == BC_fill_in_extent(ce)) {
4062					UNLOCK_EXTENT(ce);
4063					continue;
4064				}
4065			}
4066			/* either the mount is aborted or fill_in_extent failed */
4067			BC_teardown_extent(ce);
4068			UNLOCK_EXTENT(ce);
4069		}
4070	}
4071
4072	/* Check to see if we have any more mounts to fill in */
4073	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
4074		if (CM_SETUP == BC_cache->c_mounts[cm_idx].cm_state) {
4075			/* have at least one mount to fill in, iterate all of them */
4076			int go_again = 0;
4077			vfs_iterate(0, fill_in_bc_cache_mounts, &go_again);
4078			if (go_again) {
4079				debug("Missing root mount during last iteration, going again");
4080				vfs_iterate(0, fill_in_bc_cache_mounts, &go_again);
4081			}
4082			break;
4083		}
4084	}
4085
4086	error = 0;
4087
4088	/* all done */
4089out:
4090	if (! BC_preloaded_playlist) {
4091		_FREE_ZERO(playlist_entries, M_TEMP);
4092		_FREE_ZERO(playlist_mounts, M_TEMP);
4093	}
4094	_FREE_ZERO(pmidx_to_cmidx, M_TEMP);
4095	_FREE_ZERO(next_old_extent_idx, M_TEMP);
4096	if (error != 0) {
4097		if (error != EALREADY) {
4098			debug("cache setup failed, aborting");
4099		}
4100
4101		/* If it was possible to fail after growing the page buffer, we'd need a way to shrink it back here */
4102
4103		if (BC_cache->c_extentlists == NULL) {
4104			/* If this is our first cache, clear stats and our page buffer */
4105			BC_cache->c_stats.ss_total_extents = 0;
4106			BC_cache->c_stats.ss_total_mounts = 0;
4107			if (BC_cache->c_vp != NULL) {
4108				BC_free_pagebuffer();
4109			}
4110		}
4111
4112		BC_cache->c_cachesize = old_cache_size;
4113		BC_cache->c_batch_count = old_batch_count;
4114
4115		/* free up new cache's allocations that aren't part of the old cache */
4116		if (cache_extents) {
4117			for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
4118				BC_teardown_extent(cache_extents + ce_idx);
4119				lck_mtx_destroy(&cache_extents[ce_idx].ce_lock, BC_cache->c_lckgrp);
4120			}
4121			_FREE_ZERO(cache_extents, M_TEMP);
4122		}
4123		_FREE_ZERO(cache_nextents, M_TEMP);
4124		_FREE_ZERO(cache_extents_list, M_TEMP);
4125
4126		if (cache_mounts) {
4127			for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
4128				if (cm_idx >= BC_cache->c_nmounts) {
4129					/* newly allocated mounts, teardown completely */
4130					BC_teardown_mount(cache_mounts + cm_idx);
4131					lck_rw_destroy(&cache_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
4132				} else if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
4133					/* old mounts with a new extent list */
4134					_FREE_ZERO(cache_mounts[cm_idx].cm_pextents, M_TEMP);
4135				}
4136			}
4137			_FREE_ZERO(cache_mounts, M_TEMP);
4138		}
4139	}
4140
4141#ifdef BC_DEBUG
4142	struct timeval end_time;
4143	microtime(&end_time);
4144	timersub(&end_time, &start_time, &end_time);
4145	debug("BC_copyin_playlist took %ldus", end_time.tv_sec * 1000000 + end_time.tv_usec);
4146#endif
4147
4148	return(error);
4149}
4150
4151/*
4152 * Initialise the cache.  Fetch the playlist from userspace and
4153 * find and attach to our block device.
4154 */
4155static int
4156BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history)
4157{
4158	int error = 0, copyin_error, mount_idx;
4159	struct timeval current_time;
4160
4161	/* Everything should be set, or nothing should be set */
4162	if ((mounts_size == 0 || mounts == 0 || entries_size == 0 || entries == 0) &&
4163		(mounts_size != 0 || mounts != 0 || entries_size != 0 || entries != 0)) {
4164		debug("Invalid cache parameters: %ld, %lld, %ld, %lld", mounts_size, mounts, entries_size, entries);
4165		return(EINVAL);
4166	}
4167
4168
4169	if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
4170		debug("Cache not yet setup");
4171		return(ENXIO);
4172	}
4173
4174	microtime(&current_time);
4175
4176	/*
4177	 * Start recording history.
4178	 *
4179	 * Only one history recording may be active at a time
4180	 */
4181	if (record_history) {
4182		LOCK_HANDLERS();
4183		if (BC_set_flag(BC_FLAG_HISTORYACTIVE)) {
4184			debug("history recording already started, only one playlist will be returned");
4185			BC_ADD_STAT(history_num_recordings, 1);
4186			UNLOCK_HANDLERS();
4187		} else {
4188			debug("starting history recording");
4189
4190			/* start the timeout handler */
4191			timeout(BC_timeout_history, NULL, BC_history_timeout);
4192
4193			/* always take stats for the first two recordings (boot + login), even if we don't have a playlist */
4194			if (BC_cache->c_stats.ss_history_num_recordings < 2)
4195				BC_cache->c_take_detailed_stats = 1;
4196
4197			BC_cache->c_history_starttime = current_time;
4198			BC_cache->c_history_size = 0;
4199			BC_cache->c_history_num_clusters = 0;
4200			BC_ADD_STAT(history_num_recordings, 1);
4201			BC_clear_flag(BC_FLAG_HTRUNCATED);
4202
4203			/*
4204			 * Take over the strategy routine for our device; we are now
4205			 * recording read operations.
4206			 */
4207			BC_check_handlers();
4208			UNLOCK_HANDLERS();
4209
4210			BC_update_mounts();
4211
4212#ifndef NO_HISTORY
4213# ifdef STATIC_HISTORY
4214			/* initialise the history buffer */
4215			if ((BC_cache->c_history = BC_MALLOC(BC_HISTORY_ALLOC, M_TEMP, M_WAITOK)) == NULL) {
4216				message("can't allocate %d bytes for static history buffer", BC_HISTORY_ALLOC);
4217				BC_clear_flag(BC_FLAG_HISTORYACTIVE);
4218				error = ENOMEM;
4219				goto out;
4220			}
4221
4222			bzero(BC_cache->c_history, BC_HISTORY_ALLOC);
4223# endif
4224#endif
4225
4226		}
4227	}
4228
4229	/*
4230	 * If we have playlist data, fetch it.
4231	 */
4232	if (entries_size > 0) {
4233		LOCK_CACHE_W();
4234
4235		if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
4236			debug("cache already shutdown");
4237			error = ENXIO;
4238			UNLOCK_CACHE_W();
4239			goto out;
4240		}
4241
4242		/* we're playing back a cache (or at least trying to),
4243		 * record detailed statistics until we stop recording history
4244		 */
4245		BC_cache->c_take_detailed_stats = 1;
4246
4247		copyin_error = BC_copyin_playlist(mounts_size, mounts, entries_size, entries);
4248		if (copyin_error != 0 && copyin_error != EALREADY) {
4249			message("can't load playlist: %d", copyin_error);
4250			UNLOCK_CACHE_W();
4251			error = copyin_error;
4252			goto out;
4253		}
4254
4255		/* Always throttle cut-through IOs.
4256		 * I tried throttling only after login had started, but
4257		 * then users who log in quickly see a very long delay
4258		 * after entering their password before apps come up.
4259		 * I'd rather delay the time until the login screen appears
4260		 * and provide as fast a login as possible.
4261		 * rdar://8592401
4262		 */
4263		BC_cache->c_readahead_throttles_cutthroughs = 1;
4264
4265		LOCK_HANDLERS();
4266		if (BC_set_flag(BC_FLAG_CACHEACTIVE)) {
4267			/* cache is already running, we're adding to it */
4268			UNLOCK_HANDLERS();
4269		} else {
4270			/* first cache we've created this boot */
4271
4272#ifdef READ_HISTORY_BUFFER
4273			/* initialise the read history buffer */
4274			if (NULL == BC_cache->c_rhistory) {
4275				if ((BC_cache->c_rhistory = BC_MALLOC(READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry),
4276													M_TEMP, M_WAITOK)) == NULL) {
4277					message("can't allocate %d bytes for read history buffer",
4278							READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
4279				}
4280			}
4281			if (BC_cache->c_rhistory) {
4282				bzero(BC_cache->c_rhistory, READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
4283			}
4284#endif
4285
4286			BC_cache->c_cache_starttime = current_time;
4287
4288			/*
4289			 * Take over the strategy routine for our device; we are now
4290			 * filling read operations from the cache.
4291			 *
4292			 * This must be done before we kick off reader threads to ensure
4293			 * that we catch any writes that occur on blocks in our cache.
4294			 * This ensures that a write can't come in unnoticed on a block we've
4295			 * already read in, which would otherwise cause the cache to serve
4296			 * stale data when that block is subsequently requested.
4297			 */
4298			BC_check_handlers();
4299			UNLOCK_HANDLERS();
4300
4301			bootcache_contains_block = BC_cache_contains_block;
4302		}
4303
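		/* Downgrade to the cache read lock before notifying each ready mount, which kicks off its readahead */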
4304		LOCK_CACHE_W_TO_R();
4305		if (copyin_error != EALREADY) {
4306			/*
4307			 * Start the reader threads.
4308			 */
4309			for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
4310				if (BC_cache->c_mounts[mount_idx].cm_state == CM_READY) {
4311					LOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
4312					BC_mount_available(BC_cache->c_mounts + mount_idx);
4313					UNLOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
4314				}
4315			}
4316		}
4317
4318		UNLOCK_CACHE_R();
4319
4320		/* cache is now active */
4321
4322	}
4323
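	/* Record the time from kext load to this request as the preload time, in milliseconds (only set once) */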
4324	if (BC_cache->c_stats.ss_preload_time == 0) {
4325		timersub(&current_time,
4326				 &BC_cache->c_loadtime,
4327				 &current_time);
4328		BC_cache->c_stats.ss_preload_time = (u_int) current_time.tv_sec * 1000 + (u_int) current_time.tv_usec / 1000;
4329	}
4330
4331out:
4332	return(error);
4333}
4334
4335/*
4336 * Timeout on collection.
4337 *
4338 * We haven't heard from the userland component for too long;
4339 * stop recording history, but keep the current history
4340 * around in case they show up.
4341 */
4342static void
4343BC_timeout_history(void *junk)
4344{
4345	/* stop recording history */
4346	debug("history recording timed out");
4347	BC_terminate_history();
4348}
4349
4350/*
4351 * Check for new mounts. Called with no locks held
4352 */
4353static int check_for_new_mount_itr(mount_t mount, void* arg) {
4354	struct vfs_attr attr;
4355	vfs_context_t context;
4356	int error, mount_idx;
4357	int* go_again = (int*) arg;
4358
4359	struct BC_history_mount_device *hmd = NULL;
4360	vnode_t devvp = NULLVP;
4361
4362	LOCK_HISTORY_R();
4363	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
4364		/* don't do anything if we've stopped recording history */
4365		UNLOCK_HISTORY_R();
4366		return VFS_RETURNED_DONE;
4367	}
4368
4369#ifdef BC_DEBUG
4370	char name[MFSNAMELEN];
4371	vfs_name(mount, name);
4372#endif
4373
4374	devvp = vfs_devvp(mount);
4375	if (devvp == NULLVP) {
4376		//debug("Mount %s (%p)'s vnode is NULL", name, mount);
4377		UNLOCK_HISTORY_R();
4378		goto out;
4379	}
4380
4381	dev_t mount_dev = vnode_specrdev(devvp);
4382	//debug("Mount %s (%p)'s vnode %p's device is 0x%x", name, mount, devvp, mount_dev);
4383
4384	/*
4385	 * A new mount appeared, but unfortunately we don't know which one.
4386	 * So, for every mount we have to check through our list of history_mounts
4387	 * to see if we already have info for it.
4388	 */
4389	hmd = BC_get_history_mount_device(mount_dev, NULL);
4390	if (hmd == NULL || (! uuid_is_null(hmd->hmd_mount.hm_uuid))) {
4391		/* Unable to allocate new space for a mount, or
4392		 * we already have info about this mount
4393		 */
4394		UNLOCK_HISTORY_R();
4395		goto out;
4396	}
4397
4398	/*
4399	 * We don't yet have info about this mount. It's new!
4400	 */
4401
4402	hmd->hmd_blocksize = vfs_devblocksize(mount);
4403
4404	VFSATTR_INIT(&attr);
4405	VFSATTR_WANTED(&attr, f_uuid);
4406	context = vfs_context_create(NULL);
4407	error = vfs_getattr(mount, &attr, context);
4408
4409	if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) {
4410		vfs_context_rele(context);
4411#ifdef BC_DEBUG
4412		if (strncmp("devfs", name, sizeof("devfs")) != 0) {
4413			debug("Found mount %s without a UUID: error %d", name, error);
4414		}
4415#endif
4416		UNLOCK_HISTORY_R();
4417		goto out;
4418	}
4419
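	/* the throttle mask identifies the physical disk backing this mount; it is compared against the root disk's ID below */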
4420	hmd->hmd_disk_id = vfs_throttle_mask(mount);
4421
4422	if (VNOP_IOCTL(devvp,              /* vnode */
4423				   DKIOCISSOLIDSTATE,    /* cmd */
4424				   (caddr_t)&hmd->hmd_is_ssd, /* data */
4425				   0,
4426				   context))           /* context */
4427	{
4428		debug("can't determine if physical disk for history mount %s is an ssd", uuid_string(hmd->hmd_mount.hm_uuid));
4429		hmd->hmd_is_ssd = 0;
4430	}
4431
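	/* Record the root device's disk ID the first time we see it; BC_add_history compares against it to identify (and, with ROOT_DISK_ONLY, restrict to) IOs on the root physical disk */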
4432#ifdef ROOT_DISK_ONLY
4433	if (devvp == rootvp) {
4434		if (BC_cache->c_root_disk_id == 0) {
4435			BC_cache->c_root_disk_id = hmd->hmd_disk_id;
4436			if (BC_cache->c_root_disk_id == 0) {
4437				message("Root disk is 0");
4438			} else {
4439				debug("Root disk (via history) is 0x%llx", BC_cache->c_root_disk_id);
4440			}
4441		} else if (BC_cache->c_root_disk_id != hmd->hmd_disk_id) {
4442			debug("Root disk 0x%llx doesn't match that found by history 0x%llx", BC_cache->c_root_disk_id, hmd->hmd_disk_id);
4443		}
4444	}
4445#endif
4446
4447
4448	/* Found info for our mount */
4449	debug("Found new historic mount after %d IOs: %s, dev 0x%x, disk 0x%llx, blk %d%s",
4450		  hmd->hmd_mount.hm_nentries,
4451		  uuid_string(attr.f_uuid),
4452		  hmd->hmd_dev,
4453		  hmd->hmd_disk_id,
4454		  hmd->hmd_blocksize,
4455		  hmd->hmd_is_ssd ? ", ssd" : "");
4456
4457	if (hmd->hmd_blocksize == 0) {
4458		hmd->hmd_blocksize = 512;
4459	}
4460	hmd->hmd_mount.hm_nentries = 0;
4461
4462	uuid_copy(hmd->hmd_mount.hm_uuid, attr.f_uuid);
4463
4464	UNLOCK_HISTORY_R();
4465
4466	if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {
4467		LOCK_CACHE_R();
4468		/* Check if we have a playlist mount that matches this mount and set it up */
4469		for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
4470			struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx;
4471			if (CM_SETUP == cm->cm_state && 0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) {
4472				/* Found a matching unfilled mount */
4473
4474				LOCK_MOUNT_W(cm);
4475				if ((error = BC_fill_in_mount(cm, mount, context)) == 0) {
4476					LOCK_MOUNT_W_TO_R(cm);
4477					/* Kick off another reader thread if this is a new physical disk */
4478					BC_mount_available(cm);
4479					UNLOCK_MOUNT_R(cm);
4480				} else {
4481					if (error == EAGAIN) {
4482						*go_again = 1;
4483					}
4484					UNLOCK_MOUNT_W(cm);
4485				}
4486
4487				break;
4488			}
4489		}
4490		UNLOCK_CACHE_R();
4491	}
4492
4493	vfs_context_rele(context);
4494
4495out:
4496	if (devvp != NULLVP) {
4497		vnode_put(devvp);
4498	}
4499	return VFS_RETURNED;
4500}
4501
4502/*
4503 * Check for new or disappearing mounts
4504 */
4505static void BC_update_mounts(void) {
4506
4507	/* don't do anything if we've stopped recording history */
4508	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
4509		return;
4510	}
4511
4512	int go_again = 0;
4513	vfs_iterate(0, check_for_new_mount_itr, &go_again);
4514	if (go_again) {
4515		debug("Missing root mount during last iteration, going again");
4516		vfs_iterate(0, check_for_new_mount_itr, &go_again);
4517	}
4518}
4519
4520/*
4521 * Find the mount that corresponds to this IO,
4522 * or the first empty slot.
4523 *
4524 * Called with the history read lock
4525 */
4526static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex) {
4527	struct BC_history_mount_device *tmphmd = NULL, **phmd, *ret = NULL;
4528	int mount_idx = 0;
4529
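	/*
	 * Walk the linked list of history mount devices looking for this dev_t.
	 * If we reach the end of the list without a match, append a new entry
	 * lock-free via compare-and-swap, retrying if another thread appends first.
	 */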
4530	for (phmd = &BC_cache->c_history_mounts;
4531		 (*phmd) == NULL || (*phmd)->hmd_dev != dev;
4532		 phmd = &(*phmd)->hmd_next) {
4533
4534		if (*phmd == NULL) {
4535			if (tmphmd == NULL) { /* we may have an allocation from previous iterations */
4536				tmphmd = kalloc(sizeof(struct BC_history_mount_device));
4537				if (tmphmd == NULL) {
4538					message("could not allocate %lu bytes for history mount device",
4539							sizeof(struct BC_history_mount_device));
4540					BC_set_flag(BC_FLAG_HTRUNCATED);
4541					goto out;
4542				}
4543
4544				/* claim the new entry */
4545				tmphmd->hmd_next = NULL;
4546				tmphmd->hmd_mount.hm_nentries = 0;
4547				uuid_clear(tmphmd->hmd_mount.hm_uuid);
4548				tmphmd->hmd_disk_id = 0;
4549				tmphmd->hmd_blocksize = 0;
4550				tmphmd->hmd_dev = dev;
4551			}
4552
4553			if (OSCompareAndSwapPtr(NULL, tmphmd, phmd)) {
4554				tmphmd = NULL;
4555				if (BC_cache->c_take_detailed_stats) {
4556					BC_ADD_STAT(history_mounts, 1);
4557				}
4558				break;
4559			}
4560
4561			/* someone else added a mount before we could, check if its the one we want */
4562			if ((*phmd)->hmd_dev == dev) break;
4563
4564		}
4565		mount_idx++;
4566	}
4567
4568	if (pindex)
4569		*pindex = mount_idx;
4570	ret = *phmd;
4571out:
4572	if (tmphmd)
4573		kfree(tmphmd, sizeof(struct BC_history_mount_device));
4574
4575	return ret;
4576}
4577
4578/*
4579 * Record an incoming IO in the history list.
4580 *
4581 * Returns non-0 if this IO was on the root disk.
4582 *
4583 * Note that this function is not reentrant.
4584 */
4585static int
4586BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev)
4587{
4588	u_int64_t offset;
4589	struct BC_history_entry_cluster *hec, *tmphec = NULL;
4590	struct BC_history_mount_device *hmd = NULL;
4591	struct BC_history_entry *he;
4592	int mount_idx, entry_idx;
4593	int is_root_disk = 0;
4594
4595	/* don't do anything if we've stopped recording history */
4596	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE))
4597		return 0;
4598
4599	LOCK_HISTORY_R();
4600
4601	/* check again, with lock */
4602	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE))
4603		goto out;
4604
4605	/* count IOs during boot (first recording) */
4606	if (BC_cache->c_take_detailed_stats) {
4607		if (! tag) {
4608			if (! write) {
4609				BC_ADD_STAT(history_reads, 1);
4610			} else {
4611				BC_ADD_STAT(history_writes, 1);
4612			}
4613		}
4614	}
4615
4616	/* don't do anything if the history list has been truncated */
4617	if (!tag && BC_cache->c_flags & BC_FLAG_HTRUNCATED)
4618		goto out;
4619
4620	/*
4621	 * In order to avoid recording a playlist larger than we will
4622	 * allow to be played back, if the total byte count for recorded
4623	 * reads reaches 50% of the physical memory in the system, we
4624	 * start ignoring reads.
4625	 * This is preferable to dropping entries from the end of the
4626	 * playlist as we copy it in, since it will result in a clean
4627	 * cessation of caching at a single point, rather than
4628	 * introducing sporadic cache misses throughout the cache's run.
4629	 */
4630	if (!tag && (BC_cache->c_history_size + length) > BC_MAX_SIZE) {
4631		debug("History recording too large, capping at %lluMB", BC_MAX_SIZE / (1024 * 1024));
4632		BC_set_flag(BC_FLAG_HTRUNCATED);
4633		goto out;
4634	}
4635
4636#ifdef NO_HISTORY
4637	BC_set_flag(BC_FLAG_HTRUNCATED);
4638	debug("history disabled, truncating");
4639	goto out;
4640#endif
4641
4642	mount_idx = 0;
4643	offset = 0;
4644	if (! tag) {
4645
4646		hmd = BC_get_history_mount_device(dev, &mount_idx);
4647		if (hmd == NULL) goto out;
4648
4649		if (uuid_is_null(hmd->hmd_mount.hm_uuid)) {
4650			/* Couldn't identify the mount
4651			 *
4652			 * Before the mount has appeared we don't want to
4653			 * include any of this device's IOs in the history
4654			 * since these IOs will be issued next boot before
4655			 * we're able to start this mount's part of the
4656			 * playlist.
4657			 */
4658
4659			/* Keep track of the number of IOs we've seen until the mount appears */
4660			OSIncrementAtomic(&hmd->hmd_mount.hm_nentries);
4661
4662			if (BC_cache->c_take_detailed_stats) {
4663				BC_ADD_STAT(history_unknown, 1);
4664				BC_ADD_STAT(history_unknown_bytes, length);
4665			}
4666			goto out;
4667		}
4668
4669		offset = HB_BLOCK_TO_BYTE(hmd, blkno);
4670
4671		/* Only track IOs on the root disk.
4672		 *
4673		 * We don't expect IOs on other physical disks
4674		 * to be part of the critical path to boot/login
4675		 * and we certainly don't expect the other disks
4676		 * to be contested enough to make a cache worthwhile
4677		 */
4678
4679		if (hmd->hmd_disk_id != BC_cache->c_root_disk_id) {
4680#ifdef ROOT_DISK_ONLY
4681			goto out;
4682#endif
4683		} else {
4684			is_root_disk = 1;
4685		}
4686
4687	}
4688
4689	/*
4690	 * Get the current entry cluster.
4691	 */
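	/* An entry slot is claimed atomically; when the current cluster is full (or absent),
	 * a new cluster is allocated and published with compare-and-swap, retrying if
	 * another thread installs its cluster first.
	 */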
4692	while ((hec = BC_cache->c_history) == NULL ||
4693		   (entry_idx = OSAddAtomic(1, &hec->hec_nentries)) >= BC_HISTORY_ENTRIES) {
4694		if (hec)
4695			OSAddAtomic(-1, &hec->hec_nentries);
4696
4697		/* need a new cluster */
4698
4699		/* limit the number of clusters we will allocate */
4700		if (BC_cache->c_history_num_clusters + 1 >= BC_HISTORY_MAXCLUSTERS) {
4701			message("too many history clusters (%d, limit %ld)",
4702					BC_cache->c_history_num_clusters,
4703					(long)BC_HISTORY_MAXCLUSTERS);
4704			BC_set_flag(BC_FLAG_HTRUNCATED);
4705			goto out;
4706		}
4707
4708		if (tmphec == NULL) { /* we may have an allocation from previous iterations */
4709			tmphec = kalloc(BC_HISTORY_ALLOC);
4710			if (tmphec == NULL) {
4711				message("could not allocate %d bytes for history cluster",
4712						BC_HISTORY_ALLOC);
4713				BC_set_flag(BC_FLAG_HTRUNCATED);
4714				goto out;
4715			}
4716
4717			/* claim the first entry of the new cluster */
4718			tmphec->hec_nentries = 1;
4719			tmphec->hec_link = hec;
4720		}
4721
4722		if (OSCompareAndSwapPtr(hec, tmphec, &BC_cache->c_history)) {
4723			hec = tmphec;
4724			entry_idx = 0;
4725			tmphec = NULL;
4726			OSAddAtomic(1, &BC_cache->c_history_num_clusters);
4727			break;
4728		}
4729	}
4730	if (tmphec != NULL)
4731		kfree(tmphec, BC_HISTORY_ALLOC);
4732
4733	/*
4734	 * We've claimed entry entry_idx of cluster hec, fill it in.
4735	 */
4736	he = &(hec->hec_data[entry_idx]);
4737	assert(he >= &hec->hec_data[0]);
4738	assert(he < &hec->hec_data[BC_HISTORY_ENTRIES]);
4739	he->he_offset    = offset;
4740	he->he_length    = length;
4741	he->he_pid       = pid;
4742	he->he_mount_idx = mount_idx;
4743	he->he_flags     = 0x0;
4744	if (hit)    he->he_flags |= BC_HE_HIT;
4745	if (write)  he->he_flags |= BC_HE_WRITE;
4746	if (tag)    he->he_flags |= BC_HE_TAG;
4747	if (shared) he->he_flags |= BC_HE_SHARED;
4748	if (hmd)
4749		OSIncrementAtomic(&hmd->hmd_mount.hm_nentries);
4750
4751	if (!write) {
4752		OSAddAtomic64(length, &BC_cache->c_history_size);
4753		if (BC_cache->c_take_detailed_stats) {
4754			BC_ADD_STAT(history_bytes, length);
4755			BC_ADD_STAT(history_entries, 1);
4756		}
4757	}
4758out:
4759	UNLOCK_HISTORY_R();
4760	return is_root_disk;
4761}
4762
4763/*
4764 * Return the size of the history mount list, in bytes.
4765 */
4766static int
4767BC_size_history_mounts(void)
4768{
4769	struct BC_history_mount_device *hmd;
4770	int nmounts;
4771
4772	/* walk the list of mount clusters and count the number of mounts */
4773	nmounts = 0;
4774	for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next)
4775		nmounts ++;
4776
4777	return(nmounts * sizeof(struct BC_history_mount));
4778}
4779
4780/*
4781 * Return the size of the history entry list, in bytes.
4782 */
4783static int
4784BC_size_history_entries(void)
4785{
4786	struct BC_history_entry_cluster *hec;
4787	int nentries, cluster_entries;
4788
4789	/* walk the list of clusters and count the number of entries */
4790	nentries = 0;
4791	for (hec = BC_cache->c_history; hec != NULL; hec = hec->hec_link) {
4792		cluster_entries = hec->hec_nentries;
4793		if (cluster_entries > BC_HISTORY_ENTRIES) {
4794			cluster_entries = BC_HISTORY_ENTRIES;
4795		}
4796
4797		nentries += cluster_entries;
4798	}
4799
4800	return(nentries * sizeof(struct BC_history_entry));
4801}
4802
4803/*
4804 * Copy out the history mount list to userspace.
4805 *
4806 * Note that the cache must be shut down before calling this function in
4807 * order to avoid a race around the entry count/buffer size.
4808 */
4809static int
4810BC_copyout_history_mounts(user_addr_t uptr)
4811{
4812	struct BC_history_mount_device *hmd;
4813	int error;
4814
4815	assert(uptr != 0);
4816
4817	/*
4818	 * Copy out the mounts
4819	 */
4820	for (hmd = BC_cache->c_history_mounts;
4821		 hmd != NULL;
4822		 hmd = hmd->hmd_next) {
4823
4824		/* copy the cluster out */
4825		if ((error = copyout(&(hmd->hmd_mount), uptr,
4826							 sizeof(struct BC_history_mount))) != 0)
4827			return(error);
4828		uptr += sizeof(struct BC_history_mount);
4829
4830	}
4831	return(0);
4832}
4833
4834/*
4835 * Copy out the history entry list to userspace.
4836 *
4837 * Note that the cache must be shut down before calling this function in
4838 * order to avoid a race around the entry count/buffer size.
4839 */
4840static int
4841BC_copyout_history_entries(user_addr_t uptr)
4842{
4843	struct BC_history_entry_cluster *hec, *ohec;
4844	int error, cluster_entries;
4845
4846	assert(uptr != 0);
4847	assert(BC_cache->c_history);
4848
4849	/*
4850	 * Copying the history entries out is a little messy due to the fact that the
4851	 * clusters are on the list backwards, but we want the entries in order.
4852	 */
4853	ohec = NULL;
4854	if (BC_cache->c_history) {
4855		for (;;) {
4856			/* scan the list until we find the last one we haven't copied */
4857			hec = BC_cache->c_history;
4858			while (hec->hec_link != ohec)
4859				hec = hec->hec_link;
4860			ohec = hec;
4861
4862			cluster_entries = hec->hec_nentries;
4863			if (cluster_entries > BC_HISTORY_ENTRIES) {
4864				cluster_entries = BC_HISTORY_ENTRIES;
4865			}
4866
4867			/* copy the cluster out */
4868			if ((error = copyout(hec->hec_data, uptr,
4869								 cluster_entries * sizeof(struct BC_history_entry))) != 0)
4870				return(error);
4871			uptr += cluster_entries * sizeof(struct BC_history_entry);
4872
4873			/* if this was the last cluster, all done */
4874			if (hec == BC_cache->c_history)
4875				break;
4876		}
4877	}
4878	return(0);
4879}
4880
4881/*
4882 * Discard the history list.
4883 */
4884static void
4885BC_discard_history(void)
4886{
4887	struct BC_history_mount_device  *hmd;
4888	struct BC_history_entry_cluster *hec;
4889
4890	while (BC_cache->c_history_mounts != NULL) {
4891		hmd = BC_cache->c_history_mounts;
4892		BC_cache->c_history_mounts = hmd->hmd_next;
4893		kfree(hmd, sizeof(struct BC_history_mount_device));
4894	}
4895
4896	while (BC_cache->c_history != NULL) {
4897		hec = BC_cache->c_history;
4898		BC_cache->c_history = hec->hec_link;
4899		kfree(hec, BC_HISTORY_ALLOC);
4900	}
4901}
4902
4903/*
4904 * Setup for the root mounted device
4905 */
4906static int
4907BC_setup(void)
4908{
4909	unsigned int boot_arg;
4910
4911	/*
4912	 * If we have less than the minimum supported physical memory,
4913	 * bail immediately.  With too little memory, the cache will
4914	 * become polluted with nondeterministic pagein activity, leaving
4915	 * large amounts of wired memory around which further exacerbates
4916	 * the problem.
4917	 */
4918	if ((max_mem / (1024 * 1024)) < BC_minimum_mem_size) {
4919		debug("insufficient physical memory (%dMB < %dMB required)",
4920			  (int) (max_mem / (1024 * 1024)), BC_minimum_mem_size);
4921		return(ENOMEM);
4922	}
4923
4924	/*
4925	 * Find our block device.
4926	 *
4927	 * Note that in the netbooted case, we are loaded but should not
4928	 * actually run, unless we're explicitly overridden.
4929	 *
4930	 * Note also that if the override is set, we'd better have a valid
4931	 * rootdev...
4932	 */
4933	if (netboot_root() &&
4934		(!PE_parse_boot_argn("BootCacheOverride", &boot_arg, sizeof(boot_arg)) ||
4935		 (boot_arg != 1))) {                          /* not interesting */
4936		return(ENXIO);
4937	}
4938
4939	debug("Major of rootdev is %d", major(rootdev));
4940
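	/* Look up the block device switch entry for the root device; its strategy and close routines are saved below so they can be interposed */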
4941	BC_cache->c_devsw = &bdevsw[major(rootdev)];
4942	assert(BC_cache->c_devsw != NULL);
4943
4944	/* Make sure we don't clobber the device strategy routine */
4945	if ((BC_cache->c_devsw->d_strategy == (strategy_fcn_t*)BC_strategy) ||
4946		(BC_cache->c_devsw->d_close    == BC_close)) {
4947		debug("cache was not terminated properly previously");
4948		return(ENXIO);
4949	}
4950
4951	BC_cache->c_strategy = BC_cache->c_devsw->d_strategy;
4952	BC_cache->c_close    = BC_cache->c_devsw->d_close;
4953	if (BC_cache->c_strategy == NULL ||
4954		BC_cache->c_close    == NULL) {	/* not configured */
4955		return(ENXIO);
4956	}
4957
4958	BC_set_flag(BC_FLAG_SETUP);
4959
4960	return(0);
4961}
4962
4963
4964/*
4965 * Called from the root-mount hook.
4966 */
4967static void
4968BC_auto_start(void)
4969{
4970	int error;
4971	user_addr_t pm, pe;
4972
4973	if ((error = BC_setup()) != 0) {
4974		debug("BootCache setup failed: %d", error);
4975		return;
4976	}
4977
4978	pm = CAST_USER_ADDR_T((uintptr_t)BC_preloaded_playlist + sizeof(struct BC_playlist_header));
4979	pe = pm + (BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount));
4980
4981	/* Initialize the cache.
4982	 *
4983	 * auto-start is for DVDs or other read-only media, so don't bother recording history
4984	 */
4985	error = BC_init_cache(BC_preloaded_playlist->ph_nmounts  * sizeof(struct BC_playlist_mount), pm,
4986						  BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry), pe,
4987						  0 /* don't record history */);
4988	if (error != 0)
4989		debug("BootCache autostart failed: %d", error);
4990}
4991
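/*
 * For reference, the preloaded playlist section is laid out as assumed by
 * the pointer arithmetic in BC_auto_start() above and by the size check in
 * BC_load():
 *
 *	struct BC_playlist_header                      (ph_nmounts, ph_nentries, ...)
 *	struct BC_playlist_mount  [ph_nmounts]         <- pm points here
 *	struct BC_playlist_entry  [ph_nentries]        <- pe points here
 */
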
4992/*
4993 * Called from the root-unmount hook.
4994 */
4995static void
4996BC_unmount_hook(void)
4997{
4998	debug("device unmounted");
4999
5000	/*
5001	 * If the cache is running, stop it.  If it's already stopped
5002	 * (it may have stopped itself), that's OK.
5003	 */
5004	BC_terminate_cache();
5005	BC_terminate_history();
5006
5007}
5008
5009
5010/*
5011 * Handle the command sysctl.
5012 */
5013static int
5014BC_sysctl SYSCTL_HANDLER_ARGS
5015{
5016	struct BC_command bc;
5017	int error;
5018
5019	/* get the command structure and validate it */
5020	if ((error = SYSCTL_IN(req, &bc, sizeof(bc))) != 0) {
5021		debug("couldn't get command");
5022		return(error);
5023	}
5024	if (bc.bc_magic != BC_MAGIC) {
5025		debug("bad command magic");
5026		return(EINVAL);
5027	}
5028
5029	switch (bc.bc_opcode) {
5030		case BC_OP_START:
5031			debug("BC_OP_START(%d mounts, %d extents)", (int)(bc.bc_data1_size / sizeof(struct BC_playlist_mount)), (int)(bc.bc_data2_size / sizeof(struct BC_playlist_entry)));
5032			if (BC_preloaded_playlist) {
5033				error = EINVAL;
5034				break;
5035			}
5036			error = BC_init_cache(bc.bc_data1_size, (user_addr_t)bc.bc_data1, bc.bc_data2_size, (user_addr_t)bc.bc_data2, 1 /* record history */);
5037			if (error != 0) {
5038				message("cache init failed");
5039			}
5040			break;
5041
5042		case BC_OP_STOP:
5043			debug("BC_OP_STOP");
5044
5045			/*
5046			 * If we're recording history, stop it.  If it's already stopped
5047			 * (it may have stopped itself), that's OK.
5048			 */
5049			BC_terminate_history();
5050
5051			/* return the sizes of the history buffers */
5052			LOCK_HISTORY_W();
5053			bc.bc_data1_size = BC_size_history_mounts();
5054			bc.bc_data2_size = BC_size_history_entries();
5055			if ((error = SYSCTL_OUT(req, &bc, sizeof(bc))) != 0)
5056				debug("could not return history size");
5057			UNLOCK_HISTORY_W();
5058
5059			break;
5060
5061		case BC_OP_MOUNT:
5062			/* debug("BC_OP_MOUNT"); */
5063
5064			/*
5065			 * A notification that mounts have changed.
5066			 */
5067			BC_update_mounts();
5068			break;
5069
5070		case BC_OP_HISTORY:
5071			debug("BC_OP_HISTORY");
5072			/* if there's a user buffer, verify the size and copy out */
5073			LOCK_HISTORY_W();
5074			if (bc.bc_data1 != 0 && bc.bc_data2 != 0) {
5075				if (bc.bc_data1_size < BC_size_history_mounts() ||
5076					bc.bc_data2_size < BC_size_history_entries()) {
5077					debug("supplied history buffer too small");
5078					error = EINVAL;
5079					UNLOCK_HISTORY_W();
5080					break;
5081				}
5082				if ((error = BC_copyout_history_mounts(bc.bc_data1)) != 0) {
5083					debug("could not copy out history mounts: %d", error);
5084					/* fall through and discard the history below */
5085				} else if ((error = BC_copyout_history_entries(bc.bc_data2)) != 0) {
5086					debug("could not copy out history entries: %d", error);
5087				}
5088			}
5089			/*
5090			 * Release the last of the memory we own.
5091			 */
5092			BC_discard_history();
5093			UNLOCK_HISTORY_W();
5094			break;
5095
5096		case BC_OP_JETTISON:
5097			debug("BC_OP_JETTISON");
5098
5099			/*
5100			 * Jettison the cache. If it's already jettisoned
5101			 * (it may have jettisoned itself), that's OK.
5102			 */
5103			BC_terminate_cache();
5104			break;
5105
5106		case BC_OP_STATS:
5107			debug("BC_OP_STATS");
5108			/* check buffer size and copy out */
5109			if (bc.bc_data1_size != sizeof(BC_cache->c_stats)) {
5110				debug("stats structure wrong size");
5111				error = ENOMEM;
5112			} else {
5113				BC_cache->c_stats.ss_cache_flags = BC_cache->c_flags;
5114				if ((error = copyout(&BC_cache->c_stats, bc.bc_data1, bc.bc_data1_size)) != 0)
5115					debug("could not copy out statistics");
5116			}
5117			break;
5118
5119		case BC_OP_TAG:
5120			debug("BC_OP_TAG");
5121			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_TAG) |
5122								  DBG_FUNC_NONE, proc_selfppid(), dbg_tag_count,
5123								  0, 0, 0);
5124			BC_add_history(0, 0, 0, 0, 0, 1, 0, 0);
5125			dbg_tag_count++;
5126			break;
5127
5128		case BC_OP_TEST:
5129			debug("BC_OP_TEST");
5130
5131			if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
5132				error = ENODEV;
5133			}
5134			break;
5135
5136		default:
5137			error = EINVAL;
5138	}
5139	return(error);
5140}
5141
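/*
 * For illustration only (not compiled here): a sketch of how a userland
 * tool might drive this handler through the "kern.BootCache" OID that
 * BC_load() registers.  The command structure travels as the sysctl's
 * "new" value; for BC_OP_STOP the handler also returns the updated
 * structure, with the history buffer sizes filled in, through the "old"
 * value buffer.  Field and constant names follow struct BC_command as
 * used above; error handling is omitted.
 *
 *	struct BC_command bc;
 *	size_t len = sizeof(bc);
 *
 *	bzero(&bc, sizeof(bc));
 *	bc.bc_magic  = BC_MAGIC;
 *	bc.bc_opcode = BC_OP_STOP;
 *	sysctlbyname("kern.BootCache", &bc, &len, &bc, sizeof(bc));
 *
 *	// bc.bc_data1_size/bc_data2_size now hold the history buffer sizes;
 *	// allocate buffers of those sizes, point bc_data1/bc_data2 at them,
 *	// and issue BC_OP_HISTORY to copy the recorded history out.
 */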
5142
5143/*
5144 * Initialise the block of pages we use to back our data.
5145 *
5146 */
5147static int
5148BC_alloc_pagebuffer(void)
5149{
5150	kern_return_t kret;
5151	int ret;
5152
5153	if (BC_cache->c_vp == NULL) {
5154
5155		kret = vnode_open(BC_BOOT_BACKINGFILE, (O_CREAT | O_NOFOLLOW | FREAD),
5156						  0400, 0, &BC_cache->c_vp, vfs_context_current());
5157		if (kret != KERN_SUCCESS) {
5158			debug("vnode_open failed - %d", kret);
5159			return -1;
5160		}
5161
5162		BC_cache->c_vid = vnode_vid(BC_cache->c_vp);
5163
5164	} else {
5165		ret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
5166		if (ret != 0) {
5167			debug("vnode_getwithvid failed - %d", ret);
5168			return -1;
5169		}
5170	}
5171
5172	assert(BC_cache->c_cachesize != 0);
5173	kret = ubc_setsize(BC_cache->c_vp, BC_cache->c_cachesize);
5174	if (kret != 1) {		/* ubc_setsize() returns 1 on success */
5175		debug("ubc_setsize failed - %d", kret);
5176		(void) vnode_put(BC_cache->c_vp);
5177		return -1;
5178	}
5179
5180	(void) vnode_put(BC_cache->c_vp);
5181
5182	return(0);
5183}
5184
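/*
 * Note that the cache contents are kept in the UBC pages backing
 * BC_BOOT_BACKINGFILE rather than in a private allocation: the
 * ubc_setsize() call above establishes the page range, and pages are
 * given back with ubc_range_op(UPL_ROP_DUMP) in BC_free_page() and
 * BC_free_pagebuffer() below.
 */
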
5185/*
5186 * Release all resources associated with our pagebuffer.
5187 */
5188static void
5189BC_free_pagebuffer(void)
5190{
5191	kern_return_t kret;
5192	if (BC_cache->c_vp != NULL) {
5193		/* Can handle a vnode without ubc info */
5194		kret = vnode_getwithref(BC_cache->c_vp);
5195		if (kret != KERN_SUCCESS) {
5196			debug("vnode_getwithref failed - %d", kret);
5197			return;
5198		}
5199
5200		kret = ubc_range_op(BC_cache->c_vp, 0, BC_cache->c_cachesize,
5201							UPL_ROP_DUMP, NULL);
5202		if (kret != KERN_SUCCESS) {
5203			debug("ubc_range_op failed - %d", kret);
5204		}
5205
5206#if 0
5207		/* need this interface to be exported... will eliminate need
5208		 * for the ubc_range_op above */
5209		kret = vnode_delete(BC_BOOT_BACKINGFILE, vfs_context_current());
5210		if (kret) {
5211			debug("vnode_delete failed - %d", kret);
5212		}
5213#endif
5214		kret = vnode_close(BC_cache->c_vp, 0, vfs_context_current());
5215		if (kret != KERN_SUCCESS) {
5216			debug("vnode_close failed - %d", kret);
5217		}
5218
5219		BC_cache->c_vp = NULL;
5220	}
5221}
5222
5223/*
5224 * Release one page from the pagebuffer.
5225 */
5226static void
5227BC_free_page(struct BC_cache_extent *ce, int page)
5228{
5229	off_t vpoffset;
5230	kern_return_t kret;
5231
5232	vpoffset = (page * PAGE_SIZE) + ce->ce_cacheoffset;
5233	kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
5234	if (kret != KERN_SUCCESS) {
5235		debug("vnode_getwithvid failed - %d", kret);
5236		return;
5237	}
5238
5239	kret = ubc_range_op(BC_cache->c_vp, vpoffset, vpoffset + PAGE_SIZE,
5240						UPL_ROP_DUMP, NULL);
5241	if (kret != KERN_SUCCESS) {
5242		debug("ubc_range_op failed - %d", kret);
5243	}
5244
5245	(void) vnode_put(BC_cache->c_vp);
5246}
5247
5248void
5249BC_hook(void)
5250{
5251	int error;
5252
5253	if ((error = BC_setup()) != 0) {
5254		debug("BootCache setup failed: %d", error);
5255	}
5256}
5257
5258/*
5259 * Cache start/stop functions.
5260 */
5261void
5262BC_load(void)
5263{
5264	struct mach_header	*mh;
5265	long			xsize;
5266	int			has64bitness, error;
5267	size_t			sysctlsize = sizeof(int);
5268	char			*plsection = "playlist";
5269
5270#ifdef BC_DEBUG
5271	microtime(&debug_starttime);
5272#endif
5273
5274	/* register our control interface */
5275	sysctl_register_oid(&sysctl__kern_BootCache);
5276
5277	/* clear the control structure */
5278	bzero(BC_cache, sizeof(*BC_cache));
5279
5280	BC_cache->c_lckgrp = lck_grp_alloc_init("BootCache", LCK_GRP_ATTR_NULL);
5281	lck_rw_init(&BC_cache->c_history_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
5282	lck_rw_init(&BC_cache->c_cache_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
5283	lck_mtx_init(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
5284	lck_mtx_init(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
5285#ifdef BC_DEBUG
5286	lck_mtx_init(&debug_printlock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
5287#endif
5288	/*
5289	 * Find the preload section and its size.  If it's not empty
5290	 * we have a preloaded playlist, so prepare for an early
5291	 * start.
5292	 */
5293	if (kmod_info.info_version != KMOD_INFO_VERSION) {
5294		message("incompatible kmod_info versions");
5295		return;
5296	}
5297	mh = (struct mach_header *)kmod_info.address;
5298
5299	/*
5300	 * If booting on a 32-bit only machine, use the "playlist32" section.
5301	 * Otherwise, fall through to the default "playlist" section.
5302	 */
5303	error = sysctlbyname("hw.cpu64bit_capable", &has64bitness, &sysctlsize,
5304						 NULL, 0);
5305	if (error == 0 && has64bitness == 0) {
5306		plsection = "playlist32";
5307	}
5308	BC_preloaded_playlist = getsectdatafromheader(mh, "BootCache",
5309												  plsection, &BC_preloaded_playlist_size);
5310	debug("preload section %s: %d @ %p",
5311	      plsection, BC_preloaded_playlist_size, BC_preloaded_playlist);
5312
5313	if (BC_preloaded_playlist != NULL) {
5314		if (BC_preloaded_playlist_size < sizeof(struct BC_playlist_header)) {
5315			message("preloaded playlist too small");
5316			return;
5317		}
5318		if (BC_preloaded_playlist->ph_magic != PH_MAGIC) {
5319			message("preloaded playlist has invalid magic (%x, expected %x)",
5320					BC_preloaded_playlist->ph_magic, PH_MAGIC);
5321			return;
5322		}
5323		xsize = sizeof(struct BC_playlist_header) +
5324		(BC_preloaded_playlist->ph_nmounts  * sizeof(struct BC_playlist_mount)) +
5325		(BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry));
5326		if (xsize > BC_preloaded_playlist_size) {
5327			message("preloaded playlist too small (%d, expected at least %ld)",
5328					BC_preloaded_playlist_size, xsize);
5329			return;
5330		}
5331		BC_wait_for_readahead = BC_READAHEAD_WAIT_CDROM;
5332		mountroot_post_hook = BC_auto_start;
5333		unmountroot_pre_hook = BC_unmount_hook;
5334
5335		debug("waiting for root mount...");
5336	} else {
5337		/* no preload, normal operation */
5338		BC_preloaded_playlist = NULL;
5339		mountroot_post_hook = BC_hook;
5340		debug("waiting for playlist...");
5341	}
5342
5343	microtime(&BC_cache->c_loadtime);
5344}
5345
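/*
 * The "BootCache,playlist" (or "playlist32") section consulted above is
 * expected to be stamped into the kext binary ahead of time, presumably
 * with something along the lines of ld's
 * "-sectcreate BootCache playlist <playlist file>"; the exact mechanism
 * is outside the scope of this file.
 */
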
5346int
5347BC_unload(void)
5348{
5349	int error;
5350
5351	debug("preparing to unload...");
5352	unmountroot_pre_hook = NULL;
5353
5354	/*
5355	 * If the cache is running, stop it.  If it's already stopped
5356	 * (it may have stopped itself), that's OK.
5357	 */
5358	BC_terminate_history();
5359	if ((error = BC_terminate_cache()) != 0) {
5360		if (error != ENXIO)
5361			goto out;
5362	}
5363	LOCK_HISTORY_W();
5364	BC_discard_history();
5365	UNLOCK_HISTORY_W();
5366
5367	lck_rw_destroy(&BC_cache->c_history_lock, BC_cache->c_lckgrp);
5368	lck_rw_destroy(&BC_cache->c_cache_lock, BC_cache->c_lckgrp);
5369	lck_mtx_destroy(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp);
5370	lck_mtx_destroy(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp);
5371	lck_grp_free(BC_cache->c_lckgrp);
5372
5373	sysctl_unregister_oid(&sysctl__kern_BootCache);
5374	error = 0;
5375
5376out:
5377	return(error);
5378}
5379