1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 *	The Regents of the University of California.  All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 *    notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 *    notice, this list of conditions and the following disclaimer in the
46 *    documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 *    must display the following acknowledgement:
49 *	This product includes software developed by the University of
50 *	California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 *    may be used to endorse or promote products derived from this software
53 *    without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
68 */
69
70/*
71 * Some references:
72 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77#include <sys/param.h>
78#include <sys/systm.h>
79#include <sys/proc_internal.h>
80#include <sys/buf_internal.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/trace.h>
84#include <sys/malloc.h>
85#include <sys/resourcevar.h>
86#include <miscfs/specfs/specdev.h>
87#include <sys/ubc.h>
88#include <sys/kauth.h>
89#if DIAGNOSTIC
90#include <kern/assert.h>
91#endif /* DIAGNOSTIC */
92#include <kern/task.h>
93#include <kern/zalloc.h>
94#include <kern/lock.h>
95
96#include <sys/fslog.h>		/* fslog_io_error() */
97
98#include <mach/mach_types.h>
99#include <mach/memory_object_types.h>
100#include <kern/sched_prim.h>	/* thread_block() */
101
102#include <vm/vm_kern.h>
103#include <vm/vm_pageout.h>
104
105#include <sys/kdebug.h>
106
107#include <libkern/OSAtomic.h>
108#include <libkern/OSDebug.h>
109#include <sys/ubc_internal.h>
110
111#include <sys/sdt.h>
112#include <sys/cprotect.h>
113
114
115#if BALANCE_QUEUES
116static __inline__ void bufqinc(int q);
117static __inline__ void bufqdec(int q);
118#endif
119
120int	bcleanbuf(buf_t bp, boolean_t discard);
121static int	brecover_data(buf_t bp);
122static boolean_t incore(vnode_t vp, daddr64_t blkno);
123/* timeout is in msecs */
124static buf_t	getnewbuf(int slpflag, int slptimeo, int *queue);
125static void	bremfree_locked(buf_t bp);
126static void	buf_reassign(buf_t bp, vnode_t newvp);
127static errno_t	buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
128static int	buf_iterprepare(vnode_t vp, struct buflists *, int flags);
129static void	buf_itercomplete(vnode_t vp, struct buflists *, int flags);
130static boolean_t buffer_cache_gc(int);
131static buf_t	buf_brelse_shadow(buf_t bp);
132static void	buf_free_meta_store(buf_t bp);
133
134static buf_t	buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
135					   uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
136
137
138__private_extern__ int  bdwrite_internal(buf_t, int);
139
140/* zone allocated buffer headers */
141static void	bufzoneinit(void) __attribute__((section("__TEXT, initcode")));
142static void	bcleanbuf_thread_init(void) __attribute__((section("__TEXT, initcode")));
143static void	bcleanbuf_thread(void);
144
145static zone_t	buf_hdr_zone;
146static int	buf_hdr_count;
147
148
149/*
150 * Definitions for the buffer hash lists.
151 */
152#define	BUFHASH(dvp, lbn)	\
153	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
154LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
155u_long	bufhash;
156
157static buf_t	incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
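
/*
 * Illustrative note (a sketch, not part of the original source): the hash
 * bucket for a (vnode, logical block) pair is chosen by mixing the vnode
 * pointer with the block number and masking with 'bufhash' (a power-of-two
 * minus one set up by hashinit() in bufinit()).  A lookup, with buf_mtxp
 * held, would look roughly like:
 *
 *	struct bufhashhdr *dp = BUFHASH(vp, blkno);
 *	buf_t bp = incore_locked(vp, blkno, dp);
 *
 * Only BUFHASH, bufhash and incore_locked are taken from this file; the
 * snippet itself is just an example of how the lookup path uses the macro.
 */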
158
159/* Definitions for the buffer stats. */
160struct bufstats bufstats;
161
162/* Number of delayed write buffers */
163long nbdwrite = 0;
164int blaundrycnt = 0;
165static int boot_nbuf_headers = 0;
166
167static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
168
169static TAILQ_HEAD(ioqueue, buf) iobufqueue;
170static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
171static int needbuffer;
172static int need_iobuffer;
173
174static lck_grp_t	*buf_mtx_grp;
175static lck_attr_t	*buf_mtx_attr;
176static lck_grp_attr_t   *buf_mtx_grp_attr;
177static lck_mtx_t	*iobuffer_mtxp;
178static lck_mtx_t	*buf_mtxp;
179
180static int buf_busycount;
181
182static __inline__ int
183buf_timestamp(void)
184{
185	struct	timeval		t;
186	microuptime(&t);
187	return (t.tv_sec);
188}
189
190/*
191 * Insq/Remq for the buffer free lists.
192 */
193#if BALANCE_QUEUES
194#define	binsheadfree(bp, dp, whichq)	do { \
195				    TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
196					bufqinc((whichq));	\
197				} while (0)
198
199#define	binstailfree(bp, dp, whichq)	do { \
200				    TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
201					bufqinc((whichq));	\
202				} while (0)
203#else
204#define	binsheadfree(bp, dp, whichq)	do { \
205				    TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
206				} while (0)
207
208#define	binstailfree(bp, dp, whichq)	do { \
209				    TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
210				} while (0)
211#endif
212
213
214#define BHASHENTCHECK(bp)	\
215	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)	\
216		panic("%p: b_hash.le_prev is not deadbeef", (bp));
217
218#define BLISTNONE(bp)	\
219	(bp)->b_hash.le_next = (struct buf *)0;	\
220	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
221
222/*
223 * Insq/Remq for the vnode usage lists.
224 */
225#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
226#define	bufremvn(bp) {							\
227	LIST_REMOVE(bp, b_vnbufs);					\
228	(bp)->b_vnbufs.le_next = NOLIST;				\
229}
230
231/*
232 * Time in seconds before a buffer on a list is
233 * considered as a stale buffer
234 */
235#define LRU_IS_STALE 120 /* default value for the LRU */
236#define AGE_IS_STALE 60  /* default value for the AGE */
237#define META_IS_STALE 180 /* default value for the BQ_META */
238
239int lru_is_stale = LRU_IS_STALE;
240int age_is_stale = AGE_IS_STALE;
241int meta_is_stale = META_IS_STALE;
242
243#define MAXLAUNDRY	10
244
245/* LIST_INSERT_HEAD() with assertions */
246static __inline__ void
247blistenterhead(struct bufhashhdr * head, buf_t bp)
248{
249	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
250		(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
251	(head)->lh_first = bp;
252	bp->b_hash.le_prev = &(head)->lh_first;
253	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
254		panic("blistenterhead: le_prev is deadbeef");
255}
256
257static __inline__ void
258binshash(buf_t bp, struct bufhashhdr *dp)
259{
260#if DIAGNOSTIC
261	buf_t	nbp;
262#endif /* DIAGNOSTIC */
263
264	BHASHENTCHECK(bp);
265
266#if DIAGNOSTIC
267	nbp = dp->lh_first;
268	for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
269		if(nbp == bp)
270			panic("buf already in hashlist");
271	}
272#endif /* DIAGNOSTIC */
273
274	blistenterhead(dp, bp);
275}
276
277static __inline__ void
278bremhash(buf_t	bp)
279{
280	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
281		panic("bremhash le_prev is deadbeef");
282	if (bp->b_hash.le_next == bp)
283		panic("bremhash: next points to self");
284
285	if (bp->b_hash.le_next != NULL)
286		bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
287	*bp->b_hash.le_prev = (bp)->b_hash.le_next;
288}
289
290/*
291 * buf_mtxp held.
292 */
293static __inline__ void
294bmovelaundry(buf_t bp)
295{
296	bp->b_whichq = BQ_LAUNDRY;
297	bp->b_timestamp = buf_timestamp();
298	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
299	blaundrycnt++;
300}
301
302static __inline__ void
303buf_release_credentials(buf_t bp)
304{
305	if (IS_VALID_CRED(bp->b_rcred)) {
306		kauth_cred_unref(&bp->b_rcred);
307	}
308	if (IS_VALID_CRED(bp->b_wcred)) {
309		kauth_cred_unref(&bp->b_wcred);
310	}
311}
312
313
314int
315buf_valid(buf_t bp) {
316
317        if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
318	        return 1;
319	return 0;
320}
321
322int
323buf_fromcache(buf_t bp) {
324
325        if ( (bp->b_flags & B_CACHE) )
326	        return 1;
327	return 0;
328}
329
330void
331buf_markinvalid(buf_t bp) {
332
333        SET(bp->b_flags, B_INVAL);
334}
335
336void
337buf_markdelayed(buf_t bp) {
338
339	if (!ISSET(bp->b_flags, B_DELWRI)) {
340		SET(bp->b_flags, B_DELWRI);
341
342		OSAddAtomicLong(1, &nbdwrite);
343		buf_reassign(bp, bp->b_vp);
344	}
345        SET(bp->b_flags, B_DONE);
346}
347
348void
349buf_markclean(buf_t bp) {
350
351	if (ISSET(bp->b_flags, B_DELWRI)) {
352		CLR(bp->b_flags, B_DELWRI);
353
354		OSAddAtomicLong(-1, &nbdwrite);
355		buf_reassign(bp, bp->b_vp);
356	}
357}
358
359void
360buf_markeintr(buf_t bp) {
361
362        SET(bp->b_flags, B_EINTR);
363}
364
365
366void
367buf_markaged(buf_t bp) {
368
369        SET(bp->b_flags, B_AGE);
370}
371
372int
373buf_fua(buf_t bp) {
374
375        if ((bp->b_flags & B_FUA) == B_FUA)
376	        return 1;
377	return 0;
378}
379
380void
381buf_markfua(buf_t bp) {
382
383        SET(bp->b_flags, B_FUA);
384}
385
386#if CONFIG_PROTECT
387void
388buf_setcpaddr(buf_t bp, struct cprotect *entry) {
389	bp->b_attr.ba_cpentry = entry;
390}
391
392void
393buf_setcpoff (buf_t bp, uint64_t foffset) {
394	bp->b_attr.ba_cp_file_off = foffset;
395}
396
397void *
398bufattr_cpaddr(bufattr_t bap) {
399	return (bap->ba_cpentry);
400}
401
402uint64_t
403bufattr_cpoff(bufattr_t bap) {
404	return (bap->ba_cp_file_off);
405}
406
407void
408bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) {
409        bap->ba_cpentry = cp_entry_addr;
410}
411
412void
413bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
414        bap->ba_cp_file_off = foffset;
415}
416
417#else
418void *
419bufattr_cpaddr(bufattr_t bap __unused) {
420        return NULL;
421}
422
423uint64_t
424bufattr_cpoff(bufattr_t bap __unused) {
425	return 0;
426}
427
428void
429bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) {
430}
431
432void
433bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
434	return;
435}
436#endif /* CONFIG_PROTECT */
437
438bufattr_t
439bufattr_alloc() {
440	bufattr_t bap;
441	MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
442	if (bap == NULL)
443		return NULL;
444
445	bzero(bap, sizeof(struct bufattr));
446	return bap;
447}
448
449void
450bufattr_free(bufattr_t bap) {
451	if (bap)
452		FREE(bap, M_TEMP);
453}
454
455int
456bufattr_rawencrypted(bufattr_t bap) {
457	if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
458		return 1;
459	return 0;
460}
461
462int
463bufattr_throttled(bufattr_t bap) {
464	if ( (bap->ba_flags & BA_THROTTLED_IO) )
465		return 1;
466	return 0;
467}
468
469int
470bufattr_nocache(bufattr_t bap) {
471	if ( (bap->ba_flags & BA_NOCACHE) )
472		return 1;
473	return 0;
474}
475
476int
477bufattr_meta(bufattr_t bap) {
478	if ( (bap->ba_flags & BA_META) )
479		return 1;
480	return 0;
481}
482
483int
484#if !CONFIG_EMBEDDED
485bufattr_delayidlesleep(bufattr_t bap)
486#else /* !CONFIG_EMBEDDED */
487bufattr_delayidlesleep(__unused bufattr_t bap)
488#endif /* !CONFIG_EMBEDDED */
489{
490#if !CONFIG_EMBEDDED
491	if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
492		return 1;
493#endif /* !CONFIG_EMBEDDED */
494	return 0;
495}
496
497bufattr_t
498buf_attr(buf_t bp) {
499	return &bp->b_attr;
500}
501
502void
503buf_markstatic(buf_t bp __unused) {
504	SET(bp->b_flags, B_STATICCONTENT);
505}
506
507int
508buf_static(buf_t bp) {
509    if ( (bp->b_flags & B_STATICCONTENT) )
510        return 1;
511    return 0;
512}
513
514errno_t
515buf_error(buf_t bp) {
516
517        return (bp->b_error);
518}
519
520void
521buf_seterror(buf_t bp, errno_t error) {
522
523        if ((bp->b_error = error))
524	        SET(bp->b_flags, B_ERROR);
525	else
526	        CLR(bp->b_flags, B_ERROR);
527}
528
529void
530buf_setflags(buf_t bp, int32_t flags) {
531
532        SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
533}
534
535void
536buf_clearflags(buf_t bp, int32_t flags) {
537
538        CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
539}
540
541int32_t
542buf_flags(buf_t bp) {
543
544        return ((bp->b_flags & BUF_X_RDFLAGS));
545}
546
547void
548buf_reset(buf_t bp, int32_t io_flags) {
549
550        CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
551	SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
552
553	bp->b_error = 0;
554}
555
556uint32_t
557buf_count(buf_t bp) {
558
559        return (bp->b_bcount);
560}
561
562void
563buf_setcount(buf_t bp, uint32_t bcount) {
564
565        bp->b_bcount = bcount;
566}
567
568uint32_t
569buf_size(buf_t bp) {
570
571        return (bp->b_bufsize);
572}
573
574void
575buf_setsize(buf_t bp, uint32_t bufsize) {
576
577        bp->b_bufsize = bufsize;
578}
579
580uint32_t
581buf_resid(buf_t bp) {
582
583        return (bp->b_resid);
584}
585
586void
587buf_setresid(buf_t bp, uint32_t resid) {
588
589        bp->b_resid = resid;
590}
591
592uint32_t
593buf_dirtyoff(buf_t bp) {
594
595        return (bp->b_dirtyoff);
596}
597
598uint32_t
599buf_dirtyend(buf_t bp) {
600
601        return (bp->b_dirtyend);
602}
603
604void
605buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
606
607        bp->b_dirtyoff = dirtyoff;
608}
609
610void
611buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
612
613        bp->b_dirtyend = dirtyend;
614}
615
616uintptr_t
617buf_dataptr(buf_t bp) {
618
619        return (bp->b_datap);
620}
621
622void
623buf_setdataptr(buf_t bp, uintptr_t data) {
624
625        bp->b_datap = data;
626}
627
628vnode_t
629buf_vnode(buf_t bp) {
630
631        return (bp->b_vp);
632}
633
634void
635buf_setvnode(buf_t bp, vnode_t vp) {
636
637        bp->b_vp = vp;
638}
639
640
641void *
642buf_callback(buf_t bp)
643{
644        if ( !(bp->b_flags & B_CALL) )
645	        return ((void *) NULL);
646
647	return ((void *)bp->b_iodone);
648}
649
650
651errno_t
652buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
653{
654	if (callback)
655	        bp->b_flags |= (B_CALL | B_ASYNC);
656	else
657	        bp->b_flags &= ~B_CALL;
658	bp->b_transaction = transaction;
659	bp->b_iodone = callback;
660
661	return (0);
662}
663
664errno_t
665buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
666{
667
668        if ( !(bp->b_lflags & BL_IOBUF) )
669	        return (EINVAL);
670
671	if (upl)
672	        bp->b_flags |= B_CLUSTER;
673	else
674	        bp->b_flags &= ~B_CLUSTER;
675	bp->b_upl = upl;
676	bp->b_uploffset = offset;
677
678	return (0);
679}
680
681buf_t
682buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
683{
684        buf_t	io_bp;
685
686	if (io_offset < 0 || io_size < 0)
687	        return (NULL);
688
689	if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
690	        return (NULL);
691
692	if (bp->b_flags & B_CLUSTER) {
693	        if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
694		        return (NULL);
695
696	        if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
697		        return (NULL);
698	}
699	io_bp = alloc_io_buf(bp->b_vp, 0);
700
701	io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
702
703	if (iodone) {
704	        io_bp->b_transaction = arg;
705		io_bp->b_iodone = iodone;
706		io_bp->b_flags |= B_CALL;
707	}
708	if (bp->b_flags & B_CLUSTER) {
709	        io_bp->b_upl = bp->b_upl;
710		io_bp->b_uploffset = bp->b_uploffset + io_offset;
711	} else {
712	        io_bp->b_datap  = (uintptr_t)(((char *)bp->b_datap) + io_offset);
713	}
714	io_bp->b_bcount = io_size;
715
716	return (io_bp);
717}
718
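/*
 * Illustrative sketch (hypothetical caller, not part of the original source):
 * buf_clone() is typically used to carve a sub-range out of an existing
 * buffer so the pieces can be issued as separate I/Os.  For a B_CLUSTER
 * buffer the sub-range must start on a page boundary and must also end on
 * one unless it runs to the end of the buffer, as enforced above.
 *
 *	static void my_piece_done(buf_t cbp, void *arg);   // hypothetical completion
 *
 *	buf_t cbp = buf_clone(bp, 0, piece_size, my_piece_done, bp);
 *	if (cbp != NULL)
 *		// hand the piece to the layer below, e.g. VNOP_STRATEGY(cbp)
 *		// or a driver-specific dispatch
 *
 * 'piece_size' and 'my_piece_done' are made up for this example; the only
 * requirements taken from the code above are that io_offset/io_size stay
 * within b_bcount and respect the page-alignment rules for clustered buffers.
 */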
719
720int
721buf_shadow(buf_t bp)
722{
723	if (bp->b_lflags & BL_SHADOW)
724		return 1;
725	return 0;
726}
727
728
729buf_t
730buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
731{
732	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
733}
734
735buf_t
736buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
737{
738	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
739}
740
741
742static buf_t
743buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
744{
745        buf_t	io_bp;
746
747	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
748
749	if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
750
751		KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
752		return (NULL);
753	}
754#ifdef BUF_MAKE_PRIVATE
755	if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
756		panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
757#endif
758	io_bp = alloc_io_buf(bp->b_vp, priv);
759
760	io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
761	io_bp->b_blkno = bp->b_blkno;
762	io_bp->b_lblkno = bp->b_lblkno;
763
764	if (iodone) {
765	        io_bp->b_transaction = arg;
766		io_bp->b_iodone = iodone;
767		io_bp->b_flags |= B_CALL;
768	}
769	if (force_copy == FALSE) {
770		io_bp->b_bcount = bp->b_bcount;
771		io_bp->b_bufsize = bp->b_bufsize;
772
773		if (external_storage) {
774			io_bp->b_datap = external_storage;
775#ifdef BUF_MAKE_PRIVATE
776			io_bp->b_data_store = NULL;
777#endif
778		} else {
779			io_bp->b_datap = bp->b_datap;
780#ifdef BUF_MAKE_PRIVATE
781			io_bp->b_data_store = bp;
782#endif
783		}
784		*(buf_t *)(&io_bp->b_orig) = bp;
785
786		lck_mtx_lock_spin(buf_mtxp);
787
788		io_bp->b_lflags |= BL_SHADOW;
789		io_bp->b_shadow = bp->b_shadow;
790		bp->b_shadow = io_bp;
791		bp->b_shadow_ref++;
792
793#ifdef BUF_MAKE_PRIVATE
794		if (external_storage)
795			io_bp->b_lflags |= BL_EXTERNAL;
796		else
797			bp->b_data_ref++;
798#endif
799		lck_mtx_unlock(buf_mtxp);
800	} else {
801		if (external_storage) {
802#ifdef BUF_MAKE_PRIVATE
803			io_bp->b_lflags |= BL_EXTERNAL;
804#endif
805			io_bp->b_bcount = bp->b_bcount;
806			io_bp->b_bufsize = bp->b_bufsize;
807			io_bp->b_datap = external_storage;
808		} else {
809			allocbuf(io_bp, bp->b_bcount);
810
811			io_bp->b_lflags |= BL_IOBUF_ALLOC;
812		}
813		bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
814
815#ifdef BUF_MAKE_PRIVATE
816		io_bp->b_data_store = NULL;
817#endif
818	}
819	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
820
821	return (io_bp);
822}
823
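/*
 * Illustrative sketch (assumed usage, not taken from this file): a caller
 * that needs a stable view of a dirty B_META buffer -- a journaling layer,
 * for example -- can shadow it and be notified when the shadow's I/O
 * completes, while the original buffer remains available for further updates:
 *
 *	static void my_shadow_done(buf_t sbp, void *txn);   // hypothetical
 *
 *	buf_t sbp = buf_create_shadow(bp, TRUE, (uintptr_t)0, my_shadow_done, txn);
 *	if (sbp == NULL) {
 *		// bp wasn't a B_META buffer (or was an iobuf); see the check
 *		// at the top of buf_create_shadow_internal()
 *	}
 *
 * With force_copy == FALSE the shadow shares b_datap with 'bp' and bumps
 * b_shadow_ref, so 'bp' stays off the free lists while shadows still
 * reference it (see bremfree_locked() and buf_brelse_shadow()).
 */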
824
825#ifdef BUF_MAKE_PRIVATE
826errno_t
827buf_make_private(buf_t bp)
828{
829	buf_t	ds_bp;
830	buf_t	t_bp;
831	struct buf my_buf;
832
833	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
834
835	if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
836
837		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
838		return (EINVAL);
839	}
840	my_buf.b_flags = B_META;
841	my_buf.b_datap = (uintptr_t)NULL;
842	allocbuf(&my_buf, bp->b_bcount);
843
844	bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
845
846	lck_mtx_lock_spin(buf_mtxp);
847
848	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
849		if ( !ISSET(bp->b_lflags, BL_EXTERNAL))
850			break;
851	}
852	ds_bp = t_bp;
853
854	if (ds_bp == NULL && bp->b_data_ref)
855		panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
856
857	if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
858		panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
859
860	if (ds_bp == NULL) {
861		lck_mtx_unlock(buf_mtxp);
862
863		buf_free_meta_store(&my_buf);
864
865		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
866		return (EINVAL);
867	}
868	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
869		if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
870			t_bp->b_data_store = ds_bp;
871	}
872	ds_bp->b_data_ref = bp->b_data_ref;
873
874	bp->b_data_ref = 0;
875	bp->b_datap = my_buf.b_datap;
876
877	lck_mtx_unlock(buf_mtxp);
878
879	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
880	return (0);
881}
882#endif
883
884
885void
886buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
887			  void (**old_iodone)(buf_t, void *), void **old_transaction)
888{
889	if (old_iodone)
890		*old_iodone = bp->b_iodone;
891	if (old_transaction)
892		*old_transaction = bp->b_transaction;
893
894	bp->b_transaction = transaction;
895	bp->b_iodone = filter;
896	if (filter)
897	        bp->b_flags |= B_FILTER;
898	else
899	        bp->b_flags &= ~B_FILTER;
900}
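
/*
 * Illustrative sketch (assumed usage, not from this file): buf_setfilter()
 * hands back the previously installed iodone/transaction pair precisely so a
 * caller can interpose on I/O completion and later give control back to the
 * original callback.  A hypothetical interposer:
 *
 *	void	(*prev_iodone)(buf_t, void *);
 *	void	*prev_arg;
 *
 *	buf_setfilter(bp, my_filter, my_ctx, &prev_iodone, &prev_arg);
 *
 * where my_filter() would typically record what it needs from the completed
 * buffer and then re-install (or directly invoke) prev_iodone/prev_arg so
 * that the original completion handling still happens.  'my_filter' and
 * 'my_ctx' are assumptions made up for this example.
 */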
901
902
903daddr64_t
904buf_blkno(buf_t bp) {
905
906        return (bp->b_blkno);
907}
908
909daddr64_t
910buf_lblkno(buf_t bp) {
911
912        return (bp->b_lblkno);
913}
914
915void
916buf_setblkno(buf_t bp, daddr64_t blkno) {
917
918        bp->b_blkno = blkno;
919}
920
921void
922buf_setlblkno(buf_t bp, daddr64_t lblkno) {
923
924        bp->b_lblkno = lblkno;
925}
926
927dev_t
928buf_device(buf_t bp) {
929
930        return (bp->b_dev);
931}
932
933errno_t
934buf_setdevice(buf_t bp, vnode_t vp) {
935
936        if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
937	        return EINVAL;
938	bp->b_dev = vp->v_rdev;
939
940	return 0;
941}
942
943
944void *
945buf_drvdata(buf_t bp) {
946
947        return (bp->b_drvdata);
948}
949
950void
951buf_setdrvdata(buf_t bp, void *drvdata) {
952
953        bp->b_drvdata = drvdata;
954}
955
956void *
957buf_fsprivate(buf_t bp) {
958
959        return (bp->b_fsprivate);
960}
961
962void
963buf_setfsprivate(buf_t bp, void *fsprivate) {
964
965        bp->b_fsprivate = fsprivate;
966}
967
968kauth_cred_t
969buf_rcred(buf_t bp) {
970
971        return (bp->b_rcred);
972}
973
974kauth_cred_t
975buf_wcred(buf_t bp) {
976
977        return (bp->b_wcred);
978}
979
980void *
981buf_upl(buf_t bp) {
982
983        return (bp->b_upl);
984}
985
986uint32_t
987buf_uploffset(buf_t bp) {
988
989        return ((uint32_t)(bp->b_uploffset));
990}
991
992proc_t
993buf_proc(buf_t bp) {
994
995        return (bp->b_proc);
996}
997
998
999errno_t
1000buf_map(buf_t bp, caddr_t *io_addr)
1001{
1002        buf_t		real_bp;
1003        vm_offset_t	vaddr;
1004        kern_return_t	kret;
1005
1006        if ( !(bp->b_flags & B_CLUSTER)) {
1007	        *io_addr = (caddr_t)bp->b_datap;
1008		return (0);
1009	}
1010	real_bp = (buf_t)(bp->b_real_bp);
1011
1012	if (real_bp && real_bp->b_datap) {
1013	        /*
		 * b_real_bp is only valid if B_CLUSTER is SET
		 * if it's non-zero, then someone did a cluster_bp call
		 * if the backing physical pages were already mapped
		 * in before the call to cluster_bp (non-zero b_datap),
		 * then we just use that mapping
1019		 */
1020	        *io_addr = (caddr_t)real_bp->b_datap;
1021		return (0);
1022	}
1023	kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */
1024
1025	if (kret != KERN_SUCCESS) {
1026	        *io_addr = NULL;
1027
1028	        return(ENOMEM);
1029	}
1030	vaddr += bp->b_uploffset;
1031
1032	*io_addr = (caddr_t)vaddr;
1033
1034	return (0);
1035}
1036
1037errno_t
1038buf_unmap(buf_t bp)
1039{
1040        buf_t		real_bp;
1041        kern_return_t	kret;
1042
1043        if ( !(bp->b_flags & B_CLUSTER))
1044	        return (0);
1045	/*
1046	 * see buf_map for the explanation
1047	 */
1048	real_bp = (buf_t)(bp->b_real_bp);
1049
1050	if (real_bp && real_bp->b_datap)
1051	        return (0);
1052
1053	if ((bp->b_lflags & BL_IOBUF) &&
1054	    ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1055	        /*
1056		 * ignore pageins... the 'right' thing will
1057		 * happen due to the way we handle speculative
1058		 * clusters...
1059		 *
1060		 * when we commit these pages, we'll hit
1061		 * it with UPL_COMMIT_INACTIVE which
1062		 * will clear the reference bit that got
1063		 * turned on when we touched the mapping
1064		 */
1065	        bp->b_flags |= B_AGE;
1066	}
1067	kret = ubc_upl_unmap(bp->b_upl);
1068
1069	if (kret != KERN_SUCCESS)
1070	        return (EINVAL);
1071	return (0);
1072}
1073
1074
1075void
1076buf_clear(buf_t bp) {
1077        caddr_t baddr;
1078
1079        if (buf_map(bp, &baddr) == 0) {
1080	        bzero(baddr, bp->b_bcount);
1081		buf_unmap(bp);
1082	}
1083	bp->b_resid = 0;
1084}
1085
1086/*
1087 * Read or write a buffer that is not contiguous on disk.
 * The buffer is marked done/error at the conclusion.
1089 */
1090static int
1091buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1092{
1093	vnode_t	vp = buf_vnode(bp);
1094	buf_t	io_bp;			 /* For reading or writing a single block */
1095	int	io_direction;
1096	int	io_resid;
1097	size_t	io_contig_bytes;
1098        daddr64_t io_blkno;
1099	int	error = 0;
1100	int	bmap_flags;
1101
1102	/*
1103	 * save our starting point... the bp was already mapped
1104	 * in buf_strategy before we got called
1105	 * no sense doing it again.
1106	 */
1107	io_blkno = bp->b_blkno;
1108	/*
1109	 * Make sure we redo this mapping for the next I/O
1110	 * i.e. this can never be a 'permanent' mapping
1111	 */
1112	bp->b_blkno = bp->b_lblkno;
1113
1114	/*
1115	 * Get an io buffer to do the deblocking
1116	 */
1117	io_bp = alloc_io_buf(devvp, 0);
1118
1119	io_bp->b_lblkno = bp->b_lblkno;
1120	io_bp->b_datap  = bp->b_datap;
1121	io_resid	= bp->b_bcount;
1122        io_direction	= bp->b_flags & B_READ;
1123	io_contig_bytes = contig_bytes;
1124
1125	if (bp->b_flags & B_READ)
1126	        bmap_flags = VNODE_READ;
1127	else
1128	        bmap_flags = VNODE_WRITE;
1129
1130	for (;;) {
1131		if (io_blkno == -1)
1132		        /*
			 * this is unexpected, but we'll allow for it
1134			 */
1135		        bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1136		else {
1137		        io_bp->b_bcount	 = io_contig_bytes;
1138			io_bp->b_bufsize = io_contig_bytes;
1139			io_bp->b_resid   = io_contig_bytes;
1140			io_bp->b_blkno   = io_blkno;
1141
1142			buf_reset(io_bp, io_direction);
1143
1144			/*
			 * Call the device to do the I/O and wait for it.  Make sure the appropriate party is charged for the write.
1146			 */
1147
1148			if (!ISSET(bp->b_flags, B_READ))
1149			        OSAddAtomic(1, &devvp->v_numoutput);
1150
1151			if ((error = VNOP_STRATEGY(io_bp)))
1152			        break;
1153			if ((error = (int)buf_biowait(io_bp)))
1154			        break;
1155			if (io_bp->b_resid) {
1156			        io_resid -= (io_contig_bytes - io_bp->b_resid);
1157				break;
1158			}
1159		}
1160		if ((io_resid -= io_contig_bytes) == 0)
1161		        break;
1162		f_offset       += io_contig_bytes;
1163		io_bp->b_datap += io_contig_bytes;
1164
1165		/*
1166		 * Map the current position to a physical block number
1167		 */
1168		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
1169		        break;
1170	}
1171	buf_free(io_bp);
1172
1173	if (error)
1174	        buf_seterror(bp, error);
1175	bp->b_resid = io_resid;
1176	/*
1177	 * This I/O is now complete
1178	 */
1179	buf_biodone(bp);
1180
1181	return error;
1182}
1183
1184
1185/*
1186 * struct vnop_strategy_args {
1187 *      struct buf *a_bp;
1188 * } *ap;
1189 */
1190errno_t
1191buf_strategy(vnode_t devvp, void *ap)
1192{
1193        buf_t	bp = ((struct vnop_strategy_args *)ap)->a_bp;
1194	vnode_t	vp = bp->b_vp;
1195	int	bmap_flags;
1196        errno_t error;
1197#if CONFIG_DTRACE
1198	int dtrace_io_start_flag = 0;	 /* We only want to trip the io:::start
					  * probe once, with the true physical
1200					  * block in place (b_blkno)
1201					  */
1202
1203#endif
1204
1205	if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
1206	        panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1207	/*
	 * associate the physical device with
	 * this buf_t even if we don't
1210	 * end up issuing the I/O...
1211	 */
1212	bp->b_dev = devvp->v_rdev;
1213
1214	if (bp->b_flags & B_READ)
1215	        bmap_flags = VNODE_READ;
1216	else
1217	        bmap_flags = VNODE_WRITE;
1218
1219        if ( !(bp->b_flags & B_CLUSTER)) {
1220
1221	        if ( (bp->b_upl) ) {
1222		        /*
1223			 * we have a UPL associated with this bp
1224			 * go through cluster_bp which knows how
1225			 * to deal with filesystem block sizes
1226			 * that aren't equal to the page size
1227			 */
1228			DTRACE_IO1(start, buf_t, bp);
1229		        return (cluster_bp(bp));
1230		}
1231		if (bp->b_blkno == bp->b_lblkno) {
1232		    off_t	f_offset;
1233			size_t 	contig_bytes;
1234
1235			if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1236				DTRACE_IO1(start, buf_t, bp);
1237			        buf_seterror(bp, error);
1238				buf_biodone(bp);
1239
1240			    return (error);
1241			}
1242
1243		if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1244				DTRACE_IO1(start, buf_t, bp);
1245			        buf_seterror(bp, error);
1246				buf_biodone(bp);
1247
1248			        return (error);
1249			}
1250
1251			DTRACE_IO1(start, buf_t, bp);
1252#if CONFIG_DTRACE
1253			dtrace_io_start_flag = 1;
1254#endif /* CONFIG_DTRACE */
1255
1256			if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1257				/* Set block number to force biodone later */
1258				bp->b_blkno = -1;
1259			        buf_clear(bp);
1260			}
1261			else if ((long)contig_bytes < bp->b_bcount) {
1262			        return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
1263			}
1264		}
1265
1266#if CONFIG_DTRACE
1267		if (dtrace_io_start_flag == 0) {
1268			DTRACE_IO1(start, buf_t, bp);
1269			dtrace_io_start_flag = 1;
1270		}
1271#endif /* CONFIG_DTRACE */
1272
1273		if (bp->b_blkno == -1) {
1274		        buf_biodone(bp);
1275			return (0);
1276		}
1277	}
1278
1279#if CONFIG_DTRACE
1280	if (dtrace_io_start_flag == 0)
1281		DTRACE_IO1(start, buf_t, bp);
1282#endif /* CONFIG_DTRACE */
1283
1284#if CONFIG_PROTECT
1285	/* Capture f_offset in the bufattr*/
1286	if (bp->b_attr.ba_cpentry != 0) {
1287		/* No need to go here for older EAs */
1288		if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
1289			off_t f_offset;
1290			if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
1291				return error;
1292
1293			/*
1294			 * Attach the file offset to this buffer.  The
1295			 * bufattr attributes will be passed down the stack
1296			 * until they reach IOFlashStorage.  IOFlashStorage
1297			 * will retain the offset in a local variable when it
1298			 * issues its I/Os to the NAND controller.
1299			 *
1300			 * Note that LwVM may end up splitting this I/O
1301			 * into sub-I/Os if it crosses a chunk boundary.  In this
1302			 * case, LwVM will update this field when it dispatches
1303			 * each I/O to IOFlashStorage.  But from our perspective
1304			 * we have only issued a single I/O.
1305			 */
1306			bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset);
1307		}
1308	}
1309#endif
1310
1311	/*
1312	 * we can issue the I/O because...
1313	 * either B_CLUSTER is set which
1314	 * means that the I/O is properly set
1315	 * up to be a multiple of the page size, or
1316	 * we were able to successfully set up the
	 * physical block mapping
1318	 */
1319	return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
1320}
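
/*
 * Illustrative sketch (hypothetical filesystem code, not part of this file):
 * a typical vnop_strategy implementation simply forwards to buf_strategy(),
 * passing the vnode of the underlying block device so that the block mapping
 * and device dispatch above can do the rest:
 *
 *	static int
 *	myfs_vnop_strategy(struct vnop_strategy_args *ap)
 *	{
 *		buf_t	bp = ap->a_bp;
 *		vnode_t	devvp = MYFS_TO_DEVVP(buf_vnode(bp));	// hypothetical accessor
 *
 *		return (buf_strategy(devvp, ap));
 *	}
 */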
1321
1322
1323
1324buf_t
1325buf_alloc(vnode_t vp)
1326{
1327        return(alloc_io_buf(vp, 0));
1328}
1329
1330void
1331buf_free(buf_t bp) {
1332
1333        free_io_buf(bp);
1334}
1335
1336
1337/*
1338 * iterate buffers for the specified vp.
1339 *   if BUF_SCAN_DIRTY is set, do the dirty list
1340 *   if BUF_SCAN_CLEAN is set, do the clean list
1341 *   if neither flag is set, default to BUF_SCAN_DIRTY
1342 *   if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
1343 */
1344
1345struct buf_iterate_info_t {
1346	int flag;
1347	struct buflists *listhead;
1348};
1349
1350void
1351buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1352{
1353	buf_t 	bp;
1354	int	retval;
1355	struct	buflists local_iterblkhd;
1356	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
1357	int	notify_busy = flags & BUF_NOTIFY_BUSY;
1358	struct buf_iterate_info_t list[2];
1359	int	num_lists, i;
1360
1361	if (flags & BUF_SKIP_LOCKED)
1362	        lock_flags |= BAC_SKIP_LOCKED;
1363	if (flags & BUF_SKIP_NONLOCKED)
1364	        lock_flags |= BAC_SKIP_NONLOCKED;
1365
1366	if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
1367	        flags |= BUF_SCAN_DIRTY;
1368
1369	num_lists = 0;
1370
1371	if (flags & BUF_SCAN_DIRTY) {
1372	        list[num_lists].flag = VBI_DIRTY;
1373		list[num_lists].listhead = &vp->v_dirtyblkhd;
1374		num_lists++;
1375	}
1376	if (flags & BUF_SCAN_CLEAN) {
1377		list[num_lists].flag = VBI_CLEAN;
1378		list[num_lists].listhead = &vp->v_cleanblkhd;
1379		num_lists++;
1380	}
1381
1382	for (i = 0; i < num_lists; i++) {
1383		lck_mtx_lock(buf_mtxp);
1384
1385		if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag))  {
1386			lck_mtx_unlock(buf_mtxp);
1387			continue;
1388		}
1389		while (!LIST_EMPTY(&local_iterblkhd)) {
1390			bp = LIST_FIRST(&local_iterblkhd);
1391			LIST_REMOVE(bp, b_vnbufs);
1392			LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1393
1394			if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1395				if (notify_busy) {
1396					bp = NULL;
1397				} else {
1398					continue;
1399				}
1400			}
1401
1402			lck_mtx_unlock(buf_mtxp);
1403
1404			retval = callout(bp, arg);
1405
1406			switch (retval) {
1407			case BUF_RETURNED:
1408				if (bp)
1409					buf_brelse(bp);
1410				break;
1411			case BUF_CLAIMED:
1412				break;
1413			case BUF_RETURNED_DONE:
1414				if (bp)
1415					buf_brelse(bp);
1416				lck_mtx_lock(buf_mtxp);
1417				goto out;
1418			case BUF_CLAIMED_DONE:
1419				lck_mtx_lock(buf_mtxp);
1420				goto out;
1421			}
1422			lck_mtx_lock(buf_mtxp);
1423		} /* while list has more nodes */
1424	  out:
1425		buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1426		lck_mtx_unlock(buf_mtxp);
1427	} /* for each list */
1428} /* buf_iterate */
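
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a callout
 * passed to buf_iterate() reports back with BUF_RETURNED / BUF_CLAIMED (or
 * their _DONE variants) to tell buf_iterate() who is responsible for
 * releasing the buffer:
 *
 *	static int
 *	myfs_flush_callout(buf_t bp, void *arg)		// hypothetical helper
 *	{
 *		if (bp == NULL)
 *			return (BUF_CLAIMED);	// busy buffer reported via BUF_NOTIFY_BUSY
 *
 *		if (buf_flags(bp) & B_DELWRI) {
 *			buf_bawrite(bp);	// we disposed of it ourselves
 *			return (BUF_CLAIMED);
 *		}
 *		return (BUF_RETURNED);		// let buf_iterate() buf_brelse() it
 *	}
 *
 *	buf_iterate(vp, myfs_flush_callout, BUF_SCAN_DIRTY | BUF_SKIP_LOCKED, NULL);
 */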
1429
1430
1431/*
1432 * Flush out and invalidate all buffers associated with a vnode.
1433 */
1434int
1435buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1436{
1437	buf_t	bp;
1438	int	aflags;
1439	int	error = 0;
1440	int	must_rescan = 1;
1441	struct	buflists local_iterblkhd;
1442
1443
1444	if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1445		return (0);
1446
1447	lck_mtx_lock(buf_mtxp);
1448
1449	for (;;) {
1450		if (must_rescan == 0)
1451		        /*
1452			 * the lists may not be empty, but all that's left at this
1453			 * point are metadata or B_LOCKED buffers which are being
1454			 * skipped... we know this because we made it through both
1455			 * the clean and dirty lists without dropping buf_mtxp...
1456			 * each time we drop buf_mtxp we bump "must_rescan"
1457			 */
1458		        break;
1459		if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1460		        break;
1461		must_rescan = 0;
1462		/*
1463		 * iterate the clean list
1464		 */
1465		if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1466		        goto try_dirty_list;
1467		}
1468		while (!LIST_EMPTY(&local_iterblkhd)) {
1469
1470			bp = LIST_FIRST(&local_iterblkhd);
1471
1472			LIST_REMOVE(bp, b_vnbufs);
1473			LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1474
1475			/*
1476			 * some filesystems distinguish meta data blocks with a negative logical block #
1477			 */
1478			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1479				continue;
1480
1481			aflags = BAC_REMOVE;
1482
1483			if ( !(flags & BUF_INVALIDATE_LOCKED) )
1484				aflags |= BAC_SKIP_LOCKED;
1485
1486			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1487			        if (error == EDEADLK)
1488				        /*
1489					 * this buffer was marked B_LOCKED...
					 * we didn't drop buf_mtxp, so we
					 * don't need to rescan
1492					 */
1493				        continue;
1494			        if (error == EAGAIN) {
1495				        /*
1496					 * found a busy buffer... we blocked and
1497					 * dropped buf_mtxp, so we're going to
1498					 * need to rescan after this pass is completed
1499					 */
1500				        must_rescan++;
1501				        continue;
1502				}
1503				/*
1504				 * got some kind of 'real' error out of the msleep
1505				 * in buf_acquire_locked, terminate the scan and return the error
1506				 */
1507				buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1508
1509				lck_mtx_unlock(buf_mtxp);
1510				return (error);
1511			}
1512			lck_mtx_unlock(buf_mtxp);
1513
1514			if (bp->b_flags & B_LOCKED)
1515				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1516
1517			CLR(bp->b_flags, B_LOCKED);
1518			SET(bp->b_flags, B_INVAL);
1519			buf_brelse(bp);
1520
1521			lck_mtx_lock(buf_mtxp);
1522
1523			/*
1524			 * by dropping buf_mtxp, we allow new
1525			 * buffers to be added to the vnode list(s)
1526			 * we'll have to rescan at least once more
1527			 * if the queues aren't empty
1528			 */
1529			must_rescan++;
1530		}
1531		buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1532
1533try_dirty_list:
1534		/*
1535		 * Now iterate on dirty blks
1536		 */
1537		if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1538			continue;
1539		}
1540		while (!LIST_EMPTY(&local_iterblkhd)) {
1541			bp = LIST_FIRST(&local_iterblkhd);
1542
1543			LIST_REMOVE(bp, b_vnbufs);
1544			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1545
1546			/*
1547			 * some filesystems distinguish meta data blocks with a negative logical block #
1548			 */
1549			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1550				continue;
1551
1552			aflags = BAC_REMOVE;
1553
1554			if ( !(flags & BUF_INVALIDATE_LOCKED) )
1555				aflags |= BAC_SKIP_LOCKED;
1556
1557			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1558			        if (error == EDEADLK)
1559				        /*
1560					 * this buffer was marked B_LOCKED...
					 * we didn't drop buf_mtxp, so we
					 * don't need to rescan
1563					 */
1564				        continue;
1565			        if (error == EAGAIN) {
1566				        /*
1567					 * found a busy buffer... we blocked and
1568					 * dropped buf_mtxp, so we're going to
1569					 * need to rescan after this pass is completed
1570					 */
1571				        must_rescan++;
1572				        continue;
1573				}
1574				/*
1575				 * got some kind of 'real' error out of the msleep
1576				 * in buf_acquire_locked, terminate the scan and return the error
1577				 */
1578				buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1579
1580				lck_mtx_unlock(buf_mtxp);
1581				return (error);
1582			}
1583			lck_mtx_unlock(buf_mtxp);
1584
1585			if (bp->b_flags & B_LOCKED)
1586				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1587
1588			CLR(bp->b_flags, B_LOCKED);
1589			SET(bp->b_flags, B_INVAL);
1590
1591			if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1592				(void) VNOP_BWRITE(bp);
1593			else
1594				buf_brelse(bp);
1595
1596			lck_mtx_lock(buf_mtxp);
1597			/*
1598			 * by dropping buf_mtxp, we allow new
1599			 * buffers to be added to the vnode list(s)
1600			 * we'll have to rescan at least once more
1601			 * if the queues aren't empty
1602			 */
1603			must_rescan++;
1604		}
1605		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1606	}
1607	lck_mtx_unlock(buf_mtxp);
1608
1609	return (0);
1610}
1611
1612void
1613buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
1614
1615	(void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1616	return;
1617}
1618
1619int
1620buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
1621	buf_t	bp;
1622	int	writes_issued = 0;
1623	errno_t	error;
1624	int	busy = 0;
1625	struct	buflists local_iterblkhd;
1626	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
1627	int any_locked = 0;
1628
1629	if (flags & BUF_SKIP_LOCKED)
1630	        lock_flags |= BAC_SKIP_LOCKED;
1631	if (flags & BUF_SKIP_NONLOCKED)
1632	        lock_flags |= BAC_SKIP_NONLOCKED;
1633loop:
1634	lck_mtx_lock(buf_mtxp);
1635
1636	if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
1637	        while (!LIST_EMPTY(&local_iterblkhd)) {
1638			bp = LIST_FIRST(&local_iterblkhd);
1639			LIST_REMOVE(bp, b_vnbufs);
1640			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1641
1642			if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1643				busy++;
1644			}
1645			if (error) {
1646				/*
1647				 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
				 * we may want to do something differently if a locked or unlocked
1649				 * buffer was encountered (depending on the arg specified).
1650				 * In this case, we know that one of those two was set, and the
1651				 * buf acquisition failed above.
1652				 *
1653				 * If it failed with EDEADLK, then save state which can be emitted
1654				 * later on to the caller.  Most callers should not care.
1655				 */
1656				if (error == EDEADLK) {
1657					any_locked++;
1658				}
1659				continue;
1660			}
1661			lck_mtx_unlock(buf_mtxp);
1662
1663			bp->b_flags &= ~B_LOCKED;
1664
1665			/*
1666			 * Wait for I/O associated with indirect blocks to complete,
1667			 * since there is no way to quickly wait for them below.
1668			 */
1669			if ((bp->b_vp == vp) || (wait == 0))
1670			        (void) buf_bawrite(bp);
1671			else
1672			        (void) VNOP_BWRITE(bp);
1673			writes_issued++;
1674
1675			lck_mtx_lock(buf_mtxp);
1676		}
1677		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1678	}
1679	lck_mtx_unlock(buf_mtxp);
1680
1681	if (wait) {
1682	        (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1683
1684		if (vp->v_dirtyblkhd.lh_first && busy) {
1685		        /*
1686			 * we had one or more BUSY buffers on
1687			 * the dirtyblock list... most likely
1688			 * these are due to delayed writes that
1689			 * were moved to the bclean queue but
1690			 * have not yet been 'written'.
1691			 * if we issued some writes on the
1692			 * previous pass, we try again immediately
1693			 * if we didn't, we'll sleep for some time
1694			 * to allow the state to change...
1695			 */
1696		        if (writes_issued == 0) {
1697			        (void)tsleep((caddr_t)&vp->v_numoutput,
1698					     PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1699			}
1700			writes_issued = 0;
1701			busy = 0;
1702
1703			goto loop;
1704		}
1705	}
1706
1707	return any_locked;
1708}
1709
1710
1711/*
1712 * called with buf_mtxp held...
1713 * this lock protects the queue manipulation
1714 */
1715static int
1716buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1717{
1718	struct buflists * listheadp;
1719
1720	if (flags & VBI_DIRTY)
1721		listheadp = &vp->v_dirtyblkhd;
1722	else
1723		listheadp = &vp->v_cleanblkhd;
1724
1725	while (vp->v_iterblkflags & VBI_ITER) 	{
1726	        vp->v_iterblkflags |= VBI_ITERWANT;
1727		msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1728	}
1729	if (LIST_EMPTY(listheadp)) {
1730	        LIST_INIT(iterheadp);
1731		return(EINVAL);
1732	}
1733	vp->v_iterblkflags |= VBI_ITER;
1734
1735	iterheadp->lh_first = listheadp->lh_first;
1736	listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1737	LIST_INIT(listheadp);
1738
1739	return(0);
1740}
1741
1742/*
1743 * called with buf_mtxp held...
1744 * this lock protects the queue manipulation
1745 */
1746static void
1747buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1748{
1749	struct buflists * listheadp;
1750	buf_t bp;
1751
1752	if (flags & VBI_DIRTY)
1753		listheadp = &vp->v_dirtyblkhd;
1754	else
1755		listheadp = &vp->v_cleanblkhd;
1756
1757	while (!LIST_EMPTY(iterheadp)) {
1758		bp = LIST_FIRST(iterheadp);
1759		LIST_REMOVE(bp, b_vnbufs);
1760		LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1761	}
1762	vp->v_iterblkflags &= ~VBI_ITER;
1763
1764	if  (vp->v_iterblkflags & VBI_ITERWANT) 	{
1765		vp->v_iterblkflags &= ~VBI_ITERWANT;
1766		wakeup(&vp->v_iterblkflags);
1767	}
1768}
1769
1770
1771static void
1772bremfree_locked(buf_t bp)
1773{
1774	struct bqueues *dp = NULL;
1775	int whichq;
1776
1777	whichq = bp->b_whichq;
1778
1779	if (whichq == -1) {
1780		if (bp->b_shadow_ref == 0)
1781			panic("bremfree_locked: %p not on freelist", bp);
1782		/*
1783		 * there are clones pointing to 'bp'...
1784		 * therefore, it was not put on a freelist
1785		 * when buf_brelse was last called on 'bp'
1786		 */
1787		return;
1788	}
1789	/*
1790	 * We only calculate the head of the freelist when removing
1791	 * the last element of the list as that is the only time that
1792	 * it is needed (e.g. to reset the tail pointer).
1793	 *
1794	 * NB: This makes an assumption about how tailq's are implemented.
1795	 */
1796	if (bp->b_freelist.tqe_next == NULL) {
1797	        dp = &bufqueues[whichq];
1798
1799		if (dp->tqh_last != &bp->b_freelist.tqe_next)
1800			panic("bremfree: lost tail");
1801	}
1802	TAILQ_REMOVE(dp, bp, b_freelist);
1803
1804#if BALANCE_QUEUES
1805	bufqdec(whichq);
1806#endif
1807	if (whichq == BQ_LAUNDRY)
1808	        blaundrycnt--;
1809
1810	bp->b_whichq = -1;
1811	bp->b_timestamp = 0;
1812	bp->b_shadow = 0;
1813}
1814
1815/*
1816 * Associate a buffer with a vnode.
1817 * buf_mtxp must be locked on entry
1818 */
1819static void
1820bgetvp_locked(vnode_t vp, buf_t bp)
1821{
1822
1823	if (bp->b_vp != vp)
1824		panic("bgetvp_locked: not free");
1825
1826	if (vp->v_type == VBLK || vp->v_type == VCHR)
1827		bp->b_dev = vp->v_rdev;
1828	else
1829		bp->b_dev = NODEV;
1830	/*
1831	 * Insert onto list for new vnode.
1832	 */
1833	bufinsvn(bp, &vp->v_cleanblkhd);
1834}
1835
1836/*
1837 * Disassociate a buffer from a vnode.
1838 * buf_mtxp must be locked on entry
1839 */
1840static void
1841brelvp_locked(buf_t bp)
1842{
1843	/*
1844	 * Delete from old vnode list, if on one.
1845	 */
1846	if (bp->b_vnbufs.le_next != NOLIST)
1847		bufremvn(bp);
1848
1849	bp->b_vp = (vnode_t)NULL;
1850}
1851
1852/*
1853 * Reassign a buffer from one vnode to another.
1854 * Used to assign file specific control information
1855 * (indirect blocks) to the vnode to which they belong.
1856 */
1857static void
1858buf_reassign(buf_t bp, vnode_t newvp)
1859{
1860	struct buflists *listheadp;
1861
1862	if (newvp == NULL) {
1863		printf("buf_reassign: NULL");
1864		return;
1865	}
1866	lck_mtx_lock_spin(buf_mtxp);
1867
1868	/*
1869	 * Delete from old vnode list, if on one.
1870	 */
1871	if (bp->b_vnbufs.le_next != NOLIST)
1872		bufremvn(bp);
1873	/*
1874	 * If dirty, put on list of dirty buffers;
1875	 * otherwise insert onto list of clean buffers.
1876	 */
1877	if (ISSET(bp->b_flags, B_DELWRI))
1878		listheadp = &newvp->v_dirtyblkhd;
1879	else
1880		listheadp = &newvp->v_cleanblkhd;
1881	bufinsvn(bp, listheadp);
1882
1883	lck_mtx_unlock(buf_mtxp);
1884}
1885
1886static __inline__ void
1887bufhdrinit(buf_t bp)
1888{
1889	bzero((char *)bp, sizeof *bp);
1890	bp->b_dev = NODEV;
1891	bp->b_rcred = NOCRED;
1892	bp->b_wcred = NOCRED;
1893	bp->b_vnbufs.le_next = NOLIST;
1894	bp->b_flags = B_INVAL;
1895
1896	return;
1897}
1898
1899/*
1900 * Initialize buffers and hash links for buffers.
1901 */
1902__private_extern__ void
1903bufinit(void)
1904{
1905	buf_t	bp;
1906	struct bqueues *dp;
1907	int	i;
1908
1909	nbuf_headers = 0;
1910	/* Initialize the buffer queues ('freelists') and the hash table */
1911	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1912		TAILQ_INIT(dp);
1913	bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1914
1915	buf_busycount = 0;
1916
1917	/* Initialize the buffer headers */
1918	for (i = 0; i < max_nbuf_headers; i++) {
1919		nbuf_headers++;
1920		bp = &buf_headers[i];
1921		bufhdrinit(bp);
1922
1923		BLISTNONE(bp);
1924		dp = &bufqueues[BQ_EMPTY];
1925		bp->b_whichq = BQ_EMPTY;
1926		bp->b_timestamp = buf_timestamp();
1927		binsheadfree(bp, dp, BQ_EMPTY);
1928		binshash(bp, &invalhash);
1929	}
1930	boot_nbuf_headers = nbuf_headers;
1931
1932	TAILQ_INIT(&iobufqueue);
1933	TAILQ_INIT(&delaybufqueue);
1934
1935	for (; i < nbuf_headers + niobuf_headers; i++) {
1936		bp = &buf_headers[i];
1937		bufhdrinit(bp);
1938		bp->b_whichq = -1;
1939		binsheadfree(bp, &iobufqueue, -1);
1940	}
1941
1942	/*
1943	 * allocate lock group attribute and group
1944	 */
1945	buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1946	buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1947
1948	/*
1949	 * allocate the lock attribute
1950	 */
1951	buf_mtx_attr = lck_attr_alloc_init();
1952
1953	/*
1954	 * allocate and initialize mutex's for the buffer and iobuffer pools
1955	 */
1956	buf_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1957	iobuffer_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1958
1959	if (iobuffer_mtxp == NULL)
1960	        panic("couldn't create iobuffer mutex");
1961
1962	if (buf_mtxp == NULL)
1963	        panic("couldn't create buf mutex");
1964
1965	/*
1966	 * allocate and initialize cluster specific global locks...
1967	 */
1968	cluster_init();
1969
1970	printf("using %d buffer headers and %d cluster IO buffer headers\n",
1971		nbuf_headers, niobuf_headers);
1972
1973	/* Set up zones used by the buffer cache */
1974	bufzoneinit();
1975
1976	/* start the bcleanbuf() thread */
1977	bcleanbuf_thread_init();
1978
1979#ifndef __arm__
1980	/* Register a callout for relieving vm pressure */
1981	if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
1982		panic("Couldn't register buffer cache callout for vm pressure!\n");
1983	}
1984#endif
1985
1986#if BALANCE_QUEUES
1987	{
1988	static void bufq_balance_thread_init(void) __attribute__((section("__TEXT, initcode")));
1989	/* create a thread to do dynamic buffer queue balancing */
1990	bufq_balance_thread_init();
1991	}
#endif /* BALANCE_QUEUES */
1993}
1994
1995
1996
1997/*
1998 * Zones for the meta data buffers
1999 */
2000
2001#define MINMETA 512
2002#define MAXMETA 8192
2003
2004struct meta_zone_entry {
2005	zone_t mz_zone;
2006	vm_size_t mz_size;
2007	vm_size_t mz_max;
2008	const char *mz_name;
2009};
2010
2011struct meta_zone_entry meta_zones[] = {
2012	{NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2013	{NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
2014	{NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
2015	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2016	{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
2017	{NULL, 0, 0, "" } /* End */
2018};
2019
2020/*
2021 * Initialize the meta data zones
2022 */
2023static void
2024bufzoneinit(void)
2025{
2026	int i;
2027
2028	for (i = 0; meta_zones[i].mz_size != 0; i++) {
2029		meta_zones[i].mz_zone =
2030				zinit(meta_zones[i].mz_size,
2031					meta_zones[i].mz_max,
2032					PAGE_SIZE,
2033					meta_zones[i].mz_name);
2034		zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2035	}
2036	buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2037	zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2038}
2039
2040static __inline__ zone_t
2041getbufzone(size_t size)
2042{
2043	int i;
2044
2045	if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
		panic("getbufzone: incorrect size = %lu", size);
2047
2048	for (i = 0; meta_zones[i].mz_size != 0; i++) {
2049		if (meta_zones[i].mz_size >= size)
2050			break;
2051	}
2052
2053	return (meta_zones[i].mz_zone);
2054}
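
/*
 * Worked example (illustrative): getbufzone() returns the first zone whose
 * element size can hold the request, so with the table above a 512-byte
 * request maps to "buf.512", a 1536-byte request maps to "buf.2048" (the
 * smallest zone >= 1536 that is present), and an 8192-byte request maps to
 * "buf.8192".  Sizes must be a multiple of 512 in the range
 * [MINMETA, MAXMETA]; anything else panics above.
 */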
2055
2056
2057
2058static struct buf *
2059bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2060{
2061	buf_t	bp;
2062
2063	bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2064
2065	/*
	 * If the buffer does not have valid data, start a read.
	 * Note that if the buffer is B_INVAL, buf_getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
2069	 */
2070	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2071		struct proc *p;
2072
2073		p = current_proc();
2074
2075		/* Start I/O for the buffer (keeping credentials). */
2076		SET(bp->b_flags, B_READ | async);
2077		if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2078			kauth_cred_ref(cred);
2079			bp->b_rcred = cred;
2080		}
2081
2082		VNOP_STRATEGY(bp);
2083
2084		trace(TR_BREADMISS, pack(vp, size), blkno);
2085
2086		/* Pay for the read. */
2087		if (p && p->p_stats)
2088			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock);		/* XXX */
2089
2090		if (async) {
2091		        /*
2092			 * since we asked for an ASYNC I/O
2093			 * the biodone will do the brelse
2094			 * we don't want to pass back a bp
2095			 * that we don't 'own'
2096			 */
2097		        bp = NULL;
2098		}
2099	} else if (async) {
2100		buf_brelse(bp);
2101		bp = NULL;
2102	}
2103
2104	trace(TR_BREADHIT, pack(vp, size), blkno);
2105
2106	return (bp);
2107}
2108
2109/*
2110 * Perform the reads for buf_breadn() and buf_meta_breadn().
2111 * Trivial modification to the breada algorithm presented in Bach (p.55).
2112 */
2113static errno_t
2114do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2115		   int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2116{
2117	buf_t	bp;
2118	int	i;
2119
2120	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2121
2122	/*
2123	 * For each of the read-ahead blocks, start a read, if necessary.
2124	 */
2125	for (i = 0; i < nrablks; i++) {
2126		/* If it's in the cache, just go on to next one. */
2127		if (incore(vp, rablks[i]))
2128			continue;
2129
2130		/* Get a buffer for the read-ahead block */
2131		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2132	}
2133
2134	/* Otherwise, we had to start a read for it; wait until it's valid. */
2135	return (buf_biowait(bp));
2136}
2137
2138
2139/*
2140 * Read a disk block.
 * This algorithm is described in Bach (p.54).
2142 */
2143errno_t
2144buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2145{
2146	buf_t	bp;
2147
2148	/* Get buffer for block. */
2149	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2150
2151	/* Wait for the read to complete, and return result. */
2152	return (buf_biowait(bp));
2153}
2154
2155/*
2156 * Read a disk block. [bread() for meta-data]
2157 * This algorithm is described in Bach (p. 54).
2158 */
2159errno_t
2160buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2161{
2162	buf_t	bp;
2163
2164	/* Get buffer for block. */
2165	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2166
2167	/* Wait for the read to complete, and return result. */
2168	return (buf_biowait(bp));
2169}
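
/*
 * Usage sketch (added commentary, not original code): a filesystem reading
 * one metadata block synchronously.  "my_vp", "my_blkno" and "my_blksize"
 * are hypothetical placeholders.
 *
 *	buf_t	bp = NULL;
 *	errno_t	error;
 *
 *	error = buf_meta_bread(my_vp, my_blkno, my_blksize, NOCRED, &bp);
 *	if (error == 0) {
 *		char *data = (char *)buf_dataptr(bp);
 *		// ... inspect or copy the block contents ...
 *	}
 *	if (bp != NULL)
 *		buf_brelse(bp);		// the caller owns the buffer either way
 */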
2170
2171/*
2172 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2173 */
2174errno_t
2175buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2176{
2177	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
2178}
2179
2180/*
2181 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2182 * [buf_breadn() for meta-data]
2183 */
2184errno_t
2185buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2186{
2187	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
2188}
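
/*
 * Usage sketch (added commentary, not original code): read "blkno"
 * synchronously while prefetching the next two blocks asynchronously.
 * The variable names are hypothetical.
 *
 *	daddr64_t	rablks[2]  = { blkno + 1, blkno + 2 };
 *	int		rasizes[2] = { blksize, blksize };
 *	buf_t		bp = NULL;
 *	errno_t		error;
 *
 *	error = buf_breadn(vp, blkno, blksize, rablks, rasizes, 2, NOCRED, &bp);
 *	// only the first block is handed back; the read-aheads simply warm
 *	// the cache and are released by buf_biodone()
 *	if (bp != NULL)
 *		buf_brelse(bp);
 */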
2189
2190/*
2191 * Block write.  Described in Bach (p.56)
2192 */
2193errno_t
2194buf_bwrite(buf_t bp)
2195{
2196	int	sync, wasdelayed;
2197	errno_t	rv;
2198	proc_t	p = current_proc();
2199	vnode_t	vp = bp->b_vp;
2200
2201	if (bp->b_datap == 0) {
2202	        if (brecover_data(bp) == 0)
2203		        return (0);
2204	}
2205	/* Remember buffer type, to switch on it later. */
2206	sync = !ISSET(bp->b_flags, B_ASYNC);
2207	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2208	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2209
2210	if (wasdelayed)
2211		OSAddAtomicLong(-1, &nbdwrite);
2212
2213	if (!sync) {
2214		/*
2215		 * If not synchronous, pay for the I/O operation and make
2216		 * sure the buf is on the correct vnode queue.  We have
2217		 * to do this now, because if we don't, the vnode may not
2218		 * be properly notified that its I/O has completed.
2219		 */
2220		if (wasdelayed)
2221			buf_reassign(bp, vp);
2222		else
2223		if (p && p->p_stats)
2224			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
2225	}
2226	trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2227
2228	/* Initiate disk write.  Make sure the appropriate party is charged. */
2229
2230        OSAddAtomic(1, &vp->v_numoutput);
2231
2232	VNOP_STRATEGY(bp);
2233
2234	if (sync) {
2235		/*
2236		 * If I/O was synchronous, wait for it to complete.
2237		 */
2238		rv = buf_biowait(bp);
2239
2240		/*
2241		 * Pay for the I/O operation, if it hasn't been paid for, and
2242		 * make sure it's on the correct vnode queue. (async operations
2243		 * were paid for above.)
2244		 */
2245		if (wasdelayed)
2246			buf_reassign(bp, vp);
2247		else
2248		if (p && p->p_stats)
2249			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
2250
2251		/* Release the buffer. */
2252		// XXXdbg - only if the unused bit is set
2253		if (!ISSET(bp->b_flags, B_NORELSE)) {
2254		    buf_brelse(bp);
2255		} else {
2256		    CLR(bp->b_flags, B_NORELSE);
2257		}
2258
2259		return (rv);
2260	} else {
2261		return (0);
2262	}
2263}
2264
2265int
2266vn_bwrite(struct vnop_bwrite_args *ap)
2267{
2268	return (buf_bwrite(ap->a_bp));
2269}
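
/*
 * Usage sketch (added commentary, not original code): a synchronous rewrite
 * of a cached block.  buf_bwrite() waits for the I/O and releases the buffer
 * unless the caller set B_ASYNC beforehand (compare buf_bawrite() below).
 *
 *	buf_t	bp;
 *	errno_t	error;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	// ... modify the data at buf_dataptr(bp) ...
 *	error = buf_bwrite(bp);		// buffer is gone on return unless
 *					// B_NORELSE was set
 */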
2270
2271/*
2272 * Delayed write.
2273 *
2274 * The buffer is marked dirty, but is not queued for I/O.
2275 * This routine should be used when the buffer is expected
2276 * to be modified again soon, typically a small write that
2277 * partially fills a buffer.
2278 *
2279 * NB: magnetic tapes cannot be delayed; they must be
2280 * written in the order that the writes are requested.
2281 *
2282 * Described in Leffler, et al. (pp. 208-213).
2283 *
2284 * Note: With the ability to allocate additional buffer
2285 * headers, we can get into a situation where "too" many
2286 * buf_bdwrite()s allow the kernel to create buffers faster
2287 * than the disks can service them. Doing a buf_bawrite() in
2288 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
2289 */
2290__private_extern__ int
2291bdwrite_internal(buf_t bp, int return_error)
2292{
2293	proc_t	p  = current_proc();
2294	vnode_t	vp = bp->b_vp;
2295
2296	/*
2297	 * If the block hasn't been seen before:
2298	 *	(1) Mark it as having been seen,
2299	 *	(2) Charge for the write,
2300	 *	(3) Make sure it's on its vnode's correct block list.
2301	 */
2302	if (!ISSET(bp->b_flags, B_DELWRI)) {
2303		SET(bp->b_flags, B_DELWRI);
2304		if (p && p->p_stats)
2305			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
2306		OSAddAtomicLong(1, &nbdwrite);
2307		buf_reassign(bp, vp);
2308	}
2309
2310	/*
2311	 * if we're not LOCKED, but the total number of delayed writes
2312	 * has climbed above 75% of the total buffers in the system,
2313	 * return an error if the caller has indicated that it can
2314	 * handle one in this case; otherwise schedule the I/O now.
2315	 * This is done to prevent us from allocating tons of extra
2316	 * buffers when dealing with virtual disks (i.e. DiskImages),
2317	 * because additional buffers are dynamically allocated to prevent
2318	 * deadlocks from occurring
2319	 *
2320	 * however, we can't do a buf_bawrite() if the LOCKED bit is set because the
2321	 * buffer is part of a transaction and can't go to disk until
2322	 * the LOCKED bit is cleared.
2323	 */
2324	if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
2325		if (return_error)
2326			return (EAGAIN);
2327		/*
2328		 * If the vnode has "too many" write operations in progress,
2329		 * wait for them to finish their I/O
2330		 */
2331		(void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2332
2333		return (buf_bawrite(bp));
2334	}
2335
2336	/* Otherwise, the "write" is done, so mark and release the buffer. */
2337	SET(bp->b_flags, B_DONE);
2338	buf_brelse(bp);
2339	return (0);
2340}
2341
2342errno_t
2343buf_bdwrite(buf_t bp)
2344{
2345	return (bdwrite_internal(bp, 0));
2346}
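
/*
 * Usage sketch (added commentary, not original code): a small update that is
 * likely to be modified again soon, so the write is only scheduled.
 *
 *	buf_t	bp;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	// ... update a few bytes at buf_dataptr(bp) ...
 *	(void) buf_bdwrite(bp);		// marks B_DELWRI and releases the
 *					// buffer; may fall back to a
 *					// buf_bawrite() if too many delayed
 *					// writes are already outstanding
 */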
2347
2348
2349/*
2350 * Asynchronous block write; just an asynchronous buf_bwrite().
2351 *
2352 * Note: With the ability to allocate additional buffer
2353 * headers, we can get into a situation where "too" many
2354 * buf_bawrite()s allow the kernel to create buffers faster
2355 * than the disks can service them.
2356 * We limit the number of "in flight" writes a vnode can have to
2357 * avoid this.
2358 */
2359static int
2360bawrite_internal(buf_t bp, int throttle)
2361{
2362	vnode_t	vp = bp->b_vp;
2363
2364	if (vp) {
2365	        if (throttle)
2366		        /*
2367			 * If the vnode has "too many" write operations in progress,
2368			 * wait for them to finish their I/O
2369			 */
2370		        (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2371		else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
2372		        /*
2373			 * return to the caller and
2374			 * let him decide what to do
2375			 */
2376		        return (EWOULDBLOCK);
2377	}
2378	SET(bp->b_flags, B_ASYNC);
2379
2380	return (VNOP_BWRITE(bp));
2381}
2382
2383errno_t
2384buf_bawrite(buf_t bp)
2385{
2386	return (bawrite_internal(bp, 1));
2387}
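
/*
 * Usage sketch (added commentary, not original code): fire-and-forget write.
 * Because buf_bawrite() passes throttle == 1, the caller may briefly block in
 * vnode_waitforwrites() if the vnode already has VNODE_ASYNC_THROTTLE writes
 * in flight, but it never waits for this particular I/O to finish.
 *
 *	buf_t	bp;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	// ... fill in the data ...
 *	(void) buf_bawrite(bp);		// B_ASYNC is set; buf_biodone()
 *					// will buf_brelse() the buffer
 */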
2388
2389
2390
2391static void
2392buf_free_meta_store(buf_t bp)
2393{
2394	if (bp->b_bufsize) {
2395		if (ISSET(bp->b_flags, B_ZALLOC)) {
2396			zone_t z;
2397
2398			z = getbufzone(bp->b_bufsize);
2399			zfree(z, (void *)bp->b_datap);
2400		} else
2401			kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2402
2403		bp->b_datap = (uintptr_t)NULL;
2404		bp->b_bufsize = 0;
2405	}
2406}
2407
2408
2409static buf_t
2410buf_brelse_shadow(buf_t bp)
2411{
2412	buf_t	bp_head;
2413	buf_t	bp_temp;
2414	buf_t	bp_return = NULL;
2415#ifdef BUF_MAKE_PRIVATE
2416	buf_t	bp_data;
2417	int	data_ref = 0;
2418#endif
2419	int need_wakeup = 0;
2420
2421	lck_mtx_lock_spin(buf_mtxp);
2422
2423	bp_head = (buf_t)bp->b_orig;
2424
2425	if (bp_head->b_whichq != -1)
2426		panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2427
2428#ifdef BUF_MAKE_PRIVATE
2429	if ((bp_data = bp->b_data_store)) {
2430		bp_data->b_data_ref--;
2431		/*
2432		 * snapshot the ref count so that we can check it
2433		 * outside of the lock... we only want the guy going
2434		 * from 1 -> 0 to try and release the storage
2435		 */
2436		data_ref = bp_data->b_data_ref;
2437	}
2438#endif
2439	KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2440
2441	bp_head->b_shadow_ref--;
2442
2443	for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2444
2445	if (bp_temp == NULL)
2446		panic("buf_brelse_shadow: bp not on list %p", bp_head);
2447
2448	bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2449
2450#ifdef BUF_MAKE_PRIVATE
2451	/*
2452	 * we're about to free the current 'owner' of the data buffer and
2453	 * there is at least one other shadow buf_t still pointing at it
2454	 * so transfer it to the first shadow buf left in the chain
2455	 */
2456	if (bp == bp_data && data_ref) {
2457		if ((bp_data = bp_head->b_shadow) == NULL)
2458			panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2459
2460		for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2461			bp_temp->b_data_store = bp_data;
2462		bp_data->b_data_ref = data_ref;
2463	}
2464#endif
2465	if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
2466		panic("buf_brelse_shadow: b_shadow != NULL && b_shadow_ref == 0  bp(%p)", bp);
2467	if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
2468		panic("buf_brelse_shadow: b_shadow == NULL && b_shadow_ref != 0  bp(%p)", bp);
2469
2470	if (bp_head->b_shadow_ref == 0) {
2471		if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2472
2473			CLR(bp_head->b_flags, B_AGE);
2474			bp_head->b_timestamp = buf_timestamp();
2475
2476			if (ISSET(bp_head->b_flags, B_LOCKED)) {
2477				bp_head->b_whichq = BQ_LOCKED;
2478				binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2479			} else {
2480				bp_head->b_whichq = BQ_META;
2481				binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2482			}
2483		} else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2484			CLR(bp_head->b_lflags, BL_WAITSHADOW);
2485
2486			bp_return = bp_head;
2487		}
2488		if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2489			CLR(bp_head->b_lflags, BL_WANTED_REF);
2490			need_wakeup = 1;
2491		}
2492	}
2493	lck_mtx_unlock(buf_mtxp);
2494
2495	if (need_wakeup) {
2496		wakeup(bp_head);
2497	}
2498
2499#ifdef BUF_MAKE_PRIVATE
2500	if (bp == bp_data && data_ref == 0)
2501		buf_free_meta_store(bp);
2502
2503	bp->b_data_store = NULL;
2504#endif
2505	KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2506
2507	return (bp_return);
2508}
2509
2510
2511/*
2512 * Release a buffer on to the free lists.
2513 * Described in Bach (p. 46).
2514 */
2515void
2516buf_brelse(buf_t bp)
2517{
2518	struct bqueues *bufq;
2519	long	whichq;
2520	upl_t	upl;
2521	int need_wakeup = 0;
2522	int need_bp_wakeup = 0;
2523
2524
2525	if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
2526	        panic("buf_brelse: bad buffer = %p\n", bp);
2527
2528#ifdef JOE_DEBUG
2529	(void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2530
2531	bp->b_lastbrelse = current_thread();
2532	bp->b_tag = 0;
2533#endif
2534	if (bp->b_lflags & BL_IOBUF) {
2535		buf_t	shadow_master_bp = NULL;
2536
2537		if (ISSET(bp->b_lflags, BL_SHADOW))
2538			shadow_master_bp = buf_brelse_shadow(bp);
2539		else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC))
2540			 buf_free_meta_store(bp);
2541	        free_io_buf(bp);
2542
2543		if (shadow_master_bp) {
2544			bp = shadow_master_bp;
2545			goto finish_shadow_master;
2546		}
2547		return;
2548	}
2549
2550	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2551		     bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2552		     bp->b_flags, 0);
2553
2554	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2555
2556	/*
2557	 * if we're invalidating a buffer that has the B_FILTER bit
2558	 * set then call the b_iodone function so it gets cleaned
2559	 * up properly.
2560	 *
2561	 * the HFS journal code depends on this
2562	 */
2563	if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2564		if (ISSET(bp->b_flags, B_FILTER)) {	/* if necessary, call out */
2565			void	(*iodone_func)(struct buf *, void *) = bp->b_iodone;
2566			void 	*arg = bp->b_transaction;
2567
2568			CLR(bp->b_flags, B_FILTER);	/* but note callout done */
2569			bp->b_iodone = NULL;
2570			bp->b_transaction = NULL;
2571
2572			if (iodone_func == NULL) {
2573				panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2574			}
2575			(*iodone_func)(bp, arg);
2576		}
2577	}
2578	/*
2579	 * I/O is done. Cleanup the UPL state
2580	 */
2581	upl = bp->b_upl;
2582
2583	if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2584		kern_return_t kret;
2585		int           upl_flags;
2586
2587		if (upl == NULL) {
2588		        if ( !ISSET(bp->b_flags, B_INVAL)) {
2589				kret = ubc_create_upl(bp->b_vp,
2590						      ubc_blktooff(bp->b_vp, bp->b_lblkno),
2591						      bp->b_bufsize,
2592						      &upl,
2593						      NULL,
2594						      UPL_PRECIOUS);
2595
2596				if (kret != KERN_SUCCESS)
2597				        panic("brelse: Failed to create UPL");
2598#if  UPL_DEBUG
2599				upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2600#endif /* UPL_DEBUG */
2601			}
2602		} else {
2603			if (bp->b_datap) {
2604			        kret = ubc_upl_unmap(upl);
2605
2606				if (kret != KERN_SUCCESS)
2607				        panic("ubc_upl_unmap failed");
2608				bp->b_datap = (uintptr_t)NULL;
2609			}
2610		}
2611		if (upl) {
2612			if (bp->b_flags & (B_ERROR | B_INVAL)) {
2613			        if (bp->b_flags & (B_READ | B_INVAL))
2614				        upl_flags = UPL_ABORT_DUMP_PAGES;
2615				else
2616				        upl_flags = 0;
2617
2618				ubc_upl_abort(upl, upl_flags);
2619			} else {
2620			        if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2621				        upl_flags = UPL_COMMIT_SET_DIRTY ;
2622				else
2623				        upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2624
2625				ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2626						     UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2627			}
2628			bp->b_upl = NULL;
2629		}
2630	} else {
2631		if ( (upl) )
2632			panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2633	}
2634
2635	/*
2636	 * If it's locked, don't report an error; try again later.
2637	 */
2638	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2639		CLR(bp->b_flags, B_ERROR);
2640	/*
2641	 * If it's not cacheable, or an error, mark it invalid.
2642	 */
2643	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2644		SET(bp->b_flags, B_INVAL);
2645
2646	if ((bp->b_bufsize <= 0) ||
2647			ISSET(bp->b_flags, B_INVAL) ||
2648			(ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2649
2650		boolean_t	delayed_buf_free_meta_store = FALSE;
2651
2652		/*
2653		 * If it's invalid or empty, dissociate it from its vnode,
2654		 * release its storage if B_META, then
2655		 * clean it up a bit and put it on the EMPTY queue
2656		 */
2657		if (ISSET(bp->b_flags, B_DELWRI))
2658			OSAddAtomicLong(-1, &nbdwrite);
2659
2660		if (ISSET(bp->b_flags, B_META)) {
2661			if (bp->b_shadow_ref)
2662				delayed_buf_free_meta_store = TRUE;
2663			else
2664				buf_free_meta_store(bp);
2665		}
2666		/*
2667		 * nuke any credentials we were holding
2668		 */
2669		buf_release_credentials(bp);
2670
2671		lck_mtx_lock_spin(buf_mtxp);
2672
2673		if (bp->b_shadow_ref) {
2674			SET(bp->b_lflags, BL_WAITSHADOW);
2675
2676			lck_mtx_unlock(buf_mtxp);
2677
2678			return;
2679		}
2680		if (delayed_buf_free_meta_store == TRUE) {
2681
2682			lck_mtx_unlock(buf_mtxp);
2683finish_shadow_master:
2684			buf_free_meta_store(bp);
2685
2686			lck_mtx_lock_spin(buf_mtxp);
2687		}
2688		CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2689
2690		if (bp->b_vp)
2691			brelvp_locked(bp);
2692
2693		bremhash(bp);
2694		BLISTNONE(bp);
2695		binshash(bp, &invalhash);
2696
2697		bp->b_whichq = BQ_EMPTY;
2698		binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2699	} else {
2700
2701		/*
2702		 * It has valid data.  Put it on the end of the appropriate
2703		 * queue, so that it'll stick around for as long as possible.
2704		 */
2705		if (ISSET(bp->b_flags, B_LOCKED))
2706			whichq = BQ_LOCKED;		/* locked in core */
2707		else if (ISSET(bp->b_flags, B_META))
2708			whichq = BQ_META;		/* meta-data */
2709		else if (ISSET(bp->b_flags, B_AGE))
2710			whichq = BQ_AGE;		/* stale but valid data */
2711		else
2712			whichq = BQ_LRU;		/* valid data */
2713		bufq = &bufqueues[whichq];
2714
2715		bp->b_timestamp = buf_timestamp();
2716
2717		lck_mtx_lock_spin(buf_mtxp);
2718
2719		/*
2720		 * the buf_brelse_shadow routine doesn't take 'ownership'
2721		 * of the parent buf_t... it updates state that is protected by
2722		 * the buf_mtxp, and checks for BL_BUSY to determine whether to
2723		 * put the buf_t back on a free list.  b_shadow_ref is protected
2724		 * by the lock, and since we have not yet cleared B_BUSY, we need
2725		 * to check it while holding the lock to ensure that one of us
2726		 * puts this buf_t back on a free list when it is safe to do so
2727		 */
2728		if (bp->b_shadow_ref == 0) {
2729			CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2730			bp->b_whichq = whichq;
2731			binstailfree(bp, bufq, whichq);
2732		} else {
2733			/*
2734			 * there are still cloned buf_t's pointing
2735			 * at this guy... need to keep it off the
2736			 * freelists until a buf_brelse is done on
2737			 * the last clone
2738			 */
2739			CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2740		}
2741	}
2742	if (needbuffer) {
2743	        /*
2744		 * needbuffer is a global;
2745		 * we're currently using buf_mtxp to protect it.
2746		 * delay doing the actual wakeup until after
2747		 * we drop buf_mtxp
2748		 */
2749		needbuffer = 0;
2750		need_wakeup = 1;
2751	}
2752	if (ISSET(bp->b_lflags, BL_WANTED)) {
2753	        /*
2754		 * delay the actual wakeup until after we
2755		 * clear BL_BUSY and we've dropped buf_mtxp
2756		 */
2757		need_bp_wakeup = 1;
2758	}
2759	/*
2760	 * Unlock the buffer.
2761	 */
2762	CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2763	buf_busycount--;
2764
2765	lck_mtx_unlock(buf_mtxp);
2766
2767	if (need_wakeup) {
2768	        /*
2769		 * Wake up any processes waiting for any buffer to become free.
2770		 */
2771	        wakeup(&needbuffer);
2772	}
2773	if (need_bp_wakeup) {
2774	        /*
2775		 * Wake up any processes waiting for _this_ buffer to become free.
2776		 */
2777	        wakeup(bp);
2778	}
2779	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2780		     bp, bp->b_datap, bp->b_flags, 0, 0);
2781}
2782
2783/*
2784 * Determine if a block is in the cache.
2785 * Just look on what would be its hash chain.  If it's there, return
2786 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
2787 * we normally don't return the buffer, unless the caller explicitly
2788 * wants us to.
2789 */
2790static boolean_t
2791incore(vnode_t vp, daddr64_t blkno)
2792{
2793        boolean_t retval;
2794	struct	bufhashhdr *dp;
2795
2796	dp = BUFHASH(vp, blkno);
2797
2798	lck_mtx_lock_spin(buf_mtxp);
2799
2800	if (incore_locked(vp, blkno, dp))
2801	        retval = TRUE;
2802	else
2803	        retval = FALSE;
2804	lck_mtx_unlock(buf_mtxp);
2805
2806	return (retval);
2807}
2808
2809
2810static buf_t
2811incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2812{
2813	struct buf *bp;
2814
2815	/* Search hash chain */
2816	for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2817		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2818		    !ISSET(bp->b_flags, B_INVAL)) {
2819			return (bp);
2820		}
2821	}
2822	return (NULL);
2823}
2824
2825void
2826buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2827{
2828	buf_t bp;
2829	struct	bufhashhdr *dp;
2830
2831	dp = BUFHASH(vp, blkno);
2832
2833	lck_mtx_lock_spin(buf_mtxp);
2834
2835	for (;;) {
2836		if ((bp = incore_locked(vp, blkno, dp)) == NULL)
2837			break;
2838
2839		if (bp->b_shadow_ref == 0)
2840			break;
2841
2842		SET(bp->b_lflags, BL_WANTED_REF);
2843
2844		(void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL);
2845	}
2846	lck_mtx_unlock(buf_mtxp);
2847}
2848
2849/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2850/*
2851 * Get a block of requested size that is associated with
2852 * a given vnode and block offset. If it is found in the
2853 * block cache, mark it as having been found, make it busy
2854 * and return it. Otherwise, return an empty block of the
2855 * correct size. It is up to the caller to ensure that the
2856 * cached blocks are of the correct size.
2857 */
2858buf_t
2859buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2860{
2861	buf_t bp;
2862	int   err;
2863	upl_t upl;
2864	upl_page_info_t *pl;
2865	kern_return_t kret;
2866	int ret_only_valid;
2867	struct timespec ts;
2868	int upl_flags;
2869	struct	bufhashhdr *dp;
2870
2871	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2872		     (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
2873
2874	ret_only_valid = operation & BLK_ONLYVALID;
2875	operation &= ~BLK_ONLYVALID;
2876	dp = BUFHASH(vp, blkno);
2877start:
2878	lck_mtx_lock_spin(buf_mtxp);
2879
2880	if ((bp = incore_locked(vp, blkno, dp))) {
2881		/*
2882		 * Found in the Buffer Cache
2883		 */
2884		if (ISSET(bp->b_lflags, BL_BUSY)) {
2885			/*
2886			 * but is busy
2887			 */
2888			switch (operation) {
2889			case BLK_READ:
2890			case BLK_WRITE:
2891			case BLK_META:
2892				SET(bp->b_lflags, BL_WANTED);
2893				bufstats.bufs_busyincore++;
2894
2895				/*
2896				 * don't retake the mutex after being awakened...
2897				 * the timeout is in msecs
2898				 */
2899				ts.tv_sec = (slptimeo/1000);
2900				ts.tv_nsec = (slptimeo % 1000) * 10  * NSEC_PER_USEC * 1000;
2901
2902				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2903					     (uintptr_t)blkno, size, operation, 0, 0);
2904
2905				err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2906
2907				/*
2908				 * Callers who call with PCATCH or timeout are
2909				 * willing to deal with the NULL pointer
2910				 */
2911				if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2912					return (NULL);
2913				goto start;
2914				/*NOTREACHED*/
2915				break;
2916
2917			default:
2918			        /*
2919				 * unknown operation requested
2920				 */
2921				panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2922				/*NOTREACHED*/
2923				break;
2924			}
2925		} else {
2926			/*
2927			 * buffer in core and not busy
2928			 */
2929			SET(bp->b_lflags, BL_BUSY);
2930			SET(bp->b_flags, B_CACHE);
2931			buf_busycount++;
2932
2933			bremfree_locked(bp);
2934			bufstats.bufs_incore++;
2935
2936			lck_mtx_unlock(buf_mtxp);
2937#ifdef JOE_DEBUG
2938			bp->b_owner = current_thread();
2939			bp->b_tag   = 1;
2940#endif
2941			if ( (bp->b_upl) )
2942			        panic("buffer has UPL, but not marked BUSY: %p", bp);
2943
2944			if ( !ret_only_valid && bp->b_bufsize != size)
2945			        allocbuf(bp, size);
2946
2947			upl_flags = 0;
2948			switch (operation) {
2949			case BLK_WRITE:
2950				/*
2951				 * "write" operation:  let the UPL subsystem
2952				 * know that we intend to modify the buffer
2953				 * cache pages we're gathering.
2954				 */
2955				upl_flags |= UPL_WILL_MODIFY;
2956			case BLK_READ:
2957				upl_flags |= UPL_PRECIOUS;
2958			        if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2959					kret = ubc_create_upl(vp,
2960							      ubc_blktooff(vp, bp->b_lblkno),
2961							      bp->b_bufsize,
2962							      &upl,
2963							      &pl,
2964							      upl_flags);
2965					if (kret != KERN_SUCCESS)
2966					        panic("Failed to create UPL");
2967
2968					bp->b_upl = upl;
2969
2970					if (upl_valid_page(pl, 0)) {
2971					        if (upl_dirty_page(pl, 0))
2972						        SET(bp->b_flags, B_WASDIRTY);
2973						else
2974						        CLR(bp->b_flags, B_WASDIRTY);
2975					} else
2976					        CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2977
2978					kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
2979
2980					if (kret != KERN_SUCCESS)
2981					        panic("getblk: ubc_upl_map() failed with (%d)", kret);
2982				}
2983				break;
2984
2985			case BLK_META:
2986				/*
2987				 * VM is not involved in I/O for the meta data;
2988				 * the buffer already has valid data
2989				 */
2990				break;
2991
2992			default:
2993				panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2994				/*NOTREACHED*/
2995				break;
2996			}
2997		}
2998	} else { /* not incore() */
2999		int queue = BQ_EMPTY; /* Start with no preference */
3000
3001		if (ret_only_valid) {
3002			lck_mtx_unlock(buf_mtxp);
3003			return (NULL);
3004		}
3005		if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
3006			operation = BLK_META;
3007
3008		if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
3009			goto start;
3010
3011		/*
3012		 * getnewbuf may block for a number of different reasons...
3013		 * if it does, it's then possible for someone else to
3014		 * create a buffer for the same block and insert it into
3015		 * the hash... if we see it incore at this point we dump
3016		 * the buffer we were working on and start over
3017		 */
3018		if (incore_locked(vp, blkno, dp)) {
3019			SET(bp->b_flags, B_INVAL);
3020			binshash(bp, &invalhash);
3021
3022			lck_mtx_unlock(buf_mtxp);
3023
3024			buf_brelse(bp);
3025			goto start;
3026		}
3027		/*
3028		 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3029		 *       CALLED!  BE CAREFUL.
3030		 */
3031
3032		/*
3033		 * mark the buffer as B_META if indicated
3034		 * so that when the buffer is released it will go to the META queue
3035		 */
3036		if (operation == BLK_META)
3037		        SET(bp->b_flags, B_META);
3038
3039		bp->b_blkno = bp->b_lblkno = blkno;
3040		bp->b_vp = vp;
3041
3042		/*
3043		 * Insert in the hash so that incore() can find it
3044		 */
3045		binshash(bp, BUFHASH(vp, blkno));
3046
3047		bgetvp_locked(vp, bp);
3048
3049		lck_mtx_unlock(buf_mtxp);
3050
3051		allocbuf(bp, size);
3052
3053		upl_flags = 0;
3054		switch (operation) {
3055		case BLK_META:
3056			/*
3057			 * buffer data is invalid...
3058			 *
3059			 * I don't want to have to retake buf_mtxp,
3060			 * so the miss and vmhits counters are done
3061			 * with Atomic updates... all other counters
3062			 * in bufstats are protected with either
3063			 * buf_mtxp or iobuffer_mtxp
3064			 */
3065		        OSAddAtomicLong(1, &bufstats.bufs_miss);
3066			break;
3067
3068		case BLK_WRITE:
3069			/*
3070			 * "write" operation:  let the UPL subsystem know
3071			 * that we intend to modify the buffer cache pages
3072			 * we're gathering.
3073			 */
3074			upl_flags |= UPL_WILL_MODIFY;
3075		case BLK_READ:
3076		  {     off_t	f_offset;
3077			size_t 	contig_bytes;
3078			int	bmap_flags;
3079
3080			if ( (bp->b_upl) )
3081				panic("bp already has UPL: %p",bp);
3082
3083			f_offset = ubc_blktooff(vp, blkno);
3084
3085			upl_flags |= UPL_PRECIOUS;
3086			kret = ubc_create_upl(vp,
3087					      f_offset,
3088					      bp->b_bufsize,
3089					      &upl,
3090					      &pl,
3091					      upl_flags);
3092
3093			if (kret != KERN_SUCCESS)
3094				panic("Failed to create UPL");
3095#if  UPL_DEBUG
3096			upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3097#endif /* UPL_DEBUG */
3098			bp->b_upl = upl;
3099
3100			if (upl_valid_page(pl, 0)) {
3101
3102			        if (operation == BLK_READ)
3103				        bmap_flags = VNODE_READ;
3104				else
3105				        bmap_flags = VNODE_WRITE;
3106
3107				SET(bp->b_flags, B_CACHE | B_DONE);
3108
3109			        OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3110
3111				bp->b_validoff = 0;
3112				bp->b_dirtyoff = 0;
3113
3114				if (upl_dirty_page(pl, 0)) {
3115					/* page is dirty */
3116				        SET(bp->b_flags, B_WASDIRTY);
3117
3118					bp->b_validend = bp->b_bcount;
3119					bp->b_dirtyend = bp->b_bcount;
3120				} else {
3121					/* page is clean */
3122					bp->b_validend = bp->b_bcount;
3123					bp->b_dirtyend = 0;
3124				}
3125				/*
3126				 * try to recreate the physical block number associated with
3127				 * this buffer...
3128				 */
3129				if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3130				        panic("getblk: VNOP_BLOCKMAP failed");
3131				/*
3132				 * if the extent represented by this buffer
3133				 * is not completely physically contiguous on
3134				 * disk, then we can't cache the physical mapping
3135				 * in the buffer header
3136				 */
3137				if ((long)contig_bytes < bp->b_bcount)
3138				        bp->b_blkno = bp->b_lblkno;
3139			} else {
3140			        OSAddAtomicLong(1, &bufstats.bufs_miss);
3141			}
3142			kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3143
3144			if (kret != KERN_SUCCESS)
3145			        panic("getblk: ubc_upl_map() failed with (%d)", kret);
3146			break;
3147		  }
3148		default:
3149			panic("getblk: paging or unknown operation - %x", operation);
3150			/*NOTREACHED*/
3151			break;
3152		}
3153	}
3154	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3155		     bp, bp->b_datap, bp->b_flags, 3, 0);
3156
3157#ifdef JOE_DEBUG
3158	(void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3159#endif
3160	return (bp);
3161}
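
/*
 * Usage sketch (added commentary, not original code): probe the cache for a
 * metadata block without creating a new buffer on a miss.  BLK_ONLYVALID
 * makes buf_getblk() return NULL instead of allocating one.
 *
 *	buf_t	bp;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_META | BLK_ONLYVALID);
 *	if (bp != NULL) {
 *		// the block was resident; bp comes back BL_BUSY and must be
 *		// released (or written) by the caller
 *		buf_brelse(bp);
 *	}
 */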
3162
3163/*
3164 * Get an empty, disassociated buffer of given size.
3165 */
3166buf_t
3167buf_geteblk(int size)
3168{
3169	buf_t	bp = NULL;
3170	int queue = BQ_EMPTY;
3171
3172	do {
3173		lck_mtx_lock_spin(buf_mtxp);
3174
3175		bp = getnewbuf(0, 0, &queue);
3176	} while (bp == NULL);
3177
3178	SET(bp->b_flags, (B_META|B_INVAL));
3179
3180#if DIAGNOSTIC
3181	assert(queue == BQ_EMPTY);
3182#endif /* DIAGNOSTIC */
3183	/* XXX need to implement logic to deal with other queues */
3184
3185	binshash(bp, &invalhash);
3186	bufstats.bufs_eblk++;
3187
3188	lck_mtx_unlock(buf_mtxp);
3189
3190	allocbuf(bp, size);
3191
3192	return (bp);
3193}
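
/*
 * Usage sketch (added commentary, not original code): grab a scratch buffer
 * that is not associated with any vnode or disk block, e.g. for staging data
 * that will be copied elsewhere.
 *
 *	buf_t	bp;
 *
 *	bp = buf_geteblk(8192);			// never returns NULL; may block
 *	bzero((void *)buf_dataptr(bp), 8192);
 *	// ... use the storage ...
 *	buf_brelse(bp);				// B_INVAL is set, so the buffer
 *						// goes back to the empty queue
 */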
3194
3195uint32_t
3196buf_redundancy_flags(buf_t bp)
3197{
3198	return bp->b_redundancy_flags;
3199}
3200
3201void
3202buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3203{
3204	SET(bp->b_redundancy_flags, flags);
3205}
3206
3207void
3208buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3209{
3210	CLR(bp->b_redundancy_flags, flags);
3211}
3212
3213/*
3214 * With UBC, there is no need to expand / shrink the file data
3215 * buffer. The VM uses the same pages, hence no waste.
3216 * All the file data buffers can have one size.
3217 * In fact expand / shrink would be an expensive operation.
3218 *
3219 * The only exception to this is meta-data buffers. Most of the
3220 * meta data operations are smaller than PAGE_SIZE. Having the
3221 * meta-data buffers grow and shrink as needed, optimizes use
3222 * of the kernel wired memory.
3223 */
3224
3225int
3226allocbuf(buf_t bp, int size)
3227{
3228	vm_size_t desired_size;
3229
3230	desired_size = roundup(size, CLBYTES);
3231
3232	if (desired_size < PAGE_SIZE)
3233		desired_size = PAGE_SIZE;
3234	if (desired_size > MAXBSIZE)
3235		panic("allocbuf: buffer larger than MAXBSIZE requested");
3236
3237	if (ISSET(bp->b_flags, B_META)) {
3238		zone_t zprev, z;
3239		int    nsize = roundup(size, MINMETA);
3240
3241		if (bp->b_datap) {
3242			vm_offset_t elem = (vm_offset_t)bp->b_datap;
3243
3244			if (ISSET(bp->b_flags, B_ZALLOC)) {
3245			        if (bp->b_bufsize < nsize) {
3246				        /* reallocate to a bigger size */
3247
3248				        zprev = getbufzone(bp->b_bufsize);
3249					if (nsize <= MAXMETA) {
3250					        desired_size = nsize;
3251						z = getbufzone(nsize);
3252						/* b_datap not really a ptr */
3253						*(void **)(&bp->b_datap) = zalloc(z);
3254					} else {
3255					        bp->b_datap = (uintptr_t)NULL;
3256					        kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3257						CLR(bp->b_flags, B_ZALLOC);
3258					}
3259					bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3260					zfree(zprev, (void *)elem);
3261				} else {
3262				        desired_size = bp->b_bufsize;
3263				}
3264
3265			} else {
3266				if ((vm_size_t)bp->b_bufsize < desired_size) {
3267					/* reallocate to a bigger size */
3268				        bp->b_datap = (uintptr_t)NULL;
3269					kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3270					bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3271					kmem_free(kernel_map, elem, bp->b_bufsize);
3272				} else {
3273					desired_size = bp->b_bufsize;
3274				}
3275			}
3276		} else {
3277			/* new allocation */
3278			if (nsize <= MAXMETA) {
3279				desired_size = nsize;
3280				z = getbufzone(nsize);
3281				/* b_datap not really a ptr */
3282				*(void **)(&bp->b_datap) = zalloc(z);
3283				SET(bp->b_flags, B_ZALLOC);
3284			} else
3285				kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3286		}
3287
3288		if (bp->b_datap == 0)
3289		        panic("allocbuf: NULL b_datap");
3290	}
3291	bp->b_bufsize = desired_size;
3292	bp->b_bcount = size;
3293
3294	return (0);
3295}
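
/*
 * Worked example (added commentary, not original code): growing a B_META
 * buffer, assuming the zones hold multiples of MINMETA (512 bytes):
 *
 *	allocbuf(bp, 1024);	// zalloc()s from the 1024-byte zone, sets B_ZALLOC
 *	allocbuf(bp, 2048);	// zalloc()s 2048 bytes, copies the old 1024,
 *				// zfree()s the old element
 *	allocbuf(bp, 512);	// no shrink: b_bufsize stays 2048, only
 *				// b_bcount drops to 512
 *
 * Requests larger than MAXMETA come from kmem_alloc_kobject() instead and
 * clear the B_ZALLOC marking.
 */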
3296
3297/*
3298 *	Get a new buffer from one of the free lists.
3299 *
3300 *	A request for a queue is passed in. The queue from which the buffer
3301 *	was taken is returned. Out of range queue requests get BQ_EMPTY. A request for
3302 *	BQUEUE means no preference; use heuristics in that case.
3303 *	The heuristics are as follows:
3304 *	Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3305 *	If none is available, block until one is made available.
3306 *	If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
3307 *	Pick the most stale buffer.
3308 *	If the found buffer was marked for delayed write, start the async write
3309 *	and restart the search.
3310 *	Initialize the fields and disassociate the buffer from the vnode.
3311 *	Remove the buffer from the hash. Return the buffer and the queue
3312 *	on which it was found.
3313 *
3314 *	buf_mtxp is held upon entry
3315 *	returns with buf_mtxp locked if new buf available
3316 *	returns with buf_mtxp UNlocked if new buf NOT available
3317 */
3318
3319static buf_t
3320getnewbuf(int slpflag, int slptimeo, int * queue)
3321{
3322	buf_t	bp;
3323	buf_t	lru_bp;
3324	buf_t	age_bp;
3325	buf_t	meta_bp;
3326	int	age_time, lru_time, bp_time, meta_time;
3327	int	req = *queue;	/* save it for restarts */
3328	struct timespec ts;
3329
3330start:
3331	/*
3332	 * invalid request gets empty queue
3333	 */
3334	if ((*queue >= BQUEUES) || (*queue < 0)
3335		|| (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
3336		*queue = BQ_EMPTY;
3337
3338
3339	if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
3340	        goto found;
3341
3342	/*
3343	 * need to grow number of bufs, add another one rather than recycling
3344	 */
3345	if (nbuf_headers < max_nbuf_headers) {
3346		/*
3347		 * Increment the count now, as the lock
3348		 * is dropped for allocation.
3349		 * That avoids overcommits
3350		 */
3351		nbuf_headers++;
3352		goto add_newbufs;
3353	}
3354	/* Try for the requested queue first */
3355	bp = bufqueues[*queue].tqh_first;
3356	if (bp)
3357	        goto found;
3358
3359	/* Unable to use requested queue */
3360	age_bp = bufqueues[BQ_AGE].tqh_first;
3361	lru_bp = bufqueues[BQ_LRU].tqh_first;
3362	meta_bp = bufqueues[BQ_META].tqh_first;
3363
3364	if (!age_bp && !lru_bp && !meta_bp) {
3365		/*
3366		 * Unavailable on the AGE, LRU, and META queues
3367		 * Try the empty list first
3368		 */
3369		bp = bufqueues[BQ_EMPTY].tqh_first;
3370		if (bp) {
3371			*queue = BQ_EMPTY;
3372			goto found;
3373		}
3374		/*
3375		 * We have seen that this is hard to trigger.
3376		 * This is an overcommit of nbufs but needed
3377		 * in some scenarios with disk images
3378		 */
3379
3380add_newbufs:
3381		lck_mtx_unlock(buf_mtxp);
3382
3383		/* Create a new temporary buffer header */
3384		bp = (struct buf *)zalloc(buf_hdr_zone);
3385
3386		if (bp) {
3387			bufhdrinit(bp);
3388			bp->b_whichq = BQ_EMPTY;
3389			bp->b_timestamp = buf_timestamp();
3390			BLISTNONE(bp);
3391			SET(bp->b_flags, B_HDRALLOC);
3392			*queue = BQ_EMPTY;
3393		}
3394		lck_mtx_lock_spin(buf_mtxp);
3395
3396		if (bp) {
3397			binshash(bp, &invalhash);
3398			binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3399			buf_hdr_count++;
3400			goto found;
3401		}
3402		/* subtract already accounted bufcount */
3403		nbuf_headers--;
3404
3405		bufstats.bufs_sleeps++;
3406
3407		/* wait for a free buffer of any kind */
3408		needbuffer = 1;
3409		/* hz value is 100 */
3410		ts.tv_sec = (slptimeo/1000);
3411		/* the hz value is 100, which leads to 10ms ticks */
3412		ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3413
3414		msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
3415		return (NULL);
3416	}
3417
3418	/* Buffer available either on AGE or LRU or META */
3419	bp = NULL;
3420	*queue = -1;
3421
3422	/* Buffer available either on AGE or LRU */
3423	if (!age_bp) {
3424		bp = lru_bp;
3425		*queue = BQ_LRU;
3426	} else if (!lru_bp) {
3427		bp = age_bp;
3428		*queue = BQ_AGE;
3429	} else { /* buffer available on both AGE and LRU */
3430		int		t = buf_timestamp();
3431
3432		age_time = t - age_bp->b_timestamp;
3433		lru_time = t - lru_bp->b_timestamp;
3434		if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3435			bp = age_bp;
3436			*queue = BQ_AGE;
3437			/*
3438			 * we should probably re-timestamp everything in the
3439			 * queues at this point with the current time
3440			 */
3441		} else {
3442			if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3443				bp = lru_bp;
3444				*queue = BQ_LRU;
3445			} else {
3446				bp = age_bp;
3447				*queue = BQ_AGE;
3448			}
3449		}
3450	}
3451
3452	if (!bp) { /* Neither on AGE nor on LRU */
3453		bp = meta_bp;
3454		*queue = BQ_META;
3455	}  else if (meta_bp) {
3456		int		t = buf_timestamp();
3457
3458		bp_time = t - bp->b_timestamp;
3459		meta_time = t - meta_bp->b_timestamp;
3460
3461		if (!(bp_time < 0) && !(meta_time < 0)) {
3462			/* time not set backwards */
3463			int bp_is_stale;
3464			bp_is_stale = (*queue == BQ_LRU) ?
3465					lru_is_stale : age_is_stale;
3466
3467			if ((meta_time >= meta_is_stale) &&
3468					(bp_time < bp_is_stale)) {
3469				bp = meta_bp;
3470				*queue = BQ_META;
3471			}
3472		}
3473	}
3474found:
3475	if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
3476	        panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3477
3478	/* Clean it */
3479	if (bcleanbuf(bp, FALSE)) {
3480		/*
3481		 * moved to the laundry thread, buffer not ready
3482		 */
3483		*queue = req;
3484		goto start;
3485	}
3486	return (bp);
3487}
3488
3489
3490/*
3491 * Clean a buffer.
3492 * Returns 0 if buffer is ready to use,
3493 * Returns 1 if issued a buf_bawrite() to indicate
3494 * that the buffer is not ready.
3495 *
3496 * buf_mtxp is held upon entry
3497 * returns with buf_mtxp locked
3498 */
3499int
3500bcleanbuf(buf_t bp, boolean_t discard)
3501{
3502	/* Remove from the queue */
3503	bremfree_locked(bp);
3504
3505#ifdef JOE_DEBUG
3506	bp->b_owner = current_thread();
3507	bp->b_tag   = 2;
3508#endif
3509	/*
3510	 * If buffer was a delayed write, start the IO by queuing
3511	 * it on the LAUNDRY queue, and return 1
3512	 */
3513	if (ISSET(bp->b_flags, B_DELWRI)) {
3514		if (discard) {
3515			SET(bp->b_lflags, BL_WANTDEALLOC);
3516		}
3517
3518		bmovelaundry(bp);
3519
3520		lck_mtx_unlock(buf_mtxp);
3521
3522		wakeup(&bufqueues[BQ_LAUNDRY]);
3523		/*
3524		 * and give it a chance to run
3525		 */
3526		(void)thread_block(THREAD_CONTINUE_NULL);
3527
3528		lck_mtx_lock_spin(buf_mtxp);
3529
3530		return (1);
3531	}
3532#ifdef JOE_DEBUG
3533	bp->b_owner = current_thread();
3534	bp->b_tag   = 8;
3535#endif
3536	/*
3537	 * Buffer is no longer on any free list... we own it
3538	 */
3539	SET(bp->b_lflags, BL_BUSY);
3540	buf_busycount++;
3541
3542	bremhash(bp);
3543
3544	/*
3545	 * disassociate us from our vnode, if we had one...
3546	 */
3547	if (bp->b_vp)
3548		brelvp_locked(bp);
3549
3550	lck_mtx_unlock(buf_mtxp);
3551
3552	BLISTNONE(bp);
3553
3554	if (ISSET(bp->b_flags, B_META))
3555		buf_free_meta_store(bp);
3556
3557	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3558
3559	buf_release_credentials(bp);
3560
3561	/* If discarding, just move to the empty queue */
3562	if (discard) {
3563		lck_mtx_lock_spin(buf_mtxp);
3564		CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3565		bp->b_whichq = BQ_EMPTY;
3566		binshash(bp, &invalhash);
3567		binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3568		CLR(bp->b_lflags, BL_BUSY);
3569		buf_busycount--;
3570	} else {
3571		/* Not discarding: clean up and prepare for reuse */
3572		bp->b_bufsize = 0;
3573		bp->b_datap = (uintptr_t)NULL;
3574		bp->b_upl = (void *)NULL;
3575		/*
3576		 * preserve the state of whether this buffer
3577		 * was allocated on the fly or not...
3578		 * the only other flag that should be set at
3579		 * this point is BL_BUSY...
3580		 */
3581#ifdef JOE_DEBUG
3582		bp->b_owner = current_thread();
3583		bp->b_tag   = 3;
3584#endif
3585		bp->b_lflags = BL_BUSY;
3586		bp->b_flags = (bp->b_flags & B_HDRALLOC);
3587		bp->b_dev = NODEV;
3588		bp->b_blkno = bp->b_lblkno = 0;
3589		bp->b_iodone = NULL;
3590		bp->b_error = 0;
3591		bp->b_resid = 0;
3592		bp->b_bcount = 0;
3593		bp->b_dirtyoff = bp->b_dirtyend = 0;
3594		bp->b_validoff = bp->b_validend = 0;
3595		bzero(&bp->b_attr, sizeof(struct bufattr));
3596
3597		lck_mtx_lock_spin(buf_mtxp);
3598	}
3599	return (0);
3600}
3601
3602
3603
3604errno_t
3605buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3606{
3607        buf_t	bp;
3608	errno_t	error;
3609	struct bufhashhdr *dp;
3610
3611	dp = BUFHASH(vp, lblkno);
3612
3613relook:
3614	lck_mtx_lock_spin(buf_mtxp);
3615
3616	if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3617	        lck_mtx_unlock(buf_mtxp);
3618		return (0);
3619	}
3620	if (ISSET(bp->b_lflags, BL_BUSY)) {
3621	        if ( !ISSET(flags, BUF_WAIT)) {
3622		        lck_mtx_unlock(buf_mtxp);
3623			return (EBUSY);
3624		}
3625	        SET(bp->b_lflags, BL_WANTED);
3626
3627		error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3628
3629		if (error) {
3630			return (error);
3631		}
3632		goto relook;
3633	}
3634	bremfree_locked(bp);
3635	SET(bp->b_lflags, BL_BUSY);
3636	SET(bp->b_flags, B_INVAL);
3637	buf_busycount++;
3638#ifdef JOE_DEBUG
3639	bp->b_owner = current_thread();
3640	bp->b_tag   = 4;
3641#endif
3642	lck_mtx_unlock(buf_mtxp);
3643	buf_brelse(bp);
3644
3645	return (0);
3646}
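
/*
 * Usage sketch (added commentary, not original code): throw away any cached
 * copy of a block that is about to be rewritten through another path.
 * BUF_WAIT sleeps if the buffer is currently busy; without it, EBUSY is
 * returned instead.
 *
 *	errno_t	error;
 *
 *	error = buf_invalblkno(vp, lblkno, BUF_WAIT);
 *	if (error == EBUSY) {
 *		// only possible when BUF_WAIT was not passed
 *	}
 */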
3647
3648
3649void
3650buf_drop(buf_t bp)
3651{
3652        int need_wakeup = 0;
3653
3654	lck_mtx_lock_spin(buf_mtxp);
3655
3656	if (ISSET(bp->b_lflags, BL_WANTED)) {
3657	        /*
3658		 * delay the actual wakeup until after we
3659		 * clear BL_BUSY and we've dropped buf_mtxp
3660		 */
3661		need_wakeup = 1;
3662	}
3663#ifdef JOE_DEBUG
3664	bp->b_owner = current_thread();
3665	bp->b_tag   = 9;
3666#endif
3667	/*
3668	 * Unlock the buffer.
3669	 */
3670	CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3671	buf_busycount--;
3672
3673	lck_mtx_unlock(buf_mtxp);
3674
3675	if (need_wakeup) {
3676	        /*
3677		 * Wake up any processes waiting for _this_ buffer to become free.
3678		 */
3679	        wakeup(bp);
3680	}
3681}
3682
3683
3684errno_t
3685buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3686        errno_t error;
3687
3688        lck_mtx_lock_spin(buf_mtxp);
3689
3690	error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3691
3692       	lck_mtx_unlock(buf_mtxp);
3693
3694	return (error);
3695}
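
/*
 * Usage sketch (added commentary, not original code): try to take exclusive
 * ownership of a buffer without sleeping, skipping buffers that belong to a
 * transaction (B_LOCKED).
 *
 *	errno_t	error;
 *
 *	error = buf_acquire(bp, BAC_NOWAIT | BAC_SKIP_LOCKED | BAC_REMOVE, 0, 0);
 *	if (error == 0) {
 *		// bp is now BL_BUSY and off the free lists; hand it back
 *		// with buf_brelse() when done
 *	}
 *	// EBUSY   -> someone else owns it and BAC_NOWAIT was set
 *	// EDEADLK -> skipped because of the LOCKED/NONLOCKED filters
 */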
3696
3697
3698static errno_t
3699buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3700{
3701	errno_t error;
3702	struct timespec ts;
3703
3704	if (ISSET(bp->b_flags, B_LOCKED)) {
3705	        if ((flags & BAC_SKIP_LOCKED))
3706			return (EDEADLK);
3707	} else {
3708	        if ((flags & BAC_SKIP_NONLOCKED))
3709			return (EDEADLK);
3710	}
3711        if (ISSET(bp->b_lflags, BL_BUSY)) {
3712	        /*
3713		 * since the lck_mtx_lock may block, the buffer
3714		 * may become BUSY, so we need to
3715		 * recheck for a NOWAIT request
3716		 */
3717	        if (flags & BAC_NOWAIT)
3718			return (EBUSY);
3719	        SET(bp->b_lflags, BL_WANTED);
3720
3721		/* the hz value is 100, which leads to 10ms */
3722		ts.tv_sec = (slptimeo/100);
3723		ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
3724		error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
3725
3726		if (error)
3727			return (error);
3728		return (EAGAIN);
3729	}
3730	if (flags & BAC_REMOVE)
3731	        bremfree_locked(bp);
3732	SET(bp->b_lflags, BL_BUSY);
3733	buf_busycount++;
3734
3735#ifdef JOE_DEBUG
3736	bp->b_owner = current_thread();
3737	bp->b_tag   = 5;
3738#endif
3739	return (0);
3740}
3741
3742
3743/*
3744 * Wait for operations on the buffer to complete.
3745 * When they do, extract and return the I/O's error value.
3746 */
3747errno_t
3748buf_biowait(buf_t bp)
3749{
3750	while (!ISSET(bp->b_flags, B_DONE)) {
3751
3752		lck_mtx_lock_spin(buf_mtxp);
3753
3754		if (!ISSET(bp->b_flags, B_DONE)) {
3755			DTRACE_IO1(wait__start, buf_t, bp);
3756			(void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3757			DTRACE_IO1(wait__done, buf_t, bp);
3758		} else
3759			lck_mtx_unlock(buf_mtxp);
3760	}
3761	/* check for interruption of I/O (e.g. via NFS), then errors. */
3762	if (ISSET(bp->b_flags, B_EINTR)) {
3763		CLR(bp->b_flags, B_EINTR);
3764		return (EINTR);
3765	} else if (ISSET(bp->b_flags, B_ERROR))
3766		return (bp->b_error ? bp->b_error : EIO);
3767	else
3768		return (0);
3769}
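
/*
 * Usage sketch (added commentary, not original code): the classic
 * "issue the I/O yourself, then wait" pattern that bio_doread() uses above.
 *
 *	buf_t	bp;
 *	errno_t	error;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_READ);
 *	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 *		SET(bp->b_flags, B_READ);
 *		VNOP_STRATEGY(bp);
 *	}
 *	error = buf_biowait(bp);	// returns b_error (or EINTR) once
 *					// buf_biodone() has run
 *	buf_brelse(bp);
 */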
3770
3771
3772/*
3773 * Mark I/O complete on a buffer.
3774 *
3775 * If a callback has been requested, e.g. the pageout
3776 * daemon, do so. Otherwise, awaken waiting processes.
3777 *
3778 * [ Leffler, et al., says on p.247:
3779 *	"This routine wakes up the blocked process, frees the buffer
3780 *	for an asynchronous write, or, for a request by the pagedaemon
3781 *	process, invokes a procedure specified in the buffer structure" ]
3782 *
3783 * In real life, the pagedaemon (or other system processes) wants
3784 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
3785 * (for swap pager, that puts swap buffers on the free lists (!!!),
3786 * for the vn device, that puts malloc'd buffers on the free lists!)
3787 */
3788extern struct timeval priority_IO_timestamp_for_root;
3789extern int hard_throttle_on_root;
3790
3791void
3792buf_biodone(buf_t bp)
3793{
3794	mount_t mp;
3795
3796	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3797		     bp, bp->b_datap, bp->b_flags, 0, 0);
3798
3799	if (ISSET(bp->b_flags, B_DONE))
3800		panic("biodone already");
3801
3802	if (ISSET(bp->b_flags, B_ERROR)) {
3803		fslog_io_error(bp);
3804	}
3805
3806	if (bp->b_vp && bp->b_vp->v_mount) {
3807		mp = bp->b_vp->v_mount;
3808	} else {
3809		mp = NULL;
3810	}
3811
3812	if (mp && (bp->b_flags & B_READ) == 0) {
3813		update_last_io_time(mp);
3814		INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3815	} else if (mp) {
3816		INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
3817	}
3818
3819        if (kdebug_enable) {
3820	        int    code = DKIO_DONE;
3821
3822		if (bp->b_flags & B_READ)
3823		        code |= DKIO_READ;
3824		if (bp->b_flags & B_ASYNC)
3825		        code |= DKIO_ASYNC;
3826
3827		if (bp->b_flags & B_META)
3828		        code |= DKIO_META;
3829		else if (bp->b_flags & B_PAGEIO)
3830		        code |= DKIO_PAGING;
3831
3832		if (bp->b_flags & B_THROTTLED_IO)
3833			code |= DKIO_THROTTLE;
3834		else if (bp->b_flags & B_PASSIVE)
3835			code |= DKIO_PASSIVE;
3836
3837		if (bp->b_attr.ba_flags & BA_NOCACHE)
3838			code |= DKIO_NOCACHE;
3839
3840		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3841                              bp, (uintptr_t)bp->b_vp,
3842				      bp->b_resid, bp->b_error, 0);
3843        }
3844	if ((bp->b_vp != NULLVP) &&
3845	    ((bp->b_flags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ | B_THROTTLED_IO | B_PASSIVE)) == (B_PAGEIO | B_READ)) &&
3846	    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3847	        microuptime(&priority_IO_timestamp_for_root);
3848	        hard_throttle_on_root = 0;
3849	}
3850
3851	/*
3852	 * I/O was done, so don't believe
3853	 * the DIRTY state from VM anymore...
3854	 * and we need to reset the THROTTLED/PASSIVE
3855	 * indicators
3856	 */
3857	CLR(bp->b_flags, (B_WASDIRTY | B_THROTTLED_IO | B_PASSIVE));
3858	CLR(bp->b_attr.ba_flags, (BA_META | BA_NOCACHE));
3859#if !CONFIG_EMBEDDED
3860	CLR(bp->b_attr.ba_flags, (BA_THROTTLED_IO | BA_DELAYIDLESLEEP));
3861#else
3862	CLR(bp->b_attr.ba_flags, BA_THROTTLED_IO);
3863#endif /* !CONFIG_EMBEDDED */
3864	DTRACE_IO1(done, buf_t, bp);
3865
3866	if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3867	        /*
3868		 * wake up any writers blocked
3869		 * on throttle or waiting for I/O
3870		 * to drain
3871		 */
3872		vnode_writedone(bp->b_vp);
3873
3874	if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) {	/* if necessary, call out */
3875		void	(*iodone_func)(struct buf *, void *) = bp->b_iodone;
3876		void 	*arg = bp->b_transaction;
3877		int     callout = ISSET(bp->b_flags, B_CALL);
3878
3879		if (iodone_func == NULL)
3880			panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
3881
3882		CLR(bp->b_flags, (B_CALL | B_FILTER));	/* filters and callouts are one-shot */
3883		bp->b_iodone = NULL;
3884		bp->b_transaction = NULL;
3885
3886		if (callout)
3887		        SET(bp->b_flags, B_DONE);	/* note that it's done */
3888
3889		(*iodone_func)(bp, arg);
3890
3891		if (callout) {
3892			/*
3893			 * assumes that the callback function takes
3894			 * ownership of the bp and deals with releasing it if necessary
3895			 */
3896			goto biodone_done;
3897		}
3898		/*
3899		 * in this case the callback function is acting
3900		 * strictly as a filter... it does not take
3901		 * ownership of the bp and is expecting us
3902		 * to finish cleaning up... this is currently used
3903		 * by the HFS journaling code
3904		 */
3905	}
3906	if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release it */
3907		SET(bp->b_flags, B_DONE);	/* note that it's done */
3908
3909		buf_brelse(bp);
3910	} else {				/* or just wakeup the buffer */
3911	        /*
3912		 * by taking the mutex, we serialize
3913		 * the buf owner calling buf_biowait so that we'll
3914		 * only see him in one of 2 states...
3915		 * state 1: B_DONE wasn't set and he's
3916		 * blocked in msleep
3917		 * state 2: he's blocked trying to take the
3918		 * mutex before looking at B_DONE
3919		 * BL_WANTED is cleared in case anyone else
3920		 * is blocked waiting for the buffer... note
3921		 * that we haven't cleared B_BUSY yet, so if
3922		 * they do get to run, they're going to re-set
3923		 * BL_WANTED and go back to sleep
3924		 */
3925	        lck_mtx_lock_spin(buf_mtxp);
3926
3927		CLR(bp->b_lflags, BL_WANTED);
3928		SET(bp->b_flags, B_DONE);		/* note that it's done */
3929
3930	        lck_mtx_unlock(buf_mtxp);
3931
3932		wakeup(bp);
3933	}
3934biodone_done:
3935	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3936                 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
3937}
3938
3939/*
3940 * Return a count of buffers on the "locked" queue.
3941 */
3942int
3943count_lock_queue(void)
3944{
3945	buf_t	bp;
3946	int	n = 0;
3947
3948	lck_mtx_lock_spin(buf_mtxp);
3949
3950	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3951	    bp = bp->b_freelist.tqe_next)
3952		n++;
3953	lck_mtx_unlock(buf_mtxp);
3954
3955	return (n);
3956}
3957
3958/*
3959 * Return a count of 'busy' buffers. Used at the time of shutdown.
3960 * note: This is also called from the mach side in debug context in kdp.c
3961 */
3962int
3963count_busy_buffers(void)
3964{
3965	return buf_busycount + bufstats.bufs_iobufinuse;
3966}
3967
3968#if DIAGNOSTIC
3969/*
3970 * Print out statistics on the current allocation of the buffer pool.
3971 * Can be enabled to print out on every ``sync'' by setting "syncprt"
3972 * in vfs_syscalls.c using sysctl.
3973 */
3974void
3975vfs_bufstats()
3976{
3977	int i, j, count;
3978	struct buf *bp;
3979	struct bqueues *dp;
3980	int counts[MAXBSIZE/CLBYTES+1];
3981	static char *bname[BQUEUES] =
3982		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3983
3984	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3985		count = 0;
3986		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3987			counts[j] = 0;
3988
3989		lck_mtx_lock(buf_mtxp);
3990
3991		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3992			counts[bp->b_bufsize/CLBYTES]++;
3993			count++;
3994		}
3995		lck_mtx_unlock(buf_mtxp);
3996
3997		printf("%s: total-%d", bname[i], count);
3998		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3999			if (counts[j] != 0)
4000				printf(", %d-%d", j * CLBYTES, counts[j]);
4001		printf("\n");
4002	}
4003}
4004#endif /* DIAGNOSTIC */
4005
4006#define	NRESERVEDIOBUFS	128
4007
4008
4009buf_t
4010alloc_io_buf(vnode_t vp, int priv)
4011{
4012	buf_t	bp;
4013
4014	lck_mtx_lock_spin(iobuffer_mtxp);
4015
4016	while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
4017	       (bp = iobufqueue.tqh_first) == NULL) {
4018		bufstats.bufs_iobufsleeps++;
4019
4020		need_iobuffer = 1;
4021		(void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
4022	}
4023	TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4024
4025	bufstats.bufs_iobufinuse++;
4026	if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
4027		bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4028
4029	lck_mtx_unlock(iobuffer_mtxp);
4030
4031	/*
4032	 * initialize various fields
4033	 * we don't need to hold the mutex since the buffer
4034	 * is now private... the vp should have a reference
4035	 * on it and is not protected by this mutex in any event
4036	 */
4037	bp->b_timestamp = 0;
4038	bp->b_proc = NULL;
4039
4040	bp->b_datap = 0;
4041	bp->b_flags = 0;
4042	bp->b_lflags = BL_BUSY | BL_IOBUF;
4043	bp->b_redundancy_flags = 0;
4044	bp->b_blkno = bp->b_lblkno = 0;
4045#ifdef JOE_DEBUG
4046	bp->b_owner = current_thread();
4047	bp->b_tag   = 6;
4048#endif
4049	bp->b_iodone = NULL;
4050	bp->b_error = 0;
4051	bp->b_resid = 0;
4052	bp->b_bcount = 0;
4053	bp->b_bufsize = 0;
4054	bp->b_upl = NULL;
4055	bp->b_vp = vp;
4056	bzero(&bp->b_attr, sizeof(struct bufattr));
4057
4058	if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
4059		bp->b_dev = vp->v_rdev;
4060	else
4061		bp->b_dev = NODEV;
4062
4063	return (bp);
4064}
4065
4066
4067void
4068free_io_buf(buf_t bp)
4069{
4070        int need_wakeup = 0;
4071
4072	/*
4073	 * put buffer back on the head of the iobufqueue
4074	 */
4075	bp->b_vp = NULL;
4076	bp->b_flags = B_INVAL;
4077
4078	lck_mtx_lock_spin(iobuffer_mtxp);
4079
4080	binsheadfree(bp, &iobufqueue, -1);
4081
4082	if (need_iobuffer) {
4083	        /*
4084		 * Wake up any processes waiting because they need an io buffer
4085		 *
4086		 * do the wakeup after we drop the mutex... it's possible that the
4087		 * wakeup will be superfluous if need_iobuffer gets set again and
4088		 * another thread runs this path, but it's highly unlikely, doesn't
4089		 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4090		 * trying to grab a task related lock...
4091		 */
4092		need_iobuffer = 0;
4093		need_wakeup = 1;
4094	}
4095	if (bufstats.bufs_iobufinuse <= 0)
4096		panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp);
4097
4098	bufstats.bufs_iobufinuse--;
4099
4100	lck_mtx_unlock(iobuffer_mtxp);
4101
4102	if (need_wakeup)
4103	        wakeup(&need_iobuffer);
4104}
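
/*
 * Usage sketch (added commentary, not original code): the cluster and paging
 * layers use these private I/O buffer headers roughly like this; "buffer",
 * "xfer_size" and "blkno" are hypothetical.
 *
 *	buf_t	bp;
 *
 *	bp = alloc_io_buf(vp, 0);	// may sleep until a header is free
 *	bp->b_datap  = (uintptr_t)buffer;
 *	bp->b_bcount = xfer_size;
 *	bp->b_blkno  = bp->b_lblkno = blkno;
 *	SET(bp->b_flags, B_READ);
 *	VNOP_STRATEGY(bp);
 *	(void) buf_biowait(bp);
 *	free_io_buf(bp);		// return the header to iobufqueue
 */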
4105
4106
4107void
4108buf_list_lock(void)
4109{
4110        lck_mtx_lock_spin(buf_mtxp);
4111}
4112
4113void
4114buf_list_unlock(void)
4115{
4116        lck_mtx_unlock(buf_mtxp);
4117}
4118
4119/*
4120 * If getnewbuf() calls bcleanbuf() on the same thread,
4121 * there is a potential for stack overrun and deadlocks.
4122 * So we always hand off the work to a worker thread for completion
4123 */
4124
4125
4126static void
4127bcleanbuf_thread_init(void)
4128{
4129	thread_t	thread = THREAD_NULL;
4130
4131	/* create worker thread */
4132	kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4133	thread_deallocate(thread);
4134}
4135
4136typedef int (*bcleanbufcontinuation)(int);
4137
4138static void
4139bcleanbuf_thread(void)
4140{
4141	struct buf *bp;
4142	int error = 0;
4143	int loopcnt = 0;
4144
4145	for (;;) {
4146	        lck_mtx_lock_spin(buf_mtxp);
4147
4148		while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4149			(void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4150		}
4151
4152		/*
4153		 * Remove from the queue
4154		 */
4155		bremfree_locked(bp);
4156
4157		/*
4158		 * Buffer is no longer on any free list
4159		 */
4160		SET(bp->b_lflags, BL_BUSY);
4161		buf_busycount++;
4162
4163#ifdef JOE_DEBUG
4164		bp->b_owner = current_thread();
4165		bp->b_tag   = 10;
4166#endif
4167
4168		lck_mtx_unlock(buf_mtxp);
4169		/*
4170		 * do the IO
4171		 */
4172		error = bawrite_internal(bp, 0);
4173
4174		if (error) {
4175		        bp->b_whichq = BQ_LAUNDRY;
4176			bp->b_timestamp = buf_timestamp();
4177
4178		        lck_mtx_lock_spin(buf_mtxp);
4179
4180			binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4181			blaundrycnt++;
4182
4183			/* we never leave a busy page on the laundry queue */
4184			CLR(bp->b_lflags, BL_BUSY);
4185			buf_busycount--;
4186#ifdef JOE_DEBUG
4187			bp->b_owner = current_thread();
4188			bp->b_tag   = 11;
4189#endif
4190
4191			lck_mtx_unlock(buf_mtxp);
4192
4193			if (loopcnt > MAXLAUNDRY) {
4194				/*
4195				 * bawrite_internal() can return errors if we're throttled. If we've
4196				 * done several I/Os and failed, give the system some time to unthrottle
4197				 * the vnode
4198				 */
4199				(void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4200				loopcnt = 0;
4201			} else {
4202				/* give other threads a chance to run */
4203				(void)thread_block(THREAD_CONTINUE_NULL);
4204				loopcnt++;
4205			}
4206		}
4207	}
4208}
4209
4210
4211static int
4212brecover_data(buf_t bp)
4213{
4214	int	upl_offset;
4215        upl_t	upl;
4216	upl_page_info_t *pl;
4217	kern_return_t kret;
4218	vnode_t	vp = bp->b_vp;
4219	int upl_flags;
4220
4221
4222	if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
4223	        goto dump_buffer;
4224
4225	upl_flags = UPL_PRECIOUS;
4226	if (! (buf_flags(bp) & B_READ)) {
4227		/*
4228		 * "write" operation:  let the UPL subsystem know
4229		 * that we intend to modify the buffer cache pages we're
4230		 * gathering.
4231		 */
4232		upl_flags |= UPL_WILL_MODIFY;
4233	}
4234
4235	kret = ubc_create_upl(vp,
4236			      ubc_blktooff(vp, bp->b_lblkno),
4237			      bp->b_bufsize,
4238			      &upl,
4239			      &pl,
4240			      upl_flags);
4241	if (kret != KERN_SUCCESS)
4242	        panic("Failed to create UPL");
4243
4244	for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4245
4246	        if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4247		        ubc_upl_abort(upl, 0);
4248			goto dump_buffer;
4249		}
4250	}
4251	bp->b_upl = upl;
4252
4253	kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4254
4255	if (kret != KERN_SUCCESS)
4256	        panic("getblk: ubc_upl_map() failed with (%d)", kret);
4257	return (1);
4258
4259dump_buffer:
4260	bp->b_bufsize = 0;
4261	SET(bp->b_flags, B_INVAL);
4262	buf_brelse(bp);
4263
4264	return(0);
4265}
4266
4267boolean_t
4268buffer_cache_gc(int all)
4269{
4270	buf_t bp;
4271	boolean_t did_large_zfree = FALSE;
4272	boolean_t need_wakeup = FALSE;
4273	int now = buf_timestamp();
4274	uint32_t found = 0;
4275	struct bqueues privq;
4276	int thresh_hold = BUF_STALE_THRESHHOLD;
4277
4278	if (all)
4279		thresh_hold = 0;
4280	/*
4281	 * We only care about metadata (incore storage comes from zalloc()).
	 * Unless "all" is set (used to evict metadata buffers in preparation
	 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
	 * that have not been accessed in the last 30s. This limit controls both
	 * the hold time of the global lock "buf_mtxp" and the length of time
	 * we spend compute bound in the GC thread which calls this function.
4287	 */
4288	lck_mtx_lock(buf_mtxp);
4289
4290	do {
4291		found = 0;
4292		TAILQ_INIT(&privq);
4293		need_wakeup = FALSE;
4294
4295		while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4296				(now > bp->b_timestamp) &&
4297				(now - bp->b_timestamp > thresh_hold) &&
4298				(found < BUF_MAX_GC_BATCH_SIZE)) {
4299
4300			/* Remove from free list */
4301			bremfree_locked(bp);
4302			found++;
4303
4304#ifdef JOE_DEBUG
4305			bp->b_owner = current_thread();
4306			bp->b_tag   = 12;
4307#endif
4308
4309			/* If dirty, move to laundry queue and remember to do wakeup */
4310			if (ISSET(bp->b_flags, B_DELWRI)) {
4311				SET(bp->b_lflags, BL_WANTDEALLOC);
4312
4313				bmovelaundry(bp);
4314				need_wakeup = TRUE;
4315
4316				continue;
4317			}
4318
4319			/*
4320			 * Mark busy and put on private list.  We could technically get
4321			 * away without setting BL_BUSY here.
4322			 */
4323			SET(bp->b_lflags, BL_BUSY);
4324			buf_busycount++;
4325
4326			/*
4327			 * Remove from hash and dissociate from vp.
4328			 */
4329			bremhash(bp);
4330			if (bp->b_vp) {
4331				brelvp_locked(bp);
4332			}
4333
4334			TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4335		}
4336
4337		if (found == 0) {
4338			break;
4339		}
4340
4341		/* Drop lock for batch processing */
4342		lck_mtx_unlock(buf_mtxp);
4343
4344		/* Wakeup and yield for laundry if need be */
4345		if (need_wakeup) {
4346			wakeup(&bufqueues[BQ_LAUNDRY]);
4347			(void)thread_block(THREAD_CONTINUE_NULL);
4348		}
4349
4350		/* Clean up every buffer on private list */
4351		TAILQ_FOREACH(bp, &privq, b_freelist) {
4352			/* Take note if we've definitely freed at least a page to a zone */
4353			if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4354				did_large_zfree = TRUE;
4355			}
4356
4357			trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4358
4359			/* Free Storage */
4360			buf_free_meta_store(bp);
4361
4362			/* Release credentials */
4363			buf_release_credentials(bp);
4364
4365			/* Prepare for moving to empty queue */
4366			CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4367						| B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4368			bp->b_whichq = BQ_EMPTY;
4369			BLISTNONE(bp);
4370		}
4371		lck_mtx_lock(buf_mtxp);
4372
4373		/* Back under lock, move them all to invalid hash and clear busy */
4374		TAILQ_FOREACH(bp, &privq, b_freelist) {
4375			binshash(bp, &invalhash);
4376			CLR(bp->b_lflags, BL_BUSY);
4377			buf_busycount--;
4378
4379#ifdef JOE_DEBUG
4380			if (bp->b_owner != current_thread()) {
4381				panic("Buffer stolen from buffer_cache_gc()");
4382			}
4383			bp->b_owner = current_thread();
4384			bp->b_tag   = 13;
4385#endif
4386		}
4387
4388		/* And do a big bulk move to the empty queue */
4389		TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4390
4391	} while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4392
4393	lck_mtx_unlock(buf_mtxp);
4394
4395	return did_large_zfree;
4396}
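
/*
 * Illustrative only: a sketch of how a caller might drive
 * buffer_cache_gc().  The surrounding function is hypothetical;
 * per the comment above, 'all' requests eviction of every eligible
 * metadata buffer (e.g. in preparation for deep sleep), while 0
 * asks for an ordinary incremental pass.
 */
#if 0	/* example only -- not compiled */
static void
example_collect(int evict_all)
{
	/*
	 * buffer_cache_gc() returns TRUE when at least a page worth of
	 * zalloc()'d metadata storage was freed, which a caller could
	 * use to decide whether a zone garbage collection is worthwhile.
	 */
	if (buffer_cache_gc(evict_all)) {
		/* hypothetical follow-up, e.g. trigger a zone GC */
	}
}
#endif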
4397
4398
4399/*
4400 * disabled for now
4401 */
4402
4403#if FLUSH_QUEUES
4404
4405#define NFLUSH 32
4406
static int
bp_cmp(const void *a, const void *b)
{
    buf_t bp_a = *(const buf_t *)a;
    buf_t bp_b = *(const buf_t *)b;

    // block numbers are never negative, so a simple three-way
    // comparison is all that's needed; it also avoids truncating
    // a 64-bit difference into the int return value.
    if (bp_a->b_blkno < bp_b->b_blkno)
        return (-1);
    if (bp_a->b_blkno > bp_b->b_blkno)
        return (1);
    return (0);
}
4421
4422
4423int
4424bflushq(int whichq, mount_t mp)
4425{
4426	buf_t	bp, next;
4427	int	i, buf_count;
4428	int	total_writes = 0;
4429	static buf_t flush_table[NFLUSH];
4430
4431	if (whichq < 0 || whichq >= BQUEUES) {
4432	    return (0);
4433	}
4434
4435  restart:
4436	lck_mtx_lock(buf_mtxp);
4437
4438	bp = TAILQ_FIRST(&bufqueues[whichq]);
4439
4440	for (buf_count = 0; bp; bp = next) {
4441	    next = bp->b_freelist.tqe_next;
4442
4443	    if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4444		continue;
4445	    }
4446
4447	    if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4448
4449		bremfree_locked(bp);
4450#ifdef JOE_DEBUG
4451		bp->b_owner = current_thread();
4452		bp->b_tag   = 7;
4453#endif
4454		SET(bp->b_lflags, BL_BUSY);
4455		buf_busycount++;
4456
4457		flush_table[buf_count] = bp;
4458		buf_count++;
4459		total_writes++;
4460
4461		if (buf_count >= NFLUSH) {
4462		    lck_mtx_unlock(buf_mtxp);
4463
4464		    qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4465
4466		    for (i = 0; i < buf_count; i++) {
4467			buf_bawrite(flush_table[i]);
4468		    }
4469		    goto restart;
4470		}
4471	    }
4472	}
4473	lck_mtx_unlock(buf_mtxp);
4474
4475	if (buf_count > 0) {
4476	    qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4477
4478	    for (i = 0; i < buf_count; i++) {
4479		buf_bawrite(flush_table[i]);
4480	    }
4481	}
4482
4483	return (total_writes);
4484}
4485#endif
4486
4487
4488#if BALANCE_QUEUES
4489
4490/* XXX move this to a separate file */
4491
4492/*
4493 * NOTE: THIS CODE HAS NOT BEEN UPDATED
4494 * WITH RESPECT TO THE NEW LOCKING MODEL
4495 */
4496
4497
4498/*
4499 * Dynamic Scaling of the Buffer Queues
4500 */
4501
4502typedef long long blsize_t;
4503
4504blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
4505/* Global tunable limits */
4506blsize_t nbufh;			/* number of buffer headers */
4507blsize_t nbuflow;		/* minimum number of buffer headers required */
4508blsize_t nbufhigh;		/* maximum number of buffer headers allowed */
4509blsize_t nbuftarget;	/* preferred number of buffer headers */
4510
4511/*
4512 * assertions:
4513 *
4514 * 1.	0 < nbuflow <= nbufh <= nbufhigh
4515 * 2.	nbufhigh <= MAXNBUF
4516 * 3.	0 < nbuflow <= nbuftarget <= nbufhigh
4517 * 4.	nbufh can not be set by sysctl().
4518 */
4519
4520/* Per queue tunable limits */
4521
4522struct bufqlim {
4523	blsize_t	bl_nlow;	/* minimum number of buffer headers required */
4524	blsize_t	bl_num;		/* number of buffer headers on the queue */
4525	blsize_t	bl_nlhigh;	/* maximum number of buffer headers allowed */
4526	blsize_t	bl_target;	/* preferred number of buffer headers */
4527	long	bl_stale;	/* Seconds after which a buffer is considered stale */
4528} bufqlim[BQUEUES];
4529
4530/*
4531 * assertions:
4532 *
4533 * 1.	0 <= bl_nlow <= bl_num <= bl_nlhigh
4534 * 2.	bl_nlhigh <= MAXNBUF
4535 * 3.  bufqlim[BQ_META].bl_nlow != 0
4536 * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
4537 *									file system IO operations)
4538 * 5.	bl_num can not be set by sysctl().
 * 6.	bl_nlhigh <= nbufhigh
4540 */
4541
/*
 * Rationale:
 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported to user space by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity instead.
 * This makes sure that we will not be required to change it
 * as long as we do not exceed the 64 bit address space of the kernel.
 *
 * The low and high limits are initialized at compile time and
 * boot arguments can be used to override them; sysctl() does not
 * change them. sysctl() can read all of the values but can set
 * only the target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 *	Keep enough bufs on BQ_EMPTY.
 *		getnewbuf() by default will always select a buffer from BQ_EMPTY.
 *		getnewbuf() performs best if a buffer is found there.
 *		Also this minimizes the possibility of starting I/O
 *		from getnewbuf(). That's a performance win, too.
 *
 *	Localize complex logic [balancing as well as time aging]
 *		to balancebufq().
 *
 *	Simplify getnewbuf() logic by eliminating the time aging code.
 */
4570
4571/*
4572 * Algorithm:
4573 * -----------
 * The goal of the dynamic scaling of the buffer queues is to keep
4575 * the size of the LRU close to bl_target. Buffers on a queue would
4576 * be time aged.
4577 *
4578 * There would be a thread which will be responsible for "balancing"
4579 * the buffer cache queues.
4580 *
4581 * The scan order would be:	AGE, LRU, META, EMPTY.
4582 */
4583
4584long bufqscanwait = 0;
4585
4586static void bufqscan_thread();
4587static int balancebufq(int q);
4588static int btrimempty(int n);
4589static __inline__ int initbufqscan(void);
4590static __inline__ int nextbufq(int q);
4591static void buqlimprt(int all);
4592
4593
4594static __inline__ void
4595bufqinc(int q)
4596{
4597	if ((q < 0) || (q >= BQUEUES))
4598		return;
4599
4600	bufqlim[q].bl_num++;
4601	return;
4602}
4603
4604static __inline__ void
4605bufqdec(int q)
4606{
4607	if ((q < 0) || (q >= BQUEUES))
4608		return;
4609
4610	bufqlim[q].bl_num--;
4611	return;
4612}
4613
4614static void
4615bufq_balance_thread_init(void)
4616{
4617	thread_t	thread = THREAD_NULL;
4618
4619	if (bufqscanwait++ == 0) {
4620
4621		/* Initalize globals */
4622		MAXNBUF = (sane_size / PAGE_SIZE);
4623		nbufh = nbuf_headers;
4624		nbuflow = min(nbufh, 100);
4625		nbufhigh = min(MAXNBUF, max(nbufh, 2048));
4626		nbuftarget = (sane_size >> 5) / PAGE_SIZE;
4627		nbuftarget = max(nbuflow, nbuftarget);
4628		nbuftarget = min(nbufhigh, nbuftarget);
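		/*
		 * e.g. with sane_size = 4GB and 4KB pages, the initial
		 * target is (4GB >> 5) / 4KB = 128MB / 4KB = 32768 headers,
		 * which is then clamped into the [nbuflow, nbufhigh] range
		 * by the two lines above.
		 */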
4629
4630		/*
4631		 * Initialize the bufqlim
4632		 */
4633
4634		/* LOCKED queue */
4635		bufqlim[BQ_LOCKED].bl_nlow = 0;
4636		bufqlim[BQ_LOCKED].bl_nlhigh = 32;
4637		bufqlim[BQ_LOCKED].bl_target = 0;
4638		bufqlim[BQ_LOCKED].bl_stale = 30;
4639
4640		/* LRU queue */
4641		bufqlim[BQ_LRU].bl_nlow = 0;
4642		bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
4643		bufqlim[BQ_LRU].bl_target = nbuftarget/4;
4644		bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
4645
4646		/* AGE queue */
4647		bufqlim[BQ_AGE].bl_nlow = 0;
4648		bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
4649		bufqlim[BQ_AGE].bl_target = nbuftarget/4;
4650		bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
4651
4652		/* EMPTY queue */
4653		bufqlim[BQ_EMPTY].bl_nlow = 0;
4654		bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
4655		bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
4656		bufqlim[BQ_EMPTY].bl_stale = 600000;
4657
4658		/* META queue */
4659		bufqlim[BQ_META].bl_nlow = 0;
4660		bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
4661		bufqlim[BQ_META].bl_target = nbuftarget/4;
4662		bufqlim[BQ_META].bl_stale = META_IS_STALE;
4663
		/* LAUNDRY queue */
		bufqlim[BQ_LAUNDRY].bl_nlow = 0;
		bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
		bufqlim[BQ_LAUNDRY].bl_target = 0;
		bufqlim[BQ_LAUNDRY].bl_stale = 30;
4669
4670		buqlimprt(1);
4671	}
4672
4673	/* create worker thread */
4674	kernel_thread_start((thread_continue_t)bufqscan_thread, NULL, &thread);
4675	thread_deallocate(thread);
4676}
4677
4678/* The workloop for the buffer balancing thread */
4679static void
4680bufqscan_thread()
4681{
4682	int moretodo = 0;
4683
4684	for(;;) {
4685		do {
4686			int q;	/* buffer queue to process */
4687
4688			q = initbufqscan();
4689			for (; q; ) {
4690				moretodo |= balancebufq(q);
4691				q = nextbufq(q);
4692			}
4693		} while (moretodo);
4694
4695#if DIAGNOSTIC
4696		vfs_bufstats();
4697		buqlimprt(0);
4698#endif
4699		(void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
4700		moretodo = 0;
4701	}
4702}
4703
4704/* Seed for the buffer queue balancing */
4705static __inline__ int
4706initbufqscan()
4707{
4708	/* Start with AGE queue */
4709	return (BQ_AGE);
4710}
4711
4712/* Pick next buffer queue to balance */
static __inline__ int
nextbufq(int q)
{
	static const int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
	unsigned int i;

	/*
	 * Find the current queue in the scan order and return the queue
	 * that follows it; the terminating 0 entry ends the scan.
	 */
	for (i = 0; i < (sizeof(order) / sizeof(order[0])) - 1; i++) {
		if (order[i] == q)
			return (order[i + 1]);
	}
	return (0);
}
4722
4723/* function to balance the buffer queues */
4724static int
4725balancebufq(int q)
4726{
4727	int moretodo = 0;
4728	int n, t;
4729
4730	/* reject invalid q */
4731	if ((q < 0) || (q >= BQUEUES))
4732		goto out;
4733
4734	/* LOCKED or LAUNDRY queue MUST not be balanced */
4735	if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
4736		goto out;
4737
4738	n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
4739
4740	/* If queue has less than target nothing more to do */
4741	if (n < 0)
4742		goto out;
4743
4744	if ( n > 8 ) {
4745		/* Balance only a small amount (12.5%) at a time */
4746		n >>= 3;
4747	}
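	/*
	 * e.g. if bl_num = 200 and bl_target = 100, n starts at 100;
	 * since 100 > 8 it is reduced to 100 >> 3 = 12, so at most a
	 * dozen stale buffers are reclaimed from this queue per pass.
	 */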
4748
4749	/* EMPTY queue needs special handling */
4750	if (q == BQ_EMPTY) {
4751		moretodo |= btrimempty(n);
4752		goto out;
4753	}
4754
	t = buf_timestamp();
4756
4757	for (; n > 0; n--) {
4758		struct buf *bp = bufqueues[q].tqh_first;
4759		if (!bp)
4760			break;
4761
4762		/* check if it's stale */
4763		if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
4764			if (bcleanbuf(bp, FALSE)) {
4765				/* buf_bawrite() issued, bp not ready */
4766				moretodo = 1;
4767			} else {
4768				/* release the cleaned buffer to BQ_EMPTY */
4769				SET(bp->b_flags, B_INVAL);
4770				buf_brelse(bp);
4771			}
4772		} else
4773			break;
4774	}
4775
4776out:
4777	return (moretodo);
4778}
4779
4780static int
4781btrimempty(int n)
4782{
	/*
	 * When struct bufs are allocated dynamically, this would
	 * reclaim up to 'n' struct bufs from the empty queue.
	 */
4787
4788	 return (0);
4789}
4790
4791static void
4792buqlimprt(int all)
4793{
4794	int i;
	static const char *bname[BQUEUES] =
		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4797
4798	if (all)
4799		for (i = 0; i < BQUEUES; i++) {
4800			printf("%s : ", bname[i]);
4801			printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
4802			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4803			printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
4804			printf("target = %ld, ", (long)bufqlim[i].bl_target);
4805			printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
4806		}
4807	else
4808		for (i = 0; i < BQUEUES; i++) {
4809			printf("%s : ", bname[i]);
4810			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4811		}
4812}
4813
4814#endif
4815
4816
4817