1/*	$NetBSD: vfs_bio.c,v 1.227 2011/01/17 07:13:32 uebayasi Exp $	*/
2
3/*-
4 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1982, 1986, 1989, 1993
34 *	The Regents of the University of California.  All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 *    notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 *    notice, this list of conditions and the following disclaimer in the
48 *    documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
66 */
67
68/*-
69 * Copyright (c) 1994 Christopher G. Demetriou
70 *
71 * Redistribution and use in source and binary forms, with or without
72 * modification, are permitted provided that the following conditions
73 * are met:
74 * 1. Redistributions of source code must retain the above copyright
75 *    notice, this list of conditions and the following disclaimer.
76 * 2. Redistributions in binary form must reproduce the above copyright
77 *    notice, this list of conditions and the following disclaimer in the
78 *    documentation and/or other materials provided with the distribution.
79 * 3. All advertising materials mentioning features or use of this software
80 *    must display the following acknowledgement:
81 *	This product includes software developed by the University of
82 *	California, Berkeley and its contributors.
83 * 4. Neither the name of the University nor the names of its contributors
84 *    may be used to endorse or promote products derived from this software
85 *    without specific prior written permission.
86 *
87 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
88 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
89 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
90 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
91 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
92 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
93 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
94 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
95 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
96 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
97 * SUCH DAMAGE.
98 *
99 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
100 */
101
102/*
103 * The buffer cache subsystem.
104 *
105 * Some references:
106 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
107 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
108 *		UNIX Operating System (Addison-Wesley, 1989)
109 *
110 * Locking
111 *
112 * There are three locks:
113 * - bufcache_lock: protects global buffer cache state.
114 * - BC_BUSY: a long term per-buffer lock.
115 * - buf_t::b_objlock: lock on completion (biowait vs biodone).
116 *
117 * For buffers associated with vnodes (the most common case), b_objlock points
118 * to vnode_t::v_interlock.  Otherwise, it points to the generic buffer_lock.
119 *
120 * Lock order:
121 *	bufcache_lock ->
122 *		buf_t::b_objlock
123 */
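
/*
 * Illustrative sketch (not part of the original source): a caller that
 * needs both locks respects this order by taking bufcache_lock first and
 * the per-buffer b_objlock second, as bwrite() does further down:
 *
 *	mutex_enter(&bufcache_lock);
 *	mutex_enter(bp->b_objlock);
 *	... update BO_* flags, reassignbuf(), etc ...
 *	mutex_exit(&bufcache_lock);
 *	mutex_exit(bp->b_objlock);
 */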
124
125#include <sys/cdefs.h>
126__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.227 2011/01/17 07:13:32 uebayasi Exp $");
127
128#include "opt_bufcache.h"
129
130#include <sys/param.h>
131#include <sys/systm.h>
132#include <sys/kernel.h>
133#include <sys/proc.h>
134#include <sys/buf.h>
135#include <sys/vnode.h>
136#include <sys/mount.h>
137#include <sys/resourcevar.h>
138#include <sys/sysctl.h>
139#include <sys/conf.h>
140#include <sys/kauth.h>
141#include <sys/fstrans.h>
142#include <sys/intr.h>
143#include <sys/cpu.h>
144#include <sys/wapbl.h>
145
146#include <uvm/uvm.h>	/* extern struct uvm uvm */
147
148#include <miscfs/specfs/specdev.h>
149
150#ifndef	BUFPAGES
151# define BUFPAGES 0
152#endif
153
154#ifdef BUFCACHE
155# if (BUFCACHE < 5) || (BUFCACHE > 95)
156#  error BUFCACHE is not between 5 and 95
157# endif
158#else
159# define BUFCACHE 15
160#endif
161
162u_int	nbuf;			/* desired number of buffer headers */
163u_int	bufpages = BUFPAGES;	/* optional hardwired count */
164u_int	bufcache = BUFCACHE;	/* max % of RAM to use for buffer cache */
165
166/* Function prototypes */
167struct bqueue;
168
169static void buf_setwm(void);
170static int buf_trim(void);
171static void *bufpool_page_alloc(struct pool *, int);
172static void bufpool_page_free(struct pool *, void *);
173static buf_t *bio_doread(struct vnode *, daddr_t, int,
174    kauth_cred_t, int);
175static buf_t *getnewbuf(int, int, int);
176static int buf_lotsfree(void);
177static int buf_canrelease(void);
178static u_long buf_mempoolidx(u_long);
179static u_long buf_roundsize(u_long);
180static void *buf_malloc(size_t);
181static void buf_mrelease(void *, size_t);
182static void binsheadfree(buf_t *, struct bqueue *);
183static void binstailfree(buf_t *, struct bqueue *);
184int count_lock_queue(void); /* XXX */
185#ifdef DEBUG
186static int checkfreelist(buf_t *, struct bqueue *, int);
187#endif
188static void biointr(void *);
189static void biodone2(buf_t *);
190static void bref(buf_t *);
191static void brele(buf_t *);
192static void sysctl_kern_buf_setup(void);
193static void sysctl_vm_buf_setup(void);
194
195/*
196 * Definitions for the buffer hash lists.
197 */
198#define	BUFHASH(dvp, lbn)	\
199	(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
200LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
201u_long	bufhash;
202struct bqueue bufqueues[BQUEUES];
203
204static kcondvar_t needbuffer_cv;
205
206/*
207 * Buffer queue lock.
208 */
209kmutex_t bufcache_lock;
210kmutex_t buffer_lock;
211
212/* Software ISR for completed transfers. */
213static void *biodone_sih;
214
215/* Buffer pool for I/O buffers. */
216static pool_cache_t buf_cache;
217static pool_cache_t bufio_cache;
218
219/* XXX - somewhat gross.. */
220#if MAXBSIZE == 0x2000
221#define NMEMPOOLS 5
222#elif MAXBSIZE == 0x4000
223#define NMEMPOOLS 6
224#elif MAXBSIZE == 0x8000
225#define NMEMPOOLS 7
226#else
227#define NMEMPOOLS 8
228#endif
229
230#define MEMPOOL_INDEX_OFFSET 9	/* smallest pool is 512 bytes */
231#if (1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) != MAXBSIZE
232#error update vfs_bio buffer memory parameters
233#endif
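
/*
 * For example, with MEMPOOL_INDEX_OFFSET == 9 the smallest pool serves
 * 512-byte buffers and each successive pool doubles that, so the
 * NMEMPOOLS == 8 case (MAXBSIZE == 0x10000) provides pools of
 * 512, 1k, 2k, 4k, 8k, 16k, 32k and 64k bytes.
 */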
234
235/* Buffer memory pools */
236static struct pool bmempools[NMEMPOOLS];
237
238static struct vm_map *buf_map;
239
240/*
241 * Buffer memory pool allocator.
242 */
243static void *
244bufpool_page_alloc(struct pool *pp, int flags)
245{
246
247	return (void *)uvm_km_alloc(buf_map,
248	    MAXBSIZE, MAXBSIZE,
249	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
250	    | UVM_KMF_WIRED);
251}
252
253static void
254bufpool_page_free(struct pool *pp, void *v)
255{
256
257	uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
258}
259
260static struct pool_allocator bufmempool_allocator = {
261	.pa_alloc = bufpool_page_alloc,
262	.pa_free = bufpool_page_free,
263	.pa_pagesz = MAXBSIZE,
264};
265
266/* Buffer memory management variables */
267u_long bufmem_valimit;
268u_long bufmem_hiwater;
269u_long bufmem_lowater;
270u_long bufmem;
271
272/*
273 * MD code can call this to set a hard limit on the amount
274 * of virtual memory used by the buffer cache.
275 */
276int
277buf_setvalimit(vsize_t sz)
278{
279
280	/* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
281	if (sz < NMEMPOOLS * MAXBSIZE)
282		return EINVAL;
283
284	bufmem_valimit = sz;
285	return 0;
286}
287
288static void
289buf_setwm(void)
290{
291
292	bufmem_hiwater = buf_memcalc();
293	/* lowater is approx. 2% of memory (with bufcache = 15) */
294#define	BUFMEM_WMSHIFT	3
295#define	BUFMEM_HIWMMIN	(64 * 1024 << BUFMEM_WMSHIFT)
296	if (bufmem_hiwater < BUFMEM_HIWMMIN)
297		/* Ensure a reasonable minimum value */
298		bufmem_hiwater = BUFMEM_HIWMMIN;
299	bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
300}
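
/*
 * Rough worked example (assuming calc_cache_size() is not clipped by
 * KVA limits): on a 1 GB machine with the default bufcache of 15,
 * buf_memcalc() gives a bufmem_hiwater of roughly 150 MB, and shifting
 * by BUFMEM_WMSHIFT puts bufmem_lowater near 19 MB, i.e. the
 * "approx. 2% of memory" mentioned above.
 */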
301
302#ifdef DEBUG
303int debug_verify_freelist = 0;
304static int
305checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
306{
307	buf_t *b;
308
309	if (!debug_verify_freelist)
310		return 1;
311
312	TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
313		if (b == bp)
314			return ison ? 1 : 0;
315	}
316
317	return ison ? 0 : 1;
318}
319#endif
320
321/*
322 * Insq/Remq for the buffer hash lists.
323 * Call with buffer queue locked.
324 */
325static void
326binsheadfree(buf_t *bp, struct bqueue *dp)
327{
328
329	KASSERT(mutex_owned(&bufcache_lock));
330	KASSERT(bp->b_freelistindex == -1);
331	TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
332	dp->bq_bytes += bp->b_bufsize;
333	bp->b_freelistindex = dp - bufqueues;
334}
335
336static void
337binstailfree(buf_t *bp, struct bqueue *dp)
338{
339
340	KASSERT(mutex_owned(&bufcache_lock));
341	KASSERT(bp->b_freelistindex == -1);
342	TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
343	dp->bq_bytes += bp->b_bufsize;
344	bp->b_freelistindex = dp - bufqueues;
345}
346
347void
348bremfree(buf_t *bp)
349{
350	struct bqueue *dp;
351	int bqidx = bp->b_freelistindex;
352
353	KASSERT(mutex_owned(&bufcache_lock));
354
355	KASSERT(bqidx != -1);
356	dp = &bufqueues[bqidx];
357	KDASSERT(checkfreelist(bp, dp, 1));
358	KASSERT(dp->bq_bytes >= bp->b_bufsize);
359	TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
360	dp->bq_bytes -= bp->b_bufsize;
361
362	/* For the sysctl helper. */
363	if (bp == dp->bq_marker)
364		dp->bq_marker = NULL;
365
366#if defined(DIAGNOSTIC)
367	bp->b_freelistindex = -1;
368#endif /* defined(DIAGNOSTIC) */
369}
370
371/*
372 * Add a reference to a buffer structure that came from buf_cache.
373 */
374static inline void
375bref(buf_t *bp)
376{
377
378	KASSERT(mutex_owned(&bufcache_lock));
379	KASSERT(bp->b_refcnt > 0);
380
381	bp->b_refcnt++;
382}
383
384/*
385 * Free an unused buffer structure that came from buf_cache.
386 */
387static inline void
388brele(buf_t *bp)
389{
390
391	KASSERT(mutex_owned(&bufcache_lock));
392	KASSERT(bp->b_refcnt > 0);
393
394	if (bp->b_refcnt-- == 1) {
395		buf_destroy(bp);
396#ifdef DEBUG
397		memset((char *)bp, 0, sizeof(*bp));
398#endif
399		pool_cache_put(buf_cache, bp);
400	}
401}
402
403/*
404 * Note that for some ports this is used by pmap bootstrap code to
405 * determine the KVA size.
406 */
407u_long
408buf_memcalc(void)
409{
410	u_long n;
411
412	/*
413	 * Determine the upper bound of memory to use for buffers.
414	 *
415 *	- If bufpages is specified, use that as the number of
416 *	  pages.
417	 *
418	 *	- Otherwise, use bufcache as the percentage of
419	 *	  physical memory.
420	 */
421	if (bufpages != 0) {
422		n = bufpages;
423	} else {
424		if (bufcache < 5) {
425			printf("forcing bufcache %d -> 5", bufcache);
426			bufcache = 5;
427		}
428		if (bufcache > 95) {
429			printf("forcing bufcache %d -> 95", bufcache);
430			bufcache = 95;
431		}
432		n = calc_cache_size(buf_map, bufcache,
433		    (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
434		    / PAGE_SIZE;
435	}
436
437	n <<= PAGE_SHIFT;
438	if (bufmem_valimit != 0 && n > bufmem_valimit)
439		n = bufmem_valimit;
440
441	return (n);
442}
443
444/*
445 * Initialize buffers and hash links for buffers.
446 */
447void
448bufinit(void)
449{
450	struct bqueue *dp;
451	int use_std;
452	u_int i;
453
454	mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
455	mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
456	cv_init(&needbuffer_cv, "needbuf");
457
458	if (bufmem_valimit != 0) {
459		vaddr_t minaddr = 0, maxaddr;
460		buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
461					  bufmem_valimit, 0, false, 0);
462		if (buf_map == NULL)
463			panic("bufinit: cannot allocate submap");
464	} else
465		buf_map = kernel_map;
466
467	/*
468	 * Initialize buffer cache memory parameters.
469	 */
470	bufmem = 0;
471	buf_setwm();
472
473	/* On "small" machines use small pool page sizes where possible */
474	use_std = (physmem < atop(16*1024*1024));
475
476	/*
477	 * Also use them on systems that can map the pool pages using
478	 * a direct-mapped segment.
479	 */
480#ifdef PMAP_MAP_POOLPAGE
481	use_std = 1;
482#endif
483
484	buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
485	    "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
486	bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
487	    "biopl", NULL, IPL_BIO, NULL, NULL, NULL);
488
489	bufmempool_allocator.pa_backingmap = buf_map;
490	for (i = 0; i < NMEMPOOLS; i++) {
491		struct pool_allocator *pa;
492		struct pool *pp = &bmempools[i];
493		u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
494		char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
495		if (__predict_true(size >= 1024))
496			(void)snprintf(name, 8, "buf%dk", size / 1024);
497		else
498			(void)snprintf(name, 8, "buf%db", size);
499		pa = (size <= PAGE_SIZE && use_std)
500			? &pool_allocator_nointr
501			: &bufmempool_allocator;
502		pool_init(pp, size, 0, 0, 0, name, pa, IPL_NONE);
503		pool_setlowat(pp, 1);
504		pool_sethiwat(pp, 1);
505	}
506
507	/* Initialize the buffer queues */
508	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
509		TAILQ_INIT(&dp->bq_queue);
510		dp->bq_bytes = 0;
511	}
512
513	/*
514	 * Estimate hash table size based on the amount of memory we
515	 * intend to use for the buffer cache. The average buffer
516	 * size is dependent on our clients (i.e. filesystems).
517	 *
518	 * For now, use an empirical 3K per buffer.
519	 */
520	nbuf = (bufmem_hiwater / 1024) / 3;
521	bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
522
523	sysctl_kern_buf_setup();
524	sysctl_vm_buf_setup();
525}
526
527void
528bufinit2(void)
529{
530
531	biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
532	    NULL);
533	if (biodone_sih == NULL)
534		panic("bufinit2: can't establish soft interrupt");
535}
536
537static int
538buf_lotsfree(void)
539{
540	int try, thresh;
541
542	/* Always allocate if less than the low water mark. */
543	if (bufmem < bufmem_lowater)
544		return 1;
545
546	/* Never allocate if greater than the high water mark. */
547	if (bufmem > bufmem_hiwater)
548		return 0;
549
550	/* If there's anything on the AGE list, it should be eaten. */
551	if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
552		return 0;
553
554	/*
555	 * The probability of getting a new allocation is inversely
556	 * proportional to the current size of the cache, using
557	 * a granularity of 16 steps.
558	 */
559	try = random() & 0x0000000fL;
560
561	/* Don't use "16 * bufmem" here to avoid a 32-bit overflow. */
562	thresh = (bufmem - bufmem_lowater) /
563	    ((bufmem_hiwater - bufmem_lowater) / 16);
564
565	if (try >= thresh)
566		return 1;
567
568	/* Otherwise don't allocate. */
569	return 0;
570}
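
/*
 * Worked example: with bufmem exactly halfway between the low and high
 * water marks, thresh evaluates to 8, so the random "try" (0..15)
 * satisfies try >= thresh about half of the time; as bufmem approaches
 * bufmem_hiwater, thresh approaches 16 and fresh allocations become
 * correspondingly rare.
 */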
571
572/*
573 * Return estimate of bytes we think need to be
574 * released to help resolve low memory conditions.
575 *
576 * => called with bufcache_lock held.
577 */
578static int
579buf_canrelease(void)
580{
581	int pagedemand, ninvalid = 0;
582
583	KASSERT(mutex_owned(&bufcache_lock));
584
585	if (bufmem < bufmem_lowater)
586		return 0;
587
588	if (bufmem > bufmem_hiwater)
589		return bufmem - bufmem_hiwater;
590
591	ninvalid += bufqueues[BQ_AGE].bq_bytes;
592
593	pagedemand = uvmexp.freetarg - uvmexp.free;
594	if (pagedemand < 0)
595		return ninvalid;
596	return MAX(ninvalid, MIN(2 * MAXBSIZE,
597	    MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
598}
599
600/*
601 * Buffer memory allocation helper functions
602 */
603static u_long
604buf_mempoolidx(u_long size)
605{
606	u_int n = 0;
607
608	size -= 1;
609	size >>= MEMPOOL_INDEX_OFFSET;
610	while (size) {
611		size >>= 1;
612		n += 1;
613	}
614	if (n >= NMEMPOOLS)
615		panic("buf mem pool index %d", n);
616	return n;
617}
618
619static u_long
620buf_roundsize(u_long size)
621{
622	/* Round up to nearest power of 2 */
623	return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
624}
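
/*
 * Worked example: a request for 3000 bytes becomes 2999 after the
 * decrement, shifts down to 5 (2999 >> 9) and needs three more right
 * shifts to reach zero, so buf_mempoolidx() returns 3 and
 * buf_roundsize() rounds the request up to 1 << (3 + 9) == 4096 bytes.
 */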
625
626static void *
627buf_malloc(size_t size)
628{
629	u_int n = buf_mempoolidx(size);
630	void *addr;
631
632	while (1) {
633		addr = pool_get(&bmempools[n], PR_NOWAIT);
634		if (addr != NULL)
635			break;
636
637		/* No memory, see if we can free some. If so, try again */
638		mutex_enter(&bufcache_lock);
639		if (buf_drain(1) > 0) {
640			mutex_exit(&bufcache_lock);
641			continue;
642		}
643
644		if (curlwp == uvm.pagedaemon_lwp) {
645			mutex_exit(&bufcache_lock);
646			return NULL;
647		}
648
649		/* Wait for buffers to arrive on the LRU queue */
650		cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
651		mutex_exit(&bufcache_lock);
652	}
653
654	return addr;
655}
656
657static void
658buf_mrelease(void *addr, size_t size)
659{
660
661	pool_put(&bmempools[buf_mempoolidx(size)], addr);
662}
663
664/*
665 * bread()/breadn() helper.
666 */
667static buf_t *
668bio_doread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
669    int async)
670{
671	buf_t *bp;
672	struct mount *mp;
673
674	bp = getblk(vp, blkno, size, 0, 0);
675
676#ifdef DIAGNOSTIC
677	if (bp == NULL) {
678		panic("bio_doread: no such buf");
679	}
680#endif
681
682	/*
683	 * If buffer does not have data valid, start a read.
684	 * Note that if buffer is BC_INVAL, getblk() won't return it.
685	 * Therefore, it's valid if its I/O has completed or been delayed.
686	 */
687	if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
688		/* Start I/O for the buffer. */
689		SET(bp->b_flags, B_READ | async);
690		if (async)
691			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
692		else
693			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
694		VOP_STRATEGY(vp, bp);
695
696		/* Pay for the read. */
697		curlwp->l_ru.ru_inblock++;
698	} else if (async)
699		brelse(bp, 0);
700
701	if (vp->v_type == VBLK)
702		mp = vp->v_specmountpoint;
703	else
704		mp = vp->v_mount;
705
706	/*
707	 * Collect statistics on synchronous and asynchronous reads.
708	 * Reads from block devices are charged to their associated
709	 * filesystem (if any).
710	 */
711	if (mp != NULL) {
712		if (async == 0)
713			mp->mnt_stat.f_syncreads++;
714		else
715			mp->mnt_stat.f_asyncreads++;
716	}
717
718	return (bp);
719}
720
721/*
722 * Read a disk block.
723 * This algorithm is described in Bach (p.54).
724 */
725int
726bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
727    int flags, buf_t **bpp)
728{
729	buf_t *bp;
730	int error;
731
732	/* Get buffer for block. */
733	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
734
735	/* Wait for the read to complete, and return result. */
736	error = biowait(bp);
737	if (error == 0 && (flags & B_MODIFY) != 0)
738		error = fscow_run(bp, true);
739
740	return error;
741}
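
/*
 * Illustrative sketch (assumed caller, not from this file; vp, lbn,
 * bsize and cred stand in for the caller's own values): a file system
 * reading one logical block synchronously would do roughly:
 *
 *	buf_t *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, cred, 0, &bp);
 *	if (error == 0) {
 *		... consume bp->b_data ...
 *		brelse(bp, 0);
 *	}
 */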
742
743/*
744 * Read-ahead multiple disk blocks. The first is sync, the rest async.
745 * A trivial modification to the breada algorithm presented in Bach (p.55).
746 */
747int
748breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
749    int *rasizes, int nrablks, kauth_cred_t cred, int flags, buf_t **bpp)
750{
751	buf_t *bp;
752	int error, i;
753
754	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
755
756	/*
757	 * For each of the read-ahead blocks, start a read, if necessary.
758	 */
759	mutex_enter(&bufcache_lock);
760	for (i = 0; i < nrablks; i++) {
761		/* If it's in the cache, just go on to next one. */
762		if (incore(vp, rablks[i]))
763			continue;
764
765		/* Get a buffer for the read-ahead block */
766		mutex_exit(&bufcache_lock);
767		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
768		mutex_enter(&bufcache_lock);
769	}
770	mutex_exit(&bufcache_lock);
771
772	/* Otherwise, we had to start a read for it; wait until it's valid. */
773	error = biowait(bp);
774	if (error == 0 && (flags & B_MODIFY) != 0)
775		error = fscow_run(bp, true);
776	return error;
777}
778
779/*
780 * Block write.  Described in Bach (p.56)
781 */
782int
783bwrite(buf_t *bp)
784{
785	int rv, sync, wasdelayed;
786	struct vnode *vp;
787	struct mount *mp;
788
789	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
790	KASSERT(!cv_has_waiters(&bp->b_done));
791
792	vp = bp->b_vp;
793	if (vp != NULL) {
794		KASSERT(bp->b_objlock == &vp->v_interlock);
795		if (vp->v_type == VBLK)
796			mp = vp->v_specmountpoint;
797		else
798			mp = vp->v_mount;
799	} else {
800		mp = NULL;
801	}
802
803	if (mp && mp->mnt_wapbl) {
804		if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
805			bdwrite(bp);
806			return 0;
807		}
808	}
809
810	/*
811	 * Remember buffer type, to switch on it later.  If the write was
812	 * synchronous, but the file system was mounted with MNT_ASYNC,
813	 * convert it to a delayed write.
814	 * XXX note that this relies on delayed tape writes being converted
815	 * to async, not sync writes (which is safe, but ugly).
816	 */
817	sync = !ISSET(bp->b_flags, B_ASYNC);
818	if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
819		bdwrite(bp);
820		return (0);
821	}
822
823	/*
824	 * Collect statistics on synchronous and asynchronous writes.
825	 * Writes to block devices are charged to their associated
826	 * filesystem (if any).
827	 */
828	if (mp != NULL) {
829		if (sync)
830			mp->mnt_stat.f_syncwrites++;
831		else
832			mp->mnt_stat.f_asyncwrites++;
833	}
834
835	/*
836	 * Pay for the I/O operation and make sure the buf is on the correct
837	 * vnode queue.
838	 */
839	bp->b_error = 0;
840	wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
841	CLR(bp->b_flags, B_READ);
842	if (wasdelayed) {
843		mutex_enter(&bufcache_lock);
844		mutex_enter(bp->b_objlock);
845		CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
846		reassignbuf(bp, bp->b_vp);
847		mutex_exit(&bufcache_lock);
848	} else {
849		curlwp->l_ru.ru_oublock++;
850		mutex_enter(bp->b_objlock);
851		CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
852	}
853	if (vp != NULL)
854		vp->v_numoutput++;
855	mutex_exit(bp->b_objlock);
856
857	/* Initiate disk write. */
858	if (sync)
859		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
860	else
861		BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
862
863	VOP_STRATEGY(vp, bp);
864
865	if (sync) {
866		/* If I/O was synchronous, wait for it to complete. */
867		rv = biowait(bp);
868
869		/* Release the buffer. */
870		brelse(bp, 0);
871
872		return (rv);
873	} else {
874		return (0);
875	}
876}
877
878int
879vn_bwrite(void *v)
880{
881	struct vop_bwrite_args *ap = v;
882
883	return (bwrite(ap->a_bp));
884}
885
886/*
887 * Delayed write.
888 *
889 * The buffer is marked dirty, but is not queued for I/O.
890 * This routine should be used when the buffer is expected
891 * to be modified again soon, typically a small write that
892 * partially fills a buffer.
893 *
894 * NB: magnetic tapes cannot be delayed; they must be
895 * written in the order that the writes are requested.
896 *
897 * Described in Leffler, et al. (pp. 208-213).
898 */
899void
900bdwrite(buf_t *bp)
901{
902
903	KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
904	    bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
905	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
906	KASSERT(!cv_has_waiters(&bp->b_done));
907
908	/* If this is a tape block, write the block now. */
909	if (bdev_type(bp->b_dev) == D_TAPE) {
910		bawrite(bp);
911		return;
912	}
913
914	if (wapbl_vphaswapbl(bp->b_vp)) {
915		struct mount *mp = wapbl_vptomp(bp->b_vp);
916
917		if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
918			WAPBL_ADD_BUF(mp, bp);
919		}
920	}
921
922	/*
923	 * If the block hasn't been seen before:
924	 *	(1) Mark it as having been seen,
925	 *	(2) Charge for the write,
926	 *	(3) Make sure it's on its vnode's correct block list.
927	 */
928	KASSERT(bp->b_vp == NULL || bp->b_objlock == &bp->b_vp->v_interlock);
929
930	if (!ISSET(bp->b_oflags, BO_DELWRI)) {
931		mutex_enter(&bufcache_lock);
932		mutex_enter(bp->b_objlock);
933		SET(bp->b_oflags, BO_DELWRI);
934		curlwp->l_ru.ru_oublock++;
935		reassignbuf(bp, bp->b_vp);
936		mutex_exit(&bufcache_lock);
937	} else {
938		mutex_enter(bp->b_objlock);
939	}
940	/* Otherwise, the "write" is done, so mark and release the buffer. */
941	CLR(bp->b_oflags, BO_DONE);
942	mutex_exit(bp->b_objlock);
943
944	brelse(bp, 0);
945}
946
947/*
948 * Asynchronous block write; just an asynchronous bwrite().
949 */
950void
951bawrite(buf_t *bp)
952{
953
954	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
955
956	SET(bp->b_flags, B_ASYNC);
957	VOP_BWRITE(bp);
958}
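
/*
 * Illustrative sketch (assumed caller, not from this file): the usual
 * pattern is getblk() (or bread() when the old contents matter), a
 * change to b_data, and then one of the three write paths above:
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	... fill in bp->b_data ...
 *
 * followed by bwrite(bp) to wait for the disk, bdwrite(bp) to leave the
 * block dirty in the cache, or bawrite(bp) to start the write without
 * waiting for it.
 */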
959
960/*
961 * Release a buffer onto the free lists.
962 * Described in Bach (p. 46).
963 */
964void
965brelsel(buf_t *bp, int set)
966{
967	struct bqueue *bufq;
968	struct vnode *vp;
969
970	KASSERT(mutex_owned(&bufcache_lock));
971	KASSERT(!cv_has_waiters(&bp->b_done));
972	KASSERT(bp->b_refcnt > 0);
973
974	SET(bp->b_cflags, set);
975
976	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
977	KASSERT(bp->b_iodone == NULL);
978
979	/* Wake up any processes waiting for any buffer to become free. */
980	cv_signal(&needbuffer_cv);
981
982	/* Wake up any processes waiting for _this_ buffer to become free. */
983	if (ISSET(bp->b_cflags, BC_WANTED))
984		CLR(bp->b_cflags, BC_WANTED|BC_AGE);
985
986	/* If it's clean, clear the copy-on-write flag. */
987	if (ISSET(bp->b_flags, B_COWDONE)) {
988		mutex_enter(bp->b_objlock);
989		if (!ISSET(bp->b_oflags, BO_DELWRI))
990			CLR(bp->b_flags, B_COWDONE);
991		mutex_exit(bp->b_objlock);
992	}
993
994	/*
995	 * Determine which queue the buffer should be on, then put it there.
996	 */
997
998	/* If it's locked, don't report an error; try again later. */
999	if (ISSET(bp->b_flags, B_LOCKED))
1000		bp->b_error = 0;
1001
1002	/* If it's not cacheable, or an error, mark it invalid. */
1003	if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
1004		SET(bp->b_cflags, BC_INVAL);
1005
1006	if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1007		/*
1008		 * This is a delayed write buffer that was just flushed to
1009		 * disk.  It is still on the LRU queue.  If it's become
1010		 * invalid, then we need to move it to a different queue;
1011		 * otherwise leave it in its current position.
1012		 */
1013		CLR(bp->b_cflags, BC_VFLUSH);
1014		if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
1015		    !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
1016			KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
1017			goto already_queued;
1018		} else {
1019			bremfree(bp);
1020		}
1021	}
1022
1023	KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
1024	KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
1025	KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
1026
1027	if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
1028		/*
1029		 * If it's invalid or empty, dissociate it from its vnode
1030		 * and put it on the head of the appropriate queue.
1031		 */
1032		if (ISSET(bp->b_flags, B_LOCKED)) {
1033			if (wapbl_vphaswapbl(vp = bp->b_vp)) {
1034				struct mount *mp = wapbl_vptomp(vp);
1035
1036				KASSERT(bp->b_iodone
1037				    != mp->mnt_wapbl_op->wo_wapbl_biodone);
1038				WAPBL_REMOVE_BUF(mp, bp);
1039			}
1040		}
1041
1042		mutex_enter(bp->b_objlock);
1043		CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
1044		if ((vp = bp->b_vp) != NULL) {
1045			KASSERT(bp->b_objlock == &vp->v_interlock);
1046			reassignbuf(bp, bp->b_vp);
1047			brelvp(bp);
1048			mutex_exit(&vp->v_interlock);
1049		} else {
1050			KASSERT(bp->b_objlock == &buffer_lock);
1051			mutex_exit(bp->b_objlock);
1052		}
1053
1054		if (bp->b_bufsize <= 0)
1055			/* no data */
1056			goto already_queued;
1057		else
1058			/* invalid data */
1059			bufq = &bufqueues[BQ_AGE];
1060		binsheadfree(bp, bufq);
1061	} else  {
1062		/*
1063		 * It has valid data.  Put it on the end of the appropriate
1064		 * queue, so that it'll stick around for as long as possible.
1065		 * If the buf is AGE but has dependencies, it must be put on the
1066		 * last bufqueue to be scanned, i.e. LRU.  This protects against the
1067		 * livelock where BQ_AGE only has buffers with dependencies,
1068		 * and we thus never get to the dependent buffers in BQ_LRU.
1069		 */
1070		if (ISSET(bp->b_flags, B_LOCKED)) {
1071			/* locked in core */
1072			bufq = &bufqueues[BQ_LOCKED];
1073		} else if (!ISSET(bp->b_cflags, BC_AGE)) {
1074			/* valid data */
1075			bufq = &bufqueues[BQ_LRU];
1076		} else {
1077			/* stale but valid data */
1078			bufq = &bufqueues[BQ_AGE];
1079		}
1080		binstailfree(bp, bufq);
1081	}
1082already_queued:
1083	/* Unlock the buffer. */
1084	CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
1085	CLR(bp->b_flags, B_ASYNC);
1086	cv_broadcast(&bp->b_busy);
1087
1088	if (bp->b_bufsize <= 0)
1089		brele(bp);
1090}
1091
1092void
1093brelse(buf_t *bp, int set)
1094{
1095
1096	mutex_enter(&bufcache_lock);
1097	brelsel(bp, set);
1098	mutex_exit(&bufcache_lock);
1099}
1100
1101/*
1102 * Determine if a block is in the cache.
1103 * Just look on what would be its hash chain.  If it's there, return
1104 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
1105 * we normally don't return the buffer, unless the caller explicitly
1106 * wants us to.
1107 */
1108buf_t *
1109incore(struct vnode *vp, daddr_t blkno)
1110{
1111	buf_t *bp;
1112
1113	KASSERT(mutex_owned(&bufcache_lock));
1114
1115	/* Search hash chain */
1116	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
1117		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1118		    !ISSET(bp->b_cflags, BC_INVAL)) {
1119		    	KASSERT(bp->b_objlock == &vp->v_interlock);
1120		    	return (bp);
1121		}
1122	}
1123
1124	return (NULL);
1125}
1126
1127/*
1128 * Get a block of requested size that is associated with
1129 * a given vnode and block offset. If it is found in the
1130 * block cache, mark it as having been found, make it busy
1131 * and return it. Otherwise, return an empty block of the
1132 * correct size. It is up to the caller to ensure that the
1133 * cached blocks are of the correct size.
1134 */
1135buf_t *
1136getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1137{
1138	int err, preserve;
1139	buf_t *bp;
1140
1141	mutex_enter(&bufcache_lock);
1142 loop:
1143	bp = incore(vp, blkno);
1144	if (bp != NULL) {
1145		err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
1146		if (err != 0) {
1147			if (err == EPASSTHROUGH)
1148				goto loop;
1149			mutex_exit(&bufcache_lock);
1150			return (NULL);
1151		}
1152		KASSERT(!cv_has_waiters(&bp->b_done));
1153#ifdef DIAGNOSTIC
1154		if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
1155		    bp->b_bcount < size && vp->v_type != VBLK)
1156			panic("getblk: block size invariant failed");
1157#endif
1158		bremfree(bp);
1159		preserve = 1;
1160	} else {
1161		if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
1162			goto loop;
1163
1164		if (incore(vp, blkno) != NULL) {
1165			/* The block has come into memory in the meantime. */
1166			brelsel(bp, 0);
1167			goto loop;
1168		}
1169
1170		LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
1171		bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
1172		mutex_enter(&vp->v_interlock);
1173		bgetvp(vp, bp);
1174		mutex_exit(&vp->v_interlock);
1175		preserve = 0;
1176	}
1177	mutex_exit(&bufcache_lock);
1178
1179	/*
1180	 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
1181	 * if we re-size buffers here.
1182	 */
1183	if (ISSET(bp->b_flags, B_LOCKED)) {
1184		KASSERT(bp->b_bufsize >= size);
1185	} else {
1186		if (allocbuf(bp, size, preserve)) {
1187			mutex_enter(&bufcache_lock);
1188			LIST_REMOVE(bp, b_hash);
1189			mutex_exit(&bufcache_lock);
1190			brelse(bp, BC_INVAL);
1191			return NULL;
1192		}
1193	}
1194	BIO_SETPRIO(bp, BPRIO_DEFAULT);
1195	return (bp);
1196}
1197
1198/*
1199 * Get an empty, disassociated buffer of given size.
1200 */
1201buf_t *
1202geteblk(int size)
1203{
1204	buf_t *bp;
1205	int error;
1206
1207	mutex_enter(&bufcache_lock);
1208	while ((bp = getnewbuf(0, 0, 0)) == NULL)
1209		;
1210
1211	SET(bp->b_cflags, BC_INVAL);
1212	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1213	mutex_exit(&bufcache_lock);
1214	BIO_SETPRIO(bp, BPRIO_DEFAULT);
1215	error = allocbuf(bp, size, 0);
1216	KASSERT(error == 0);
1217	return (bp);
1218}
1219
1220/*
1221 * Expand or contract the actual memory allocated to a buffer.
1222 *
1223 * If the buffer shrinks, data is lost, so it's up to the
1224 * caller to have written it out *first*; this routine will not
1225 * start a write.  If the buffer grows, it is the caller's
1226 * responsibility to fill out the buffer's additional contents.
1227 */
1228int
1229allocbuf(buf_t *bp, int size, int preserve)
1230{
1231	void *addr;
1232	vsize_t oldsize, desired_size;
1233	int oldcount;
1234	int delta;
1235
1236	desired_size = buf_roundsize(size);
1237	if (desired_size > MAXBSIZE)
1238		printf("allocbuf: buffer larger than MAXBSIZE requested");
1239
1240	oldcount = bp->b_bcount;
1241
1242	bp->b_bcount = size;
1243
1244	oldsize = bp->b_bufsize;
1245	if (oldsize == desired_size) {
1246		/*
1247		 * Do not short cut the WAPBL resize, as the buffer length
1248		 * could still have changed and this would corrupt the
1249		 * tracking of the transaction length.
1250		 */
1251		goto out;
1252	}
1253
1254	/*
1255	 * If we want a buffer of a different size, re-allocate the
1256	 * buffer's memory; copy old content only if needed.
1257	 */
1258	addr = buf_malloc(desired_size);
1259	if (addr == NULL)
1260		return ENOMEM;
1261	if (preserve)
1262		memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
1263	if (bp->b_data != NULL)
1264		buf_mrelease(bp->b_data, oldsize);
1265	bp->b_data = addr;
1266	bp->b_bufsize = desired_size;
1267
1268	/*
1269	 * Update overall buffer memory counter (protected by bufcache_lock)
1270	 */
1271	delta = (long)desired_size - (long)oldsize;
1272
1273	mutex_enter(&bufcache_lock);
1274	if ((bufmem += delta) > bufmem_hiwater) {
1275		/*
1276		 * Need to trim overall memory usage.
1277		 */
1278		while (buf_canrelease()) {
1279			if (curcpu()->ci_schedstate.spc_flags &
1280			    SPCF_SHOULDYIELD) {
1281				mutex_exit(&bufcache_lock);
1282				preempt();
1283				mutex_enter(&bufcache_lock);
1284			}
1285			if (buf_trim() == 0)
1286				break;
1287		}
1288	}
1289	mutex_exit(&bufcache_lock);
1290
1291 out:
1292	if (wapbl_vphaswapbl(bp->b_vp))
1293		WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
1294
1295	return 0;
1296}
1297
1298/*
1299 * Find a buffer which is available for use.
1300 * Select something from a free list.
1301 * Preference is to AGE list, then LRU list.
1302 *
1303 * Called with the buffer queues locked.
1304 * Return buffer locked.
1305 */
1306buf_t *
1307getnewbuf(int slpflag, int slptimeo, int from_bufq)
1308{
1309	buf_t *bp;
1310	struct vnode *vp;
1311
1312 start:
1313	KASSERT(mutex_owned(&bufcache_lock));
1314
1315	/*
1316	 * Get a new buffer from the pool.
1317	 */
1318	if (!from_bufq && buf_lotsfree()) {
1319		mutex_exit(&bufcache_lock);
1320		bp = pool_cache_get(buf_cache, PR_NOWAIT);
1321		if (bp != NULL) {
1322			memset((char *)bp, 0, sizeof(*bp));
1323			buf_init(bp);
1324			SET(bp->b_cflags, BC_BUSY);	/* mark buffer busy */
1325			mutex_enter(&bufcache_lock);
1326#if defined(DIAGNOSTIC)
1327			bp->b_freelistindex = -1;
1328#endif /* defined(DIAGNOSTIC) */
1329			return (bp);
1330		}
1331		mutex_enter(&bufcache_lock);
1332	}
1333
1334	KASSERT(mutex_owned(&bufcache_lock));
1335	if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL ||
1336	    (bp = TAILQ_FIRST(&bufqueues[BQ_LRU].bq_queue)) != NULL) {
1337	    	KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
1338		bremfree(bp);
1339
1340		/* Buffer is no longer on free lists. */
1341		SET(bp->b_cflags, BC_BUSY);
1342	} else {
1343		/*
1344		 * XXX: !from_bufq should be removed.
1345		 */
1346		if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
1347			/* wait for a free buffer of any kind */
1348			if ((slpflag & PCATCH) != 0)
1349				(void)cv_timedwait_sig(&needbuffer_cv,
1350				    &bufcache_lock, slptimeo);
1351			else
1352				(void)cv_timedwait(&needbuffer_cv,
1353				    &bufcache_lock, slptimeo);
1354		}
1355		return (NULL);
1356	}
1357
1358#ifdef DIAGNOSTIC
1359	if (bp->b_bufsize <= 0)
1360		panic("buffer %p: on queue but empty", bp);
1361#endif
1362
1363	if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1364		/*
1365		 * This is a delayed write buffer being flushed to disk.  Make
1366		 * sure it gets aged out of the queue when it's finished, and
1367		 * leave it off the LRU queue.
1368		 */
1369		CLR(bp->b_cflags, BC_VFLUSH);
1370		SET(bp->b_cflags, BC_AGE);
1371		goto start;
1372	}
1373
1374	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1375	KASSERT(bp->b_refcnt > 0);
1376    	KASSERT(!cv_has_waiters(&bp->b_done));
1377
1378	/*
1379	 * If buffer was a delayed write, start it and return NULL
1380	 * (since we might sleep while starting the write).
1381	 */
1382	if (ISSET(bp->b_oflags, BO_DELWRI)) {
1383		/*
1384		 * This buffer has gone through the LRU, so make sure it gets
1385		 * reused ASAP.
1386		 */
1387		SET(bp->b_cflags, BC_AGE);
1388		mutex_exit(&bufcache_lock);
1389		bawrite(bp);
1390		mutex_enter(&bufcache_lock);
1391		return (NULL);
1392	}
1393
1394	vp = bp->b_vp;
1395
1396	/* clear out various other fields */
1397	bp->b_cflags = BC_BUSY;
1398	bp->b_oflags = 0;
1399	bp->b_flags = 0;
1400	bp->b_dev = NODEV;
1401	bp->b_blkno = 0;
1402	bp->b_lblkno = 0;
1403	bp->b_rawblkno = 0;
1404	bp->b_iodone = 0;
1405	bp->b_error = 0;
1406	bp->b_resid = 0;
1407	bp->b_bcount = 0;
1408
1409	LIST_REMOVE(bp, b_hash);
1410
1411	/* Disassociate us from our vnode, if we had one... */
1412	if (vp != NULL) {
1413		mutex_enter(&vp->v_interlock);
1414		brelvp(bp);
1415		mutex_exit(&vp->v_interlock);
1416	}
1417
1418	return (bp);
1419}
1420
1421/*
1422 * Attempt to free an aged buffer off the queues.
1423 * Called with queue lock held.
1424 * Returns the amount of buffer memory freed.
1425 */
1426static int
1427buf_trim(void)
1428{
1429	buf_t *bp;
1430	long size = 0;
1431
1432	KASSERT(mutex_owned(&bufcache_lock));
1433
1434	/* Instruct getnewbuf() to get buffers off the queues */
1435	if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
1436		return 0;
1437
1438	KASSERT((bp->b_cflags & BC_WANTED) == 0);
1439	size = bp->b_bufsize;
1440	bufmem -= size;
1441	if (size > 0) {
1442		buf_mrelease(bp->b_data, size);
1443		bp->b_bcount = bp->b_bufsize = 0;
1444	}
1445	/* brelse() will return the buffer to the global buffer pool */
1446	brelsel(bp, 0);
1447	return size;
1448}
1449
1450int
1451buf_drain(int n)
1452{
1453	int size = 0, sz;
1454
1455	KASSERT(mutex_owned(&bufcache_lock));
1456
1457	while (size < n && bufmem > bufmem_lowater) {
1458		sz = buf_trim();
1459		if (sz <= 0)
1460			break;
1461		size += sz;
1462	}
1463
1464	return size;
1465}
1466
1467/*
1468 * Wait for operations on the buffer to complete.
1469 * When they do, extract and return the I/O's error value.
1470 */
1471int
1472biowait(buf_t *bp)
1473{
1474
1475	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1476	KASSERT(bp->b_refcnt > 0);
1477
1478	mutex_enter(bp->b_objlock);
1479	while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI))
1480		cv_wait(&bp->b_done, bp->b_objlock);
1481	mutex_exit(bp->b_objlock);
1482
1483	return bp->b_error;
1484}
1485
1486/*
1487 * Mark I/O complete on a buffer.
1488 *
1489 * If a callback has been requested, e.g. the pageout
1490 * daemon, do so. Otherwise, awaken waiting processes.
1491 *
1492 * [ Leffler, et al., says on p.247:
1493 *	"This routine wakes up the blocked process, frees the buffer
1494 *	for an asynchronous write, or, for a request by the pagedaemon
1495 *	process, invokes a procedure specified in the buffer structure" ]
1496 *
1497 * In real life, the pagedaemon (or other system processes) wants
1498 * to do async stuff too, and doesn't want the buffer brelse()'d.
1499 * (for swap pager, that puts swap buffers on the free lists (!!!),
1500 * for the vn device, that puts malloc'd buffers on the free lists!)
1501 */
1502void
1503biodone(buf_t *bp)
1504{
1505	int s;
1506
1507	KASSERT(!ISSET(bp->b_oflags, BO_DONE));
1508
1509	if (cpu_intr_p()) {
1510		/* From interrupt mode: defer to a soft interrupt. */
1511		s = splvm();
1512		TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
1513		softint_schedule(biodone_sih);
1514		splx(s);
1515	} else {
1516		/* Process now - the buffer may be freed soon. */
1517		biodone2(bp);
1518	}
1519}
1520
1521static void
1522biodone2(buf_t *bp)
1523{
1524	void (*callout)(buf_t *);
1525
1526	mutex_enter(bp->b_objlock);
1527	/* Note that the transfer is done. */
1528	if (ISSET(bp->b_oflags, BO_DONE))
1529		panic("biodone2 already");
1530	CLR(bp->b_flags, B_COWDONE);
1531	SET(bp->b_oflags, BO_DONE);
1532	BIO_SETPRIO(bp, BPRIO_DEFAULT);
1533
1534	/* Wake up waiting writers. */
1535	if (!ISSET(bp->b_flags, B_READ))
1536		vwakeup(bp);
1537
1538	if ((callout = bp->b_iodone) != NULL) {
1539		/* Note callout done, then call out. */
1540		KASSERT(!cv_has_waiters(&bp->b_done));
1541		KERNEL_LOCK(1, NULL);		/* XXXSMP */
1542		bp->b_iodone = NULL;
1543		mutex_exit(bp->b_objlock);
1544		(*callout)(bp);
1545		KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
1546	} else if (ISSET(bp->b_flags, B_ASYNC)) {
1547		/* If async, release. */
1548		KASSERT(!cv_has_waiters(&bp->b_done));
1549		mutex_exit(bp->b_objlock);
1550		brelse(bp, 0);
1551	} else {
1552		/* Otherwise just wake up waiters in biowait(). */
1553		cv_broadcast(&bp->b_done);
1554		mutex_exit(bp->b_objlock);
1555	}
1556}
1557
1558static void
1559biointr(void *cookie)
1560{
1561	struct cpu_info *ci;
1562	buf_t *bp;
1563	int s;
1564
1565	ci = curcpu();
1566
1567	while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
1568		KASSERT(curcpu() == ci);
1569
1570		s = splvm();
1571		bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
1572		TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
1573		splx(s);
1574
1575		biodone2(bp);
1576	}
1577}
1578
1579/*
1580 * Return a count of buffers on the "locked" queue.
1581 */
1582int
1583count_lock_queue(void)
1584{
1585	buf_t *bp;
1586	int n = 0;
1587
1588	mutex_enter(&bufcache_lock);
1589	TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist)
1590		n++;
1591	mutex_exit(&bufcache_lock);
1592	return (n);
1593}
1594
1595/*
1596 * Wait for all buffers to complete I/O.
1597 * Return the number of "stuck" buffers.
1598 */
1599int
1600buf_syncwait(void)
1601{
1602	buf_t *bp;
1603	int iter, nbusy, nbusy_prev = 0, dcount, ihash;
1604
1605	dcount = 10000;
1606	for (iter = 0; iter < 20;) {
1607		mutex_enter(&bufcache_lock);
1608		nbusy = 0;
1609		for (ihash = 0; ihash < bufhash+1; ihash++) {
1610		    LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1611			if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY)
1612				nbusy += ((bp->b_flags & B_READ) == 0);
1613		    }
1614		}
1615		mutex_exit(&bufcache_lock);
1616
1617		if (nbusy == 0)
1618			break;
1619		if (nbusy_prev == 0)
1620			nbusy_prev = nbusy;
1621		printf("%d ", nbusy);
1622		kpause("bflush", false, MAX(1, hz / 25 * iter), NULL);
1623		if (nbusy >= nbusy_prev) /* we didn't flush anything */
1624			iter++;
1625		else
1626			nbusy_prev = nbusy;
1627	}
1628
1629	if (nbusy) {
1630#if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
1631		printf("giving up\nPrinting vnodes for busy buffers\n");
1632		for (ihash = 0; ihash < bufhash+1; ihash++) {
1633		    LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1634			if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY &&
1635			    (bp->b_flags & B_READ) == 0)
1636				vprint(NULL, bp->b_vp);
1637		    }
1638		}
1639#endif
1640	}
1641
1642	return nbusy;
1643}
1644
1645static void
1646sysctl_fillbuf(buf_t *i, struct buf_sysctl *o)
1647{
1648
1649	o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
1650	o->b_error = i->b_error;
1651	o->b_prio = i->b_prio;
1652	o->b_dev = i->b_dev;
1653	o->b_bufsize = i->b_bufsize;
1654	o->b_bcount = i->b_bcount;
1655	o->b_resid = i->b_resid;
1656	o->b_addr = PTRTOUINT64(i->b_data);
1657	o->b_blkno = i->b_blkno;
1658	o->b_rawblkno = i->b_rawblkno;
1659	o->b_iodone = PTRTOUINT64(i->b_iodone);
1660	o->b_proc = PTRTOUINT64(i->b_proc);
1661	o->b_vp = PTRTOUINT64(i->b_vp);
1662	o->b_saveaddr = PTRTOUINT64(i->b_saveaddr);
1663	o->b_lblkno = i->b_lblkno;
1664}
1665
1666#define KERN_BUFSLOP 20
1667static int
1668sysctl_dobuf(SYSCTLFN_ARGS)
1669{
1670	buf_t *bp;
1671	struct buf_sysctl bs;
1672	struct bqueue *bq;
1673	char *dp;
1674	u_int i, op, arg;
1675	size_t len, needed, elem_size, out_size;
1676	int error, elem_count, retries;
1677
1678	if (namelen == 1 && name[0] == CTL_QUERY)
1679		return (sysctl_query(SYSCTLFN_CALL(rnode)));
1680
1681	if (namelen != 4)
1682		return (EINVAL);
1683
1684	retries = 100;
1685 retry:
1686	dp = oldp;
1687	len = (oldp != NULL) ? *oldlenp : 0;
1688	op = name[0];
1689	arg = name[1];
1690	elem_size = name[2];
1691	elem_count = name[3];
1692	out_size = MIN(sizeof(bs), elem_size);
1693
1694	/*
1695	 * at the moment, these are just "placeholders" to make the
1696	 * API for retrieving kern.buf data more extensible in the
1697	 * future.
1698	 *
1699	 * XXX kern.buf currently has "netbsd32" issues.  hopefully
1700	 * these will be resolved at a later point.
1701	 */
1702	if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
1703	    elem_size < 1 || elem_count < 0)
1704		return (EINVAL);
1705
1706	error = 0;
1707	needed = 0;
1708	sysctl_unlock();
1709	mutex_enter(&bufcache_lock);
1710	for (i = 0; i < BQUEUES; i++) {
1711		bq = &bufqueues[i];
1712		TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
1713			bq->bq_marker = bp;
1714			if (len >= elem_size && elem_count > 0) {
1715				sysctl_fillbuf(bp, &bs);
1716				mutex_exit(&bufcache_lock);
1717				error = copyout(&bs, dp, out_size);
1718				mutex_enter(&bufcache_lock);
1719				if (error)
1720					break;
1721				if (bq->bq_marker != bp) {
1722					/*
1723					 * This sysctl node is only for
1724					 * statistics.  Retry; if the
1725					 * queue keeps changing, then
1726					 * bail out.
1727					 */
1728					if (retries-- == 0) {
1729						error = EAGAIN;
1730						break;
1731					}
1732					mutex_exit(&bufcache_lock);
1733					goto retry;
1734				}
1735				dp += elem_size;
1736				len -= elem_size;
1737			}
1738			needed += elem_size;
1739			if (elem_count > 0 && elem_count != INT_MAX)
1740				elem_count--;
1741		}
1742		if (error != 0)
1743			break;
1744	}
1745	mutex_exit(&bufcache_lock);
1746	sysctl_relock();
1747
1748	*oldlenp = needed;
1749	if (oldp == NULL)
1750		*oldlenp += KERN_BUFSLOP * sizeof(buf_t);
1751
1752	return (error);
1753}
1754
1755static int
1756sysctl_bufvm_update(SYSCTLFN_ARGS)
1757{
1758	int t, error, rv;
1759	struct sysctlnode node;
1760
1761	node = *rnode;
1762	node.sysctl_data = &t;
1763	t = *(int *)rnode->sysctl_data;
1764	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1765	if (error || newp == NULL)
1766		return (error);
1767
1768	if (t < 0)
1769		return EINVAL;
1770	if (rnode->sysctl_data == &bufcache) {
1771		if (t > 100)
1772			return (EINVAL);
1773		bufcache = t;
1774		buf_setwm();
1775	} else if (rnode->sysctl_data == &bufmem_lowater) {
1776		if (bufmem_hiwater - t < 16)
1777			return (EINVAL);
1778		bufmem_lowater = t;
1779	} else if (rnode->sysctl_data == &bufmem_hiwater) {
1780		if (t - bufmem_lowater < 16)
1781			return (EINVAL);
1782		bufmem_hiwater = t;
1783	} else
1784		return (EINVAL);
1785
1786	/* Drain until below new high water mark */
1787	sysctl_unlock();
1788	mutex_enter(&bufcache_lock);
1789	while ((t = bufmem - bufmem_hiwater) >= 0) {
1790		rv = buf_drain(t / (2 * 1024));
1791		if (rv <= 0)
1792			break;
1793	}
1794	mutex_exit(&bufcache_lock);
1795	sysctl_relock();
1796
1797	return 0;
1798}
1799
1800static struct sysctllog *vfsbio_sysctllog;
1801
1802static void
1803sysctl_kern_buf_setup(void)
1804{
1805
1806	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1807		       CTLFLAG_PERMANENT,
1808		       CTLTYPE_NODE, "kern", NULL,
1809		       NULL, 0, NULL, 0,
1810		       CTL_KERN, CTL_EOL);
1811	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1812		       CTLFLAG_PERMANENT,
1813		       CTLTYPE_NODE, "buf",
1814		       SYSCTL_DESCR("Kernel buffer cache information"),
1815		       sysctl_dobuf, 0, NULL, 0,
1816		       CTL_KERN, KERN_BUF, CTL_EOL);
1817}
1818
1819static void
1820sysctl_vm_buf_setup(void)
1821{
1822
1823	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1824		       CTLFLAG_PERMANENT,
1825		       CTLTYPE_NODE, "vm", NULL,
1826		       NULL, 0, NULL, 0,
1827		       CTL_VM, CTL_EOL);
1828	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1829		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1830		       CTLTYPE_INT, "bufcache",
1831		       SYSCTL_DESCR("Percentage of physical memory to use for "
1832				    "buffer cache"),
1833		       sysctl_bufvm_update, 0, &bufcache, 0,
1834		       CTL_VM, CTL_CREATE, CTL_EOL);
1835	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1836		       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
1837		       CTLTYPE_INT, "bufmem",
1838		       SYSCTL_DESCR("Amount of kernel memory used by buffer "
1839				    "cache"),
1840		       NULL, 0, &bufmem, 0,
1841		       CTL_VM, CTL_CREATE, CTL_EOL);
1842	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1843		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1844		       CTLTYPE_INT, "bufmem_lowater",
1845		       SYSCTL_DESCR("Minimum amount of kernel memory to "
1846				    "reserve for buffer cache"),
1847		       sysctl_bufvm_update, 0, &bufmem_lowater, 0,
1848		       CTL_VM, CTL_CREATE, CTL_EOL);
1849	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1850		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1851		       CTLTYPE_INT, "bufmem_hiwater",
1852		       SYSCTL_DESCR("Maximum amount of kernel memory to use "
1853				    "for buffer cache"),
1854		       sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
1855		       CTL_VM, CTL_CREATE, CTL_EOL);
1856}
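
/*
 * The nodes registered above surface as vm.bufcache, vm.bufmem,
 * vm.bufmem_lowater and vm.bufmem_hiwater.  For example (illustrative;
 * the value is site-specific):
 *
 *	sysctl -w vm.bufcache=20
 *
 * goes through sysctl_bufvm_update(), which recomputes the water marks
 * via buf_setwm() and then drains the cache below the new high water
 * mark.
 */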
1857
1858#ifdef DEBUG
1859/*
1860 * Print out statistics on the current allocation of the buffer pool.
1861 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1862 * in vfs_syscalls.c using sysctl.
1863 */
1864void
1865vfs_bufstats(void)
1866{
1867	int i, j, count;
1868	buf_t *bp;
1869	struct bqueue *dp;
1870	int counts[(MAXBSIZE / PAGE_SIZE) + 1];
1871	static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
1872
1873	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1874		count = 0;
1875		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1876			counts[j] = 0;
1877		TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
1878			counts[bp->b_bufsize/PAGE_SIZE]++;
1879			count++;
1880		}
1881		printf("%s: total-%d", bname[i], count);
1882		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1883			if (counts[j] != 0)
1884				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
1885		printf("\n");
1886	}
1887}
1888#endif /* DEBUG */
1889
1890/* ------------------------------ */
1891
1892buf_t *
1893getiobuf(struct vnode *vp, bool waitok)
1894{
1895	buf_t *bp;
1896
1897	bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
1898	if (bp == NULL)
1899		return bp;
1900
1901	buf_init(bp);
1902
1903	if ((bp->b_vp = vp) == NULL)
1904		bp->b_objlock = &buffer_lock;
1905	else
1906		bp->b_objlock = &vp->v_interlock;
1907
1908	return bp;
1909}
1910
1911void
1912putiobuf(buf_t *bp)
1913{
1914
1915	buf_destroy(bp);
1916	pool_cache_put(bufio_cache, bp);
1917}
1918
1919/*
1920 * nestiobuf_iodone: b_iodone callback for nested buffers.
1921 */
1922
1923void
1924nestiobuf_iodone(buf_t *bp)
1925{
1926	buf_t *mbp = bp->b_private;
1927	int error;
1928	int donebytes;
1929
1930	KASSERT(bp->b_bcount <= bp->b_bufsize);
1931	KASSERT(mbp != bp);
1932
1933	error = bp->b_error;
1934	if (bp->b_error == 0 &&
1935	    (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
1936		/*
1937		 * Not all data was transferred; raise an error. We have no way to
1938		 * propagate these conditions to mbp.
1939		 */
1940		error = EIO;
1941	}
1942
1943	donebytes = bp->b_bufsize;
1944
1945	putiobuf(bp);
1946	nestiobuf_done(mbp, donebytes, error);
1947}
1948
1949/*
1950 * nestiobuf_setup: setup a "nested" buffer.
1951 *
1952 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
1953 * => 'bp' should be a buffer allocated by getiobuf.
1954 * => 'offset' is a byte offset in the master buffer.
1955 * => 'size' is a size in bytes of this nested buffer.
1956 */
1957
1958void
1959nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
1960{
1961	const int b_read = mbp->b_flags & B_READ;
1962	struct vnode *vp = mbp->b_vp;
1963
1964	KASSERT(mbp->b_bcount >= offset + size);
1965	bp->b_vp = vp;
1966	bp->b_dev = mbp->b_dev;
1967	bp->b_objlock = mbp->b_objlock;
1968	bp->b_cflags = BC_BUSY;
1969	bp->b_flags = B_ASYNC | b_read;
1970	bp->b_iodone = nestiobuf_iodone;
1971	bp->b_data = (char *)mbp->b_data + offset;
1972	bp->b_resid = bp->b_bcount = size;
1973	bp->b_bufsize = bp->b_bcount;
1974	bp->b_private = mbp;
1975	BIO_COPYPRIO(bp, mbp);
1976	if (!b_read && vp != NULL) {
1977		mutex_enter(&vp->v_interlock);
1978		vp->v_numoutput++;
1979		mutex_exit(&vp->v_interlock);
1980	}
1981}
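
/*
 * Illustrative sketch (assumed caller, not from this file; devvp, chunk
 * and the per-piece block arithmetic are placeholders): a driver
 * splitting a master transfer 'mbp' into pieces relies on
 * nestiobuf_done() below to fire the master's biodone() once every
 * piece has finished, with mbp->b_resid expected to cover the whole
 * transfer (typically equal to b_bcount):
 *
 *	for (offset = 0; offset < mbp->b_bcount; offset += chunk) {
 *		size = MIN(chunk, mbp->b_bcount - offset);
 *		bp = getiobuf(NULL, true);
 *		nestiobuf_setup(mbp, bp, offset, size);
 *		bp->b_blkno = ...;
 *		VOP_STRATEGY(devvp, bp);
 *	}
 */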
1982
1983/*
1984 * nestiobuf_done: propagate completion to the master buffer.
1985 *
1986 * => 'donebytes' specifies how many bytes of the 'mbp' have been completed.
1987 * => 'error' is an errno(2) that 'donebytes' has been completed with.
1988 */
1989
1990void
1991nestiobuf_done(buf_t *mbp, int donebytes, int error)
1992{
1993
1994	if (donebytes == 0) {
1995		return;
1996	}
1997	mutex_enter(mbp->b_objlock);
1998	KASSERT(mbp->b_resid >= donebytes);
1999	mbp->b_resid -= donebytes;
2000	if (error)
2001		mbp->b_error = error;
2002	if (mbp->b_resid == 0) {
2003		if (mbp->b_error)
2004			mbp->b_resid = mbp->b_bcount;
2005		mutex_exit(mbp->b_objlock);
2006		biodone(mbp);
2007	} else
2008		mutex_exit(mbp->b_objlock);
2009}
2010
2011void
2012buf_init(buf_t *bp)
2013{
2014
2015	cv_init(&bp->b_busy, "biolock");
2016	cv_init(&bp->b_done, "biowait");
2017	bp->b_dev = NODEV;
2018	bp->b_error = 0;
2019	bp->b_flags = 0;
2020	bp->b_cflags = 0;
2021	bp->b_oflags = 0;
2022	bp->b_objlock = &buffer_lock;
2023	bp->b_iodone = NULL;
2024	bp->b_refcnt = 1;
2025	bp->b_dev = NODEV;
2026	bp->b_vnbufs.le_next = NOLIST;
2027	BIO_SETPRIO(bp, BPRIO_DEFAULT);
2028}
2029
2030void
2031buf_destroy(buf_t *bp)
2032{
2033
2034	cv_destroy(&bp->b_done);
2035	cv_destroy(&bp->b_busy);
2036}
2037
2038int
2039bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
2040{
2041	int error;
2042
2043	KASSERT(mutex_owned(&bufcache_lock));
2044
2045	if ((bp->b_cflags & BC_BUSY) != 0) {
2046		if (curlwp == uvm.pagedaemon_lwp)
2047			return EDEADLK;
2048		bp->b_cflags |= BC_WANTED;
2049		bref(bp);
2050		if (interlock != NULL)
2051			mutex_exit(interlock);
2052		if (intr) {
2053			error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
2054			    timo);
2055		} else {
2056			error = cv_timedwait(&bp->b_busy, &bufcache_lock,
2057			    timo);
2058		}
2059		brele(bp);
2060		if (interlock != NULL)
2061			mutex_enter(interlock);
2062		if (error != 0)
2063			return error;
2064		return EPASSTHROUGH;
2065	}
2066	bp->b_cflags |= BC_BUSY;
2067
2068	return 0;
2069}
2070
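/*
 * Illustrative sketch (assumed caller, not from this file): callers of
 * bbusy() retry on EPASSTHROUGH, since that return means the lock was
 * dropped and the cache may have changed underneath us; getblk() above
 * follows the same pattern:
 *
 *	mutex_enter(&bufcache_lock);
 * again:
 *	bp = incore(vp, blkno);
 *	if (bp != NULL) {
 *		error = bbusy(bp, false, 0, NULL);
 *		if (error == EPASSTHROUGH)
 *			goto again;
 *	}
 *	mutex_exit(&bufcache_lock);
 */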