vfs_bio.c revision 211123
/*-
 * Copyright (c) 2004 Poul-Henning Kamp
 * Copyright (c) 1994,1997 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 211123 2010-08-09 22:22:46Z ivoras $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/fail.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <geom/geom.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include "opt_compat.h"
#include "opt_directio.h"
#include "opt_swap.h"

static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");

struct	bio_ops bioops;		/* I/O operation notification */

struct	buf_ops buf_ops_bio = {
	.bop_name	=	"buf_ops_bio",
	.bop_write	=	bufwrite,
	.bop_strategy	=	bufstrategy,
	.bop_sync	=	bufsync,
	.bop_bdflush	=	bufbdflush,
};

/*
 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has
 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
 */
struct buf *buf;		/* buffer header pool */

static struct proc *bufdaemonproc;

static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
		vm_page_t m);
static void vfs_drain_busy_pages(struct buf *bp);
static void vfs_clean_pages_dirty_buf(struct buf *bp);
static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
		daddr_t lblkno, daddr_t blkno);
static int buf_do_flush(struct vnode *vp);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
#endif

int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
static long bufspace;
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
#else
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
    "Virtual memory used for buffers");
#endif
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
    "Maximum allowed value of bufspace (including buf_daemon)");
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
static long maxbufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
    "Maximum amount of malloced memory for buffers");
static long lobufspace;
SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
long hibufspace;
SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding buf_daemon)");
static int bufreusecnt;
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
    "Number of times we have reused a buffer");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
    "Number of times we have freed the KVA space from some buffer");
static int bufdefragcnt;
SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
static long lorunningspace;
SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
    "Minimum preferred space used for in-progress I/O");
static long hirunningspace;
SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
    "Maximum amount of space to use for in-progress I/O");
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
    0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
    0, "Number of flushes skipped due to being recursive");
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (have unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
   "XXX Unused");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
   "XXX Complicatedly unused");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
   "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
    "Amount of work to do in flushbufqueues when helping bufdaemon");
static long notbufdflashes;
SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
    "Number of dirty buffer flushes done by the bufdaemon helpers");

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * Request for the buf daemon to write more buffers than is indicated by
 * lodirtybuffers.  This may be necessary to push out excess dependencies or
 * defragment the address space where a simple count of the number of dirty
 * buffers is insufficient to characterize the demand for flushing them.
 */
static int bd_speedupreq;

/*
 * This lock synchronizes access to bd_request.
 */
static struct mtx bdlock;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningbufwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
 * waitrunningbufspace().
 */
static struct mtx rbreqlock;

/*
 * Synchronization (sleep/wakeup) variable for buffer requests.
 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
 * by and/or.
 * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
 * getnewbuf(), and getblk().
 */
static int needsbuffer;

/*
 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
 */
static struct mtx nblock;

/*
 * Definitions for the buffer free lists.
 */
#define BUFFER_QUEUES	6	/* number of free buffer queues */

#define QUEUE_NONE	0	/* on no queue */
#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
#define QUEUE_DIRTY_GIANT 3	/* B_DELWRI buffers that need giant */
#define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
#define QUEUE_EMPTY	5	/* empty buffer headers */
#define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };

/* Lock for the bufqueues */
static struct mtx bqlock;

/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;

#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */

#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
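/*
 * Descriptive note (added for clarity, not from the original sources):
 * this handler keeps the vfs.bufspace sysctl readable by old binaries that
 * expect an int.  When the value fits, it is handed back as an int; when it
 * does not, or the caller asked for a long, the long handler is used so an
 * oversized value produces ENOMEM rather than a silently truncated number.
 */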
static int
sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
	long lvalue;
	int ivalue;

	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
		return (sysctl_handle_long(oidp, arg1, arg2, req));
	lvalue = *(long *)arg1;
	if (lvalue > INT_MAX)
		/* On overflow, still write out a long to trigger ENOMEM. */
		return (sysctl_handle_long(oidp, &lvalue, 0, req));
	ivalue = lvalue;
	return (sysctl_handle_int(oidp, &ivalue, 0, req));
}
#endif

#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
/*
 *	numdirtywakeup:
 *
 *	If someone is blocked due to there being too many dirty buffers,
 *	and numdirtybuffers is now reasonable, wake them up.
 */

static __inline void
numdirtywakeup(int level)
{

	if (numdirtybuffers <= level) {
		mtx_lock(&nblock);
		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
			wakeup(&needsbuffer);
		}
		mtx_unlock(&nblock);
	}
}

/*
 *	bufspacewakeup:
 *
 *	Called when buffer space is potentially available for recovery.
 *	getnewbuf() will block on this flag when it is unable to free
 *	sufficient buffer space.  Buffer space becomes recoverable when
 *	bp's get placed back in the queues.
 */

static __inline void
bufspacewakeup(void)
{

	/*
	 * If someone is waiting for BUF space, wake them up.  Even
	 * though we haven't freed the kva space yet, the waiting
	 * process will be able to now.
	 */
	mtx_lock(&nblock);
	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
		wakeup(&needsbuffer);
	}
	mtx_unlock(&nblock);
}

/*
 * runningbufwakeup() - in-progress I/O accounting.
 *
 */
void
runningbufwakeup(struct buf *bp)
{

	if (bp->b_runningbufspace) {
		atomic_subtract_long(&runningbufspace, bp->b_runningbufspace);
		bp->b_runningbufspace = 0;
		mtx_lock(&rbreqlock);
		if (runningbufreq && runningbufspace <= lorunningspace) {
			runningbufreq = 0;
			wakeup(&runningbufreq);
		}
		mtx_unlock(&rbreqlock);
	}
}

/*
 *	bufcountwakeup:
 *
 *	Called when a buffer has been added to one of the free queues to
 *	account for the buffer and to wakeup anyone waiting for free buffers.
 *	This typically occurs when large amounts of metadata are being handled
 *	by the buffer cache ( else buffer space runs out first, usually ).
 */

static __inline void
bufcountwakeup(struct buf *bp)
{
	int old;

	KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
	    ("buf %p already counted as free", bp));
	bp->b_vflags |= BV_INFREECNT;
	old = atomic_fetchadd_int(&numfreebuffers, 1);
	KASSERT(old >= 0 && old < nbuf,
	    ("numfreebuffers climbed to %d", old + 1));
	mtx_lock(&nblock);
	if (needsbuffer) {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
		if (numfreebuffers >= hifreebuffers)
			needsbuffer &= ~VFS_BIO_NEED_FREE;
		wakeup(&needsbuffer);
	}
	mtx_unlock(&nblock);
}

/*
 *	waitrunningbufspace()
 *
 *	runningbufspace is a measure of the amount of I/O currently
 *	running.  This routine is used in async-write situations to
 *	prevent creating huge backups of pending writes to a device.
 *	Only asynchronous writes are governed by this function.
 *
 *	Reads will adjust runningbufspace, but will not block based on it.
 *	The read load has a side effect of reducing the allowed write load.
 *
 *	This does NOT turn an async write into a sync write.  It waits
 *	for earlier writes to complete and generally returns before the
 *	caller's write has reached the device.
 */
void
waitrunningbufspace(void)
{

	mtx_lock(&rbreqlock);
	while (runningbufspace > hirunningspace) {
		++runningbufreq;
		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
	}
	mtx_unlock(&rbreqlock);
}


/*
 *	vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline
void
vfs_buf_test_cache(struct buf *bp,
		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		  vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline
void
bd_wakeup(int dirtybuflevel)
{

	mtx_lock(&bdlock);
	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
		bd_request = 1;
		wakeup(&bd_request);
	}
	mtx_unlock(&bdlock);
}
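/*
 * Descriptive note (added): callers pass the dirty-buffer level at which the
 * daemon should be kicked; bdwrite() and bdirty() below use the midpoint
 * (lodirtybuffers + hidirtybuffers) / 2 so flushing starts well before
 * bwillwrite() has to stall writers.
 */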

/*
 * bd_speedup - speedup the buffer cache flushing code
 */

void
bd_speedup(void)
{
	int needwake;

	mtx_lock(&bdlock);
	needwake = 0;
	if (bd_speedupreq == 0 || bd_request == 0)
		needwake = 1;
	bd_speedupreq = 1;
	bd_request = 1;
	if (needwake)
		wakeup(&bd_request);
	mtx_unlock(&bdlock);
}

/*
 * Calculate buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
	int tuned_nbuf;
	long maxbuf;

	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
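	/*
	 * Illustrative sizing sketch (added, not part of the original
	 * sources): assuming BKVASIZE is 16 KiB, factor works out to
	 * 4 * 16384 / 1024 = 64.  A machine with roughly 512 MB usable
	 * (physmem_est ~= 524288 KB) would then get
	 *   nbuf = 50 + (65536 - 4096) / 64
	 *             + (524288 - 65536) * 2 / (64 * 5)
	 *        = 50 + 960 + 2867 ~= 3877 buffer headers,
	 * before the maxbcache and maxbuf clamps below are applied.
	 */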
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += (physmem_est - 65536) * 2 / (factor * 5);

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
		tuned_nbuf = 1;
	} else
		tuned_nbuf = 0;

	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
	maxbuf = (LONG_MAX / 3) / BKVASIZE;
	if (nbuf > maxbuf) {
		if (!tuned_nbuf)
			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
			    maxbuf);
		nbuf = maxbuf;
	}

	/*
	 * swbufs are used as temporary holders for I/O, such as paging I/O.
	 * We have no fewer than 16 and no more than 256.
	 */
	nswbuf = max(min(nbuf/4, 256), 16);
#ifdef NSWBUF_MIN
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;
#endif
#ifdef DIRECTIO
	ffs_rawread_setup();
#endif

	/*
	 * Reserve space for the buffer cache buffers
	 */
	swbuf = (void *)v;
	v = (caddr_t)(swbuf + nswbuf);
	buf = (void *)v;
	v = (caddr_t)(buf + nbuf);

	return(v);
}

/* Initialize the buffer subsystem.  Called before use of any buffers. */
void
bufinit(void)
{
	struct buf *bp;
	int i;

	mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vflags = BV_INFREECNT;	/* buf is counted as free */
		bp->b_xflags = 0;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
	 * this may result in KVM fragmentation which is not handled optimally
	 * by the system.
	 */
	maxbufspace = (long)nbuf * BKVASIZE;
	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
	lobufspace = hibufspace - MAXBSIZE;

	/*
	 * Note: The 16 MB upper limit for hirunningspace was chosen
	 * arbitrarily and may need further tuning. It corresponds to
	 * 128 outstanding write IO requests (if IO size is 128 KiB),
	 * which fits with many RAID controllers' tagged queuing limits.
	 * The lower 1 MB limit is the historical upper limit for
	 * hirunningspace.
	 */
	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
	    16 * 1024 * 1024), 1024 * 1024);
	lorunningspace = roundup(hirunningspace / 2, MAXBSIZE);
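	/*
	 * Illustrative numbers (added): continuing the nbuf ~= 3877 sketch
	 * with BKVASIZE = 16 KiB and MAXBSIZE = 64 KiB, maxbufspace is about
	 * 60 MiB, hibufspace is maxbufspace - 640 KiB (the larger candidate),
	 * and lobufspace trails hibufspace by 64 KiB.  hibufspace / 64 is
	 * just under 1 MiB, so hirunningspace is clamped up to the 1 MiB
	 * floor and lorunningspace ends up at 512 KiB.
	 */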

/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = hibufspace / 20;

/*
 * Reduce the chance of a deadlock occurring by limiting the number
 * of delayed-write dirty buffers we allow to stack up.
 */
	hidirtybuffers = nbuf / 4 + 20;
	dirtybufthresh = hidirtybuffers * 9 / 10;
	numdirtybuffers = 0;
/*
 * To support extreme low-memory systems, make sure hidirtybuffers cannot
 * eat up all available buffer space.  This occurs when our minimum cannot
 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
 * BKVASIZE'd (8K) buffers.
 */
	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
		hidirtybuffers >>= 1;
	}
	lodirtybuffers = hidirtybuffers / 2;

/*
 * Try to keep the number of free buffers in the specified range,
 * and give special processes (e.g. like buf_daemon) access to an
 * emergency reserve.
 */
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;
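	/*
	 * Illustrative numbers (added): with nbuf ~= 3877 this yields
	 * hidirtybuffers ~= 989, dirtybufthresh ~= 890, lodirtybuffers ~= 494,
	 * lofreebuffers ~= 220 and hifreebuffers ~= 440, so the daemon starts
	 * flushing around 500 dirty buffers and writers stall near 1000.
	 */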

	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
}

/*
 * bfreekva() - free the kva allocation for a buffer.
 *
 *	Since this call frees up buffer space, we call bufspacewakeup().
 */
static void
bfreekva(struct buf *bp)
{

	if (bp->b_kvasize) {
		atomic_add_int(&buffreekvacnt, 1);
		atomic_subtract_long(&bufspace, bp->b_kvasize);
		vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
		bp->b_kvasize = 0;
		bufspacewakeup();
	}
}

/*
 *	bremfree:
 *
 *	Mark the buffer for removal from the appropriate free list in brelse.
 *
 */
void
bremfree(struct buf *bp)
{
	int old;

	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT((bp->b_flags & B_REMFREE) == 0,
	    ("bremfree: buffer %p already marked for delayed removal.", bp));
	KASSERT(bp->b_qindex != QUEUE_NONE,
	    ("bremfree: buffer %p not on a queue.", bp));
	BUF_ASSERT_HELD(bp);

	bp->b_flags |= B_REMFREE;
	/* Fixup numfreebuffers count.  */
	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
		KASSERT((bp->b_vflags & BV_INFREECNT) != 0,
		    ("buf %p not counted in numfreebuffers", bp));
		bp->b_vflags &= ~BV_INFREECNT;
		old = atomic_fetchadd_int(&numfreebuffers, -1);
		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
	}
}

/*
 *	bremfreef:
 *
 *	Force an immediate removal from a free list.  Used only in nfs when
 *	it abuses the b_freelist pointer.
 */
void
bremfreef(struct buf *bp)
{
	mtx_lock(&bqlock);
	bremfreel(bp);
	mtx_unlock(&bqlock);
}

/*
 *	bremfreel:
 *
 *	Removes a buffer from the free list, must be called with the
 *	bqlock held.
 */
static void
bremfreel(struct buf *bp)
{
	int old;

	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_qindex != QUEUE_NONE,
	    ("bremfreel: buffer %p not on a queue.", bp));
	BUF_ASSERT_HELD(bp);
	mtx_assert(&bqlock, MA_OWNED);

	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
	bp->b_qindex = QUEUE_NONE;
	/*
	 * If this was a delayed bremfree() we only need to remove the buffer
	 * from the queue and return; the stats are already done.
	 */
	if (bp->b_flags & B_REMFREE) {
		bp->b_flags &= ~B_REMFREE;
		return;
	}
	/*
	 * Fixup numfreebuffers count.  If the buffer is invalid or not
	 * delayed-write, the buffer was free and we must decrement
	 * numfreebuffers.
	 */
	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
		KASSERT((bp->b_vflags & BV_INFREECNT) != 0,
		    ("buf %p not counted in numfreebuffers", bp));
		bp->b_vflags &= ~BV_INFREECNT;
		old = atomic_fetchadd_int(&numfreebuffers, -1);
		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
	}
}


/*
 * Get a buffer with the specified data.  Look in the cache first.  We
 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
 * is set, the buffer is valid and we do not have to do anything ( see
 * getblk() ).  This is really just a special case of breadn().
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf **bpp)
{

	return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp));
}
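
/*
 * Usage sketch (added, not from the original sources): a filesystem
 * typically reads a metadata block and releases it when done, e.g.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp);
 *
 * lblkno and bsize above are placeholders; see buf(9) for the full contract.
 */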

/*
 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
 * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
 * the buffer is valid and we do not have to do anything.
 */
void
breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
    int cnt, struct ucred * cred)
{
	struct buf *rabp;
	int i;

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (!TD_IS_IDLETHREAD(curthread))
				curthread->td_ru.ru_inblock++;
			rabp->b_flags |= B_ASYNC;
			rabp->b_flags &= ~B_INVAL;
			rabp->b_ioflags &= ~BIO_ERROR;
			rabp->b_iocmd = BIO_READ;
			if (rabp->b_rcred == NOCRED && cred != NOCRED)
				rabp->b_rcred = crhold(cred);
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			rabp->b_iooffset = dbtob(rabp->b_blkno);
			bstrategy(rabp);
		} else {
			brelse(rabp);
		}
	}
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf **bpp)
{
	struct buf *bp;
	int rv = 0, readwait = 0;

	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
	*bpp = bp = getblk(vp, blkno, size, 0, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (!TD_IS_IDLETHREAD(curthread))
			curthread->td_ru.ru_inblock++;
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if (bp->b_rcred == NOCRED && cred != NOCRED)
			bp->b_rcred = crhold(cred);
		vfs_busy_pages(bp, 0);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
		++readwait;
	}

	breada(vp, rablkno, rabsize, cnt, cred);

	if (readwait) {
		rv = bufwait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 */
int
bufwrite(struct buf *bp)
{
	int oldflags;
	struct vnode *vp;
	int vp_md;

	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	oldflags = bp->b_flags;

	BUF_ASSERT_HELD(bp);

	if (bp->b_pin_count > 0)
		bunpin_wait(bp);

	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
	    ("FFS background buffer should not get here %p", bp));

	vp = bp->b_vp;
	if (vp)
		vp_md = vp->v_vflag & VV_MD;
	else
		vp_md = 0;

	/* Mark the buffer clean */
	bundirty(bp);

	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_flags |= B_CACHE;
	bp->b_iocmd = BIO_WRITE;

	bufobj_wref(bp->b_bufobj);
	vfs_busy_pages(bp, 1);

	/*
	 * Normal bwrites pipeline writes
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	atomic_add_long(&runningbufspace, bp->b_runningbufspace);

	if (!TD_IS_IDLETHREAD(curthread))
		curthread->td_ru.ru_oublock++;
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	bp->b_iooffset = dbtob(bp->b_blkno);
	bstrategy(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = bufwait(bp);
		brelse(bp);
		return (rtval);
	} else {
		/*
		 * don't allow the async write to saturate the I/O
		 * system.  We will not deadlock here because
		 * we are blocking waiting for I/O that is already in-progress
		 * to complete. We do not block here if it is the update
		 * or syncer daemon trying to clean up as that can lead
		 * to deadlock.
		 */
		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
			waitrunningbufspace();
	}

	return (0);
}

void
bufbdflush(struct bufobj *bo, struct buf *bp)
{
	struct buf *nbp;

	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
		altbufferflushes++;
	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
		BO_LOCK(bo);
		/*
		 * Try to find a buffer to flush.
		 */
		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
			    BUF_LOCK(nbp,
				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
				continue;
			if (bp == nbp)
				panic("bdwrite: found ourselves");
			BO_UNLOCK(bo);
			/* Don't countdeps with the bo lock held. */
			if (buf_countdeps(nbp, 0)) {
				BO_LOCK(bo);
				BUF_UNLOCK(nbp);
				continue;
			}
			if (nbp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(nbp);
			} else {
				bremfree(nbp);
				bawrite(nbp);
			}
			dirtybufferflushes++;
			break;
		}
		if (nbp == NULL)
			BO_UNLOCK(bo);
	}
}

/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
void
bdwrite(struct buf *bp)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct bufobj *bo;

	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	BUF_ASSERT_HELD(bp);

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	/*
	 * If we have too many dirty buffers, don't create any more.
	 * If we are wildly over our limit, then force a complete
	 * cleanup. Otherwise, just keep the situation from getting
	 * out of control. Note that we have to avoid a recursive
	 * disaster and not try to clean up after our own cleanup!
	 */
	vp = bp->b_vp;
	bo = bp->b_bufobj;
	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
		td->td_pflags |= TDP_INBDFLUSH;
		BO_BDFLUSH(bo, bp);
		td->td_pflags &= ~TDP_INBDFLUSH;
	} else
		recursiveflushes++;

	bdirty(bp);
	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other datastructure
	 * that the filesystem needs is still in memory now, it is a good
	 * thing to do this.  Note also, that if the pageout daemon is
	 * requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty
	 * pages.
	 *
	 * Mark the buffer pages as clean.  We need to do this here to
	 * satisfy the vnode_pager and the pageout daemon, so that it
	 * thinks that the pages have been "cleaned".  Note that since
	 * the pages are in a delayed write buffer -- the VFS layer
	 * "will" see that the pages get written out on the next sync,
	 * or perhaps the cluster will be completed.
	 */
	vfs_clean_pages_dirty_buf(bp);
	bqrelse(bp);

	/*
	 * Wakeup the buffer flushing daemon if we have a lot of dirty
	 * buffers (midpoint between our recovery point and our stall
	 * point).
	 */
	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}
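
/*
 * Usage sketch (added, not from the original sources): metadata updates that
 * can tolerate a delayed write typically look like
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify bp->b_data ...
 *	bdwrite(bp);
 *
 * whereas callers that need the data on stable storage before proceeding
 * use bwrite() instead.  lblkno and bsize are placeholders.
 */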

/*
 *	bdirty:
 *
 *	Turn buffer into delayed write request.  We must clear BIO_READ and
 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
 *	itself to properly update it in the dirty/clean lists.  We mark it
 *	B_DONE to ensure that any asynchronization of the buffer properly
 *	clears B_DONE ( else a panic will occur later ).
 *
 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
 *	should only be called if the buffer is known-good.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	The buffer must be on QUEUE_NONE.
 */
void
bdirty(struct buf *bp)
{

	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
	BUF_ASSERT_HELD(bp);
	bp->b_flags &= ~(B_RELBUF);
	bp->b_iocmd = BIO_WRITE;

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
		reassignbuf(bp);
		atomic_add_int(&numdirtybuffers, 1);
		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
	}
}

/*
 *	bundirty:
 *
 *	Clear B_DELWRI for buffer.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	The buffer must be on QUEUE_NONE.
 */

void
bundirty(struct buf *bp)
{

	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
	BUF_ASSERT_HELD(bp);

	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;
		reassignbuf(bp);
		atomic_subtract_int(&numdirtybuffers, 1);
		numdirtywakeup(lodirtybuffers);
	}
	/*
	 * Since it is now being written, we can clear its deferred write flag.
	 */
	bp->b_flags &= ~B_DEFERRED;
}

/*
 *	bawrite:
 *
 *	Asynchronous write.  Start output on a buffer, but do not wait for
 *	it to complete.  The buffer is released when the output completes.
 *
 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
 *	B_INVAL buffers.  Not us.
 */
void
bawrite(struct buf *bp)
{

	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 *	bwillwrite:
 *
 *	Called prior to the locking of any vnodes when we are expecting to
 *	write.  We do not want to starve the buffer cache with too many
 *	dirty buffers so we block here.  By blocking prior to the locking
 *	of any vnodes we attempt to avoid the situation where a locked vnode
 *	prevents the various system daemons from flushing related buffers.
 */

void
bwillwrite(void)
{

	if (numdirtybuffers >= hidirtybuffers) {
		mtx_lock(&nblock);
		while (numdirtybuffers >= hidirtybuffers) {
			bd_wakeup(1);
			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
			msleep(&needsbuffer, &nblock,
			    (PRIBIO + 4), "flswai", 0);
		}
		mtx_unlock(&nblock);
	}
}
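
/*
 * Descriptive note (added): write paths call bwillwrite() before they lock
 * any vnodes (the write(2) path in vfs_vnops.c is the usual example), so a
 * throttled writer sleeps here holding no vnode locks while the buf daemon
 * catches up.
 */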
1195
1196/*
1197 * Return true if we have too many dirty buffers.
1198 */
1199int
1200buf_dirty_count_severe(void)
1201{
1202
1203	return(numdirtybuffers >= hidirtybuffers);
1204}
1205
1206static __noinline int
1207buf_vm_page_count_severe(void)
1208{
1209
1210	KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
1211
1212	return vm_page_count_severe();
1213}
1214
1215/*
1216 *	brelse:
1217 *
1218 *	Release a busy buffer and, if requested, free its resources.  The
1219 *	buffer will be stashed in the appropriate bufqueue[] allowing it
1220 *	to be accessed later as a cache entity or reused for other purposes.
1221 */
1222void
1223brelse(struct buf *bp)
1224{
1225	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
1226	    bp, bp->b_vp, bp->b_flags);
1227	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1228	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1229
1230	if (bp->b_flags & B_MANAGED) {
1231		bqrelse(bp);
1232		return;
1233	}
1234
1235	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
1236	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
1237		/*
1238		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
1239		 * pages from being scrapped.  If the error is anything
1240		 * other than an I/O error (EIO), assume that retrying
1241		 * is futile.
1242		 */
1243		bp->b_ioflags &= ~BIO_ERROR;
1244		bdirty(bp);
1245	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
1246	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
1247		/*
1248		 * Either a failed I/O or we were asked to free or not
1249		 * cache the buffer.
1250		 */
1251		bp->b_flags |= B_INVAL;
1252		if (!LIST_EMPTY(&bp->b_dep))
1253			buf_deallocate(bp);
1254		if (bp->b_flags & B_DELWRI) {
1255			atomic_subtract_int(&numdirtybuffers, 1);
1256			numdirtywakeup(lodirtybuffers);
1257		}
1258		bp->b_flags &= ~(B_DELWRI | B_CACHE);
1259		if ((bp->b_flags & B_VMIO) == 0) {
1260			if (bp->b_bufsize)
1261				allocbuf(bp, 0);
1262			if (bp->b_vp)
1263				brelvp(bp);
1264		}
1265	}
1266
1267	/*
1268	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
1269	 * is called with B_DELWRI set, the underlying pages may wind up
1270	 * getting freed causing a previous write (bdwrite()) to get 'lost'
1271	 * because pages associated with a B_DELWRI bp are marked clean.
1272	 *
1273	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
1274	 * if B_DELWRI is set.
1275	 *
1276	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
1277	 * on pages to return pages to the VM page queues.
1278	 */
1279	if (bp->b_flags & B_DELWRI)
1280		bp->b_flags &= ~B_RELBUF;
1281	else if (buf_vm_page_count_severe()) {
1282		/*
1283		 * The locking of the BO_LOCK is not necessary since
1284		 * BKGRDINPROG cannot be set while we hold the buf
1285		 * lock, it can only be cleared if it is already
1286		 * pending.
1287		 */
1288		if (bp->b_vp) {
1289			if (!(bp->b_vflags & BV_BKGRDINPROG))
1290				bp->b_flags |= B_RELBUF;
1291		} else
1292			bp->b_flags |= B_RELBUF;
1293	}
1294
1295	/*
1296	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
1297	 * constituted, not even NFS buffers now.  Two flags effect this.  If
1298	 * B_INVAL, the struct buf is invalidated but the VM object is kept
1299	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
1300	 *
1301	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
1302	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
1303	 * buffer is also B_INVAL because it hits the re-dirtying code above.
1304	 *
1305	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
1306	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
1307	 * the commit state and we cannot afford to lose the buffer. If the
1308	 * buffer has a background write in progress, we need to keep it
1309	 * around to prevent it from being reconstituted and starting a second
1310	 * background write.
1311	 */
1312	if ((bp->b_flags & B_VMIO)
1313	    && !(bp->b_vp->v_mount != NULL &&
1314		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
1315		 !vn_isdisk(bp->b_vp, NULL) &&
1316		 (bp->b_flags & B_DELWRI))
1317	    ) {
1318
1319		int i, j, resid;
1320		vm_page_t m;
1321		off_t foff;
1322		vm_pindex_t poff;
1323		vm_object_t obj;
1324
1325		obj = bp->b_bufobj->bo_object;
1326
1327		/*
1328		 * Get the base offset and length of the buffer.  Note that
1329		 * in the VMIO case if the buffer block size is not
1330		 * page-aligned, then the b_data pointer may not be page-aligned.
1331		 * But our b_pages[] array *IS* page aligned.
1332		 *
1333		 * block sizes less than DEV_BSIZE (usually 512) are not
1334		 * supported due to the page granularity bits (m->valid,
1335		 * m->dirty, etc...).
1336		 *
1337		 * See man buf(9) for more information
1338		 */
1339		resid = bp->b_bufsize;
1340		foff = bp->b_offset;
1341		VM_OBJECT_LOCK(obj);
1342		for (i = 0; i < bp->b_npages; i++) {
1343			int had_bogus = 0;
1344
1345			m = bp->b_pages[i];
1346
1347			/*
1348			 * If we hit a bogus page, fixup *all* the bogus pages
1349			 * now.
1350			 */
1351			if (m == bogus_page) {
1352				poff = OFF_TO_IDX(bp->b_offset);
1353				had_bogus = 1;
1354
1355				for (j = i; j < bp->b_npages; j++) {
1356					vm_page_t mtmp;
1357					mtmp = bp->b_pages[j];
1358					if (mtmp == bogus_page) {
1359						mtmp = vm_page_lookup(obj, poff + j);
1360						if (!mtmp) {
1361							panic("brelse: page missing\n");
1362						}
1363						bp->b_pages[j] = mtmp;
1364					}
1365				}
1366
1367				if ((bp->b_flags & B_INVAL) == 0) {
1368					pmap_qenter(
1369					    trunc_page((vm_offset_t)bp->b_data),
1370					    bp->b_pages, bp->b_npages);
1371				}
1372				m = bp->b_pages[i];
1373			}
1374			if ((bp->b_flags & B_NOCACHE) ||
1375			    (bp->b_ioflags & BIO_ERROR &&
1376			     bp->b_iocmd == BIO_READ)) {
1377				int poffset = foff & PAGE_MASK;
1378				int presid = resid > (PAGE_SIZE - poffset) ?
1379					(PAGE_SIZE - poffset) : resid;
1380
1381				KASSERT(presid >= 0, ("brelse: extra page"));
1382				vm_page_set_invalid(m, poffset, presid);
1383				if (had_bogus)
1384					printf("avoided corruption bug in bogus_page/brelse code\n");
1385			}
1386			resid -= PAGE_SIZE - (foff & PAGE_MASK);
1387			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1388		}
1389		VM_OBJECT_UNLOCK(obj);
1390		if (bp->b_flags & (B_INVAL | B_RELBUF))
1391			vfs_vmio_release(bp);
1392
1393	} else if (bp->b_flags & B_VMIO) {
1394
1395		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1396			vfs_vmio_release(bp);
1397		}
1398
1399	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
1400		if (bp->b_bufsize != 0)
1401			allocbuf(bp, 0);
1402		if (bp->b_vp != NULL)
1403			brelvp(bp);
1404	}
1405
1406	if (BUF_LOCKRECURSED(bp)) {
1407		/* do not release to free list */
1408		BUF_UNLOCK(bp);
1409		return;
1410	}
1411
1412	/* enqueue */
1413	mtx_lock(&bqlock);
1414	/* Handle delayed bremfree() processing. */
1415	if (bp->b_flags & B_REMFREE)
1416		bremfreel(bp);
1417	if (bp->b_qindex != QUEUE_NONE)
1418		panic("brelse: free buffer onto another queue???");
1419
1420	/*
1421	 * If the buffer has junk contents, signal it and eventually
1422	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
1423	 * doesn't find it.
1424	 */
1425	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
1426	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
1427		bp->b_flags |= B_INVAL;
1428	if (bp->b_flags & B_INVAL) {
1429		if (bp->b_flags & B_DELWRI)
1430			bundirty(bp);
1431		if (bp->b_vp)
1432			brelvp(bp);
1433	}
1434
1435	/* buffers with no memory */
1436	if (bp->b_bufsize == 0) {
1437		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1438		if (bp->b_vflags & BV_BKGRDINPROG)
1439			panic("losing buffer 1");
1440		if (bp->b_kvasize) {
1441			bp->b_qindex = QUEUE_EMPTYKVA;
1442		} else {
1443			bp->b_qindex = QUEUE_EMPTY;
1444		}
1445		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1446	/* buffers with junk contents */
1447	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1448	    (bp->b_ioflags & BIO_ERROR)) {
1449		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1450		if (bp->b_vflags & BV_BKGRDINPROG)
1451			panic("losing buffer 2");
1452		bp->b_qindex = QUEUE_CLEAN;
1453		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1454	/* remaining buffers */
1455	} else {
1456		if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) ==
1457		    (B_DELWRI|B_NEEDSGIANT))
1458			bp->b_qindex = QUEUE_DIRTY_GIANT;
1459		else if (bp->b_flags & B_DELWRI)
1460			bp->b_qindex = QUEUE_DIRTY;
1461		else
1462			bp->b_qindex = QUEUE_CLEAN;
1463		if (bp->b_flags & B_AGE)
1464			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1465		else
1466			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1467	}
1468	mtx_unlock(&bqlock);
1469
1470	/*
1471	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
1472	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
1473	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
1474	 * if B_INVAL is set ).
1475	 */
1476
1477	if (!(bp->b_flags & B_DELWRI))
1478		bufcountwakeup(bp);
1479
1480	/*
1481	 * Something we can maybe free or reuse
1482	 */
1483	if (bp->b_bufsize || bp->b_kvasize)
1484		bufspacewakeup();
1485
1486	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1487	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1488		panic("brelse: not dirty");
1489	/* unlock */
1490	BUF_UNLOCK(bp);
1491}
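
/*
 * Illustrative sketch of the common calling pattern (not part of the
 * cache logic itself; "vp", "lbn", "bsize" and the discard decision are
 * assumed to come from the caller): a reader obtains a buffer with
 * bread(), uses it, and hands it back with brelse().  Setting B_INVAL
 * and B_NOCACHE beforehand asks brelse() to throw the contents away
 * rather than cache them.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use bp->b_data...
 *	if (discard)
 *		bp->b_flags |= B_INVAL | B_NOCACHE;
 *	brelse(bp);
 */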
1492
1493/*
1494 * Release a buffer back to the appropriate queue but do not try to free
1495 * it.  The buffer is expected to be used again soon.
1496 *
1497 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1498 * biodone() to requeue an async I/O on completion.  It is also used when
1499 * known good buffers need to be requeued but we think we may need the data
1500 * again soon.
1501 *
1502 * XXX we should be able to leave the B_RELBUF hint set on completion.
1503 */
1504void
1505bqrelse(struct buf *bp)
1506{
1507	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1508	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1509	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1510
1511	if (BUF_LOCKRECURSED(bp)) {
1512		/* do not release to free list */
1513		BUF_UNLOCK(bp);
1514		return;
1515	}
1516
1517	if (bp->b_flags & B_MANAGED) {
1518		if (bp->b_flags & B_REMFREE) {
1519			mtx_lock(&bqlock);
1520			bremfreel(bp);
1521			mtx_unlock(&bqlock);
1522		}
1523		bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1524		BUF_UNLOCK(bp);
1525		return;
1526	}
1527
1528	mtx_lock(&bqlock);
1529	/* Handle delayed bremfree() processing. */
1530	if (bp->b_flags & B_REMFREE)
1531		bremfreel(bp);
1532	if (bp->b_qindex != QUEUE_NONE)
1533		panic("bqrelse: free buffer onto another queue???");
1534	/* buffers with stale but valid contents */
1535	if (bp->b_flags & B_DELWRI) {
1536		if (bp->b_flags & B_NEEDSGIANT)
1537			bp->b_qindex = QUEUE_DIRTY_GIANT;
1538		else
1539			bp->b_qindex = QUEUE_DIRTY;
1540		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1541	} else {
1542		/*
1543		 * The locking of the BO_LOCK for checking of the
1544		 * BV_BKGRDINPROG is not necessary since the
1545		 * BV_BKGRDINPROG cannot be set while we hold the buf
1546		 * lock; it can only be cleared if it is already
1547		 * pending.
1548		 */
1549		if (!buf_vm_page_count_severe() || (bp->b_vflags & BV_BKGRDINPROG)) {
1550			bp->b_qindex = QUEUE_CLEAN;
1551			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
1552			    b_freelist);
1553		} else {
1554			/*
1555			 * We are too low on memory, we have to try to free
1556			 * the buffer (most importantly: the wired pages
1557			 * making up its backing store) *now*.
1558			 */
1559			mtx_unlock(&bqlock);
1560			brelse(bp);
1561			return;
1562		}
1563	}
1564	mtx_unlock(&bqlock);
1565
1566	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
1567		bufcountwakeup(bp);
1568
1569	/*
1570	 * Something we can maybe free or reuse.
1571	 */
1572	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1573		bufspacewakeup();
1574
1575	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1576	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1577		panic("bqrelse: not dirty");
1578	/* unlock */
1579	BUF_UNLOCK(bp);
1580}
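
/*
 * Sketch of the delayed-write path that ends up here (a simplified
 * view; "vp", "lbn" and "bsize" are assumed): bdwrite() marks the
 * buffer B_DELWRI and requeues it through bqrelse(), so the dirty data
 * stays cached until the buf daemon or a sync writes it out.
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0, 0);
 *	...modify bp->b_data...
 *	bdwrite(bp);
 */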
1581
1582/* Give pages used by the bp back to the VM system (where possible) */
1583static void
1584vfs_vmio_release(struct buf *bp)
1585{
1586	int i;
1587	vm_page_t m;
1588
1589	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
1590	for (i = 0; i < bp->b_npages; i++) {
1591		m = bp->b_pages[i];
1592		bp->b_pages[i] = NULL;
1593		/*
1594		 * In order to keep page LRU ordering consistent, put
1595		 * everything on the inactive queue.
1596		 */
1597		vm_page_lock(m);
1598		vm_page_unwire(m, 0);
1599		/*
1600		 * We don't mess with busy pages, it is
1601		 * the responsibility of the process that
1602		 * busied the pages to deal with them.
1603		 */
1604		if ((m->oflags & VPO_BUSY) == 0 && m->busy == 0 &&
1605		    m->wire_count == 0) {
1606			/*
1607			 * Might as well free the page if we can and it has
1608			 * no valid data.  We also free the page if the
1609			 * buffer was used for direct I/O
1610			 */
1611			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
1612			    m->hold_count == 0) {
1613				vm_page_free(m);
1614			} else if (bp->b_flags & B_DIRECT) {
1615				vm_page_try_to_free(m);
1616			} else if (buf_vm_page_count_severe()) {
1617				vm_page_try_to_cache(m);
1618			}
1619		}
1620		vm_page_unlock(m);
1621	}
1622	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
1623	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
1624
1625	if (bp->b_bufsize) {
1626		bufspacewakeup();
1627		bp->b_bufsize = 0;
1628	}
1629	bp->b_npages = 0;
1630	bp->b_flags &= ~B_VMIO;
1631	if (bp->b_vp)
1632		brelvp(bp);
1633}
1634
1635/*
1636 * Check to see if a block at a particular lbn is available for a clustered
1637 * write.
1638 */
1639static int
1640vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
1641{
1642	struct buf *bpa;
1643	int match;
1644
1645	match = 0;
1646
1647	/* If the buf isn't in core skip it */
1648	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
1649		return (0);
1650
1651	/* If the buf is busy we don't want to wait for it */
1652	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1653		return (0);
1654
1655	/* Only cluster with valid clusterable delayed write buffers */
1656	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
1657	    (B_DELWRI | B_CLUSTEROK))
1658		goto done;
1659
1660	if (bpa->b_bufsize != size)
1661		goto done;
1662
1663	/*
1664	 * Check to see if it is in the expected place on disk and that the
1665	 * block has been mapped.
1666	 */
1667	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
1668		match = 1;
1669done:
1670	BUF_UNLOCK(bpa);
1671	return (match);
1672}
1673
1674/*
1675 *	vfs_bio_awrite:
1676 *
1677 *	Implement clustered async writes for clearing out B_DELWRI buffers.
1678 *	This is much better than the old way of writing only one buffer at
1679 *	a time.  Note that we may not be presented with the buffers in the
1680 *	correct order, so we search for the cluster in both directions.
1681 */
1682int
1683vfs_bio_awrite(struct buf *bp)
1684{
1685	struct bufobj *bo;
1686	int i;
1687	int j;
1688	daddr_t lblkno = bp->b_lblkno;
1689	struct vnode *vp = bp->b_vp;
1690	int ncl;
1691	int nwritten;
1692	int size;
1693	int maxcl;
1694
1695	bo = &vp->v_bufobj;
1696	/*
1697	 * right now we support clustered writing only to regular files.  If
1698	 * we find a clusterable block we could be in the middle of a cluster
1699	 * rather than at the beginning.
1700	 */
1701	if ((vp->v_type == VREG) &&
1702	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1703	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1704
1705		size = vp->v_mount->mnt_stat.f_iosize;
1706		maxcl = MAXPHYS / size;
1707
1708		BO_LOCK(bo);
1709		for (i = 1; i < maxcl; i++)
1710			if (vfs_bio_clcheck(vp, size, lblkno + i,
1711			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
1712				break;
1713
1714		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
1715			if (vfs_bio_clcheck(vp, size, lblkno - j,
1716			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
1717				break;
1718		BO_UNLOCK(bo);
1719		--j;
1720		ncl = i + j;
1721		/*
1722		 * this is a possible cluster write
1723		 */
1724		if (ncl != 1) {
1725			BUF_UNLOCK(bp);
1726			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
1727			return nwritten;
1728		}
1729	}
1730	bremfree(bp);
1731	bp->b_flags |= B_ASYNC;
1732	/*
1733	 * default (old) behavior, writing out only one block
1734	 *
1735	 * XXX returns b_bufsize instead of b_bcount for nwritten?
1736	 */
1737	nwritten = bp->b_bufsize;
1738	(void) bwrite(bp);
1739
1740	return nwritten;
1741}
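
/*
 * Worked example of the cluster sizing above (the numbers are
 * assumptions): with an f_iosize of 16k and a MAXPHYS of 128k, maxcl is
 * 8.  If the forward scan accepts two blocks beyond this one (i == 3)
 * and the backward scan accepts one block before it (j == 1), then
 * ncl == 4 and cluster_wbuild() is asked to write four blocks starting
 * at lblkno - 1.
 */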
1742
1743/*
1744 *	getnewbuf:
1745 *
1746 *	Find and initialize a new buffer header, freeing up existing buffers
1747 *	in the bufqueues as necessary.  The new buffer is returned locked.
1748 *
1749 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
1750 *	buffer away, the caller must set B_INVAL prior to calling brelse().
1751 *
1752 *	We block if:
1753 *		We have insufficient buffer headers
1754 *		We have insufficient buffer space
1755 *		buffer_map is too fragmented ( space reservation fails )
1756 *		If we have to flush dirty buffers ( but we try to avoid this )
1757 *
1758 *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
1759 *	Instead we ask the buf daemon to do it for us.  We attempt to
1760 *	avoid piecemeal wakeups of the pageout daemon.
1761 */
1762
1763static struct buf *
1764getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
1765    int gbflags)
1766{
1767	struct thread *td;
1768	struct buf *bp;
1769	struct buf *nbp;
1770	int defrag = 0;
1771	int nqindex;
1772	static int flushingbufs;
1773
1774	td = curthread;
1775	/*
1776	 * We can't afford to block since we might be holding a vnode lock,
1777	 * which may prevent system daemons from running.  We deal with
1778	 * low-memory situations by proactively returning memory and running
1779	 * async I/O rather then sync I/O.
1780	 * async I/O rather than sync I/O.
1781	atomic_add_int(&getnewbufcalls, 1);
1782	atomic_subtract_int(&getnewbufrestarts, 1);
1783restart:
1784	atomic_add_int(&getnewbufrestarts, 1);
1785
1786	/*
1787	 * Setup for scan.  If we do not have enough free buffers,
1788	 * we setup a degenerate case that immediately fails.  Note
1789	 * that if we are a specially marked process, we are allowed to
1790	 * dip into our reserves.
1791	 *
1792	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
1793	 *
1794	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
1795	 * However, there are a number of cases (defragging, reusing, ...)
1796	 * where we cannot backup.
1797	 */
1798	mtx_lock(&bqlock);
1799	nqindex = QUEUE_EMPTYKVA;
1800	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
1801
1802	if (nbp == NULL) {
1803		/*
1804		 * If no EMPTYKVA buffers and we are either
1805		 * defragging or reusing, locate a CLEAN buffer
1806		 * to free or reuse.  If bufspace useage is low
1807		 * skip this step so we can allocate a new buffer.
1808		 * to free or reuse.  If bufspace usage is low
1809		if (defrag || bufspace >= lobufspace) {
1810			nqindex = QUEUE_CLEAN;
1811			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
1812		}
1813
1814		/*
1815		 * If we could not find or were not allowed to reuse a
1816		 * CLEAN buffer, check to see if it is ok to use an EMPTY
1817		 * buffer.  We can only use an EMPTY buffer if allocating
1818		 * its KVA would not otherwise run us out of buffer space.
1819		 */
1820		if (nbp == NULL && defrag == 0 &&
1821		    bufspace + maxsize < hibufspace) {
1822			nqindex = QUEUE_EMPTY;
1823			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1824		}
1825	}
1826
1827	/*
1828	 * Run scan, possibly freeing data and/or kva mappings on the fly
1829	 * depending.
1830	 */
1831
1832	while ((bp = nbp) != NULL) {
1833		int qindex = nqindex;
1834
1835		/*
1836		 * Calculate next bp ( we can only use it if we do not block
1837		 * or do other fancy things ).
1838		 */
1839		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
1840			switch(qindex) {
1841			case QUEUE_EMPTY:
1842				nqindex = QUEUE_EMPTYKVA;
1843				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
1844					break;
1845				/* FALLTHROUGH */
1846			case QUEUE_EMPTYKVA:
1847				nqindex = QUEUE_CLEAN;
1848				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
1849					break;
1850				/* FALLTHROUGH */
1851			case QUEUE_CLEAN:
1852				/*
1853				 * nbp is NULL.
1854				 */
1855				break;
1856			}
1857		}
1858		/*
1859		 * If we are defragging then we need a buffer with
1860		 * b_kvasize != 0.  XXX this situation should no longer
1861		 * occur, if defrag is non-zero the buffer's b_kvasize
1862		 * occur: if defrag is non-zero, the buffer's b_kvasize
1863		 */
1864		if (defrag && bp->b_kvasize == 0) {
1865			printf("Warning: defrag empty buffer %p\n", bp);
1866			continue;
1867		}
1868
1869		/*
1870		 * Start freeing the bp.  This is somewhat involved.  nbp
1871		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
1872		 */
1873		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1874			continue;
1875		if (bp->b_vp) {
1876			BO_LOCK(bp->b_bufobj);
1877			if (bp->b_vflags & BV_BKGRDINPROG) {
1878				BO_UNLOCK(bp->b_bufobj);
1879				BUF_UNLOCK(bp);
1880				continue;
1881			}
1882			BO_UNLOCK(bp->b_bufobj);
1883		}
1884		CTR6(KTR_BUF,
1885		    "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
1886		    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
1887		    bp->b_kvasize, bp->b_bufsize, qindex);
1888
1889		/*
1890		 * Sanity Checks
1891		 */
1892		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
1893
1894		/*
1895		 * Note: we no longer distinguish between VMIO and non-VMIO
1896		 * buffers.
1897		 */
1898
1899		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
1900
1901		bremfreel(bp);
1902		mtx_unlock(&bqlock);
1903
1904		if (qindex == QUEUE_CLEAN) {
1905			if (bp->b_flags & B_VMIO) {
1906				bp->b_flags &= ~B_ASYNC;
1907				vfs_vmio_release(bp);
1908			}
1909			if (bp->b_vp)
1910				brelvp(bp);
1911		}
1912
1913		/*
1914		 * NOTE:  nbp is now entirely invalid.  We can only restart
1915		 * the scan from this point on.
1916		 *
1917		 * Get the rest of the buffer freed up.  b_kva* is still
1918		 * valid after this operation.
1919		 */
1920
1921		if (bp->b_rcred != NOCRED) {
1922			crfree(bp->b_rcred);
1923			bp->b_rcred = NOCRED;
1924		}
1925		if (bp->b_wcred != NOCRED) {
1926			crfree(bp->b_wcred);
1927			bp->b_wcred = NOCRED;
1928		}
1929		if (!LIST_EMPTY(&bp->b_dep))
1930			buf_deallocate(bp);
1931		if (bp->b_vflags & BV_BKGRDINPROG)
1932			panic("losing buffer 3");
1933		KASSERT(bp->b_vp == NULL,
1934		    ("bp: %p still has vnode %p.  qindex: %d",
1935		    bp, bp->b_vp, qindex));
1936		KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
1937		   ("bp: %p still on a buffer list. xflags %X",
1938		    bp, bp->b_xflags));
1939
1940		if (bp->b_bufsize)
1941			allocbuf(bp, 0);
1942
1943		bp->b_flags = 0;
1944		bp->b_ioflags = 0;
1945		bp->b_xflags = 0;
1946		KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
1947		    ("buf %p still counted as free?", bp));
1948		bp->b_vflags = 0;
1949		bp->b_vp = NULL;
1950		bp->b_blkno = bp->b_lblkno = 0;
1951		bp->b_offset = NOOFFSET;
1952		bp->b_iodone = 0;
1953		bp->b_error = 0;
1954		bp->b_resid = 0;
1955		bp->b_bcount = 0;
1956		bp->b_npages = 0;
1957		bp->b_dirtyoff = bp->b_dirtyend = 0;
1958		bp->b_bufobj = NULL;
1959		bp->b_pin_count = 0;
1960		bp->b_fsprivate1 = NULL;
1961		bp->b_fsprivate2 = NULL;
1962		bp->b_fsprivate3 = NULL;
1963
1964		LIST_INIT(&bp->b_dep);
1965
1966		/*
1967		 * If we are defragging then free the buffer.
1968		 */
1969		if (defrag) {
1970			bp->b_flags |= B_INVAL;
1971			bfreekva(bp);
1972			brelse(bp);
1973			defrag = 0;
1974			goto restart;
1975		}
1976
1977		/*
1978		 * Notify any waiters for the buffer lock about
1979		 * identity change by freeing the buffer.
1980		 */
1981		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
1982			bp->b_flags |= B_INVAL;
1983			bfreekva(bp);
1984			brelse(bp);
1985			goto restart;
1986		}
1987
1988		/*
1989		 * If we are overcommitted then recover the buffer and its
1990		 * KVM space.  This occurs in rare situations when multiple
1991		 * processes are blocked in getnewbuf() or allocbuf().
1992		 */
1993		if (bufspace >= hibufspace)
1994			flushingbufs = 1;
1995		if (flushingbufs && bp->b_kvasize != 0) {
1996			bp->b_flags |= B_INVAL;
1997			bfreekva(bp);
1998			brelse(bp);
1999			goto restart;
2000		}
2001		if (bufspace < lobufspace)
2002			flushingbufs = 0;
2003		break;
2004	}
2005
2006	/*
2007	 * If we exhausted our list, sleep as appropriate.  We may have to
2008	 * wakeup various daemons and write out some dirty buffers.
2009	 *
2010	 * Generally we are sleeping due to insufficient buffer space.
2011	 */
2012
2013	if (bp == NULL) {
2014		int flags, norunbuf;
2015		char *waitmsg;
2016		int fl;
2017
2018		if (defrag) {
2019			flags = VFS_BIO_NEED_BUFSPACE;
2020			waitmsg = "nbufkv";
2021		} else if (bufspace >= hibufspace) {
2022			waitmsg = "nbufbs";
2023			flags = VFS_BIO_NEED_BUFSPACE;
2024		} else {
2025			waitmsg = "newbuf";
2026			flags = VFS_BIO_NEED_ANY;
2027		}
2028		mtx_lock(&nblock);
2029		needsbuffer |= flags;
2030		mtx_unlock(&nblock);
2031		mtx_unlock(&bqlock);
2032
2033		bd_speedup();	/* heeeelp */
2034		if (gbflags & GB_NOWAIT_BD)
2035			return (NULL);
2036
2037		mtx_lock(&nblock);
2038		while (needsbuffer & flags) {
2039			if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
2040				mtx_unlock(&nblock);
2041				/*
2042				 * getblk() is called with the vnode
2043				 * locked, and a majority of the dirty
2044				 * buffers may well belong to that
2045				 * vnode.  Flushing those buffers here
2046				 * makes progress that cannot be
2047				 * achieved by the buf daemon, since
2048				 * the buf daemon cannot lock the
2049				 * vnode.
2050				 */
2051				norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
2052				    (td->td_pflags & TDP_NORUNNINGBUF);
2053				/* play bufdaemon */
2054				td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
2055				fl = buf_do_flush(vp);
2056				td->td_pflags &= norunbuf;
2057				mtx_lock(&nblock);
2058				if (fl != 0)
2059					continue;
2060				if ((needsbuffer & flags) == 0)
2061					break;
2062			}
2063			if (msleep(&needsbuffer, &nblock,
2064			    (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
2065				mtx_unlock(&nblock);
2066				return (NULL);
2067			}
2068		}
2069		mtx_unlock(&nblock);
2070	} else {
2071		/*
2072		 * We finally have a valid bp.  We aren't quite out of the
2073		 * woods; we still have to reserve kva space.  In order
2074		 * to keep fragmentation sane we only allocate kva in
2075		 * BKVASIZE chunks.
2076		 */
2077		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
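		/*
		 * Rounding example (assuming the usual BKVASIZE of 16384):
		 * a maxsize of 20000 becomes (20000 + 16383) & ~16383 ==
		 * 32768, i.e. two BKVASIZE chunks, so the buffer map only
		 * ever holds a few distinct allocation sizes.
		 */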
2078
2079		if (maxsize != bp->b_kvasize) {
2080			vm_offset_t addr = 0;
2081
2082			bfreekva(bp);
2083
2084			vm_map_lock(buffer_map);
2085			if (vm_map_findspace(buffer_map,
2086				vm_map_min(buffer_map), maxsize, &addr)) {
2087				/*
2088				 * Uh oh.  Buffer map is too fragmented.  We
2089				 * must defragment the map.
2090				 */
2091				atomic_add_int(&bufdefragcnt, 1);
2092				vm_map_unlock(buffer_map);
2093				defrag = 1;
2094				bp->b_flags |= B_INVAL;
2095				brelse(bp);
2096				goto restart;
2097			}
2098			if (addr) {
2099				vm_map_insert(buffer_map, NULL, 0,
2100					addr, addr + maxsize,
2101					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
2102
2103				bp->b_kvabase = (caddr_t) addr;
2104				bp->b_kvasize = maxsize;
2105				atomic_add_long(&bufspace, bp->b_kvasize);
2106				atomic_add_int(&bufreusecnt, 1);
2107			}
2108			vm_map_unlock(buffer_map);
2109		}
2110		bp->b_saveaddr = bp->b_kvabase;
2111		bp->b_data = bp->b_saveaddr;
2112	}
2113	return(bp);
2114}
2115
2116/*
2117 *	buf_daemon:
2118 *
2119 *	buffer flushing daemon.  Buffers are normally flushed by the
2120 *	update daemon but if it cannot keep up this process starts to
2121 *	take the load in an attempt to prevent getnewbuf() from blocking.
2122 */
2123
2124static struct kproc_desc buf_kp = {
2125	"bufdaemon",
2126	buf_daemon,
2127	&bufdaemonproc
2128};
2129SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2130
2131static int
2132buf_do_flush(struct vnode *vp)
2133{
2134	int flushed;
2135
2136	flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
2137	/* The list empty check here is slightly racy */
2138	if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
2139		mtx_lock(&Giant);
2140		flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
2141		mtx_unlock(&Giant);
2142	}
2143	if (flushed == 0) {
2144		/*
2145		 * Could not find any buffers without rollback
2146		 * dependencies, so just write the first one
2147		 * in the hopes of eventually making progress.
2148		 */
2149		flushbufqueues(vp, QUEUE_DIRTY, 1);
2150		if (!TAILQ_EMPTY(
2151			    &bufqueues[QUEUE_DIRTY_GIANT])) {
2152			mtx_lock(&Giant);
2153			flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
2154			mtx_unlock(&Giant);
2155		}
2156	}
2157	return (flushed);
2158}
2159
2160static void
2161buf_daemon(void)
2162{
2163	int lodirtysave;
2164
2165	/*
2166	 * This process needs to be suspended prior to shutdown sync.
2167	 */
2168	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2169	    SHUTDOWN_PRI_LAST);
2170
2171	/*
2172	 * This process is allowed to take the buffer cache to the limit
2173	 */
2174	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2175	mtx_lock(&bdlock);
2176	for (;;) {
2177		bd_request = 0;
2178		mtx_unlock(&bdlock);
2179
2180		kproc_suspend_check(bufdaemonproc);
2181		lodirtysave = lodirtybuffers;
2182		if (bd_speedupreq) {
2183			lodirtybuffers = numdirtybuffers / 2;
2184			bd_speedupreq = 0;
2185		}
2186		/*
2187		 * Do the flush.  Limit the amount of in-transit I/O we
2188		 * allow to build up, otherwise we would completely saturate
2189		 * the I/O system.  Wakeup any waiting processes before we
2190		 * normally would so they can run in parallel with our drain.
2191		 */
2192		while (numdirtybuffers > lodirtybuffers) {
2193			if (buf_do_flush(NULL) == 0)
2194				break;
2195			uio_yield();
2196		}
2197		lodirtybuffers = lodirtysave;
2198
2199		/*
2200		 * Only clear bd_request if we have reached our low water
2201		 * mark.  The buf_daemon normally waits 1 second and
2202		 * then incrementally flushes any dirty buffers that have
2203		 * built up, within reason.
2204		 *
2205		 * If we were unable to hit our low water mark and couldn't
2206		 * find any flushable buffers, we sleep for a tenth of a second.
2207		 * Otherwise we loop immediately.
2208		 */
2209		mtx_lock(&bdlock);
2210		if (numdirtybuffers <= lodirtybuffers) {
2211			/*
2212			 * We reached our low water mark, reset the
2213			 * request and sleep until we are needed again.
2214			 * The sleep is just so the suspend code works.
2215			 */
2216			bd_request = 0;
2217			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
2218		} else {
2219			/*
2220			 * We couldn't find any flushable dirty buffers but
2221			 * still have too many dirty buffers, we
2222			 * have to sleep and try again.  (rare)
2223			 */
2224			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
2225		}
2226	}
2227}
2228
2229/*
2230 *	flushbufqueues:
2231 *
2232 *	Try to flush a buffer in the dirty queue.  We must be careful to
2233 *	free up B_INVAL buffers instead of write them, which NFS is
2234 *	free up B_INVAL buffers instead of writing them, which NFS is
2235 */
2236static int flushwithdeps = 0;
2237SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
2238    0, "Number of buffers flushed with dependencies that require rollbacks");
2239
2240static int
2241flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
2242{
2243	struct buf *sentinel;
2244	struct vnode *vp;
2245	struct mount *mp;
2246	struct buf *bp;
2247	int hasdeps;
2248	int flushed;
2249	int target;
2250
2251	if (lvp == NULL) {
2252		target = numdirtybuffers - lodirtybuffers;
2253		if (flushdeps && target > 2)
2254			target /= 2;
2255	} else
2256		target = flushbufqtarget;
2257	flushed = 0;
2258	bp = NULL;
2259	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2260	sentinel->b_qindex = QUEUE_SENTINEL;
2261	mtx_lock(&bqlock);
2262	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2263	while (flushed != target) {
2264		bp = TAILQ_NEXT(sentinel, b_freelist);
2265		if (bp != NULL) {
2266			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2267			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2268			    b_freelist);
2269		} else
2270			break;
2271		/*
2272		 * Skip sentinels inserted by other invocations of the
2273		 * flushbufqueues(), taking care to not reorder them.
2274		 */
2275		if (bp->b_qindex == QUEUE_SENTINEL)
2276			continue;
2277		/*
2278		 * Only flush the buffers that belong to the
2279		 * vnode locked by the curthread.
2280		 */
2281		if (lvp != NULL && bp->b_vp != lvp)
2282			continue;
2283		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2284			continue;
2285		if (bp->b_pin_count > 0) {
2286			BUF_UNLOCK(bp);
2287			continue;
2288		}
2289		BO_LOCK(bp->b_bufobj);
2290		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
2291		    (bp->b_flags & B_DELWRI) == 0) {
2292			BO_UNLOCK(bp->b_bufobj);
2293			BUF_UNLOCK(bp);
2294			continue;
2295		}
2296		BO_UNLOCK(bp->b_bufobj);
2297		if (bp->b_flags & B_INVAL) {
2298			bremfreel(bp);
2299			mtx_unlock(&bqlock);
2300			brelse(bp);
2301			flushed++;
2302			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
2303			mtx_lock(&bqlock);
2304			continue;
2305		}
2306
2307		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
2308			if (flushdeps == 0) {
2309				BUF_UNLOCK(bp);
2310				continue;
2311			}
2312			hasdeps = 1;
2313		} else
2314			hasdeps = 0;
2315		/*
2316		 * We must hold the lock on a vnode before writing
2317		 * one of its buffers. Otherwise we may confuse, or
2318		 * in the case of a snapshot vnode, deadlock the
2319		 * system.
2320		 *
2321		 * The lock order here is the reverse of the normal
2322		 * of vnode followed by buf lock.  This is ok because
2323		 * the NOWAIT will prevent deadlock.
2324		 */
2325		vp = bp->b_vp;
2326		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2327			BUF_UNLOCK(bp);
2328			continue;
2329		}
2330		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
2331			mtx_unlock(&bqlock);
2332			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
2333			    bp, bp->b_vp, bp->b_flags);
2334			if (curproc == bufdaemonproc)
2335				vfs_bio_awrite(bp);
2336			else {
2337				bremfree(bp);
2338				bwrite(bp);
2339				notbufdflashes++;
2340			}
2341			vn_finished_write(mp);
2342			VOP_UNLOCK(vp, 0);
2343			flushwithdeps += hasdeps;
2344			flushed++;
2345
2346			/*
2347			 * Sleeping on runningbufspace while holding
2348			 * vnode lock leads to deadlock.
2349			 */
2350			if (curproc == bufdaemonproc)
2351				waitrunningbufspace();
2352			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
2353			mtx_lock(&bqlock);
2354			continue;
2355		}
2356		vn_finished_write(mp);
2357		BUF_UNLOCK(bp);
2358	}
2359	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2360	mtx_unlock(&bqlock);
2361	free(sentinel, M_TEMP);
2362	return (flushed);
2363}
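
/*
 * The sentinel walk used above, shown in isolation (a sketch assuming a
 * TAILQ protected by one mutex; the names "queue", "item" and "link"
 * are illustrative only):
 *
 *	TAILQ_INSERT_HEAD(&queue, &sentinel, link);
 *	while ((item = TAILQ_NEXT(&sentinel, link)) != NULL) {
 *		TAILQ_REMOVE(&queue, &sentinel, link);
 *		TAILQ_INSERT_AFTER(&queue, item, &sentinel, link);
 *		...drop the lock, work on item, retake the lock...
 *	}
 *	TAILQ_REMOVE(&queue, &sentinel, link);
 *
 * Because the sentinel always trails the element being processed, the
 * scan survives other threads inserting or removing queue entries while
 * the lock is dropped.
 */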
2364
2365/*
2366 * Check to see if a block is currently memory resident.
2367 */
2368struct buf *
2369incore(struct bufobj *bo, daddr_t blkno)
2370{
2371	struct buf *bp;
2372
2373	BO_LOCK(bo);
2374	bp = gbincore(bo, blkno);
2375	BO_UNLOCK(bo);
2376	return (bp);
2377}
2378
2379/*
2380 * Returns true if no I/O is needed to access the
2381 * associated VM object.  This is like incore except
2382 * it also hunts around in the VM system for the data.
2383 */
2384
2385static int
2386inmem(struct vnode * vp, daddr_t blkno)
2387{
2388	vm_object_t obj;
2389	vm_offset_t toff, tinc, size;
2390	vm_page_t m;
2391	vm_ooffset_t off;
2392
2393	ASSERT_VOP_LOCKED(vp, "inmem");
2394
2395	if (incore(&vp->v_bufobj, blkno))
2396		return 1;
2397	if (vp->v_mount == NULL)
2398		return 0;
2399	obj = vp->v_object;
2400	if (obj == NULL)
2401		return (0);
2402
2403	size = PAGE_SIZE;
2404	if (size > vp->v_mount->mnt_stat.f_iosize)
2405		size = vp->v_mount->mnt_stat.f_iosize;
2406	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2407
2408	VM_OBJECT_LOCK(obj);
2409	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2410		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2411		if (!m)
2412			goto notinmem;
2413		tinc = size;
2414		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2415			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2416		if (vm_page_is_valid(m,
2417		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2418			goto notinmem;
2419	}
2420	VM_OBJECT_UNLOCK(obj);
2421	return 1;
2422
2423notinmem:
2424	VM_OBJECT_UNLOCK(obj);
2425	return (0);
2426}
2427
2428/*
2429 * Set the dirty range for a buffer based on the status of the dirty
2430 * bits in the pages comprising the buffer.  The range is limited
2431 * to the size of the buffer.
2432 *
2433 * Tell the VM system that the pages associated with this buffer
2434 * are clean.  This is used for delayed writes where the data is
2435 * going to go to disk eventually without additional VM intervention.
2436 *
2437 * Note that while we only really need to clean through to b_bcount, we
2438 * just go ahead and clean through to b_bufsize.
2439 */
2440static void
2441vfs_clean_pages_dirty_buf(struct buf *bp)
2442{
2443	vm_ooffset_t foff, noff, eoff;
2444	vm_page_t m;
2445	int i;
2446
2447	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
2448		return;
2449
2450	foff = bp->b_offset;
2451	KASSERT(bp->b_offset != NOOFFSET,
2452	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
2453
2454	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
2455	vfs_drain_busy_pages(bp);
2456	vfs_setdirty_locked_object(bp);
2457	for (i = 0; i < bp->b_npages; i++) {
2458		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2459		eoff = noff;
2460		if (eoff > bp->b_offset + bp->b_bufsize)
2461			eoff = bp->b_offset + bp->b_bufsize;
2462		m = bp->b_pages[i];
2463		vfs_page_set_validclean(bp, foff, m);
2464		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2465		foff = noff;
2466	}
2467	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
2468}
2469
2470static void
2471vfs_setdirty_locked_object(struct buf *bp)
2472{
2473	vm_object_t object;
2474	int i;
2475
2476	object = bp->b_bufobj->bo_object;
2477	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2478
2479	/*
2480	 * We qualify the scan for modified pages on whether the
2481	 * object has been flushed yet.
2482	 */
2483	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
2484		vm_offset_t boffset;
2485		vm_offset_t eoffset;
2486
2487		/*
2488		 * test the pages to see if they have been modified directly
2489		 * by users through the VM system.
2490		 */
2491		for (i = 0; i < bp->b_npages; i++)
2492			vm_page_test_dirty(bp->b_pages[i]);
2493
2494		/*
2495		 * Calculate the encompassing dirty range, boffset and eoffset,
2496		 * (eoffset - boffset) bytes.
2497		 */
2498
2499		for (i = 0; i < bp->b_npages; i++) {
2500			if (bp->b_pages[i]->dirty)
2501				break;
2502		}
2503		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2504
2505		for (i = bp->b_npages - 1; i >= 0; --i) {
2506			if (bp->b_pages[i]->dirty) {
2507				break;
2508			}
2509		}
2510		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2511
2512		/*
2513		 * Fit it to the buffer.
2514		 */
2515
2516		if (eoffset > bp->b_bcount)
2517			eoffset = bp->b_bcount;
2518
2519		/*
2520		 * If we have a good dirty range, merge with the existing
2521		 * dirty range.
2522		 */
2523
2524		if (boffset < eoffset) {
2525			if (bp->b_dirtyoff > boffset)
2526				bp->b_dirtyoff = boffset;
2527			if (bp->b_dirtyend < eoffset)
2528				bp->b_dirtyend = eoffset;
2529		}
2530	}
2531}
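
/*
 * Worked example of the range computation above (assuming a 4096-byte
 * PAGE_SIZE and a page-aligned b_offset): if only pages 1 and 2 of a
 * four-page buffer test dirty, the first loop stops at i == 1, giving
 * boffset == 4096, and the second stops at i == 2, giving
 * eoffset == 12288.  The resulting 8192-byte range is clipped to
 * b_bcount and then merged into b_dirtyoff/b_dirtyend.
 */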
2532
2533/*
2534 *	getblk:
2535 *
2536 *	Get a block given a specified block and offset into a file/device.
2537 *	The buffers B_DONE bit will be cleared on return, making it almost
2538 *	The buffer's B_DONE bit will be cleared on return, making it almost
2539 *	return.  The caller should clear B_INVAL prior to initiating a
2540 *	READ.
2541 *
2542 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2543 *	an existing buffer.
2544 *
2545 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
2546 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
2547 *	and then cleared based on the backing VM.  If the previous buffer is
2548 *	non-0-sized but invalid, B_CACHE will be cleared.
2549 *
2550 *	If getblk() must create a new buffer, the new buffer is returned with
2551 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
2552 *	case it is returned with B_INVAL clear and B_CACHE set based on the
2553 *	backing VM.
2554 *
2555 *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
2556 *	B_CACHE bit is clear.
2557 *
2558 *	What this means, basically, is that the caller should use B_CACHE to
2559 *	determine whether the buffer is fully valid or not and should clear
2560 *	B_INVAL prior to issuing a read.  If the caller intends to validate
2561 *	the buffer by loading its data area with something, the caller needs
2562 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
2563 *	the caller should set B_CACHE ( as an optimization ), else the caller
2564 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
2565 *	a write attempt or if it was a successfull read.  If the caller
2566 *	a write attempt or if it was a successful read.  If the caller
2567 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
2568 */
2569struct buf *
2570getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo,
2571    int flags)
2572{
2573	struct buf *bp;
2574	struct bufobj *bo;
2575	int error;
2576
2577	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
2578	ASSERT_VOP_LOCKED(vp, "getblk");
2579	if (size > MAXBSIZE)
2580		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
2581
2582	bo = &vp->v_bufobj;
2583loop:
2584	/*
2585	 * Block if we are low on buffers.  Certain processes are allowed
2586	 * to completely exhaust the buffer cache.
2587	 *
2588	 * If this check ever becomes a bottleneck it may be better to
2589	 * move it into the else, when gbincore() fails.  At the moment
2590	 * it isn't a problem.
2591	 *
2592	 * XXX remove if 0 sections (clean this up after it's proven)
2593	 */
2594	if (numfreebuffers == 0) {
2595		if (TD_IS_IDLETHREAD(curthread))
2596			return NULL;
2597		mtx_lock(&nblock);
2598		needsbuffer |= VFS_BIO_NEED_ANY;
2599		mtx_unlock(&nblock);
2600	}
2601
2602	BO_LOCK(bo);
2603	bp = gbincore(bo, blkno);
2604	if (bp != NULL) {
2605		int lockflags;
2606		/*
2607		 * Buffer is in-core.  If the buffer is not busy, it must
2608		 * be on a queue.
2609		 */
2610		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
2611
2612		if (flags & GB_LOCK_NOWAIT)
2613			lockflags |= LK_NOWAIT;
2614
2615		error = BUF_TIMELOCK(bp, lockflags,
2616		    BO_MTX(bo), "getblk", slpflag, slptimeo);
2617
2618		/*
2619		 * If we slept and got the lock we have to restart in case
2620		 * the buffer changed identities.
2621		 */
2622		if (error == ENOLCK)
2623			goto loop;
2624		/* We timed out or were interrupted. */
2625		else if (error)
2626			return (NULL);
2627
2628		/*
2629		 * The buffer is locked.  B_CACHE is cleared if the buffer is
2630		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
2631		 * and for a VMIO buffer B_CACHE is adjusted according to the
2632		 * backing VM cache.
2633		 */
2634		if (bp->b_flags & B_INVAL)
2635			bp->b_flags &= ~B_CACHE;
2636		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
2637			bp->b_flags |= B_CACHE;
2638		bremfree(bp);
2639
2640		/*
2641		 * check for size inconsistencies for the non-VMIO case.
2642		 */
2643
2644		if (bp->b_bcount != size) {
2645			if ((bp->b_flags & B_VMIO) == 0 ||
2646			    (size > bp->b_kvasize)) {
2647				if (bp->b_flags & B_DELWRI) {
2648					/*
2649					 * If the buffer is pinned and the
2650					 * caller does not want to sleep waiting
2651					 * for it to be unpinned, bail out.
2652					 */
2653					if (bp->b_pin_count > 0) {
2654						if (flags & GB_LOCK_NOWAIT) {
2655							bqrelse(bp);
2656							return (NULL);
2657						} else {
2658							bunpin_wait(bp);
2659						}
2660					}
2661					bp->b_flags |= B_NOCACHE;
2662					bwrite(bp);
2663				} else {
2664					if (LIST_EMPTY(&bp->b_dep)) {
2665						bp->b_flags |= B_RELBUF;
2666						brelse(bp);
2667					} else {
2668						bp->b_flags |= B_NOCACHE;
2669						bwrite(bp);
2670					}
2671				}
2672				goto loop;
2673			}
2674		}
2675
2676		/*
2677		 * If the size is inconsistent in the VMIO case, we can resize
2678		 * the buffer.  This might lead to B_CACHE getting set or
2679		 * cleared.  If the size has not changed, B_CACHE remains
2680		 * unchanged from its previous state.
2681		 */
2682
2683		if (bp->b_bcount != size)
2684			allocbuf(bp, size);
2685
2686		KASSERT(bp->b_offset != NOOFFSET,
2687		    ("getblk: no buffer offset"));
2688
2689		/*
2690		 * A buffer with B_DELWRI set and B_CACHE clear must
2691		 * be committed before we can return the buffer in
2692		 * order to prevent the caller from issuing a read
2693		 * ( due to B_CACHE not being set ) and overwriting
2694		 * it.
2695		 *
2696		 * Most callers, including NFS and FFS, need this to
2697		 * operate properly either because they assume they
2698		 * can issue a read if B_CACHE is not set, or because
2699		 * ( for example ) an uncached B_DELWRI might loop due
2700		 * to softupdates re-dirtying the buffer.  In the latter
2701		 * case, B_CACHE is set after the first write completes,
2702		 * preventing further loops.
2703		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
2704		 * above while extending the buffer, we cannot allow the
2705		 * buffer to remain with B_CACHE set after the write
2706		 * completes or it will represent a corrupt state.  To
2707		 * deal with this we set B_NOCACHE to scrap the buffer
2708		 * after the write.
2709		 *
2710		 * We might be able to do something fancy, like setting
2711		 * B_CACHE in bwrite() except if B_DELWRI is already set,
2712		 * so the below call doesn't set B_CACHE, but that gets real
2713		 * confusing.  This is much easier.
2714		 */
2715
2716		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
2717			bp->b_flags |= B_NOCACHE;
2718			bwrite(bp);
2719			goto loop;
2720		}
2721		bp->b_flags &= ~B_DONE;
2722	} else {
2723		int bsize, maxsize, vmio;
2724		off_t offset;
2725
2726		/*
2727		 * Buffer is not in-core, create new buffer.  The buffer
2728		 * returned by getnewbuf() is locked.  Note that the returned
2729		 * buffer is also considered valid (not marked B_INVAL).
2730		 */
2731		BO_UNLOCK(bo);
2732		/*
2733		 * If the user does not want us to create the buffer, bail out
2734		 * here.
2735		 */
2736		if (flags & GB_NOCREAT)
2737			return NULL;
2738		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
2739		offset = blkno * bsize;
2740		vmio = vp->v_object != NULL;
2741		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
2742		maxsize = imax(maxsize, bsize);
2743
2744		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
2745		if (bp == NULL) {
2746			if (slpflag || slptimeo)
2747				return NULL;
2748			goto loop;
2749		}
2750
2751		/*
2752		 * This code is used to make sure that a buffer is not
2753		 * created while the getnewbuf routine is blocked.
2754		 * This can be a problem whether the vnode is locked or not.
2755		 * If the buffer is created out from under us, we have to
2756		 * throw away the one we just created.
2757		 *
2758		 * Note: this must occur before we associate the buffer
2759		 * with the vp especially considering limitations in
2760		 * the splay tree implementation when dealing with duplicate
2761		 * lblkno's.
2762		 */
2763		BO_LOCK(bo);
2764		if (gbincore(bo, blkno)) {
2765			BO_UNLOCK(bo);
2766			bp->b_flags |= B_INVAL;
2767			brelse(bp);
2768			goto loop;
2769		}
2770
2771		/*
2772		 * Insert the buffer into the hash, so that it can
2773		 * be found by incore.
2774		 */
2775		bp->b_blkno = bp->b_lblkno = blkno;
2776		bp->b_offset = offset;
2777		bgetvp(vp, bp);
2778		BO_UNLOCK(bo);
2779
2780		/*
2781		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
2782		 * buffer size starts out as 0, B_CACHE will be set by
2783		 * allocbuf() for the VMIO case prior to it testing the
2784		 * backing store for validity.
2785		 */
2786
2787		if (vmio) {
2788			bp->b_flags |= B_VMIO;
2789#if defined(VFS_BIO_DEBUG)
2790			if (vn_canvmio(vp) != TRUE)
2791				printf("getblk: VMIO on vnode type %d\n",
2792					vp->v_type);
2793#endif
2794			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
2795			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
2796			    bp, vp->v_object, bp->b_bufobj->bo_object));
2797		} else {
2798			bp->b_flags &= ~B_VMIO;
2799			KASSERT(bp->b_bufobj->bo_object == NULL,
2800			    ("ARGH! has b_bufobj->bo_object %p %p\n",
2801			    bp, bp->b_bufobj->bo_object));
2802		}
2803
2804		allocbuf(bp, size);
2805		bp->b_flags &= ~B_DONE;
2806	}
2807	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
2808	BUF_ASSERT_HELD(bp);
2809	KASSERT(bp->b_bufobj == bo,
2810	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2811	return (bp);
2812}
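
/*
 * Illustrative sketch of the calling convention described in the
 * comment above getblk() (error handling and the filesystem-specific
 * pieces are assumed): callers only issue a read when B_CACHE is clear.
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *	}
 */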
2813
2814/*
2815 * Get an empty, disassociated buffer of given size.  The buffer is initially
2816 * set to B_INVAL.
2817 */
2818struct buf *
2819geteblk(int size, int flags)
2820{
2821	struct buf *bp;
2822	int maxsize;
2823
2824	maxsize = (size + BKVAMASK) & ~BKVAMASK;
2825	while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
2826		if ((flags & GB_NOWAIT_BD) &&
2827		    (curthread->td_pflags & TDP_BUFNEED) != 0)
2828			return (NULL);
2829	}
2830	allocbuf(bp, size);
2831	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
2832	BUF_ASSERT_HELD(bp);
2833	return (bp);
2834}
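
/*
 * Minimal usage sketch (the MAXBSIZE size is just an example): a
 * transient scratch buffer is obtained, used and released.  Because
 * geteblk() marks the buffer B_INVAL, brelse() will not cache it.
 *
 *	bp = geteblk(MAXBSIZE, 0);
 *	...fill and use bp->b_data...
 *	brelse(bp);
 */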
2835
2836
2837/*
2838 * This code constitutes the buffer memory from either anonymous system
2839 * memory (in the case of non-VMIO operations) or from an associated
2840 * VM object (in the case of VMIO operations).  This code is able to
2841 * resize a buffer up or down.
2842 *
2843 * Note that this code is tricky, and has many complications to resolve
2844 * deadlock or inconsistant data situations.  Tread lightly!!!
2845 * deadlock or inconsistent data situations.  Tread lightly!!!
2846 * the caller.  Calling this code willy nilly can result in the loss of data.
2847 *
2848 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
2849 * B_CACHE for the non-VMIO case.
2850 */
2851
2852int
2853allocbuf(struct buf *bp, int size)
2854{
2855	int newbsize, mbsize;
2856	int i;
2857
2858	BUF_ASSERT_HELD(bp);
2859
2860	if (bp->b_kvasize < size)
2861		panic("allocbuf: buffer too small");
2862
2863	if ((bp->b_flags & B_VMIO) == 0) {
2864		caddr_t origbuf;
2865		int origbufsize;
2866		/*
2867		 * Just get anonymous memory from the kernel.  Don't
2868		 * mess with B_CACHE.
2869		 */
2870		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2871		if (bp->b_flags & B_MALLOC)
2872			newbsize = mbsize;
2873		else
2874			newbsize = round_page(size);
2875
2876		if (newbsize < bp->b_bufsize) {
2877			/*
2878			 * malloced buffers are not shrunk
2879			 */
2880			if (bp->b_flags & B_MALLOC) {
2881				if (newbsize) {
2882					bp->b_bcount = size;
2883				} else {
2884					free(bp->b_data, M_BIOBUF);
2885					if (bp->b_bufsize) {
2886						atomic_subtract_long(
2887						    &bufmallocspace,
2888						    bp->b_bufsize);
2889						bufspacewakeup();
2890						bp->b_bufsize = 0;
2891					}
2892					bp->b_saveaddr = bp->b_kvabase;
2893					bp->b_data = bp->b_saveaddr;
2894					bp->b_bcount = 0;
2895					bp->b_flags &= ~B_MALLOC;
2896				}
2897				return 1;
2898			}
2899			vm_hold_free_pages(bp, newbsize);
2900		} else if (newbsize > bp->b_bufsize) {
2901			/*
2902			 * We only use malloced memory on the first allocation,
2903			 * and revert to page-allocated memory when the buffer
2904			 * grows.
2905			 */
2906			/*
2907			 * There is a potential SMP race here that could lead
2908			 * to bufmallocspace slightly exceeding the max.  It
2909			 * is probably extremely rare and not worth worrying
2910			 * over.
2911			 */
2912			if ( (bufmallocspace < maxbufmallocspace) &&
2913				(bp->b_bufsize == 0) &&
2914				(mbsize <= PAGE_SIZE/2)) {
2915
2916				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
2917				bp->b_bufsize = mbsize;
2918				bp->b_bcount = size;
2919				bp->b_flags |= B_MALLOC;
2920				atomic_add_long(&bufmallocspace, mbsize);
2921				return 1;
2922			}
2923			origbuf = NULL;
2924			origbufsize = 0;
2925			/*
2926			 * If the buffer is growing on its other-than-first allocation,
2927			 * then we revert to the page-allocation scheme.
2928			 */
2929			if (bp->b_flags & B_MALLOC) {
2930				origbuf = bp->b_data;
2931				origbufsize = bp->b_bufsize;
2932				bp->b_data = bp->b_kvabase;
2933				if (bp->b_bufsize) {
2934					atomic_subtract_long(&bufmallocspace,
2935					    bp->b_bufsize);
2936					bufspacewakeup();
2937					bp->b_bufsize = 0;
2938				}
2939				bp->b_flags &= ~B_MALLOC;
2940				newbsize = round_page(newbsize);
2941			}
2942			vm_hold_load_pages(
2943			    bp,
2944			    (vm_offset_t) bp->b_data + bp->b_bufsize,
2945			    (vm_offset_t) bp->b_data + newbsize);
2946			if (origbuf) {
2947				bcopy(origbuf, bp->b_data, origbufsize);
2948				free(origbuf, M_BIOBUF);
2949			}
2950		}
2951	} else {
2952		int desiredpages;
2953
2954		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2955		desiredpages = (size == 0) ? 0 :
2956			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
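		/*
		 * Sizing example (assuming DEV_BSIZE 512 and PAGE_SIZE 4096):
		 * a request of size 6000 for a buffer whose b_offset is 1024
		 * bytes into its first page gives newbsize == 6144 and
		 * desiredpages == num_pages(1024 + 6144) == 2, since the
		 * byte range spans parts of two pages.
		 */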
2957
2958		if (bp->b_flags & B_MALLOC)
2959			panic("allocbuf: VMIO buffer can't be malloced");
2960		/*
2961		 * Set B_CACHE initially if buffer is 0 length or will become
2962		 * 0-length.
2963		 */
2964		if (size == 0 || bp->b_bufsize == 0)
2965			bp->b_flags |= B_CACHE;
2966
2967		if (newbsize < bp->b_bufsize) {
2968			/*
2969			 * DEV_BSIZE aligned new buffer size is less than the
2970			 * DEV_BSIZE aligned existing buffer size.  Figure out
2971			 * if we have to remove any pages.
2972			 */
2973			if (desiredpages < bp->b_npages) {
2974				vm_page_t m;
2975
2976				VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
2977				for (i = desiredpages; i < bp->b_npages; i++) {
2978					/*
2979					 * the page is not freed here -- it
2980					 * is the responsibility of
2981					 * vnode_pager_setsize
2982					 */
2983					m = bp->b_pages[i];
2984					KASSERT(m != bogus_page,
2985					    ("allocbuf: bogus page found"));
2986					while (vm_page_sleep_if_busy(m, TRUE,
2987					    "biodep"))
2988						continue;
2989
2990					bp->b_pages[i] = NULL;
2991					vm_page_lock(m);
2992					vm_page_unwire(m, 0);
2993					vm_page_unlock(m);
2994				}
2995				VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
2996				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
2997				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
2998				bp->b_npages = desiredpages;
2999			}
3000		} else if (size > bp->b_bcount) {
3001			/*
3002			 * We are growing the buffer, possibly in a
3003			 * byte-granular fashion.
3004			 */
3005			vm_object_t obj;
3006			vm_offset_t toff;
3007			vm_offset_t tinc;
3008
3009			/*
3010			 * Step 1, bring in the VM pages from the object,
3011			 * allocating them if necessary.  We must clear
3012			 * B_CACHE if these pages are not valid for the
3013			 * range covered by the buffer.
3014			 */
3015
3016			obj = bp->b_bufobj->bo_object;
3017
3018			VM_OBJECT_LOCK(obj);
3019			while (bp->b_npages < desiredpages) {
3020				vm_page_t m;
3021
3022				/*
3023				 * We must allocate system pages since blocking
3024				 * here could interfere with paging I/O, no
3025				 * matter which process we are.
3026				 *
3027				 * We can only test VPO_BUSY here.  Blocking on
3028				 * m->busy might lead to a deadlock:
3029				 *  vm_fault->getpages->cluster_read->allocbuf
3030				 * Thus, we specify VM_ALLOC_IGN_SBUSY.
3031				 */
3032				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
3033				    bp->b_npages, VM_ALLOC_NOBUSY |
3034				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3035				    VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY |
3036				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
3037				if (m->valid == 0)
3038					bp->b_flags &= ~B_CACHE;
3039				bp->b_pages[bp->b_npages] = m;
3040				++bp->b_npages;
3041			}
3042
3043			/*
3044			 * Step 2.  We've loaded the pages into the buffer,
3045			 * we have to figure out if we can still have B_CACHE
3046			 * set.  Note that B_CACHE is set according to the
3047			 * byte-granular range ( bcount and size ), not the
3048			 * aligned range ( newbsize ).
3049			 *
3050			 * The VM test is against m->valid, which is DEV_BSIZE
3051			 * aligned.  Needless to say, the validity of the data
3052			 * needs to also be DEV_BSIZE aligned.  Note that this
3053			 * fails with NFS if the server or some other client
3054			 * extends the file's EOF.  If our buffer is resized,
3055			 * B_CACHE may remain set! XXX
3056			 */
3057
3058			toff = bp->b_bcount;
3059			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
3060
3061			while ((bp->b_flags & B_CACHE) && toff < size) {
3062				vm_pindex_t pi;
3063
3064				if (tinc > (size - toff))
3065					tinc = size - toff;
3066
3067				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
3068				    PAGE_SHIFT;
3069
3070				vfs_buf_test_cache(
3071				    bp,
3072				    bp->b_offset,
3073				    toff,
3074				    tinc,
3075				    bp->b_pages[pi]
3076				);
3077				toff += tinc;
3078				tinc = PAGE_SIZE;
3079			}
3080			VM_OBJECT_UNLOCK(obj);
3081
3082			/*
3083			 * Step 3, fixup the KVM pmap.  Remember that
3084			 * bp->b_data is relative to bp->b_offset, but
3085			 * bp->b_offset may be offset into the first page.
3086			 */
3087
3088			bp->b_data = (caddr_t)
3089			    trunc_page((vm_offset_t)bp->b_data);
3090			pmap_qenter(
3091			    (vm_offset_t)bp->b_data,
3092			    bp->b_pages,
3093			    bp->b_npages
3094			);
3095
3096			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
3097			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
3098		}
3099	}
3100	if (newbsize < bp->b_bufsize)
3101		bufspacewakeup();
3102	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
3103	bp->b_bcount = size;		/* requested buffer size	*/
3104	return 1;
3105}
3106
3107void
3108biodone(struct bio *bp)
3109{
3110	struct mtx *mtxp;
3111	void (*done)(struct bio *);
3112
3113	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3114	mtx_lock(mtxp);
3115	bp->bio_flags |= BIO_DONE;
3116	done = bp->bio_done;
3117	if (done == NULL)
3118		wakeup(bp);
3119	mtx_unlock(mtxp);
3120	if (done != NULL)
3121		done(bp);
3122}
3123
3124/*
3125 * Wait for a BIO to finish.
3126 *
3127 * XXX: resort to a timeout for now.  The optimal locking (if any) for this
3128 * case is not yet clear.
3129 */
3130int
3131biowait(struct bio *bp, const char *wchan)
3132{
3133	struct mtx *mtxp;
3134
3135	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3136	mtx_lock(mtxp);
3137	while ((bp->bio_flags & BIO_DONE) == 0)
3138		msleep(bp, mtxp, PRIBIO, wchan, hz / 10);
3139	mtx_unlock(mtxp);
3140	if (bp->bio_error != 0)
3141		return (bp->bio_error);
3142	if (!(bp->bio_flags & BIO_ERROR))
3143		return (0);
3144	return (EIO);
3145}
3146
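/*
 * Sketch of the synchronous completion handshake implemented by
 * biodone() and biowait() (the bio setup and the provider strategy
 * routine are assumed; dev_strategy() below shows the asynchronous,
 * callback-based variant):
 *
 *	bip->bio_done = NULL;
 *	(*csw->d_strategy)(bip);
 *	error = biowait(bip, "mybio");
 *
 * When bio_done is non-NULL, biodone() invokes that callback instead of
 * waking a sleeper.
 */
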
3147void
3148biofinish(struct bio *bp, struct devstat *stat, int error)
3149{
3150
3151	if (error) {
3152		bp->bio_error = error;
3153		bp->bio_flags |= BIO_ERROR;
3154	}
3155	if (stat != NULL)
3156		devstat_end_transaction_bio(stat, bp);
3157	biodone(bp);
3158}
3159
3160/*
3161 *	bufwait:
3162 *
3163 *	Wait for buffer I/O completion, returning error status.  The buffer
3164 *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3165 *	error and cleared.
3166 */
3167int
3168bufwait(struct buf *bp)
3169{
3170	if (bp->b_iocmd == BIO_READ)
3171		bwait(bp, PRIBIO, "biord");
3172	else
3173		bwait(bp, PRIBIO, "biowr");
3174	if (bp->b_flags & B_EINTR) {
3175		bp->b_flags &= ~B_EINTR;
3176		return (EINTR);
3177	}
3178	if (bp->b_ioflags & BIO_ERROR) {
3179		return (bp->b_error ? bp->b_error : EIO);
3180	} else {
3181		return (0);
3182	}
3183}
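
/*
 * Illustrative sketch of the classic synchronous read pattern that ends
 * in bufwait(); it mirrors what bread() does and is not taken from this
 * file.  It assumes the usual getblk()/bstrategy() interfaces from
 * sys/buf.h; "myvp", "blkno" and "size" are hypothetical.
 *
 *	struct buf *bp;
 *	int error = 0;
 *
 *	bp = getblk(myvp, blkno, size, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *		if (error != 0)
 *			brelse(bp);
 *	}
 */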
3184
3185/*
3186 * Call back function from struct bio back up to struct buf.
3187 */
3188static void
3189bufdonebio(struct bio *bip)
3190{
3191	struct buf *bp;
3192
3193	bp = bip->bio_caller2;
3194	bp->b_resid = bp->b_bcount - bip->bio_completed;
3195	bp->b_resid = bip->bio_resid;	/* XXX: remove */
3196	bp->b_ioflags = bip->bio_flags;
3197	bp->b_error = bip->bio_error;
3198	if (bp->b_error)
3199		bp->b_ioflags |= BIO_ERROR;
3200	bufdone(bp);
3201	g_destroy_bio(bip);
3202}
3203
3204void
3205dev_strategy(struct cdev *dev, struct buf *bp)
3206{
3207	struct cdevsw *csw;
3208	struct bio *bip;
3209	int ref;
3210
3211	if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1)))
3212		panic("b_iocmd botch");
3213	for (;;) {
3214		bip = g_new_bio();
3215		if (bip != NULL)
3216			break;
3217		/* Try again later */
3218		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
3219	}
3220	bip->bio_cmd = bp->b_iocmd;
3221	bip->bio_offset = bp->b_iooffset;
3222	bip->bio_length = bp->b_bcount;
3223	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
3224	bip->bio_data = bp->b_data;
3225	bip->bio_done = bufdonebio;
3226	bip->bio_caller2 = bp;
3227	bip->bio_dev = dev;
3228	KASSERT(dev->si_refcount > 0,
3229	    ("dev_strategy on un-referenced struct cdev *(%s)",
3230	    devtoname(dev)));
3231	csw = dev_refthread(dev, &ref);
3232	if (csw == NULL) {
3233		g_destroy_bio(bip);
3234		bp->b_error = ENXIO;
3235		bp->b_ioflags = BIO_ERROR;
3236		bufdone(bp);
3237		return;
3238	}
3239	(*csw->d_strategy)(bip);
3240	dev_relthread(dev, ref);
3241}
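
/*
 * Illustrative sketch (hypothetical caller): dev_strategy() consumes only
 * b_iocmd, b_iooffset, b_bcount and b_data, the fields copied into the
 * struct bio above, and completion returns through bufdonebio() and
 * bufdone().  "mydev" is a hypothetical struct cdev pointer.
 *
 *	bp->b_iocmd = BIO_WRITE;
 *	bp->b_iooffset = dbtob(bp->b_blkno);
 *	bp->b_iodone = NULL;		(let bufdone() finish the buffer)
 *	dev_strategy(mydev, bp);
 *	error = bufwait(bp);
 */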
3242
3243/*
3244 *	bufdone:
3245 *
3246 *	Finish I/O on a buffer, optionally calling a completion function.
3247 *	This is usually called from an interrupt so process blocking is
3248 *	not allowed.
3249 *
3250 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
3251 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
3252 *	assuming B_INVAL is clear.
3253 *
3254 *	For the VMIO case, we set B_CACHE if the op was a read and no
3255 *	read error occurred, or if the op was a write.  B_CACHE is never
3256 *	set if the buffer is invalid or otherwise uncacheable.
3257 *
3258 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
3259 *	initiator to leave B_INVAL set to brelse the buffer out of existence
3260 *	in the biodone routine.
3261 */
3262void
3263bufdone(struct buf *bp)
3264{
3265	struct bufobj *dropobj;
3266	void    (*biodone)(struct buf *);
3267
3268	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3269	dropobj = NULL;
3270
3271	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3272	BUF_ASSERT_HELD(bp);
3273
3274	runningbufwakeup(bp);
3275	if (bp->b_iocmd == BIO_WRITE)
3276		dropobj = bp->b_bufobj;
3277	/* call optional completion function if requested */
3278	if (bp->b_iodone != NULL) {
3279		biodone = bp->b_iodone;
3280		bp->b_iodone = NULL;
3281		(*biodone) (bp);
3282		if (dropobj)
3283			bufobj_wdrop(dropobj);
3284		return;
3285	}
3286
3287	bufdone_finish(bp);
3288
3289	if (dropobj)
3290		bufobj_wdrop(dropobj);
3291}
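
/*
 * Illustrative sketch (hypothetical handler): setting b_iodone before the
 * I/O is started makes bufdone() divert completion to a private routine.
 * Because bufdone() clears b_iodone before the call, the handler may do
 * its bookkeeping and then hand the buffer back to bufdone(), which now
 * falls through to bufdone_finish().
 *
 *	static void
 *	my_iodone(struct buf *bp)
 *	{
 *		(private bookkeeping goes here)
 *		bufdone(bp);
 *	}
 *
 *	bp->b_iodone = my_iodone;
 *	bp->b_flags |= B_ASYNC;
 *	bstrategy(bp);
 */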
3292
3293void
3294bufdone_finish(struct buf *bp)
3295{
3296	BUF_ASSERT_HELD(bp);
3297
3298	if (!LIST_EMPTY(&bp->b_dep))
3299		buf_complete(bp);
3300
3301	if (bp->b_flags & B_VMIO) {
3302		int i;
3303		vm_ooffset_t foff;
3304		vm_page_t m;
3305		vm_object_t obj;
3306		int bogus, iosize;
3307		struct vnode *vp = bp->b_vp;
3308
3309		obj = bp->b_bufobj->bo_object;
3310
3311#if defined(VFS_BIO_DEBUG)
3312		mp_fixme("usecount and vflag accessed without locks.");
3313		if (vp->v_usecount == 0) {
3314			panic("biodone: zero vnode ref count");
3315		}
3316
3317		KASSERT(vp->v_object != NULL,
3318			("biodone: vnode %p has no vm_object", vp));
3319#endif
3320
3321		foff = bp->b_offset;
3322		KASSERT(bp->b_offset != NOOFFSET,
3323		    ("biodone: no buffer offset"));
3324
3325		VM_OBJECT_LOCK(obj);
3326#if defined(VFS_BIO_DEBUG)
3327		if (obj->paging_in_progress < bp->b_npages) {
3328			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
3329			    obj->paging_in_progress, bp->b_npages);
3330		}
3331#endif
3332
3333		/*
3334		 * Set B_CACHE if the op was a normal read and no error
3335		 * occurred.  B_CACHE is set for writes in the b*write()
3336		 * routines.
3337		 */
3338		iosize = bp->b_bcount - bp->b_resid;
3339		if (bp->b_iocmd == BIO_READ &&
3340		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
3341		    !(bp->b_ioflags & BIO_ERROR)) {
3342			bp->b_flags |= B_CACHE;
3343		}
3344		bogus = 0;
3345		for (i = 0; i < bp->b_npages; i++) {
3346			int bogusflag = 0;
3347			int resid;
3348
3349			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3350			if (resid > iosize)
3351				resid = iosize;
3352
3353			/*
3354			 * cleanup bogus pages, restoring the originals
3355			 */
3356			m = bp->b_pages[i];
3357			if (m == bogus_page) {
3358				bogus = bogusflag = 1;
3359				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3360				if (m == NULL)
3361					panic("biodone: page disappeared!");
3362				bp->b_pages[i] = m;
3363			}
3364#if defined(VFS_BIO_DEBUG)
3365			if (OFF_TO_IDX(foff) != m->pindex) {
3366				printf(
3367"biodone: foff(%jd)/m->pindex(%ju) mismatch\n",
3368				    (intmax_t)foff, (uintmax_t)m->pindex);
3369			}
3370#endif
3371
3372			/*
3373			 * In the write case, the valid and clean bits are
3374			 * already changed correctly ( see bdwrite() ), so we
3375			 * only need to do this here in the read case.
3376			 */
3377			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
3378				KASSERT((m->dirty & vm_page_bits(foff &
3379				    PAGE_MASK, resid)) == 0, ("bufdone_finish:"
3380				    " page %p has unexpected dirty bits", m));
3381				vfs_page_set_valid(bp, foff, m);
3382			}
3383
3384			/*
3385			 * When debugging new filesystems or buffer I/O methods, this
3386			 * is the most common error that pops up.  If you see this, you
3387			 * have not set the page busy flag correctly.
3388			 */
3389			if (m->busy == 0) {
3390				printf("biodone: page busy < 0, "
3391				    "pindex: %d, foff: 0x(%x,%x), "
3392				    "resid: %d, index: %d\n",
3393				    (int) m->pindex, (int)(foff >> 32),
3394				    (int)(foff & 0xffffffff), resid, i);
3395				if (!vn_isdisk(vp, NULL))
3396					printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n",
3397					    (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize,
3398					    (intmax_t) bp->b_lblkno,
3399					    bp->b_flags, bp->b_npages);
3400				else
3401					printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n",
3402					    (intmax_t) bp->b_lblkno,
3403					    bp->b_flags, bp->b_npages);
3404				printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n",
3405				    (u_long)m->valid, (u_long)m->dirty,
3406				    m->wire_count);
3407				panic("biodone: page busy < 0\n");
3408			}
3409			vm_page_io_finish(m);
3410			vm_object_pip_subtract(obj, 1);
3411			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3412			iosize -= resid;
3413		}
3414		vm_object_pip_wakeupn(obj, 0);
3415		VM_OBJECT_UNLOCK(obj);
3416		if (bogus)
3417			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3418			    bp->b_pages, bp->b_npages);
3419	}
3420
3421	/*
3422	 * For asynchronous completions, release the buffer now. The brelse
3423	 * will do a wakeup there if necessary - so no need to do a wakeup
3424	 * here in the async case. The sync case always needs to do a wakeup.
3425	 */
3426
3427	if (bp->b_flags & B_ASYNC) {
3428		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
3429			brelse(bp);
3430		else
3431			bqrelse(bp);
3432	} else
3433		bdone(bp);
3434}
3435
3436/*
3437 * This routine is called in lieu of iodone in the case of
3438 * incomplete I/O.  This keeps the busy status for pages
3439 * consistent.
3440 */
3441void
3442vfs_unbusy_pages(struct buf *bp)
3443{
3444	int i;
3445	vm_object_t obj;
3446	vm_page_t m;
3447
3448	runningbufwakeup(bp);
3449	if (!(bp->b_flags & B_VMIO))
3450		return;
3451
3452	obj = bp->b_bufobj->bo_object;
3453	VM_OBJECT_LOCK(obj);
3454	for (i = 0; i < bp->b_npages; i++) {
3455		m = bp->b_pages[i];
3456		if (m == bogus_page) {
3457			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3458			if (!m)
3459				panic("vfs_unbusy_pages: page missing\n");
3460			bp->b_pages[i] = m;
3461			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3462			    bp->b_pages, bp->b_npages);
3463		}
3464		vm_object_pip_subtract(obj, 1);
3465		vm_page_io_finish(m);
3466	}
3467	vm_object_pip_wakeupn(obj, 0);
3468	VM_OBJECT_UNLOCK(obj);
3469}
3470
3471/*
3472 * vfs_page_set_valid:
3473 *
3474 *	Set the valid bits in a page based on the supplied offset.   The
3475 *	range is restricted to the buffer's size.
3476 *
3477 *	This routine is typically called after a read completes.
3478 */
3479static void
3480vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3481{
3482	vm_ooffset_t eoff;
3483
3484	/*
3485	 * Compute the end offset, eoff, such that [off, eoff) does not span a
3486	 * page boundary and eoff is not greater than the end of the buffer.
3487	 * The end of the buffer, in this case, is our file EOF, not the
3488	 * allocation size of the buffer.
3489	 */
3490	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
3491	if (eoff > bp->b_offset + bp->b_bcount)
3492		eoff = bp->b_offset + bp->b_bcount;
3493
3494	/*
3495	 * Set valid range.  This is typically the entire buffer and thus the
3496	 * entire page.
3497	 */
3498	if (eoff > off)
3499		vm_page_set_valid(m, off & PAGE_MASK, eoff - off);
3500}
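
/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096): for a buffer
 * with b_offset == 0x3200 and b_bcount == 0xc00, a call with off == 0x3200
 * first computes eoff == 0x4000, which is then clipped to
 * b_offset + b_bcount == 0x3e00; the page therefore has the byte range
 * [0x200, 0xe00) marked valid by vm_page_set_valid().
 */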
3501
3502/*
3503 * vfs_page_set_validclean:
3504 *
3505 *	Set the valid bits and clear the dirty bits in a page based on the
3506 *	supplied offset.   The range is restricted to the buffer's size.
3507 */
3508static void
3509vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3510{
3511	vm_ooffset_t soff, eoff;
3512
3513	/*
3514	 * Start and end offsets in buffer.  eoff - soff may not cross a
3515	 * page boundary or cross the end of the buffer.  The end of the
3516	 * buffer, in this case, is our file EOF, not the allocation size
3517	 * of the buffer.
3518	 */
3519	soff = off;
3520	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3521	if (eoff > bp->b_offset + bp->b_bcount)
3522		eoff = bp->b_offset + bp->b_bcount;
3523
3524	/*
3525	 * Set valid range.  This is typically the entire buffer and thus the
3526	 * entire page.
3527	 */
3528	if (eoff > soff) {
3529		vm_page_set_validclean(
3530		    m,
3531		   (vm_offset_t) (soff & PAGE_MASK),
3532		   (vm_offset_t) (eoff - soff)
3533		);
3534	}
3535}
3536
3537/*
3538 * Ensure that all buffer pages are not busied by VPO_BUSY flag. If
3539 * any page is busy, drain the flag.
3540 */
3541static void
3542vfs_drain_busy_pages(struct buf *bp)
3543{
3544	vm_page_t m;
3545	int i, last_busied;
3546
3547	VM_OBJECT_LOCK_ASSERT(bp->b_bufobj->bo_object, MA_OWNED);
3548	last_busied = 0;
3549	for (i = 0; i < bp->b_npages; i++) {
3550		m = bp->b_pages[i];
3551		if ((m->oflags & VPO_BUSY) != 0) {
3552			for (; last_busied < i; last_busied++)
3553				vm_page_busy(bp->b_pages[last_busied]);
3554			while ((m->oflags & VPO_BUSY) != 0)
3555				vm_page_sleep(m, "vbpage");
3556		}
3557	}
3558	for (i = 0; i < last_busied; i++)
3559		vm_page_wakeup(bp->b_pages[i]);
3560}
3561
3562/*
3563 * This routine is called before a device strategy routine.
3564 * It is used to tell the VM system that paging I/O is in
3565 * progress, and treat the pages associated with the buffer
3566 * almost as being VPO_BUSY.  Also the object paging_in_progress
3567 * flag is handled to make sure that the object doesn't become
3568 * inconsistent.
3569 *
3570 * Since I/O has not been initiated yet, certain buffer flags
3571 * such as BIO_ERROR or B_INVAL may be in an inconsistent state
3572 * and should be ignored.
3573 */
3574void
3575vfs_busy_pages(struct buf *bp, int clear_modify)
3576{
3577	int i, bogus;
3578	vm_object_t obj;
3579	vm_ooffset_t foff;
3580	vm_page_t m;
3581
3582	if (!(bp->b_flags & B_VMIO))
3583		return;
3584
3585	obj = bp->b_bufobj->bo_object;
3586	foff = bp->b_offset;
3587	KASSERT(bp->b_offset != NOOFFSET,
3588	    ("vfs_busy_pages: no buffer offset"));
3589	VM_OBJECT_LOCK(obj);
3590	vfs_drain_busy_pages(bp);
3591	if (bp->b_bufsize != 0)
3592		vfs_setdirty_locked_object(bp);
3593	bogus = 0;
3594	for (i = 0; i < bp->b_npages; i++) {
3595		m = bp->b_pages[i];
3596
3597		if ((bp->b_flags & B_CLUSTER) == 0) {
3598			vm_object_pip_add(obj, 1);
3599			vm_page_io_start(m);
3600		}
3601		/*
3602		 * When readying a buffer for a read ( i.e.
3603		 * clear_modify == 0 ), it is important to do
3604		 * bogus_page replacement for valid pages in
3605		 * partially instantiated buffers.  Partially
3606		 * instantiated buffers can, in turn, occur when
3607		 * reconstituting a buffer from its VM backing store
3608		 * base.  We only have to do this if B_CACHE is
3609		 * clear ( which causes the I/O to occur in the
3610		 * first place ).  The replacement prevents the read
3611		 * I/O from overwriting potentially dirty VM-backed
3612		 * pages.  XXX bogus page replacement is, uh, bogus.
3613		 * It may not work properly with small-block devices.
3614		 * We need to find a better way.
3615		 */
3616		if (clear_modify) {
3617			pmap_remove_write(m);
3618			vfs_page_set_validclean(bp, foff, m);
3619		} else if (m->valid == VM_PAGE_BITS_ALL &&
3620		    (bp->b_flags & B_CACHE) == 0) {
3621			bp->b_pages[i] = bogus_page;
3622			bogus++;
3623		}
3624		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3625	}
3626	VM_OBJECT_UNLOCK(obj);
3627	if (bogus)
3628		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3629		    bp->b_pages, bp->b_npages);
3630}
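
/*
 * Illustrative sketch (hypothetical error path): vfs_busy_pages() is
 * normally undone by the completion processing in bufdone_finish(), or,
 * if the transfer is never started, by vfs_unbusy_pages():
 *
 *	vfs_busy_pages(bp, 0);		(readying a read)
 *	if (my_start_io(bp) != 0)	(hypothetical helper fails early)
 *		vfs_unbusy_pages(bp);	(in lieu of the bufdone() path)
 */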
3631
3632/*
3633 *	vfs_bio_set_valid:
3634 *
3635 *	Set the range within the buffer to valid.  The range is
3636 *	relative to the beginning of the buffer, b_offset.  Note that
3637 *	b_offset itself may be offset from the beginning of the first
3638 *	page.
3639 */
3640void
3641vfs_bio_set_valid(struct buf *bp, int base, int size)
3642{
3643	int i, n;
3644	vm_page_t m;
3645
3646	if (!(bp->b_flags & B_VMIO))
3647		return;
3648
3649	/*
3650	 * Fixup base to be relative to beginning of first page.
3651	 * Set initial n to be the maximum number of bytes in the
3652	 * first page that can be validated.
3653	 */
3654	base += (bp->b_offset & PAGE_MASK);
3655	n = PAGE_SIZE - (base & PAGE_MASK);
3656
3657	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
3658	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
3659		m = bp->b_pages[i];
3660		if (n > size)
3661			n = size;
3662		vm_page_set_valid(m, base & PAGE_MASK, n);
3663		base += n;
3664		size -= n;
3665		n = PAGE_SIZE;
3666	}
3667	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
3668}
3669
3670/*
3671 *	vfs_bio_clrbuf:
3672 *
3673 *	If the specified buffer is a non-VMIO buffer, clear the entire
3674 *	buffer.  If the specified buffer is a VMIO buffer, clear and
3675 *	validate only the previously invalid portions of the buffer.
3676 *	This routine essentially fakes an I/O, so we need to clear
3677 *	BIO_ERROR and B_INVAL.
3678 *
3679 *	Note that while we only theoretically need to clear through b_bcount,
3680 *	we go ahead and clear through b_bufsize.
3681 */
3682void
3683vfs_bio_clrbuf(struct buf *bp)
3684{
3685	int i, j, mask;
3686	caddr_t sa, ea;
3687
3688	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
3689		clrbuf(bp);
3690		return;
3691	}
3692	bp->b_flags &= ~B_INVAL;
3693	bp->b_ioflags &= ~BIO_ERROR;
3694	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
3695	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
3696	    (bp->b_offset & PAGE_MASK) == 0) {
3697		if (bp->b_pages[0] == bogus_page)
3698			goto unlock;
3699		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
3700		VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED);
3701		if ((bp->b_pages[0]->valid & mask) == mask)
3702			goto unlock;
3703		if ((bp->b_pages[0]->valid & mask) == 0) {
3704			bzero(bp->b_data, bp->b_bufsize);
3705			bp->b_pages[0]->valid |= mask;
3706			goto unlock;
3707		}
3708	}
3709	ea = sa = bp->b_data;
3710	for(i = 0; i < bp->b_npages; i++, sa = ea) {
3711		ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
3712		ea = (caddr_t)(vm_offset_t)ulmin(
3713		    (u_long)(vm_offset_t)ea,
3714		    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
3715		if (bp->b_pages[i] == bogus_page)
3716			continue;
3717		j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
3718		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
3719		VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED);
3720		if ((bp->b_pages[i]->valid & mask) == mask)
3721			continue;
3722		if ((bp->b_pages[i]->valid & mask) == 0)
3723			bzero(sa, ea - sa);
3724		else {
3725			for (; sa < ea; sa += DEV_BSIZE, j++) {
3726				if ((bp->b_pages[i]->valid & (1 << j)) == 0)
3727					bzero(sa, DEV_BSIZE);
3728			}
3729		}
3730		bp->b_pages[i]->valid |= mask;
3731	}
3732unlock:
3733	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
3734	bp->b_resid = 0;
3735}
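
/*
 * Worked example for the single-page fast path above (illustrative,
 * assuming DEV_BSIZE == 512 and PAGE_SIZE == 4096): a page-aligned buffer
 * with b_bufsize == 2048 yields mask == (1 << 4) - 1 == 0x000f, i.e. the
 * first four 512-byte chunks of the page.  If none of those valid bits
 * are set, all 2048 bytes are bzero'ed at once; otherwise only the
 * individual invalid DEV_BSIZE chunks are cleared.
 */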
3736
3737/*
3738 * vm_hold_load_pages and vm_hold_free_pages get pages into
3739 * a buffer's address space.  The pages are anonymous and are
3740 * not associated with a file object.
3741 */
3742static void
3743vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
3744{
3745	vm_offset_t pg;
3746	vm_page_t p;
3747	int index;
3748
3749	to = round_page(to);
3750	from = round_page(from);
3751	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3752
3753	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3754tryagain:
3755		/*
3756		 * note: must allocate system pages since blocking here
3757		 * could interfere with paging I/O, no matter which
3758		 * process we are.
3759		 */
3760		p = vm_page_alloc(NULL, pg >> PAGE_SHIFT, VM_ALLOC_NOOBJ |
3761		    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3762		    VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
3763		if (!p) {
3764			VM_WAIT;
3765			goto tryagain;
3766		}
3767		pmap_qenter(pg, &p, 1);
3768		bp->b_pages[index] = p;
3769	}
3770	bp->b_npages = index;
3771}
3772
3773/* Return pages associated with this buf to the vm system */
3774static void
3775vm_hold_free_pages(struct buf *bp, int newbsize)
3776{
3777	vm_offset_t from;
3778	vm_page_t p;
3779	int index, newnpages;
3780
3781	from = round_page((vm_offset_t)bp->b_data + newbsize);
3782	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3783	if (bp->b_npages > newnpages)
3784		pmap_qremove(from, bp->b_npages - newnpages);
3785	for (index = newnpages; index < bp->b_npages; index++) {
3786		p = bp->b_pages[index];
3787		bp->b_pages[index] = NULL;
3788		if (p->busy != 0)
3789			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
3790			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
3791		p->wire_count--;
3792		vm_page_free(p);
3793		atomic_subtract_int(&cnt.v_wire_count, 1);
3794	}
3795	bp->b_npages = newnpages;
3796}
3797
3798/*
3799 * Map an IO request into kernel virtual address space.
3800 *
3801 * All requests are (re)mapped into kernel VA space.
3802 * Notice that we use b_bufsize for the size of the buffer
3803 * to be mapped.  b_bcount might be modified by the driver.
3804 *
3805 * Note that even if the caller determines that the address space should
3806 * be valid, a race or a smaller file mapped into a larger space may
3807 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
3808 * check the return value.
3809 */
3810int
3811vmapbuf(struct buf *bp)
3812{
3813	caddr_t addr, kva;
3814	vm_prot_t prot;
3815	int pidx, i;
3816	struct vm_page *m;
3817	struct pmap *pmap = &curproc->p_vmspace->vm_pmap;
3818
3819	if (bp->b_bufsize < 0)
3820		return (-1);
3821	prot = VM_PROT_READ;
3822	if (bp->b_iocmd == BIO_READ)
3823		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
3824	for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0;
3825	     addr < bp->b_data + bp->b_bufsize;
3826	     addr += PAGE_SIZE, pidx++) {
3827		/*
3828		 * Do the vm_fault if needed; do the copy-on-write thing
3829		 * when reading stuff off device into memory.
3830		 *
3831		 * NOTE! Must use pmap_extract() because addr may be in
3832		 * the userland address space, and kextract is only guaranteed
3833		 * to work for the kernel address space (see: sparc64 port).
3834		 */
3835retry:
3836		if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data,
3837		    prot) < 0) {
3838			for (i = 0; i < pidx; ++i) {
3839				vm_page_lock(bp->b_pages[i]);
3840				vm_page_unhold(bp->b_pages[i]);
3841				vm_page_unlock(bp->b_pages[i]);
3842				bp->b_pages[i] = NULL;
3843			}
3844			return(-1);
3845		}
3846		m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot);
3847		if (m == NULL)
3848			goto retry;
3849		bp->b_pages[pidx] = m;
3850	}
3851	if (pidx > btoc(MAXPHYS))
3852		panic("vmapbuf: mapped more than MAXPHYS");
3853	pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
3854
3855	kva = bp->b_saveaddr;
3856	bp->b_npages = pidx;
3857	bp->b_saveaddr = bp->b_data;
3858	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
3859	return(0);
3860}
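
/*
 * Illustrative sketch (hypothetical physio-style caller): the user address
 * goes in b_data and a kernel VA range for the mapping in b_saveaddr, as
 * consumed by pmap_qenter() above; the return value must be checked, and
 * vunmapbuf() undoes the mapping afterwards.  "udata", "len" and
 * "pbuf_kva" are hypothetical.
 *
 *	bp->b_data = udata;		(user space address)
 *	bp->b_bufsize = len;
 *	bp->b_saveaddr = pbuf_kva;	(kernel VA range to map into)
 *	bp->b_iocmd = BIO_READ;
 *	if (vmapbuf(bp) < 0)
 *		return (EFAULT);
 *	(issue the transfer and wait for it)
 *	vunmapbuf(bp);
 */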
3861
3862/*
3863 * Free the io map PTEs associated with this IO operation.
3864 * We also invalidate the TLB entries and restore the original b_addr.
3865 */
3866void
3867vunmapbuf(struct buf *bp)
3868{
3869	int pidx;
3870	int npages;
3871
3872	npages = bp->b_npages;
3873	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
3874	for (pidx = 0; pidx < npages; pidx++) {
3875		vm_page_lock(bp->b_pages[pidx]);
3876		vm_page_unhold(bp->b_pages[pidx]);
3877		vm_page_unlock(bp->b_pages[pidx]);
3878	}
3879
3880	bp->b_data = bp->b_saveaddr;
3881}
3882
3883void
3884bdone(struct buf *bp)
3885{
3886	struct mtx *mtxp;
3887
3888	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3889	mtx_lock(mtxp);
3890	bp->b_flags |= B_DONE;
3891	wakeup(bp);
3892	mtx_unlock(mtxp);
3893}
3894
3895void
3896bwait(struct buf *bp, u_char pri, const char *wchan)
3897{
3898	struct mtx *mtxp;
3899
3900	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3901	mtx_lock(mtxp);
3902	while ((bp->b_flags & B_DONE) == 0)
3903		msleep(bp, mtxp, pri, wchan, 0);
3904	mtx_unlock(mtxp);
3905}
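
/*
 * Illustrative sketch: bdone()/bwait() form a minimal completion handshake
 * on a buffer, keyed on B_DONE and serialized by the pool mutex; bufwait()
 * above is simply bwait() plus error decoding.  "my_start_transfer" is a
 * hypothetical routine whose completion side calls bdone(bp).
 *
 *	bp->b_flags &= ~B_DONE;		(arm the handshake)
 *	my_start_transfer(bp);
 *	bwait(bp, PRIBIO, "exwait");	(sleeps until bdone() sets B_DONE)
 */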
3906
3907int
3908bufsync(struct bufobj *bo, int waitfor)
3909{
3910
3911	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
3912}
3913
3914void
3915bufstrategy(struct bufobj *bo, struct buf *bp)
3916{
3917	int i = 0;
3918	struct vnode *vp;
3919
3920	vp = bp->b_vp;
3921	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
3922	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
3923	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
3924	i = VOP_STRATEGY(vp, bp);
3925	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
3926}
3927
3928void
3929bufobj_wrefl(struct bufobj *bo)
3930{
3931
3932	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
3933	ASSERT_BO_LOCKED(bo);
3934	bo->bo_numoutput++;
3935}
3936
3937void
3938bufobj_wref(struct bufobj *bo)
3939{
3940
3941	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
3942	BO_LOCK(bo);
3943	bo->bo_numoutput++;
3944	BO_UNLOCK(bo);
3945}
3946
3947void
3948bufobj_wdrop(struct bufobj *bo)
3949{
3950
3951	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
3952	BO_LOCK(bo);
3953	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
3954	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
3955		bo->bo_flag &= ~BO_WWAIT;
3956		wakeup(&bo->bo_numoutput);
3957	}
3958	BO_UNLOCK(bo);
3959}
3960
3961int
3962bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
3963{
3964	int error;
3965
3966	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
3967	ASSERT_BO_LOCKED(bo);
3968	error = 0;
3969	while (bo->bo_numoutput) {
3970		bo->bo_flag |= BO_WWAIT;
3971		error = msleep(&bo->bo_numoutput, BO_MTX(bo),
3972		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
3973		if (error)
3974			break;
3975	}
3976	return (error);
3977}
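
/*
 * Illustrative sketch (hypothetical sync path): the bufobj write counter
 * is raised with bufobj_wref() when a write is started, dropped again by
 * bufdone() via bufobj_wdrop(), and drained like this:
 *
 *	BO_LOCK(bo);
 *	error = bufobj_wwait(bo, 0, 0);	(sleep until bo_numoutput reaches 0)
 *	BO_UNLOCK(bo);
 */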
3978
3979void
3980bpin(struct buf *bp)
3981{
3982	struct mtx *mtxp;
3983
3984	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3985	mtx_lock(mtxp);
3986	bp->b_pin_count++;
3987	mtx_unlock(mtxp);
3988}
3989
3990void
3991bunpin(struct buf *bp)
3992{
3993	struct mtx *mtxp;
3994
3995	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3996	mtx_lock(mtxp);
3997	if (--bp->b_pin_count == 0)
3998		wakeup(bp);
3999	mtx_unlock(mtxp);
4000}
4001
4002void
4003bunpin_wait(struct buf *bp)
4004{
4005	struct mtx *mtxp;
4006
4007	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4008	mtx_lock(mtxp);
4009	while (bp->b_pin_count > 0)
4010		msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4011	mtx_unlock(mtxp);
4012}
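
/*
 * Illustrative sketch: the pin count is a simple counter protected by the
 * pool mutex; a thread that must keep the buffer around pins it, releases
 * the pin when done, and a waiter drains the count before proceeding.
 *
 *	bpin(bp);		(one thread takes a pin)
 *	...
 *	bunpin(bp);		(releases it, waking waiters at zero)
 *
 *	bunpin_wait(bp);	(elsewhere: block until b_pin_count is 0)
 */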
4013
4014#include "opt_ddb.h"
4015#ifdef DDB
4016#include <ddb/ddb.h>
4017
4018/* DDB command to show buffer data */
4019DB_SHOW_COMMAND(buffer, db_show_buffer)
4020{
4021	/* get args */
4022	struct buf *bp = (struct buf *)addr;
4023
4024	if (!have_addr) {
4025		db_printf("usage: show buffer <addr>\n");
4026		return;
4027	}
4028
4029	db_printf("buf at %p\n", bp);
4030	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
4031	db_printf(
4032	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4033	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_dep = %p\n",
4034	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4035	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4036	    bp->b_dep.lh_first);
4037	if (bp->b_npages) {
4038		int i;
4039		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4040		for (i = 0; i < bp->b_npages; i++) {
4041			vm_page_t m;
4042			m = bp->b_pages[i];
4043			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4044			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4045			if ((i + 1) < bp->b_npages)
4046				db_printf(",");
4047		}
4048		db_printf("\n");
4049	}
4050	db_printf(" ");
4051	lockmgr_printinfo(&bp->b_lock);
4052}
4053
4054DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4055{
4056	struct buf *bp;
4057	int i;
4058
4059	for (i = 0; i < nbuf; i++) {
4060		bp = &buf[i];
4061		if (BUF_ISLOCKED(bp)) {
4062			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4063			db_printf("\n");
4064		}
4065	}
4066}
4067
4068DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4069{
4070	struct vnode *vp;
4071	struct buf *bp;
4072
4073	if (!have_addr) {
4074		db_printf("usage: show vnodebufs <addr>\n");
4075		return;
4076	}
4077	vp = (struct vnode *)addr;
4078	db_printf("Clean buffers:\n");
4079	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4080		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4081		db_printf("\n");
4082	}
4083	db_printf("Dirty buffers:\n");
4084	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4085		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4086		db_printf("\n");
4087	}
4088}
4089
4090DB_COMMAND(countfreebufs, db_countfreebufs)
4091{
4092	struct buf *bp;
4093	int i, used = 0, nfree = 0;
4094
4095	if (have_addr) {
4096		db_printf("usage: countfreebufs\n");
4097		return;
4098	}
4099
4100	for (i = 0; i < nbuf; i++) {
4101		bp = &buf[i];
4102		if ((bp->b_vflags & BV_INFREECNT) != 0)
4103			nfree++;
4104		else
4105			used++;
4106	}
4107
4108	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4109	    nfree + used);
4110	db_printf("numfreebuffers is %d\n", numfreebuffers);
4111}
4112#endif /* DDB */
4113