vfs_bio.c revision 251171
168651Skris/*-
268651Skris * Copyright (c) 2004 Poul-Henning Kamp
368651Skris * Copyright (c) 1994,1997 John S. Dyson
468651Skris * Copyright (c) 2013 The FreeBSD Foundation
568651Skris * All rights reserved.
668651Skris *
768651Skris * Portions of this software were developed by Konstantin Belousov
868651Skris * under sponsorship from the FreeBSD Foundation.
968651Skris *
1068651Skris * Redistribution and use in source and binary forms, with or without
1168651Skris * modification, are permitted provided that the following conditions
1268651Skris * are met:
1368651Skris * 1. Redistributions of source code must retain the above copyright
1468651Skris *    notice, this list of conditions and the following disclaimer.
1568651Skris * 2. Redistributions in binary form must reproduce the above copyright
1668651Skris *    notice, this list of conditions and the following disclaimer in the
1768651Skris *    documentation and/or other materials provided with the distribution.
1868651Skris *
1968651Skris * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
2068651Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2168651Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2268651Skris * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2368651Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2468651Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2568651Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2668651Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2768651Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2868651Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2968651Skris * SUCH DAMAGE.
3068651Skris */
3168651Skris
3268651Skris/*
3368651Skris * this file contains a new buffer I/O scheme implementing a coherent
3468651Skris * VM object and buffer cache scheme.  Pains have been taken to make
3568651Skris * sure that the performance degradation associated with schemes such
3668651Skris * as this is not realized.
3768651Skris *
3868651Skris * Author:  John S. Dyson
3968651Skris * Significant help during the development and debugging phases
4076866Skris * had been provided by David Greenman, also of the FreeBSD core team.
4176866Skris *
4276866Skris * see man buf(9) for more info.
4376866Skris */
4476866Skris
4568651Skris#include <sys/cdefs.h>
4668651Skris__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 251171 2013-05-31 00:43:41Z jeff $");
4768651Skris
4868651Skris#include <sys/param.h>
4968651Skris#include <sys/systm.h>
5068651Skris#include <sys/bio.h>
5168651Skris#include <sys/conf.h>
5268651Skris#include <sys/buf.h>
5368651Skris#include <sys/devicestat.h>
5468651Skris#include <sys/eventhandler.h>
5568651Skris#include <sys/fail.h>
5668651Skris#include <sys/limits.h>
5768651Skris#include <sys/lock.h>
5868651Skris#include <sys/malloc.h>
5968651Skris#include <sys/mount.h>
6068651Skris#include <sys/mutex.h>
6168651Skris#include <sys/kernel.h>
6272613Skris#include <sys/kthread.h>
6368651Skris#include <sys/proc.h>
6468651Skris#include <sys/resourcevar.h>
6568651Skris#include <sys/rwlock.h>
6668651Skris#include <sys/sysctl.h>
6768651Skris#include <sys/vmmeter.h>
6868651Skris#include <sys/vnode.h>
6968651Skris#include <geom/geom.h>
7068651Skris#include <vm/vm.h>
7168651Skris#include <vm/vm_param.h>
7268651Skris#include <vm/vm_kern.h>
7368651Skris#include <vm/vm_pageout.h>
7468651Skris#include <vm/vm_page.h>
7576866Skris#include <vm/vm_object.h>
7676866Skris#include <vm/vm_extern.h>
7776866Skris#include <vm/vm_map.h>
7868651Skris#include "opt_compat.h"
7968651Skris#include "opt_directio.h"
80#include "opt_swap.h"
81
82static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
83
84struct	bio_ops bioops;		/* I/O operation notification */
85
86struct	buf_ops buf_ops_bio = {
87	.bop_name	=	"buf_ops_bio",
88	.bop_write	=	bufwrite,
89	.bop_strategy	=	bufstrategy,
90	.bop_sync	=	bufsync,
91	.bop_bdflush	=	bufbdflush,
92};
93
94/*
95 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has
96 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
97 */
98struct buf *buf;		/* buffer header pool */
99caddr_t unmapped_buf;
100
101static struct proc *bufdaemonproc;
102
103static int inmem(struct vnode *vp, daddr_t blkno);
104static void vm_hold_free_pages(struct buf *bp, int newbsize);
105static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
106		vm_offset_t to);
107static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
108static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
109		vm_page_t m);
110static void vfs_drain_busy_pages(struct buf *bp);
111static void vfs_clean_pages_dirty_buf(struct buf *bp);
112static void vfs_setdirty_locked_object(struct buf *bp);
113static void vfs_vmio_release(struct buf *bp);
114static int vfs_bio_clcheck(struct vnode *vp, int size,
115		daddr_t lblkno, daddr_t blkno);
116static int buf_do_flush(struct vnode *vp);
117static int flushbufqueues(struct vnode *, int, int);
118static void buf_daemon(void);
119static void bremfreel(struct buf *bp);
120#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
121    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
122static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
123#endif
124
125int vmiodirenable = TRUE;
126SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
127    "Use the VM system for directory writes");
128long runningbufspace;
129SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
130    "Amount of presently outstanding async buffer io");
131static long bufspace;
132#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
133    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
134SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
135    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
136#else
137SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
138    "Virtual memory used for buffers");
139#endif
140static long unmapped_bufspace;
141SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
142    &unmapped_bufspace, 0,
143    "Amount of unmapped buffers, inclusive in the bufspace");
144static long maxbufspace;
145SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
146    "Maximum allowed value of bufspace (including buf_daemon)");
147static long bufmallocspace;
148SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
149    "Amount of malloced memory for buffers");
150static long maxbufmallocspace;
151SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
152    "Maximum amount of malloced memory for buffers");
153static long lobufspace;
154SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
155    "Minimum amount of buffers we want to have");
156long hibufspace;
157SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
158    "Maximum allowed value of bufspace (excluding buf_daemon)");
159static int bufreusecnt;
160SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
161    "Number of times we have reused a buffer");
162static int buffreekvacnt;
163SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
164    "Number of times we have freed the KVA space from some buffer");
165static int bufdefragcnt;
166SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
167    "Number of times we have had to repeat buffer allocation to defragment");
168static long lorunningspace;
169SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
170    "Minimum preferred space used for in-progress I/O");
171static long hirunningspace;
172SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
173    "Maximum amount of space to use for in-progress I/O");
174int dirtybufferflushes;
175SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
176    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
177int bdwriteskip;
178SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
179    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
180int altbufferflushes;
181SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
182    0, "Number of fsync flushes to limit dirty buffers");
183static int recursiveflushes;
184SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
185    0, "Number of flushes skipped due to being recursive");
186static int numdirtybuffers;
187SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
188    "Number of buffers that are dirty (has unwritten changes) at the moment");
189static int lodirtybuffers;
190SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
191    "How many buffers we want to have free before bufdaemon can sleep");
192static int hidirtybuffers;
193SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
194    "When the number of dirty buffers is considered severe");
195int dirtybufthresh;
196SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
197    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
198static int numfreebuffers;
199SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
200    "Number of free buffers");
201static int lofreebuffers;
202SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
203   "XXX Unused");
204static int hifreebuffers;
205SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
206   "XXX Complicatedly unused");
207static int getnewbufcalls;
208SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
209   "Number of calls to getnewbuf");
210static int getnewbufrestarts;
211SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
212    "Number of times getnewbuf has had to restart a buffer aquisition");
213static int mappingrestarts;
214SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
215    "Number of times getblk has had to restart a buffer mapping for "
216    "unmapped buffer");
217static int flushbufqtarget = 100;
218SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
219    "Amount of work to do in flushbufqueues when helping bufdaemon");
220static long notbufdflashes;
221SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
222    "Number of dirty buffer flushes done by the bufdaemon helpers");
223static long barrierwrites;
224SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
225    "Number of barrier writes");
226SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
227    &unmapped_buf_allowed, 0,
228    "Permit the use of the unmapped i/o");
229
230/*
231 * Wakeup point for bufdaemon, as well as indicator of whether it is already
232 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
233 * is idling.
234 */
235static int bd_request;
236
237/*
238 * Request for the buf daemon to write more buffers than is indicated by
239 * lodirtybuf.  This may be necessary to push out excess dependencies or
240 * defragment the address space where a simple count of the number of dirty
241 * buffers is insufficient to characterize the demand for flushing them.
242 */
243static int bd_speedupreq;
244
245/*
246 * This lock synchronizes access to bd_request.
247 */
248static struct mtx bdlock;
249
250/*
251 * bogus page -- for I/O to/from partially complete buffers
252 * this is a temporary solution to the problem, but it is not
253 * really that bad.  it would be better to split the buffer
254 * for input in the case of buffers partially already in memory,
255 * but the code is intricate enough already.
256 */
257vm_page_t bogus_page;
258
259/*
260 * Synchronization (sleep/wakeup) variable for active buffer space requests.
261 * Set when wait starts, cleared prior to wakeup().
262 * Used in runningbufwakeup() and waitrunningbufspace().
263 */
264static int runningbufreq;
265
266/*
267 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
268 * waitrunningbufspace().
269 */
270static struct mtx rbreqlock;
271
272/*
273 * Synchronization (sleep/wakeup) variable for buffer requests.
274 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
275 * by and/or.
276 * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
277 * getnewbuf(), and getblk().
278 */
279static int needsbuffer;
280
281/*
282 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
283 */
284static struct mtx nblock;
285
286/*
287 * Definitions for the buffer free lists.
288 */
289#define BUFFER_QUEUES	5	/* number of free buffer queues */
290
291#define QUEUE_NONE	0	/* on no queue */
292#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
293#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
294#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
295#define QUEUE_EMPTY	4	/* empty buffer headers */
296#define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
297
298/* Queues for free buffers with various properties */
299static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
300#ifdef INVARIANTS
301static int bq_len[BUFFER_QUEUES];
302#endif
303
304/* Lock for the bufqueues */
305static struct mtx bqlock;
306
307/*
308 * Single global constant for BUF_WMESG, to avoid getting multiple references.
309 * buf_wmesg is referred from macros.
310 */
311const char *buf_wmesg = BUF_WMESG;
312
313#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
314#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
315#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
316#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
317
318#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
319    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
320static int
321sysctl_bufspace(SYSCTL_HANDLER_ARGS)
322{
323	long lvalue;
324	int ivalue;
325
326	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
327		return (sysctl_handle_long(oidp, arg1, arg2, req));
328	lvalue = *(long *)arg1;
329	if (lvalue > INT_MAX)
330		/* On overflow, still write out a long to trigger ENOMEM. */
331		return (sysctl_handle_long(oidp, &lvalue, 0, req));
332	ivalue = lvalue;
333	return (sysctl_handle_int(oidp, &ivalue, 0, req));
334}
335#endif
336
337#ifdef DIRECTIO
338extern void ffs_rawread_setup(void);
339#endif /* DIRECTIO */
340/*
341 *	numdirtywakeup:
342 *
343 *	If someone is blocked due to there being too many dirty buffers,
344 *	and numdirtybuffers is now reasonable, wake them up.
345 */
346
347static __inline void
348numdirtywakeup(int level)
349{
350
351	if (numdirtybuffers <= level) {
352		mtx_lock(&nblock);
353		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
354			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
355			wakeup(&needsbuffer);
356		}
357		mtx_unlock(&nblock);
358	}
359}
360
361/*
362 *	bufspacewakeup:
363 *
364 *	Called when buffer space is potentially available for recovery.
365 *	getnewbuf() will block on this flag when it is unable to free
366 *	sufficient buffer space.  Buffer space becomes recoverable when
367 *	bp's get placed back in the queues.
368 */
369
370static __inline void
371bufspacewakeup(void)
372{
373
374	/*
375	 * If someone is waiting for BUF space, wake them up.  Even
376	 * though we haven't freed the kva space yet, the waiting
377	 * process will be able to now.
378	 */
379	mtx_lock(&nblock);
380	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
381		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
382		wakeup(&needsbuffer);
383	}
384	mtx_unlock(&nblock);
385}
386
387/*
388 * runningbufwakeup() - in-progress I/O accounting.
389 *
390 */
391void
392runningbufwakeup(struct buf *bp)
393{
394
395	if (bp->b_runningbufspace) {
396		atomic_subtract_long(&runningbufspace, bp->b_runningbufspace);
397		bp->b_runningbufspace = 0;
398		mtx_lock(&rbreqlock);
399		if (runningbufreq && runningbufspace <= lorunningspace) {
400			runningbufreq = 0;
401			wakeup(&runningbufreq);
402		}
403		mtx_unlock(&rbreqlock);
404	}
405}
406
407/*
408 *	bufcountwakeup:
409 *
410 *	Called when a buffer has been added to one of the free queues to
411 *	account for the buffer and to wakeup anyone waiting for free buffers.
412 *	This typically occurs when large amounts of metadata are being handled
413 *	by the buffer cache ( else buffer space runs out first, usually ).
414 */
415
416static __inline void
417bufcountwakeup(struct buf *bp)
418{
419	int old;
420
421	KASSERT((bp->b_flags & B_INFREECNT) == 0,
422	    ("buf %p already counted as free", bp));
423	bp->b_flags |= B_INFREECNT;
424	old = atomic_fetchadd_int(&numfreebuffers, 1);
425	KASSERT(old >= 0 && old < nbuf,
426	    ("numfreebuffers climbed to %d", old + 1));
427	mtx_lock(&nblock);
428	if (needsbuffer) {
429		needsbuffer &= ~VFS_BIO_NEED_ANY;
430		if (numfreebuffers >= hifreebuffers)
431			needsbuffer &= ~VFS_BIO_NEED_FREE;
432		wakeup(&needsbuffer);
433	}
434	mtx_unlock(&nblock);
435}
436
437/*
438 *	waitrunningbufspace()
439 *
440 *	runningbufspace is a measure of the amount of I/O currently
441 *	running.  This routine is used in async-write situations to
442 *	prevent creating huge backups of pending writes to a device.
443 *	Only asynchronous writes are governed by this function.
444 *
445 *	Reads will adjust runningbufspace, but will not block based on it.
446 *	The read load has a side effect of reducing the allowed write load.
447 *
448 *	This does NOT turn an async write into a sync write.  It waits
449 *	for earlier writes to complete and generally returns before the
450 *	caller's write has reached the device.
451 */
452void
453waitrunningbufspace(void)
454{
455
456	mtx_lock(&rbreqlock);
457	while (runningbufspace > hirunningspace) {
458		++runningbufreq;
459		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
460	}
461	mtx_unlock(&rbreqlock);
462}
463
464
465/*
466 *	vfs_buf_test_cache:
467 *
468 *	Called when a buffer is extended.  This function clears the B_CACHE
469 *	bit if the newly extended portion of the buffer does not contain
470 *	valid data.
471 */
472static __inline
473void
474vfs_buf_test_cache(struct buf *bp,
475		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
476		  vm_page_t m)
477{
478
479	VM_OBJECT_ASSERT_WLOCKED(m->object);
480	if (bp->b_flags & B_CACHE) {
481		int base = (foff + off) & PAGE_MASK;
482		if (vm_page_is_valid(m, base, size) == 0)
483			bp->b_flags &= ~B_CACHE;
484	}
485}
486
487/* Wake up the buffer daemon if necessary */
488static __inline
489void
490bd_wakeup(int dirtybuflevel)
491{
492
493	mtx_lock(&bdlock);
494	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
495		bd_request = 1;
496		wakeup(&bd_request);
497	}
498	mtx_unlock(&bdlock);
499}
500
501/*
502 * bd_speedup - speedup the buffer cache flushing code
503 */
504
505void
506bd_speedup(void)
507{
508	int needwake;
509
510	mtx_lock(&bdlock);
511	needwake = 0;
512	if (bd_speedupreq == 0 || bd_request == 0)
513		needwake = 1;
514	bd_speedupreq = 1;
515	bd_request = 1;
516	if (needwake)
517		wakeup(&bd_request);
518	mtx_unlock(&bdlock);
519}
520
521#ifdef __i386__
522#define	TRANSIENT_DENOM	5
523#else
524#define	TRANSIENT_DENOM 10
525#endif
526
527/*
528 * Calculating buffer cache scaling values and reserve space for buffer
529 * headers.  This is called during low level kernel initialization and
530 * may be called more then once.  We CANNOT write to the memory area
531 * being reserved at this time.
532 */
533caddr_t
534kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
535{
536	int tuned_nbuf;
537	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
538
539	/*
540	 * physmem_est is in pages.  Convert it to kilobytes (assumes
541	 * PAGE_SIZE is >= 1K)
542	 */
543	physmem_est = physmem_est * (PAGE_SIZE / 1024);
544
545	/*
546	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
547	 * For the first 64MB of ram nominally allocate sufficient buffers to
548	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
549	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
550	 * the buffer cache we limit the eventual kva reservation to
551	 * maxbcache bytes.
552	 *
553	 * factor represents the 1/4 x ram conversion.
554	 */
555	if (nbuf == 0) {
556		int factor = 4 * BKVASIZE / 1024;
557
558		nbuf = 50;
559		if (physmem_est > 4096)
560			nbuf += min((physmem_est - 4096) / factor,
561			    65536 / factor);
562		if (physmem_est > 65536)
563			nbuf += (physmem_est - 65536) * 2 / (factor * 5);
564
565		if (maxbcache && nbuf > maxbcache / BKVASIZE)
566			nbuf = maxbcache / BKVASIZE;
567		tuned_nbuf = 1;
568	} else
569		tuned_nbuf = 0;
570
571	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
572	maxbuf = (LONG_MAX / 3) / BKVASIZE;
573	if (nbuf > maxbuf) {
574		if (!tuned_nbuf)
575			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
576			    maxbuf);
577		nbuf = maxbuf;
578	}
579
580	/*
581	 * Ideal allocation size for the transient bio submap if 10%
582	 * of the maximal space buffer map.  This roughly corresponds
583	 * to the amount of the buffer mapped for typical UFS load.
584	 *
585	 * Clip the buffer map to reserve space for the transient
586	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
587	 * maximum buffer map extent on the platform.
588	 *
589	 * The fall-back to the maxbuf in case of maxbcache unset,
590	 * allows to not trim the buffer KVA for the architectures
591	 * with ample KVA space.
592	 */
593	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
594		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
595		buf_sz = (long)nbuf * BKVASIZE;
596		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
597		    (TRANSIENT_DENOM - 1)) {
598			/*
599			 * There is more KVA than memory.  Do not
600			 * adjust buffer map size, and assign the rest
601			 * of maxbuf to transient map.
602			 */
603			biotmap_sz = maxbuf_sz - buf_sz;
604		} else {
605			/*
606			 * Buffer map spans all KVA we could afford on
607			 * this platform.  Give 10% (20% on i386) of
608			 * the buffer map to the transient bio map.
609			 */
610			biotmap_sz = buf_sz / TRANSIENT_DENOM;
611			buf_sz -= biotmap_sz;
612		}
613		if (biotmap_sz / INT_MAX > MAXPHYS)
614			bio_transient_maxcnt = INT_MAX;
615		else
616			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
617		/*
618		 * Artifically limit to 1024 simultaneous in-flight I/Os
619		 * using the transient mapping.
620		 */
621		if (bio_transient_maxcnt > 1024)
622			bio_transient_maxcnt = 1024;
623		if (tuned_nbuf)
624			nbuf = buf_sz / BKVASIZE;
625	}
626
627	/*
628	 * swbufs are used as temporary holders for I/O, such as paging I/O.
629	 * We have no less then 16 and no more then 256.
630	 */
631	nswbuf = max(min(nbuf/4, 256), 16);
632#ifdef NSWBUF_MIN
633	if (nswbuf < NSWBUF_MIN)
634		nswbuf = NSWBUF_MIN;
635#endif
636#ifdef DIRECTIO
637	ffs_rawread_setup();
638#endif
639
640	/*
641	 * Reserve space for the buffer cache buffers
642	 */
643	swbuf = (void *)v;
644	v = (caddr_t)(swbuf + nswbuf);
645	buf = (void *)v;
646	v = (caddr_t)(buf + nbuf);
647
648	return(v);
649}
650
651/* Initialize the buffer subsystem.  Called before use of any buffers. */
652void
653bufinit(void)
654{
655	struct buf *bp;
656	int i;
657
658	mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
659	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
660	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
661	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
662
663	/* next, make a null set of free lists */
664	for (i = 0; i < BUFFER_QUEUES; i++)
665		TAILQ_INIT(&bufqueues[i]);
666
667	/* finally, initialize each buffer header and stick on empty q */
668	for (i = 0; i < nbuf; i++) {
669		bp = &buf[i];
670		bzero(bp, sizeof *bp);
671		bp->b_flags = B_INVAL | B_INFREECNT;
672		bp->b_rcred = NOCRED;
673		bp->b_wcred = NOCRED;
674		bp->b_qindex = QUEUE_EMPTY;
675		bp->b_xflags = 0;
676		LIST_INIT(&bp->b_dep);
677		BUF_LOCKINIT(bp);
678		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
679#ifdef INVARIANTS
680		bq_len[QUEUE_EMPTY]++;
681#endif
682	}
683
684	/*
685	 * maxbufspace is the absolute maximum amount of buffer space we are
686	 * allowed to reserve in KVM and in real terms.  The absolute maximum
687	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
688	 * used by most other processes.  The differential is required to
689	 * ensure that buf_daemon is able to run when other processes might
690	 * be blocked waiting for buffer space.
691	 *
692	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
693	 * this may result in KVM fragmentation which is not handled optimally
694	 * by the system.
695	 */
696	maxbufspace = (long)nbuf * BKVASIZE;
697	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
698	lobufspace = hibufspace - MAXBSIZE;
699
700	/*
701	 * Note: The 16 MiB upper limit for hirunningspace was chosen
702	 * arbitrarily and may need further tuning. It corresponds to
703	 * 128 outstanding write IO requests (if IO size is 128 KiB),
704	 * which fits with many RAID controllers' tagged queuing limits.
705	 * The lower 1 MiB limit is the historical upper limit for
706	 * hirunningspace.
707	 */
708	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
709	    16 * 1024 * 1024), 1024 * 1024);
710	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
711
712/*
713 * Limit the amount of malloc memory since it is wired permanently into
714 * the kernel space.  Even though this is accounted for in the buffer
715 * allocation, we don't want the malloced region to grow uncontrolled.
716 * The malloc scheme improves memory utilization significantly on average
717 * (small) directories.
718 */
719	maxbufmallocspace = hibufspace / 20;
720
721/*
722 * Reduce the chance of a deadlock occuring by limiting the number
723 * of delayed-write dirty buffers we allow to stack up.
724 */
725	hidirtybuffers = nbuf / 4 + 20;
726	dirtybufthresh = hidirtybuffers * 9 / 10;
727	numdirtybuffers = 0;
728/*
729 * To support extreme low-memory systems, make sure hidirtybuffers cannot
730 * eat up all available buffer space.  This occurs when our minimum cannot
731 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
732 * BKVASIZE'd buffers.
733 */
734	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
735		hidirtybuffers >>= 1;
736	}
737	lodirtybuffers = hidirtybuffers / 2;
738
739/*
740 * Try to keep the number of free buffers in the specified range,
741 * and give special processes (e.g. like buf_daemon) access to an
742 * emergency reserve.
743 */
744	lofreebuffers = nbuf / 18 + 5;
745	hifreebuffers = 2 * lofreebuffers;
746	numfreebuffers = nbuf;
747
748	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
749	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
750	unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS);
751}
752
753#ifdef INVARIANTS
754static inline void
755vfs_buf_check_mapped(struct buf *bp)
756{
757
758	KASSERT((bp->b_flags & B_UNMAPPED) == 0,
759	    ("mapped buf %p %x", bp, bp->b_flags));
760	KASSERT(bp->b_kvabase != unmapped_buf,
761	    ("mapped buf: b_kvabase was not updated %p", bp));
762	KASSERT(bp->b_data != unmapped_buf,
763	    ("mapped buf: b_data was not updated %p", bp));
764}
765
766static inline void
767vfs_buf_check_unmapped(struct buf *bp)
768{
769
770	KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
771	    ("unmapped buf %p %x", bp, bp->b_flags));
772	KASSERT(bp->b_kvabase == unmapped_buf,
773	    ("unmapped buf: corrupted b_kvabase %p", bp));
774	KASSERT(bp->b_data == unmapped_buf,
775	    ("unmapped buf: corrupted b_data %p", bp));
776}
777
778#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
779#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
780#else
781#define	BUF_CHECK_MAPPED(bp) do {} while (0)
782#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
783#endif
784
785static void
786bpmap_qenter(struct buf *bp)
787{
788
789	BUF_CHECK_MAPPED(bp);
790
791	/*
792	 * bp->b_data is relative to bp->b_offset, but
793	 * bp->b_offset may be offset into the first page.
794	 */
795	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
796	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
797	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
798	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
799}
800
801/*
802 * bfreekva() - free the kva allocation for a buffer.
803 *
804 *	Since this call frees up buffer space, we call bufspacewakeup().
805 */
806static void
807bfreekva(struct buf *bp)
808{
809
810	if (bp->b_kvasize == 0)
811		return;
812
813	atomic_add_int(&buffreekvacnt, 1);
814	atomic_subtract_long(&bufspace, bp->b_kvasize);
815	if ((bp->b_flags & B_UNMAPPED) == 0) {
816		BUF_CHECK_MAPPED(bp);
817		vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvabase,
818		    (vm_offset_t)bp->b_kvabase + bp->b_kvasize);
819	} else {
820		BUF_CHECK_UNMAPPED(bp);
821		if ((bp->b_flags & B_KVAALLOC) != 0) {
822			vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvaalloc,
823			    (vm_offset_t)bp->b_kvaalloc + bp->b_kvasize);
824		}
825		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
826		bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
827	}
828	bp->b_kvasize = 0;
829	bufspacewakeup();
830}
831
832/*
833 *	bremfree:
834 *
835 *	Mark the buffer for removal from the appropriate free list in brelse.
836 *
837 */
838void
839bremfree(struct buf *bp)
840{
841	int old;
842
843	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
844	KASSERT((bp->b_flags & B_REMFREE) == 0,
845	    ("bremfree: buffer %p already marked for delayed removal.", bp));
846	KASSERT(bp->b_qindex != QUEUE_NONE,
847	    ("bremfree: buffer %p not on a queue.", bp));
848	BUF_ASSERT_XLOCKED(bp);
849
850	bp->b_flags |= B_REMFREE;
851	/* Fixup numfreebuffers count.  */
852	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
853		KASSERT((bp->b_flags & B_INFREECNT) != 0,
854		    ("buf %p not counted in numfreebuffers", bp));
855		bp->b_flags &= ~B_INFREECNT;
856		old = atomic_fetchadd_int(&numfreebuffers, -1);
857		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
858	}
859}
860
861/*
862 *	bremfreef:
863 *
864 *	Force an immediate removal from a free list.  Used only in nfs when
865 *	it abuses the b_freelist pointer.
866 */
867void
868bremfreef(struct buf *bp)
869{
870	mtx_lock(&bqlock);
871	bremfreel(bp);
872	mtx_unlock(&bqlock);
873}
874
875/*
876 *	bremfreel:
877 *
878 *	Removes a buffer from the free list, must be called with the
879 *	bqlock held.
880 */
881static void
882bremfreel(struct buf *bp)
883{
884	int old;
885
886	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
887	    bp, bp->b_vp, bp->b_flags);
888	KASSERT(bp->b_qindex != QUEUE_NONE,
889	    ("bremfreel: buffer %p not on a queue.", bp));
890	BUF_ASSERT_XLOCKED(bp);
891	mtx_assert(&bqlock, MA_OWNED);
892
893	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
894#ifdef INVARIANTS
895	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
896	    bp->b_qindex));
897	bq_len[bp->b_qindex]--;
898#endif
899	bp->b_qindex = QUEUE_NONE;
900	/*
901	 * If this was a delayed bremfree() we only need to remove the buffer
902	 * from the queue and return the stats are already done.
903	 */
904	if (bp->b_flags & B_REMFREE) {
905		bp->b_flags &= ~B_REMFREE;
906		return;
907	}
908	/*
909	 * Fixup numfreebuffers count.  If the buffer is invalid or not
910	 * delayed-write, the buffer was free and we must decrement
911	 * numfreebuffers.
912	 */
913	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
914		KASSERT((bp->b_flags & B_INFREECNT) != 0,
915		    ("buf %p not counted in numfreebuffers", bp));
916		bp->b_flags &= ~B_INFREECNT;
917		old = atomic_fetchadd_int(&numfreebuffers, -1);
918		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
919	}
920}
921
922/*
923 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
924 * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
925 * the buffer is valid and we do not have to do anything.
926 */
927void
928breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
929    int cnt, struct ucred * cred)
930{
931	struct buf *rabp;
932	int i;
933
934	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
935		if (inmem(vp, *rablkno))
936			continue;
937		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
938
939		if ((rabp->b_flags & B_CACHE) == 0) {
940			if (!TD_IS_IDLETHREAD(curthread))
941				curthread->td_ru.ru_inblock++;
942			rabp->b_flags |= B_ASYNC;
943			rabp->b_flags &= ~B_INVAL;
944			rabp->b_ioflags &= ~BIO_ERROR;
945			rabp->b_iocmd = BIO_READ;
946			if (rabp->b_rcred == NOCRED && cred != NOCRED)
947				rabp->b_rcred = crhold(cred);
948			vfs_busy_pages(rabp, 0);
949			BUF_KERNPROC(rabp);
950			rabp->b_iooffset = dbtob(rabp->b_blkno);
951			bstrategy(rabp);
952		} else {
953			brelse(rabp);
954		}
955	}
956}
957
958/*
959 * Entry point for bread() and breadn() via #defines in sys/buf.h.
960 *
961 * Get a buffer with the specified data.  Look in the cache first.  We
962 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
963 * is set, the buffer is valid and we do not have to do anything, see
964 * getblk(). Also starts asynchronous I/O on read-ahead blocks.
965 */
966int
967breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
968    int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
969{
970	struct buf *bp;
971	int rv = 0, readwait = 0;
972
973	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
974	/*
975	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
976	 */
977	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
978	if (bp == NULL)
979		return (EBUSY);
980
981	/* if not found in cache, do some I/O */
982	if ((bp->b_flags & B_CACHE) == 0) {
983		if (!TD_IS_IDLETHREAD(curthread))
984			curthread->td_ru.ru_inblock++;
985		bp->b_iocmd = BIO_READ;
986		bp->b_flags &= ~B_INVAL;
987		bp->b_ioflags &= ~BIO_ERROR;
988		if (bp->b_rcred == NOCRED && cred != NOCRED)
989			bp->b_rcred = crhold(cred);
990		vfs_busy_pages(bp, 0);
991		bp->b_iooffset = dbtob(bp->b_blkno);
992		bstrategy(bp);
993		++readwait;
994	}
995
996	breada(vp, rablkno, rabsize, cnt, cred);
997
998	if (readwait) {
999		rv = bufwait(bp);
1000	}
1001	return (rv);
1002}
1003
1004/*
1005 * Write, release buffer on completion.  (Done by iodone
1006 * if async).  Do not bother writing anything if the buffer
1007 * is invalid.
1008 *
1009 * Note that we set B_CACHE here, indicating that buffer is
1010 * fully valid and thus cacheable.  This is true even of NFS
1011 * now so we set it generally.  This could be set either here
1012 * or in biodone() since the I/O is synchronous.  We put it
1013 * here.
1014 */
1015int
1016bufwrite(struct buf *bp)
1017{
1018	int oldflags;
1019	struct vnode *vp;
1020	int vp_md;
1021
1022	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1023	if (bp->b_flags & B_INVAL) {
1024		brelse(bp);
1025		return (0);
1026	}
1027
1028	if (bp->b_flags & B_BARRIER)
1029		barrierwrites++;
1030
1031	oldflags = bp->b_flags;
1032
1033	BUF_ASSERT_HELD(bp);
1034
1035	if (bp->b_pin_count > 0)
1036		bunpin_wait(bp);
1037
1038	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
1039	    ("FFS background buffer should not get here %p", bp));
1040
1041	vp = bp->b_vp;
1042	if (vp)
1043		vp_md = vp->v_vflag & VV_MD;
1044	else
1045		vp_md = 0;
1046
1047	/*
1048	 * Mark the buffer clean.  Increment the bufobj write count
1049	 * before bundirty() call, to prevent other thread from seeing
1050	 * empty dirty list and zero counter for writes in progress,
1051	 * falsely indicating that the bufobj is clean.
1052	 */
1053	bufobj_wref(bp->b_bufobj);
1054	bundirty(bp);
1055
1056	bp->b_flags &= ~B_DONE;
1057	bp->b_ioflags &= ~BIO_ERROR;
1058	bp->b_flags |= B_CACHE;
1059	bp->b_iocmd = BIO_WRITE;
1060
1061	vfs_busy_pages(bp, 1);
1062
1063	/*
1064	 * Normal bwrites pipeline writes
1065	 */
1066	bp->b_runningbufspace = bp->b_bufsize;
1067	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
1068
1069	if (!TD_IS_IDLETHREAD(curthread))
1070		curthread->td_ru.ru_oublock++;
1071	if (oldflags & B_ASYNC)
1072		BUF_KERNPROC(bp);
1073	bp->b_iooffset = dbtob(bp->b_blkno);
1074	bstrategy(bp);
1075
1076	if ((oldflags & B_ASYNC) == 0) {
1077		int rtval = bufwait(bp);
1078		brelse(bp);
1079		return (rtval);
1080	} else {
1081		/*
1082		 * don't allow the async write to saturate the I/O
1083		 * system.  We will not deadlock here because
1084		 * we are blocking waiting for I/O that is already in-progress
1085		 * to complete. We do not block here if it is the update
1086		 * or syncer daemon trying to clean up as that can lead
1087		 * to deadlock.
1088		 */
1089		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
1090			waitrunningbufspace();
1091	}
1092
1093	return (0);
1094}
1095
1096void
1097bufbdflush(struct bufobj *bo, struct buf *bp)
1098{
1099	struct buf *nbp;
1100
1101	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
1102		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
1103		altbufferflushes++;
1104	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
1105		BO_LOCK(bo);
1106		/*
1107		 * Try to find a buffer to flush.
1108		 */
1109		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
1110			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
1111			    BUF_LOCK(nbp,
1112				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
1113				continue;
1114			if (bp == nbp)
1115				panic("bdwrite: found ourselves");
1116			BO_UNLOCK(bo);
1117			/* Don't countdeps with the bo lock held. */
1118			if (buf_countdeps(nbp, 0)) {
1119				BO_LOCK(bo);
1120				BUF_UNLOCK(nbp);
1121				continue;
1122			}
1123			if (nbp->b_flags & B_CLUSTEROK) {
1124				vfs_bio_awrite(nbp);
1125			} else {
1126				bremfree(nbp);
1127				bawrite(nbp);
1128			}
1129			dirtybufferflushes++;
1130			break;
1131		}
1132		if (nbp == NULL)
1133			BO_UNLOCK(bo);
1134	}
1135}
1136
1137/*
1138 * Delayed write. (Buffer is marked dirty).  Do not bother writing
1139 * anything if the buffer is marked invalid.
1140 *
1141 * Note that since the buffer must be completely valid, we can safely
1142 * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
1143 * biodone() in order to prevent getblk from writing the buffer
1144 * out synchronously.
1145 */
1146void
1147bdwrite(struct buf *bp)
1148{
1149	struct thread *td = curthread;
1150	struct vnode *vp;
1151	struct bufobj *bo;
1152
1153	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1154	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1155	KASSERT((bp->b_flags & B_BARRIER) == 0,
1156	    ("Barrier request in delayed write %p", bp));
1157	BUF_ASSERT_HELD(bp);
1158
1159	if (bp->b_flags & B_INVAL) {
1160		brelse(bp);
1161		return;
1162	}
1163
1164	/*
1165	 * If we have too many dirty buffers, don't create any more.
1166	 * If we are wildly over our limit, then force a complete
1167	 * cleanup. Otherwise, just keep the situation from getting
1168	 * out of control. Note that we have to avoid a recursive
1169	 * disaster and not try to clean up after our own cleanup!
1170	 */
1171	vp = bp->b_vp;
1172	bo = bp->b_bufobj;
1173	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
1174		td->td_pflags |= TDP_INBDFLUSH;
1175		BO_BDFLUSH(bo, bp);
1176		td->td_pflags &= ~TDP_INBDFLUSH;
1177	} else
1178		recursiveflushes++;
1179
1180	bdirty(bp);
1181	/*
1182	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
1183	 * true even of NFS now.
1184	 */
1185	bp->b_flags |= B_CACHE;
1186
1187	/*
1188	 * This bmap keeps the system from needing to do the bmap later,
1189	 * perhaps when the system is attempting to do a sync.  Since it
1190	 * is likely that the indirect block -- or whatever other datastructure
1191	 * that the filesystem needs is still in memory now, it is a good
1192	 * thing to do this.  Note also, that if the pageout daemon is
1193	 * requesting a sync -- there might not be enough memory to do
1194	 * the bmap then...  So, this is important to do.
1195	 */
1196	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
1197		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
1198	}
1199
1200	/*
1201	 * Set the *dirty* buffer range based upon the VM system dirty
1202	 * pages.
1203	 *
1204	 * Mark the buffer pages as clean.  We need to do this here to
1205	 * satisfy the vnode_pager and the pageout daemon, so that it
1206	 * thinks that the pages have been "cleaned".  Note that since
1207	 * the pages are in a delayed write buffer -- the VFS layer
1208	 * "will" see that the pages get written out on the next sync,
1209	 * or perhaps the cluster will be completed.
1210	 */
1211	vfs_clean_pages_dirty_buf(bp);
1212	bqrelse(bp);
1213
1214	/*
1215	 * Wakeup the buffer flushing daemon if we have a lot of dirty
1216	 * buffers (midpoint between our recovery point and our stall
1217	 * point).
1218	 */
1219	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
1220
1221	/*
1222	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
1223	 * due to the softdep code.
1224	 */
1225}
1226
1227/*
1228 *	bdirty:
1229 *
1230 *	Turn buffer into delayed write request.  We must clear BIO_READ and
1231 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
1232 *	itself to properly update it in the dirty/clean lists.  We mark it
1233 *	B_DONE to ensure that any asynchronization of the buffer properly
1234 *	clears B_DONE ( else a panic will occur later ).
1235 *
1236 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
1237 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
1238 *	should only be called if the buffer is known-good.
1239 *
1240 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1241 *	count.
1242 *
1243 *	The buffer must be on QUEUE_NONE.
1244 */
1245void
1246bdirty(struct buf *bp)
1247{
1248
1249	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
1250	    bp, bp->b_vp, bp->b_flags);
1251	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1252	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1253	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
1254	BUF_ASSERT_HELD(bp);
1255	bp->b_flags &= ~(B_RELBUF);
1256	bp->b_iocmd = BIO_WRITE;
1257
1258	if ((bp->b_flags & B_DELWRI) == 0) {
1259		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
1260		reassignbuf(bp);
1261		atomic_add_int(&numdirtybuffers, 1);
1262		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
1263	}
1264}
1265
1266/*
1267 *	bundirty:
1268 *
1269 *	Clear B_DELWRI for buffer.
1270 *
1271 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1272 *	count.
1273 *
1274 *	The buffer must be on QUEUE_NONE.
1275 */
1276
1277void
1278bundirty(struct buf *bp)
1279{
1280
1281	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1282	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1283	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1284	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
1285	BUF_ASSERT_HELD(bp);
1286
1287	if (bp->b_flags & B_DELWRI) {
1288		bp->b_flags &= ~B_DELWRI;
1289		reassignbuf(bp);
1290		atomic_subtract_int(&numdirtybuffers, 1);
1291		numdirtywakeup(lodirtybuffers);
1292	}
1293	/*
1294	 * Since it is now being written, we can clear its deferred write flag.
1295	 */
1296	bp->b_flags &= ~B_DEFERRED;
1297}
1298
1299/*
1300 *	bawrite:
1301 *
1302 *	Asynchronous write.  Start output on a buffer, but do not wait for
1303 *	it to complete.  The buffer is released when the output completes.
1304 *
1305 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
1306 *	B_INVAL buffers.  Not us.
1307 */
1308void
1309bawrite(struct buf *bp)
1310{
1311
1312	bp->b_flags |= B_ASYNC;
1313	(void) bwrite(bp);
1314}
1315
1316/*
1317 *	babarrierwrite:
1318 *
1319 *	Asynchronous barrier write.  Start output on a buffer, but do not
1320 *	wait for it to complete.  Place a write barrier after this write so
1321 *	that this buffer and all buffers written before it are committed to
1322 *	the disk before any buffers written after this write are committed
1323 *	to the disk.  The buffer is released when the output completes.
1324 */
1325void
1326babarrierwrite(struct buf *bp)
1327{
1328
1329	bp->b_flags |= B_ASYNC | B_BARRIER;
1330	(void) bwrite(bp);
1331}
1332
1333/*
1334 *	bbarrierwrite:
1335 *
1336 *	Synchronous barrier write.  Start output on a buffer and wait for
1337 *	it to complete.  Place a write barrier after this write so that
1338 *	this buffer and all buffers written before it are committed to
1339 *	the disk before any buffers written after this write are committed
1340 *	to the disk.  The buffer is released when the output completes.
1341 */
1342int
1343bbarrierwrite(struct buf *bp)
1344{
1345
1346	bp->b_flags |= B_BARRIER;
1347	return (bwrite(bp));
1348}
1349
1350/*
1351 *	bwillwrite:
1352 *
1353 *	Called prior to the locking of any vnodes when we are expecting to
1354 *	write.  We do not want to starve the buffer cache with too many
1355 *	dirty buffers so we block here.  By blocking prior to the locking
1356 *	of any vnodes we attempt to avoid the situation where a locked vnode
1357 *	prevents the various system daemons from flushing related buffers.
1358 */
1359
1360void
1361bwillwrite(void)
1362{
1363
1364	if (numdirtybuffers >= hidirtybuffers) {
1365		mtx_lock(&nblock);
1366		while (numdirtybuffers >= hidirtybuffers) {
1367			bd_wakeup(1);
1368			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
1369			msleep(&needsbuffer, &nblock,
1370			    (PRIBIO + 4), "flswai", 0);
1371		}
1372		mtx_unlock(&nblock);
1373	}
1374}
1375
1376/*
1377 * Return true if we have too many dirty buffers.
1378 */
1379int
1380buf_dirty_count_severe(void)
1381{
1382
1383	return(numdirtybuffers >= hidirtybuffers);
1384}
1385
1386static __noinline int
1387buf_vm_page_count_severe(void)
1388{
1389
1390	KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
1391
1392	return vm_page_count_severe();
1393}
1394
1395/*
1396 *	brelse:
1397 *
1398 *	Release a busy buffer and, if requested, free its resources.  The
1399 *	buffer will be stashed in the appropriate bufqueue[] allowing it
1400 *	to be accessed later as a cache entity or reused for other purposes.
1401 */
1402void
1403brelse(struct buf *bp)
1404{
1405	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
1406	    bp, bp->b_vp, bp->b_flags);
1407	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1408	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1409
1410	if (BUF_LOCKRECURSED(bp)) {
1411		/*
1412		 * Do not process, in particular, do not handle the
1413		 * B_INVAL/B_RELBUF and do not release to free list.
1414		 */
1415		BUF_UNLOCK(bp);
1416		return;
1417	}
1418
1419	if (bp->b_flags & B_MANAGED) {
1420		bqrelse(bp);
1421		return;
1422	}
1423
1424	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
1425	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
1426		/*
1427		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
1428		 * pages from being scrapped.  If the error is anything
1429		 * other than an I/O error (EIO), assume that retrying
1430		 * is futile.
1431		 */
1432		bp->b_ioflags &= ~BIO_ERROR;
1433		bdirty(bp);
1434	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
1435	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
1436		/*
1437		 * Either a failed I/O or we were asked to free or not
1438		 * cache the buffer.
1439		 */
1440		bp->b_flags |= B_INVAL;
1441		if (!LIST_EMPTY(&bp->b_dep))
1442			buf_deallocate(bp);
1443		if (bp->b_flags & B_DELWRI) {
1444			atomic_subtract_int(&numdirtybuffers, 1);
1445			numdirtywakeup(lodirtybuffers);
1446		}
1447		bp->b_flags &= ~(B_DELWRI | B_CACHE);
1448		if ((bp->b_flags & B_VMIO) == 0) {
1449			if (bp->b_bufsize)
1450				allocbuf(bp, 0);
1451			if (bp->b_vp)
1452				brelvp(bp);
1453		}
1454	}
1455
1456	/*
1457	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
1458	 * is called with B_DELWRI set, the underlying pages may wind up
1459	 * getting freed causing a previous write (bdwrite()) to get 'lost'
1460	 * because pages associated with a B_DELWRI bp are marked clean.
1461	 *
1462	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
1463	 * if B_DELWRI is set.
1464	 *
1465	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
1466	 * on pages to return pages to the VM page queues.
1467	 */
1468	if (bp->b_flags & B_DELWRI)
1469		bp->b_flags &= ~B_RELBUF;
1470	else if (buf_vm_page_count_severe()) {
1471		/*
1472		 * BKGRDINPROG can only be set with the buf and bufobj
1473		 * locks both held.  We tolerate a race to clear it here.
1474		 */
1475		if (!(bp->b_vflags & BV_BKGRDINPROG))
1476			bp->b_flags |= B_RELBUF;
1477	}
1478
1479	/*
1480	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
1481	 * constituted, not even NFS buffers now.  Two flags effect this.  If
1482	 * B_INVAL, the struct buf is invalidated but the VM object is kept
1483	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
1484	 *
1485	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
1486	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
1487	 * buffer is also B_INVAL because it hits the re-dirtying code above.
1488	 *
1489	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
1490	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
1491	 * the commit state and we cannot afford to lose the buffer. If the
1492	 * buffer has a background write in progress, we need to keep it
1493	 * around to prevent it from being reconstituted and starting a second
1494	 * background write.
1495	 */
1496	if ((bp->b_flags & B_VMIO)
1497	    && !(bp->b_vp->v_mount != NULL &&
1498		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
1499		 !vn_isdisk(bp->b_vp, NULL) &&
1500		 (bp->b_flags & B_DELWRI))
1501	    ) {
1502
1503		int i, j, resid;
1504		vm_page_t m;
1505		off_t foff;
1506		vm_pindex_t poff;
1507		vm_object_t obj;
1508
1509		obj = bp->b_bufobj->bo_object;
1510
1511		/*
1512		 * Get the base offset and length of the buffer.  Note that
1513		 * in the VMIO case if the buffer block size is not
1514		 * page-aligned then b_data pointer may not be page-aligned.
1515		 * But our b_pages[] array *IS* page aligned.
1516		 *
1517		 * block sizes less then DEV_BSIZE (usually 512) are not
1518		 * supported due to the page granularity bits (m->valid,
1519		 * m->dirty, etc...).
1520		 *
1521		 * See man buf(9) for more information
1522		 */
1523		resid = bp->b_bufsize;
1524		foff = bp->b_offset;
1525		VM_OBJECT_WLOCK(obj);
1526		for (i = 0; i < bp->b_npages; i++) {
1527			int had_bogus = 0;
1528
1529			m = bp->b_pages[i];
1530
1531			/*
1532			 * If we hit a bogus page, fixup *all* the bogus pages
1533			 * now.
1534			 */
1535			if (m == bogus_page) {
1536				poff = OFF_TO_IDX(bp->b_offset);
1537				had_bogus = 1;
1538
1539				for (j = i; j < bp->b_npages; j++) {
1540					vm_page_t mtmp;
1541					mtmp = bp->b_pages[j];
1542					if (mtmp == bogus_page) {
1543						mtmp = vm_page_lookup(obj, poff + j);
1544						if (!mtmp) {
1545							panic("brelse: page missing\n");
1546						}
1547						bp->b_pages[j] = mtmp;
1548					}
1549				}
1550
1551				if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
1552					BUF_CHECK_MAPPED(bp);
1553					pmap_qenter(
1554					    trunc_page((vm_offset_t)bp->b_data),
1555					    bp->b_pages, bp->b_npages);
1556				}
1557				m = bp->b_pages[i];
1558			}
1559			if ((bp->b_flags & B_NOCACHE) ||
1560			    (bp->b_ioflags & BIO_ERROR &&
1561			     bp->b_iocmd == BIO_READ)) {
1562				int poffset = foff & PAGE_MASK;
1563				int presid = resid > (PAGE_SIZE - poffset) ?
1564					(PAGE_SIZE - poffset) : resid;
1565
1566				KASSERT(presid >= 0, ("brelse: extra page"));
1567				vm_page_set_invalid(m, poffset, presid);
1568				if (had_bogus)
1569					printf("avoided corruption bug in bogus_page/brelse code\n");
1570			}
1571			resid -= PAGE_SIZE - (foff & PAGE_MASK);
1572			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1573		}
1574		VM_OBJECT_WUNLOCK(obj);
1575		if (bp->b_flags & (B_INVAL | B_RELBUF))
1576			vfs_vmio_release(bp);
1577
1578	} else if (bp->b_flags & B_VMIO) {
1579
1580		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1581			vfs_vmio_release(bp);
1582		}
1583
1584	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
1585		if (bp->b_bufsize != 0)
1586			allocbuf(bp, 0);
1587		if (bp->b_vp != NULL)
1588			brelvp(bp);
1589	}
1590
1591	/* enqueue */
1592	mtx_lock(&bqlock);
1593	/* Handle delayed bremfree() processing. */
1594	if (bp->b_flags & B_REMFREE)
1595		bremfreel(bp);
1596
1597	if (bp->b_qindex != QUEUE_NONE)
1598		panic("brelse: free buffer onto another queue???");
1599
1600	/*
1601	 * If the buffer has junk contents signal it and eventually
1602	 * clean up B_DELWRI and diassociate the vnode so that gbincore()
1603	 * doesn't find it.
1604	 */
1605	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
1606	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
1607		bp->b_flags |= B_INVAL;
1608	if (bp->b_flags & B_INVAL) {
1609		if (bp->b_flags & B_DELWRI)
1610			bundirty(bp);
1611		if (bp->b_vp)
1612			brelvp(bp);
1613	}
1614
1615	/* buffers with no memory */
1616	if (bp->b_bufsize == 0) {
1617		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1618		if (bp->b_vflags & BV_BKGRDINPROG)
1619			panic("losing buffer 1");
1620		if (bp->b_kvasize) {
1621			bp->b_qindex = QUEUE_EMPTYKVA;
1622		} else {
1623			bp->b_qindex = QUEUE_EMPTY;
1624		}
1625		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1626	/* buffers with junk contents */
1627	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1628	    (bp->b_ioflags & BIO_ERROR)) {
1629		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1630		if (bp->b_vflags & BV_BKGRDINPROG)
1631			panic("losing buffer 2");
1632		bp->b_qindex = QUEUE_CLEAN;
1633		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1634	/* remaining buffers */
1635	} else {
1636		if (bp->b_flags & B_DELWRI)
1637			bp->b_qindex = QUEUE_DIRTY;
1638		else
1639			bp->b_qindex = QUEUE_CLEAN;
1640		if (bp->b_flags & B_AGE) {
1641			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp,
1642			    b_freelist);
1643		} else {
1644			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp,
1645			    b_freelist);
1646		}
1647	}
1648#ifdef INVARIANTS
1649	bq_len[bp->b_qindex]++;
1650#endif
1651	mtx_unlock(&bqlock);
1652
1653	/*
1654	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
1655	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
1656	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
1657	 * if B_INVAL is set ).
1658	 */
1659
1660	if (!(bp->b_flags & B_DELWRI))
1661		bufcountwakeup(bp);
1662
1663	/*
1664	 * Something we can maybe free or reuse
1665	 */
1666	if (bp->b_bufsize || bp->b_kvasize)
1667		bufspacewakeup();
1668
1669	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1670	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1671		panic("brelse: not dirty");
1672	/* unlock */
1673	BUF_UNLOCK(bp);
1674}
1675
1676/*
1677 * Release a buffer back to the appropriate queue but do not try to free
1678 * it.  The buffer is expected to be used again soon.
1679 *
1680 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1681 * biodone() to requeue an async I/O on completion.  It is also used when
1682 * known good buffers need to be requeued but we think we may need the data
1683 * again soon.
1684 *
1685 * XXX we should be able to leave the B_RELBUF hint set on completion.
1686 */
1687void
1688bqrelse(struct buf *bp)
1689{
1690	struct bufobj *bo;
1691
1692	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1693	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1694	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1695
1696	if (BUF_LOCKRECURSED(bp)) {
1697		/* do not release to free list */
1698		BUF_UNLOCK(bp);
1699		return;
1700	}
1701
1702	bo = bp->b_bufobj;
1703	if (bp->b_flags & B_MANAGED) {
1704		if (bp->b_flags & B_REMFREE) {
1705			mtx_lock(&bqlock);
1706			bremfreel(bp);
1707			mtx_unlock(&bqlock);
1708		}
1709		bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1710		BUF_UNLOCK(bp);
1711		return;
1712	}
1713
1714	mtx_lock(&bqlock);
1715	/* Handle delayed bremfree() processing. */
1716	if (bp->b_flags & B_REMFREE)
1717		bremfreel(bp);
1718
1719	if (bp->b_qindex != QUEUE_NONE)
1720		panic("bqrelse: free buffer onto another queue???");
1721	/* buffers with stale but valid contents */
1722	if (bp->b_flags & B_DELWRI) {
1723		bp->b_qindex = QUEUE_DIRTY;
1724		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1725#ifdef INVARIANTS
1726		bq_len[bp->b_qindex]++;
1727#endif
1728	} else {
1729		/*
1730		 * BKGRDINPROG can only be set with the buf and bufobj
1731		 * locks both held.  We tolerate a race to clear it here.
1732		 */
1733		if (!buf_vm_page_count_severe() ||
1734		    (bp->b_vflags & BV_BKGRDINPROG)) {
1735			bp->b_qindex = QUEUE_CLEAN;
1736			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
1737			    b_freelist);
1738#ifdef INVARIANTS
1739			bq_len[QUEUE_CLEAN]++;
1740#endif
1741		} else {
1742			/*
1743			 * We are too low on memory, we have to try to free
1744			 * the buffer (most importantly: the wired pages
1745			 * making up its backing store) *now*.
1746			 */
1747			mtx_unlock(&bqlock);
1748			brelse(bp);
1749			return;
1750		}
1751	}
1752	mtx_unlock(&bqlock);
1753
1754	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
1755		bufcountwakeup(bp);
1756
1757	/*
1758	 * Something we can maybe free or reuse.
1759	 */
1760	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1761		bufspacewakeup();
1762
1763	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1764	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1765		panic("bqrelse: not dirty");
1766	/* unlock */
1767	BUF_UNLOCK(bp);
1768}
1769
1770/* Give pages used by the bp back to the VM system (where possible) */
1771static void
1772vfs_vmio_release(struct buf *bp)
1773{
1774	int i;
1775	vm_page_t m;
1776
1777	if ((bp->b_flags & B_UNMAPPED) == 0) {
1778		BUF_CHECK_MAPPED(bp);
1779		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
1780	} else
1781		BUF_CHECK_UNMAPPED(bp);
1782	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
1783	for (i = 0; i < bp->b_npages; i++) {
1784		m = bp->b_pages[i];
1785		bp->b_pages[i] = NULL;
1786		/*
1787		 * In order to keep page LRU ordering consistent, put
1788		 * everything on the inactive queue.
1789		 */
1790		vm_page_lock(m);
1791		vm_page_unwire(m, 0);
1792		/*
1793		 * We don't mess with busy pages, it is
1794		 * the responsibility of the process that
1795		 * busied the pages to deal with them.
1796		 */
1797		if ((m->oflags & VPO_BUSY) == 0 && m->busy == 0 &&
1798		    m->wire_count == 0) {
1799			/*
1800			 * Might as well free the page if we can and it has
1801			 * no valid data.  We also free the page if the
1802			 * buffer was used for direct I/O
1803			 */
1804			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
1805				vm_page_free(m);
1806			} else if (bp->b_flags & B_DIRECT) {
1807				vm_page_try_to_free(m);
1808			} else if (buf_vm_page_count_severe()) {
1809				vm_page_try_to_cache(m);
1810			}
1811		}
1812		vm_page_unlock(m);
1813	}
1814	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
1815
1816	if (bp->b_bufsize) {
1817		bufspacewakeup();
1818		bp->b_bufsize = 0;
1819	}
1820	bp->b_npages = 0;
1821	bp->b_flags &= ~B_VMIO;
1822	if (bp->b_vp)
1823		brelvp(bp);
1824}
1825
1826/*
1827 * Check to see if a block at a particular lbn is available for a clustered
1828 * write.
1829 */
1830static int
1831vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
1832{
1833	struct buf *bpa;
1834	int match;
1835
1836	match = 0;
1837
1838	/* If the buf isn't in core skip it */
1839	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
1840		return (0);
1841
1842	/* If the buf is busy we don't want to wait for it */
1843	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1844		return (0);
1845
1846	/* Only cluster with valid clusterable delayed write buffers */
1847	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
1848	    (B_DELWRI | B_CLUSTEROK))
1849		goto done;
1850
1851	if (bpa->b_bufsize != size)
1852		goto done;
1853
1854	/*
1855	 * Check to see if it is in the expected place on disk and that the
1856	 * block has been mapped.
1857	 */
1858	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
1859		match = 1;
1860done:
1861	BUF_UNLOCK(bpa);
1862	return (match);
1863}
1864
1865/*
1866 *	vfs_bio_awrite:
1867 *
1868 *	Implement clustered async writes for clearing out B_DELWRI buffers.
1869 *	This is much better then the old way of writing only one buffer at
1870 *	a time.  Note that we may not be presented with the buffers in the
1871 *	correct order, so we search for the cluster in both directions.
1872 */
1873int
1874vfs_bio_awrite(struct buf *bp)
1875{
1876	struct bufobj *bo;
1877	int i;
1878	int j;
1879	daddr_t lblkno = bp->b_lblkno;
1880	struct vnode *vp = bp->b_vp;
1881	int ncl;
1882	int nwritten;
1883	int size;
1884	int maxcl;
1885	int gbflags;
1886
1887	bo = &vp->v_bufobj;
1888	gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
1889	/*
1890	 * right now we support clustered writing only to regular files.  If
1891	 * we find a clusterable block we could be in the middle of a cluster
1892	 * rather then at the beginning.
1893	 */
1894	if ((vp->v_type == VREG) &&
1895	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1896	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1897
1898		size = vp->v_mount->mnt_stat.f_iosize;
1899		maxcl = MAXPHYS / size;
1900
1901		BO_RLOCK(bo);
1902		for (i = 1; i < maxcl; i++)
1903			if (vfs_bio_clcheck(vp, size, lblkno + i,
1904			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
1905				break;
1906
1907		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
1908			if (vfs_bio_clcheck(vp, size, lblkno - j,
1909			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
1910				break;
1911		BO_RUNLOCK(bo);
1912		--j;
1913		ncl = i + j;
1914		/*
1915		 * this is a possible cluster write
1916		 */
1917		if (ncl != 1) {
1918			BUF_UNLOCK(bp);
1919			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
1920			    gbflags);
1921			return (nwritten);
1922		}
1923	}
1924	bremfree(bp);
1925	bp->b_flags |= B_ASYNC;
1926	/*
1927	 * default (old) behavior, writing out only one block
1928	 *
1929	 * XXX returns b_bufsize instead of b_bcount for nwritten?
1930	 */
1931	nwritten = bp->b_bufsize;
1932	(void) bwrite(bp);
1933
1934	return (nwritten);
1935}
1936
1937static void
1938setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
1939{
1940
1941	KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
1942	    bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
1943	if ((gbflags & GB_UNMAPPED) == 0) {
1944		bp->b_kvabase = (caddr_t)addr;
1945	} else if ((gbflags & GB_KVAALLOC) != 0) {
1946		KASSERT((gbflags & GB_UNMAPPED) != 0,
1947		    ("GB_KVAALLOC without GB_UNMAPPED"));
1948		bp->b_kvaalloc = (caddr_t)addr;
1949		bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
1950		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
1951	}
1952	bp->b_kvasize = maxsize;
1953}
1954
1955/*
1956 * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
1957 * needed.
1958 */
1959static int
1960allocbufkva(struct buf *bp, int maxsize, int gbflags)
1961{
1962	vm_offset_t addr;
1963	int rv;
1964
1965	bfreekva(bp);
1966	addr = 0;
1967
1968	vm_map_lock(buffer_map);
1969	if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize,
1970	    &addr)) {
1971		vm_map_unlock(buffer_map);
1972		/*
1973		 * Buffer map is too fragmented.  Request the caller
1974		 * to defragment the map.
1975		 */
1976		atomic_add_int(&bufdefragcnt, 1);
1977		return (1);
1978	}
1979	rv = vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize,
1980	    VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
1981	KASSERT(rv == KERN_SUCCESS, ("vm_map_insert(buffer_map) rv %d", rv));
1982	vm_map_unlock(buffer_map);
1983	setbufkva(bp, addr, maxsize, gbflags);
1984	atomic_add_long(&bufspace, bp->b_kvasize);
1985	return (0);
1986}
1987
1988/*
1989 * Ask the bufdaemon for help, or act as bufdaemon itself, when a
1990 * locked vnode is supplied.
1991 */
1992static void
1993getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
1994    int defrag)
1995{
1996	struct thread *td;
1997	char *waitmsg;
1998	int fl, flags, norunbuf;
1999
2000	mtx_assert(&bqlock, MA_OWNED);
2001
2002	if (defrag) {
2003		flags = VFS_BIO_NEED_BUFSPACE;
2004		waitmsg = "nbufkv";
2005	} else if (bufspace >= hibufspace) {
2006		waitmsg = "nbufbs";
2007		flags = VFS_BIO_NEED_BUFSPACE;
2008	} else {
2009		waitmsg = "newbuf";
2010		flags = VFS_BIO_NEED_ANY;
2011	}
2012	mtx_lock(&nblock);
2013	needsbuffer |= flags;
2014	mtx_unlock(&nblock);
2015	mtx_unlock(&bqlock);
2016
2017	bd_speedup();	/* heeeelp */
2018	if ((gbflags & GB_NOWAIT_BD) != 0)
2019		return;
2020
2021	td = curthread;
2022	mtx_lock(&nblock);
2023	while (needsbuffer & flags) {
2024		if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
2025			mtx_unlock(&nblock);
2026			/*
2027			 * getblk() is called with a vnode locked, and
2028			 * some majority of the dirty buffers may as
2029			 * well belong to the vnode.  Flushing the
2030			 * buffers there would make a progress that
2031			 * cannot be achieved by the buf_daemon, that
2032			 * cannot lock the vnode.
2033			 */
2034			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
2035			    (td->td_pflags & TDP_NORUNNINGBUF);
2036			/* play bufdaemon */
2037			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
2038			fl = buf_do_flush(vp);
2039			td->td_pflags &= norunbuf;
2040			mtx_lock(&nblock);
2041			if (fl != 0)
2042				continue;
2043			if ((needsbuffer & flags) == 0)
2044				break;
2045		}
2046		if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
2047		    waitmsg, slptimeo))
2048			break;
2049	}
2050	mtx_unlock(&nblock);
2051}
2052
2053static void
2054getnewbuf_reuse_bp(struct buf *bp, int qindex)
2055{
2056
2057	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
2058	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
2059	     bp->b_kvasize, bp->b_bufsize, qindex);
2060	mtx_assert(&bqlock, MA_NOTOWNED);
2061
2062	/*
2063	 * Note: we no longer distinguish between VMIO and non-VMIO
2064	 * buffers.
2065	 */
2066	KASSERT((bp->b_flags & B_DELWRI) == 0,
2067	    ("delwri buffer %p found in queue %d", bp, qindex));
2068
2069	if (qindex == QUEUE_CLEAN) {
2070		if (bp->b_flags & B_VMIO) {
2071			bp->b_flags &= ~B_ASYNC;
2072			vfs_vmio_release(bp);
2073		}
2074		if (bp->b_vp != NULL)
2075			brelvp(bp);
2076	}
2077
2078	/*
2079	 * Get the rest of the buffer freed up.  b_kva* is still valid
2080	 * after this operation.
2081	 */
2082
2083	if (bp->b_rcred != NOCRED) {
2084		crfree(bp->b_rcred);
2085		bp->b_rcred = NOCRED;
2086	}
2087	if (bp->b_wcred != NOCRED) {
2088		crfree(bp->b_wcred);
2089		bp->b_wcred = NOCRED;
2090	}
2091	if (!LIST_EMPTY(&bp->b_dep))
2092		buf_deallocate(bp);
2093	if (bp->b_vflags & BV_BKGRDINPROG)
2094		panic("losing buffer 3");
2095	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
2096	    bp, bp->b_vp, qindex));
2097	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
2098	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
2099
2100	if (bp->b_bufsize)
2101		allocbuf(bp, 0);
2102
2103	bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
2104	bp->b_ioflags = 0;
2105	bp->b_xflags = 0;
2106	KASSERT((bp->b_flags & B_INFREECNT) == 0,
2107	    ("buf %p still counted as free?", bp));
2108	bp->b_vflags = 0;
2109	bp->b_vp = NULL;
2110	bp->b_blkno = bp->b_lblkno = 0;
2111	bp->b_offset = NOOFFSET;
2112	bp->b_iodone = 0;
2113	bp->b_error = 0;
2114	bp->b_resid = 0;
2115	bp->b_bcount = 0;
2116	bp->b_npages = 0;
2117	bp->b_dirtyoff = bp->b_dirtyend = 0;
2118	bp->b_bufobj = NULL;
2119	bp->b_pin_count = 0;
2120	bp->b_fsprivate1 = NULL;
2121	bp->b_fsprivate2 = NULL;
2122	bp->b_fsprivate3 = NULL;
2123
2124	LIST_INIT(&bp->b_dep);
2125}
2126
2127static int flushingbufs;
2128
2129static struct buf *
2130getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
2131{
2132	struct buf *bp, *nbp;
2133	int nqindex, qindex, pass;
2134
2135	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
2136
2137	pass = 1;
2138restart:
2139	atomic_add_int(&getnewbufrestarts, 1);
2140
2141	/*
2142	 * Setup for scan.  If we do not have enough free buffers,
2143	 * we setup a degenerate case that immediately fails.  Note
2144	 * that if we are specially marked process, we are allowed to
2145	 * dip into our reserves.
2146	 *
2147	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
2148	 * for the allocation of the mapped buffer.  For unmapped, the
2149	 * easiest is to start with EMPTY outright.
2150	 *
2151	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
2152	 * However, there are a number of cases (defragging, reusing, ...)
2153	 * where we cannot backup.
2154	 */
2155	nbp = NULL;
2156	mtx_lock(&bqlock);
2157	if (!defrag && unmapped) {
2158		nqindex = QUEUE_EMPTY;
2159		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2160	}
2161	if (nbp == NULL) {
2162		nqindex = QUEUE_EMPTYKVA;
2163		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2164	}
2165
2166	/*
2167	 * If no EMPTYKVA buffers and we are either defragging or
2168	 * reusing, locate a CLEAN buffer to free or reuse.  If
2169	 * bufspace useage is low skip this step so we can allocate a
2170	 * new buffer.
2171	 */
2172	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
2173		nqindex = QUEUE_CLEAN;
2174		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2175	}
2176
2177	/*
2178	 * If we could not find or were not allowed to reuse a CLEAN
2179	 * buffer, check to see if it is ok to use an EMPTY buffer.
2180	 * We can only use an EMPTY buffer if allocating its KVA would
2181	 * not otherwise run us out of buffer space.  No KVA is needed
2182	 * for the unmapped allocation.
2183	 */
2184	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
2185	    metadata)) {
2186		nqindex = QUEUE_EMPTY;
2187		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2188	}
2189
2190	/*
2191	 * All available buffers might be clean, retry ignoring the
2192	 * lobufspace as the last resort.
2193	 */
2194	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
2195		nqindex = QUEUE_CLEAN;
2196		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2197	}
2198
2199	/*
2200	 * Run scan, possibly freeing data and/or kva mappings on the fly
2201	 * depending.
2202	 */
2203	while ((bp = nbp) != NULL) {
2204		qindex = nqindex;
2205
2206		/*
2207		 * Calculate next bp (we can only use it if we do not
2208		 * block or do other fancy things).
2209		 */
2210		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
2211			switch (qindex) {
2212			case QUEUE_EMPTY:
2213				nqindex = QUEUE_EMPTYKVA;
2214				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2215				if (nbp != NULL)
2216					break;
2217				/* FALLTHROUGH */
2218			case QUEUE_EMPTYKVA:
2219				nqindex = QUEUE_CLEAN;
2220				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2221				if (nbp != NULL)
2222					break;
2223				/* FALLTHROUGH */
2224			case QUEUE_CLEAN:
2225				if (metadata && pass == 1) {
2226					pass = 2;
2227					nqindex = QUEUE_EMPTY;
2228					nbp = TAILQ_FIRST(
2229					    &bufqueues[QUEUE_EMPTY]);
2230				}
2231				/*
2232				 * nbp is NULL.
2233				 */
2234				break;
2235			}
2236		}
2237		/*
2238		 * If we are defragging then we need a buffer with
2239		 * b_kvasize != 0.  XXX this situation should no longer
2240		 * occur, if defrag is non-zero the buffer's b_kvasize
2241		 * should also be non-zero at this point.  XXX
2242		 */
2243		if (defrag && bp->b_kvasize == 0) {
2244			printf("Warning: defrag empty buffer %p\n", bp);
2245			continue;
2246		}
2247
2248		/*
2249		 * Start freeing the bp.  This is somewhat involved.  nbp
2250		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
2251		 */
2252		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2253			continue;
2254		/*
2255		 * BKGRDINPROG can only be set with the buf and bufobj
2256		 * locks both held.  We tolerate a race to clear it here.
2257		 */
2258		if (bp->b_vflags & BV_BKGRDINPROG) {
2259			BUF_UNLOCK(bp);
2260			continue;
2261		}
2262
2263		KASSERT(bp->b_qindex == qindex,
2264		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
2265
2266		bremfreel(bp);
2267		mtx_unlock(&bqlock);
2268		/*
2269		 * NOTE:  nbp is now entirely invalid.  We can only restart
2270		 * the scan from this point on.
2271		 */
2272
2273		getnewbuf_reuse_bp(bp, qindex);
2274		mtx_assert(&bqlock, MA_NOTOWNED);
2275
2276		/*
2277		 * If we are defragging then free the buffer.
2278		 */
2279		if (defrag) {
2280			bp->b_flags |= B_INVAL;
2281			bfreekva(bp);
2282			brelse(bp);
2283			defrag = 0;
2284			goto restart;
2285		}
2286
2287		/*
2288		 * Notify any waiters for the buffer lock about
2289		 * identity change by freeing the buffer.
2290		 */
2291		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
2292			bp->b_flags |= B_INVAL;
2293			bfreekva(bp);
2294			brelse(bp);
2295			goto restart;
2296		}
2297
2298		if (metadata)
2299			break;
2300
2301		/*
2302		 * If we are overcomitted then recover the buffer and its
2303		 * KVM space.  This occurs in rare situations when multiple
2304		 * processes are blocked in getnewbuf() or allocbuf().
2305		 */
2306		if (bufspace >= hibufspace)
2307			flushingbufs = 1;
2308		if (flushingbufs && bp->b_kvasize != 0) {
2309			bp->b_flags |= B_INVAL;
2310			bfreekva(bp);
2311			brelse(bp);
2312			goto restart;
2313		}
2314		if (bufspace < lobufspace)
2315			flushingbufs = 0;
2316		break;
2317	}
2318	return (bp);
2319}
2320
2321/*
2322 *	getnewbuf:
2323 *
2324 *	Find and initialize a new buffer header, freeing up existing buffers
2325 *	in the bufqueues as necessary.  The new buffer is returned locked.
2326 *
2327 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
2328 *	buffer away, the caller must set B_INVAL prior to calling brelse().
2329 *
2330 *	We block if:
2331 *		We have insufficient buffer headers
2332 *		We have insufficient buffer space
2333 *		buffer_map is too fragmented ( space reservation fails )
2334 *		If we have to flush dirty buffers ( but we try to avoid this )
2335 *
2336 *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
2337 *	Instead we ask the buf daemon to do it for us.  We attempt to
2338 *	avoid piecemeal wakeups of the pageout daemon.
2339 */
2340static struct buf *
2341getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
2342    int gbflags)
2343{
2344	struct buf *bp;
2345	int defrag, metadata;
2346
2347	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2348	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2349	if (!unmapped_buf_allowed)
2350		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2351
2352	defrag = 0;
2353	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2354	    vp->v_type == VCHR)
2355		metadata = 1;
2356	else
2357		metadata = 0;
2358	/*
2359	 * We can't afford to block since we might be holding a vnode lock,
2360	 * which may prevent system daemons from running.  We deal with
2361	 * low-memory situations by proactively returning memory and running
2362	 * async I/O rather then sync I/O.
2363	 */
2364	atomic_add_int(&getnewbufcalls, 1);
2365	atomic_subtract_int(&getnewbufrestarts, 1);
2366restart:
2367	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
2368	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
2369	if (bp != NULL)
2370		defrag = 0;
2371
2372	/*
2373	 * If we exhausted our list, sleep as appropriate.  We may have to
2374	 * wakeup various daemons and write out some dirty buffers.
2375	 *
2376	 * Generally we are sleeping due to insufficient buffer space.
2377	 */
2378	if (bp == NULL) {
2379		mtx_assert(&bqlock, MA_OWNED);
2380		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
2381		mtx_assert(&bqlock, MA_NOTOWNED);
2382	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
2383		mtx_assert(&bqlock, MA_NOTOWNED);
2384
2385		bfreekva(bp);
2386		bp->b_flags |= B_UNMAPPED;
2387		bp->b_kvabase = bp->b_data = unmapped_buf;
2388		bp->b_kvasize = maxsize;
2389		atomic_add_long(&bufspace, bp->b_kvasize);
2390		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2391		atomic_add_int(&bufreusecnt, 1);
2392	} else {
2393		mtx_assert(&bqlock, MA_NOTOWNED);
2394
2395		/*
2396		 * We finally have a valid bp.  We aren't quite out of the
2397		 * woods, we still have to reserve kva space.  In order
2398		 * to keep fragmentation sane we only allocate kva in
2399		 * BKVASIZE chunks.
2400		 */
2401		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2402
2403		if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
2404		    B_KVAALLOC)) == B_UNMAPPED) {
2405			if (allocbufkva(bp, maxsize, gbflags)) {
2406				defrag = 1;
2407				bp->b_flags |= B_INVAL;
2408				brelse(bp);
2409				goto restart;
2410			}
2411			atomic_add_int(&bufreusecnt, 1);
2412		} else if ((bp->b_flags & B_KVAALLOC) != 0 &&
2413		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
2414			/*
2415			 * If the reused buffer has KVA allocated,
2416			 * reassign b_kvaalloc to b_kvabase.
2417			 */
2418			bp->b_kvabase = bp->b_kvaalloc;
2419			bp->b_flags &= ~B_KVAALLOC;
2420			atomic_subtract_long(&unmapped_bufspace,
2421			    bp->b_kvasize);
2422			atomic_add_int(&bufreusecnt, 1);
2423		} else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2424		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
2425		    GB_KVAALLOC)) {
2426			/*
2427			 * The case of reused buffer already have KVA
2428			 * mapped, but the request is for unmapped
2429			 * buffer with KVA allocated.
2430			 */
2431			bp->b_kvaalloc = bp->b_kvabase;
2432			bp->b_data = bp->b_kvabase = unmapped_buf;
2433			bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2434			atomic_add_long(&unmapped_bufspace,
2435			    bp->b_kvasize);
2436			atomic_add_int(&bufreusecnt, 1);
2437		}
2438		if ((gbflags & GB_UNMAPPED) == 0) {
2439			bp->b_saveaddr = bp->b_kvabase;
2440			bp->b_data = bp->b_saveaddr;
2441			bp->b_flags &= ~B_UNMAPPED;
2442			BUF_CHECK_MAPPED(bp);
2443		}
2444	}
2445	return (bp);
2446}
2447
2448/*
2449 *	buf_daemon:
2450 *
2451 *	buffer flushing daemon.  Buffers are normally flushed by the
2452 *	update daemon but if it cannot keep up this process starts to
2453 *	take the load in an attempt to prevent getnewbuf() from blocking.
2454 */
2455
2456static struct kproc_desc buf_kp = {
2457	"bufdaemon",
2458	buf_daemon,
2459	&bufdaemonproc
2460};
2461SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2462
2463static int
2464buf_do_flush(struct vnode *vp)
2465{
2466	int flushed;
2467
2468	flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
2469	if (flushed == 0) {
2470		/*
2471		 * Could not find any buffers without rollback
2472		 * dependencies, so just write the first one
2473		 * in the hopes of eventually making progress.
2474		 */
2475		flushbufqueues(vp, QUEUE_DIRTY, 1);
2476	}
2477	return (flushed);
2478}
2479
2480static void
2481buf_daemon()
2482{
2483	int lodirtysave;
2484
2485	/*
2486	 * This process needs to be suspended prior to shutdown sync.
2487	 */
2488	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2489	    SHUTDOWN_PRI_LAST);
2490
2491	/*
2492	 * This process is allowed to take the buffer cache to the limit
2493	 */
2494	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2495	mtx_lock(&bdlock);
2496	for (;;) {
2497		bd_request = 0;
2498		mtx_unlock(&bdlock);
2499
2500		kproc_suspend_check(bufdaemonproc);
2501		lodirtysave = lodirtybuffers;
2502		if (bd_speedupreq) {
2503			lodirtybuffers = numdirtybuffers / 2;
2504			bd_speedupreq = 0;
2505		}
2506		/*
2507		 * Do the flush.  Limit the amount of in-transit I/O we
2508		 * allow to build up, otherwise we would completely saturate
2509		 * the I/O system.  Wakeup any waiting processes before we
2510		 * normally would so they can run in parallel with our drain.
2511		 */
2512		while (numdirtybuffers > lodirtybuffers) {
2513			if (buf_do_flush(NULL) == 0)
2514				break;
2515			kern_yield(PRI_USER);
2516		}
2517		lodirtybuffers = lodirtysave;
2518
2519		/*
2520		 * Only clear bd_request if we have reached our low water
2521		 * mark.  The buf_daemon normally waits 1 second and
2522		 * then incrementally flushes any dirty buffers that have
2523		 * built up, within reason.
2524		 *
2525		 * If we were unable to hit our low water mark and couldn't
2526		 * find any flushable buffers, we sleep half a second.
2527		 * Otherwise we loop immediately.
2528		 */
2529		mtx_lock(&bdlock);
2530		if (numdirtybuffers <= lodirtybuffers) {
2531			/*
2532			 * We reached our low water mark, reset the
2533			 * request and sleep until we are needed again.
2534			 * The sleep is just so the suspend code works.
2535			 */
2536			bd_request = 0;
2537			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
2538		} else {
2539			/*
2540			 * We couldn't find any flushable dirty buffers but
2541			 * still have too many dirty buffers, we
2542			 * have to sleep and try again.  (rare)
2543			 */
2544			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
2545		}
2546	}
2547}
2548
2549/*
2550 *	flushbufqueues:
2551 *
2552 *	Try to flush a buffer in the dirty queue.  We must be careful to
2553 *	free up B_INVAL buffers instead of write them, which NFS is
2554 *	particularly sensitive to.
2555 */
2556static int flushwithdeps = 0;
2557SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
2558    0, "Number of buffers flushed with dependecies that require rollbacks");
2559
2560static int
2561flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
2562{
2563	struct buf *sentinel;
2564	struct vnode *vp;
2565	struct mount *mp;
2566	struct buf *bp;
2567	int hasdeps;
2568	int flushed;
2569	int target;
2570
2571	if (lvp == NULL) {
2572		target = numdirtybuffers - lodirtybuffers;
2573		if (flushdeps && target > 2)
2574			target /= 2;
2575	} else
2576		target = flushbufqtarget;
2577	flushed = 0;
2578	bp = NULL;
2579	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2580	sentinel->b_qindex = QUEUE_SENTINEL;
2581	mtx_lock(&bqlock);
2582	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2583	while (flushed != target) {
2584		bp = TAILQ_NEXT(sentinel, b_freelist);
2585		if (bp != NULL) {
2586			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2587			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2588			    b_freelist);
2589		} else
2590			break;
2591		/*
2592		 * Skip sentinels inserted by other invocations of the
2593		 * flushbufqueues(), taking care to not reorder them.
2594		 */
2595		if (bp->b_qindex == QUEUE_SENTINEL)
2596			continue;
2597		/*
2598		 * Only flush the buffers that belong to the
2599		 * vnode locked by the curthread.
2600		 */
2601		if (lvp != NULL && bp->b_vp != lvp)
2602			continue;
2603		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2604			continue;
2605		if (bp->b_pin_count > 0) {
2606			BUF_UNLOCK(bp);
2607			continue;
2608		}
2609		/*
2610		 * BKGRDINPROG can only be set with the buf and bufobj
2611		 * locks both held.  We tolerate a race to clear it here.
2612		 */
2613		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
2614		    (bp->b_flags & B_DELWRI) == 0) {
2615			BUF_UNLOCK(bp);
2616			continue;
2617		}
2618		if (bp->b_flags & B_INVAL) {
2619			bremfreel(bp);
2620			mtx_unlock(&bqlock);
2621			brelse(bp);
2622			flushed++;
2623			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
2624			mtx_lock(&bqlock);
2625			continue;
2626		}
2627
2628		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
2629			if (flushdeps == 0) {
2630				BUF_UNLOCK(bp);
2631				continue;
2632			}
2633			hasdeps = 1;
2634		} else
2635			hasdeps = 0;
2636		/*
2637		 * We must hold the lock on a vnode before writing
2638		 * one of its buffers. Otherwise we may confuse, or
2639		 * in the case of a snapshot vnode, deadlock the
2640		 * system.
2641		 *
2642		 * The lock order here is the reverse of the normal
2643		 * of vnode followed by buf lock.  This is ok because
2644		 * the NOWAIT will prevent deadlock.
2645		 */
2646		vp = bp->b_vp;
2647		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2648			BUF_UNLOCK(bp);
2649			continue;
2650		}
2651		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
2652			mtx_unlock(&bqlock);
2653			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
2654			    bp, bp->b_vp, bp->b_flags);
2655			if (curproc == bufdaemonproc)
2656				vfs_bio_awrite(bp);
2657			else {
2658				bremfree(bp);
2659				bwrite(bp);
2660				notbufdflashes++;
2661			}
2662			vn_finished_write(mp);
2663			VOP_UNLOCK(vp, 0);
2664			flushwithdeps += hasdeps;
2665			flushed++;
2666
2667			/*
2668			 * Sleeping on runningbufspace while holding
2669			 * vnode lock leads to deadlock.
2670			 */
2671			if (curproc == bufdaemonproc)
2672				waitrunningbufspace();
2673			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
2674			mtx_lock(&bqlock);
2675			continue;
2676		}
2677		vn_finished_write(mp);
2678		BUF_UNLOCK(bp);
2679	}
2680	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2681	mtx_unlock(&bqlock);
2682	free(sentinel, M_TEMP);
2683	return (flushed);
2684}
2685
2686/*
2687 * Check to see if a block is currently memory resident.
2688 */
2689struct buf *
2690incore(struct bufobj *bo, daddr_t blkno)
2691{
2692	struct buf *bp;
2693
2694	BO_RLOCK(bo);
2695	bp = gbincore(bo, blkno);
2696	BO_RUNLOCK(bo);
2697	return (bp);
2698}
2699
2700/*
2701 * Returns true if no I/O is needed to access the
2702 * associated VM object.  This is like incore except
2703 * it also hunts around in the VM system for the data.
2704 */
2705
2706static int
2707inmem(struct vnode * vp, daddr_t blkno)
2708{
2709	vm_object_t obj;
2710	vm_offset_t toff, tinc, size;
2711	vm_page_t m;
2712	vm_ooffset_t off;
2713
2714	ASSERT_VOP_LOCKED(vp, "inmem");
2715
2716	if (incore(&vp->v_bufobj, blkno))
2717		return 1;
2718	if (vp->v_mount == NULL)
2719		return 0;
2720	obj = vp->v_object;
2721	if (obj == NULL)
2722		return (0);
2723
2724	size = PAGE_SIZE;
2725	if (size > vp->v_mount->mnt_stat.f_iosize)
2726		size = vp->v_mount->mnt_stat.f_iosize;
2727	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2728
2729	VM_OBJECT_RLOCK(obj);
2730	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2731		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2732		if (!m)
2733			goto notinmem;
2734		tinc = size;
2735		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2736			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2737		if (vm_page_is_valid(m,
2738		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2739			goto notinmem;
2740	}
2741	VM_OBJECT_RUNLOCK(obj);
2742	return 1;
2743
2744notinmem:
2745	VM_OBJECT_RUNLOCK(obj);
2746	return (0);
2747}
2748
2749/*
2750 * Set the dirty range for a buffer based on the status of the dirty
2751 * bits in the pages comprising the buffer.  The range is limited
2752 * to the size of the buffer.
2753 *
2754 * Tell the VM system that the pages associated with this buffer
2755 * are clean.  This is used for delayed writes where the data is
2756 * going to go to disk eventually without additional VM intevention.
2757 *
2758 * Note that while we only really need to clean through to b_bcount, we
2759 * just go ahead and clean through to b_bufsize.
2760 */
2761static void
2762vfs_clean_pages_dirty_buf(struct buf *bp)
2763{
2764	vm_ooffset_t foff, noff, eoff;
2765	vm_page_t m;
2766	int i;
2767
2768	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
2769		return;
2770
2771	foff = bp->b_offset;
2772	KASSERT(bp->b_offset != NOOFFSET,
2773	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
2774
2775	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
2776	vfs_drain_busy_pages(bp);
2777	vfs_setdirty_locked_object(bp);
2778	for (i = 0; i < bp->b_npages; i++) {
2779		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2780		eoff = noff;
2781		if (eoff > bp->b_offset + bp->b_bufsize)
2782			eoff = bp->b_offset + bp->b_bufsize;
2783		m = bp->b_pages[i];
2784		vfs_page_set_validclean(bp, foff, m);
2785		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2786		foff = noff;
2787	}
2788	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
2789}
2790
2791static void
2792vfs_setdirty_locked_object(struct buf *bp)
2793{
2794	vm_object_t object;
2795	int i;
2796
2797	object = bp->b_bufobj->bo_object;
2798	VM_OBJECT_ASSERT_WLOCKED(object);
2799
2800	/*
2801	 * We qualify the scan for modified pages on whether the
2802	 * object has been flushed yet.
2803	 */
2804	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
2805		vm_offset_t boffset;
2806		vm_offset_t eoffset;
2807
2808		/*
2809		 * test the pages to see if they have been modified directly
2810		 * by users through the VM system.
2811		 */
2812		for (i = 0; i < bp->b_npages; i++)
2813			vm_page_test_dirty(bp->b_pages[i]);
2814
2815		/*
2816		 * Calculate the encompassing dirty range, boffset and eoffset,
2817		 * (eoffset - boffset) bytes.
2818		 */
2819
2820		for (i = 0; i < bp->b_npages; i++) {
2821			if (bp->b_pages[i]->dirty)
2822				break;
2823		}
2824		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2825
2826		for (i = bp->b_npages - 1; i >= 0; --i) {
2827			if (bp->b_pages[i]->dirty) {
2828				break;
2829			}
2830		}
2831		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2832
2833		/*
2834		 * Fit it to the buffer.
2835		 */
2836
2837		if (eoffset > bp->b_bcount)
2838			eoffset = bp->b_bcount;
2839
2840		/*
2841		 * If we have a good dirty range, merge with the existing
2842		 * dirty range.
2843		 */
2844
2845		if (boffset < eoffset) {
2846			if (bp->b_dirtyoff > boffset)
2847				bp->b_dirtyoff = boffset;
2848			if (bp->b_dirtyend < eoffset)
2849				bp->b_dirtyend = eoffset;
2850		}
2851	}
2852}
2853
2854/*
2855 * Allocate the KVA mapping for an existing buffer. It handles the
2856 * cases of both B_UNMAPPED buffer, and buffer with the preallocated
2857 * KVA which is not mapped (B_KVAALLOC).
2858 */
2859static void
2860bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
2861{
2862	struct buf *scratch_bp;
2863	int bsize, maxsize, need_mapping, need_kva;
2864	off_t offset;
2865
2866	need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
2867	    (gbflags & GB_UNMAPPED) == 0;
2868	need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
2869	    (gbflags & GB_KVAALLOC) != 0;
2870	if (!need_mapping && !need_kva)
2871		return;
2872
2873	BUF_CHECK_UNMAPPED(bp);
2874
2875	if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
2876		/*
2877		 * Buffer is not mapped, but the KVA was already
2878		 * reserved at the time of the instantiation.  Use the
2879		 * allocated space.
2880		 */
2881		bp->b_flags &= ~B_KVAALLOC;
2882		KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
2883		bp->b_kvabase = bp->b_kvaalloc;
2884		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
2885		goto has_addr;
2886	}
2887
2888	/*
2889	 * Calculate the amount of the address space we would reserve
2890	 * if the buffer was mapped.
2891	 */
2892	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
2893	offset = blkno * bsize;
2894	maxsize = size + (offset & PAGE_MASK);
2895	maxsize = imax(maxsize, bsize);
2896
2897mapping_loop:
2898	if (allocbufkva(bp, maxsize, gbflags)) {
2899		/*
2900		 * Request defragmentation. getnewbuf() returns us the
2901		 * allocated space by the scratch buffer KVA.
2902		 */
2903		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
2904		    (GB_UNMAPPED | GB_KVAALLOC));
2905		if (scratch_bp == NULL) {
2906			if ((gbflags & GB_NOWAIT_BD) != 0) {
2907				/*
2908				 * XXXKIB: defragmentation cannot
2909				 * succeed, not sure what else to do.
2910				 */
2911				panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
2912			}
2913			atomic_add_int(&mappingrestarts, 1);
2914			goto mapping_loop;
2915		}
2916		KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
2917		    ("scratch bp !B_KVAALLOC %p", scratch_bp));
2918		setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
2919		    scratch_bp->b_kvasize, gbflags);
2920
2921		/* Get rid of the scratch buffer. */
2922		scratch_bp->b_kvasize = 0;
2923		scratch_bp->b_flags |= B_INVAL;
2924		scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
2925		brelse(scratch_bp);
2926	}
2927	if (!need_mapping)
2928		return;
2929
2930has_addr:
2931	bp->b_saveaddr = bp->b_kvabase;
2932	bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
2933	bp->b_flags &= ~B_UNMAPPED;
2934	BUF_CHECK_MAPPED(bp);
2935	bpmap_qenter(bp);
2936}
2937
2938/*
2939 *	getblk:
2940 *
2941 *	Get a block given a specified block and offset into a file/device.
2942 *	The buffers B_DONE bit will be cleared on return, making it almost
2943 * 	ready for an I/O initiation.  B_INVAL may or may not be set on
2944 *	return.  The caller should clear B_INVAL prior to initiating a
2945 *	READ.
2946 *
2947 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2948 *	an existing buffer.
2949 *
2950 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
2951 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
2952 *	and then cleared based on the backing VM.  If the previous buffer is
2953 *	non-0-sized but invalid, B_CACHE will be cleared.
2954 *
2955 *	If getblk() must create a new buffer, the new buffer is returned with
2956 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
2957 *	case it is returned with B_INVAL clear and B_CACHE set based on the
2958 *	backing VM.
2959 *
2960 *	getblk() also forces a bwrite() for any B_DELWRI buffer whos
2961 *	B_CACHE bit is clear.
2962 *
2963 *	What this means, basically, is that the caller should use B_CACHE to
2964 *	determine whether the buffer is fully valid or not and should clear
2965 *	B_INVAL prior to issuing a read.  If the caller intends to validate
2966 *	the buffer by loading its data area with something, the caller needs
2967 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
2968 *	the caller should set B_CACHE ( as an optimization ), else the caller
2969 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
2970 *	a write attempt or if it was a successfull read.  If the caller
2971 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
2972 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
2973 */
2974struct buf *
2975getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
2976    int flags)
2977{
2978	struct buf *bp;
2979	struct bufobj *bo;
2980	int bsize, error, maxsize, vmio;
2981	off_t offset;
2982
2983	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
2984	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2985	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2986	ASSERT_VOP_LOCKED(vp, "getblk");
2987	if (size > MAXBSIZE)
2988		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
2989	if (!unmapped_buf_allowed)
2990		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2991
2992	bo = &vp->v_bufobj;
2993loop:
2994	/*
2995	 * Block if we are low on buffers.   Certain processes are allowed
2996	 * to completely exhaust the buffer cache.
2997         *
2998         * If this check ever becomes a bottleneck it may be better to
2999         * move it into the else, when gbincore() fails.  At the moment
3000         * it isn't a problem.
3001         */
3002	if (numfreebuffers == 0) {
3003		if (TD_IS_IDLETHREAD(curthread))
3004			return NULL;
3005		mtx_lock(&nblock);
3006		needsbuffer |= VFS_BIO_NEED_ANY;
3007		mtx_unlock(&nblock);
3008	}
3009
3010	BO_RLOCK(bo);
3011	bp = gbincore(bo, blkno);
3012	if (bp != NULL) {
3013		int lockflags;
3014		/*
3015		 * Buffer is in-core.  If the buffer is not busy nor managed,
3016		 * it must be on a queue.
3017		 */
3018		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
3019
3020		if (flags & GB_LOCK_NOWAIT)
3021			lockflags |= LK_NOWAIT;
3022
3023		error = BUF_TIMELOCK(bp, lockflags,
3024		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
3025
3026		/*
3027		 * If we slept and got the lock we have to restart in case
3028		 * the buffer changed identities.
3029		 */
3030		if (error == ENOLCK)
3031			goto loop;
3032		/* We timed out or were interrupted. */
3033		else if (error)
3034			return (NULL);
3035		/* If recursed, assume caller knows the rules. */
3036		else if (BUF_LOCKRECURSED(bp))
3037			goto end;
3038
3039		/*
3040		 * The buffer is locked.  B_CACHE is cleared if the buffer is
3041		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
3042		 * and for a VMIO buffer B_CACHE is adjusted according to the
3043		 * backing VM cache.
3044		 */
3045		if (bp->b_flags & B_INVAL)
3046			bp->b_flags &= ~B_CACHE;
3047		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
3048			bp->b_flags |= B_CACHE;
3049		if (bp->b_flags & B_MANAGED)
3050			MPASS(bp->b_qindex == QUEUE_NONE);
3051		else
3052			bremfree(bp);
3053
3054		/*
3055		 * check for size inconsistencies for non-VMIO case.
3056		 */
3057		if (bp->b_bcount != size) {
3058			if ((bp->b_flags & B_VMIO) == 0 ||
3059			    (size > bp->b_kvasize)) {
3060				if (bp->b_flags & B_DELWRI) {
3061					/*
3062					 * If buffer is pinned and caller does
3063					 * not want sleep  waiting for it to be
3064					 * unpinned, bail out
3065					 * */
3066					if (bp->b_pin_count > 0) {
3067						if (flags & GB_LOCK_NOWAIT) {
3068							bqrelse(bp);
3069							return (NULL);
3070						} else {
3071							bunpin_wait(bp);
3072						}
3073					}
3074					bp->b_flags |= B_NOCACHE;
3075					bwrite(bp);
3076				} else {
3077					if (LIST_EMPTY(&bp->b_dep)) {
3078						bp->b_flags |= B_RELBUF;
3079						brelse(bp);
3080					} else {
3081						bp->b_flags |= B_NOCACHE;
3082						bwrite(bp);
3083					}
3084				}
3085				goto loop;
3086			}
3087		}
3088
3089		/*
3090		 * Handle the case of unmapped buffer which should
3091		 * become mapped, or the buffer for which KVA
3092		 * reservation is requested.
3093		 */
3094		bp_unmapped_get_kva(bp, blkno, size, flags);
3095
3096		/*
3097		 * If the size is inconsistant in the VMIO case, we can resize
3098		 * the buffer.  This might lead to B_CACHE getting set or
3099		 * cleared.  If the size has not changed, B_CACHE remains
3100		 * unchanged from its previous state.
3101		 */
3102		if (bp->b_bcount != size)
3103			allocbuf(bp, size);
3104
3105		KASSERT(bp->b_offset != NOOFFSET,
3106		    ("getblk: no buffer offset"));
3107
3108		/*
3109		 * A buffer with B_DELWRI set and B_CACHE clear must
3110		 * be committed before we can return the buffer in
3111		 * order to prevent the caller from issuing a read
3112		 * ( due to B_CACHE not being set ) and overwriting
3113		 * it.
3114		 *
3115		 * Most callers, including NFS and FFS, need this to
3116		 * operate properly either because they assume they
3117		 * can issue a read if B_CACHE is not set, or because
3118		 * ( for example ) an uncached B_DELWRI might loop due
3119		 * to softupdates re-dirtying the buffer.  In the latter
3120		 * case, B_CACHE is set after the first write completes,
3121		 * preventing further loops.
3122		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
3123		 * above while extending the buffer, we cannot allow the
3124		 * buffer to remain with B_CACHE set after the write
3125		 * completes or it will represent a corrupt state.  To
3126		 * deal with this we set B_NOCACHE to scrap the buffer
3127		 * after the write.
3128		 *
3129		 * We might be able to do something fancy, like setting
3130		 * B_CACHE in bwrite() except if B_DELWRI is already set,
3131		 * so the below call doesn't set B_CACHE, but that gets real
3132		 * confusing.  This is much easier.
3133		 */
3134
3135		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
3136			bp->b_flags |= B_NOCACHE;
3137			bwrite(bp);
3138			goto loop;
3139		}
3140		bp->b_flags &= ~B_DONE;
3141	} else {
3142		/*
3143		 * Buffer is not in-core, create new buffer.  The buffer
3144		 * returned by getnewbuf() is locked.  Note that the returned
3145		 * buffer is also considered valid (not marked B_INVAL).
3146		 */
3147		BO_RUNLOCK(bo);
3148		/*
3149		 * If the user does not want us to create the buffer, bail out
3150		 * here.
3151		 */
3152		if (flags & GB_NOCREAT)
3153			return NULL;
3154		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
3155		offset = blkno * bsize;
3156		vmio = vp->v_object != NULL;
3157		if (vmio) {
3158			maxsize = size + (offset & PAGE_MASK);
3159		} else {
3160			maxsize = size;
3161			/* Do not allow non-VMIO notmapped buffers. */
3162			flags &= ~GB_UNMAPPED;
3163		}
3164		maxsize = imax(maxsize, bsize);
3165
3166		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
3167		if (bp == NULL) {
3168			if (slpflag || slptimeo)
3169				return NULL;
3170			goto loop;
3171		}
3172
3173		/*
3174		 * This code is used to make sure that a buffer is not
3175		 * created while the getnewbuf routine is blocked.
3176		 * This can be a problem whether the vnode is locked or not.
3177		 * If the buffer is created out from under us, we have to
3178		 * throw away the one we just created.
3179		 *
3180		 * Note: this must occur before we associate the buffer
3181		 * with the vp especially considering limitations in
3182		 * the splay tree implementation when dealing with duplicate
3183		 * lblkno's.
3184		 */
3185		BO_LOCK(bo);
3186		if (gbincore(bo, blkno)) {
3187			BO_UNLOCK(bo);
3188			bp->b_flags |= B_INVAL;
3189			brelse(bp);
3190			goto loop;
3191		}
3192
3193		/*
3194		 * Insert the buffer into the hash, so that it can
3195		 * be found by incore.
3196		 */
3197		bp->b_blkno = bp->b_lblkno = blkno;
3198		bp->b_offset = offset;
3199		bgetvp(vp, bp);
3200		BO_UNLOCK(bo);
3201
3202		/*
3203		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
3204		 * buffer size starts out as 0, B_CACHE will be set by
3205		 * allocbuf() for the VMIO case prior to it testing the
3206		 * backing store for validity.
3207		 */
3208
3209		if (vmio) {
3210			bp->b_flags |= B_VMIO;
3211			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
3212			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
3213			    bp, vp->v_object, bp->b_bufobj->bo_object));
3214		} else {
3215			bp->b_flags &= ~B_VMIO;
3216			KASSERT(bp->b_bufobj->bo_object == NULL,
3217			    ("ARGH! has b_bufobj->bo_object %p %p\n",
3218			    bp, bp->b_bufobj->bo_object));
3219			BUF_CHECK_MAPPED(bp);
3220		}
3221
3222		allocbuf(bp, size);
3223		bp->b_flags &= ~B_DONE;
3224	}
3225	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3226	BUF_ASSERT_HELD(bp);
3227end:
3228	KASSERT(bp->b_bufobj == bo,
3229	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3230	return (bp);
3231}
3232
3233/*
3234 * Get an empty, disassociated buffer of given size.  The buffer is initially
3235 * set to B_INVAL.
3236 */
3237struct buf *
3238geteblk(int size, int flags)
3239{
3240	struct buf *bp;
3241	int maxsize;
3242
3243	maxsize = (size + BKVAMASK) & ~BKVAMASK;
3244	while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
3245		if ((flags & GB_NOWAIT_BD) &&
3246		    (curthread->td_pflags & TDP_BUFNEED) != 0)
3247			return (NULL);
3248	}
3249	allocbuf(bp, size);
3250	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
3251	BUF_ASSERT_HELD(bp);
3252	return (bp);
3253}
3254
3255
3256/*
3257 * This code constitutes the buffer memory from either anonymous system
3258 * memory (in the case of non-VMIO operations) or from an associated
3259 * VM object (in the case of VMIO operations).  This code is able to
3260 * resize a buffer up or down.
3261 *
3262 * Note that this code is tricky, and has many complications to resolve
3263 * deadlock or inconsistant data situations.  Tread lightly!!!
3264 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3265 * the caller.  Calling this code willy nilly can result in the loss of data.
3266 *
3267 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3268 * B_CACHE for the non-VMIO case.
3269 */
3270
3271int
3272allocbuf(struct buf *bp, int size)
3273{
3274	int newbsize, mbsize;
3275	int i;
3276
3277	BUF_ASSERT_HELD(bp);
3278
3279	if (bp->b_kvasize < size)
3280		panic("allocbuf: buffer too small");
3281
3282	if ((bp->b_flags & B_VMIO) == 0) {
3283		caddr_t origbuf;
3284		int origbufsize;
3285		/*
3286		 * Just get anonymous memory from the kernel.  Don't
3287		 * mess with B_CACHE.
3288		 */
3289		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3290		if (bp->b_flags & B_MALLOC)
3291			newbsize = mbsize;
3292		else
3293			newbsize = round_page(size);
3294
3295		if (newbsize < bp->b_bufsize) {
3296			/*
3297			 * malloced buffers are not shrunk
3298			 */
3299			if (bp->b_flags & B_MALLOC) {
3300				if (newbsize) {
3301					bp->b_bcount = size;
3302				} else {
3303					free(bp->b_data, M_BIOBUF);
3304					if (bp->b_bufsize) {
3305						atomic_subtract_long(
3306						    &bufmallocspace,
3307						    bp->b_bufsize);
3308						bufspacewakeup();
3309						bp->b_bufsize = 0;
3310					}
3311					bp->b_saveaddr = bp->b_kvabase;
3312					bp->b_data = bp->b_saveaddr;
3313					bp->b_bcount = 0;
3314					bp->b_flags &= ~B_MALLOC;
3315				}
3316				return 1;
3317			}
3318			vm_hold_free_pages(bp, newbsize);
3319		} else if (newbsize > bp->b_bufsize) {
3320			/*
3321			 * We only use malloced memory on the first allocation.
3322			 * and revert to page-allocated memory when the buffer
3323			 * grows.
3324			 */
3325			/*
3326			 * There is a potential smp race here that could lead
3327			 * to bufmallocspace slightly passing the max.  It
3328			 * is probably extremely rare and not worth worrying
3329			 * over.
3330			 */
3331			if ( (bufmallocspace < maxbufmallocspace) &&
3332				(bp->b_bufsize == 0) &&
3333				(mbsize <= PAGE_SIZE/2)) {
3334
3335				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
3336				bp->b_bufsize = mbsize;
3337				bp->b_bcount = size;
3338				bp->b_flags |= B_MALLOC;
3339				atomic_add_long(&bufmallocspace, mbsize);
3340				return 1;
3341			}
3342			origbuf = NULL;
3343			origbufsize = 0;
3344			/*
3345			 * If the buffer is growing on its other-than-first allocation,
3346			 * then we revert to the page-allocation scheme.
3347			 */
3348			if (bp->b_flags & B_MALLOC) {
3349				origbuf = bp->b_data;
3350				origbufsize = bp->b_bufsize;
3351				bp->b_data = bp->b_kvabase;
3352				if (bp->b_bufsize) {
3353					atomic_subtract_long(&bufmallocspace,
3354					    bp->b_bufsize);
3355					bufspacewakeup();
3356					bp->b_bufsize = 0;
3357				}
3358				bp->b_flags &= ~B_MALLOC;
3359				newbsize = round_page(newbsize);
3360			}
3361			vm_hold_load_pages(
3362			    bp,
3363			    (vm_offset_t) bp->b_data + bp->b_bufsize,
3364			    (vm_offset_t) bp->b_data + newbsize);
3365			if (origbuf) {
3366				bcopy(origbuf, bp->b_data, origbufsize);
3367				free(origbuf, M_BIOBUF);
3368			}
3369		}
3370	} else {
3371		int desiredpages;
3372
3373		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3374		desiredpages = (size == 0) ? 0 :
3375			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3376
3377		if (bp->b_flags & B_MALLOC)
3378			panic("allocbuf: VMIO buffer can't be malloced");
3379		/*
3380		 * Set B_CACHE initially if buffer is 0 length or will become
3381		 * 0-length.
3382		 */
3383		if (size == 0 || bp->b_bufsize == 0)
3384			bp->b_flags |= B_CACHE;
3385
3386		if (newbsize < bp->b_bufsize) {
3387			/*
3388			 * DEV_BSIZE aligned new buffer size is less then the
3389			 * DEV_BSIZE aligned existing buffer size.  Figure out
3390			 * if we have to remove any pages.
3391			 */
3392			if (desiredpages < bp->b_npages) {
3393				vm_page_t m;
3394
3395				if ((bp->b_flags & B_UNMAPPED) == 0) {
3396					BUF_CHECK_MAPPED(bp);
3397					pmap_qremove((vm_offset_t)trunc_page(
3398					    (vm_offset_t)bp->b_data) +
3399					    (desiredpages << PAGE_SHIFT),
3400					    (bp->b_npages - desiredpages));
3401				} else
3402					BUF_CHECK_UNMAPPED(bp);
3403				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3404				for (i = desiredpages; i < bp->b_npages; i++) {
3405					/*
3406					 * the page is not freed here -- it
3407					 * is the responsibility of
3408					 * vnode_pager_setsize
3409					 */
3410					m = bp->b_pages[i];
3411					KASSERT(m != bogus_page,
3412					    ("allocbuf: bogus page found"));
3413					while (vm_page_sleep_if_busy(m, TRUE,
3414					    "biodep"))
3415						continue;
3416
3417					bp->b_pages[i] = NULL;
3418					vm_page_lock(m);
3419					vm_page_unwire(m, 0);
3420					vm_page_unlock(m);
3421				}
3422				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3423				bp->b_npages = desiredpages;
3424			}
3425		} else if (size > bp->b_bcount) {
3426			/*
3427			 * We are growing the buffer, possibly in a
3428			 * byte-granular fashion.
3429			 */
3430			vm_object_t obj;
3431			vm_offset_t toff;
3432			vm_offset_t tinc;
3433
3434			/*
3435			 * Step 1, bring in the VM pages from the object,
3436			 * allocating them if necessary.  We must clear
3437			 * B_CACHE if these pages are not valid for the
3438			 * range covered by the buffer.
3439			 */
3440
3441			obj = bp->b_bufobj->bo_object;
3442
3443			VM_OBJECT_WLOCK(obj);
3444			while (bp->b_npages < desiredpages) {
3445				vm_page_t m;
3446
3447				/*
3448				 * We must allocate system pages since blocking
3449				 * here could interfere with paging I/O, no
3450				 * matter which process we are.
3451				 *
3452				 * We can only test VPO_BUSY here.  Blocking on
3453				 * m->busy might lead to a deadlock:
3454				 *  vm_fault->getpages->cluster_read->allocbuf
3455				 * Thus, we specify VM_ALLOC_IGN_SBUSY.
3456				 */
3457				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
3458				    bp->b_npages, VM_ALLOC_NOBUSY |
3459				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3460				    VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY |
3461				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
3462				if (m->valid == 0)
3463					bp->b_flags &= ~B_CACHE;
3464				bp->b_pages[bp->b_npages] = m;
3465				++bp->b_npages;
3466			}
3467
3468			/*
3469			 * Step 2.  We've loaded the pages into the buffer,
3470			 * we have to figure out if we can still have B_CACHE
3471			 * set.  Note that B_CACHE is set according to the
3472			 * byte-granular range ( bcount and size ), new the
3473			 * aligned range ( newbsize ).
3474			 *
3475			 * The VM test is against m->valid, which is DEV_BSIZE
3476			 * aligned.  Needless to say, the validity of the data
3477			 * needs to also be DEV_BSIZE aligned.  Note that this
3478			 * fails with NFS if the server or some other client
3479			 * extends the file's EOF.  If our buffer is resized,
3480			 * B_CACHE may remain set! XXX
3481			 */
3482
3483			toff = bp->b_bcount;
3484			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
3485
3486			while ((bp->b_flags & B_CACHE) && toff < size) {
3487				vm_pindex_t pi;
3488
3489				if (tinc > (size - toff))
3490					tinc = size - toff;
3491
3492				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
3493				    PAGE_SHIFT;
3494
3495				vfs_buf_test_cache(
3496				    bp,
3497				    bp->b_offset,
3498				    toff,
3499				    tinc,
3500				    bp->b_pages[pi]
3501				);
3502				toff += tinc;
3503				tinc = PAGE_SIZE;
3504			}
3505			VM_OBJECT_WUNLOCK(obj);
3506
3507			/*
3508			 * Step 3, fixup the KVM pmap.
3509			 */
3510			if ((bp->b_flags & B_UNMAPPED) == 0)
3511				bpmap_qenter(bp);
3512			else
3513				BUF_CHECK_UNMAPPED(bp);
3514		}
3515	}
3516	if (newbsize < bp->b_bufsize)
3517		bufspacewakeup();
3518	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
3519	bp->b_bcount = size;		/* requested buffer size	*/
3520	return 1;
3521}
3522
3523extern int inflight_transient_maps;
3524
3525void
3526biodone(struct bio *bp)
3527{
3528	struct mtx *mtxp;
3529	void (*done)(struct bio *);
3530	vm_offset_t start, end;
3531	int transient;
3532
3533	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3534	mtx_lock(mtxp);
3535	bp->bio_flags |= BIO_DONE;
3536	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3537		start = trunc_page((vm_offset_t)bp->bio_data);
3538		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3539		transient = 1;
3540	} else {
3541		transient = 0;
3542		start = end = 0;
3543	}
3544	done = bp->bio_done;
3545	if (done == NULL)
3546		wakeup(bp);
3547	mtx_unlock(mtxp);
3548	if (done != NULL)
3549		done(bp);
3550	if (transient) {
3551		pmap_qremove(start, OFF_TO_IDX(end - start));
3552		vm_map_remove(bio_transient_map, start, end);
3553		atomic_add_int(&inflight_transient_maps, -1);
3554	}
3555}
3556
3557/*
3558 * Wait for a BIO to finish.
3559 *
3560 * XXX: resort to a timeout for now.  The optimal locking (if any) for this
3561 * case is not yet clear.
3562 */
3563int
3564biowait(struct bio *bp, const char *wchan)
3565{
3566	struct mtx *mtxp;
3567
3568	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3569	mtx_lock(mtxp);
3570	while ((bp->bio_flags & BIO_DONE) == 0)
3571		msleep(bp, mtxp, PRIBIO, wchan, hz / 10);
3572	mtx_unlock(mtxp);
3573	if (bp->bio_error != 0)
3574		return (bp->bio_error);
3575	if (!(bp->bio_flags & BIO_ERROR))
3576		return (0);
3577	return (EIO);
3578}
3579
3580void
3581biofinish(struct bio *bp, struct devstat *stat, int error)
3582{
3583
3584	if (error) {
3585		bp->bio_error = error;
3586		bp->bio_flags |= BIO_ERROR;
3587	}
3588	if (stat != NULL)
3589		devstat_end_transaction_bio(stat, bp);
3590	biodone(bp);
3591}
3592
3593/*
3594 *	bufwait:
3595 *
3596 *	Wait for buffer I/O completion, returning error status.  The buffer
3597 *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3598 *	error and cleared.
3599 */
3600int
3601bufwait(struct buf *bp)
3602{
3603	if (bp->b_iocmd == BIO_READ)
3604		bwait(bp, PRIBIO, "biord");
3605	else
3606		bwait(bp, PRIBIO, "biowr");
3607	if (bp->b_flags & B_EINTR) {
3608		bp->b_flags &= ~B_EINTR;
3609		return (EINTR);
3610	}
3611	if (bp->b_ioflags & BIO_ERROR) {
3612		return (bp->b_error ? bp->b_error : EIO);
3613	} else {
3614		return (0);
3615	}
3616}
3617
3618 /*
3619  * Call back function from struct bio back up to struct buf.
3620  */
3621static void
3622bufdonebio(struct bio *bip)
3623{
3624	struct buf *bp;
3625
3626	bp = bip->bio_caller2;
3627	bp->b_resid = bp->b_bcount - bip->bio_completed;
3628	bp->b_resid = bip->bio_resid;	/* XXX: remove */
3629	bp->b_ioflags = bip->bio_flags;
3630	bp->b_error = bip->bio_error;
3631	if (bp->b_error)
3632		bp->b_ioflags |= BIO_ERROR;
3633	bufdone(bp);
3634	g_destroy_bio(bip);
3635}
3636
3637void
3638dev_strategy(struct cdev *dev, struct buf *bp)
3639{
3640	struct cdevsw *csw;
3641	int ref;
3642
3643	KASSERT(dev->si_refcount > 0,
3644	    ("dev_strategy on un-referenced struct cdev *(%s) %p",
3645	    devtoname(dev), dev));
3646
3647	csw = dev_refthread(dev, &ref);
3648	dev_strategy_csw(dev, csw, bp);
3649	dev_relthread(dev, ref);
3650}
3651
3652void
3653dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
3654{
3655	struct bio *bip;
3656
3657	KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
3658	    ("b_iocmd botch"));
3659	KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
3660	    dev->si_threadcount > 0,
3661	    ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
3662	    dev));
3663	if (csw == NULL) {
3664		bp->b_error = ENXIO;
3665		bp->b_ioflags = BIO_ERROR;
3666		bufdone(bp);
3667		return;
3668	}
3669	for (;;) {
3670		bip = g_new_bio();
3671		if (bip != NULL)
3672			break;
3673		/* Try again later */
3674		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
3675	}
3676	bip->bio_cmd = bp->b_iocmd;
3677	bip->bio_offset = bp->b_iooffset;
3678	bip->bio_length = bp->b_bcount;
3679	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
3680	bdata2bio(bp, bip);
3681	bip->bio_done = bufdonebio;
3682	bip->bio_caller2 = bp;
3683	bip->bio_dev = dev;
3684	(*csw->d_strategy)(bip);
3685}
3686
3687/*
3688 *	bufdone:
3689 *
3690 *	Finish I/O on a buffer, optionally calling a completion function.
3691 *	This is usually called from an interrupt so process blocking is
3692 *	not allowed.
3693 *
3694 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
3695 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
3696 *	assuming B_INVAL is clear.
3697 *
3698 *	For the VMIO case, we set B_CACHE if the op was a read and no
3699 *	read error occured, or if the op was a write.  B_CACHE is never
3700 *	set if the buffer is invalid or otherwise uncacheable.
3701 *
3702 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
3703 *	initiator to leave B_INVAL set to brelse the buffer out of existance
3704 *	in the biodone routine.
3705 */
3706void
3707bufdone(struct buf *bp)
3708{
3709	struct bufobj *dropobj;
3710	void    (*biodone)(struct buf *);
3711
3712	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3713	dropobj = NULL;
3714
3715	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3716	BUF_ASSERT_HELD(bp);
3717
3718	runningbufwakeup(bp);
3719	if (bp->b_iocmd == BIO_WRITE)
3720		dropobj = bp->b_bufobj;
3721	/* call optional completion function if requested */
3722	if (bp->b_iodone != NULL) {
3723		biodone = bp->b_iodone;
3724		bp->b_iodone = NULL;
3725		(*biodone) (bp);
3726		if (dropobj)
3727			bufobj_wdrop(dropobj);
3728		return;
3729	}
3730
3731	bufdone_finish(bp);
3732
3733	if (dropobj)
3734		bufobj_wdrop(dropobj);
3735}
3736
3737void
3738bufdone_finish(struct buf *bp)
3739{
3740	BUF_ASSERT_HELD(bp);
3741
3742	if (!LIST_EMPTY(&bp->b_dep))
3743		buf_complete(bp);
3744
3745	if (bp->b_flags & B_VMIO) {
3746		vm_ooffset_t foff;
3747		vm_page_t m;
3748		vm_object_t obj;
3749		struct vnode *vp;
3750		int bogus, i, iosize;
3751
3752		obj = bp->b_bufobj->bo_object;
3753		KASSERT(obj->paging_in_progress >= bp->b_npages,
3754		    ("biodone_finish: paging in progress(%d) < b_npages(%d)",
3755		    obj->paging_in_progress, bp->b_npages));
3756
3757		vp = bp->b_vp;
3758		KASSERT(vp->v_holdcnt > 0,
3759		    ("biodone_finish: vnode %p has zero hold count", vp));
3760		KASSERT(vp->v_object != NULL,
3761		    ("biodone_finish: vnode %p has no vm_object", vp));
3762
3763		foff = bp->b_offset;
3764		KASSERT(bp->b_offset != NOOFFSET,
3765		    ("biodone_finish: bp %p has no buffer offset", bp));
3766
3767		/*
3768		 * Set B_CACHE if the op was a normal read and no error
3769		 * occured.  B_CACHE is set for writes in the b*write()
3770		 * routines.
3771		 */
3772		iosize = bp->b_bcount - bp->b_resid;
3773		if (bp->b_iocmd == BIO_READ &&
3774		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
3775		    !(bp->b_ioflags & BIO_ERROR)) {
3776			bp->b_flags |= B_CACHE;
3777		}
3778		bogus = 0;
3779		VM_OBJECT_WLOCK(obj);
3780		for (i = 0; i < bp->b_npages; i++) {
3781			int bogusflag = 0;
3782			int resid;
3783
3784			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3785			if (resid > iosize)
3786				resid = iosize;
3787
3788			/*
3789			 * cleanup bogus pages, restoring the originals
3790			 */
3791			m = bp->b_pages[i];
3792			if (m == bogus_page) {
3793				bogus = bogusflag = 1;
3794				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3795				if (m == NULL)
3796					panic("biodone: page disappeared!");
3797				bp->b_pages[i] = m;
3798			}
3799			KASSERT(OFF_TO_IDX(foff) == m->pindex,
3800			    ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
3801			    (intmax_t)foff, (uintmax_t)m->pindex));
3802
3803			/*
3804			 * In the write case, the valid and clean bits are
3805			 * already changed correctly ( see bdwrite() ), so we
3806			 * only need to do this here in the read case.
3807			 */
3808			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
3809				KASSERT((m->dirty & vm_page_bits(foff &
3810				    PAGE_MASK, resid)) == 0, ("bufdone_finish:"
3811				    " page %p has unexpected dirty bits", m));
3812				vfs_page_set_valid(bp, foff, m);
3813			}
3814
3815			vm_page_io_finish(m);
3816			vm_object_pip_subtract(obj, 1);
3817			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3818			iosize -= resid;
3819		}
3820		vm_object_pip_wakeupn(obj, 0);
3821		VM_OBJECT_WUNLOCK(obj);
3822		if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
3823			BUF_CHECK_MAPPED(bp);
3824			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3825			    bp->b_pages, bp->b_npages);
3826		}
3827	}
3828
3829	/*
3830	 * For asynchronous completions, release the buffer now. The brelse
3831	 * will do a wakeup there if necessary - so no need to do a wakeup
3832	 * here in the async case. The sync case always needs to do a wakeup.
3833	 */
3834
3835	if (bp->b_flags & B_ASYNC) {
3836		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
3837			brelse(bp);
3838		else
3839			bqrelse(bp);
3840	} else
3841		bdone(bp);
3842}
3843
3844/*
3845 * This routine is called in lieu of iodone in the case of
3846 * incomplete I/O.  This keeps the busy status for pages
3847 * consistant.
3848 */
3849void
3850vfs_unbusy_pages(struct buf *bp)
3851{
3852	int i;
3853	vm_object_t obj;
3854	vm_page_t m;
3855
3856	runningbufwakeup(bp);
3857	if (!(bp->b_flags & B_VMIO))
3858		return;
3859
3860	obj = bp->b_bufobj->bo_object;
3861	VM_OBJECT_WLOCK(obj);
3862	for (i = 0; i < bp->b_npages; i++) {
3863		m = bp->b_pages[i];
3864		if (m == bogus_page) {
3865			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3866			if (!m)
3867				panic("vfs_unbusy_pages: page missing\n");
3868			bp->b_pages[i] = m;
3869			if ((bp->b_flags & B_UNMAPPED) == 0) {
3870				BUF_CHECK_MAPPED(bp);
3871				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3872				    bp->b_pages, bp->b_npages);
3873			} else
3874				BUF_CHECK_UNMAPPED(bp);
3875		}
3876		vm_object_pip_subtract(obj, 1);
3877		vm_page_io_finish(m);
3878	}
3879	vm_object_pip_wakeupn(obj, 0);
3880	VM_OBJECT_WUNLOCK(obj);
3881}
3882
3883/*
3884 * vfs_page_set_valid:
3885 *
3886 *	Set the valid bits in a page based on the supplied offset.   The
3887 *	range is restricted to the buffer's size.
3888 *
3889 *	This routine is typically called after a read completes.
3890 */
3891static void
3892vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3893{
3894	vm_ooffset_t eoff;
3895
3896	/*
3897	 * Compute the end offset, eoff, such that [off, eoff) does not span a
3898	 * page boundary and eoff is not greater than the end of the buffer.
3899	 * The end of the buffer, in this case, is our file EOF, not the
3900	 * allocation size of the buffer.
3901	 */
3902	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
3903	if (eoff > bp->b_offset + bp->b_bcount)
3904		eoff = bp->b_offset + bp->b_bcount;
3905
3906	/*
3907	 * Set valid range.  This is typically the entire buffer and thus the
3908	 * entire page.
3909	 */
3910	if (eoff > off)
3911		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
3912}
3913
3914/*
3915 * vfs_page_set_validclean:
3916 *
3917 *	Set the valid bits and clear the dirty bits in a page based on the
3918 *	supplied offset.   The range is restricted to the buffer's size.
3919 */
3920static void
3921vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3922{
3923	vm_ooffset_t soff, eoff;
3924
3925	/*
3926	 * Start and end offsets in buffer.  eoff - soff may not cross a
3927	 * page boundry or cross the end of the buffer.  The end of the
3928	 * buffer, in this case, is our file EOF, not the allocation size
3929	 * of the buffer.
3930	 */
3931	soff = off;
3932	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3933	if (eoff > bp->b_offset + bp->b_bcount)
3934		eoff = bp->b_offset + bp->b_bcount;
3935
3936	/*
3937	 * Set valid range.  This is typically the entire buffer and thus the
3938	 * entire page.
3939	 */
3940	if (eoff > soff) {
3941		vm_page_set_validclean(
3942		    m,
3943		   (vm_offset_t) (soff & PAGE_MASK),
3944		   (vm_offset_t) (eoff - soff)
3945		);
3946	}
3947}
3948
3949/*
3950 * Ensure that all buffer pages are not busied by VPO_BUSY flag. If
3951 * any page is busy, drain the flag.
3952 */
3953static void
3954vfs_drain_busy_pages(struct buf *bp)
3955{
3956	vm_page_t m;
3957	int i, last_busied;
3958
3959	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
3960	last_busied = 0;
3961	for (i = 0; i < bp->b_npages; i++) {
3962		m = bp->b_pages[i];
3963		if ((m->oflags & VPO_BUSY) != 0) {
3964			for (; last_busied < i; last_busied++)
3965				vm_page_busy(bp->b_pages[last_busied]);
3966			while ((m->oflags & VPO_BUSY) != 0)
3967				vm_page_sleep(m, "vbpage");
3968		}
3969	}
3970	for (i = 0; i < last_busied; i++)
3971		vm_page_wakeup(bp->b_pages[i]);
3972}
3973
3974/*
3975 * This routine is called before a device strategy routine.
3976 * It is used to tell the VM system that paging I/O is in
3977 * progress, and treat the pages associated with the buffer
3978 * almost as being VPO_BUSY.  Also the object paging_in_progress
3979 * flag is handled to make sure that the object doesn't become
3980 * inconsistant.
3981 *
3982 * Since I/O has not been initiated yet, certain buffer flags
3983 * such as BIO_ERROR or B_INVAL may be in an inconsistant state
3984 * and should be ignored.
3985 */
3986void
3987vfs_busy_pages(struct buf *bp, int clear_modify)
3988{
3989	int i, bogus;
3990	vm_object_t obj;
3991	vm_ooffset_t foff;
3992	vm_page_t m;
3993
3994	if (!(bp->b_flags & B_VMIO))
3995		return;
3996
3997	obj = bp->b_bufobj->bo_object;
3998	foff = bp->b_offset;
3999	KASSERT(bp->b_offset != NOOFFSET,
4000	    ("vfs_busy_pages: no buffer offset"));
4001	VM_OBJECT_WLOCK(obj);
4002	vfs_drain_busy_pages(bp);
4003	if (bp->b_bufsize != 0)
4004		vfs_setdirty_locked_object(bp);
4005	bogus = 0;
4006	for (i = 0; i < bp->b_npages; i++) {
4007		m = bp->b_pages[i];
4008
4009		if ((bp->b_flags & B_CLUSTER) == 0) {
4010			vm_object_pip_add(obj, 1);
4011			vm_page_io_start(m);
4012		}
4013		/*
4014		 * When readying a buffer for a read ( i.e
4015		 * clear_modify == 0 ), it is important to do
4016		 * bogus_page replacement for valid pages in
4017		 * partially instantiated buffers.  Partially
4018		 * instantiated buffers can, in turn, occur when
4019		 * reconstituting a buffer from its VM backing store
4020		 * base.  We only have to do this if B_CACHE is
4021		 * clear ( which causes the I/O to occur in the
4022		 * first place ).  The replacement prevents the read
4023		 * I/O from overwriting potentially dirty VM-backed
4024		 * pages.  XXX bogus page replacement is, uh, bogus.
4025		 * It may not work properly with small-block devices.
4026		 * We need to find a better way.
4027		 */
4028		if (clear_modify) {
4029			pmap_remove_write(m);
4030			vfs_page_set_validclean(bp, foff, m);
4031		} else if (m->valid == VM_PAGE_BITS_ALL &&
4032		    (bp->b_flags & B_CACHE) == 0) {
4033			bp->b_pages[i] = bogus_page;
4034			bogus++;
4035		}
4036		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4037	}
4038	VM_OBJECT_WUNLOCK(obj);
4039	if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
4040		BUF_CHECK_MAPPED(bp);
4041		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4042		    bp->b_pages, bp->b_npages);
4043	}
4044}
4045
4046/*
4047 *	vfs_bio_set_valid:
4048 *
4049 *	Set the range within the buffer to valid.  The range is
4050 *	relative to the beginning of the buffer, b_offset.  Note that
4051 *	b_offset itself may be offset from the beginning of the first
4052 *	page.
4053 */
4054void
4055vfs_bio_set_valid(struct buf *bp, int base, int size)
4056{
4057	int i, n;
4058	vm_page_t m;
4059
4060	if (!(bp->b_flags & B_VMIO))
4061		return;
4062
4063	/*
4064	 * Fixup base to be relative to beginning of first page.
4065	 * Set initial n to be the maximum number of bytes in the
4066	 * first page that can be validated.
4067	 */
4068	base += (bp->b_offset & PAGE_MASK);
4069	n = PAGE_SIZE - (base & PAGE_MASK);
4070
4071	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4072	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4073		m = bp->b_pages[i];
4074		if (n > size)
4075			n = size;
4076		vm_page_set_valid_range(m, base & PAGE_MASK, n);
4077		base += n;
4078		size -= n;
4079		n = PAGE_SIZE;
4080	}
4081	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4082}
4083
4084/*
4085 *	vfs_bio_clrbuf:
4086 *
4087 *	If the specified buffer is a non-VMIO buffer, clear the entire
4088 *	buffer.  If the specified buffer is a VMIO buffer, clear and
4089 *	validate only the previously invalid portions of the buffer.
4090 *	This routine essentially fakes an I/O, so we need to clear
4091 *	BIO_ERROR and B_INVAL.
4092 *
4093 *	Note that while we only theoretically need to clear through b_bcount,
4094 *	we go ahead and clear through b_bufsize.
4095 */
4096void
4097vfs_bio_clrbuf(struct buf *bp)
4098{
4099	int i, j, mask, sa, ea, slide;
4100
4101	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4102		clrbuf(bp);
4103		return;
4104	}
4105	bp->b_flags &= ~B_INVAL;
4106	bp->b_ioflags &= ~BIO_ERROR;
4107	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4108	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4109	    (bp->b_offset & PAGE_MASK) == 0) {
4110		if (bp->b_pages[0] == bogus_page)
4111			goto unlock;
4112		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4113		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4114		if ((bp->b_pages[0]->valid & mask) == mask)
4115			goto unlock;
4116		if ((bp->b_pages[0]->valid & mask) == 0) {
4117			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4118			bp->b_pages[0]->valid |= mask;
4119			goto unlock;
4120		}
4121	}
4122	sa = bp->b_offset & PAGE_MASK;
4123	slide = 0;
4124	for (i = 0; i < bp->b_npages; i++, sa = 0) {
4125		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4126		ea = slide & PAGE_MASK;
4127		if (ea == 0)
4128			ea = PAGE_SIZE;
4129		if (bp->b_pages[i] == bogus_page)
4130			continue;
4131		j = sa / DEV_BSIZE;
4132		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4133		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4134		if ((bp->b_pages[i]->valid & mask) == mask)
4135			continue;
4136		if ((bp->b_pages[i]->valid & mask) == 0)
4137			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4138		else {
4139			for (; sa < ea; sa += DEV_BSIZE, j++) {
4140				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4141					pmap_zero_page_area(bp->b_pages[i],
4142					    sa, DEV_BSIZE);
4143				}
4144			}
4145		}
4146		bp->b_pages[i]->valid |= mask;
4147	}
4148unlock:
4149	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4150	bp->b_resid = 0;
4151}
4152
4153void
4154vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4155{
4156	vm_page_t m;
4157	int i, n;
4158
4159	if ((bp->b_flags & B_UNMAPPED) == 0) {
4160		BUF_CHECK_MAPPED(bp);
4161		bzero(bp->b_data + base, size);
4162	} else {
4163		BUF_CHECK_UNMAPPED(bp);
4164		n = PAGE_SIZE - (base & PAGE_MASK);
4165		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4166			m = bp->b_pages[i];
4167			if (n > size)
4168				n = size;
4169			pmap_zero_page_area(m, base & PAGE_MASK, n);
4170			base += n;
4171			size -= n;
4172			n = PAGE_SIZE;
4173		}
4174	}
4175}
4176
4177/*
4178 * vm_hold_load_pages and vm_hold_free_pages get pages into
4179 * a buffers address space.  The pages are anonymous and are
4180 * not associated with a file object.
4181 */
4182static void
4183vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4184{
4185	vm_offset_t pg;
4186	vm_page_t p;
4187	int index;
4188
4189	BUF_CHECK_MAPPED(bp);
4190
4191	to = round_page(to);
4192	from = round_page(from);
4193	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4194
4195	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4196tryagain:
4197		/*
4198		 * note: must allocate system pages since blocking here
4199		 * could interfere with paging I/O, no matter which
4200		 * process we are.
4201		 */
4202		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4203		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4204		if (p == NULL) {
4205			VM_WAIT;
4206			goto tryagain;
4207		}
4208		pmap_qenter(pg, &p, 1);
4209		bp->b_pages[index] = p;
4210	}
4211	bp->b_npages = index;
4212}
4213
4214/* Return pages associated with this buf to the vm system */
4215static void
4216vm_hold_free_pages(struct buf *bp, int newbsize)
4217{
4218	vm_offset_t from;
4219	vm_page_t p;
4220	int index, newnpages;
4221
4222	BUF_CHECK_MAPPED(bp);
4223
4224	from = round_page((vm_offset_t)bp->b_data + newbsize);
4225	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4226	if (bp->b_npages > newnpages)
4227		pmap_qremove(from, bp->b_npages - newnpages);
4228	for (index = newnpages; index < bp->b_npages; index++) {
4229		p = bp->b_pages[index];
4230		bp->b_pages[index] = NULL;
4231		if (p->busy != 0)
4232			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4233			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4234		p->wire_count--;
4235		vm_page_free(p);
4236		atomic_subtract_int(&cnt.v_wire_count, 1);
4237	}
4238	bp->b_npages = newnpages;
4239}
4240
4241/*
4242 * Map an IO request into kernel virtual address space.
4243 *
4244 * All requests are (re)mapped into kernel VA space.
4245 * Notice that we use b_bufsize for the size of the buffer
4246 * to be mapped.  b_bcount might be modified by the driver.
4247 *
4248 * Note that even if the caller determines that the address space should
4249 * be valid, a race or a smaller-file mapped into a larger space may
4250 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4251 * check the return value.
4252 */
4253int
4254vmapbuf(struct buf *bp, int mapbuf)
4255{
4256	caddr_t kva;
4257	vm_prot_t prot;
4258	int pidx;
4259
4260	if (bp->b_bufsize < 0)
4261		return (-1);
4262	prot = VM_PROT_READ;
4263	if (bp->b_iocmd == BIO_READ)
4264		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
4265	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4266	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4267	    btoc(MAXPHYS))) < 0)
4268		return (-1);
4269	bp->b_npages = pidx;
4270	if (mapbuf || !unmapped_buf_allowed) {
4271		pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
4272		kva = bp->b_saveaddr;
4273		bp->b_saveaddr = bp->b_data;
4274		bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
4275		bp->b_flags &= ~B_UNMAPPED;
4276	} else {
4277		bp->b_flags |= B_UNMAPPED;
4278		bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4279		bp->b_saveaddr = bp->b_data;
4280		bp->b_data = unmapped_buf;
4281	}
4282	return(0);
4283}
4284
4285/*
4286 * Free the io map PTEs associated with this IO operation.
4287 * We also invalidate the TLB entries and restore the original b_addr.
4288 */
4289void
4290vunmapbuf(struct buf *bp)
4291{
4292	int npages;
4293
4294	npages = bp->b_npages;
4295	if (bp->b_flags & B_UNMAPPED)
4296		bp->b_flags &= ~B_UNMAPPED;
4297	else
4298		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4299	vm_page_unhold_pages(bp->b_pages, npages);
4300
4301	bp->b_data = bp->b_saveaddr;
4302}
4303
4304void
4305bdone(struct buf *bp)
4306{
4307	struct mtx *mtxp;
4308
4309	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4310	mtx_lock(mtxp);
4311	bp->b_flags |= B_DONE;
4312	wakeup(bp);
4313	mtx_unlock(mtxp);
4314}
4315
4316void
4317bwait(struct buf *bp, u_char pri, const char *wchan)
4318{
4319	struct mtx *mtxp;
4320
4321	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4322	mtx_lock(mtxp);
4323	while ((bp->b_flags & B_DONE) == 0)
4324		msleep(bp, mtxp, pri, wchan, 0);
4325	mtx_unlock(mtxp);
4326}
4327
4328int
4329bufsync(struct bufobj *bo, int waitfor)
4330{
4331
4332	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4333}
4334
4335void
4336bufstrategy(struct bufobj *bo, struct buf *bp)
4337{
4338	int i = 0;
4339	struct vnode *vp;
4340
4341	vp = bp->b_vp;
4342	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4343	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4344	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4345	i = VOP_STRATEGY(vp, bp);
4346	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4347}
4348
4349void
4350bufobj_wrefl(struct bufobj *bo)
4351{
4352
4353	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4354	ASSERT_BO_WLOCKED(bo);
4355	bo->bo_numoutput++;
4356}
4357
4358void
4359bufobj_wref(struct bufobj *bo)
4360{
4361
4362	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4363	BO_LOCK(bo);
4364	bo->bo_numoutput++;
4365	BO_UNLOCK(bo);
4366}
4367
4368void
4369bufobj_wdrop(struct bufobj *bo)
4370{
4371
4372	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4373	BO_LOCK(bo);
4374	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4375	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4376		bo->bo_flag &= ~BO_WWAIT;
4377		wakeup(&bo->bo_numoutput);
4378	}
4379	BO_UNLOCK(bo);
4380}
4381
4382int
4383bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4384{
4385	int error;
4386
4387	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4388	ASSERT_BO_WLOCKED(bo);
4389	error = 0;
4390	while (bo->bo_numoutput) {
4391		bo->bo_flag |= BO_WWAIT;
4392		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4393		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4394		if (error)
4395			break;
4396	}
4397	return (error);
4398}
4399
4400void
4401bpin(struct buf *bp)
4402{
4403	struct mtx *mtxp;
4404
4405	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4406	mtx_lock(mtxp);
4407	bp->b_pin_count++;
4408	mtx_unlock(mtxp);
4409}
4410
4411void
4412bunpin(struct buf *bp)
4413{
4414	struct mtx *mtxp;
4415
4416	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4417	mtx_lock(mtxp);
4418	if (--bp->b_pin_count == 0)
4419		wakeup(bp);
4420	mtx_unlock(mtxp);
4421}
4422
4423void
4424bunpin_wait(struct buf *bp)
4425{
4426	struct mtx *mtxp;
4427
4428	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4429	mtx_lock(mtxp);
4430	while (bp->b_pin_count > 0)
4431		msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4432	mtx_unlock(mtxp);
4433}
4434
4435/*
4436 * Set bio_data or bio_ma for struct bio from the struct buf.
4437 */
4438void
4439bdata2bio(struct buf *bp, struct bio *bip)
4440{
4441
4442	if ((bp->b_flags & B_UNMAPPED) != 0) {
4443		KASSERT(unmapped_buf_allowed, ("unmapped"));
4444		bip->bio_ma = bp->b_pages;
4445		bip->bio_ma_n = bp->b_npages;
4446		bip->bio_data = unmapped_buf;
4447		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4448		bip->bio_flags |= BIO_UNMAPPED;
4449		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4450		    PAGE_SIZE == bp->b_npages,
4451		    ("Buffer %p too short: %d %d %d", bp, bip->bio_ma_offset,
4452		    bip->bio_length, bip->bio_ma_n));
4453	} else {
4454		bip->bio_data = bp->b_data;
4455		bip->bio_ma = NULL;
4456	}
4457}
4458
4459#include "opt_ddb.h"
4460#ifdef DDB
4461#include <ddb/ddb.h>
4462
4463/* DDB command to show buffer data */
4464DB_SHOW_COMMAND(buffer, db_show_buffer)
4465{
4466	/* get args */
4467	struct buf *bp = (struct buf *)addr;
4468
4469	if (!have_addr) {
4470		db_printf("usage: show buffer <addr>\n");
4471		return;
4472	}
4473
4474	db_printf("buf at %p\n", bp);
4475	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4476	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4477	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4478	db_printf(
4479	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4480	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4481	    "b_dep = %p\n",
4482	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4483	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4484	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4485	if (bp->b_npages) {
4486		int i;
4487		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4488		for (i = 0; i < bp->b_npages; i++) {
4489			vm_page_t m;
4490			m = bp->b_pages[i];
4491			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4492			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4493			if ((i + 1) < bp->b_npages)
4494				db_printf(",");
4495		}
4496		db_printf("\n");
4497	}
4498	db_printf(" ");
4499	BUF_LOCKPRINTINFO(bp);
4500}
4501
4502DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4503{
4504	struct buf *bp;
4505	int i;
4506
4507	for (i = 0; i < nbuf; i++) {
4508		bp = &buf[i];
4509		if (BUF_ISLOCKED(bp)) {
4510			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4511			db_printf("\n");
4512		}
4513	}
4514}
4515
4516DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4517{
4518	struct vnode *vp;
4519	struct buf *bp;
4520
4521	if (!have_addr) {
4522		db_printf("usage: show vnodebufs <addr>\n");
4523		return;
4524	}
4525	vp = (struct vnode *)addr;
4526	db_printf("Clean buffers:\n");
4527	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4528		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4529		db_printf("\n");
4530	}
4531	db_printf("Dirty buffers:\n");
4532	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4533		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4534		db_printf("\n");
4535	}
4536}
4537
4538DB_COMMAND(countfreebufs, db_coundfreebufs)
4539{
4540	struct buf *bp;
4541	int i, used = 0, nfree = 0;
4542
4543	if (have_addr) {
4544		db_printf("usage: countfreebufs\n");
4545		return;
4546	}
4547
4548	for (i = 0; i < nbuf; i++) {
4549		bp = &buf[i];
4550		if ((bp->b_flags & B_INFREECNT) != 0)
4551			nfree++;
4552		else
4553			used++;
4554	}
4555
4556	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4557	    nfree + used);
4558	db_printf("numfreebuffers is %d\n", numfreebuffers);
4559}
4560#endif /* DDB */
4561