vfs_bio.c revision 285819
1/*-
2 * Copyright (c) 2004 Poul-Henning Kamp
3 * Copyright (c) 1994,1997 John S. Dyson
4 * Copyright (c) 2013 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Konstantin Belousov
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * this file contains a new buffer I/O scheme implementing a coherent
34 * VM object and buffer cache scheme.  Pains have been taken to make
35 * sure that the performance degradation associated with schemes such
36 * as this is not realized.
37 *
38 * Author:  John S. Dyson
39 * Significant help during the development and debugging phases
40 * had been provided by David Greenman, also of the FreeBSD core team.
41 *
42 * see man buf(9) for more info.
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 285819 2015-07-23 19:13:41Z jeff $");
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/conf.h>
52#include <sys/buf.h>
53#include <sys/devicestat.h>
54#include <sys/eventhandler.h>
55#include <sys/fail.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/proc.h>
64#include <sys/resourcevar.h>
65#include <sys/rwlock.h>
66#include <sys/sysctl.h>
67#include <sys/vmem.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70#include <geom/geom.h>
71#include <vm/vm.h>
72#include <vm/vm_param.h>
73#include <vm/vm_kern.h>
74#include <vm/vm_pageout.h>
75#include <vm/vm_page.h>
76#include <vm/vm_object.h>
77#include <vm/vm_extern.h>
78#include <vm/vm_map.h>
79#include "opt_compat.h"
80#include "opt_swap.h"
81
82static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
83
84struct	bio_ops bioops;		/* I/O operation notification */
85
86struct	buf_ops buf_ops_bio = {
87	.bop_name	=	"buf_ops_bio",
88	.bop_write	=	bufwrite,
89	.bop_strategy	=	bufstrategy,
90	.bop_sync	=	bufsync,
91	.bop_bdflush	=	bufbdflush,
92};
93
94/*
95 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
96 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
97 */
98struct buf *buf;		/* buffer header pool */
99caddr_t unmapped_buf;
100
101/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
102struct proc *bufdaemonproc;
103
104static int inmem(struct vnode *vp, daddr_t blkno);
105static void vm_hold_free_pages(struct buf *bp, int newbsize);
106static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
107		vm_offset_t to);
108static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
109static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
110		vm_page_t m);
111static void vfs_clean_pages_dirty_buf(struct buf *bp);
112static void vfs_setdirty_locked_object(struct buf *bp);
113static void vfs_vmio_release(struct buf *bp);
114static int vfs_bio_clcheck(struct vnode *vp, int size,
115		daddr_t lblkno, daddr_t blkno);
116static int buf_flush(struct vnode *vp, int);
117static int flushbufqueues(struct vnode *, int, int);
118static void buf_daemon(void);
119static void bremfreel(struct buf *bp);
120static __inline void bd_wakeup(void);
121static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
122#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
123    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
124static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
125#endif
126
127int vmiodirenable = TRUE;
128SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
129    "Use the VM system for directory writes");
130long runningbufspace;
131SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
132    "Amount of presently outstanding async buffer io");
133static long bufspace;
134#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
135    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
136SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
137    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
138#else
139SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
140    "Physical memory used for buffers");
141#endif
142static long bufkvaspace;
143SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
144    "Kernel virtual memory used for buffers");
145static long maxbufspace;
146SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
147    "Maximum allowed value of bufspace (including buf_daemon)");
148static long bufmallocspace;
149SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
150    "Amount of malloced memory for buffers");
151static long maxbufmallocspace;
152SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
153    "Maximum amount of malloced memory for buffers");
154static long lobufspace;
155SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
156    "Minimum amount of buffer space we want to have");
157long hibufspace;
158SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
159    "Maximum allowed value of bufspace (excluding buf_daemon)");
160static int bufreusecnt;
161SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
162    "Number of times we have reused a buffer");
163static int buffreekvacnt;
164SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
165    "Number of times we have freed the KVA space from some buffer");
166static int bufdefragcnt;
167SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
168    "Number of times we have had to repeat buffer allocation to defragment");
169static long lorunningspace;
170SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
171    CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
172    "Minimum preferred space used for in-progress I/O");
173static long hirunningspace;
174SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
175    CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
176    "Maximum amount of space to use for in-progress I/O");
177int dirtybufferflushes;
178SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
179    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
180int bdwriteskip;
181SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
182    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
183int altbufferflushes;
184SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
185    0, "Number of fsync flushes to limit dirty buffers");
186static int recursiveflushes;
187SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
188    0, "Number of flushes skipped due to being recursive");
189static int numdirtybuffers;
190SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
191    "Number of buffers that are dirty (have unwritten changes) at the moment");
192static int lodirtybuffers;
193SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
194    "How many buffers we want to have free before bufdaemon can sleep");
195static int hidirtybuffers;
196SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
197    "When the number of dirty buffers is considered severe");
198int dirtybufthresh;
199SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
200    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
201static int numfreebuffers;
202SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
203    "Number of free buffers");
204static int lofreebuffers;
205SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
206   "XXX Unused");
207static int hifreebuffers;
208SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
209   "XXX Complicatedly unused");
210static int getnewbufcalls;
211SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
212   "Number of calls to getnewbuf");
213static int getnewbufrestarts;
214SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
215    "Number of times getnewbuf has had to restart a buffer acquisition");
216static int mappingrestarts;
217SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
218    "Number of times getblk has had to restart a buffer mapping for "
219    "unmapped buffer");
220static int flushbufqtarget = 100;
221SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
222    "Amount of work to do in flushbufqueues when helping bufdaemon");
223static long notbufdflushes;
224SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
225    "Number of dirty buffer flushes done by the bufdaemon helpers");
226static long barrierwrites;
227SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
228    "Number of barrier writes");
229SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
230    &unmapped_buf_allowed, 0,
231    "Permit the use of unmapped I/O");
232
233/*
234 * Lock for the non-dirty bufqueues
235 */
236static struct mtx_padalign bqclean;
237
238/*
239 * Lock for the dirty queue.
240 */
241static struct mtx_padalign bqdirty;
242
243/*
244 * This lock synchronizes access to bd_request.
245 */
246static struct mtx_padalign bdlock;
247
248/*
249 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
250 * waitrunningbufspace().
251 */
252static struct mtx_padalign rbreqlock;
253
254/*
255 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
256 */
257static struct rwlock_padalign nblock;
258
259/*
260 * Lock that protects bdirtywait.
261 */
262static struct mtx_padalign bdirtylock;
263
264/*
265 * Wakeup point for bufdaemon, as well as indicator of whether it is already
266 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
267 * is idling.
268 */
269static int bd_request;
270
271/*
272 * Request for the buf daemon to write more buffers than is indicated by
273 * lodirtybuffers.  This may be necessary to push out excess dependencies or
274 * defragment the address space where a simple count of the number of dirty
275 * buffers is insufficient to characterize the demand for flushing them.
276 */
277static int bd_speedupreq;
278
279/*
280 * bogus page -- for I/O to/from partially complete buffers
281 * this is a temporary solution to the problem, but it is not
282 * really that bad.  it would be better to split the buffer
283 * for input in the case of buffers partially already in memory,
284 * but the code is intricate enough already.
285 */
286vm_page_t bogus_page;
287
288/*
289 * Synchronization (sleep/wakeup) variable for active buffer space requests.
290 * Set when wait starts, cleared prior to wakeup().
291 * Used in runningbufwakeup() and waitrunningbufspace().
292 */
293static int runningbufreq;
294
295/*
296 * Synchronization (sleep/wakeup) variable for buffer requests.
297 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
298 * by and/or.
299 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
300 * getnewbuf(), and getblk().
301 */
302static volatile int needsbuffer;
303
304/*
305 * Synchronization for bwillwrite() waiters.
306 */
307static int bdirtywait;
308
309/*
310 * Definitions for the buffer free lists.
311 */
312#define BUFFER_QUEUES	5	/* number of free buffer queues */
313
314#define QUEUE_NONE	0	/* on no queue */
315#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
316#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
317#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
318#define QUEUE_EMPTY	4	/* empty buffer headers */
319#define QUEUE_SENTINEL	1024	/* not a queue index, but a sentinel marker */
320
321/* Queues for free buffers with various properties */
322static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
323#ifdef INVARIANTS
324static int bq_len[BUFFER_QUEUES];
325#endif
326
327/*
328 * Single global constant for BUF_WMESG, to avoid getting multiple references.
329 * buf_wmesg is referred from macros.
330 */
331const char *buf_wmesg = BUF_WMESG;
332
333#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
334#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
335#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
336
337static int
338sysctl_runningspace(SYSCTL_HANDLER_ARGS)
339{
340	long value;
341	int error;
342
343	value = *(long *)arg1;
344	error = sysctl_handle_long(oidp, &value, 0, req);
345	if (error != 0 || req->newptr == NULL)
346		return (error);
347	mtx_lock(&rbreqlock);
348	if (arg1 == &hirunningspace) {
349		if (value < lorunningspace)
350			error = EINVAL;
351		else
352			hirunningspace = value;
353	} else {
354		KASSERT(arg1 == &lorunningspace,
355		    ("%s: unknown arg1", __func__));
356		if (value > hirunningspace)
357			error = EINVAL;
358		else
359			lorunningspace = value;
360	}
361	mtx_unlock(&rbreqlock);
362	return (error);
363}
364
365#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
366    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
367static int
368sysctl_bufspace(SYSCTL_HANDLER_ARGS)
369{
370	long lvalue;
371	int ivalue;
372
373	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
374		return (sysctl_handle_long(oidp, arg1, arg2, req));
375	lvalue = *(long *)arg1;
376	if (lvalue > INT_MAX)
377		/* On overflow, still write out a long to trigger ENOMEM. */
378		return (sysctl_handle_long(oidp, &lvalue, 0, req));
379	ivalue = lvalue;
380	return (sysctl_handle_int(oidp, &ivalue, 0, req));
381}
382#endif
383
384/*
385 *	bqlock:
386 *
387 *	Return the appropriate queue lock based on the index.
388 */
389static inline struct mtx *
390bqlock(int qindex)
391{
392
393	if (qindex == QUEUE_DIRTY)
394		return (struct mtx *)(&bqdirty);
395	return (struct mtx *)(&bqclean);
396}
397
398/*
399 *	bdirtywakeup:
400 *
401 *	Wakeup any bwillwrite() waiters.
402 */
403static void
404bdirtywakeup(void)
405{
406	mtx_lock(&bdirtylock);
407	if (bdirtywait) {
408		bdirtywait = 0;
409		wakeup(&bdirtywait);
410	}
411	mtx_unlock(&bdirtylock);
412}
413
414/*
415 *	bdirtysub:
416 *
417 *	Decrement the numdirtybuffers count by one and wakeup any
418 *	threads blocked in bwillwrite().
419 */
420static void
421bdirtysub(void)
422{
423
424	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
425	    (lodirtybuffers + hidirtybuffers) / 2)
426		bdirtywakeup();
427}
428
429/*
430 *	bdirtyadd:
431 *
432 *	Increment the numdirtybuffers count by one and wakeup the buf
433 *	daemon if needed.
434 */
435static void
436bdirtyadd(void)
437{
438
439	/*
440	 * Only do the wakeup once as we cross the boundary.  The
441	 * buf daemon will keep running until the condition clears.
442	 */
443	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
444	    (lodirtybuffers + hidirtybuffers) / 2)
445		bd_wakeup();
446}
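/*
 * Illustrative note (not from the original source): bdirtyadd() and
 * bdirtysub() only issue wakeups as numdirtybuffers crosses the midpoint
 * (lodirtybuffers + hidirtybuffers) / 2.  With hypothetical limits of
 * lodirtybuffers = 1000 and hidirtybuffers = 4000, the buf daemon is woken
 * as the count climbs past 2500 and bwillwrite() waiters are woken as it
 * drops back through 2500; repeated increments or decrements on the same
 * side of that boundary generate no further wakeups.
 */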
447
448/*
449 *	bufspacewakeup:
450 *
451 *	Called when buffer space is potentially available for recovery.
452 *	getnewbuf() will block on this flag when it is unable to free
453 *	sufficient buffer space.  Buffer space becomes recoverable when
454 *	bp's get placed back in the queues.
455 */
456static __inline void
457bufspacewakeup(void)
458{
459	int need_wakeup, on;
460
461	/*
462	 * If someone is waiting for bufspace, wake them up.  Even
463	 * though we may not have freed the kva space yet, the waiting
464	 * process will be able to now.
465	 */
466	rw_rlock(&nblock);
467	for (;;) {
468		need_wakeup = 0;
469		on = needsbuffer;
470		if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
471			break;
472		need_wakeup = 1;
473		if (atomic_cmpset_rel_int(&needsbuffer, on,
474		    on & ~VFS_BIO_NEED_BUFSPACE))
475			break;
476	}
477	if (need_wakeup)
478		wakeup(__DEVOLATILE(void *, &needsbuffer));
479	rw_runlock(&nblock);
480}
481
482/*
483 *	bufspaceadjust:
484 *
485 *	Adjust the reported bufspace for a KVA managed buffer, possibly
486 * 	waking any waiters.
487 */
488static void
489bufspaceadjust(struct buf *bp, int bufsize)
490{
491	int diff;
492
493	KASSERT((bp->b_flags & B_MALLOC) == 0,
494	    ("bufspaceadjust: malloc buf %p", bp));
495	diff = bufsize - bp->b_bufsize;
496	if (diff < 0) {
497		atomic_subtract_long(&bufspace, -diff);
498		bufspacewakeup();
499	} else
500		atomic_add_long(&bufspace, diff);
501	bp->b_bufsize = bufsize;
502}
503
504/*
505 *	bufmallocadjust:
506 *
507 *	Adjust the reported bufspace for a malloc managed buffer, possibly
508 *	waking any waiters.
509 */
510static void
511bufmallocadjust(struct buf *bp, int bufsize)
512{
513	int diff;
514
515	KASSERT((bp->b_flags & B_MALLOC) != 0,
516	    ("bufmallocadjust: non-malloc buf %p", bp));
517	diff = bufsize - bp->b_bufsize;
518	if (diff < 0) {
519		atomic_subtract_long(&bufmallocspace, -diff);
520		bufspacewakeup();
521	} else
522		atomic_add_long(&bufmallocspace, diff);
523	bp->b_bufsize = bufsize;
524}
525
526/*
527 *	runningwakeup:
528 *
529 *	Wake up processes that are waiting on asynchronous writes to fall
530 *	below lorunningspace.
531 */
532static void
533runningwakeup(void)
534{
535
536	mtx_lock(&rbreqlock);
537	if (runningbufreq) {
538		runningbufreq = 0;
539		wakeup(&runningbufreq);
540	}
541	mtx_unlock(&rbreqlock);
542}
543
544/*
545 *	runningbufwakeup:
546 *
547 *	Decrement the outstanding write count accordingly.
548 */
549void
550runningbufwakeup(struct buf *bp)
551{
552	long space, bspace;
553
554	bspace = bp->b_runningbufspace;
555	if (bspace == 0)
556		return;
557	space = atomic_fetchadd_long(&runningbufspace, -bspace);
558	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
559	    space, bspace));
560	bp->b_runningbufspace = 0;
561	/*
562	 * Only acquire the lock and wakeup on the transition from exceeding
563	 * the threshold to falling below it.
564	 */
565	if (space < lorunningspace)
566		return;
567	if (space - bspace > lorunningspace)
568		return;
569	runningwakeup();
570}
571
572/*
573 *	bufcountadd:
574 *
575 *	Called when a buffer has been added to one of the free queues to
576 *	account for the buffer and to wakeup anyone waiting for free buffers.
577 *	This typically occurs when large amounts of metadata are being handled
578 *	by the buffer cache ( else buffer space runs out first, usually ).
579 */
580static __inline void
581bufcountadd(struct buf *bp)
582{
583	int mask, need_wakeup, old, on;
584
585	KASSERT((bp->b_flags & B_INFREECNT) == 0,
586	    ("buf %p already counted as free", bp));
587	bp->b_flags |= B_INFREECNT;
588	old = atomic_fetchadd_int(&numfreebuffers, 1);
589	KASSERT(old >= 0 && old < nbuf,
590	    ("numfreebuffers climbed to %d", old + 1));
591	mask = VFS_BIO_NEED_ANY;
592	if (numfreebuffers >= hifreebuffers)
593		mask |= VFS_BIO_NEED_FREE;
594	rw_rlock(&nblock);
595	for (;;) {
596		need_wakeup = 0;
597		on = needsbuffer;
598		if (on == 0)
599			break;
600		need_wakeup = 1;
601		if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
602			break;
603	}
604	if (need_wakeup)
605		wakeup(__DEVOLATILE(void *, &needsbuffer));
606	rw_runlock(&nblock);
607}
608
609/*
610 *	bufcountsub:
611 *
612 *	Decrement the numfreebuffers count as needed.
613 */
614static void
615bufcountsub(struct buf *bp)
616{
617	int old;
618
619	/*
620	 * Fixup numfreebuffers count.  If the buffer is invalid or not
621	 * delayed-write, the buffer was free and we must decrement
622	 * numfreebuffers.
623	 */
624	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
625		KASSERT((bp->b_flags & B_INFREECNT) != 0,
626		    ("buf %p not counted in numfreebuffers", bp));
627		bp->b_flags &= ~B_INFREECNT;
628		old = atomic_fetchadd_int(&numfreebuffers, -1);
629		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
630	}
631}
632
633/*
634 *	waitrunningbufspace()
635 *
636 *	runningbufspace is a measure of the amount of I/O currently
637 *	running.  This routine is used in async-write situations to
638 *	prevent creating huge backups of pending writes to a device.
639 *	Only asynchronous writes are governed by this function.
640 *
641 *	This does NOT turn an async write into a sync write.  It waits
642 *	for earlier writes to complete and generally returns before the
643 *	caller's write has reached the device.
644 */
645void
646waitrunningbufspace(void)
647{
648
649	mtx_lock(&rbreqlock);
650	while (runningbufspace > hirunningspace) {
651		runningbufreq = 1;
652		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
653	}
654	mtx_unlock(&rbreqlock);
655}
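/*
 * Illustrative sketch (hypothetical numbers, not from this file): with
 * hirunningspace tuned to 16 MiB, lorunningspace works out to roughly
 * 10.7 MiB (two thirds of hirunningspace, rounded up to a MAXBCACHEBUF
 * boundary).  An async bufwrite() that pushes runningbufspace above 16 MiB
 * causes later async writers to sleep in "wdrain" here, and completing
 * writes call runningbufwakeup(), which only wakes them on the transition
 * back below lorunningspace, so the pipeline drains in bulk rather than one
 * wakeup per completed buffer.
 */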
656
657
658/*
659 *	vfs_buf_test_cache:
660 *
661 *	Called when a buffer is extended.  This function clears the B_CACHE
662 *	bit if the newly extended portion of the buffer does not contain
663 *	valid data.
664 */
665static __inline
666void
667vfs_buf_test_cache(struct buf *bp,
668		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
669		  vm_page_t m)
670{
671
672	VM_OBJECT_ASSERT_LOCKED(m->object);
673	if (bp->b_flags & B_CACHE) {
674		int base = (foff + off) & PAGE_MASK;
675		if (vm_page_is_valid(m, base, size) == 0)
676			bp->b_flags &= ~B_CACHE;
677	}
678}
679
680/* Wake up the buffer daemon if necessary */
681static __inline void
682bd_wakeup(void)
683{
684
685	mtx_lock(&bdlock);
686	if (bd_request == 0) {
687		bd_request = 1;
688		wakeup(&bd_request);
689	}
690	mtx_unlock(&bdlock);
691}
692
693/*
694 * bd_speedup - speedup the buffer cache flushing code
695 */
696void
697bd_speedup(void)
698{
699	int needwake;
700
701	mtx_lock(&bdlock);
702	needwake = 0;
703	if (bd_speedupreq == 0 || bd_request == 0)
704		needwake = 1;
705	bd_speedupreq = 1;
706	bd_request = 1;
707	if (needwake)
708		wakeup(&bd_request);
709	mtx_unlock(&bdlock);
710}
711
712#ifndef NSWBUF_MIN
713#define	NSWBUF_MIN	16
714#endif
715
716#ifdef __i386__
717#define	TRANSIENT_DENOM	5
718#else
719#define	TRANSIENT_DENOM 10
720#endif
721
722/*
723 * Calculate buffer cache scaling values and reserve space for buffer
724 * headers.  This is called during low level kernel initialization and
725 * may be called more than once.  We CANNOT write to the memory area
726 * being reserved at this time.
727 */
728caddr_t
729kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
730{
731	int tuned_nbuf;
732	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
733
734	/*
735	 * physmem_est is in pages.  Convert it to kilobytes (assumes
736	 * PAGE_SIZE is >= 1K)
737	 */
738	physmem_est = physmem_est * (PAGE_SIZE / 1024);
739
740	/*
741	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
742	 * For the first 64MB of ram nominally allocate sufficient buffers to
743	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
744	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
745	 * the buffer cache we limit the eventual kva reservation to
746	 * maxbcache bytes.
747	 *
748	 * factor represents the 1/4 x ram conversion.
749	 */
750	if (nbuf == 0) {
751		int factor = 4 * BKVASIZE / 1024;
752
753		nbuf = 50;
754		if (physmem_est > 4096)
755			nbuf += min((physmem_est - 4096) / factor,
756			    65536 / factor);
757		if (physmem_est > 65536)
758			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
759			    32 * 1024 * 1024 / (factor * 5));
760
761		if (maxbcache && nbuf > maxbcache / BKVASIZE)
762			nbuf = maxbcache / BKVASIZE;
763		tuned_nbuf = 1;
764	} else
765		tuned_nbuf = 0;
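	/*
	 * Worked example with assumed values (not from the source): with
	 * BKVASIZE at 16 KiB, factor is 4 * 16384 / 1024 = 64.  For a
	 * physmem_est of 1 GiB (1048576 KiB) the tuning above selects
	 * nbuf = 50 + 65536 / 64 + (1048576 - 65536) * 2 / (64 * 5),
	 * i.e. 50 + 1024 + 6144 = 7218 buffers, before the maxbcache and
	 * maxbuf clamps below are applied.
	 */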
766
767	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
768	maxbuf = (LONG_MAX / 3) / BKVASIZE;
769	if (nbuf > maxbuf) {
770		if (!tuned_nbuf)
771			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
772			    maxbuf);
773		nbuf = maxbuf;
774	}
775
776	/*
777	 * Ideal allocation size for the transient bio submap is 10%
778	 * of the maximal space buffer map.  This roughly corresponds
779	 * to the amount of the buffer mapped for typical UFS load.
780	 *
781	 * Clip the buffer map to reserve space for the transient
782	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
783	 * maximum buffer map extent on the platform.
784	 *
785	 * Falling back to maxbuf when maxbcache is unset allows us to
786	 * avoid trimming the buffer KVA on architectures with ample
787	 * KVA space.
788	 */
789	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
790		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
791		buf_sz = (long)nbuf * BKVASIZE;
792		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
793		    (TRANSIENT_DENOM - 1)) {
794			/*
795			 * There is more KVA than memory.  Do not
796			 * adjust buffer map size, and assign the rest
797			 * of maxbuf to transient map.
798			 */
799			biotmap_sz = maxbuf_sz - buf_sz;
800		} else {
801			/*
802			 * Buffer map spans all KVA we could afford on
803			 * this platform.  Give 10% (20% on i386) of
804			 * the buffer map to the transient bio map.
805			 */
806			biotmap_sz = buf_sz / TRANSIENT_DENOM;
807			buf_sz -= biotmap_sz;
808		}
809		if (biotmap_sz / INT_MAX > MAXPHYS)
810			bio_transient_maxcnt = INT_MAX;
811		else
812			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
813		/*
814		 * Artificially limit to 1024 simultaneous in-flight I/Os
815		 * using the transient mapping.
816		 */
817		if (bio_transient_maxcnt > 1024)
818			bio_transient_maxcnt = 1024;
819		if (tuned_nbuf)
820			nbuf = buf_sz / BKVASIZE;
821	}
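	/*
	 * Illustrative numbers (assumed, not from the source): if the buffer
	 * map would take 900 MiB of a 1000 MiB maxbuf_sz, it is not below
	 * the 90% threshold, so 10% of it (90 MiB) is carved off for the
	 * transient map and buf_sz drops to 810 MiB.  With MAXPHYS at
	 * 128 KiB that yields 90 MiB / 128 KiB = 720 transient mappings,
	 * under the artificial cap of 1024 applied above.
	 */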
822
823	/*
824	 * swbufs are used as temporary holders for I/O, such as paging I/O.
825	 * We have no fewer than 16 and no more than 256.
826	 */
827	nswbuf = min(nbuf / 4, 256);
828	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
829	if (nswbuf < NSWBUF_MIN)
830		nswbuf = NSWBUF_MIN;
831
832	/*
833	 * Reserve space for the buffer cache buffers
834	 */
835	swbuf = (void *)v;
836	v = (caddr_t)(swbuf + nswbuf);
837	buf = (void *)v;
838	v = (caddr_t)(buf + nbuf);
839
840	return(v);
841}
842
843/* Initialize the buffer subsystem.  Called before use of any buffers. */
844void
845bufinit(void)
846{
847	struct buf *bp;
848	int i;
849
850	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
851	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
852	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
853	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
854	rw_init(&nblock, "needsbuffer lock");
855	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
856	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
857
858	/* next, make a null set of free lists */
859	for (i = 0; i < BUFFER_QUEUES; i++)
860		TAILQ_INIT(&bufqueues[i]);
861
862	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
863
864	/* finally, initialize each buffer header and stick on empty q */
865	for (i = 0; i < nbuf; i++) {
866		bp = &buf[i];
867		bzero(bp, sizeof *bp);
868		bp->b_flags = B_INVAL | B_INFREECNT;
869		bp->b_rcred = NOCRED;
870		bp->b_wcred = NOCRED;
871		bp->b_qindex = QUEUE_EMPTY;
872		bp->b_xflags = 0;
873		bp->b_data = bp->b_kvabase = unmapped_buf;
874		LIST_INIT(&bp->b_dep);
875		BUF_LOCKINIT(bp);
876		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
877#ifdef INVARIANTS
878		bq_len[QUEUE_EMPTY]++;
879#endif
880	}
881
882	/*
883	 * maxbufspace is the absolute maximum amount of buffer space we are
884	 * allowed to reserve in KVM and in real terms.  The absolute maximum
885	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
886	 * used by most other processes.  The differential is required to
887	 * ensure that buf_daemon is able to run when other processes might
888	 * be blocked waiting for buffer space.
889	 *
890	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
891	 * this may result in KVM fragmentation which is not handled optimally
892	 * by the system.
893	 */
894	maxbufspace = (long)nbuf * BKVASIZE;
895	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
896	lobufspace = hibufspace - MAXBCACHEBUF;
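	/*
	 * Example with assumed values (not part of the source): for
	 * nbuf = 7218, BKVASIZE = 16 KiB and MAXBCACHEBUF = 64 KiB,
	 * maxbufspace is about 112.8 MiB and hibufspace is the larger of
	 * 3/4 of that (~84.6 MiB) and maxbufspace - 640 KiB (~112.2 MiB),
	 * so hibufspace ends up just 640 KiB below maxbufspace and
	 * lobufspace sits another 64 KiB below hibufspace.
	 */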
897
898	/*
899	 * Note: The 16 MiB upper limit for hirunningspace was chosen
900	 * arbitrarily and may need further tuning. It corresponds to
901	 * 128 outstanding write IO requests (if IO size is 128 KiB),
902	 * which fits with many RAID controllers' tagged queuing limits.
903	 * The lower 1 MiB limit is the historical upper limit for
904	 * hirunningspace.
905	 */
906	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
907	    16 * 1024 * 1024), 1024 * 1024);
908	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
909
910/*
911 * Limit the amount of malloc memory since it is wired permanently into
912 * the kernel space.  Even though this is accounted for in the buffer
913 * allocation, we don't want the malloced region to grow uncontrolled.
914 * The malloc scheme improves memory utilization significantly for the
915 * average (small) directory.
916 */
917	maxbufmallocspace = hibufspace / 20;
918
919/*
920 * Reduce the chance of a deadlock occurring by limiting the number
921 * of delayed-write dirty buffers we allow to stack up.
922 */
923	hidirtybuffers = nbuf / 4 + 20;
924	dirtybufthresh = hidirtybuffers * 9 / 10;
925	numdirtybuffers = 0;
926/*
927 * To support extreme low-memory systems, make sure hidirtybuffers cannot
928 * eat up all available buffer space.  This occurs when our minimum cannot
929 * be met.  We try to size hidirtybuffers to 3/4 of our buffer space assuming
930 * BKVASIZE'd buffers.
931 */
932	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
933		hidirtybuffers >>= 1;
934	}
935	lodirtybuffers = hidirtybuffers / 2;
936
937/*
938 * Try to keep the number of free buffers in the specified range,
939 * and give special processes (e.g. like buf_daemon) access to an
940 * emergency reserve.
941 */
942	lofreebuffers = nbuf / 18 + 5;
943	hifreebuffers = 2 * lofreebuffers;
944	numfreebuffers = nbuf;
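	/*
	 * Continuing the assumed nbuf = 7218 example from above: this gives
	 * hidirtybuffers = 7218 / 4 + 20 = 1824 (which already fits within
	 * 3/4 of hibufspace, so the halving loop does not trigger),
	 * lodirtybuffers = 912, lofreebuffers = 7218 / 18 + 5 = 406 and
	 * hifreebuffers = 812.
	 */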
945
946	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
947	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
948}
949
950#ifdef INVARIANTS
951static inline void
952vfs_buf_check_mapped(struct buf *bp)
953{
954
955	KASSERT(bp->b_kvabase != unmapped_buf,
956	    ("mapped buf: b_kvabase was not updated %p", bp));
957	KASSERT(bp->b_data != unmapped_buf,
958	    ("mapped buf: b_data was not updated %p", bp));
959}
960
961static inline void
962vfs_buf_check_unmapped(struct buf *bp)
963{
964
965	KASSERT(bp->b_data == unmapped_buf,
966	    ("unmapped buf: corrupted b_data %p", bp));
967}
968
969#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
970#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
971#else
972#define	BUF_CHECK_MAPPED(bp) do {} while (0)
973#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
974#endif
975
976static void
977bpmap_qenter(struct buf *bp)
978{
979
980	BUF_CHECK_MAPPED(bp);
981
982	/*
983	 * bp->b_data is relative to bp->b_offset, but
984	 * bp->b_offset may be offset into the first page.
985	 */
986	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
987	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
988	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
989	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
990}
991
992/*
993 *	binsfree:
994 *
995 *	Insert the buffer into the appropriate free list.
996 */
997static void
998binsfree(struct buf *bp, int qindex)
999{
1000	struct mtx *olock, *nlock;
1001
1002	BUF_ASSERT_XLOCKED(bp);
1003
1004	nlock = bqlock(qindex);
1005	/* Handle delayed bremfree() processing. */
1006	if (bp->b_flags & B_REMFREE) {
1007		olock = bqlock(bp->b_qindex);
1008		mtx_lock(olock);
1009		bremfreel(bp);
1010		if (olock != nlock) {
1011			mtx_unlock(olock);
1012			mtx_lock(nlock);
1013		}
1014	} else
1015		mtx_lock(nlock);
1016
1017	if (bp->b_qindex != QUEUE_NONE)
1018		panic("binsfree: free buffer onto another queue???");
1019
1020	bp->b_qindex = qindex;
1021	if (bp->b_flags & B_AGE)
1022		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1023	else
1024		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1025#ifdef INVARIANTS
1026	bq_len[bp->b_qindex]++;
1027#endif
1028	mtx_unlock(nlock);
1029
1030	/*
1031	 * Something we can maybe free or reuse.
1032	 */
1033	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1034		bufspacewakeup();
1035
1036	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
1037		bufcountadd(bp);
1038}
1039
1040/*
1041 *	bremfree:
1042 *
1043 *	Mark the buffer for removal from the appropriate free list.
1044 *
1045 */
1046void
1047bremfree(struct buf *bp)
1048{
1049
1050	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1051	KASSERT((bp->b_flags & B_REMFREE) == 0,
1052	    ("bremfree: buffer %p already marked for delayed removal.", bp));
1053	KASSERT(bp->b_qindex != QUEUE_NONE,
1054	    ("bremfree: buffer %p not on a queue.", bp));
1055	BUF_ASSERT_XLOCKED(bp);
1056
1057	bp->b_flags |= B_REMFREE;
1058	bufcountsub(bp);
1059}
1060
1061/*
1062 *	bremfreef:
1063 *
1064 *	Force an immediate removal from a free list.  Used only in nfs when
1065 *	it abuses the b_freelist pointer.
1066 */
1067void
1068bremfreef(struct buf *bp)
1069{
1070	struct mtx *qlock;
1071
1072	qlock = bqlock(bp->b_qindex);
1073	mtx_lock(qlock);
1074	bremfreel(bp);
1075	mtx_unlock(qlock);
1076}
1077
1078/*
1079 *	bremfreel:
1080 *
1081 *	Removes a buffer from the free list, must be called with the
1082 *	correct qlock held.
1083 */
1084static void
1085bremfreel(struct buf *bp)
1086{
1087
1088	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1089	    bp, bp->b_vp, bp->b_flags);
1090	KASSERT(bp->b_qindex != QUEUE_NONE,
1091	    ("bremfreel: buffer %p not on a queue.", bp));
1092	BUF_ASSERT_XLOCKED(bp);
1093	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1094
1095	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1096#ifdef INVARIANTS
1097	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1098	    bp->b_qindex));
1099	bq_len[bp->b_qindex]--;
1100#endif
1101	bp->b_qindex = QUEUE_NONE;
1102	/*
1103	 * If this was a delayed bremfree() we only need to remove the buffer
1104	 * from the queue and return; the stats have already been updated.
1105	 */
1106	if (bp->b_flags & B_REMFREE) {
1107		bp->b_flags &= ~B_REMFREE;
1108		return;
1109	}
1110	bufcountsub(bp);
1111}
1112
1113/*
1114 *	bufkvafree:
1115 *
1116 *	Free the kva allocation for a buffer.
1117 *
1118 */
1119static void
1120bufkvafree(struct buf *bp)
1121{
1122
1123#ifdef INVARIANTS
1124	if (bp->b_kvasize == 0) {
1125		KASSERT(bp->b_kvabase == unmapped_buf &&
1126		    bp->b_data == unmapped_buf,
1127		    ("Leaked KVA space on %p", bp));
1128	} else if (buf_mapped(bp))
1129		BUF_CHECK_MAPPED(bp);
1130	else
1131		BUF_CHECK_UNMAPPED(bp);
1132#endif
1133	if (bp->b_kvasize == 0)
1134		return;
1135
1136	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
1137	atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
1138	atomic_add_int(&buffreekvacnt, 1);
1139	bp->b_data = bp->b_kvabase = unmapped_buf;
1140	bp->b_kvasize = 0;
1141}
1142
1143/*
1144 *	bufkvaalloc:
1145 *
1146 *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
1147 */
1148static int
1149bufkvaalloc(struct buf *bp, int maxsize, int gbflags)
1150{
1151	vm_offset_t addr;
1152	int error;
1153
1154	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
1155	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));
1156
1157	bufkvafree(bp);
1158
1159	addr = 0;
1160	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
1161	if (error != 0) {
1162		/*
1163		 * Buffer map is too fragmented.  Request the caller
1164		 * to defragment the map.
1165		 */
1166		atomic_add_int(&bufdefragcnt, 1);
1167		return (error);
1168	}
1169	bp->b_kvabase = (caddr_t)addr;
1170	bp->b_kvasize = maxsize;
1171	atomic_add_long(&bufkvaspace, bp->b_kvasize);
1172	if ((gbflags & GB_UNMAPPED) != 0) {
1173		bp->b_data = unmapped_buf;
1174		BUF_CHECK_UNMAPPED(bp);
1175	} else {
1176		bp->b_data = bp->b_kvabase;
1177		BUF_CHECK_MAPPED(bp);
1178	}
1179	return (0);
1180}
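/*
 * Usage note (a sketch, not taken from this file): callers that only hand a
 * buffer to a driver for I/O and never touch its contents from the CPU can
 * request an unmapped buffer, e.g.
 *
 *	bp = getblk(vp, blkno, size, 0, 0, GB_UNMAPPED);
 *
 * in which case b_data stays equal to unmapped_buf and no KVA mapping is
 * established; passing GB_UNMAPPED | GB_KVAALLOC instead reserves the KVA
 * through bufkvaalloc() while still leaving the buffer unmapped.
 */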
1181
1182/*
1183 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
1184 * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
1185 * the buffer is valid and we do not have to do anything.
1186 */
1187void
1188breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1189    int cnt, struct ucred * cred)
1190{
1191	struct buf *rabp;
1192	int i;
1193
1194	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
1195		if (inmem(vp, *rablkno))
1196			continue;
1197		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
1198
1199		if ((rabp->b_flags & B_CACHE) == 0) {
1200			if (!TD_IS_IDLETHREAD(curthread))
1201				curthread->td_ru.ru_inblock++;
1202			rabp->b_flags |= B_ASYNC;
1203			rabp->b_flags &= ~B_INVAL;
1204			rabp->b_ioflags &= ~BIO_ERROR;
1205			rabp->b_iocmd = BIO_READ;
1206			if (rabp->b_rcred == NOCRED && cred != NOCRED)
1207				rabp->b_rcred = crhold(cred);
1208			vfs_busy_pages(rabp, 0);
1209			BUF_KERNPROC(rabp);
1210			rabp->b_iooffset = dbtob(rabp->b_blkno);
1211			bstrategy(rabp);
1212		} else {
1213			brelse(rabp);
1214		}
1215	}
1216}
1217
1218/*
1219 * Entry point for bread() and breadn() via #defines in sys/buf.h.
1220 *
1221 * Get a buffer with the specified data.  Look in the cache first.  We
1222 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
1223 * is set, the buffer is valid and we do not have to do anything, see
1224 * getblk(). Also starts asynchronous I/O on read-ahead blocks.
1225 */
1226int
1227breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
1228    int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
1229{
1230	struct buf *bp;
1231	int rv = 0, readwait = 0;
1232
1233	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
1234	/*
1235	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
1236	 */
1237	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
1238	if (bp == NULL)
1239		return (EBUSY);
1240
1241	/* if not found in cache, do some I/O */
1242	if ((bp->b_flags & B_CACHE) == 0) {
1243		if (!TD_IS_IDLETHREAD(curthread))
1244			curthread->td_ru.ru_inblock++;
1245		bp->b_iocmd = BIO_READ;
1246		bp->b_flags &= ~B_INVAL;
1247		bp->b_ioflags &= ~BIO_ERROR;
1248		if (bp->b_rcred == NOCRED && cred != NOCRED)
1249			bp->b_rcred = crhold(cred);
1250		vfs_busy_pages(bp, 0);
1251		bp->b_iooffset = dbtob(bp->b_blkno);
1252		bstrategy(bp);
1253		++readwait;
1254	}
1255
1256	breada(vp, rablkno, rabsize, cnt, cred);
1257
1258	if (readwait) {
1259		rv = bufwait(bp);
1260	}
1261	return (rv);
1262}
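/*
 * Hypothetical caller sketch (not part of this file): a filesystem read path
 * built on these entry points typically looks like
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(bp->b_data + off, dst, len);
 *	bqrelse(bp);
 *
 * where bread() is the sys/buf.h wrapper around breadn_flags() above, and
 * bqrelse() keeps the now-valid buffer cached for reuse.
 */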
1263
1264/*
1265 * Write, release buffer on completion.  (Done by iodone
1266 * if async).  Do not bother writing anything if the buffer
1267 * is invalid.
1268 *
1269 * Note that we set B_CACHE here, indicating that buffer is
1270 * fully valid and thus cacheable.  This is true even of NFS
1271 * now so we set it generally.  This could be set either here
1272 * or in biodone() since the I/O is synchronous.  We put it
1273 * here.
1274 */
1275int
1276bufwrite(struct buf *bp)
1277{
1278	int oldflags;
1279	struct vnode *vp;
1280	long space;
1281	int vp_md;
1282
1283	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1284	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
1285		bp->b_flags |= B_INVAL | B_RELBUF;
1286		bp->b_flags &= ~B_CACHE;
1287		brelse(bp);
1288		return (ENXIO);
1289	}
1290	if (bp->b_flags & B_INVAL) {
1291		brelse(bp);
1292		return (0);
1293	}
1294
1295	if (bp->b_flags & B_BARRIER)
1296		barrierwrites++;
1297
1298	oldflags = bp->b_flags;
1299
1300	BUF_ASSERT_HELD(bp);
1301
1302	if (bp->b_pin_count > 0)
1303		bunpin_wait(bp);
1304
1305	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
1306	    ("FFS background buffer should not get here %p", bp));
1307
1308	vp = bp->b_vp;
1309	if (vp)
1310		vp_md = vp->v_vflag & VV_MD;
1311	else
1312		vp_md = 0;
1313
1314	/*
1315	 * Mark the buffer clean.  Increment the bufobj write count
1316	 * before bundirty() call, to prevent other thread from seeing
1317	 * empty dirty list and zero counter for writes in progress,
1318	 * falsely indicating that the bufobj is clean.
1319	 */
1320	bufobj_wref(bp->b_bufobj);
1321	bundirty(bp);
1322
1323	bp->b_flags &= ~B_DONE;
1324	bp->b_ioflags &= ~BIO_ERROR;
1325	bp->b_flags |= B_CACHE;
1326	bp->b_iocmd = BIO_WRITE;
1327
1328	vfs_busy_pages(bp, 1);
1329
1330	/*
1331	 * Normal bwrites pipeline writes
1332	 */
1333	bp->b_runningbufspace = bp->b_bufsize;
1334	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
1335
1336	if (!TD_IS_IDLETHREAD(curthread))
1337		curthread->td_ru.ru_oublock++;
1338	if (oldflags & B_ASYNC)
1339		BUF_KERNPROC(bp);
1340	bp->b_iooffset = dbtob(bp->b_blkno);
1341	bstrategy(bp);
1342
1343	if ((oldflags & B_ASYNC) == 0) {
1344		int rtval = bufwait(bp);
1345		brelse(bp);
1346		return (rtval);
1347	} else if (space > hirunningspace) {
1348		/*
1349		 * don't allow the async write to saturate the I/O
1350		 * system.  We will not deadlock here because
1351		 * we are blocking waiting for I/O that is already in-progress
1352		 * to complete. We do not block here if it is the update
1353		 * or syncer daemon trying to clean up as that can lead
1354		 * to deadlock.
1355		 */
1356		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
1357			waitrunningbufspace();
1358	}
1359
1360	return (0);
1361}
1362
1363void
1364bufbdflush(struct bufobj *bo, struct buf *bp)
1365{
1366	struct buf *nbp;
1367
1368	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
1369		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
1370		altbufferflushes++;
1371	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
1372		BO_LOCK(bo);
1373		/*
1374		 * Try to find a buffer to flush.
1375		 */
1376		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
1377			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
1378			    BUF_LOCK(nbp,
1379				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
1380				continue;
1381			if (bp == nbp)
1382				panic("bdwrite: found ourselves");
1383			BO_UNLOCK(bo);
1384			/* Don't countdeps with the bo lock held. */
1385			if (buf_countdeps(nbp, 0)) {
1386				BO_LOCK(bo);
1387				BUF_UNLOCK(nbp);
1388				continue;
1389			}
1390			if (nbp->b_flags & B_CLUSTEROK) {
1391				vfs_bio_awrite(nbp);
1392			} else {
1393				bremfree(nbp);
1394				bawrite(nbp);
1395			}
1396			dirtybufferflushes++;
1397			break;
1398		}
1399		if (nbp == NULL)
1400			BO_UNLOCK(bo);
1401	}
1402}
1403
1404/*
1405 * Delayed write. (Buffer is marked dirty).  Do not bother writing
1406 * anything if the buffer is marked invalid.
1407 *
1408 * Note that since the buffer must be completely valid, we can safely
1409 * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
1410 * biodone() in order to prevent getblk from writing the buffer
1411 * out synchronously.
1412 */
1413void
1414bdwrite(struct buf *bp)
1415{
1416	struct thread *td = curthread;
1417	struct vnode *vp;
1418	struct bufobj *bo;
1419
1420	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1421	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1422	KASSERT((bp->b_flags & B_BARRIER) == 0,
1423	    ("Barrier request in delayed write %p", bp));
1424	BUF_ASSERT_HELD(bp);
1425
1426	if (bp->b_flags & B_INVAL) {
1427		brelse(bp);
1428		return;
1429	}
1430
1431	/*
1432	 * If we have too many dirty buffers, don't create any more.
1433	 * If we are wildly over our limit, then force a complete
1434	 * cleanup. Otherwise, just keep the situation from getting
1435	 * out of control. Note that we have to avoid a recursive
1436	 * disaster and not try to clean up after our own cleanup!
1437	 */
1438	vp = bp->b_vp;
1439	bo = bp->b_bufobj;
1440	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
1441		td->td_pflags |= TDP_INBDFLUSH;
1442		BO_BDFLUSH(bo, bp);
1443		td->td_pflags &= ~TDP_INBDFLUSH;
1444	} else
1445		recursiveflushes++;
1446
1447	bdirty(bp);
1448	/*
1449	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
1450	 * true even of NFS now.
1451	 */
1452	bp->b_flags |= B_CACHE;
1453
1454	/*
1455	 * This bmap keeps the system from needing to do the bmap later,
1456	 * perhaps when the system is attempting to do a sync.  Since it
1457	 * is likely that the indirect block -- or whatever other data structure
1458	 * that the filesystem needs is still in memory now, it is a good
1459	 * thing to do this.  Note also, that if the pageout daemon is
1460	 * requesting a sync -- there might not be enough memory to do
1461	 * the bmap then...  So, this is important to do.
1462	 */
1463	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
1464		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
1465	}
1466
1467	/*
1468	 * Set the *dirty* buffer range based upon the VM system dirty
1469	 * pages.
1470	 *
1471	 * Mark the buffer pages as clean.  We need to do this here to
1472	 * satisfy the vnode_pager and the pageout daemon, so that it
1473	 * thinks that the pages have been "cleaned".  Note that since
1474	 * the pages are in a delayed write buffer -- the VFS layer
1475	 * "will" see that the pages get written out on the next sync,
1476	 * or perhaps the cluster will be completed.
1477	 */
1478	vfs_clean_pages_dirty_buf(bp);
1479	bqrelse(bp);
1480
1481	/*
1482	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
1483	 * due to the softdep code.
1484	 */
1485}
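/*
 * Hypothetical usage sketch (not from this file): in-place metadata updates
 * usually pair bread() with bdwrite(), e.g.
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error == 0) {
 *		... modify bp->b_data in place ...
 *		bdwrite(bp);
 *	}
 *
 * so the block is marked dirty and released immediately, leaving the actual
 * write to the buf daemon, a later fsync(), or the syncer.
 */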
1486
1487/*
1488 *	bdirty:
1489 *
1490 *	Turn buffer into delayed write request.  We must clear BIO_READ and
1491 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
1492 *	itself to properly update it in the dirty/clean lists.  We mark it
1493 *	B_DONE to ensure that any asynchronization of the buffer properly
1494 *	clears B_DONE ( else a panic will occur later ).
1495 *
1496 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
1497 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
1498 *	should only be called if the buffer is known-good.
1499 *
1500 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1501 *	count.
1502 *
1503 *	The buffer must be on QUEUE_NONE.
1504 */
1505void
1506bdirty(struct buf *bp)
1507{
1508
1509	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
1510	    bp, bp->b_vp, bp->b_flags);
1511	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1512	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1513	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
1514	BUF_ASSERT_HELD(bp);
1515	bp->b_flags &= ~(B_RELBUF);
1516	bp->b_iocmd = BIO_WRITE;
1517
1518	if ((bp->b_flags & B_DELWRI) == 0) {
1519		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
1520		reassignbuf(bp);
1521		bdirtyadd();
1522	}
1523}
1524
1525/*
1526 *	bundirty:
1527 *
1528 *	Clear B_DELWRI for buffer.
1529 *
1530 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1531 *	count.
1532 *
1533 *	The buffer must be on QUEUE_NONE.
1534 */
1535
1536void
1537bundirty(struct buf *bp)
1538{
1539
1540	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1541	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1542	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1543	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
1544	BUF_ASSERT_HELD(bp);
1545
1546	if (bp->b_flags & B_DELWRI) {
1547		bp->b_flags &= ~B_DELWRI;
1548		reassignbuf(bp);
1549		bdirtysub();
1550	}
1551	/*
1552	 * Since it is now being written, we can clear its deferred write flag.
1553	 */
1554	bp->b_flags &= ~B_DEFERRED;
1555}
1556
1557/*
1558 *	bawrite:
1559 *
1560 *	Asynchronous write.  Start output on a buffer, but do not wait for
1561 *	it to complete.  The buffer is released when the output completes.
1562 *
1563 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
1564 *	B_INVAL buffers.  Not us.
1565 */
1566void
1567bawrite(struct buf *bp)
1568{
1569
1570	bp->b_flags |= B_ASYNC;
1571	(void) bwrite(bp);
1572}
1573
1574/*
1575 *	babarrierwrite:
1576 *
1577 *	Asynchronous barrier write.  Start output on a buffer, but do not
1578 *	wait for it to complete.  Place a write barrier after this write so
1579 *	that this buffer and all buffers written before it are committed to
1580 *	the disk before any buffers written after this write are committed
1581 *	to the disk.  The buffer is released when the output completes.
1582 */
1583void
1584babarrierwrite(struct buf *bp)
1585{
1586
1587	bp->b_flags |= B_ASYNC | B_BARRIER;
1588	(void) bwrite(bp);
1589}
1590
1591/*
1592 *	bbarrierwrite:
1593 *
1594 *	Synchronous barrier write.  Start output on a buffer and wait for
1595 *	it to complete.  Place a write barrier after this write so that
1596 *	this buffer and all buffers written before it are committed to
1597 *	the disk before any buffers written after this write are committed
1598 *	to the disk.  The buffer is released when the output completes.
1599 */
1600int
1601bbarrierwrite(struct buf *bp)
1602{
1603
1604	bp->b_flags |= B_BARRIER;
1605	return (bwrite(bp));
1606}
1607
1608/*
1609 *	bwillwrite:
1610 *
1611 *	Called prior to the locking of any vnodes when we are expecting to
1612 *	write.  We do not want to starve the buffer cache with too many
1613 *	dirty buffers so we block here.  By blocking prior to the locking
1614 *	of any vnodes we attempt to avoid the situation where a locked vnode
1615 *	prevents the various system daemons from flushing related buffers.
1616 */
1617void
1618bwillwrite(void)
1619{
1620
1621	if (numdirtybuffers >= hidirtybuffers) {
1622		mtx_lock(&bdirtylock);
1623		while (numdirtybuffers >= hidirtybuffers) {
1624			bdirtywait = 1;
1625			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
1626			    "flswai", 0);
1627		}
1628		mtx_unlock(&bdirtylock);
1629	}
1630}
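/*
 * Hypothetical caller sketch (not from this file): write-heavy system call
 * paths call bwillwrite() before taking any vnode locks, e.g.
 *
 *	bwillwrite();
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_WRITE(vp, uio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0);
 *
 * so that a thread throttled on dirty buffers is not simultaneously holding
 * a vnode lock that the buf daemon may need in order to flush them.
 */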
1631
1632/*
1633 * Return true if we have too many dirty buffers.
1634 */
1635int
1636buf_dirty_count_severe(void)
1637{
1638
1639	return(numdirtybuffers >= hidirtybuffers);
1640}
1641
1642/*
1643 *	brelse:
1644 *
1645 *	Release a busy buffer and, if requested, free its resources.  The
1646 *	buffer will be stashed in the appropriate bufqueue[] allowing it
1647 *	to be accessed later as a cache entity or reused for other purposes.
1648 */
1649void
1650brelse(struct buf *bp)
1651{
1652	int qindex;
1653
1654	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
1655	    bp, bp->b_vp, bp->b_flags);
1656	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1657	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1658
1659	if (BUF_LOCKRECURSED(bp)) {
1660		/*
1661		 * Do not process, in particular, do not handle the
1662		 * B_INVAL/B_RELBUF and do not release to free list.
1663		 */
1664		BUF_UNLOCK(bp);
1665		return;
1666	}
1667
1668	if (bp->b_flags & B_MANAGED) {
1669		bqrelse(bp);
1670		return;
1671	}
1672
1673	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
1674		BO_LOCK(bp->b_bufobj);
1675		bp->b_vflags &= ~BV_BKGRDERR;
1676		BO_UNLOCK(bp->b_bufobj);
1677		bdirty(bp);
1678	}
1679	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
1680	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
1681		/*
1682		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
1683		 * pages from being scrapped.  If the error is anything
1684		 * other than an I/O error (EIO), assume that retrying
1685		 * is futile.
1686		 */
1687		bp->b_ioflags &= ~BIO_ERROR;
1688		bdirty(bp);
1689	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
1690	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
1691		/*
1692		 * Either a failed I/O or we were asked to free or not
1693		 * cache the buffer.
1694		 */
1695		bp->b_flags |= B_INVAL;
1696		if (!LIST_EMPTY(&bp->b_dep))
1697			buf_deallocate(bp);
1698		if (bp->b_flags & B_DELWRI)
1699			bdirtysub();
1700		bp->b_flags &= ~(B_DELWRI | B_CACHE);
1701		if ((bp->b_flags & B_VMIO) == 0) {
1702			if (bp->b_bufsize)
1703				allocbuf(bp, 0);
1704			if (bp->b_vp)
1705				brelvp(bp);
1706		}
1707	}
1708
1709	/*
1710	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
1711	 * is called with B_DELWRI set, the underlying pages may wind up
1712	 * getting freed causing a previous write (bdwrite()) to get 'lost'
1713	 * because pages associated with a B_DELWRI bp are marked clean.
1714	 *
1715	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
1716	 * if B_DELWRI is set.
1717	 */
1718	if (bp->b_flags & B_DELWRI)
1719		bp->b_flags &= ~B_RELBUF;
1720
1721	/*
1722	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
1723	 * constituted, not even NFS buffers now.  Two flags affect this.  If
1724	 * B_INVAL, the struct buf is invalidated but the VM object is kept
1725	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
1726	 *
1727	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
1728	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
1729	 * buffer is also B_INVAL because it hits the re-dirtying code above.
1730	 *
1731	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
1732	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
1733	 * the commit state and we cannot afford to lose the buffer. If the
1734	 * buffer has a background write in progress, we need to keep it
1735	 * around to prevent it from being reconstituted and starting a second
1736	 * background write.
1737	 */
1738	if ((bp->b_flags & B_VMIO)
1739	    && !(bp->b_vp->v_mount != NULL &&
1740		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
1741		 !vn_isdisk(bp->b_vp, NULL) &&
1742		 (bp->b_flags & B_DELWRI))
1743	    ) {
1744
1745		int i, j, resid;
1746		vm_page_t m;
1747		off_t foff;
1748		vm_pindex_t poff;
1749		vm_object_t obj;
1750
1751		obj = bp->b_bufobj->bo_object;
1752
1753		/*
1754		 * Get the base offset and length of the buffer.  Note that
1755		 * in the VMIO case if the buffer block size is not
1756		 * page-aligned, then the b_data pointer may not be page-aligned.
1757		 * But our b_pages[] array *IS* page aligned.
1758		 *
1759		 * block sizes less than DEV_BSIZE (usually 512) are not
1760		 * supported due to the page granularity bits (m->valid,
1761		 * m->dirty, etc...).
1762		 *
1763		 * See man buf(9) for more information
1764		 */
1765		resid = bp->b_bufsize;
1766		foff = bp->b_offset;
1767		for (i = 0; i < bp->b_npages; i++) {
1768			int had_bogus = 0;
1769
1770			m = bp->b_pages[i];
1771
1772			/*
1773			 * If we hit a bogus page, fixup *all* the bogus pages
1774			 * now.
1775			 */
1776			if (m == bogus_page) {
1777				poff = OFF_TO_IDX(bp->b_offset);
1778				had_bogus = 1;
1779
1780				VM_OBJECT_RLOCK(obj);
1781				for (j = i; j < bp->b_npages; j++) {
1782					vm_page_t mtmp;
1783					mtmp = bp->b_pages[j];
1784					if (mtmp == bogus_page) {
1785						mtmp = vm_page_lookup(obj, poff + j);
1786						if (!mtmp) {
1787							panic("brelse: page missing\n");
1788						}
1789						bp->b_pages[j] = mtmp;
1790					}
1791				}
1792				VM_OBJECT_RUNLOCK(obj);
1793
1794				if ((bp->b_flags & B_INVAL) == 0 &&
1795				    buf_mapped(bp)) {
1796					BUF_CHECK_MAPPED(bp);
1797					pmap_qenter(
1798					    trunc_page((vm_offset_t)bp->b_data),
1799					    bp->b_pages, bp->b_npages);
1800				}
1801				m = bp->b_pages[i];
1802			}
1803			if ((bp->b_flags & B_NOCACHE) ||
1804			    (bp->b_ioflags & BIO_ERROR &&
1805			     bp->b_iocmd == BIO_READ)) {
1806				int poffset = foff & PAGE_MASK;
1807				int presid = resid > (PAGE_SIZE - poffset) ?
1808					(PAGE_SIZE - poffset) : resid;
1809
1810				KASSERT(presid >= 0, ("brelse: extra page"));
1811				VM_OBJECT_WLOCK(obj);
1812				while (vm_page_xbusied(m)) {
1813					vm_page_lock(m);
1814					VM_OBJECT_WUNLOCK(obj);
1815					vm_page_busy_sleep(m, "mbncsh");
1816					VM_OBJECT_WLOCK(obj);
1817				}
1818				if (pmap_page_wired_mappings(m) == 0)
1819					vm_page_set_invalid(m, poffset, presid);
1820				VM_OBJECT_WUNLOCK(obj);
1821				if (had_bogus)
1822					printf("avoided corruption bug in bogus_page/brelse code\n");
1823			}
1824			resid -= PAGE_SIZE - (foff & PAGE_MASK);
1825			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1826		}
1827		if (bp->b_flags & (B_INVAL | B_RELBUF))
1828			vfs_vmio_release(bp);
1829
1830	} else if (bp->b_flags & B_VMIO) {
1831
1832		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1833			vfs_vmio_release(bp);
1834		}
1835
1836	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
1837		if (bp->b_bufsize != 0)
1838			allocbuf(bp, 0);
1839		if (bp->b_vp != NULL)
1840			brelvp(bp);
1841	}
1842
1843	/*
1844	 * If the buffer has junk contents, mark it B_INVAL, and eventually
1845	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
1846	 * doesn't find it.
1847	 */
1848	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
1849	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
1850		bp->b_flags |= B_INVAL;
1851	if (bp->b_flags & B_INVAL) {
1852		if (bp->b_flags & B_DELWRI)
1853			bundirty(bp);
1854		if (bp->b_vp)
1855			brelvp(bp);
1856	}
1857
1858	/* buffers with no memory */
1859	if (bp->b_bufsize == 0) {
1860		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1861		if (bp->b_vflags & BV_BKGRDINPROG)
1862			panic("losing buffer 1");
1863		if (bp->b_kvasize)
1864			qindex = QUEUE_EMPTYKVA;
1865		else
1866			qindex = QUEUE_EMPTY;
1867		bp->b_flags |= B_AGE;
1868	/* buffers with junk contents */
1869	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1870	    (bp->b_ioflags & BIO_ERROR)) {
1871		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1872		if (bp->b_vflags & BV_BKGRDINPROG)
1873			panic("losing buffer 2");
1874		qindex = QUEUE_CLEAN;
1875		bp->b_flags |= B_AGE;
1876	/* remaining buffers */
1877	} else if (bp->b_flags & B_DELWRI)
1878		qindex = QUEUE_DIRTY;
1879	else
1880		qindex = QUEUE_CLEAN;
1881
1882	binsfree(bp, qindex);
1883
1884	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1885	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1886		panic("brelse: not dirty");
1887	/* unlock */
1888	BUF_UNLOCK(bp);
1889}
1890
1891/*
1892 * Release a buffer back to the appropriate queue but do not try to free
1893 * it.  The buffer is expected to be used again soon.
1894 *
1895 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1896 * biodone() to requeue an async I/O on completion.  It is also used when
1897 * known good buffers need to be requeued but we think we may need the data
1898 * again soon.
1899 *
1900 * XXX we should be able to leave the B_RELBUF hint set on completion.
1901 */
1902void
1903bqrelse(struct buf *bp)
1904{
1905	int qindex;
1906
1907	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1908	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1909	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1910
1911	if (BUF_LOCKRECURSED(bp)) {
1912		/* do not release to free list */
1913		BUF_UNLOCK(bp);
1914		return;
1915	}
1916	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1917
1918	if (bp->b_flags & B_MANAGED) {
1919		if (bp->b_flags & B_REMFREE)
1920			bremfreef(bp);
1921		goto out;
1922	}
1923
1924	/* buffers with stale but valid contents */
1925	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
1926	    BV_BKGRDERR)) == BV_BKGRDERR) {
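		/*
		 * Delayed-write buffers, and buffers whose background write
		 * failed (BV_BKGRDERR with no write in progress), must stay
		 * on the dirty queue so their data eventually reaches the
		 * disk; clear the error so the write can be retried.
		 */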
1927		BO_LOCK(bp->b_bufobj);
1928		bp->b_vflags &= ~BV_BKGRDERR;
1929		BO_UNLOCK(bp->b_bufobj);
1930		qindex = QUEUE_DIRTY;
1931	} else {
1932		if ((bp->b_flags & B_DELWRI) == 0 &&
1933		    (bp->b_xflags & BX_VNDIRTY))
1934			panic("bqrelse: not dirty");
1935		qindex = QUEUE_CLEAN;
1936	}
1937	binsfree(bp, qindex);
1938
1939out:
1940	/* unlock */
1941	BUF_UNLOCK(bp);
1942}
1943
1944/* Give pages used by the bp back to the VM system (where possible) */
1945static void
1946vfs_vmio_release(struct buf *bp)
1947{
1948	vm_object_t obj;
1949	vm_page_t m;
1950	int i;
1951
1952	if (buf_mapped(bp)) {
1953		BUF_CHECK_MAPPED(bp);
1954		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
1955	} else
1956		BUF_CHECK_UNMAPPED(bp);
1957	obj = bp->b_bufobj->bo_object;
1958	if (obj != NULL)
1959		VM_OBJECT_WLOCK(obj);
1960	for (i = 0; i < bp->b_npages; i++) {
1961		m = bp->b_pages[i];
1962		bp->b_pages[i] = NULL;
1963		/*
1964		 * In order to keep page LRU ordering consistent, put
1965		 * everything on the inactive queue.
1966		 */
1967		vm_page_lock(m);
1968		vm_page_unwire(m, PQ_INACTIVE);
1969
1970		/*
1971		 * Might as well free the page if we can and it has
1972		 * no valid data.  We also free the page if the
1973		 * buffer was used for direct I/O
1974		 * buffer was used for direct I/O (B_DIRECT).
1975		if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
1976			if (m->wire_count == 0 && !vm_page_busied(m))
1977				vm_page_free(m);
1978		} else if (bp->b_flags & B_DIRECT)
1979			vm_page_try_to_free(m);
1980		vm_page_unlock(m);
1981	}
1982	if (obj != NULL)
1983		VM_OBJECT_WUNLOCK(obj);
1984
1985	if (bp->b_bufsize)
1986		bufspaceadjust(bp, 0);
1987	bp->b_npages = 0;
1988	bp->b_flags &= ~B_VMIO;
1989	if (bp->b_vp)
1990		brelvp(bp);
1991}
1992
1993/*
1994 * Check to see if a block at a particular lbn is available for a clustered
1995 * write.
1996 */
1997static int
1998vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
1999{
2000	struct buf *bpa;
2001	int match;
2002
2003	match = 0;
2004
2005	/* If the buf isn't in core skip it */
2006	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
2007		return (0);
2008
2009	/* If the buf is busy we don't want to wait for it */
2010	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2011		return (0);
2012
2013	/* Only cluster with valid clusterable delayed write buffers */
2014	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
2015	    (B_DELWRI | B_CLUSTEROK))
2016		goto done;
2017
2018	if (bpa->b_bufsize != size)
2019		goto done;
2020
2021	/*
2022	 * Check to see if it is in the expected place on disk and that the
2023	 * block has been mapped.
2024	 */
2025	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
2026		match = 1;
2027done:
2028	BUF_UNLOCK(bpa);
2029	return (match);
2030}
2031
2032/*
2033 *	vfs_bio_awrite:
2034 *
2035 *	Implement clustered async writes for clearing out B_DELWRI buffers.
2036 *	This is much better than the old way of writing only one buffer at
2037 *	a time.  Note that we may not be presented with the buffers in the
2038 *	correct order, so we search for the cluster in both directions.
2039 */
2040int
2041vfs_bio_awrite(struct buf *bp)
2042{
2043	struct bufobj *bo;
2044	int i;
2045	int j;
2046	daddr_t lblkno = bp->b_lblkno;
2047	struct vnode *vp = bp->b_vp;
2048	int ncl;
2049	int nwritten;
2050	int size;
2051	int maxcl;
2052	int gbflags;
2053
2054	bo = &vp->v_bufobj;
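	/*
	 * If the buffer is unmapped, ask the cluster code to keep any
	 * buffers it allocates unmapped as well.
	 */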
2055	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
2056	/*
2057	 * Right now we support clustered writing only to regular files.  If
2058	 * we find a clusterable block we could be in the middle of a cluster
2059	 * rather than at the beginning.
2060	 */
2061	if ((vp->v_type == VREG) &&
2062	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
2063	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
2064
2065		size = vp->v_mount->mnt_stat.f_iosize;
2066		maxcl = MAXPHYS / size;
2067
2068		BO_RLOCK(bo);
2069		for (i = 1; i < maxcl; i++)
2070			if (vfs_bio_clcheck(vp, size, lblkno + i,
2071			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
2072				break;
2073
2074		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
2075			if (vfs_bio_clcheck(vp, size, lblkno - j,
2076			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
2077				break;
2078		BO_RUNLOCK(bo);
2079		--j;
2080		ncl = i + j;
2081		/*
2082		 * This is a possible cluster write: ncl blocks starting at lblkno - j.
2083		 */
2084		if (ncl != 1) {
2085			BUF_UNLOCK(bp);
2086			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
2087			    gbflags);
2088			return (nwritten);
2089		}
2090	}
2091	bremfree(bp);
2092	bp->b_flags |= B_ASYNC;
2093	/*
2094	 * default (old) behavior, writing out only one block
2095	 *
2096	 * XXX returns b_bufsize instead of b_bcount for nwritten?
2097	 */
2098	nwritten = bp->b_bufsize;
2099	(void) bwrite(bp);
2100
2101	return (nwritten);
2102}
2103
2104/*
2105 * Ask the bufdaemon for help, or act as bufdaemon itself, when a
2106 * locked vnode is supplied.
2107 */
2108static void
2109getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
2110    int defrag)
2111{
2112	struct thread *td;
2113	char *waitmsg;
2114	int error, fl, flags, norunbuf;
2115
2116	mtx_assert(&bqclean, MA_OWNED);
2117
2118	if (defrag) {
2119		flags = VFS_BIO_NEED_BUFSPACE;
2120		waitmsg = "nbufkv";
2121	} else if (bufspace >= hibufspace) {
2122		waitmsg = "nbufbs";
2123		flags = VFS_BIO_NEED_BUFSPACE;
2124	} else {
2125		waitmsg = "newbuf";
2126		flags = VFS_BIO_NEED_ANY;
2127	}
2128	atomic_set_int(&needsbuffer, flags);
2129	mtx_unlock(&bqclean);
2130
2131	bd_speedup();	/* heeeelp */
2132	if ((gbflags & GB_NOWAIT_BD) != 0)
2133		return;
2134
2135	td = curthread;
2136	rw_wlock(&nblock);
2137	while ((needsbuffer & flags) != 0) {
2138		if (vp != NULL && vp->v_type != VCHR &&
2139		    (td->td_pflags & TDP_BUFNEED) == 0) {
2140			rw_wunlock(&nblock);
2141			/*
2142			 * getblk() is called with the vnode locked, and
2143			 * a majority of the dirty buffers may well
2144			 * belong to that vnode.  Flushing those
2145			 * buffers makes progress that the buf_daemon
2146			 * cannot achieve on its own, since it cannot
2147			 * lock the vnode.
2148			 */
2149			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
2150			    (td->td_pflags & TDP_NORUNNINGBUF);
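			/*
			 * (norunbuf is applied below to clear TDP_BUFNEED
			 * while restoring the thread's original
			 * TDP_NORUNNINGBUF setting.)
			 */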
2151
2152			/*
2153			 * Play bufdaemon.  The getnewbuf() function
2154			 * may be called while the thread owns the lock
2155			 * on another dirty buffer of the same
2156			 * vnode, which makes it impossible to use
2157			 * VOP_FSYNC() there, due to the buffer lock
2158			 * recursion.
2159			 */
2160			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
2161			fl = buf_flush(vp, flushbufqtarget);
2162			td->td_pflags &= norunbuf;
2163			rw_wlock(&nblock);
2164			if (fl != 0)
2165				continue;
2166			if ((needsbuffer & flags) == 0)
2167				break;
2168		}
2169		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
2170		    (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
2171		if (error != 0)
2172			break;
2173	}
2174	rw_wunlock(&nblock);
2175}
2176
2177static void
2178getnewbuf_reuse_bp(struct buf *bp, int qindex)
2179{
2180
2181	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
2182	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
2183	     bp->b_kvasize, bp->b_bufsize, qindex);
2184	mtx_assert(&bqclean, MA_NOTOWNED);
2185
2186	/*
2187	 * Note: we no longer distinguish between VMIO and non-VMIO
2188	 * buffers.
2189	 */
2190	KASSERT((bp->b_flags & B_DELWRI) == 0,
2191	    ("delwri buffer %p found in queue %d", bp, qindex));
2192
2193	if (qindex == QUEUE_CLEAN) {
2194		if (bp->b_flags & B_VMIO) {
2195			bp->b_flags &= ~B_ASYNC;
2196			vfs_vmio_release(bp);
2197		}
2198		if (bp->b_vp != NULL)
2199			brelvp(bp);
2200	}
2201
2202	/*
2203	 * Get the rest of the buffer freed up.  b_kva* is still valid
2204	 * after this operation.
2205	 */
2206
2207	if (bp->b_rcred != NOCRED) {
2208		crfree(bp->b_rcred);
2209		bp->b_rcred = NOCRED;
2210	}
2211	if (bp->b_wcred != NOCRED) {
2212		crfree(bp->b_wcred);
2213		bp->b_wcred = NOCRED;
2214	}
2215	if (!LIST_EMPTY(&bp->b_dep))
2216		buf_deallocate(bp);
2217	if (bp->b_vflags & BV_BKGRDINPROG)
2218		panic("losing buffer 3");
2219	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
2220	    bp, bp->b_vp, qindex));
2221	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
2222	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
2223
2224	if (bp->b_bufsize)
2225		allocbuf(bp, 0);
2226
2227	bp->b_flags = 0;
2228	bp->b_ioflags = 0;
2229	bp->b_xflags = 0;
2230	KASSERT((bp->b_flags & B_INFREECNT) == 0,
2231	    ("buf %p still counted as free?", bp));
2232	bp->b_vflags = 0;
2233	bp->b_vp = NULL;
2234	bp->b_blkno = bp->b_lblkno = 0;
2235	bp->b_offset = NOOFFSET;
2236	bp->b_iodone = 0;
2237	bp->b_error = 0;
2238	bp->b_resid = 0;
2239	bp->b_bcount = 0;
2240	bp->b_npages = 0;
2241	bp->b_dirtyoff = bp->b_dirtyend = 0;
2242	bp->b_bufobj = NULL;
2243	bp->b_pin_count = 0;
2244	bp->b_fsprivate1 = NULL;
2245	bp->b_fsprivate2 = NULL;
2246	bp->b_fsprivate3 = NULL;
2247
2248	LIST_INIT(&bp->b_dep);
2249}
2250
2251static int flushingbufs;
2252
2253static struct buf *
2254getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
2255{
2256	struct buf *bp, *nbp;
2257	int nqindex, qindex, pass;
2258
2259	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
2260
2261	pass = 1;
2262restart:
2263	atomic_add_int(&getnewbufrestarts, 1);
2264
2265	/*
2266	 * Set up for the scan.  If we do not have enough free buffers,
2267	 * we set up a degenerate case that immediately fails.  Note
2268	 * that if we are a specially marked process, we are allowed to
2269	 * dip into our reserves.
2270	 *
2271	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
2272	 * for the allocation of the mapped buffer.  For unmapped, the
2273	 * easiest is to start with EMPTY outright.
2274	 *
2275	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
2276	 * However, there are a number of cases (defragging, reusing, ...)
2277	 * where we cannot back up.
2278	 */
2279	nbp = NULL;
2280	mtx_lock(&bqclean);
2281	if (!defrag && unmapped) {
2282		nqindex = QUEUE_EMPTY;
2283		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2284	}
2285	if (nbp == NULL) {
2286		nqindex = QUEUE_EMPTYKVA;
2287		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2288	}
2289
2290	/*
2291	 * If there are no EMPTYKVA buffers and we are either defragging
2292	 * or reusing, locate a CLEAN buffer to free or reuse.  If
2293	 * bufspace usage is low, skip this step so we can allocate a
2294	 * new buffer.
2295	 */
2296	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
2297		nqindex = QUEUE_CLEAN;
2298		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2299	}
2300
2301	/*
2302	 * If we could not find or were not allowed to reuse a CLEAN
2303	 * buffer, check to see if it is ok to use an EMPTY buffer.
2304	 * We can only use an EMPTY buffer if allocating its KVA would
2305	 * not otherwise run us out of buffer space.  No KVA is needed
2306	 * for the unmapped allocation.
2307	 */
2308	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
2309	    metadata)) {
2310		nqindex = QUEUE_EMPTY;
2311		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2312	}
2313
2314	/*
2315	 * All available buffers might be clean; retry, ignoring the
2316	 * lobufspace limit, as a last resort.
2317	 */
2318	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
2319		nqindex = QUEUE_CLEAN;
2320		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2321	}
2322
2323	/*
2324	 * Run the scan, possibly freeing data and/or kva mappings on the
2325	 * fly as we go.
2326	 */
2327	while ((bp = nbp) != NULL) {
2328		qindex = nqindex;
2329
2330		/*
2331		 * Calculate next bp (we can only use it if we do not
2332		 * block or do other fancy things).
2333		 */
2334		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
2335			switch (qindex) {
2336			case QUEUE_EMPTY:
2337				nqindex = QUEUE_EMPTYKVA;
2338				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2339				if (nbp != NULL)
2340					break;
2341				/* FALLTHROUGH */
2342			case QUEUE_EMPTYKVA:
2343				nqindex = QUEUE_CLEAN;
2344				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2345				if (nbp != NULL)
2346					break;
2347				/* FALLTHROUGH */
2348			case QUEUE_CLEAN:
2349				if (metadata && pass == 1) {
2350					pass = 2;
2351					nqindex = QUEUE_EMPTY;
2352					nbp = TAILQ_FIRST(
2353					    &bufqueues[QUEUE_EMPTY]);
2354				}
2355				/*
2356				 * nbp is NULL.
2357				 */
2358				break;
2359			}
2360		}
2361		/*
2362		 * If we are defragging then we need a buffer with
2363		 * b_kvasize != 0.  This situation occurs when we
2364		 * have many unmapped bufs.
2365		 */
2366		if (defrag && bp->b_kvasize == 0)
2367			continue;
2368
2369		/*
2370		 * Start freeing the bp.  This is somewhat involved.  nbp
2371		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
2372		 */
2373		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2374			continue;
2375		/*
2376		 * BKGRDINPROG can only be set with the buf and bufobj
2377		 * locks both held.  We tolerate a race to clear it here.
2378		 */
2379		if (bp->b_vflags & BV_BKGRDINPROG) {
2380			BUF_UNLOCK(bp);
2381			continue;
2382		}
2383
2384		/*
2385		 * Requeue the background write buffer with error.
2386		 */
2387		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
2388			bremfreel(bp);
2389			mtx_unlock(&bqclean);
2390			bqrelse(bp);
2391			continue;
2392		}
2393
2394		KASSERT(bp->b_qindex == qindex,
2395		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
2396
2397		bremfreel(bp);
2398		mtx_unlock(&bqclean);
2399		/*
2400		 * NOTE:  nbp is now entirely invalid.  We can only restart
2401		 * the scan from this point on.
2402		 */
2403
2404		getnewbuf_reuse_bp(bp, qindex);
2405		mtx_assert(&bqclean, MA_NOTOWNED);
2406
2407		/*
2408		 * If we are defragging then free the buffer.
2409		 */
2410		if (defrag) {
2411			bp->b_flags |= B_INVAL;
2412			bufkvafree(bp);
2413			brelse(bp);
2414			defrag = 0;
2415			goto restart;
2416		}
2417
2418		/*
2419		 * Notify any waiters for the buffer lock about
2420		 * identity change by freeing the buffer.
2421		 */
2422		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
2423			bp->b_flags |= B_INVAL;
2424			bufkvafree(bp);
2425			brelse(bp);
2426			goto restart;
2427		}
2428
2429		if (metadata)
2430			break;
2431
2432		/*
2433		 * If we are overcommitted then recover the buffer and its
2434		 * KVM space.  This occurs in rare situations when multiple
2435		 * processes are blocked in getnewbuf() or allocbuf().
2436		 */
2437		if (bufspace >= hibufspace)
2438			flushingbufs = 1;
2439		if (flushingbufs && bp->b_kvasize != 0) {
2440			bp->b_flags |= B_INVAL;
2441			bufkvafree(bp);
2442			brelse(bp);
2443			goto restart;
2444		}
2445		if (bufspace < lobufspace)
2446			flushingbufs = 0;
2447		break;
2448	}
2449	return (bp);
2450}
2451
2452/*
2453 *	getnewbuf:
2454 *
2455 *	Find and initialize a new buffer header, freeing up existing buffers
2456 *	in the bufqueues as necessary.  The new buffer is returned locked.
2457 *
2458 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
2459 *	buffer away, the caller must set B_INVAL prior to calling brelse().
2460 *
2461 *	We block if:
2462 *		We have insufficient buffer headers
2463 *		We have insufficient buffer space
2464 *		buffer_arena is too fragmented ( space reservation fails )
2465 *		If we have to flush dirty buffers ( but we try to avoid this )
2466 */
2467static struct buf *
2468getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
2469    int gbflags)
2470{
2471	struct buf *bp;
2472	int defrag, metadata;
2473
2474	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2475	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2476	if (!unmapped_buf_allowed)
2477		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2478
2479	defrag = 0;
2480	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2481	    vp->v_type == VCHR)
2482		metadata = 1;
2483	else
2484		metadata = 0;
2485	/*
2486	 * We can't afford to block since we might be holding a vnode lock,
2487	 * which may prevent system daemons from running.  We deal with
2488	 * low-memory situations by proactively returning memory and running
2489	 * async I/O rather than sync I/O.
2490	 */
2491	atomic_add_int(&getnewbufcalls, 1);
2492	atomic_subtract_int(&getnewbufrestarts, 1);
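	/*
	 * (getnewbuf_scan() increments getnewbufrestarts on every pass,
	 * including the first; the subtraction above keeps the counter a
	 * count of true restarts.)
	 */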
2493restart:
2494	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
2495	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
2496	if (bp != NULL)
2497		defrag = 0;
2498
2499	/*
2500	 * If we exhausted our list, sleep as appropriate.  We may have to
2501	 * wakeup various daemons and write out some dirty buffers.
2502	 *
2503	 * Generally we are sleeping due to insufficient buffer space.
2504	 */
2505	if (bp == NULL) {
2506		mtx_assert(&bqclean, MA_OWNED);
2507		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
2508		mtx_assert(&bqclean, MA_NOTOWNED);
2509	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
2510		mtx_assert(&bqclean, MA_NOTOWNED);
2511
2512		bufkvafree(bp);
2513		atomic_add_int(&bufreusecnt, 1);
2514	} else {
2515		mtx_assert(&bqclean, MA_NOTOWNED);
2516
2517		/*
2518		 * We finally have a valid bp.  We aren't quite out of the
2519		 * woods though; we still have to reserve kva space.  In order to
2520		 * keep fragmentation sane we only allocate kva in BKVASIZE
2521		 * chunks.
2522		 */
2523		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2524
2525		if (maxsize != bp->b_kvasize &&
2526		    bufkvaalloc(bp, maxsize, gbflags)) {
2527			defrag = 1;
2528			bp->b_flags |= B_INVAL;
2529			brelse(bp);
2530			goto restart;
2531		}
2532		atomic_add_int(&bufreusecnt, 1);
2533	}
2534	return (bp);
2535}
2536
2537/*
2538 *	buf_daemon:
2539 *
2540 *	buffer flushing daemon.  Buffers are normally flushed by the
2541 *	update daemon but if it cannot keep up this process starts to
2542 *	take the load in an attempt to prevent getnewbuf() from blocking.
2543 */
2544
2545static struct kproc_desc buf_kp = {
2546	"bufdaemon",
2547	buf_daemon,
2548	&bufdaemonproc
2549};
2550SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2551
2552static int
2553buf_flush(struct vnode *vp, int target)
2554{
2555	int flushed;
2556
2557	flushed = flushbufqueues(vp, target, 0);
2558	if (flushed == 0) {
2559		/*
2560		 * Could not find any buffers without rollback
2561		 * dependencies, so just write the first one
2562		 * in the hopes of eventually making progress.
2563		 */
2564		if (vp != NULL && target > 2)
2565			target /= 2;
2566		flushbufqueues(vp, target, 1);
2567	}
2568	return (flushed);
2569}
2570
2571static void
2572buf_daemon(void)
2573{
2574	int lodirty;
2575
2576	/*
2577	 * This process needs to be suspended prior to shutdown sync.
2578	 */
2579	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2580	    SHUTDOWN_PRI_LAST);
2581
2582	/*
2583	 * This process is allowed to take the buffer cache to the limit.
2584	 */
2585	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2586	mtx_lock(&bdlock);
2587	for (;;) {
2588		bd_request = 0;
2589		mtx_unlock(&bdlock);
2590
2591		kproc_suspend_check(bufdaemonproc);
2592		lodirty = lodirtybuffers;
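		/*
		 * A bd_speedup() request lowers the target to half of the
		 * current dirty buffer count so this pass flushes more
		 * aggressively.
		 */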
2593		if (bd_speedupreq) {
2594			lodirty = numdirtybuffers / 2;
2595			bd_speedupreq = 0;
2596		}
2597		/*
2598		 * Do the flush.  Limit the amount of in-transit I/O we
2599		 * allow to build up, otherwise we would completely saturate
2600		 * the I/O system.
2601		 */
2602		while (numdirtybuffers > lodirty) {
2603			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
2604				break;
2605			kern_yield(PRI_USER);
2606		}
2607
2608		/*
2609		 * Only clear bd_request if we have reached our low water
2610		 * mark.  The buf_daemon normally waits 1 second and
2611		 * then incrementally flushes any dirty buffers that have
2612		 * built up, within reason.
2613		 *
2614		 * If we were unable to hit our low water mark and couldn't
2615		 * find any flushable buffers, we sleep for a short period
2616		 * to avoid endless loops on unlockable buffers.
2617		 */
2618		mtx_lock(&bdlock);
2619		if (numdirtybuffers <= lodirtybuffers) {
2620			/*
2621			 * We reached our low water mark, reset the
2622			 * request and sleep until we are needed again.
2623			 * The sleep is just so the suspend code works.
2624			 */
2625			bd_request = 0;
2626			/*
2627			 * Do an extra wakeup in case the dirty threshold
2628			 * changed via sysctl and the explicit transition
2629			 * out of shortfall was missed.
2630			 */
2631			bdirtywakeup();
2632			if (runningbufspace <= lorunningspace)
2633				runningwakeup();
2634			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
2635		} else {
2636			/*
2637			 * We couldn't find any flushable dirty buffers but
2638			 * still have too many dirty buffers, so we
2639			 * have to sleep and try again.  (rare)
2640			 */
2641			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
2642		}
2643	}
2644}
2645
2646/*
2647 *	flushbufqueues:
2648 *
2649 *	Try to flush a buffer in the dirty queue.  We must be careful to
2650 *	free up B_INVAL buffers instead of writing them, which NFS is
2651 *	particularly sensitive to.
2652 */
2653static int flushwithdeps = 0;
2654SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
2655    0, "Number of buffers flushed with dependencies that require rollbacks");
2656
2657static int
2658flushbufqueues(struct vnode *lvp, int target, int flushdeps)
2659{
2660	struct buf *sentinel;
2661	struct vnode *vp;
2662	struct mount *mp;
2663	struct buf *bp;
2664	int hasdeps;
2665	int flushed;
2666	int queue;
2667	int error;
2668	bool unlock;
2669
2670	flushed = 0;
2671	queue = QUEUE_DIRTY;
2672	bp = NULL;
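	/*
	 * The sentinel buf marks our position in the dirty queue; it lets
	 * the scan drop the queue mutex while a buffer is being flushed
	 * and then resume from the same place.
	 */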
2673	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2674	sentinel->b_qindex = QUEUE_SENTINEL;
2675	mtx_lock(&bqdirty);
2676	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2677	mtx_unlock(&bqdirty);
2678	while (flushed != target) {
2679		maybe_yield();
2680		mtx_lock(&bqdirty);
2681		bp = TAILQ_NEXT(sentinel, b_freelist);
2682		if (bp != NULL) {
2683			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2684			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2685			    b_freelist);
2686		} else {
2687			mtx_unlock(&bqdirty);
2688			break;
2689		}
2690		/*
2691		 * Skip sentinels inserted by other invocations of
2692		 * flushbufqueues(), taking care not to reorder them.
2693		 *
2694		 * Only flush the buffers that belong to the
2695		 * vnode locked by the curthread.
2696		 */
2697		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
2698		    bp->b_vp != lvp)) {
2699			mtx_unlock(&bqdirty);
2700			continue;
2701		}
2702		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
2703		mtx_unlock(&bqdirty);
2704		if (error != 0)
2705			continue;
2706		if (bp->b_pin_count > 0) {
2707			BUF_UNLOCK(bp);
2708			continue;
2709		}
2710		/*
2711		 * BKGRDINPROG can only be set with the buf and bufobj
2712		 * locks both held.  We tolerate a race to clear it here.
2713		 */
2714		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
2715		    (bp->b_flags & B_DELWRI) == 0) {
2716			BUF_UNLOCK(bp);
2717			continue;
2718		}
2719		if (bp->b_flags & B_INVAL) {
2720			bremfreef(bp);
2721			brelse(bp);
2722			flushed++;
2723			continue;
2724		}
2725
2726		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
2727			if (flushdeps == 0) {
2728				BUF_UNLOCK(bp);
2729				continue;
2730			}
2731			hasdeps = 1;
2732		} else
2733			hasdeps = 0;
2734		/*
2735		 * We must hold the lock on a vnode before writing
2736		 * one of its buffers.  Otherwise we may confuse the
2737		 * filesystem or, in the case of a snapshot vnode,
2738		 * deadlock the system.
2739		 *
2740		 * The lock order here is the reverse of the normal order
2741		 * of vnode lock followed by buf lock.  This is ok because
2742		 * the NOWAIT will prevent deadlock.
2743		 */
2744		vp = bp->b_vp;
2745		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2746			BUF_UNLOCK(bp);
2747			continue;
2748		}
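		/*
		 * When flushing for a specific vnode (lvp != NULL) the
		 * caller already holds its lock, so at most try-upgrade to
		 * exclusive and do not unlock it here; otherwise acquire
		 * the vnode lock ourselves and drop it after the write is
		 * issued.
		 */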
2749		if (lvp == NULL) {
2750			unlock = true;
2751			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
2752		} else {
2753			ASSERT_VOP_LOCKED(vp, "getbuf");
2754			unlock = false;
2755			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
2756			    vn_lock(vp, LK_TRYUPGRADE);
2757		}
2758		if (error == 0) {
2759			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
2760			    bp, bp->b_vp, bp->b_flags);
2761			if (curproc == bufdaemonproc) {
2762				vfs_bio_awrite(bp);
2763			} else {
2764				bremfree(bp);
2765				bwrite(bp);
2766				notbufdflushes++;
2767			}
2768			vn_finished_write(mp);
2769			if (unlock)
2770				VOP_UNLOCK(vp, 0);
2771			flushwithdeps += hasdeps;
2772			flushed++;
2773
2774			/*
2775			 * Sleeping on runningbufspace while holding
2776			 * vnode lock leads to deadlock.
2777			 */
2778			if (curproc == bufdaemonproc &&
2779			    runningbufspace > hirunningspace)
2780				waitrunningbufspace();
2781			continue;
2782		}
2783		vn_finished_write(mp);
2784		BUF_UNLOCK(bp);
2785	}
2786	mtx_lock(&bqdirty);
2787	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2788	mtx_unlock(&bqdirty);
2789	free(sentinel, M_TEMP);
2790	return (flushed);
2791}
2792
2793/*
2794 * Check to see if a block is currently memory resident.
2795 */
2796struct buf *
2797incore(struct bufobj *bo, daddr_t blkno)
2798{
2799	struct buf *bp;
2800
2801	BO_RLOCK(bo);
2802	bp = gbincore(bo, blkno);
2803	BO_RUNLOCK(bo);
2804	return (bp);
2805}
2806
2807/*
2808 * Returns true if no I/O is needed to access the
2809 * associated VM object.  This is like incore except
2810 * it also hunts around in the VM system for the data.
2811 */
2812
2813static int
2814inmem(struct vnode * vp, daddr_t blkno)
2815{
2816	vm_object_t obj;
2817	vm_offset_t toff, tinc, size;
2818	vm_page_t m;
2819	vm_ooffset_t off;
2820
2821	ASSERT_VOP_LOCKED(vp, "inmem");
2822
2823	if (incore(&vp->v_bufobj, blkno))
2824		return 1;
2825	if (vp->v_mount == NULL)
2826		return 0;
2827	obj = vp->v_object;
2828	if (obj == NULL)
2829		return (0);
2830
2831	size = PAGE_SIZE;
2832	if (size > vp->v_mount->mnt_stat.f_iosize)
2833		size = vp->v_mount->mnt_stat.f_iosize;
2834	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2835
2836	VM_OBJECT_RLOCK(obj);
2837	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2838		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2839		if (!m)
2840			goto notinmem;
2841		tinc = size;
2842		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2843			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2844		if (vm_page_is_valid(m,
2845		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2846			goto notinmem;
2847	}
2848	VM_OBJECT_RUNLOCK(obj);
2849	return 1;
2850
2851notinmem:
2852	VM_OBJECT_RUNLOCK(obj);
2853	return (0);
2854}
2855
2856/*
2857 * Set the dirty range for a buffer based on the status of the dirty
2858 * bits in the pages comprising the buffer.  The range is limited
2859 * to the size of the buffer.
2860 *
2861 * Tell the VM system that the pages associated with this buffer
2862 * are clean.  This is used for delayed writes where the data is
2863 * going to go to disk eventually without additional VM intervention.
2864 *
2865 * Note that while we only really need to clean through to b_bcount, we
2866 * just go ahead and clean through to b_bufsize.
2867 */
2868static void
2869vfs_clean_pages_dirty_buf(struct buf *bp)
2870{
2871	vm_ooffset_t foff, noff, eoff;
2872	vm_page_t m;
2873	int i;
2874
2875	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
2876		return;
2877
2878	foff = bp->b_offset;
2879	KASSERT(bp->b_offset != NOOFFSET,
2880	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
2881
2882	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
2883	vfs_drain_busy_pages(bp);
2884	vfs_setdirty_locked_object(bp);
2885	for (i = 0; i < bp->b_npages; i++) {
2886		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2887		eoff = noff;
2888		if (eoff > bp->b_offset + bp->b_bufsize)
2889			eoff = bp->b_offset + bp->b_bufsize;
2890		m = bp->b_pages[i];
2891		vfs_page_set_validclean(bp, foff, m);
2892		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2893		foff = noff;
2894	}
2895	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
2896}
2897
2898static void
2899vfs_setdirty_locked_object(struct buf *bp)
2900{
2901	vm_object_t object;
2902	int i;
2903
2904	object = bp->b_bufobj->bo_object;
2905	VM_OBJECT_ASSERT_WLOCKED(object);
2906
2907	/*
2908	 * Only scan for modified pages when the object is flagged as
2909	 * possibly being dirty (OBJ_MIGHTBEDIRTY).
2910	 */
2911	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
2912		vm_offset_t boffset;
2913		vm_offset_t eoffset;
2914
2915		/*
2916		 * test the pages to see if they have been modified directly
2917		 * by users through the VM system.
2918		 */
2919		for (i = 0; i < bp->b_npages; i++)
2920			vm_page_test_dirty(bp->b_pages[i]);
2921
2922		/*
2923		 * Calculate the encompassing dirty range, boffset and eoffset,
2924		 * (eoffset - boffset) bytes.
2925		 */
2926
2927		for (i = 0; i < bp->b_npages; i++) {
2928			if (bp->b_pages[i]->dirty)
2929				break;
2930		}
2931		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2932
2933		for (i = bp->b_npages - 1; i >= 0; --i) {
2934			if (bp->b_pages[i]->dirty) {
2935				break;
2936			}
2937		}
2938		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2939
2940		/*
2941		 * Fit it to the buffer.
2942		 */
2943
2944		if (eoffset > bp->b_bcount)
2945			eoffset = bp->b_bcount;
2946
2947		/*
2948		 * If we have a good dirty range, merge with the existing
2949		 * dirty range.
2950		 */
2951
2952		if (boffset < eoffset) {
2953			if (bp->b_dirtyoff > boffset)
2954				bp->b_dirtyoff = boffset;
2955			if (bp->b_dirtyend < eoffset)
2956				bp->b_dirtyend = eoffset;
2957		}
2958	}
2959}
2960
2961/*
2962 * Allocate the KVA mapping for an existing buffer.
2963 * If an unmapped buffer is provided but a mapped buffer is requested,
2964 * also take care to properly set up the mappings between pages and KVA.
2965 */
2966static void
2967bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
2968{
2969	struct buf *scratch_bp;
2970	int bsize, maxsize, need_mapping, need_kva;
2971	off_t offset;
2972
2973	need_mapping = bp->b_data == unmapped_buf &&
2974	    (gbflags & GB_UNMAPPED) == 0;
2975	need_kva = bp->b_kvabase == unmapped_buf &&
2976	    bp->b_data == unmapped_buf &&
2977	    (gbflags & GB_KVAALLOC) != 0;
2978	if (!need_mapping && !need_kva)
2979		return;
2980
2981	BUF_CHECK_UNMAPPED(bp);
2982
2983	if (need_mapping && bp->b_kvabase != unmapped_buf) {
2984		/*
2985		 * Buffer is not mapped, but the KVA was already
2986		 * reserved at the time of the instantiation.  Use the
2987		 * allocated space.
2988		 */
2989		goto has_addr;
2990	}
2991
2992	/*
2993	 * Calculate the amount of the address space we would reserve
2994	 * if the buffer was mapped.
2995	 */
2996	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
2997	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
2998	offset = blkno * bsize;
2999	maxsize = size + (offset & PAGE_MASK);
3000	maxsize = imax(maxsize, bsize);
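	/*
	 * (The mapping must cover the requested size plus the offset of
	 * the block within its first page, and be at least one block.)
	 */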
3001
3002mapping_loop:
3003	if (bufkvaalloc(bp, maxsize, gbflags)) {
3004		/*
3005		 * Request defragmentation.  getnewbuf() hands us the
3006		 * allocated space via the scratch buffer's KVA.
3007		 */
3008		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
3009		    (GB_UNMAPPED | GB_KVAALLOC));
3010		if (scratch_bp == NULL) {
3011			if ((gbflags & GB_NOWAIT_BD) != 0) {
3012				/*
3013				 * XXXKIB: defragmentation cannot
3014				 * succeed, not sure what else to do.
3015				 */
3016				panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
3017			}
3018			atomic_add_int(&mappingrestarts, 1);
3019			goto mapping_loop;
3020		}
3021		KASSERT(scratch_bp->b_kvabase != unmapped_buf,
3022		    ("scratch bp has no KVA %p", scratch_bp));
3023		/* Grab pointers. */
3024		bp->b_kvabase = scratch_bp->b_kvabase;
3025		bp->b_kvasize = scratch_bp->b_kvasize;
3026		bp->b_data = scratch_bp->b_data;
3027
3028		/* Get rid of the scratch buffer. */
3029		scratch_bp->b_kvasize = 0;
3030		scratch_bp->b_flags |= B_INVAL;
3031		scratch_bp->b_data = scratch_bp->b_kvabase = unmapped_buf;
3032		brelse(scratch_bp);
3033	}
3034has_addr:
3035	if (need_mapping) {
3036		/* b_offset is handled by bpmap_qenter. */
3037		bp->b_data = bp->b_kvabase;
3038		BUF_CHECK_MAPPED(bp);
3039		bpmap_qenter(bp);
3040	}
3041}
3042
3043/*
3044 *	getblk:
3045 *
3046 *	Get a block given a specified block and offset into a file/device.
3047 *	The buffer's B_DONE bit will be cleared on return, making it almost
3048 * 	ready for an I/O initiation.  B_INVAL may or may not be set on
3049 *	return.  The caller should clear B_INVAL prior to initiating a
3050 *	READ.
3051 *
3052 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
3053 *	an existing buffer.
3054 *
3055 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
3056 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
3057 *	and then cleared based on the backing VM.  If the previous buffer is
3058 *	non-0-sized but invalid, B_CACHE will be cleared.
3059 *
3060 *	If getblk() must create a new buffer, the new buffer is returned with
3061 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
3062 *	case it is returned with B_INVAL clear and B_CACHE set based on the
3063 *	backing VM.
3064 *
3065 *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
3066 *	B_CACHE bit is clear.
3067 *
3068 *	What this means, basically, is that the caller should use B_CACHE to
3069 *	determine whether the buffer is fully valid or not and should clear
3070 *	B_INVAL prior to issuing a read.  If the caller intends to validate
3071 *	the buffer by loading its data area with something, the caller needs
3072 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
3073 *	the caller should set B_CACHE ( as an optimization ), else the caller
3074 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
3075 *	a write attempt or if it was a successful read.  If the caller
3076 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
3077 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
3078 */
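/*
 * A minimal usage sketch (mirroring the bread() path earlier in this
 * file): obtain the block, issue a read only when B_CACHE is clear, and
 * release the buffer when done.  Illustrative only; vp, lblkno, bsize
 * and error are the caller's, and credentials and error handling are
 * omitted.
 *
 *	bp = getblk(vp, lblkno, bsize, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		bp->b_iooffset = dbtob(bp->b_blkno);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *	}
 *	... use bp->b_data ...
 *	brelse(bp);
 */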
3079struct buf *
3080getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
3081    int flags)
3082{
3083	struct buf *bp;
3084	struct bufobj *bo;
3085	int bsize, error, maxsize, vmio;
3086	off_t offset;
3087
3088	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
3089	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
3090	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
3091	ASSERT_VOP_LOCKED(vp, "getblk");
3092	if (size > MAXBCACHEBUF)
3093		panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
3094		    MAXBCACHEBUF);
3095	if (!unmapped_buf_allowed)
3096		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3097
3098	bo = &vp->v_bufobj;
3099loop:
3100	BO_RLOCK(bo);
3101	bp = gbincore(bo, blkno);
3102	if (bp != NULL) {
3103		int lockflags;
3104		/*
3105		 * Buffer is in-core.  If the buffer is neither busy nor managed,
3106		 * it must be on a queue.
3107		 */
3108		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
3109
3110		if (flags & GB_LOCK_NOWAIT)
3111			lockflags |= LK_NOWAIT;
3112
3113		error = BUF_TIMELOCK(bp, lockflags,
3114		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
3115
3116		/*
3117		 * If we slept and got the lock we have to restart in case
3118		 * the buffer changed identities.
3119		 */
3120		if (error == ENOLCK)
3121			goto loop;
3122		/* We timed out or were interrupted. */
3123		else if (error)
3124			return (NULL);
3125		/* If recursed, assume caller knows the rules. */
3126		else if (BUF_LOCKRECURSED(bp))
3127			goto end;
3128
3129		/*
3130		 * The buffer is locked.  B_CACHE is cleared if the buffer is
3131		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
3132		 * and for a VMIO buffer B_CACHE is adjusted according to the
3133		 * backing VM cache.
3134		 */
3135		if (bp->b_flags & B_INVAL)
3136			bp->b_flags &= ~B_CACHE;
3137		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
3138			bp->b_flags |= B_CACHE;
3139		if (bp->b_flags & B_MANAGED)
3140			MPASS(bp->b_qindex == QUEUE_NONE);
3141		else
3142			bremfree(bp);
3143
3144		/*
3145		 * Check for size inconsistencies in the non-VMIO case.
3146		 */
3147		if (bp->b_bcount != size) {
3148			if ((bp->b_flags & B_VMIO) == 0 ||
3149			    (size > bp->b_kvasize)) {
3150				if (bp->b_flags & B_DELWRI) {
3151					/*
3152					 * If the buffer is pinned and the caller
3153					 * does not want to sleep waiting for it
3154					 * to be unpinned, bail out.
3155					 */
3156					if (bp->b_pin_count > 0) {
3157						if (flags & GB_LOCK_NOWAIT) {
3158							bqrelse(bp);
3159							return (NULL);
3160						} else {
3161							bunpin_wait(bp);
3162						}
3163					}
3164					bp->b_flags |= B_NOCACHE;
3165					bwrite(bp);
3166				} else {
3167					if (LIST_EMPTY(&bp->b_dep)) {
3168						bp->b_flags |= B_RELBUF;
3169						brelse(bp);
3170					} else {
3171						bp->b_flags |= B_NOCACHE;
3172						bwrite(bp);
3173					}
3174				}
3175				goto loop;
3176			}
3177		}
3178
3179		/*
3180		 * Handle the case of an unmapped buffer which should
3181		 * become mapped, or of a buffer for which KVA
3182		 * reservation is requested.
3183		 */
3184		bp_unmapped_get_kva(bp, blkno, size, flags);
3185
3186		/*
3187		 * If the size is inconsistent in the VMIO case, we can resize
3188		 * the buffer.  This might lead to B_CACHE getting set or
3189		 * cleared.  If the size has not changed, B_CACHE remains
3190		 * unchanged from its previous state.
3191		 */
3192		if (bp->b_bcount != size)
3193			allocbuf(bp, size);
3194
3195		KASSERT(bp->b_offset != NOOFFSET,
3196		    ("getblk: no buffer offset"));
3197
3198		/*
3199		 * A buffer with B_DELWRI set and B_CACHE clear must
3200		 * be committed before we can return the buffer in
3201		 * order to prevent the caller from issuing a read
3202		 * ( due to B_CACHE not being set ) and overwriting
3203		 * it.
3204		 *
3205		 * Most callers, including NFS and FFS, need this to
3206		 * operate properly either because they assume they
3207		 * can issue a read if B_CACHE is not set, or because
3208		 * ( for example ) an uncached B_DELWRI might loop due
3209		 * to softupdates re-dirtying the buffer.  In the latter
3210		 * case, B_CACHE is set after the first write completes,
3211		 * preventing further loops.
3212		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
3213		 * above while extending the buffer, we cannot allow the
3214		 * buffer to remain with B_CACHE set after the write
3215		 * completes or it will represent a corrupt state.  To
3216		 * deal with this we set B_NOCACHE to scrap the buffer
3217		 * after the write.
3218		 *
3219		 * We might be able to do something fancy, like setting
3220		 * B_CACHE in bwrite() except if B_DELWRI is already set,
3221		 * so the below call doesn't set B_CACHE, but that gets real
3222		 * confusing.  This is much easier.
3223		 */
3224
3225		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
3226			bp->b_flags |= B_NOCACHE;
3227			bwrite(bp);
3228			goto loop;
3229		}
3230		bp->b_flags &= ~B_DONE;
3231	} else {
3232		/*
3233		 * Buffer is not in-core, create new buffer.  The buffer
3234		 * returned by getnewbuf() is locked.  Note that the returned
3235		 * buffer is also considered valid (not marked B_INVAL).
3236		 */
3237		BO_RUNLOCK(bo);
3238		/*
3239		 * If the user does not want us to create the buffer, bail out
3240		 * here.
3241		 */
3242		if (flags & GB_NOCREAT)
3243			return NULL;
3244		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
3245			return NULL;
3246
3247		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
3248		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
3249		offset = blkno * bsize;
3250		vmio = vp->v_object != NULL;
3251		if (vmio) {
3252			maxsize = size + (offset & PAGE_MASK);
3253		} else {
3254			maxsize = size;
3255			/* Do not allow non-VMIO unmapped buffers. */
3256			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3257		}
3258		maxsize = imax(maxsize, bsize);
3259
3260		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
3261		if (bp == NULL) {
3262			if (slpflag || slptimeo)
3263				return NULL;
3264			goto loop;
3265		}
3266
3267		/*
3268		 * This code is used to make sure that a buffer is not
3269		 * created while the getnewbuf routine is blocked.
3270		 * This can be a problem whether the vnode is locked or not.
3271		 * If the buffer is created out from under us, we have to
3272		 * throw away the one we just created.
3273		 *
3274		 * Note: this must occur before we associate the buffer
3275		 * with the vp especially considering limitations in
3276		 * the splay tree implementation when dealing with duplicate
3277		 * lblkno's.
3278		 */
3279		BO_LOCK(bo);
3280		if (gbincore(bo, blkno)) {
3281			BO_UNLOCK(bo);
3282			bp->b_flags |= B_INVAL;
3283			brelse(bp);
3284			goto loop;
3285		}
3286
3287		/*
3288		 * Insert the buffer into the hash, so that it can
3289		 * be found by incore.
3290		 */
3291		bp->b_blkno = bp->b_lblkno = blkno;
3292		bp->b_offset = offset;
3293		bgetvp(vp, bp);
3294		BO_UNLOCK(bo);
3295
3296		/*
3297		 * Set the B_VMIO bit and let allocbuf() grow the buffer.  Since the
3298		 * buffer size starts out as 0, B_CACHE will be set by
3299		 * allocbuf() for the VMIO case prior to it testing the
3300		 * backing store for validity.
3301		 */
3302
3303		if (vmio) {
3304			bp->b_flags |= B_VMIO;
3305			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
3306			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
3307			    bp, vp->v_object, bp->b_bufobj->bo_object));
3308		} else {
3309			bp->b_flags &= ~B_VMIO;
3310			KASSERT(bp->b_bufobj->bo_object == NULL,
3311			    ("ARGH! has b_bufobj->bo_object %p %p\n",
3312			    bp, bp->b_bufobj->bo_object));
3313			BUF_CHECK_MAPPED(bp);
3314		}
3315
3316		allocbuf(bp, size);
3317		bp->b_flags &= ~B_DONE;
3318	}
3319	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3320	BUF_ASSERT_HELD(bp);
3321end:
3322	KASSERT(bp->b_bufobj == bo,
3323	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3324	return (bp);
3325}
3326
3327/*
3328 * Get an empty, disassociated buffer of given size.  The buffer is initially
3329 * set to B_INVAL.
3330 */
3331struct buf *
3332geteblk(int size, int flags)
3333{
3334	struct buf *bp;
3335	int maxsize;
3336
3337	maxsize = (size + BKVAMASK) & ~BKVAMASK;
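	/*
	 * Round the KVA reservation up to a BKVASIZE multiple (BKVAMASK is
	 * BKVASIZE - 1) and retry until getnewbuf() succeeds; a thread
	 * already doing bufdaemon work (TDP_BUFNEED) gives up when
	 * GB_NOWAIT_BD is set rather than waiting.
	 */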
3338	while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
3339		if ((flags & GB_NOWAIT_BD) &&
3340		    (curthread->td_pflags & TDP_BUFNEED) != 0)
3341			return (NULL);
3342	}
3343	allocbuf(bp, size);
3344	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
3345	BUF_ASSERT_HELD(bp);
3346	return (bp);
3347}
3348
3349/*
3350 * This code constitutes the buffer memory from either anonymous system
3351 * memory (in the case of non-VMIO operations) or from an associated
3352 * VM object (in the case of VMIO operations).  This code is able to
3353 * resize a buffer up or down.
3354 *
3355 * Note that this code is tricky, and has many complications to resolve
3356 * deadlock or inconsistent data situations.  Tread lightly!!!
3357 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3358 * the caller.  Calling this code willy nilly can result in the loss of data.
3359 *
3360 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3361 * B_CACHE for the non-VMIO case.
3362 */
3363
3364int
3365allocbuf(struct buf *bp, int size)
3366{
3367	int newbsize, mbsize;
3368	int i;
3369
3370	BUF_ASSERT_HELD(bp);
3371
3372	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
3373		panic("allocbuf: buffer too small");
3374
3375	if ((bp->b_flags & B_VMIO) == 0) {
3376		caddr_t origbuf;
3377		int origbufsize;
3378		/*
3379		 * Just get anonymous memory from the kernel.  Don't
3380		 * mess with B_CACHE.
3381		 */
3382		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3383		if (bp->b_flags & B_MALLOC)
3384			newbsize = mbsize;
3385		else
3386			newbsize = round_page(size);
3387
3388		if (newbsize < bp->b_bufsize) {
3389			/*
3390			 * malloced buffers are not shrunk
3391			 */
3392			if (bp->b_flags & B_MALLOC) {
3393				if (newbsize) {
3394					bp->b_bcount = size;
3395				} else {
3396					free(bp->b_data, M_BIOBUF);
3397					bufmallocadjust(bp, 0);
3398					bp->b_data = bp->b_kvabase;
3399					bp->b_bcount = 0;
3400					bp->b_flags &= ~B_MALLOC;
3401				}
3402				return 1;
3403			}
3404			vm_hold_free_pages(bp, newbsize);
3405		} else if (newbsize > bp->b_bufsize) {
3406			/*
3407			 * We only use malloced memory on the first allocation,
3408			 * and revert to page-allocated memory when the buffer
3409			 * grows.
3410			 */
3411			/*
3412			 * There is a potential SMP race here that could lead
3413			 * to bufmallocspace slightly passing the max.  It
3414			 * is probably extremely rare and not worth worrying
3415			 * over.
3416			 */
3417			if ((bufmallocspace < maxbufmallocspace) &&
3418				(bp->b_bufsize == 0) &&
3419				(mbsize <= PAGE_SIZE/2)) {
3420
3421				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
3422				bp->b_bcount = size;
3423				bp->b_flags |= B_MALLOC;
3424				bufmallocadjust(bp, mbsize);
3425				return 1;
3426			}
3427			origbuf = NULL;
3428			origbufsize = 0;
3429			/*
3430			 * If the buffer is growing on its other-than-first
3431			 * allocation then we revert to the page-allocation
3432			 * scheme.
3433			 */
3434			if (bp->b_flags & B_MALLOC) {
3435				origbuf = bp->b_data;
3436				origbufsize = bp->b_bufsize;
3437				bp->b_data = bp->b_kvabase;
3438				bufmallocadjust(bp, 0);
3439				bp->b_flags &= ~B_MALLOC;
3440				newbsize = round_page(newbsize);
3441			}
3442			vm_hold_load_pages(
3443			    bp,
3444			    (vm_offset_t) bp->b_data + bp->b_bufsize,
3445			    (vm_offset_t) bp->b_data + newbsize);
3446			if (origbuf) {
3447				bcopy(origbuf, bp->b_data, origbufsize);
3448				free(origbuf, M_BIOBUF);
3449			}
3450		}
3451	} else {
3452		int desiredpages;
3453
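		/*
		 * Round the requested size up to a DEV_BSIZE multiple and
		 * work out how many pages are needed to back it, allowing
		 * for the buffer starting part-way into its first page.
		 */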
3454		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3455		desiredpages = (size == 0) ? 0 :
3456			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3457
3458		if (bp->b_flags & B_MALLOC)
3459			panic("allocbuf: VMIO buffer can't be malloced");
3460		/*
3461		 * Set B_CACHE initially if buffer is 0 length or will become
3462		 * 0-length.
3463		 */
3464		if (size == 0 || bp->b_bufsize == 0)
3465			bp->b_flags |= B_CACHE;
3466
3467		if (newbsize < bp->b_bufsize) {
3468			/*
3469			 * DEV_BSIZE aligned new buffer size is less than the
3470			 * DEV_BSIZE aligned existing buffer size.  Figure out
3471			 * if we have to remove any pages.
3472			 */
3473			if (desiredpages < bp->b_npages) {
3474				vm_page_t m;
3475
3476				if (buf_mapped(bp)) {
3477					BUF_CHECK_MAPPED(bp);
3478					pmap_qremove((vm_offset_t)trunc_page(
3479					    (vm_offset_t)bp->b_data) +
3480					    (desiredpages << PAGE_SHIFT),
3481					    (bp->b_npages - desiredpages));
3482				} else
3483					BUF_CHECK_UNMAPPED(bp);
3484				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3485				for (i = desiredpages; i < bp->b_npages; i++) {
3486					/*
3487					 * the page is not freed here -- it
3488					 * is the responsibility of
3489					 * vnode_pager_setsize
3490					 */
3491					m = bp->b_pages[i];
3492					KASSERT(m != bogus_page,
3493					    ("allocbuf: bogus page found"));
3494					while (vm_page_sleep_if_busy(m,
3495					    "biodep"))
3496						continue;
3497
3498					bp->b_pages[i] = NULL;
3499					vm_page_lock(m);
3500					vm_page_unwire(m, PQ_INACTIVE);
3501					vm_page_unlock(m);
3502				}
3503				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3504				bp->b_npages = desiredpages;
3505			}
3506		} else if (size > bp->b_bcount) {
3507			/*
3508			 * We are growing the buffer, possibly in a
3509			 * byte-granular fashion.
3510			 */
3511			vm_object_t obj;
3512			vm_offset_t toff;
3513			vm_offset_t tinc;
3514
3515			/*
3516			 * Step 1, bring in the VM pages from the object,
3517			 * allocating them if necessary.  We must clear
3518			 * B_CACHE if these pages are not valid for the
3519			 * range covered by the buffer.
3520			 */
3521
3522			obj = bp->b_bufobj->bo_object;
3523
3524			VM_OBJECT_WLOCK(obj);
3525			while (bp->b_npages < desiredpages) {
3526				vm_page_t m;
3527
3528				/*
3529				 * We must allocate system pages since blocking
3530				 * here could interfere with paging I/O, no
3531				 * matter which process we are.
3532				 *
3533				 * Only exclusive busy can be tested here.
3534				 * Blocking on shared busy might lead to
3535				 * deadlocks once allocbuf() is called after
3536				 * the pages have been busied by vfs_busy_pages().
3537				 */
3538				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
3539				    bp->b_npages, VM_ALLOC_NOBUSY |
3540				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3541				    VM_ALLOC_IGN_SBUSY |
3542				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
3543				if (m->valid == 0)
3544					bp->b_flags &= ~B_CACHE;
3545				bp->b_pages[bp->b_npages] = m;
3546				++bp->b_npages;
3547			}
3548
3549			/*
3550			 * Step 2.  We've loaded the pages into the buffer,
3551			 * we have to figure out if we can still have B_CACHE
3552			 * set.  Note that B_CACHE is set according to the
3553			 * byte-granular range ( bcount and size ), new the
3554			 * byte-granular range ( bcount and size ), not the
3555			 *
3556			 * The VM test is against m->valid, which is DEV_BSIZE
3557			 * aligned.  Needless to say, the validity of the data
3558			 * needs to also be DEV_BSIZE aligned.  Note that this
3559			 * fails with NFS if the server or some other client
3560			 * extends the file's EOF.  If our buffer is resized,
3561			 * B_CACHE may remain set! XXX
3562			 */
3563
3564			toff = bp->b_bcount;
3565			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
3566
3567			while ((bp->b_flags & B_CACHE) && toff < size) {
3568				vm_pindex_t pi;
3569
3570				if (tinc > (size - toff))
3571					tinc = size - toff;
3572
3573				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
3574				    PAGE_SHIFT;
3575
3576				vfs_buf_test_cache(
3577				    bp,
3578				    bp->b_offset,
3579				    toff,
3580				    tinc,
3581				    bp->b_pages[pi]
3582				);
3583				toff += tinc;
3584				tinc = PAGE_SIZE;
3585			}
3586			VM_OBJECT_WUNLOCK(obj);
3587
3588			/*
3589			 * Step 3, fixup the KVA pmap.
3590			 */
3591			if (buf_mapped(bp))
3592				bpmap_qenter(bp);
3593			else
3594				BUF_CHECK_UNMAPPED(bp);
3595		}
3596	}
3597	/* Record changes in allocation size. */
3598	if (bp->b_bufsize != newbsize)
3599		bufspaceadjust(bp, newbsize);
3600	bp->b_bcount = size;		/* requested buffer size. */
3601	return 1;
3602}
3603
3604extern int inflight_transient_maps;
3605
3606void
3607biodone(struct bio *bp)
3608{
3609	struct mtx *mtxp;
3610	void (*done)(struct bio *);
3611	vm_offset_t start, end;
3612
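	/*
	 * The data was mapped into the transient KVA arena only for the
	 * duration of the I/O; tear down the temporary mapping and return
	 * the KVA before completing the bio.
	 */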
3613	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3614		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
3615		bp->bio_flags |= BIO_UNMAPPED;
3616		start = trunc_page((vm_offset_t)bp->bio_data);
3617		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3618		bp->bio_data = unmapped_buf;
3619		pmap_qremove(start, OFF_TO_IDX(end - start));
3620		vmem_free(transient_arena, start, end - start);
3621		atomic_add_int(&inflight_transient_maps, -1);
3622	}
3623	done = bp->bio_done;
3624	if (done == NULL) {
3625		mtxp = mtx_pool_find(mtxpool_sleep, bp);
3626		mtx_lock(mtxp);
3627		bp->bio_flags |= BIO_DONE;
3628		wakeup(bp);
3629		mtx_unlock(mtxp);
3630	} else {
3631		bp->bio_flags |= BIO_DONE;
3632		done(bp);
3633	}
3634}
3635
3636/*
3637 * Wait for a BIO to finish.
3638 */
3639int
3640biowait(struct bio *bp, const char *wchan)
3641{
3642	struct mtx *mtxp;
3643
3644	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3645	mtx_lock(mtxp);
3646	while ((bp->bio_flags & BIO_DONE) == 0)
3647		msleep(bp, mtxp, PRIBIO, wchan, 0);
3648	mtx_unlock(mtxp);
3649	if (bp->bio_error != 0)
3650		return (bp->bio_error);
3651	if (!(bp->bio_flags & BIO_ERROR))
3652		return (0);
3653	return (EIO);
3654}
3655
3656void
3657biofinish(struct bio *bp, struct devstat *stat, int error)
3658{
3659
3660	if (error) {
3661		bp->bio_error = error;
3662		bp->bio_flags |= BIO_ERROR;
3663	}
3664	if (stat != NULL)
3665		devstat_end_transaction_bio(stat, bp);
3666	biodone(bp);
3667}
3668
3669/*
3670 *	bufwait:
3671 *
3672 *	Wait for buffer I/O completion, returning error status.  The buffer
3673 *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3674 *	error and cleared.
3675 */
3676int
3677bufwait(struct buf *bp)
3678{
3679	if (bp->b_iocmd == BIO_READ)
3680		bwait(bp, PRIBIO, "biord");
3681	else
3682		bwait(bp, PRIBIO, "biowr");
3683	if (bp->b_flags & B_EINTR) {
3684		bp->b_flags &= ~B_EINTR;
3685		return (EINTR);
3686	}
3687	if (bp->b_ioflags & BIO_ERROR) {
3688		return (bp->b_error ? bp->b_error : EIO);
3689	} else {
3690		return (0);
3691	}
3692}
3693
3694 /*
3695  * Call back function from struct bio back up to struct buf.
3696  */
3697static void
3698bufdonebio(struct bio *bip)
3699{
3700	struct buf *bp;
3701
3702	bp = bip->bio_caller2;
3703	bp->b_resid = bip->bio_resid;
3704	bp->b_ioflags = bip->bio_flags;
3705	bp->b_error = bip->bio_error;
3706	if (bp->b_error)
3707		bp->b_ioflags |= BIO_ERROR;
3708	bufdone(bp);
3709	g_destroy_bio(bip);
3710}
3711
3712void
3713dev_strategy(struct cdev *dev, struct buf *bp)
3714{
3715	struct cdevsw *csw;
3716	int ref;
3717
3718	KASSERT(dev->si_refcount > 0,
3719	    ("dev_strategy on un-referenced struct cdev *(%s) %p",
3720	    devtoname(dev), dev));
3721
3722	csw = dev_refthread(dev, &ref);
3723	dev_strategy_csw(dev, csw, bp);
3724	dev_relthread(dev, ref);
3725}
3726
3727void
3728dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
3729{
3730	struct bio *bip;
3731
3732	KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
3733	    ("b_iocmd botch"));
3734	KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
3735	    dev->si_threadcount > 0,
3736	    ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
3737	    dev));
3738	if (csw == NULL) {
3739		bp->b_error = ENXIO;
3740		bp->b_ioflags = BIO_ERROR;
3741		bufdone(bp);
3742		return;
3743	}
3744	for (;;) {
3745		bip = g_new_bio();
3746		if (bip != NULL)
3747			break;
3748		/* Try again later */
3749		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
3750	}
3751	bip->bio_cmd = bp->b_iocmd;
3752	bip->bio_offset = bp->b_iooffset;
3753	bip->bio_length = bp->b_bcount;
3754	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
3755	bdata2bio(bp, bip);
3756	bip->bio_done = bufdonebio;
3757	bip->bio_caller2 = bp;
3758	bip->bio_dev = dev;
3759	(*csw->d_strategy)(bip);
3760}
3761
3762/*
3763 *	bufdone:
3764 *
3765 *	Finish I/O on a buffer, optionally calling a completion function.
3766 *	This is usually called from an interrupt so process blocking is
3767 *	not allowed.
3768 *
3769 *	bufdone is also responsible for setting B_CACHE in a B_VMIO bp.
3770 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
3771 *	assuming B_INVAL is clear.
3772 *
3773 *	For the VMIO case, we set B_CACHE if the op was a read and no
3774 *	read error occurred, or if the op was a write.  B_CACHE is never
3775 *	set if the buffer is invalid or otherwise uncacheable.
3776 *
3777 *	bufdone does not mess with B_INVAL, allowing the I/O routine or the
3778 *	initiator to leave B_INVAL set to brelse the buffer out of existence
3779 *	in the bufdone routine.
3780 */
3781void
3782bufdone(struct buf *bp)
3783{
3784	struct bufobj *dropobj;
3785	void    (*biodone)(struct buf *);
3786
3787	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3788	dropobj = NULL;
3789
3790	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3791	BUF_ASSERT_HELD(bp);
3792
3793	runningbufwakeup(bp);
3794	if (bp->b_iocmd == BIO_WRITE)
3795		dropobj = bp->b_bufobj;
3796	/* call optional completion function if requested */
3797	if (bp->b_iodone != NULL) {
3798		biodone = bp->b_iodone;
3799		bp->b_iodone = NULL;
3800		(*biodone) (bp);
3801		if (dropobj)
3802			bufobj_wdrop(dropobj);
3803		return;
3804	}
3805
3806	bufdone_finish(bp);
3807
3808	if (dropobj)
3809		bufobj_wdrop(dropobj);
3810}
3811
3812void
3813bufdone_finish(struct buf *bp)
3814{
3815	BUF_ASSERT_HELD(bp);
3816
3817	if (!LIST_EMPTY(&bp->b_dep))
3818		buf_complete(bp);
3819
3820	if (bp->b_flags & B_VMIO) {
3821		vm_ooffset_t foff;
3822		vm_page_t m;
3823		vm_object_t obj;
3824		struct vnode *vp;
3825		int bogus, i, iosize;
3826
3827		obj = bp->b_bufobj->bo_object;
3828		KASSERT(obj->paging_in_progress >= bp->b_npages,
3829		    ("biodone_finish: paging in progress(%d) < b_npages(%d)",
3830		    obj->paging_in_progress, bp->b_npages));
3831
3832		vp = bp->b_vp;
3833		KASSERT(vp->v_holdcnt > 0,
3834		    ("biodone_finish: vnode %p has zero hold count", vp));
3835		KASSERT(vp->v_object != NULL,
3836		    ("biodone_finish: vnode %p has no vm_object", vp));
3837
3838		foff = bp->b_offset;
3839		KASSERT(bp->b_offset != NOOFFSET,
3840		    ("biodone_finish: bp %p has no buffer offset", bp));
3841
3842		/*
3843		 * Set B_CACHE if the op was a normal read and no error
3844		 * occurred.  B_CACHE is set for writes in the b*write()
3845		 * routines.
3846		 */
3847		iosize = bp->b_bcount - bp->b_resid;
3848		if (bp->b_iocmd == BIO_READ &&
3849		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
3850		    !(bp->b_ioflags & BIO_ERROR)) {
3851			bp->b_flags |= B_CACHE;
3852		}
3853		bogus = 0;
3854		VM_OBJECT_WLOCK(obj);
3855		for (i = 0; i < bp->b_npages; i++) {
3856			int bogusflag = 0;
3857			int resid;
3858
3859			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3860			if (resid > iosize)
3861				resid = iosize;
3862
3863			/*
3864			 * cleanup bogus pages, restoring the originals
3865			 */
3866			m = bp->b_pages[i];
3867			if (m == bogus_page) {
3868				bogus = bogusflag = 1;
3869				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3870				if (m == NULL)
3871					panic("biodone: page disappeared!");
3872				bp->b_pages[i] = m;
3873			}
3874			KASSERT(OFF_TO_IDX(foff) == m->pindex,
3875			    ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
3876			    (intmax_t)foff, (uintmax_t)m->pindex));
3877
3878			/*
3879			 * In the write case, the valid and clean bits are
3880			 * already changed correctly ( see bdwrite() ), so we
3881			 * only need to do this here in the read case.
3882			 */
3883			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
3884				KASSERT((m->dirty & vm_page_bits(foff &
3885				    PAGE_MASK, resid)) == 0, ("bufdone_finish:"
3886				    " page %p has unexpected dirty bits", m));
3887				vfs_page_set_valid(bp, foff, m);
3888			}
3889
3890			vm_page_sunbusy(m);
3891			vm_object_pip_subtract(obj, 1);
3892			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3893			iosize -= resid;
3894		}
3895		vm_object_pip_wakeupn(obj, 0);
3896		VM_OBJECT_WUNLOCK(obj);
3897		if (bogus && buf_mapped(bp)) {
3898			BUF_CHECK_MAPPED(bp);
3899			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3900			    bp->b_pages, bp->b_npages);
3901		}
3902	}
3903
3904	/*
3905	 * For asynchronous completions, release the buffer now. The brelse
3906	 * will do a wakeup there if necessary - so no need to do a wakeup
3907	 * here in the async case. The sync case always needs to do a wakeup.
3908	 */
3909
3910	if (bp->b_flags & B_ASYNC) {
3911		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
3912			brelse(bp);
3913		else
3914			bqrelse(bp);
3915	} else
3916		bdone(bp);
3917}
3918
3919/*
3920 * This routine is called in lieu of bufdone in the case of
3921 * incomplete I/O.  This keeps the busy status for pages
3922 * consistent.
3923 */
3924void
3925vfs_unbusy_pages(struct buf *bp)
3926{
3927	int i;
3928	vm_object_t obj;
3929	vm_page_t m;
3930
3931	runningbufwakeup(bp);
3932	if (!(bp->b_flags & B_VMIO))
3933		return;
3934
3935	obj = bp->b_bufobj->bo_object;
3936	VM_OBJECT_WLOCK(obj);
3937	for (i = 0; i < bp->b_npages; i++) {
3938		m = bp->b_pages[i];
3939		if (m == bogus_page) {
3940			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3941			if (!m)
3942				panic("vfs_unbusy_pages: page missing\n");
3943			bp->b_pages[i] = m;
3944			if (buf_mapped(bp)) {
3945				BUF_CHECK_MAPPED(bp);
3946				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3947				    bp->b_pages, bp->b_npages);
3948			} else
3949				BUF_CHECK_UNMAPPED(bp);
3950		}
3951		vm_object_pip_subtract(obj, 1);
3952		vm_page_sunbusy(m);
3953	}
3954	vm_object_pip_wakeupn(obj, 0);
3955	VM_OBJECT_WUNLOCK(obj);
3956}
3957
3958/*
3959 * vfs_page_set_valid:
3960 *
3961 *	Set the valid bits in a page based on the supplied offset.   The
3962 *	range is restricted to the buffer's size.
3963 *
3964 *	This routine is typically called after a read completes.
3965 */
3966static void
3967vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3968{
3969	vm_ooffset_t eoff;
3970
3971	/*
3972	 * Compute the end offset, eoff, such that [off, eoff) does not span a
3973	 * page boundary and eoff is not greater than the end of the buffer.
3974	 * The end of the buffer, in this case, is our file EOF, not the
3975	 * allocation size of the buffer.
3976	 */
3977	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
3978	if (eoff > bp->b_offset + bp->b_bcount)
3979		eoff = bp->b_offset + bp->b_bcount;
3980
3981	/*
3982	 * Set valid range.  This is typically the entire buffer and thus the
3983	 * entire page.
3984	 */
3985	if (eoff > off)
3986		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
3987}
3988
3989/*
3990 * vfs_page_set_validclean:
3991 *
3992 *	Set the valid bits and clear the dirty bits in a page based on the
3993 *	supplied offset.   The range is restricted to the buffer's size.
3994 */
3995static void
3996vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3997{
3998	vm_ooffset_t soff, eoff;
3999
4000	/*
4001	 * Start and end offsets in buffer.  eoff - soff may not cross a
4002	 * page boundary or cross the end of the buffer.  The end of the
4003	 * buffer, in this case, is our file EOF, not the allocation size
4004	 * of the buffer.
4005	 */
4006	soff = off;
4007	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4008	if (eoff > bp->b_offset + bp->b_bcount)
4009		eoff = bp->b_offset + bp->b_bcount;
4010
4011	/*
4012	 * Set valid range.  This is typically the entire buffer and thus the
4013	 * entire page.
4014	 */
4015	if (eoff > soff) {
4016		vm_page_set_validclean(
4017		    m,
4018		   (vm_offset_t) (soff & PAGE_MASK),
4019		   (vm_offset_t) (eoff - soff)
4020		);
4021	}
4022}
4023
4024/*
4025 * Ensure that all buffer pages are not exclusive busied.  If any page is
4026 * exclusive busy, drain it.
4027 */
4028void
4029vfs_drain_busy_pages(struct buf *bp)
4030{
4031	vm_page_t m;
4032	int i, last_busied;
4033
4034	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
4035	last_busied = 0;
4036	for (i = 0; i < bp->b_npages; i++) {
4037		m = bp->b_pages[i];
4038		if (vm_page_xbusied(m)) {
4039			for (; last_busied < i; last_busied++)
4040				vm_page_sbusy(bp->b_pages[last_busied]);
4041			while (vm_page_xbusied(m)) {
4042				vm_page_lock(m);
4043				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4044				vm_page_busy_sleep(m, "vbpage");
4045				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4046			}
4047		}
4048	}
4049	for (i = 0; i < last_busied; i++)
4050		vm_page_sunbusy(bp->b_pages[i]);
4051}
4052
4053/*
4054 * This routine is called before a device strategy routine.
4055 * It is used to tell the VM system that paging I/O is in
4056 * progress, and treat the pages associated with the buffer
4057 * almost as if they were exclusive busied.  Also the object paging_in_progress
4058 * flag is handled to make sure that the object doesn't become
4059 * inconsistant.
4060 * inconsistent.
4061 * Since I/O has not been initiated yet, certain buffer flags
4062 * such as BIO_ERROR or B_INVAL may be in an inconsistant state
4063 * such as BIO_ERROR or B_INVAL may be in an inconsistent state
4064 */
4065void
4066vfs_busy_pages(struct buf *bp, int clear_modify)
4067{
4068	int i, bogus;
4069	vm_object_t obj;
4070	vm_ooffset_t foff;
4071	vm_page_t m;
4072
4073	if (!(bp->b_flags & B_VMIO))
4074		return;
4075
4076	obj = bp->b_bufobj->bo_object;
4077	foff = bp->b_offset;
4078	KASSERT(bp->b_offset != NOOFFSET,
4079	    ("vfs_busy_pages: no buffer offset"));
4080	VM_OBJECT_WLOCK(obj);
4081	vfs_drain_busy_pages(bp);
4082	if (bp->b_bufsize != 0)
4083		vfs_setdirty_locked_object(bp);
4084	bogus = 0;
4085	for (i = 0; i < bp->b_npages; i++) {
4086		m = bp->b_pages[i];
4087
4088		if ((bp->b_flags & B_CLUSTER) == 0) {
4089			vm_object_pip_add(obj, 1);
4090			vm_page_sbusy(m);
4091		}
4092		/*
4093		 * When readying a buffer for a read ( i.e
4094		 * clear_modify == 0 ), it is important to do
4095		 * bogus_page replacement for valid pages in
4096		 * partially instantiated buffers.  Partially
4097		 * instantiated buffers can, in turn, occur when
4098		 * reconstituting a buffer from its VM backing store
4099		 * base.  We only have to do this if B_CACHE is
4100		 * clear ( which causes the I/O to occur in the
4101		 * first place ).  The replacement prevents the read
4102		 * I/O from overwriting potentially dirty VM-backed
4103		 * pages.  XXX bogus page replacement is, uh, bogus.
4104		 * It may not work properly with small-block devices.
4105		 * We need to find a better way.
4106		 */
4107		if (clear_modify) {
4108			pmap_remove_write(m);
4109			vfs_page_set_validclean(bp, foff, m);
4110		} else if (m->valid == VM_PAGE_BITS_ALL &&
4111		    (bp->b_flags & B_CACHE) == 0) {
4112			bp->b_pages[i] = bogus_page;
4113			bogus++;
4114		}
4115		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4116	}
4117	VM_OBJECT_WUNLOCK(obj);
4118	if (bogus && buf_mapped(bp)) {
4119		BUF_CHECK_MAPPED(bp);
4120		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4121		    bp->b_pages, bp->b_npages);
4122	}
4123}
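
/*
 * A minimal sketch of how the read path in this file brackets a VMIO
 * transfer with vfs_busy_pages(); roughly the shape of bread()/breadn()
 * (error handling omitted, names assumed):
 *
 *	bp->b_iocmd = BIO_READ;
 *	bp->b_flags &= ~B_INVAL;
 *	bp->b_ioflags &= ~BIO_ERROR;
 *	vfs_busy_pages(bp, 0);
 *	bp->b_iooffset = dbtob(bp->b_blkno);
 *	bstrategy(bp);
 *	error = bufwait(bp);
 *
 * On completion bufdone_finish() drops the paging-in-progress counts and
 * shared busies taken here; vfs_unbusy_pages() above does the same when
 * the I/O is abandoned before it is finished.
 */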
4124
4125/*
4126 *	vfs_bio_set_valid:
4127 *
4128 *	Set the range within the buffer to valid.  The range is
4129 *	relative to the beginning of the buffer, b_offset.  Note that
4130 *	b_offset itself may be offset from the beginning of the first
4131 *	page.
4132 */
4133void
4134vfs_bio_set_valid(struct buf *bp, int base, int size)
4135{
4136	int i, n;
4137	vm_page_t m;
4138
4139	if (!(bp->b_flags & B_VMIO))
4140		return;
4141
4142	/*
4143	 * Fixup base to be relative to beginning of first page.
4144	 * Set initial n to be the maximum number of bytes in the
4145	 * first page that can be validated.
4146	 */
4147	base += (bp->b_offset & PAGE_MASK);
4148	n = PAGE_SIZE - (base & PAGE_MASK);
4149
4150	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4151	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4152		m = bp->b_pages[i];
4153		if (n > size)
4154			n = size;
4155		vm_page_set_valid_range(m, base & PAGE_MASK, n);
4156		base += n;
4157		size -= n;
4158		n = PAGE_SIZE;
4159	}
4160	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4161}
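
/*
 * Worked example of the index arithmetic above, assuming 4K pages and a
 * buffer whose b_offset lies 0x200 bytes into its first page: a call with
 * base = 0x100 and size = 0x2000 first rebases to base = 0x300, so the
 * first iteration validates n = 0x1000 - 0x300 = 0xd00 bytes of page 0,
 * and subsequent pages are validated PAGE_SIZE (or the remaining size)
 * bytes at a time.
 */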
4162
4163/*
4164 *	vfs_bio_clrbuf:
4165 *
4166 *	If the specified buffer is a non-VMIO buffer, clear the entire
4167 *	buffer.  If the specified buffer is a VMIO buffer, clear and
4168 *	validate only the previously invalid portions of the buffer.
4169 *	This routine essentially fakes an I/O, so we need to clear
4170 *	BIO_ERROR and B_INVAL.
4171 *
4172 *	Note that while we only theoretically need to clear through b_bcount,
4173 *	we go ahead and clear through b_bufsize.
4174 */
4175void
4176vfs_bio_clrbuf(struct buf *bp)
4177{
4178	int i, j, mask, sa, ea, slide;
4179
4180	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4181		clrbuf(bp);
4182		return;
4183	}
4184	bp->b_flags &= ~B_INVAL;
4185	bp->b_ioflags &= ~BIO_ERROR;
4186	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4187	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4188	    (bp->b_offset & PAGE_MASK) == 0) {
4189		if (bp->b_pages[0] == bogus_page)
4190			goto unlock;
4191		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4192		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4193		if ((bp->b_pages[0]->valid & mask) == mask)
4194			goto unlock;
4195		if ((bp->b_pages[0]->valid & mask) == 0) {
4196			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4197			bp->b_pages[0]->valid |= mask;
4198			goto unlock;
4199		}
4200	}
4201	sa = bp->b_offset & PAGE_MASK;
4202	slide = 0;
4203	for (i = 0; i < bp->b_npages; i++, sa = 0) {
4204		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4205		ea = slide & PAGE_MASK;
4206		if (ea == 0)
4207			ea = PAGE_SIZE;
4208		if (bp->b_pages[i] == bogus_page)
4209			continue;
4210		j = sa / DEV_BSIZE;
4211		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4212		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4213		if ((bp->b_pages[i]->valid & mask) == mask)
4214			continue;
4215		if ((bp->b_pages[i]->valid & mask) == 0)
4216			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4217		else {
4218			for (; sa < ea; sa += DEV_BSIZE, j++) {
4219				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4220					pmap_zero_page_area(bp->b_pages[i],
4221					    sa, DEV_BSIZE);
4222				}
4223			}
4224		}
4225		bp->b_pages[i]->valid |= mask;
4226	}
4227unlock:
4228	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4229	bp->b_resid = 0;
4230}
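
/*
 * Worked example of the valid-bit masks above, assuming DEV_BSIZE is 512
 * and 4K pages, so each page carries eight valid bits (one per 512-byte
 * block): for a 2048-byte, page-aligned buffer the single-page fast path
 * computes mask = (1 << (2048 / 512)) - 1 = 0x0f, i.e. the first four
 * blocks of the page, and only blocks whose valid bits are clear get
 * zeroed.
 */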
4231
4232void
4233vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4234{
4235	vm_page_t m;
4236	int i, n;
4237
4238	if (buf_mapped(bp)) {
4239		BUF_CHECK_MAPPED(bp);
4240		bzero(bp->b_data + base, size);
4241	} else {
4242		BUF_CHECK_UNMAPPED(bp);
4243		n = PAGE_SIZE - (base & PAGE_MASK);
4244		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4245			m = bp->b_pages[i];
4246			if (n > size)
4247				n = size;
4248			pmap_zero_page_area(m, base & PAGE_MASK, n);
4249			base += n;
4250			size -= n;
4251			n = PAGE_SIZE;
4252		}
4253	}
4254}
4255
4256/*
4257 * vm_hold_load_pages and vm_hold_free_pages get pages into
4258 * a buffer's address space.  The pages are anonymous and are
4259 * not associated with a file object.
4260 */
4261static void
4262vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4263{
4264	vm_offset_t pg;
4265	vm_page_t p;
4266	int index;
4267
4268	BUF_CHECK_MAPPED(bp);
4269
4270	to = round_page(to);
4271	from = round_page(from);
4272	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4273
4274	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4275tryagain:
4276		/*
4277		 * note: must allocate system pages since blocking here
4278		 * could interfere with paging I/O, no matter which
4279		 * process we are.
4280		 */
4281		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4282		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4283		if (p == NULL) {
4284			VM_WAIT;
4285			goto tryagain;
4286		}
4287		pmap_qenter(pg, &p, 1);
4288		bp->b_pages[index] = p;
4289	}
4290	bp->b_npages = index;
4291}
4292
4293/* Return pages associated with this buf to the vm system */
4294static void
4295vm_hold_free_pages(struct buf *bp, int newbsize)
4296{
4297	vm_offset_t from;
4298	vm_page_t p;
4299	int index, newnpages;
4300
4301	BUF_CHECK_MAPPED(bp);
4302
4303	from = round_page((vm_offset_t)bp->b_data + newbsize);
4304	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4305	if (bp->b_npages > newnpages)
4306		pmap_qremove(from, bp->b_npages - newnpages);
4307	for (index = newnpages; index < bp->b_npages; index++) {
4308		p = bp->b_pages[index];
4309		bp->b_pages[index] = NULL;
4310		if (vm_page_sbusied(p))
4311			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4312			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4313		p->wire_count--;
4314		vm_page_free(p);
4315		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
4316	}
4317	bp->b_npages = newnpages;
4318}
4319
4320/*
4321 * Map an IO request into kernel virtual address space.
4322 *
4323 * Requests are (re)mapped into kernel VA space unless unmapped I/O is requested and allowed.
4324 * Notice that we use b_bufsize for the size of the buffer
4325 * to be mapped.  b_bcount might be modified by the driver.
4326 *
4327 * Note that even if the caller determines that the address space should
4328 * be valid, a race or a smaller-file mapped into a larger space may
4329 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4330 * check the return value.
4331 *
4332 * This function only works with pager buffers.
4333 */
4334int
4335vmapbuf(struct buf *bp, int mapbuf)
4336{
4337	vm_prot_t prot;
4338	int pidx;
4339
4340	if (bp->b_bufsize < 0)
4341		return (-1);
4342	prot = VM_PROT_READ;
4343	if (bp->b_iocmd == BIO_READ)
4344		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
4345	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4346	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4347	    btoc(MAXPHYS))) < 0)
4348		return (-1);
4349	bp->b_npages = pidx;
4350	bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4351	if (mapbuf || !unmapped_buf_allowed) {
4352		pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx);
4353		bp->b_data = bp->b_kvabase + bp->b_offset;
4354	} else
4355		bp->b_data = unmapped_buf;
4356	return(0);
4357}
4358
4359/*
4360 * Free the io map PTEs associated with this IO operation.
4361 * We also invalidate the TLB entries, unhold the pages, and reset b_data.
4362 *
4363 * This function only works with pager buffers.
4364 */
4365void
4366vunmapbuf(struct buf *bp)
4367{
4368	int npages;
4369
4370	npages = bp->b_npages;
4371	if (buf_mapped(bp))
4372		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4373	vm_page_unhold_pages(bp->b_pages, npages);
4374
4375	bp->b_data = unmapped_buf;
4376}
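
/*
 * A minimal sketch of the pager-buffer life cycle vmapbuf() and
 * vunmapbuf() serve, along the lines of physio(); the "mapped" flag and
 * the user address/length come from the caller and are assumed here:
 *
 *	bp->b_data = udata;
 *	bp->b_bufsize = bp->b_bcount = len;
 *	bp->b_iocmd = BIO_READ;
 *	if (vmapbuf(bp, mapped) < 0)
 *		return (EFAULT);
 *	... issue the I/O and wait for it to complete ...
 *	vunmapbuf(bp);
 */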
4377
4378void
4379bdone(struct buf *bp)
4380{
4381	struct mtx *mtxp;
4382
4383	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4384	mtx_lock(mtxp);
4385	bp->b_flags |= B_DONE;
4386	wakeup(bp);
4387	mtx_unlock(mtxp);
4388}
4389
4390void
4391bwait(struct buf *bp, u_char pri, const char *wchan)
4392{
4393	struct mtx *mtxp;
4394
4395	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4396	mtx_lock(mtxp);
4397	while ((bp->b_flags & B_DONE) == 0)
4398		msleep(bp, mtxp, pri, wchan, 0);
4399	mtx_unlock(mtxp);
4400}
4401
4402int
4403bufsync(struct bufobj *bo, int waitfor)
4404{
4405
4406	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4407}
4408
4409void
4410bufstrategy(struct bufobj *bo, struct buf *bp)
4411{
4412	int i = 0;
4413	struct vnode *vp;
4414
4415	vp = bp->b_vp;
4416	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4417	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4418	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4419	i = VOP_STRATEGY(vp, bp);
4420	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4421}
4422
4423void
4424bufobj_wrefl(struct bufobj *bo)
4425{
4426
4427	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4428	ASSERT_BO_WLOCKED(bo);
4429	bo->bo_numoutput++;
4430}
4431
4432void
4433bufobj_wref(struct bufobj *bo)
4434{
4435
4436	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4437	BO_LOCK(bo);
4438	bo->bo_numoutput++;
4439	BO_UNLOCK(bo);
4440}
4441
4442void
4443bufobj_wdrop(struct bufobj *bo)
4444{
4445
4446	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4447	BO_LOCK(bo);
4448	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4449	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4450		bo->bo_flag &= ~BO_WWAIT;
4451		wakeup(&bo->bo_numoutput);
4452	}
4453	BO_UNLOCK(bo);
4454}
4455
4456int
4457bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4458{
4459	int error;
4460
4461	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4462	ASSERT_BO_WLOCKED(bo);
4463	error = 0;
4464	while (bo->bo_numoutput) {
4465		bo->bo_flag |= BO_WWAIT;
4466		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4467		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4468		if (error)
4469			break;
4470	}
4471	return (error);
4472}
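
/*
 * A minimal sketch of draining the in-flight writes counted by
 * bufobj_wref()/bufobj_wdrop(); as the assertion above requires, the
 * bufobj lock must be held across the wait:
 *
 *	BO_LOCK(bo);
 *	error = bufobj_wwait(bo, 0, 0);
 *	BO_UNLOCK(bo);
 */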
4473
4474void
4475bpin(struct buf *bp)
4476{
4477	struct mtx *mtxp;
4478
4479	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4480	mtx_lock(mtxp);
4481	bp->b_pin_count++;
4482	mtx_unlock(mtxp);
4483}
4484
4485void
4486bunpin(struct buf *bp)
4487{
4488	struct mtx *mtxp;
4489
4490	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4491	mtx_lock(mtxp);
4492	if (--bp->b_pin_count == 0)
4493		wakeup(bp);
4494	mtx_unlock(mtxp);
4495}
4496
4497void
4498bunpin_wait(struct buf *bp)
4499{
4500	struct mtx *mtxp;
4501
4502	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4503	mtx_lock(mtxp);
4504	while (bp->b_pin_count > 0)
4505		msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4506	mtx_unlock(mtxp);
4507}
4508
4509/*
4510 * Set bio_data or bio_ma for struct bio from the struct buf.
4511 */
4512void
4513bdata2bio(struct buf *bp, struct bio *bip)
4514{
4515
4516	if (!buf_mapped(bp)) {
4517		KASSERT(unmapped_buf_allowed, ("unmapped"));
4518		bip->bio_ma = bp->b_pages;
4519		bip->bio_ma_n = bp->b_npages;
4520		bip->bio_data = unmapped_buf;
4521		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4522		bip->bio_flags |= BIO_UNMAPPED;
4523		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4524		    PAGE_SIZE == bp->b_npages,
4525		    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
4526		    (long long)bip->bio_length, bip->bio_ma_n));
4527	} else {
4528		bip->bio_data = bp->b_data;
4529		bip->bio_ma = NULL;
4530	}
4531}
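
/*
 * A minimal sketch of how a consumer of the resulting bio tells the two
 * layouts set up above apart:
 *
 *	if ((bip->bio_flags & BIO_UNMAPPED) != 0) {
 *		the data starts bio_ma_offset bytes into bio_ma[0], and
 *		bio_ma_n pages cover the bio_length byte transfer
 *	} else {
 *		bio_data points at bio_length contiguous, mapped bytes
 *	}
 */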
4532
4533#include "opt_ddb.h"
4534#ifdef DDB
4535#include <ddb/ddb.h>
4536
4537/* DDB command to show buffer data */
4538DB_SHOW_COMMAND(buffer, db_show_buffer)
4539{
4540	/* get args */
4541	struct buf *bp = (struct buf *)addr;
4542
4543	if (!have_addr) {
4544		db_printf("usage: show buffer <addr>\n");
4545		return;
4546	}
4547
4548	db_printf("buf at %p\n", bp);
4549	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4550	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4551	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4552	db_printf(
4553	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4554	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4555	    "b_dep = %p\n",
4556	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4557	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4558	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4559	db_printf("b_kvabase = %p, b_kvasize = %d\n",
4560	    bp->b_kvabase, bp->b_kvasize);
4561	if (bp->b_npages) {
4562		int i;
4563		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4564		for (i = 0; i < bp->b_npages; i++) {
4565			vm_page_t m;
4566			m = bp->b_pages[i];
4567			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4568			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4569			if ((i + 1) < bp->b_npages)
4570				db_printf(",");
4571		}
4572		db_printf("\n");
4573	}
4574	db_printf(" ");
4575	BUF_LOCKPRINTINFO(bp);
4576}
4577
4578DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4579{
4580	struct buf *bp;
4581	int i;
4582
4583	for (i = 0; i < nbuf; i++) {
4584		bp = &buf[i];
4585		if (BUF_ISLOCKED(bp)) {
4586			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4587			db_printf("\n");
4588		}
4589	}
4590}
4591
4592DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4593{
4594	struct vnode *vp;
4595	struct buf *bp;
4596
4597	if (!have_addr) {
4598		db_printf("usage: show vnodebufs <addr>\n");
4599		return;
4600	}
4601	vp = (struct vnode *)addr;
4602	db_printf("Clean buffers:\n");
4603	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4604		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4605		db_printf("\n");
4606	}
4607	db_printf("Dirty buffers:\n");
4608	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4609		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4610		db_printf("\n");
4611	}
4612}
4613
4614DB_COMMAND(countfreebufs, db_countfreebufs)
4615{
4616	struct buf *bp;
4617	int i, used = 0, nfree = 0;
4618
4619	if (have_addr) {
4620		db_printf("usage: countfreebufs\n");
4621		return;
4622	}
4623
4624	for (i = 0; i < nbuf; i++) {
4625		bp = &buf[i];
4626		if ((bp->b_flags & B_INFREECNT) != 0)
4627			nfree++;
4628		else
4629			used++;
4630	}
4631
4632	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4633	    nfree + used);
4634	db_printf("numfreebuffers is %d\n", numfreebuffers);
4635}
4636#endif /* DDB */
4637