vfs_bio.c revision 270157
1/*-
2 * Copyright (c) 2004 Poul-Henning Kamp
3 * Copyright (c) 1994,1997 John S. Dyson
4 * Copyright (c) 2013 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Konstantin Belousov
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * this file contains a new buffer I/O scheme implementing a coherent
34 * VM object and buffer cache scheme.  Pains have been taken to make
35 * sure that the performance degradation associated with schemes such
36 * as this is not realized.
37 *
38 * Author:  John S. Dyson
39 * Significant help during the development and debugging phases
40 * had been provided by David Greenman, also of the FreeBSD core team.
41 *
42 * see man buf(9) for more info.
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_bio.c 270157 2014-08-18 22:53:48Z mckusick $");
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/conf.h>
52#include <sys/buf.h>
53#include <sys/devicestat.h>
54#include <sys/eventhandler.h>
55#include <sys/fail.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/proc.h>
64#include <sys/resourcevar.h>
65#include <sys/rwlock.h>
66#include <sys/sysctl.h>
67#include <sys/vmem.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70#include <geom/geom.h>
71#include <vm/vm.h>
72#include <vm/vm_param.h>
73#include <vm/vm_kern.h>
74#include <vm/vm_pageout.h>
75#include <vm/vm_page.h>
76#include <vm/vm_object.h>
77#include <vm/vm_extern.h>
78#include <vm/vm_map.h>
79#include "opt_compat.h"
80#include "opt_swap.h"
81
82static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
83
84struct	bio_ops bioops;		/* I/O operation notification */
85
86struct	buf_ops buf_ops_bio = {
87	.bop_name	=	"buf_ops_bio",
88	.bop_write	=	bufwrite,
89	.bop_strategy	=	bufstrategy,
90	.bop_sync	=	bufsync,
91	.bop_bdflush	=	bufbdflush,
92};
93
94/*
95 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
96 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
97 */
98struct buf *buf;		/* buffer header pool */
99caddr_t unmapped_buf;
100
101/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
102struct proc *bufdaemonproc;
103
104static int inmem(struct vnode *vp, daddr_t blkno);
105static void vm_hold_free_pages(struct buf *bp, int newbsize);
106static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
107		vm_offset_t to);
108static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
109static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
110		vm_page_t m);
111static void vfs_clean_pages_dirty_buf(struct buf *bp);
112static void vfs_setdirty_locked_object(struct buf *bp);
113static void vfs_vmio_release(struct buf *bp);
114static int vfs_bio_clcheck(struct vnode *vp, int size,
115		daddr_t lblkno, daddr_t blkno);
116static int buf_flush(int);
117static int flushbufqueues(int, int);
118static void buf_daemon(void);
119static void bremfreel(struct buf *bp);
120static __inline void bd_wakeup(void);
121#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
122    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
123static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
124#endif
125
126int vmiodirenable = TRUE;
127SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
128    "Use the VM system for directory writes");
129long runningbufspace;
130SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
131    "Amount of presently outstanding async buffer io");
132static long bufspace;
133#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
134    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
135SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
136    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
137#else
138SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
139    "Virtual memory used for buffers");
140#endif
141static long unmapped_bufspace;
142SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
143    &unmapped_bufspace, 0,
144    "Amount of unmapped buffers, inclusive in the bufspace");
145static long maxbufspace;
146SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
147    "Maximum allowed value of bufspace (including buf_daemon)");
148static long bufmallocspace;
149SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
150    "Amount of malloced memory for buffers");
151static long maxbufmallocspace;
152SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
153    "Maximum amount of malloced memory for buffers");
154static long lobufspace;
155SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
156    "Minimum amount of buffers we want to have");
157long hibufspace;
158SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
159    "Maximum allowed value of bufspace (excluding buf_daemon)");
160static int bufreusecnt;
161SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
162    "Number of times we have reused a buffer");
163static int buffreekvacnt;
164SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
165    "Number of times we have freed the KVA space from some buffer");
166static int bufdefragcnt;
167SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
168    "Number of times we have had to repeat buffer allocation to defragment");
169static long lorunningspace;
170SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
171    "Minimum preferred space used for in-progress I/O");
172static long hirunningspace;
173SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
174    "Maximum amount of space to use for in-progress I/O");
175int dirtybufferflushes;
176SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
177    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
178int bdwriteskip;
179SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
180    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
181int altbufferflushes;
182SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
183    0, "Number of fsync flushes to limit dirty buffers");
184static int recursiveflushes;
185SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
186    0, "Number of flushes skipped due to being recursive");
187static int numdirtybuffers;
188SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
189    "Number of buffers that are dirty (has unwritten changes) at the moment");
190static int lodirtybuffers;
191SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
192    "How many buffers we want to have free before bufdaemon can sleep");
193static int hidirtybuffers;
194SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
195    "When the number of dirty buffers is considered severe");
196int dirtybufthresh;
197SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
198    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
199static int numfreebuffers;
200SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
201    "Number of free buffers");
202static int lofreebuffers;
203SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
204   "XXX Unused");
205static int hifreebuffers;
206SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
207   "XXX Complicatedly unused");
208static int getnewbufcalls;
209SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
210   "Number of calls to getnewbuf");
211static int getnewbufrestarts;
212SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
213    "Number of times getnewbuf has had to restart a buffer aquisition");
214static int mappingrestarts;
215SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
216    "Number of times getblk has had to restart a buffer mapping for "
217    "unmapped buffer");
218static int flushbufqtarget = 100;
219SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
220    "Amount of work to do in flushbufqueues when helping bufdaemon");
221static long notbufdflushes;
222SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
223    "Number of dirty buffer flushes done by the bufdaemon helpers");
224static long barrierwrites;
225SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
226    "Number of barrier writes");
227SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
228    &unmapped_buf_allowed, 0,
229    "Permit the use of the unmapped i/o");
230
231/*
232 * Lock for the non-dirty bufqueues
233 */
234static struct mtx_padalign bqclean;
235
236/*
237 * Lock for the dirty queue.
238 */
239static struct mtx_padalign bqdirty;
240
241/*
242 * This lock synchronizes access to bd_request.
243 */
244static struct mtx_padalign bdlock;
245
246/*
247 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
248 * waitrunningbufspace().
249 */
250static struct mtx_padalign rbreqlock;
251
252/*
253 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
254 */
255static struct rwlock_padalign nblock;
256
257/*
258 * Lock that protects bdirtywait.
259 */
260static struct mtx_padalign bdirtylock;
261
262/*
263 * Wakeup point for bufdaemon, as well as indicator of whether it is already
264 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
265 * is idling.
266 */
267static int bd_request;
268
269/*
270 * Request for the buf daemon to write more buffers than is indicated by
271 * lodirtybuffers.  This may be necessary to push out excess dependencies or
272 * defragment the address space where a simple count of the number of dirty
273 * buffers is insufficient to characterize the demand for flushing them.
274 */
275static int bd_speedupreq;
276
277/*
278 * bogus page -- for I/O to/from partially complete buffers
279 * this is a temporary solution to the problem, but it is not
280 * really that bad.  it would be better to split the buffer
281 * for input in the case of buffers partially already in memory,
282 * but the code is intricate enough already.
283 */
284vm_page_t bogus_page;
285
286/*
287 * Synchronization (sleep/wakeup) variable for active buffer space requests.
288 * Set when wait starts, cleared prior to wakeup().
289 * Used in runningbufwakeup() and waitrunningbufspace().
290 */
291static int runningbufreq;
292
293/*
294 * Synchronization (sleep/wakeup) variable for buffer requests.
295 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
296 * by and/or.
297 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
298 * getnewbuf(), and getblk().
299 */
300static volatile int needsbuffer;
301
302/*
303 * Synchronization for bwillwrite() waiters.
304 */
305static int bdirtywait;
306
307/*
308 * Definitions for the buffer free lists.
309 */
310#define BUFFER_QUEUES	5	/* number of free buffer queues */
311
312#define QUEUE_NONE	0	/* on no queue */
313#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
314#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
315#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
316#define QUEUE_EMPTY	4	/* empty buffer headers */
317#define QUEUE_SENTINEL	1024	/* not a queue index, but a sentinel marker */
318
319/* Queues for free buffers with various properties */
320static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
321#ifdef INVARIANTS
322static int bq_len[BUFFER_QUEUES];
323#endif
324
325/*
326 * Single global constant for BUF_WMESG, to avoid getting multiple references.
327 * buf_wmesg is referred from macros.
328 */
329const char *buf_wmesg = BUF_WMESG;
330
331#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
332#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
333#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
334
335#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
336    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
337static int
338sysctl_bufspace(SYSCTL_HANDLER_ARGS)
339{
340	long lvalue;
341	int ivalue;
342
343	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
344		return (sysctl_handle_long(oidp, arg1, arg2, req));
345	lvalue = *(long *)arg1;
346	if (lvalue > INT_MAX)
347		/* On overflow, still write out a long to trigger ENOMEM. */
348		return (sysctl_handle_long(oidp, &lvalue, 0, req));
349	ivalue = lvalue;
350	return (sysctl_handle_int(oidp, &ivalue, 0, req));
351}
352#endif
353
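/*
 * Illustrative sketch (not part of vfs_bio.c): the compat handler above
 * exists so that old consumers built against an int-sized vfs.bufspace
 * keep working while new consumers read the full long.  A hypothetical
 * userland reader might fetch the value like this; the program structure
 * and error handling are assumptions for illustration only.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	long bufspace;
	size_t len = sizeof(bufspace);

	/* Modern consumers ask for the full long-sized value. */
	if (sysctlbyname("vfs.bufspace", &bufspace, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("vfs.bufspace: %ld bytes\n", bufspace);
	return (0);
}
#endif
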
354/*
355 *	bqlock:
356 *
357 *	Return the appropriate queue lock based on the index.
358 */
359static inline struct mtx *
360bqlock(int qindex)
361{
362
363	if (qindex == QUEUE_DIRTY)
364		return (struct mtx *)(&bqdirty);
365	return (struct mtx *)(&bqclean);
366}
367
368/*
369 *	bdirtywakeup:
370 *
371 *	Wakeup any bwillwrite() waiters.
372 */
373static void
374bdirtywakeup(void)
375{
376	mtx_lock(&bdirtylock);
377	if (bdirtywait) {
378		bdirtywait = 0;
379		wakeup(&bdirtywait);
380	}
381	mtx_unlock(&bdirtylock);
382}
383
384/*
385 *	bdirtysub:
386 *
387 *	Decrement the numdirtybuffers count by one and wakeup any
388 *	threads blocked in bwillwrite().
389 */
390static void
391bdirtysub(void)
392{
393
394	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
395	    (lodirtybuffers + hidirtybuffers) / 2)
396		bdirtywakeup();
397}
398
399/*
400 *	bdirtyadd:
401 *
402 *	Increment the numdirtybuffers count by one and wakeup the buf
403 *	daemon if needed.
404 */
405static void
406bdirtyadd(void)
407{
408
409	/*
410	 * Only do the wakeup once as we cross the boundary.  The
411	 * buf daemon will keep running until the condition clears.
412	 */
413	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
414	    (lodirtybuffers + hidirtybuffers) / 2)
415		bd_wakeup();
416}
417
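/*
 * Worked example (illustrative, using made-up but plausible numbers):
 * with lodirtybuffers = 1000 and hidirtybuffers = 2000, the midpoint
 * (lodirtybuffers + hidirtybuffers) / 2 is 1500.  Since
 * atomic_fetchadd_int() returns the value *before* the add, bdirtyadd()
 * wakes the buf daemon only on the transition from 1500 to 1501 dirty
 * buffers, and bdirtysub() wakes bwillwrite() sleepers only on the
 * transition from 1500 down to 1499.  Doing the wakeup once per crossing
 * avoids a wakeup storm while the counter hovers near the threshold.
 */
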
418/*
419 *	bufspacewakeup:
420 *
421 *	Called when buffer space is potentially available for recovery.
422 *	getnewbuf() will block on this flag when it is unable to free
423 *	sufficient buffer space.  Buffer space becomes recoverable when
424 *	bp's get placed back in the queues.
425 */
426
427static __inline void
428bufspacewakeup(void)
429{
430	int need_wakeup, on;
431
432	/*
433	 * If someone is waiting for BUF space, wake them up.  Even
434	 * though we haven't freed the kva space yet, the waiting
435	 * process will be able to now.
436	 */
437	rw_rlock(&nblock);
438	for (;;) {
439		need_wakeup = 0;
440		on = needsbuffer;
441		if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
442			break;
443		need_wakeup = 1;
444		if (atomic_cmpset_rel_int(&needsbuffer, on,
445		    on & ~VFS_BIO_NEED_BUFSPACE))
446			break;
447	}
448	if (need_wakeup)
449		wakeup(__DEVOLATILE(void *, &needsbuffer));
450	rw_runlock(&nblock);
451}
452
453/*
454 *	runningwakeup:
455 *
456 *	Wake up processes that are waiting on asynchronous writes to fall
457 *	below lorunningspace.
458 */
459static void
460runningwakeup(void)
461{
462
463	mtx_lock(&rbreqlock);
464	if (runningbufreq) {
465		runningbufreq = 0;
466		wakeup(&runningbufreq);
467	}
468	mtx_unlock(&rbreqlock);
469}
470
471/*
472 *	runningbufwakeup:
473 *
474 *	Decrement the outstanding write count accordingly.
475 */
476void
477runningbufwakeup(struct buf *bp)
478{
479	long space, bspace;
480
481	bspace = bp->b_runningbufspace;
482	if (bspace == 0)
483		return;
484	space = atomic_fetchadd_long(&runningbufspace, -bspace);
485	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
486	    space, bspace));
487	bp->b_runningbufspace = 0;
488	/*
489	 * Only acquire the lock and wakeup on the transition from exceeding
490	 * the threshold to falling below it.
491	 */
492	if (space < lorunningspace)
493		return;
494	if (space - bspace > lorunningspace)
495		return;
496	runningwakeup();
497}
498
499/*
500 *	bufcountadd:
501 *
502 *	Called when a buffer has been added to one of the free queues to
503 *	account for the buffer and to wakeup anyone waiting for free buffers.
504 *	This typically occurs when large amounts of metadata are being handled
505 *	by the buffer cache ( else buffer space runs out first, usually ).
506 */
507static __inline void
508bufcountadd(struct buf *bp)
509{
510	int mask, need_wakeup, old, on;
511
512	KASSERT((bp->b_flags & B_INFREECNT) == 0,
513	    ("buf %p already counted as free", bp));
514	bp->b_flags |= B_INFREECNT;
515	old = atomic_fetchadd_int(&numfreebuffers, 1);
516	KASSERT(old >= 0 && old < nbuf,
517	    ("numfreebuffers climbed to %d", old + 1));
518	mask = VFS_BIO_NEED_ANY;
519	if (numfreebuffers >= hifreebuffers)
520		mask |= VFS_BIO_NEED_FREE;
521	rw_rlock(&nblock);
522	for (;;) {
523		need_wakeup = 0;
524		on = needsbuffer;
525		if (on == 0)
526			break;
527		need_wakeup = 1;
528		if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
529			break;
530	}
531	if (need_wakeup)
532		wakeup(__DEVOLATILE(void *, &needsbuffer));
533	rw_runlock(&nblock);
534}
535
536/*
537 *	bufcountsub:
538 *
539 *	Decrement the numfreebuffers count as needed.
540 */
541static void
542bufcountsub(struct buf *bp)
543{
544	int old;
545
546	/*
547	 * Fixup numfreebuffers count.  If the buffer is invalid or not
548	 * delayed-write, the buffer was free and we must decrement
549	 * numfreebuffers.
550	 */
551	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
552		KASSERT((bp->b_flags & B_INFREECNT) != 0,
553		    ("buf %p not counted in numfreebuffers", bp));
554		bp->b_flags &= ~B_INFREECNT;
555		old = atomic_fetchadd_int(&numfreebuffers, -1);
556		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
557	}
558}
559
560/*
561 *	waitrunningbufspace()
562 *
563 *	runningbufspace is a measure of the amount of I/O currently
564 *	running.  This routine is used in async-write situations to
565 *	prevent creating huge backups of pending writes to a device.
566 *	Only asynchronous writes are governed by this function.
567 *
568 *	This does NOT turn an async write into a sync write.  It waits
569 *	for earlier writes to complete and generally returns before the
570 *	caller's write has reached the device.
571 */
572void
573waitrunningbufspace(void)
574{
575
576	mtx_lock(&rbreqlock);
577	while (runningbufspace > hirunningspace) {
578		runningbufreq = 1;
579		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
580	}
581	mtx_unlock(&rbreqlock);
582}
583
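/*
 * Illustrative sketch (not part of vfs_bio.c): a hypothetical bulk
 * flusher that pushes out a batch of locked, delayed-write buffers
 * asynchronously while keeping the write backlog bounded.  The helper
 * name and calling convention are assumptions for illustration only.
 */
#if 0
static void
example_flush_batch(struct buf **bps, int cnt)
{
	int i;

	for (i = 0; i < cnt; i++) {
		/*
		 * bawrite() ends up in bufwrite(), which charges
		 * b_runningbufspace against runningbufspace before
		 * handing the buffer to the driver.
		 */
		bawrite(bps[i]);
		/*
		 * Sleep whenever the total space of in-flight writes
		 * exceeds hirunningspace; runningbufwakeup() wakes us
		 * once completions bring it back below lorunningspace.
		 */
		waitrunningbufspace();
	}
}
#endif
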
584
585/*
586 *	vfs_buf_test_cache:
587 *
588 *	Called when a buffer is extended.  This function clears the B_CACHE
589 *	bit if the newly extended portion of the buffer does not contain
590 *	valid data.
591 */
592static __inline
593void
594vfs_buf_test_cache(struct buf *bp,
595		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
596		  vm_page_t m)
597{
598
599	VM_OBJECT_ASSERT_LOCKED(m->object);
600	if (bp->b_flags & B_CACHE) {
601		int base = (foff + off) & PAGE_MASK;
602		if (vm_page_is_valid(m, base, size) == 0)
603			bp->b_flags &= ~B_CACHE;
604	}
605}
606
607/* Wake up the buffer daemon if necessary */
608static __inline void
609bd_wakeup(void)
610{
611
612	mtx_lock(&bdlock);
613	if (bd_request == 0) {
614		bd_request = 1;
615		wakeup(&bd_request);
616	}
617	mtx_unlock(&bdlock);
618}
619
620/*
621 * bd_speedup - speedup the buffer cache flushing code
622 */
623void
624bd_speedup(void)
625{
626	int needwake;
627
628	mtx_lock(&bdlock);
629	needwake = 0;
630	if (bd_speedupreq == 0 || bd_request == 0)
631		needwake = 1;
632	bd_speedupreq = 1;
633	bd_request = 1;
634	if (needwake)
635		wakeup(&bd_request);
636	mtx_unlock(&bdlock);
637}
638
639#ifdef __i386__
640#define	TRANSIENT_DENOM	5
641#else
642#define	TRANSIENT_DENOM 10
643#endif
644
645/*
646 * Calculate buffer cache scaling values and reserve space for buffer
647 * headers.  This is called during low-level kernel initialization and
648 * may be called more than once.  We CANNOT write to the memory area
649 * being reserved at this time.
650 */
651caddr_t
652kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
653{
654	int tuned_nbuf;
655	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
656
657	/*
658	 * physmem_est is in pages.  Convert it to kilobytes (assumes
659	 * PAGE_SIZE is >= 1K)
660	 */
661	physmem_est = physmem_est * (PAGE_SIZE / 1024);
662
663	/*
664	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
665	 * For the first 64MB of ram nominally allocate sufficient buffers to
666	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
667	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
668	 * the buffer cache we limit the eventual kva reservation to
669	 * maxbcache bytes.
670	 *
671	 * factor represents the 1/4 x ram conversion.
672	 */
673	if (nbuf == 0) {
674		int factor = 4 * BKVASIZE / 1024;
675
676		nbuf = 50;
677		if (physmem_est > 4096)
678			nbuf += min((physmem_est - 4096) / factor,
679			    65536 / factor);
680		if (physmem_est > 65536)
681			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
682			    32 * 1024 * 1024 / (factor * 5));
683
684		if (maxbcache && nbuf > maxbcache / BKVASIZE)
685			nbuf = maxbcache / BKVASIZE;
686		tuned_nbuf = 1;
687	} else
688		tuned_nbuf = 0;
689
690	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
691	maxbuf = (LONG_MAX / 3) / BKVASIZE;
692	if (nbuf > maxbuf) {
693		if (!tuned_nbuf)
694			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
695			    maxbuf);
696		nbuf = maxbuf;
697	}
698
699	/*
700	 * The ideal allocation size for the transient bio submap is 10%
701	 * of the maximal buffer map space.  This roughly corresponds
702	 * to the amount of the buffer mapped for typical UFS load.
703	 *
704	 * Clip the buffer map to reserve space for the transient
705	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
706	 * maximum buffer map extent on the platform.
707	 *
708	 * Falling back to maxbuf when maxbcache is unset avoids
709	 * trimming the buffer KVA on architectures with ample KVA
710	 * space.
711	 */
712	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
713		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
714		buf_sz = (long)nbuf * BKVASIZE;
715		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
716		    (TRANSIENT_DENOM - 1)) {
717			/*
718			 * There is more KVA than memory.  Do not
719			 * adjust buffer map size, and assign the rest
720			 * of maxbuf to transient map.
721			 */
722			biotmap_sz = maxbuf_sz - buf_sz;
723		} else {
724			/*
725			 * Buffer map spans all KVA we could afford on
726			 * this platform.  Give 10% (20% on i386) of
727			 * the buffer map to the transient bio map.
728			 */
729			biotmap_sz = buf_sz / TRANSIENT_DENOM;
730			buf_sz -= biotmap_sz;
731		}
732		if (biotmap_sz / INT_MAX > MAXPHYS)
733			bio_transient_maxcnt = INT_MAX;
734		else
735			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
736		/*
737		 * Artificially limit to 1024 simultaneous in-flight I/Os
738		 * using the transient mapping.
739		 */
740		if (bio_transient_maxcnt > 1024)
741			bio_transient_maxcnt = 1024;
742		if (tuned_nbuf)
743			nbuf = buf_sz / BKVASIZE;
744	}
745
746	/*
747	 * swbufs are used as temporary holders for I/O, such as paging I/O.
748	 * We have no fewer than 16 and no more than 256.
749	 */
750	nswbuf = max(min(nbuf/4, 256), 16);
751#ifdef NSWBUF_MIN
752	if (nswbuf < NSWBUF_MIN)
753		nswbuf = NSWBUF_MIN;
754#endif
755
756	/*
757	 * Reserve space for the buffer cache buffers
758	 */
759	swbuf = (void *)v;
760	v = (caddr_t)(swbuf + nswbuf);
761	buf = (void *)v;
762	v = (caddr_t)(buf + nbuf);
763
764	return(v);
765}
766
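/*
 * Worked example of the auto-tuning above (illustrative; assumes the
 * common BKVASIZE of 16 KiB and a machine with 4 GiB of usable RAM,
 * i.e. physmem_est = 4194304 KiB, with nbuf and maxbcache left at 0):
 *
 *	factor = 4 * 16384 / 1024 = 64
 *	nbuf   = 50
 *	       + min((4194304 - 4096) / 64, 65536 / 64)                  = +1024
 *	       + min((4194304 - 65536) * 2 / (64 * 5), 33554432 / (64 * 5)) = +25804
 *	       = 26878 buffers
 *
 * which corresponds to roughly 26878 * 16 KiB = ~420 MiB of buffer KVA.
 * The first term covers 1/4 of the first 64 MB of RAM and the second
 * covers 1/10 of the RAM beyond 64 MB, as described in the comment above.
 */
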
767/* Initialize the buffer subsystem.  Called before use of any buffers. */
768void
769bufinit(void)
770{
771	struct buf *bp;
772	int i;
773
774	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
775	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
776	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
777	rw_init(&nblock, "needsbuffer lock");
778	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
779	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
780
781	/* next, make a null set of free lists */
782	for (i = 0; i < BUFFER_QUEUES; i++)
783		TAILQ_INIT(&bufqueues[i]);
784
785	/* finally, initialize each buffer header and stick on empty q */
786	for (i = 0; i < nbuf; i++) {
787		bp = &buf[i];
788		bzero(bp, sizeof *bp);
789		bp->b_flags = B_INVAL | B_INFREECNT;
790		bp->b_rcred = NOCRED;
791		bp->b_wcred = NOCRED;
792		bp->b_qindex = QUEUE_EMPTY;
793		bp->b_xflags = 0;
794		LIST_INIT(&bp->b_dep);
795		BUF_LOCKINIT(bp);
796		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
797#ifdef INVARIANTS
798		bq_len[QUEUE_EMPTY]++;
799#endif
800	}
801
802	/*
803	 * maxbufspace is the absolute maximum amount of buffer space we are
804	 * allowed to reserve in KVM and in real terms.  The absolute maximum
805	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
806	 * used by most other processes.  The differential is required to
807	 * ensure that buf_daemon is able to run when other processes might
808	 * be blocked waiting for buffer space.
809	 *
810	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
811	 * this may result in KVM fragmentation which is not handled optimally
812	 * by the system.
813	 */
814	maxbufspace = (long)nbuf * BKVASIZE;
815	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
816	lobufspace = hibufspace - MAXBSIZE;
817
818	/*
819	 * Note: The 16 MiB upper limit for hirunningspace was chosen
820	 * arbitrarily and may need further tuning. It corresponds to
821	 * 128 outstanding write IO requests (if IO size is 128 KiB),
822	 * which fits with many RAID controllers' tagged queuing limits.
823	 * The lower 1 MiB limit is the historical upper limit for
824	 * hirunningspace.
825	 */
826	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
827	    16 * 1024 * 1024), 1024 * 1024);
828	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
829
830/*
831 * Limit the amount of malloc memory since it is wired permanently into
832 * the kernel space.  Even though this is accounted for in the buffer
833 * allocation, we don't want the malloced region to grow uncontrolled.
834 * The malloc scheme improves memory utilization significantly on average
835 * (small) directories.
836 */
837	maxbufmallocspace = hibufspace / 20;
838
839/*
840 * Reduce the chance of a deadlock occurring by limiting the number
841 * of delayed-write dirty buffers we allow to stack up.
842 */
843	hidirtybuffers = nbuf / 4 + 20;
844	dirtybufthresh = hidirtybuffers * 9 / 10;
845	numdirtybuffers = 0;
846/*
847 * To support extreme low-memory systems, make sure hidirtybuffers cannot
848 * eat up all available buffer space.  This occurs when our minimum cannot
849 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
850 * BKVASIZE'd buffers.
851 */
852	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
853		hidirtybuffers >>= 1;
854	}
855	lodirtybuffers = hidirtybuffers / 2;
856
857/*
858 * Try to keep the number of free buffers in the specified range,
859 * and give special processes (e.g., buf_daemon) access to an
860 * emergency reserve.
861 */
862	lofreebuffers = nbuf / 18 + 5;
863	hifreebuffers = 2 * lofreebuffers;
864	numfreebuffers = nbuf;
865
866	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
867	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
868	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
869}
870
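/*
 * Worked example of the watermarks above (illustrative; continues the
 * hypothetical 4 GiB configuration with nbuf = 26878, assuming
 * BKVASIZE = 16 KiB and MAXBSIZE = 64 KiB):
 *
 *	maxbufspace    = 26878 * 16384                               = ~420 MiB
 *	hibufspace     = lmax(3/4 maxbufspace, maxbufspace - 10 * 65536)
 *	                                                              = ~419 MiB
 *	lobufspace     = hibufspace - 65536                           = ~419 MiB
 *	hirunningspace = roundup(hibufspace / 64, 65536),
 *	                 clipped to [1 MiB, 16 MiB]                   = ~6.6 MiB
 *	lorunningspace = roundup(2/3 of hirunningspace, 65536)        = ~4.4 MiB
 *	hidirtybuffers = 26878 / 4 + 20 = 6739, lodirtybuffers = 3369
 *	lofreebuffers  = 26878 / 18 + 5 = 1498, hifreebuffers  = 2996
 *
 * The gap between hibufspace and maxbufspace is the reserve that lets
 * the buf daemon keep allocating when ordinary consumers are blocked.
 */
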
871#ifdef INVARIANTS
872static inline void
873vfs_buf_check_mapped(struct buf *bp)
874{
875
876	KASSERT((bp->b_flags & B_UNMAPPED) == 0,
877	    ("mapped buf %p %x", bp, bp->b_flags));
878	KASSERT(bp->b_kvabase != unmapped_buf,
879	    ("mapped buf: b_kvabase was not updated %p", bp));
880	KASSERT(bp->b_data != unmapped_buf,
881	    ("mapped buf: b_data was not updated %p", bp));
882}
883
884static inline void
885vfs_buf_check_unmapped(struct buf *bp)
886{
887
888	KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
889	    ("unmapped buf %p %x", bp, bp->b_flags));
890	KASSERT(bp->b_kvabase == unmapped_buf,
891	    ("unmapped buf: corrupted b_kvabase %p", bp));
892	KASSERT(bp->b_data == unmapped_buf,
893	    ("unmapped buf: corrupted b_data %p", bp));
894}
895
896#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
897#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
898#else
899#define	BUF_CHECK_MAPPED(bp) do {} while (0)
900#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
901#endif
902
903static void
904bpmap_qenter(struct buf *bp)
905{
906
907	BUF_CHECK_MAPPED(bp);
908
909	/*
910	 * bp->b_data is relative to bp->b_offset, but
911	 * bp->b_offset may be offset into the first page.
912	 */
913	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
914	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
915	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
916	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
917}
918
919/*
920 * bfreekva() - free the kva allocation for a buffer.
921 *
922 *	Since this call frees up buffer space, we call bufspacewakeup().
923 */
924static void
925bfreekva(struct buf *bp)
926{
927
928	if (bp->b_kvasize == 0)
929		return;
930
931	atomic_add_int(&buffreekvacnt, 1);
932	atomic_subtract_long(&bufspace, bp->b_kvasize);
933	if ((bp->b_flags & B_UNMAPPED) == 0) {
934		BUF_CHECK_MAPPED(bp);
935		vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
936		    bp->b_kvasize);
937	} else {
938		BUF_CHECK_UNMAPPED(bp);
939		if ((bp->b_flags & B_KVAALLOC) != 0) {
940			vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
941			    bp->b_kvasize);
942		}
943		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
944		bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
945	}
946	bp->b_kvasize = 0;
947	bufspacewakeup();
948}
949
950/*
951 *	binsfree:
952 *
953 *	Insert the buffer into the appropriate free list.
954 */
955static void
956binsfree(struct buf *bp, int qindex)
957{
958	struct mtx *olock, *nlock;
959
960	BUF_ASSERT_XLOCKED(bp);
961
962	olock = bqlock(bp->b_qindex);
963	nlock = bqlock(qindex);
964	mtx_lock(olock);
965	/* Handle delayed bremfree() processing. */
966	if (bp->b_flags & B_REMFREE)
967		bremfreel(bp);
968
969	if (bp->b_qindex != QUEUE_NONE)
970		panic("binsfree: free buffer onto another queue???");
971
972	bp->b_qindex = qindex;
973	if (olock != nlock) {
974		mtx_unlock(olock);
975		mtx_lock(nlock);
976	}
977	if (bp->b_flags & B_AGE)
978		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
979	else
980		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
981#ifdef INVARIANTS
982	bq_len[bp->b_qindex]++;
983#endif
984	mtx_unlock(nlock);
985
986	/*
987	 * Something we can maybe free or reuse.
988	 */
989	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
990		bufspacewakeup();
991
992	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
993		bufcountadd(bp);
994}
995
996/*
997 *	bremfree:
998 *
999 *	Mark the buffer for removal from the appropriate free list.
1000 *
1001 */
1002void
1003bremfree(struct buf *bp)
1004{
1005
1006	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1007	KASSERT((bp->b_flags & B_REMFREE) == 0,
1008	    ("bremfree: buffer %p already marked for delayed removal.", bp));
1009	KASSERT(bp->b_qindex != QUEUE_NONE,
1010	    ("bremfree: buffer %p not on a queue.", bp));
1011	BUF_ASSERT_XLOCKED(bp);
1012
1013	bp->b_flags |= B_REMFREE;
1014	bufcountsub(bp);
1015}
1016
1017/*
1018 *	bremfreef:
1019 *
1020 *	Force an immediate removal from a free list.  Used only in nfs when
1021 *	it abuses the b_freelist pointer.
1022 */
1023void
1024bremfreef(struct buf *bp)
1025{
1026	struct mtx *qlock;
1027
1028	qlock = bqlock(bp->b_qindex);
1029	mtx_lock(qlock);
1030	bremfreel(bp);
1031	mtx_unlock(qlock);
1032}
1033
1034/*
1035 *	bremfreel:
1036 *
1037 *	Removes a buffer from the free list, must be called with the
1038 *	correct qlock held.
1039 */
1040static void
1041bremfreel(struct buf *bp)
1042{
1043
1044	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1045	    bp, bp->b_vp, bp->b_flags);
1046	KASSERT(bp->b_qindex != QUEUE_NONE,
1047	    ("bremfreel: buffer %p not on a queue.", bp));
1048	BUF_ASSERT_XLOCKED(bp);
1049	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1050
1051	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1052#ifdef INVARIANTS
1053	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1054	    bp->b_qindex));
1055	bq_len[bp->b_qindex]--;
1056#endif
1057	bp->b_qindex = QUEUE_NONE;
1058	/*
1059	 * If this was a delayed bremfree() we only need to remove the buffer
1060	 * from the queue and return the stats are already done.
1061	 */
1062	if (bp->b_flags & B_REMFREE) {
1063		bp->b_flags &= ~B_REMFREE;
1064		return;
1065	}
1066	bufcountsub(bp);
1067}
1068
1069/*
1070 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
1071 * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
1072 * the buffer is valid and we do not have to do anything.
1073 */
1074void
1075breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1076    int cnt, struct ucred * cred)
1077{
1078	struct buf *rabp;
1079	int i;
1080
1081	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
1082		if (inmem(vp, *rablkno))
1083			continue;
1084		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
1085
1086		if ((rabp->b_flags & B_CACHE) == 0) {
1087			if (!TD_IS_IDLETHREAD(curthread))
1088				curthread->td_ru.ru_inblock++;
1089			rabp->b_flags |= B_ASYNC;
1090			rabp->b_flags &= ~B_INVAL;
1091			rabp->b_ioflags &= ~BIO_ERROR;
1092			rabp->b_iocmd = BIO_READ;
1093			if (rabp->b_rcred == NOCRED && cred != NOCRED)
1094				rabp->b_rcred = crhold(cred);
1095			vfs_busy_pages(rabp, 0);
1096			BUF_KERNPROC(rabp);
1097			rabp->b_iooffset = dbtob(rabp->b_blkno);
1098			bstrategy(rabp);
1099		} else {
1100			brelse(rabp);
1101		}
1102	}
1103}
1104
1105/*
1106 * Entry point for bread() and breadn() via #defines in sys/buf.h.
1107 *
1108 * Get a buffer with the specified data.  Look in the cache first.  We
1109 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
1110 * is set, the buffer is valid and we do not have to do anything, see
1111 * getblk(). Also starts asynchronous I/O on read-ahead blocks.
1112 */
1113int
1114breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
1115    int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
1116{
1117	struct buf *bp;
1118	int rv = 0, readwait = 0;
1119
1120	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
1121	/*
1122	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
1123	 */
1124	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
1125	if (bp == NULL)
1126		return (EBUSY);
1127
1128	/* if not found in cache, do some I/O */
1129	if ((bp->b_flags & B_CACHE) == 0) {
1130		if (!TD_IS_IDLETHREAD(curthread))
1131			curthread->td_ru.ru_inblock++;
1132		bp->b_iocmd = BIO_READ;
1133		bp->b_flags &= ~B_INVAL;
1134		bp->b_ioflags &= ~BIO_ERROR;
1135		if (bp->b_rcred == NOCRED && cred != NOCRED)
1136			bp->b_rcred = crhold(cred);
1137		vfs_busy_pages(bp, 0);
1138		bp->b_iooffset = dbtob(bp->b_blkno);
1139		bstrategy(bp);
1140		++readwait;
1141	}
1142
1143	breada(vp, rablkno, rabsize, cnt, cred);
1144
1145	if (readwait) {
1146		rv = bufwait(bp);
1147	}
1148	return (rv);
1149}
1150
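/*
 * Illustrative sketch (not part of vfs_bio.c): the classic way a
 * filesystem consumes the interfaces above is a bread()/modify/write
 * cycle.  The helper name, its arguments and the choice of bdwrite()
 * are assumptions made for illustration only.
 */
#if 0
static int
example_update_block(struct vnode *vp, daddr_t lbn, int size, int boff,
    const void *src, int len)
{
	struct buf *bp;
	int error;

	/* Look in the cache first; I/O is started only if B_CACHE is clear. */
	error = bread(vp, lbn, size, NOCRED, &bp);
	if (error != 0) {
		brelse(bp);
		return (error);
	}
	/* The buffer comes back locked with valid contents. */
	bcopy(src, (char *)bp->b_data + boff, len);
	/*
	 * bdwrite() marks the buffer dirty and releases it; bwrite()
	 * would write it synchronously, bawrite() asynchronously.
	 */
	bdwrite(bp);
	return (0);
}
#endif
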
1151/*
1152 * Write, release buffer on completion.  (Done by iodone
1153 * if async).  Do not bother writing anything if the buffer
1154 * is invalid.
1155 *
1156 * Note that we set B_CACHE here, indicating that buffer is
1157 * fully valid and thus cacheable.  This is true even of NFS
1158 * now so we set it generally.  This could be set either here
1159 * or in biodone() since the I/O is synchronous.  We put it
1160 * here.
1161 */
1162int
1163bufwrite(struct buf *bp)
1164{
1165	int oldflags;
1166	struct vnode *vp;
1167	long space;
1168	int vp_md;
1169
1170	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1171	if (bp->b_flags & B_INVAL) {
1172		brelse(bp);
1173		return (0);
1174	}
1175
1176	if (bp->b_flags & B_BARRIER)
1177		barrierwrites++;
1178
1179	oldflags = bp->b_flags;
1180
1181	BUF_ASSERT_HELD(bp);
1182
1183	if (bp->b_pin_count > 0)
1184		bunpin_wait(bp);
1185
1186	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
1187	    ("FFS background buffer should not get here %p", bp));
1188
1189	vp = bp->b_vp;
1190	if (vp)
1191		vp_md = vp->v_vflag & VV_MD;
1192	else
1193		vp_md = 0;
1194
1195	/*
1196	 * Mark the buffer clean.  Increment the bufobj write count
1197	 * before bundirty() call, to prevent other thread from seeing
1198	 * empty dirty list and zero counter for writes in progress,
1199	 * falsely indicating that the bufobj is clean.
1200	 */
1201	bufobj_wref(bp->b_bufobj);
1202	bundirty(bp);
1203
1204	bp->b_flags &= ~B_DONE;
1205	bp->b_ioflags &= ~BIO_ERROR;
1206	bp->b_flags |= B_CACHE;
1207	bp->b_iocmd = BIO_WRITE;
1208
1209	vfs_busy_pages(bp, 1);
1210
1211	/*
1212	 * Normal bwrites pipeline writes
1213	 */
1214	bp->b_runningbufspace = bp->b_bufsize;
1215	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
1216
1217	if (!TD_IS_IDLETHREAD(curthread))
1218		curthread->td_ru.ru_oublock++;
1219	if (oldflags & B_ASYNC)
1220		BUF_KERNPROC(bp);
1221	bp->b_iooffset = dbtob(bp->b_blkno);
1222	bstrategy(bp);
1223
1224	if ((oldflags & B_ASYNC) == 0) {
1225		int rtval = bufwait(bp);
1226		brelse(bp);
1227		return (rtval);
1228	} else if (space > hirunningspace) {
1229		/*
1230		 * don't allow the async write to saturate the I/O
1231		 * system.  We will not deadlock here because
1232		 * we are blocking waiting for I/O that is already in-progress
1233		 * to complete. We do not block here if it is the update
1234		 * or syncer daemon trying to clean up as that can lead
1235		 * to deadlock.
1236		 */
1237		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
1238			waitrunningbufspace();
1239	}
1240
1241	return (0);
1242}
1243
1244void
1245bufbdflush(struct bufobj *bo, struct buf *bp)
1246{
1247	struct buf *nbp;
1248
1249	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
1250		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
1251		altbufferflushes++;
1252	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
1253		BO_LOCK(bo);
1254		/*
1255		 * Try to find a buffer to flush.
1256		 */
1257		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
1258			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
1259			    BUF_LOCK(nbp,
1260				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
1261				continue;
1262			if (bp == nbp)
1263				panic("bdwrite: found ourselves");
1264			BO_UNLOCK(bo);
1265			/* Don't call buf_countdeps() with the bo lock held. */
1266			if (buf_countdeps(nbp, 0)) {
1267				BO_LOCK(bo);
1268				BUF_UNLOCK(nbp);
1269				continue;
1270			}
1271			if (nbp->b_flags & B_CLUSTEROK) {
1272				vfs_bio_awrite(nbp);
1273			} else {
1274				bremfree(nbp);
1275				bawrite(nbp);
1276			}
1277			dirtybufferflushes++;
1278			break;
1279		}
1280		if (nbp == NULL)
1281			BO_UNLOCK(bo);
1282	}
1283}
1284
1285/*
1286 * Delayed write. (Buffer is marked dirty).  Do not bother writing
1287 * anything if the buffer is marked invalid.
1288 *
1289 * Note that since the buffer must be completely valid, we can safely
1290 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
1291 * biodone() in order to prevent getblk from writing the buffer
1292 * out synchronously.
1293 */
1294void
1295bdwrite(struct buf *bp)
1296{
1297	struct thread *td = curthread;
1298	struct vnode *vp;
1299	struct bufobj *bo;
1300
1301	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1302	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1303	KASSERT((bp->b_flags & B_BARRIER) == 0,
1304	    ("Barrier request in delayed write %p", bp));
1305	BUF_ASSERT_HELD(bp);
1306
1307	if (bp->b_flags & B_INVAL) {
1308		brelse(bp);
1309		return;
1310	}
1311
1312	/*
1313	 * If we have too many dirty buffers, don't create any more.
1314	 * If we are wildly over our limit, then force a complete
1315	 * cleanup. Otherwise, just keep the situation from getting
1316	 * out of control. Note that we have to avoid a recursive
1317	 * disaster and not try to clean up after our own cleanup!
1318	 */
1319	vp = bp->b_vp;
1320	bo = bp->b_bufobj;
1321	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
1322		td->td_pflags |= TDP_INBDFLUSH;
1323		BO_BDFLUSH(bo, bp);
1324		td->td_pflags &= ~TDP_INBDFLUSH;
1325	} else
1326		recursiveflushes++;
1327
1328	bdirty(bp);
1329	/*
1330	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
1331	 * true even of NFS now.
1332	 */
1333	bp->b_flags |= B_CACHE;
1334
1335	/*
1336	 * This bmap keeps the system from needing to do the bmap later,
1337	 * perhaps when the system is attempting to do a sync.  Since it
1338	 * is likely that the indirect block -- or whatever other data structure
1339	 * the filesystem needs -- is still in memory now, it is a good
1340	 * thing to do this.  Note also, that if the pageout daemon is
1341	 * requesting a sync -- there might not be enough memory to do
1342	 * the bmap then...  So, this is important to do.
1343	 */
1344	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
1345		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
1346	}
1347
1348	/*
1349	 * Set the *dirty* buffer range based upon the VM system dirty
1350	 * pages.
1351	 *
1352	 * Mark the buffer pages as clean.  We need to do this here to
1353	 * satisfy the vnode_pager and the pageout daemon, so that it
1354	 * thinks that the pages have been "cleaned".  Note that since
1355	 * the pages are in a delayed write buffer -- the VFS layer
1356	 * "will" see that the pages get written out on the next sync,
1357	 * or perhaps the cluster will be completed.
1358	 */
1359	vfs_clean_pages_dirty_buf(bp);
1360	bqrelse(bp);
1361
1362	/*
1363	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
1364	 * due to the softdep code.
1365	 */
1366}
1367
1368/*
1369 *	bdirty:
1370 *
1371 *	Turn buffer into delayed write request.  We must clear BIO_READ and
1372 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
1373 *	itself to properly update it in the dirty/clean lists.  We mark it
1374 *	B_DONE to ensure that any asynchronization of the buffer properly
1375 *	clears B_DONE ( else a panic will occur later ).
1376 *
1377 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
1378 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
1379 *	should only be called if the buffer is known-good.
1380 *
1381 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1382 *	count.
1383 *
1384 *	The buffer must be on QUEUE_NONE.
1385 */
1386void
1387bdirty(struct buf *bp)
1388{
1389
1390	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
1391	    bp, bp->b_vp, bp->b_flags);
1392	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1393	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1394	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
1395	BUF_ASSERT_HELD(bp);
1396	bp->b_flags &= ~(B_RELBUF);
1397	bp->b_iocmd = BIO_WRITE;
1398
1399	if ((bp->b_flags & B_DELWRI) == 0) {
1400		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
1401		reassignbuf(bp);
1402		bdirtyadd();
1403	}
1404}
1405
1406/*
1407 *	bundirty:
1408 *
1409 *	Clear B_DELWRI for buffer.
1410 *
1411 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1412 *	count.
1413 *
1414 *	The buffer must be on QUEUE_NONE.
1415 */
1416
1417void
1418bundirty(struct buf *bp)
1419{
1420
1421	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1422	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1423	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1424	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
1425	BUF_ASSERT_HELD(bp);
1426
1427	if (bp->b_flags & B_DELWRI) {
1428		bp->b_flags &= ~B_DELWRI;
1429		reassignbuf(bp);
1430		bdirtysub();
1431	}
1432	/*
1433	 * Since it is now being written, we can clear its deferred write flag.
1434	 */
1435	bp->b_flags &= ~B_DEFERRED;
1436}
1437
1438/*
1439 *	bawrite:
1440 *
1441 *	Asynchronous write.  Start output on a buffer, but do not wait for
1442 *	it to complete.  The buffer is released when the output completes.
1443 *
1444 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
1445 *	B_INVAL buffers.  Not us.
1446 */
1447void
1448bawrite(struct buf *bp)
1449{
1450
1451	bp->b_flags |= B_ASYNC;
1452	(void) bwrite(bp);
1453}
1454
1455/*
1456 *	babarrierwrite:
1457 *
1458 *	Asynchronous barrier write.  Start output on a buffer, but do not
1459 *	wait for it to complete.  Place a write barrier after this write so
1460 *	that this buffer and all buffers written before it are committed to
1461 *	the disk before any buffers written after this write are committed
1462 *	to the disk.  The buffer is released when the output completes.
1463 */
1464void
1465babarrierwrite(struct buf *bp)
1466{
1467
1468	bp->b_flags |= B_ASYNC | B_BARRIER;
1469	(void) bwrite(bp);
1470}
1471
1472/*
1473 *	bbarrierwrite:
1474 *
1475 *	Synchronous barrier write.  Start output on a buffer and wait for
1476 *	it to complete.  Place a write barrier after this write so that
1477 *	this buffer and all buffers written before it are committed to
1478 *	the disk before any buffers written after this write are committed
1479 *	to the disk.  The buffer is released when the output completes.
1480 */
1481int
1482bbarrierwrite(struct buf *bp)
1483{
1484
1485	bp->b_flags |= B_BARRIER;
1486	return (bwrite(bp));
1487}
1488
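/*
 * Illustrative sketch (not part of vfs_bio.c): a hypothetical journaling
 * scheme could use the barrier variants to order its commit record after
 * the records it covers.  The function and variable names below are
 * assumptions for illustration only.
 */
#if 0
static void
example_commit(struct buf **records, int cnt, struct buf *commitbp)
{
	int i;

	/* Queue the journal records without waiting for them. */
	for (i = 0; i < cnt; i++)
		bawrite(records[i]);
	/*
	 * The barrier guarantees that the commit record and everything
	 * written before it reach the disk before any write issued
	 * after this point.
	 */
	babarrierwrite(commitbp);
}
#endif
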
1489/*
1490 *	bwillwrite:
1491 *
1492 *	Called prior to the locking of any vnodes when we are expecting to
1493 *	write.  We do not want to starve the buffer cache with too many
1494 *	dirty buffers so we block here.  By blocking prior to the locking
1495 *	of any vnodes we attempt to avoid the situation where a locked vnode
1496 *	prevents the various system daemons from flushing related buffers.
1497 */
1498void
1499bwillwrite(void)
1500{
1501
1502	if (numdirtybuffers >= hidirtybuffers) {
1503		mtx_lock(&bdirtylock);
1504		while (numdirtybuffers >= hidirtybuffers) {
1505			bdirtywait = 1;
1506			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
1507			    "flswai", 0);
1508		}
1509		mtx_unlock(&bdirtylock);
1510	}
1511}
1512
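/*
 * Illustrative sketch (not part of vfs_bio.c): a write path is expected
 * to throttle in bwillwrite() before taking any vnode locks, so that a
 * flood of dirty buffers cannot leave it sleeping while it holds a lock
 * the flushing daemons may need.  The helper below is an assumption for
 * illustration, loosely modeled on how a file write path is structured.
 */
#if 0
static int
example_write_path(struct vnode *vp, struct uio *uio, int ioflag,
    struct ucred *cred)
{
	int error;

	bwillwrite();			/* block here, before locking */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	VOP_UNLOCK(vp, 0);
	return (error);
}
#endif
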
1513/*
1514 * Return true if we have too many dirty buffers.
1515 */
1516int
1517buf_dirty_count_severe(void)
1518{
1519
1520	return(numdirtybuffers >= hidirtybuffers);
1521}
1522
1523static __noinline int
1524buf_vm_page_count_severe(void)
1525{
1526
1527	KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
1528
1529	return vm_page_count_severe();
1530}
1531
1532/*
1533 *	brelse:
1534 *
1535 *	Release a busy buffer and, if requested, free its resources.  The
1536 *	buffer will be stashed in the appropriate bufqueue[] allowing it
1537 *	to be accessed later as a cache entity or reused for other purposes.
1538 */
1539void
1540brelse(struct buf *bp)
1541{
1542	int qindex;
1543
1544	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
1545	    bp, bp->b_vp, bp->b_flags);
1546	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1547	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1548
1549	if (BUF_LOCKRECURSED(bp)) {
1550		/*
1551		 * Do not process, in particular, do not handle the
1552		 * B_INVAL/B_RELBUF flags and do not release to the free list.
1553		 */
1554		BUF_UNLOCK(bp);
1555		return;
1556	}
1557
1558	if (bp->b_flags & B_MANAGED) {
1559		bqrelse(bp);
1560		return;
1561	}
1562
1563	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
1564	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
1565		/*
1566		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
1567		 * pages from being scrapped.  If the error is anything
1568		 * other than an I/O error (EIO), assume that retrying
1569		 * is futile.
1570		 */
1571		bp->b_ioflags &= ~BIO_ERROR;
1572		bdirty(bp);
1573	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
1574	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
1575		/*
1576		 * Either a failed I/O or we were asked to free or not
1577		 * cache the buffer.
1578		 */
1579		bp->b_flags |= B_INVAL;
1580		if (!LIST_EMPTY(&bp->b_dep))
1581			buf_deallocate(bp);
1582		if (bp->b_flags & B_DELWRI)
1583			bdirtysub();
1584		bp->b_flags &= ~(B_DELWRI | B_CACHE);
1585		if ((bp->b_flags & B_VMIO) == 0) {
1586			if (bp->b_bufsize)
1587				allocbuf(bp, 0);
1588			if (bp->b_vp)
1589				brelvp(bp);
1590		}
1591	}
1592
1593	/*
1594	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
1595	 * is called with B_DELWRI set, the underlying pages may wind up
1596	 * getting freed causing a previous write (bdwrite()) to get 'lost'
1597	 * because pages associated with a B_DELWRI bp are marked clean.
1598	 *
1599	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
1600	 * if B_DELWRI is set.
1601	 *
1602	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
1603	 * on pages to return pages to the VM page queues.
1604	 */
1605	if (bp->b_flags & B_DELWRI)
1606		bp->b_flags &= ~B_RELBUF;
1607	else if (buf_vm_page_count_severe()) {
1608		/*
1609		 * BKGRDINPROG can only be set with the buf and bufobj
1610		 * locks both held.  We tolerate a race to clear it here.
1611		 */
1612		if (!(bp->b_vflags & BV_BKGRDINPROG))
1613			bp->b_flags |= B_RELBUF;
1614	}
1615
1616	/*
1617	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
1618 * constituted, not even NFS buffers now.  Two flags affect this.  If
1619	 * B_INVAL, the struct buf is invalidated but the VM object is kept
1620	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
1621	 *
1622	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
1623	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
1624	 * buffer is also B_INVAL because it hits the re-dirtying code above.
1625	 *
1626	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
1627	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
1628	 * the commit state and we cannot afford to lose the buffer. If the
1629	 * buffer has a background write in progress, we need to keep it
1630	 * around to prevent it from being reconstituted and starting a second
1631	 * background write.
1632	 */
1633	if ((bp->b_flags & B_VMIO)
1634	    && !(bp->b_vp->v_mount != NULL &&
1635		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
1636		 !vn_isdisk(bp->b_vp, NULL) &&
1637		 (bp->b_flags & B_DELWRI))
1638	    ) {
1639
1640		int i, j, resid;
1641		vm_page_t m;
1642		off_t foff;
1643		vm_pindex_t poff;
1644		vm_object_t obj;
1645
1646		obj = bp->b_bufobj->bo_object;
1647
1648		/*
1649		 * Get the base offset and length of the buffer.  Note that
1650		 * in the VMIO case if the buffer block size is not
1651		 * page-aligned, the b_data pointer may not be page-aligned.
1652		 * But our b_pages[] array *IS* page aligned.
1653		 *
1654		 * Block sizes less than DEV_BSIZE (usually 512) are not
1655		 * supported due to the page granularity bits (m->valid,
1656		 * m->dirty, etc...).
1657		 *
1658		 * See man buf(9) for more information
1659		 */
1660		resid = bp->b_bufsize;
1661		foff = bp->b_offset;
1662		for (i = 0; i < bp->b_npages; i++) {
1663			int had_bogus = 0;
1664
1665			m = bp->b_pages[i];
1666
1667			/*
1668			 * If we hit a bogus page, fixup *all* the bogus pages
1669			 * now.
1670			 */
1671			if (m == bogus_page) {
1672				poff = OFF_TO_IDX(bp->b_offset);
1673				had_bogus = 1;
1674
1675				VM_OBJECT_RLOCK(obj);
1676				for (j = i; j < bp->b_npages; j++) {
1677					vm_page_t mtmp;
1678					mtmp = bp->b_pages[j];
1679					if (mtmp == bogus_page) {
1680						mtmp = vm_page_lookup(obj, poff + j);
1681						if (!mtmp) {
1682							panic("brelse: page missing\n");
1683						}
1684						bp->b_pages[j] = mtmp;
1685					}
1686				}
1687				VM_OBJECT_RUNLOCK(obj);
1688
1689				if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
1690					BUF_CHECK_MAPPED(bp);
1691					pmap_qenter(
1692					    trunc_page((vm_offset_t)bp->b_data),
1693					    bp->b_pages, bp->b_npages);
1694				}
1695				m = bp->b_pages[i];
1696			}
1697			if ((bp->b_flags & B_NOCACHE) ||
1698			    (bp->b_ioflags & BIO_ERROR &&
1699			     bp->b_iocmd == BIO_READ)) {
1700				int poffset = foff & PAGE_MASK;
1701				int presid = resid > (PAGE_SIZE - poffset) ?
1702					(PAGE_SIZE - poffset) : resid;
1703
1704				KASSERT(presid >= 0, ("brelse: extra page"));
1705				VM_OBJECT_WLOCK(obj);
1706				while (vm_page_xbusied(m)) {
1707					vm_page_lock(m);
1708					VM_OBJECT_WUNLOCK(obj);
1709					vm_page_busy_sleep(m, "mbncsh");
1710					VM_OBJECT_WLOCK(obj);
1711				}
1712				if (pmap_page_wired_mappings(m) == 0)
1713					vm_page_set_invalid(m, poffset, presid);
1714				VM_OBJECT_WUNLOCK(obj);
1715				if (had_bogus)
1716					printf("avoided corruption bug in bogus_page/brelse code\n");
1717			}
1718			resid -= PAGE_SIZE - (foff & PAGE_MASK);
1719			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1720		}
1721		if (bp->b_flags & (B_INVAL | B_RELBUF))
1722			vfs_vmio_release(bp);
1723
1724	} else if (bp->b_flags & B_VMIO) {
1725
1726		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1727			vfs_vmio_release(bp);
1728		}
1729
1730	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
1731		if (bp->b_bufsize != 0)
1732			allocbuf(bp, 0);
1733		if (bp->b_vp != NULL)
1734			brelvp(bp);
1735	}
1736
1737	/*
1738	 * If the buffer has junk contents signal it and eventually
1739	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
1740	 * doesn't find it.
1741	 */
1742	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
1743	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
1744		bp->b_flags |= B_INVAL;
1745	if (bp->b_flags & B_INVAL) {
1746		if (bp->b_flags & B_DELWRI)
1747			bundirty(bp);
1748		if (bp->b_vp)
1749			brelvp(bp);
1750	}
1751
1752	/* buffers with no memory */
1753	if (bp->b_bufsize == 0) {
1754		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1755		if (bp->b_vflags & BV_BKGRDINPROG)
1756			panic("losing buffer 1");
1757		if (bp->b_kvasize)
1758			qindex = QUEUE_EMPTYKVA;
1759		else
1760			qindex = QUEUE_EMPTY;
1761		bp->b_flags |= B_AGE;
1762	/* buffers with junk contents */
1763	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1764	    (bp->b_ioflags & BIO_ERROR)) {
1765		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1766		if (bp->b_vflags & BV_BKGRDINPROG)
1767			panic("losing buffer 2");
1768		qindex = QUEUE_CLEAN;
1769		bp->b_flags |= B_AGE;
1770	/* remaining buffers */
1771	} else if (bp->b_flags & B_DELWRI)
1772		qindex = QUEUE_DIRTY;
1773	else
1774		qindex = QUEUE_CLEAN;
1775
1776	binsfree(bp, qindex);
1777
1778	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1779	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1780		panic("brelse: not dirty");
1781	/* unlock */
1782	BUF_UNLOCK(bp);
1783}
1784
1785/*
1786 * Release a buffer back to the appropriate queue but do not try to free
1787 * it.  The buffer is expected to be used again soon.
1788 *
1789 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1790 * biodone() to requeue an async I/O on completion.  It is also used when
1791 * known good buffers need to be requeued but we think we may need the data
1792 * again soon.
1793 *
1794 * XXX we should be able to leave the B_RELBUF hint set on completion.
1795 */
1796void
1797bqrelse(struct buf *bp)
1798{
1799	int qindex;
1800
1801	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1802	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1803	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1804
1805	if (BUF_LOCKRECURSED(bp)) {
1806		/* do not release to free list */
1807		BUF_UNLOCK(bp);
1808		return;
1809	}
1810	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1811
1812	if (bp->b_flags & B_MANAGED) {
1813		if (bp->b_flags & B_REMFREE)
1814			bremfreef(bp);
1815		goto out;
1816	}
1817
1818	/* buffers with stale but valid contents */
1819	if (bp->b_flags & B_DELWRI) {
1820		qindex = QUEUE_DIRTY;
1821	} else {
1822		if ((bp->b_flags & B_DELWRI) == 0 &&
1823		    (bp->b_xflags & BX_VNDIRTY))
1824			panic("bqrelse: not dirty");
1825		/*
1826		 * BKGRDINPROG can only be set with the buf and bufobj
1827		 * locks both held.  We tolerate a race to clear it here.
1828		 */
1829		if (buf_vm_page_count_severe() &&
1830		    (bp->b_vflags & BV_BKGRDINPROG) == 0) {
1831			/*
1832			 * We are too low on memory, we have to try to free
1833			 * the buffer (most importantly: the wired pages
1834			 * making up its backing store) *now*.
1835			 */
1836			brelse(bp);
1837			return;
1838		}
1839		qindex = QUEUE_CLEAN;
1840	}
1841	binsfree(bp, qindex);
1842
1843out:
1844	/* unlock */
1845	BUF_UNLOCK(bp);
1846}
1847
1848/* Give pages used by the bp back to the VM system (where possible) */
1849static void
1850vfs_vmio_release(struct buf *bp)
1851{
1852	int i;
1853	vm_page_t m;
1854
1855	if ((bp->b_flags & B_UNMAPPED) == 0) {
1856		BUF_CHECK_MAPPED(bp);
1857		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
1858	} else
1859		BUF_CHECK_UNMAPPED(bp);
1860	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
1861	for (i = 0; i < bp->b_npages; i++) {
1862		m = bp->b_pages[i];
1863		bp->b_pages[i] = NULL;
1864		/*
1865		 * In order to keep page LRU ordering consistent, put
1866		 * everything on the inactive queue.
1867		 */
1868		vm_page_lock(m);
1869		vm_page_unwire(m, 0);
1870
1871		/*
1872		 * Might as well free the page if we can and it has
1873		 * no valid data.  We also free the page if the
1874		 * buffer was used for direct I/O
1875		 */
1876		if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
1877			if (m->wire_count == 0 && !vm_page_busied(m))
1878				vm_page_free(m);
1879		} else if (bp->b_flags & B_DIRECT)
1880			vm_page_try_to_free(m);
1881		else if (buf_vm_page_count_severe())
1882			vm_page_try_to_cache(m);
1883		vm_page_unlock(m);
1884	}
1885	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
1886
1887	if (bp->b_bufsize) {
1888		bufspacewakeup();
1889		bp->b_bufsize = 0;
1890	}
1891	bp->b_npages = 0;
1892	bp->b_flags &= ~B_VMIO;
1893	if (bp->b_vp)
1894		brelvp(bp);
1895}
1896
1897/*
1898 * Check to see if a block at a particular lbn is available for a clustered
1899 * write.
1900 */
1901static int
1902vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
1903{
1904	struct buf *bpa;
1905	int match;
1906
1907	match = 0;
1908
1909	/* If the buf isn't in core skip it */
1910	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
1911		return (0);
1912
1913	/* If the buf is busy we don't want to wait for it */
1914	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1915		return (0);
1916
1917	/* Only cluster with valid clusterable delayed write buffers */
1918	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
1919	    (B_DELWRI | B_CLUSTEROK))
1920		goto done;
1921
1922	if (bpa->b_bufsize != size)
1923		goto done;
1924
1925	/*
1926	 * Check to see if it is in the expected place on disk and that the
1927	 * block has been mapped.
1928	 */
1929	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
1930		match = 1;
1931done:
1932	BUF_UNLOCK(bpa);
1933	return (match);
1934}
1935
1936/*
1937 *	vfs_bio_awrite:
1938 *
1939 *	Implement clustered async writes for clearing out B_DELWRI buffers.
1940 *	This is much better then the old way of writing only one buffer at
1941 *	This is much better than the old way of writing only one buffer at
1942 *	correct order, so we search for the cluster in both directions.
1943 */
1944int
1945vfs_bio_awrite(struct buf *bp)
1946{
1947	struct bufobj *bo;
1948	int i;
1949	int j;
1950	daddr_t lblkno = bp->b_lblkno;
1951	struct vnode *vp = bp->b_vp;
1952	int ncl;
1953	int nwritten;
1954	int size;
1955	int maxcl;
1956	int gbflags;
1957
1958	bo = &vp->v_bufobj;
1959	gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
1960	/*
1961	 * Right now we support clustered writing only to regular files.  If
1962	 * we find a clusterable block we could be in the middle of a cluster
1963	 * rather than at the beginning.
1964	 */
1965	if ((vp->v_type == VREG) &&
1966	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1967	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1968
1969		size = vp->v_mount->mnt_stat.f_iosize;
1970		maxcl = MAXPHYS / size;
1971
1972		BO_RLOCK(bo);
1973		for (i = 1; i < maxcl; i++)
1974			if (vfs_bio_clcheck(vp, size, lblkno + i,
1975			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
1976				break;
1977
1978		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
1979			if (vfs_bio_clcheck(vp, size, lblkno - j,
1980			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
1981				break;
1982		BO_RUNLOCK(bo);
1983		--j;
1984		ncl = i + j;
1985		/*
1986		 * this is a possible cluster write
1987		 */
1988		if (ncl != 1) {
1989			BUF_UNLOCK(bp);
1990			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
1991			    gbflags);
1992			return (nwritten);
1993		}
1994	}
1995	bremfree(bp);
1996	bp->b_flags |= B_ASYNC;
1997	/*
1998	 * default (old) behavior, writing out only one block
1999	 *
2000	 * XXX returns b_bufsize instead of b_bcount for nwritten?
2001	 */
2002	nwritten = bp->b_bufsize;
2003	(void) bwrite(bp);
2004
2005	return (nwritten);
2006}
2007
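/*
 * Note the KVA allocation for a buffer: for a mapped request the address
 * becomes b_kvabase; for a GB_UNMAPPED | GB_KVAALLOC request it is stashed
 * in b_kvaalloc and the buffer is marked B_UNMAPPED | B_KVAALLOC.  In both
 * cases b_kvasize is set to maxsize.
 */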
2008static void
2009setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
2010{
2011
2012	KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2013	    bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
2014	if ((gbflags & GB_UNMAPPED) == 0) {
2015		bp->b_kvabase = (caddr_t)addr;
2016	} else if ((gbflags & GB_KVAALLOC) != 0) {
2017		KASSERT((gbflags & GB_UNMAPPED) != 0,
2018		    ("GB_KVAALLOC without GB_UNMAPPED"));
2019		bp->b_kvaalloc = (caddr_t)addr;
2020		bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2021		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2022	}
2023	bp->b_kvasize = maxsize;
2024}
2025
2026/*
2027 * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
2028 * needed.
2029 */
2030static int
2031allocbufkva(struct buf *bp, int maxsize, int gbflags)
2032{
2033	vm_offset_t addr;
2034
2035	bfreekva(bp);
2036	addr = 0;
2037
2038	if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
2039		/*
2040		 * Buffer map is too fragmented.  Request the caller
2041		 * to defragment the map.
2042		 */
2043		atomic_add_int(&bufdefragcnt, 1);
2044		return (1);
2045	}
2046	setbufkva(bp, addr, maxsize, gbflags);
2047	atomic_add_long(&bufspace, bp->b_kvasize);
2048	return (0);
2049}
2050
2051/*
2052 * Ask the bufdaemon for help, or act as bufdaemon itself, when a
2053 * locked vnode is supplied.
2054 */
2055static void
2056getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
2057    int defrag)
2058{
2059	struct thread *td;
2060	char *waitmsg;
2061	int cnt, error, flags, norunbuf, wait;
2062
2063	mtx_assert(&bqclean, MA_OWNED);
2064
2065	if (defrag) {
2066		flags = VFS_BIO_NEED_BUFSPACE;
2067		waitmsg = "nbufkv";
2068	} else if (bufspace >= hibufspace) {
2069		waitmsg = "nbufbs";
2070		flags = VFS_BIO_NEED_BUFSPACE;
2071	} else {
2072		waitmsg = "newbuf";
2073		flags = VFS_BIO_NEED_ANY;
2074	}
2075	atomic_set_int(&needsbuffer, flags);
2076	mtx_unlock(&bqclean);
2077
2078	bd_speedup();	/* heeeelp */
2079	if ((gbflags & GB_NOWAIT_BD) != 0)
2080		return;
2081
2082	td = curthread;
2083	cnt = 0;
2084	wait = MNT_NOWAIT;
2085	rw_wlock(&nblock);
2086	while ((needsbuffer & flags) != 0) {
2087		if (vp != NULL && vp->v_type != VCHR &&
2088		    (td->td_pflags & TDP_BUFNEED) == 0) {
2089			rw_wunlock(&nblock);
2090			/*
2091			 * getblk() is called with a vnode locked, and
2092			 * a majority of the dirty buffers may well
2093			 * belong to that vnode.  Flushing those buffers
2094			 * makes progress that the buf_daemon, which
2095			 * cannot lock the vnode, is unable to achieve
2096			 * on its own.
2097			 */
2098			if (cnt++ > 2)
2099				wait = MNT_WAIT;
2100			ASSERT_VOP_LOCKED(vp, "bufd_helper");
2101			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
2102			    vn_lock(vp, LK_TRYUPGRADE);
2103			if (error == 0) {
2104				/* play bufdaemon */
2105				norunbuf = curthread_pflags_set(TDP_BUFNEED |
2106				    TDP_NORUNNINGBUF);
2107				VOP_FSYNC(vp, wait, td);
2108				atomic_add_long(&notbufdflushes, 1);
2109				curthread_pflags_restore(norunbuf);
2110			}
2111			rw_wlock(&nblock);
2112			if ((needsbuffer & flags) == 0)
2113				break;
2114		}
2115		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
2116		    (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
2117		if (error != 0)
2118			break;
2119	}
2120	rw_wunlock(&nblock);
2121}
2122
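/*
 * Strip a buffer selected for reuse of its old identity: release VMIO
 * pages and the vnode association for QUEUE_CLEAN buffers, drop any
 * credentials and dependencies, and reset the remaining fields.  The
 * KVA allocation (b_kvabase/b_kvasize) is preserved for the new user.
 */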
2123static void
2124getnewbuf_reuse_bp(struct buf *bp, int qindex)
2125{
2126
2127	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
2128	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
2129	     bp->b_kvasize, bp->b_bufsize, qindex);
2130	mtx_assert(&bqclean, MA_NOTOWNED);
2131
2132	/*
2133	 * Note: we no longer distinguish between VMIO and non-VMIO
2134	 * buffers.
2135	 */
2136	KASSERT((bp->b_flags & B_DELWRI) == 0,
2137	    ("delwri buffer %p found in queue %d", bp, qindex));
2138
2139	if (qindex == QUEUE_CLEAN) {
2140		if (bp->b_flags & B_VMIO) {
2141			bp->b_flags &= ~B_ASYNC;
2142			vfs_vmio_release(bp);
2143		}
2144		if (bp->b_vp != NULL)
2145			brelvp(bp);
2146	}
2147
2148	/*
2149	 * Get the rest of the buffer freed up.  b_kva* is still valid
2150	 * after this operation.
2151	 */
2152
2153	if (bp->b_rcred != NOCRED) {
2154		crfree(bp->b_rcred);
2155		bp->b_rcred = NOCRED;
2156	}
2157	if (bp->b_wcred != NOCRED) {
2158		crfree(bp->b_wcred);
2159		bp->b_wcred = NOCRED;
2160	}
2161	if (!LIST_EMPTY(&bp->b_dep))
2162		buf_deallocate(bp);
2163	if (bp->b_vflags & BV_BKGRDINPROG)
2164		panic("losing buffer 3");
2165	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
2166	    bp, bp->b_vp, qindex));
2167	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
2168	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
2169
2170	if (bp->b_bufsize)
2171		allocbuf(bp, 0);
2172
2173	bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
2174	bp->b_ioflags = 0;
2175	bp->b_xflags = 0;
2176	KASSERT((bp->b_flags & B_INFREECNT) == 0,
2177	    ("buf %p still counted as free?", bp));
2178	bp->b_vflags = 0;
2179	bp->b_vp = NULL;
2180	bp->b_blkno = bp->b_lblkno = 0;
2181	bp->b_offset = NOOFFSET;
2182	bp->b_iodone = 0;
2183	bp->b_error = 0;
2184	bp->b_resid = 0;
2185	bp->b_bcount = 0;
2186	bp->b_npages = 0;
2187	bp->b_dirtyoff = bp->b_dirtyend = 0;
2188	bp->b_bufobj = NULL;
2189	bp->b_pin_count = 0;
2190	bp->b_fsprivate1 = NULL;
2191	bp->b_fsprivate2 = NULL;
2192	bp->b_fsprivate3 = NULL;
2193
2194	LIST_INIT(&bp->b_dep);
2195}
2196
2197static int flushingbufs;
2198
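/*
 * Scan the EMPTY, EMPTYKVA and CLEAN queues for a buffer that can be
 * recycled, honoring the defrag/unmapped/metadata constraints.  On
 * success the buffer is returned locked and removed from its queue
 * with bqclean released; on failure NULL is returned with bqclean
 * still held so the caller can sleep waiting for buffers.
 */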
2199static struct buf *
2200getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
2201{
2202	struct buf *bp, *nbp;
2203	int nqindex, qindex, pass;
2204
2205	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
2206
2207	pass = 1;
2208restart:
2209	atomic_add_int(&getnewbufrestarts, 1);
2210
2211	/*
2212	 * Setup for scan.  If we do not have enough free buffers,
2213	 * we set up a degenerate case that immediately fails.  Note
2214	 * that if we are a specially marked process, we are allowed to
2215	 * dip into our reserves.
2216	 *
2217	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
2218	 * for the allocation of a mapped buffer.  For unmapped buffers,
2219	 * it is easiest to start with EMPTY outright.
2220	 *
2221	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
2222	 * However, there are a number of cases (defragging, reusing, ...)
2223	 * where we cannot back up.
2224	 */
2225	nbp = NULL;
2226	mtx_lock(&bqclean);
2227	if (!defrag && unmapped) {
2228		nqindex = QUEUE_EMPTY;
2229		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2230	}
2231	if (nbp == NULL) {
2232		nqindex = QUEUE_EMPTYKVA;
2233		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2234	}
2235
2236	/*
2237	 * If no EMPTYKVA buffers and we are either defragging or
2238	 * reusing, locate a CLEAN buffer to free or reuse.  If
2239	 * bufspace usage is low, skip this step so we can allocate a
2240	 * new buffer.
2241	 */
2242	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
2243		nqindex = QUEUE_CLEAN;
2244		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2245	}
2246
2247	/*
2248	 * If we could not find or were not allowed to reuse a CLEAN
2249	 * buffer, check to see if it is ok to use an EMPTY buffer.
2250	 * We can only use an EMPTY buffer if allocating its KVA would
2251	 * not otherwise run us out of buffer space.  No KVA is needed
2252	 * for the unmapped allocation.
2253	 */
2254	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
2255	    metadata)) {
2256		nqindex = QUEUE_EMPTY;
2257		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2258	}
2259
2260	/*
2261	 * All available buffers might be clean, retry ignoring the
2262	 * lobufspace as the last resort.
2263	 */
2264	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
2265		nqindex = QUEUE_CLEAN;
2266		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2267	}
2268
2269	/*
2270	 * Run scan, possibly freeing data and/or kva mappings on the fly
2271	 * depending.
2272	 */
2273	while ((bp = nbp) != NULL) {
2274		qindex = nqindex;
2275
2276		/*
2277		 * Calculate next bp (we can only use it if we do not
2278		 * block or do other fancy things).
2279		 */
2280		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
2281			switch (qindex) {
2282			case QUEUE_EMPTY:
2283				nqindex = QUEUE_EMPTYKVA;
2284				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2285				if (nbp != NULL)
2286					break;
2287				/* FALLTHROUGH */
2288			case QUEUE_EMPTYKVA:
2289				nqindex = QUEUE_CLEAN;
2290				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2291				if (nbp != NULL)
2292					break;
2293				/* FALLTHROUGH */
2294			case QUEUE_CLEAN:
2295				if (metadata && pass == 1) {
2296					pass = 2;
2297					nqindex = QUEUE_EMPTY;
2298					nbp = TAILQ_FIRST(
2299					    &bufqueues[QUEUE_EMPTY]);
2300				}
2301				/*
2302				 * nbp is NULL.
2303				 */
2304				break;
2305			}
2306		}
2307		/*
2308		 * If we are defragging then we need a buffer with
2309		 * b_kvasize != 0.  XXX this situation should no longer
2310		 * occur: if defrag is non-zero the buffer's b_kvasize
2311		 * should also be non-zero at this point.  XXX
2312		 */
2313		if (defrag && bp->b_kvasize == 0) {
2314			printf("Warning: defrag empty buffer %p\n", bp);
2315			continue;
2316		}
2317
2318		/*
2319		 * Start freeing the bp.  This is somewhat involved.  nbp
2320		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
2321		 */
2322		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2323			continue;
2324		/*
2325		 * BKGRDINPROG can only be set with the buf and bufobj
2326		 * locks both held.  We tolerate a race to clear it here.
2327		 */
2328		if (bp->b_vflags & BV_BKGRDINPROG) {
2329			BUF_UNLOCK(bp);
2330			continue;
2331		}
2332
2333		KASSERT(bp->b_qindex == qindex,
2334		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
2335
2336		bremfreel(bp);
2337		mtx_unlock(&bqclean);
2338		/*
2339		 * NOTE:  nbp is now entirely invalid.  We can only restart
2340		 * the scan from this point on.
2341		 */
2342
2343		getnewbuf_reuse_bp(bp, qindex);
2344		mtx_assert(&bqclean, MA_NOTOWNED);
2345
2346		/*
2347		 * If we are defragging then free the buffer.
2348		 */
2349		if (defrag) {
2350			bp->b_flags |= B_INVAL;
2351			bfreekva(bp);
2352			brelse(bp);
2353			defrag = 0;
2354			goto restart;
2355		}
2356
2357		/*
2358		 * Notify any waiters for the buffer lock about
2359		 * identity change by freeing the buffer.
2360		 */
2361		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
2362			bp->b_flags |= B_INVAL;
2363			bfreekva(bp);
2364			brelse(bp);
2365			goto restart;
2366		}
2367
2368		if (metadata)
2369			break;
2370
2371		 * If we are overcommitted then recover the buffer and its
2372		 * If we are overcomitted then recover the buffer and its
2373		 * KVM space.  This occurs in rare situations when multiple
2374		 * processes are blocked in getnewbuf() or allocbuf().
2375		 */
2376		if (bufspace >= hibufspace)
2377			flushingbufs = 1;
2378		if (flushingbufs && bp->b_kvasize != 0) {
2379			bp->b_flags |= B_INVAL;
2380			bfreekva(bp);
2381			brelse(bp);
2382			goto restart;
2383		}
2384		if (bufspace < lobufspace)
2385			flushingbufs = 0;
2386		break;
2387	}
2388	return (bp);
2389}
2390
2391/*
2392 *	getnewbuf:
2393 *
2394 *	Find and initialize a new buffer header, freeing up existing buffers
2395 *	in the bufqueues as necessary.  The new buffer is returned locked.
2396 *
2397 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
2398 *	buffer away, the caller must set B_INVAL prior to calling brelse().
2399 *
2400 *	We block if:
2401 *		We have insufficient buffer headers
2402 *		We have insufficient buffer space
2403 *		buffer_arena is too fragmented ( space reservation fails )
2404 *		If we have to flush dirty buffers ( but we try to avoid this )
2405 */
2406static struct buf *
2407getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
2408    int gbflags)
2409{
2410	struct buf *bp;
2411	int defrag, metadata;
2412
2413	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2414	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2415	if (!unmapped_buf_allowed)
2416		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2417
2418	defrag = 0;
2419	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2420	    vp->v_type == VCHR)
2421		metadata = 1;
2422	else
2423		metadata = 0;
2424	/*
2425	 * We can't afford to block since we might be holding a vnode lock,
2426	 * which may prevent system daemons from running.  We deal with
2427	 * low-memory situations by proactively returning memory and running
2428	 * async I/O rather than sync I/O.
2429	 */
2430	atomic_add_int(&getnewbufcalls, 1);
2431	atomic_subtract_int(&getnewbufrestarts, 1);
2432restart:
2433	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
2434	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
2435	if (bp != NULL)
2436		defrag = 0;
2437
2438	/*
2439	 * If we exhausted our list, sleep as appropriate.  We may have to
2440	 * wakeup various daemons and write out some dirty buffers.
2441	 *
2442	 * Generally we are sleeping due to insufficient buffer space.
2443	 */
2444	if (bp == NULL) {
2445		mtx_assert(&bqclean, MA_OWNED);
2446		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
2447		mtx_assert(&bqclean, MA_NOTOWNED);
2448	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
2449		mtx_assert(&bqclean, MA_NOTOWNED);
2450
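		/*
		 * An unmapped buffer without a KVA reservation was
		 * requested: drop whatever KVA the recycled buffer
		 * carried and point b_data at the unmapped_buf
		 * placeholder.
		 */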
2451		bfreekva(bp);
2452		bp->b_flags |= B_UNMAPPED;
2453		bp->b_kvabase = bp->b_data = unmapped_buf;
2454		bp->b_kvasize = maxsize;
2455		atomic_add_long(&bufspace, bp->b_kvasize);
2456		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2457		atomic_add_int(&bufreusecnt, 1);
2458	} else {
2459		mtx_assert(&bqclean, MA_NOTOWNED);
2460
2461		/*
2462		 * We finally have a valid bp.  We aren't quite out of the
2463		 * woods; we still have to reserve kva space.  In order
2464		 * to keep fragmentation sane we only allocate kva in
2465		 * BKVASIZE chunks.
2466		 */
2467		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2468
2469		if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
2470		    B_KVAALLOC)) == B_UNMAPPED) {
2471			if (allocbufkva(bp, maxsize, gbflags)) {
2472				defrag = 1;
2473				bp->b_flags |= B_INVAL;
2474				brelse(bp);
2475				goto restart;
2476			}
2477			atomic_add_int(&bufreusecnt, 1);
2478		} else if ((bp->b_flags & B_KVAALLOC) != 0 &&
2479		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
2480			/*
2481			 * If the reused buffer has KVA allocated,
2482			 * reassign b_kvaalloc to b_kvabase.
2483			 */
2484			bp->b_kvabase = bp->b_kvaalloc;
2485			bp->b_flags &= ~B_KVAALLOC;
2486			atomic_subtract_long(&unmapped_bufspace,
2487			    bp->b_kvasize);
2488			atomic_add_int(&bufreusecnt, 1);
2489		} else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2490		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
2491		    GB_KVAALLOC)) {
2492			/*
2493			 * The reused buffer already has its KVA mapped,
2494			 * but the request is for an unmapped buffer with
2495			 * KVA allocated.
2496			 */
2497			bp->b_kvaalloc = bp->b_kvabase;
2498			bp->b_data = bp->b_kvabase = unmapped_buf;
2499			bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2500			atomic_add_long(&unmapped_bufspace,
2501			    bp->b_kvasize);
2502			atomic_add_int(&bufreusecnt, 1);
2503		}
2504		if ((gbflags & GB_UNMAPPED) == 0) {
2505			bp->b_saveaddr = bp->b_kvabase;
2506			bp->b_data = bp->b_saveaddr;
2507			bp->b_flags &= ~B_UNMAPPED;
2508			BUF_CHECK_MAPPED(bp);
2509		}
2510	}
2511	return (bp);
2512}
2513
2514/*
2515 *	buf_daemon:
2516 *
2517 *	buffer flushing daemon.  Buffers are normally flushed by the
2518 *	update daemon but if it cannot keep up this process starts to
2519 *	take the load in an attempt to prevent getnewbuf() from blocking.
2520 */
2521
2522static struct kproc_desc buf_kp = {
2523	"bufdaemon",
2524	buf_daemon,
2525	&bufdaemonproc
2526};
2527SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2528
2529static int
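/*
 * Flush up to "target" dirty buffers.  Buffers with rollback dependencies
 * are skipped on the first pass; if nothing could be flushed that way, a
 * second pass writes them anyway so that some progress is made.  Returns
 * the number of buffers flushed.
 */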
2530buf_flush(int target)
2531{
2532	int flushed;
2533
2534	flushed = flushbufqueues(target, 0);
2535	if (flushed == 0) {
2536		/*
2537		 * Could not find any buffers without rollback
2538		 * dependencies, so just write the first one
2539		 * in the hopes of eventually making progress.
2540		 */
2541		flushed = flushbufqueues(target, 1);
2542	}
2543	return (flushed);
2544}
2545
2546static void
2547buf_daemon()
2548{
2549	int lodirty;
2550
2551	/*
2552	 * This process needs to be suspended prior to shutdown sync.
2553	 */
2554	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2555	    SHUTDOWN_PRI_LAST);
2556
2557	/*
2558	 * This process is allowed to take the buffer cache to the limit
2559	 */
2560	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2561	mtx_lock(&bdlock);
2562	for (;;) {
2563		bd_request = 0;
2564		mtx_unlock(&bdlock);
2565
2566		kproc_suspend_check(bufdaemonproc);
2567		lodirty = lodirtybuffers;
2568		if (bd_speedupreq) {
2569			lodirty = numdirtybuffers / 2;
2570			bd_speedupreq = 0;
2571		}
2572		/*
2573		 * Do the flush.  Limit the amount of in-transit I/O we
2574		 * allow to build up, otherwise we would completely saturate
2575		 * the I/O system.
2576		 */
2577		while (numdirtybuffers > lodirty) {
2578			if (buf_flush(numdirtybuffers - lodirty) == 0)
2579				break;
2580			kern_yield(PRI_USER);
2581		}
2582
2583		/*
2584		 * Only clear bd_request if we have reached our low water
2585		 * mark.  The buf_daemon normally waits 1 second and
2586		 * then incrementally flushes any dirty buffers that have
2587		 * built up, within reason.
2588		 *
2589		 * If we were unable to hit our low water mark and couldn't
2590		 * find any flushable buffers, we sleep for a short period
2591		 * to avoid endless loops on unlockable buffers.
2592		 */
2593		mtx_lock(&bdlock);
2594		if (numdirtybuffers <= lodirtybuffers) {
2595			/*
2596			 * We reached our low water mark, reset the
2597			 * request and sleep until we are needed again.
2598			 * The sleep is just so the suspend code works.
2599			 */
2600			bd_request = 0;
2601			/*
2602			 * Do an extra wakeup in case dirty threshold
2603			 * changed via sysctl and the explicit transition
2604			 * out of shortfall was missed.
2605			 */
2606			bdirtywakeup();
2607			if (runningbufspace <= lorunningspace)
2608				runningwakeup();
2609			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
2610		} else {
2611			/*
2612			 * We couldn't find any flushable dirty buffers but
2613			 * still have too many dirty buffers, so we
2614			 * have to sleep and try again.  (rare)
2615			 */
2616			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
2617		}
2618	}
2619}
2620
2621/*
2622 *	flushbufqueues:
2623 *
2624 *	Try to flush a buffer in the dirty queue.  We must be careful to
2625 *	free up B_INVAL buffers instead of writing them, which NFS is
2626 *	particularly sensitive to.
2627 */
2628static int flushwithdeps = 0;
2629SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
2630    0, "Number of buffers flushed with dependencies that require rollbacks");
2631
2632static int
2633flushbufqueues(int target, int flushdeps)
2634{
2635	struct buf *sentinel;
2636	struct vnode *vp;
2637	struct mount *mp;
2638	struct buf *bp;
2639	int hasdeps;
2640	int flushed;
2641	int queue;
2642	int error;
2643
2644	flushed = 0;
2645	queue = QUEUE_DIRTY;
2646	bp = NULL;
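	/*
	 * Walk the dirty queue using a sentinel buffer so that the queue
	 * lock can be dropped while each candidate is examined; the
	 * sentinel is reinserted after every buffer visited and marks
	 * where the scan resumes.
	 */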
2647	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2648	sentinel->b_qindex = QUEUE_SENTINEL;
2649	mtx_lock(&bqdirty);
2650	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2651	mtx_unlock(&bqdirty);
2652	while (flushed != target) {
2653		maybe_yield();
2654		mtx_lock(&bqdirty);
2655		bp = TAILQ_NEXT(sentinel, b_freelist);
2656		if (bp != NULL) {
2657			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2658			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2659			    b_freelist);
2660		} else {
2661			mtx_unlock(&bqdirty);
2662			break;
2663		}
2664		KASSERT(bp->b_qindex != QUEUE_SENTINEL,
2665		    ("parallel calls to flushbufqueues() bp %p", bp));
2666		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
2667		mtx_unlock(&bqdirty);
2668		if (error != 0)
2669			continue;
2670		if (bp->b_pin_count > 0) {
2671			BUF_UNLOCK(bp);
2672			continue;
2673		}
2674		/*
2675		 * BKGRDINPROG can only be set with the buf and bufobj
2676		 * locks both held.  We tolerate a race to clear it here.
2677		 */
2678		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
2679		    (bp->b_flags & B_DELWRI) == 0) {
2680			BUF_UNLOCK(bp);
2681			continue;
2682		}
2683		if (bp->b_flags & B_INVAL) {
2684			bremfreef(bp);
2685			brelse(bp);
2686			flushed++;
2687			continue;
2688		}
2689
2690		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
2691			if (flushdeps == 0) {
2692				BUF_UNLOCK(bp);
2693				continue;
2694			}
2695			hasdeps = 1;
2696		} else
2697			hasdeps = 0;
2698		/*
2699		 * We must hold the lock on a vnode before writing
2700		 * one of its buffers. Otherwise we may confuse, or
2701		 * in the case of a snapshot vnode, deadlock the
2702		 * system.
2703		 *
2704		 * The lock order here is the reverse of the normal
2705		 * order of vnode lock followed by buf lock.  This is ok because
2706		 * the NOWAIT will prevent deadlock.
2707		 */
2708		vp = bp->b_vp;
2709		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2710			BUF_UNLOCK(bp);
2711			continue;
2712		}
2713		error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
2714		if (error == 0) {
2715			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
2716			    bp, bp->b_vp, bp->b_flags);
2717			vfs_bio_awrite(bp);
2718			vn_finished_write(mp);
2719			VOP_UNLOCK(vp, 0);
2720			flushwithdeps += hasdeps;
2721			flushed++;
2722			if (runningbufspace > hirunningspace)
2723				waitrunningbufspace();
2724			continue;
2725		}
2726		vn_finished_write(mp);
2727		BUF_UNLOCK(bp);
2728	}
2729	mtx_lock(&bqdirty);
2730	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2731	mtx_unlock(&bqdirty);
2732	free(sentinel, M_TEMP);
2733	return (flushed);
2734}
2735
2736/*
2737 * Check to see if a block is currently memory resident.
2738 */
2739struct buf *
2740incore(struct bufobj *bo, daddr_t blkno)
2741{
2742	struct buf *bp;
2743
2744	BO_RLOCK(bo);
2745	bp = gbincore(bo, blkno);
2746	BO_RUNLOCK(bo);
2747	return (bp);
2748}
2749
2750/*
2751 * Returns true if no I/O is needed to access the
2752 * associated VM object.  This is like incore except
2753 * it also hunts around in the VM system for the data.
2754 */
2755
2756static int
2757inmem(struct vnode * vp, daddr_t blkno)
2758{
2759	vm_object_t obj;
2760	vm_offset_t toff, tinc, size;
2761	vm_page_t m;
2762	vm_ooffset_t off;
2763
2764	ASSERT_VOP_LOCKED(vp, "inmem");
2765
2766	if (incore(&vp->v_bufobj, blkno))
2767		return 1;
2768	if (vp->v_mount == NULL)
2769		return 0;
2770	obj = vp->v_object;
2771	if (obj == NULL)
2772		return (0);
2773
2774	size = PAGE_SIZE;
2775	if (size > vp->v_mount->mnt_stat.f_iosize)
2776		size = vp->v_mount->mnt_stat.f_iosize;
2777	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2778
2779	VM_OBJECT_RLOCK(obj);
2780	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2781		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2782		if (!m)
2783			goto notinmem;
2784		tinc = size;
2785		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2786			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2787		if (vm_page_is_valid(m,
2788		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2789			goto notinmem;
2790	}
2791	VM_OBJECT_RUNLOCK(obj);
2792	return 1;
2793
2794notinmem:
2795	VM_OBJECT_RUNLOCK(obj);
2796	return (0);
2797}
2798
2799/*
2800 * Set the dirty range for a buffer based on the status of the dirty
2801 * bits in the pages comprising the buffer.  The range is limited
2802 * to the size of the buffer.
2803 *
2804 * Tell the VM system that the pages associated with this buffer
2805 * are clean.  This is used for delayed writes where the data is
2806 * going to go to disk eventually without additional VM intervention.
2807 *
2808 * Note that while we only really need to clean through to b_bcount, we
2809 * just go ahead and clean through to b_bufsize.
2810 */
2811static void
2812vfs_clean_pages_dirty_buf(struct buf *bp)
2813{
2814	vm_ooffset_t foff, noff, eoff;
2815	vm_page_t m;
2816	int i;
2817
2818	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
2819		return;
2820
2821	foff = bp->b_offset;
2822	KASSERT(bp->b_offset != NOOFFSET,
2823	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
2824
2825	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
2826	vfs_drain_busy_pages(bp);
2827	vfs_setdirty_locked_object(bp);
2828	for (i = 0; i < bp->b_npages; i++) {
2829		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2830		eoff = noff;
2831		if (eoff > bp->b_offset + bp->b_bufsize)
2832			eoff = bp->b_offset + bp->b_bufsize;
2833		m = bp->b_pages[i];
2834		vfs_page_set_validclean(bp, foff, m);
2835		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2836		foff = noff;
2837	}
2838	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
2839}
2840
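/*
 * Determine the range of the buffer dirtied directly through the VM
 * system (by inspecting the pages' dirty bits) and merge it into
 * b_dirtyoff/b_dirtyend.  The backing VM object must be write-locked
 * by the caller.
 */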
2841static void
2842vfs_setdirty_locked_object(struct buf *bp)
2843{
2844	vm_object_t object;
2845	int i;
2846
2847	object = bp->b_bufobj->bo_object;
2848	VM_OBJECT_ASSERT_WLOCKED(object);
2849
2850	/*
2851	 * We qualify the scan for modified pages on whether the
2852	 * object has been flushed yet.
2853	 */
2854	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
2855		vm_offset_t boffset;
2856		vm_offset_t eoffset;
2857
2858		/*
2859		 * test the pages to see if they have been modified directly
2860		 * by users through the VM system.
2861		 */
2862		for (i = 0; i < bp->b_npages; i++)
2863			vm_page_test_dirty(bp->b_pages[i]);
2864
2865		/*
2866		 * Calculate the encompassing dirty range, boffset and eoffset,
2867		 * (eoffset - boffset) bytes.
2868		 */
2869
2870		for (i = 0; i < bp->b_npages; i++) {
2871			if (bp->b_pages[i]->dirty)
2872				break;
2873		}
2874		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2875
2876		for (i = bp->b_npages - 1; i >= 0; --i) {
2877			if (bp->b_pages[i]->dirty) {
2878				break;
2879			}
2880		}
2881		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2882
2883		/*
2884		 * Fit it to the buffer.
2885		 */
2886
2887		if (eoffset > bp->b_bcount)
2888			eoffset = bp->b_bcount;
2889
2890		/*
2891		 * If we have a good dirty range, merge with the existing
2892		 * dirty range.
2893		 */
2894
2895		if (boffset < eoffset) {
2896			if (bp->b_dirtyoff > boffset)
2897				bp->b_dirtyoff = boffset;
2898			if (bp->b_dirtyend < eoffset)
2899				bp->b_dirtyend = eoffset;
2900		}
2901	}
2902}
2903
2904/*
2905 * Allocate the KVA mapping for an existing buffer. It handles the
2906 * cases of both B_UNMAPPED buffer, and buffer with the preallocated
2907 * KVA which is not mapped (B_KVAALLOC).
2908 */
2909static void
2910bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
2911{
2912	struct buf *scratch_bp;
2913	int bsize, maxsize, need_mapping, need_kva;
2914	off_t offset;
2915
2916	need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
2917	    (gbflags & GB_UNMAPPED) == 0;
2918	need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
2919	    (gbflags & GB_KVAALLOC) != 0;
2920	if (!need_mapping && !need_kva)
2921		return;
2922
2923	BUF_CHECK_UNMAPPED(bp);
2924
2925	if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
2926		/*
2927		 * Buffer is not mapped, but the KVA was already
2928		 * reserved at the time of the instantiation.  Use the
2929		 * allocated space.
2930		 */
2931		bp->b_flags &= ~B_KVAALLOC;
2932		KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
2933		bp->b_kvabase = bp->b_kvaalloc;
2934		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
2935		goto has_addr;
2936	}
2937
2938	/*
2939	 * Calculate the amount of the address space we would reserve
2940	 * if the buffer was mapped.
2941	 */
2942	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
2943	offset = blkno * bsize;
2944	maxsize = size + (offset & PAGE_MASK);
2945	maxsize = imax(maxsize, bsize);
2946
2947mapping_loop:
2948	if (allocbufkva(bp, maxsize, gbflags)) {
2949		/*
2950		 * Request defragmentation. getnewbuf() returns us the
2951		 * allocated space by the scratch buffer KVA.
2952		 */
2953		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
2954		    (GB_UNMAPPED | GB_KVAALLOC));
2955		if (scratch_bp == NULL) {
2956			if ((gbflags & GB_NOWAIT_BD) != 0) {
2957				/*
2958				 * XXXKIB: defragmentation cannot
2959				 * succeed, not sure what else to do.
2960				 */
2961				panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
2962			}
2963			atomic_add_int(&mappingrestarts, 1);
2964			goto mapping_loop;
2965		}
2966		KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
2967		    ("scratch bp !B_KVAALLOC %p", scratch_bp));
2968		setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
2969		    scratch_bp->b_kvasize, gbflags);
2970
2971		/* Get rid of the scratch buffer. */
2972		scratch_bp->b_kvasize = 0;
2973		scratch_bp->b_flags |= B_INVAL;
2974		scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
2975		brelse(scratch_bp);
2976	}
2977	if (!need_mapping)
2978		return;
2979
2980has_addr:
2981	bp->b_saveaddr = bp->b_kvabase;
2982	bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
2983	bp->b_flags &= ~B_UNMAPPED;
2984	BUF_CHECK_MAPPED(bp);
2985	bpmap_qenter(bp);
2986}
2987
2988/*
2989 *	getblk:
2990 *
2991 *	Get a block given a specified block and offset into a file/device.
2992 *	The buffers B_DONE bit will be cleared on return, making it almost
2993 *	The buffer's B_DONE bit will be cleared on return, making it almost
2994 *	return.  The caller should clear B_INVAL prior to initiating a
2995 *	READ.
2996 *
2997 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2998 *	an existing buffer.
2999 *
3000 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
3001 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
3002 *	and then cleared based on the backing VM.  If the previous buffer is
3003 *	non-0-sized but invalid, B_CACHE will be cleared.
3004 *
3005 *	If getblk() must create a new buffer, the new buffer is returned with
3006 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
3007 *	case it is returned with B_INVAL clear and B_CACHE set based on the
3008 *	backing VM.
3009 *
3010 *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
3011 *	B_CACHE bit is clear.
3012 *
3013 *	What this means, basically, is that the caller should use B_CACHE to
3014 *	determine whether the buffer is fully valid or not and should clear
3015 *	B_INVAL prior to issuing a read.  If the caller intends to validate
3016 *	the buffer by loading its data area with something, the caller needs
3017 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
3018 *	the caller should set B_CACHE ( as an optimization ), else the caller
3019 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
3020 *	a write attempt or if it was a successfull read.  If the caller
3021 *	a write attempt or if it was a successful read.  If the caller
3022 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
3023 */
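/*
 * Illustrative sketch only (not an interface defined here): a typical
 * synchronous read through getblk(), with "vp" a locked vnode and
 * "lbn"/"bsize" placeholder names for the logical block number and
 * block size, might look roughly like:
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		bp->b_iooffset = dbtob(bp->b_blkno);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *	}
 *
 * bread() and breadn() implement the canonical version of this pattern.
 */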
3024struct buf *
3025getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
3026    int flags)
3027{
3028	struct buf *bp;
3029	struct bufobj *bo;
3030	int bsize, error, maxsize, vmio;
3031	off_t offset;
3032
3033	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
3034	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
3035	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
3036	ASSERT_VOP_LOCKED(vp, "getblk");
3037	if (size > MAXBSIZE)
3038		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
3039	if (!unmapped_buf_allowed)
3040		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3041
3042	bo = &vp->v_bufobj;
3043loop:
3044	BO_RLOCK(bo);
3045	bp = gbincore(bo, blkno);
3046	if (bp != NULL) {
3047		int lockflags;
3048		/*
3049		 * Buffer is in-core.  If the buffer is neither busy nor managed,
3050		 * it must be on a queue.
3051		 */
3052		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
3053
3054		if (flags & GB_LOCK_NOWAIT)
3055			lockflags |= LK_NOWAIT;
3056
3057		error = BUF_TIMELOCK(bp, lockflags,
3058		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
3059
3060		/*
3061		 * If we slept and got the lock we have to restart in case
3062		 * the buffer changed identities.
3063		 */
3064		if (error == ENOLCK)
3065			goto loop;
3066		/* We timed out or were interrupted. */
3067		else if (error)
3068			return (NULL);
3069		/* If recursed, assume caller knows the rules. */
3070		else if (BUF_LOCKRECURSED(bp))
3071			goto end;
3072
3073		/*
3074		 * The buffer is locked.  B_CACHE is cleared if the buffer is
3075		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
3076		 * and for a VMIO buffer B_CACHE is adjusted according to the
3077		 * backing VM cache.
3078		 */
3079		if (bp->b_flags & B_INVAL)
3080			bp->b_flags &= ~B_CACHE;
3081		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
3082			bp->b_flags |= B_CACHE;
3083		if (bp->b_flags & B_MANAGED)
3084			MPASS(bp->b_qindex == QUEUE_NONE);
3085		else
3086			bremfree(bp);
3087
3088		/*
3089		 * Check for size inconsistencies for the non-VMIO case.
3090		 */
3091		if (bp->b_bcount != size) {
3092			if ((bp->b_flags & B_VMIO) == 0 ||
3093			    (size > bp->b_kvasize)) {
3094				if (bp->b_flags & B_DELWRI) {
3095					/*
3096					 * If the buffer is pinned and the caller
3097					 * does not want to sleep waiting for it to
3098					 * be unpinned, bail out.
3099					 */
3100					if (bp->b_pin_count > 0) {
3101						if (flags & GB_LOCK_NOWAIT) {
3102							bqrelse(bp);
3103							return (NULL);
3104						} else {
3105							bunpin_wait(bp);
3106						}
3107					}
3108					bp->b_flags |= B_NOCACHE;
3109					bwrite(bp);
3110				} else {
3111					if (LIST_EMPTY(&bp->b_dep)) {
3112						bp->b_flags |= B_RELBUF;
3113						brelse(bp);
3114					} else {
3115						bp->b_flags |= B_NOCACHE;
3116						bwrite(bp);
3117					}
3118				}
3119				goto loop;
3120			}
3121		}
3122
3123		/*
3124		 * Handle the case of unmapped buffer which should
3125		 * become mapped, or the buffer for which KVA
3126		 * reservation is requested.
3127		 */
3128		bp_unmapped_get_kva(bp, blkno, size, flags);
3129
3130		/*
3131		 * If the size is inconsistent in the VMIO case, we can resize
3132		 * the buffer.  This might lead to B_CACHE getting set or
3133		 * cleared.  If the size has not changed, B_CACHE remains
3134		 * unchanged from its previous state.
3135		 */
3136		if (bp->b_bcount != size)
3137			allocbuf(bp, size);
3138
3139		KASSERT(bp->b_offset != NOOFFSET,
3140		    ("getblk: no buffer offset"));
3141
3142		/*
3143		 * A buffer with B_DELWRI set and B_CACHE clear must
3144		 * be committed before we can return the buffer in
3145		 * order to prevent the caller from issuing a read
3146		 * ( due to B_CACHE not being set ) and overwriting
3147		 * it.
3148		 *
3149		 * Most callers, including NFS and FFS, need this to
3150		 * operate properly either because they assume they
3151		 * can issue a read if B_CACHE is not set, or because
3152		 * ( for example ) an uncached B_DELWRI might loop due
3153		 * to softupdates re-dirtying the buffer.  In the latter
3154		 * case, B_CACHE is set after the first write completes,
3155		 * preventing further loops.
3156		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
3157		 * above while extending the buffer, we cannot allow the
3158		 * buffer to remain with B_CACHE set after the write
3159		 * completes or it will represent a corrupt state.  To
3160		 * deal with this we set B_NOCACHE to scrap the buffer
3161		 * after the write.
3162		 *
3163		 * We might be able to do something fancy, like setting
3164		 * B_CACHE in bwrite() except if B_DELWRI is already set,
3165		 * so the below call doesn't set B_CACHE, but that gets real
3166		 * confusing.  This is much easier.
3167		 */
3168
3169		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
3170			bp->b_flags |= B_NOCACHE;
3171			bwrite(bp);
3172			goto loop;
3173		}
3174		bp->b_flags &= ~B_DONE;
3175	} else {
3176		/*
3177		 * Buffer is not in-core, create new buffer.  The buffer
3178		 * returned by getnewbuf() is locked.  Note that the returned
3179		 * buffer is also considered valid (not marked B_INVAL).
3180		 */
3181		BO_RUNLOCK(bo);
3182		/*
3183		 * If the user does not want us to create the buffer, bail out
3184		 * here.
3185		 */
3186		if (flags & GB_NOCREAT)
3187			return NULL;
3188		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
3189			return NULL;
3190
3191		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
3192		offset = blkno * bsize;
3193		vmio = vp->v_object != NULL;
3194		if (vmio) {
3195			maxsize = size + (offset & PAGE_MASK);
3196		} else {
3197			maxsize = size;
3198			/* Do not allow non-VMIO unmapped buffers. */
3199			flags &= ~GB_UNMAPPED;
3200		}
3201		maxsize = imax(maxsize, bsize);
3202
3203		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
3204		if (bp == NULL) {
3205			if (slpflag || slptimeo)
3206				return NULL;
3207			goto loop;
3208		}
3209
3210		/*
3211		 * This code is used to make sure that a buffer is not
3212		 * created while the getnewbuf routine is blocked.
3213		 * This can be a problem whether the vnode is locked or not.
3214		 * If the buffer is created out from under us, we have to
3215		 * throw away the one we just created.
3216		 *
3217		 * Note: this must occur before we associate the buffer
3218		 * with the vp especially considering limitations in
3219		 * the splay tree implementation when dealing with duplicate
3220		 * lblkno's.
3221		 */
3222		BO_LOCK(bo);
3223		if (gbincore(bo, blkno)) {
3224			BO_UNLOCK(bo);
3225			bp->b_flags |= B_INVAL;
3226			brelse(bp);
3227			goto loop;
3228		}
3229
3230		/*
3231		 * Insert the buffer into the hash, so that it can
3232		 * be found by incore.
3233		 */
3234		bp->b_blkno = bp->b_lblkno = blkno;
3235		bp->b_offset = offset;
3236		bgetvp(vp, bp);
3237		BO_UNLOCK(bo);
3238
3239		/*
3240		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
3241		 * buffer size starts out as 0, B_CACHE will be set by
3242		 * allocbuf() for the VMIO case prior to it testing the
3243		 * backing store for validity.
3244		 */
3245
3246		if (vmio) {
3247			bp->b_flags |= B_VMIO;
3248			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
3249			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
3250			    bp, vp->v_object, bp->b_bufobj->bo_object));
3251		} else {
3252			bp->b_flags &= ~B_VMIO;
3253			KASSERT(bp->b_bufobj->bo_object == NULL,
3254			    ("ARGH! has b_bufobj->bo_object %p %p\n",
3255			    bp, bp->b_bufobj->bo_object));
3256			BUF_CHECK_MAPPED(bp);
3257		}
3258
3259		allocbuf(bp, size);
3260		bp->b_flags &= ~B_DONE;
3261	}
3262	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3263	BUF_ASSERT_HELD(bp);
3264end:
3265	KASSERT(bp->b_bufobj == bo,
3266	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3267	return (bp);
3268}
3269
3270/*
3271 * Get an empty, disassociated buffer of given size.  The buffer is initially
3272 * set to B_INVAL.
3273 */
3274struct buf *
3275geteblk(int size, int flags)
3276{
3277	struct buf *bp;
3278	int maxsize;
3279
3280	maxsize = (size + BKVAMASK) & ~BKVAMASK;
3281	while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
3282		if ((flags & GB_NOWAIT_BD) &&
3283		    (curthread->td_pflags & TDP_BUFNEED) != 0)
3284			return (NULL);
3285	}
3286	allocbuf(bp, size);
3287	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
3288	BUF_ASSERT_HELD(bp);
3289	return (bp);
3290}
3291
3292
3293/*
3294 * This code constitutes the buffer memory from either anonymous system
3295 * memory (in the case of non-VMIO operations) or from an associated
3296 * VM object (in the case of VMIO operations).  This code is able to
3297 * resize a buffer up or down.
3298 *
3299 * Note that this code is tricky, and has many complications to resolve
3300 * deadlock or inconsistent data situations.  Tread lightly!!!
3301 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3302 * the caller.  Calling this code willy nilly can result in the loss of data.
3303 *
3304 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3305 * B_CACHE for the non-VMIO case.
3306 */
3307
3308int
3309allocbuf(struct buf *bp, int size)
3310{
3311	int newbsize, mbsize;
3312	int i;
3313
3314	BUF_ASSERT_HELD(bp);
3315
3316	if (bp->b_kvasize < size)
3317		panic("allocbuf: buffer too small");
3318
3319	if ((bp->b_flags & B_VMIO) == 0) {
3320		caddr_t origbuf;
3321		int origbufsize;
3322		/*
3323		 * Just get anonymous memory from the kernel.  Don't
3324		 * mess with B_CACHE.
3325		 */
3326		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3327		if (bp->b_flags & B_MALLOC)
3328			newbsize = mbsize;
3329		else
3330			newbsize = round_page(size);
3331
3332		if (newbsize < bp->b_bufsize) {
3333			/*
3334			 * malloced buffers are not shrunk
3335			 */
3336			if (bp->b_flags & B_MALLOC) {
3337				if (newbsize) {
3338					bp->b_bcount = size;
3339				} else {
3340					free(bp->b_data, M_BIOBUF);
3341					if (bp->b_bufsize) {
3342						atomic_subtract_long(
3343						    &bufmallocspace,
3344						    bp->b_bufsize);
3345						bufspacewakeup();
3346						bp->b_bufsize = 0;
3347					}
3348					bp->b_saveaddr = bp->b_kvabase;
3349					bp->b_data = bp->b_saveaddr;
3350					bp->b_bcount = 0;
3351					bp->b_flags &= ~B_MALLOC;
3352				}
3353				return 1;
3354			}
3355			vm_hold_free_pages(bp, newbsize);
3356		} else if (newbsize > bp->b_bufsize) {
3357			/*
3358			 * We only use malloced memory on the first allocation,
3359			 * and revert to page-allocated memory when the buffer
3360			 * grows.
3361			 */
3362			/*
3363			 * There is a potential smp race here that could lead
3364			 * to bufmallocspace slightly passing the max.  It
3365			 * is probably extremely rare and not worth worrying
3366			 * over.
3367			 */
3368			if ( (bufmallocspace < maxbufmallocspace) &&
3369				(bp->b_bufsize == 0) &&
3370				(mbsize <= PAGE_SIZE/2)) {
3371
3372				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
3373				bp->b_bufsize = mbsize;
3374				bp->b_bcount = size;
3375				bp->b_flags |= B_MALLOC;
3376				atomic_add_long(&bufmallocspace, mbsize);
3377				return 1;
3378			}
3379			origbuf = NULL;
3380			origbufsize = 0;
3381			/*
3382			 * If the buffer is growing on its other-than-first allocation,
3383			 * then we revert to the page-allocation scheme.
3384			 */
3385			if (bp->b_flags & B_MALLOC) {
3386				origbuf = bp->b_data;
3387				origbufsize = bp->b_bufsize;
3388				bp->b_data = bp->b_kvabase;
3389				if (bp->b_bufsize) {
3390					atomic_subtract_long(&bufmallocspace,
3391					    bp->b_bufsize);
3392					bufspacewakeup();
3393					bp->b_bufsize = 0;
3394				}
3395				bp->b_flags &= ~B_MALLOC;
3396				newbsize = round_page(newbsize);
3397			}
3398			vm_hold_load_pages(
3399			    bp,
3400			    (vm_offset_t) bp->b_data + bp->b_bufsize,
3401			    (vm_offset_t) bp->b_data + newbsize);
3402			if (origbuf) {
3403				bcopy(origbuf, bp->b_data, origbufsize);
3404				free(origbuf, M_BIOBUF);
3405			}
3406		}
3407	} else {
3408		int desiredpages;
3409
3410		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3411		desiredpages = (size == 0) ? 0 :
3412			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3413
3414		if (bp->b_flags & B_MALLOC)
3415			panic("allocbuf: VMIO buffer can't be malloced");
3416		/*
3417		 * Set B_CACHE initially if buffer is 0 length or will become
3418		 * 0-length.
3419		 */
3420		if (size == 0 || bp->b_bufsize == 0)
3421			bp->b_flags |= B_CACHE;
3422
3423		if (newbsize < bp->b_bufsize) {
3424			/*
3425			 * DEV_BSIZE aligned new buffer size is less than the
3426			 * DEV_BSIZE aligned existing buffer size.  Figure out
3427			 * if we have to remove any pages.
3428			 */
3429			if (desiredpages < bp->b_npages) {
3430				vm_page_t m;
3431
3432				if ((bp->b_flags & B_UNMAPPED) == 0) {
3433					BUF_CHECK_MAPPED(bp);
3434					pmap_qremove((vm_offset_t)trunc_page(
3435					    (vm_offset_t)bp->b_data) +
3436					    (desiredpages << PAGE_SHIFT),
3437					    (bp->b_npages - desiredpages));
3438				} else
3439					BUF_CHECK_UNMAPPED(bp);
3440				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3441				for (i = desiredpages; i < bp->b_npages; i++) {
3442					/*
3443					 * the page is not freed here -- it
3444					 * is the responsibility of
3445					 * vnode_pager_setsize
3446					 */
3447					m = bp->b_pages[i];
3448					KASSERT(m != bogus_page,
3449					    ("allocbuf: bogus page found"));
3450					while (vm_page_sleep_if_busy(m,
3451					    "biodep"))
3452						continue;
3453
3454					bp->b_pages[i] = NULL;
3455					vm_page_lock(m);
3456					vm_page_unwire(m, 0);
3457					vm_page_unlock(m);
3458				}
3459				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3460				bp->b_npages = desiredpages;
3461			}
3462		} else if (size > bp->b_bcount) {
3463			/*
3464			 * We are growing the buffer, possibly in a
3465			 * byte-granular fashion.
3466			 */
3467			vm_object_t obj;
3468			vm_offset_t toff;
3469			vm_offset_t tinc;
3470
3471			/*
3472			 * Step 1, bring in the VM pages from the object,
3473			 * allocating them if necessary.  We must clear
3474			 * B_CACHE if these pages are not valid for the
3475			 * range covered by the buffer.
3476			 */
3477
3478			obj = bp->b_bufobj->bo_object;
3479
3480			VM_OBJECT_WLOCK(obj);
3481			while (bp->b_npages < desiredpages) {
3482				vm_page_t m;
3483
3484				/*
3485				 * We must allocate system pages since blocking
3486				 * here could interfere with paging I/O, no
3487				 * matter which process we are.
3488				 *
3489				 * Only exclusive busy can be tested here.
3490				 * Blocking on shared busy might lead to
3491				 * deadlocks once allocbuf() is called after
3492				 * pages are vfs_busy_pages().
3493				 */
3494				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
3495				    bp->b_npages, VM_ALLOC_NOBUSY |
3496				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3497				    VM_ALLOC_IGN_SBUSY |
3498				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
3499				if (m->valid == 0)
3500					bp->b_flags &= ~B_CACHE;
3501				bp->b_pages[bp->b_npages] = m;
3502				++bp->b_npages;
3503			}
3504
3505			/*
3506			 * Step 2.  We've loaded the pages into the buffer,
3507			 * we have to figure out if we can still have B_CACHE
3508			 * set.  Note that B_CACHE is set according to the
3509			 * byte-granular range ( bcount and size ), not the
3510			 * aligned range ( newbsize ).
3511			 *
3512			 * The VM test is against m->valid, which is DEV_BSIZE
3513			 * aligned.  Needless to say, the validity of the data
3514			 * needs to also be DEV_BSIZE aligned.  Note that this
3515			 * fails with NFS if the server or some other client
3516			 * extends the file's EOF.  If our buffer is resized,
3517			 * B_CACHE may remain set! XXX
3518			 */
3519
3520			toff = bp->b_bcount;
3521			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
3522
3523			while ((bp->b_flags & B_CACHE) && toff < size) {
3524				vm_pindex_t pi;
3525
3526				if (tinc > (size - toff))
3527					tinc = size - toff;
3528
3529				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
3530				    PAGE_SHIFT;
3531
3532				vfs_buf_test_cache(
3533				    bp,
3534				    bp->b_offset,
3535				    toff,
3536				    tinc,
3537				    bp->b_pages[pi]
3538				);
3539				toff += tinc;
3540				tinc = PAGE_SIZE;
3541			}
3542			VM_OBJECT_WUNLOCK(obj);
3543
3544			/*
3545			 * Step 3, fixup the KVM pmap.
3546			 */
3547			if ((bp->b_flags & B_UNMAPPED) == 0)
3548				bpmap_qenter(bp);
3549			else
3550				BUF_CHECK_UNMAPPED(bp);
3551		}
3552	}
3553	if (newbsize < bp->b_bufsize)
3554		bufspacewakeup();
3555	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
3556	bp->b_bcount = size;		/* requested buffer size	*/
3557	return 1;
3558}
3559
3560extern int inflight_transient_maps;
3561
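/*
 * Complete a bio: tear down the transient KVA mapping if one was set up
 * for an unmapped consumer, then either call the caller-supplied bio_done
 * method or mark the bio BIO_DONE and wake up any thread sleeping in
 * biowait().
 */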
3562void
3563biodone(struct bio *bp)
3564{
3565	struct mtx *mtxp;
3566	void (*done)(struct bio *);
3567	vm_offset_t start, end;
3568
3569	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3570		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
3571		bp->bio_flags |= BIO_UNMAPPED;
3572		start = trunc_page((vm_offset_t)bp->bio_data);
3573		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3574		pmap_qremove(start, OFF_TO_IDX(end - start));
3575		vmem_free(transient_arena, start, end - start);
3576		atomic_add_int(&inflight_transient_maps, -1);
3577	}
3578	done = bp->bio_done;
3579	if (done == NULL) {
3580		mtxp = mtx_pool_find(mtxpool_sleep, bp);
3581		mtx_lock(mtxp);
3582		bp->bio_flags |= BIO_DONE;
3583		wakeup(bp);
3584		mtx_unlock(mtxp);
3585	} else {
3586		bp->bio_flags |= BIO_DONE;
3587		done(bp);
3588	}
3589}
3590
3591/*
3592 * Wait for a BIO to finish.
3593 */
3594int
3595biowait(struct bio *bp, const char *wchan)
3596{
3597	struct mtx *mtxp;
3598
3599	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3600	mtx_lock(mtxp);
3601	while ((bp->bio_flags & BIO_DONE) == 0)
3602		msleep(bp, mtxp, PRIBIO, wchan, 0);
3603	mtx_unlock(mtxp);
3604	if (bp->bio_error != 0)
3605		return (bp->bio_error);
3606	if (!(bp->bio_flags & BIO_ERROR))
3607		return (0);
3608	return (EIO);
3609}
3610
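/*
 * Convenience wrapper: record an error in the bio (if any), close out the
 * devstat transaction and complete the bio.
 */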
3611void
3612biofinish(struct bio *bp, struct devstat *stat, int error)
3613{
3614
3615	if (error) {
3616		bp->bio_error = error;
3617		bp->bio_flags |= BIO_ERROR;
3618	}
3619	if (stat != NULL)
3620		devstat_end_transaction_bio(stat, bp);
3621	biodone(bp);
3622}
3623
3624/*
3625 *	bufwait:
3626 *
3627 *	Wait for buffer I/O completion, returning error status.  The buffer
3628 *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3629 *	error and cleared.
3630 */
3631int
3632bufwait(struct buf *bp)
3633{
3634	if (bp->b_iocmd == BIO_READ)
3635		bwait(bp, PRIBIO, "biord");
3636	else
3637		bwait(bp, PRIBIO, "biowr");
3638	if (bp->b_flags & B_EINTR) {
3639		bp->b_flags &= ~B_EINTR;
3640		return (EINTR);
3641	}
3642	if (bp->b_ioflags & BIO_ERROR) {
3643		return (bp->b_error ? bp->b_error : EIO);
3644	} else {
3645		return (0);
3646	}
3647}
3648
3649 /*
3650  * Call back function from struct bio back up to struct buf.
3651  */
3652static void
3653bufdonebio(struct bio *bip)
3654{
3655	struct buf *bp;
3656
3657	bp = bip->bio_caller2;
3658	bp->b_resid = bp->b_bcount - bip->bio_completed;
3659	bp->b_resid = bip->bio_resid;	/* XXX: remove */
3660	bp->b_ioflags = bip->bio_flags;
3661	bp->b_error = bip->bio_error;
3662	if (bp->b_error)
3663		bp->b_ioflags |= BIO_ERROR;
3664	bufdone(bp);
3665	g_destroy_bio(bip);
3666}
3667
3668void
3669dev_strategy(struct cdev *dev, struct buf *bp)
3670{
3671	struct cdevsw *csw;
3672	int ref;
3673
3674	KASSERT(dev->si_refcount > 0,
3675	    ("dev_strategy on un-referenced struct cdev *(%s) %p",
3676	    devtoname(dev), dev));
3677
3678	csw = dev_refthread(dev, &ref);
3679	dev_strategy_csw(dev, csw, bp);
3680	dev_relthread(dev, ref);
3681}
3682
3683void
3684dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
3685{
3686	struct bio *bip;
3687
3688	KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
3689	    ("b_iocmd botch"));
3690	KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
3691	    dev->si_threadcount > 0,
3692	    ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
3693	    dev));
3694	if (csw == NULL) {
3695		bp->b_error = ENXIO;
3696		bp->b_ioflags = BIO_ERROR;
3697		bufdone(bp);
3698		return;
3699	}
3700	for (;;) {
3701		bip = g_new_bio();
3702		if (bip != NULL)
3703			break;
3704		/* Try again later */
3705		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
3706	}
3707	bip->bio_cmd = bp->b_iocmd;
3708	bip->bio_offset = bp->b_iooffset;
3709	bip->bio_length = bp->b_bcount;
3710	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
3711	bdata2bio(bp, bip);
3712	bip->bio_done = bufdonebio;
3713	bip->bio_caller2 = bp;
3714	bip->bio_dev = dev;
3715	(*csw->d_strategy)(bip);
3716}
3717
3718/*
3719 *	bufdone:
3720 *
3721 *	Finish I/O on a buffer, optionally calling a completion function.
3722 *	This is usually called from an interrupt so process blocking is
3723 *	not allowed.
3724 *
3725 *	bufdone is also responsible for setting B_CACHE in a B_VMIO bp.
3726 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
3727 *	assuming B_INVAL is clear.
3728 *
3729 *	For the VMIO case, we set B_CACHE if the op was a read and no
3730 *	read error occurred, or if the op was a write.  B_CACHE is never
3731 *	set if the buffer is invalid or otherwise uncacheable.
3732 *
3733 *	bufdone does not mess with B_INVAL, allowing the I/O routine or the
3734 *	initiator to leave B_INVAL set to brelse the buffer out of existence
3735 *	in the bufdone routine.
3736 */
3737void
3738bufdone(struct buf *bp)
3739{
3740	struct bufobj *dropobj;
3741	void    (*biodone)(struct buf *);
3742
3743	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3744	dropobj = NULL;
3745
3746	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3747	BUF_ASSERT_HELD(bp);
3748
3749	runningbufwakeup(bp);
3750	if (bp->b_iocmd == BIO_WRITE)
3751		dropobj = bp->b_bufobj;
3752	/* call optional completion function if requested */
3753	if (bp->b_iodone != NULL) {
3754		biodone = bp->b_iodone;
3755		bp->b_iodone = NULL;
3756		(*biodone) (bp);
3757		if (dropobj)
3758			bufobj_wdrop(dropobj);
3759		return;
3760	}
3761
3762	bufdone_finish(bp);
3763
3764	if (dropobj)
3765		bufobj_wdrop(dropobj);
3766}
3767
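/*
 * Illustrative sketch, not compiled here: a caller that wants a private
 * completion hook sets b_iodone before issuing the I/O.  bufdone()
 * clears the hook and calls it instead of bufdone_finish(), so the hook
 * normally re-enters bufdone() once its own bookkeeping is complete.
 * The callback name is a placeholder.  Note that for BIO_WRITE both
 * bufdone() calls drop a bufobj write count, so a write hook must take
 * an extra reference with bufobj_wrefl(); compare
 * ffs_backgroundwritedone().
 *
 *	static void
 *	example_iodone(struct buf *bp)
 *	{
 *
 *		(private bookkeeping goes here)
 *		bufdone(bp);
 *	}
 *
 *	bp->b_iocmd = BIO_READ;
 *	bp->b_iodone = example_iodone;
 *	vfs_busy_pages(bp, 0);
 *	bp->b_iooffset = dbtob(bp->b_blkno);
 *	bstrategy(bp);
 */
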
3768void
3769bufdone_finish(struct buf *bp)
3770{
3771	BUF_ASSERT_HELD(bp);
3772
3773	if (!LIST_EMPTY(&bp->b_dep))
3774		buf_complete(bp);
3775
3776	if (bp->b_flags & B_VMIO) {
3777		vm_ooffset_t foff;
3778		vm_page_t m;
3779		vm_object_t obj;
3780		struct vnode *vp;
3781		int bogus, i, iosize;
3782
3783		obj = bp->b_bufobj->bo_object;
3784		KASSERT(obj->paging_in_progress >= bp->b_npages,
3785		    ("biodone_finish: paging in progress(%d) < b_npages(%d)",
3786		    obj->paging_in_progress, bp->b_npages));
3787
3788		vp = bp->b_vp;
3789		KASSERT(vp->v_holdcnt > 0,
3790		    ("biodone_finish: vnode %p has zero hold count", vp));
3791		KASSERT(vp->v_object != NULL,
3792		    ("biodone_finish: vnode %p has no vm_object", vp));
3793
3794		foff = bp->b_offset;
3795		KASSERT(bp->b_offset != NOOFFSET,
3796		    ("biodone_finish: bp %p has no buffer offset", bp));
3797
3798		/*
3799		 * Set B_CACHE if the op was a normal read and no error
3800		 * occurred.  B_CACHE is set for writes in the b*write()
3801		 * routines.
3802		 */
3803		iosize = bp->b_bcount - bp->b_resid;
3804		if (bp->b_iocmd == BIO_READ &&
3805		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
3806		    !(bp->b_ioflags & BIO_ERROR)) {
3807			bp->b_flags |= B_CACHE;
3808		}
3809		bogus = 0;
3810		VM_OBJECT_WLOCK(obj);
3811		for (i = 0; i < bp->b_npages; i++) {
3812			int bogusflag = 0;
3813			int resid;
3814
3815			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3816			if (resid > iosize)
3817				resid = iosize;
3818
3819			/*
3820			 * cleanup bogus pages, restoring the originals
3821			 */
3822			m = bp->b_pages[i];
3823			if (m == bogus_page) {
3824				bogus = bogusflag = 1;
3825				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3826				if (m == NULL)
3827					panic("biodone: page disappeared!");
3828				bp->b_pages[i] = m;
3829			}
3830			KASSERT(OFF_TO_IDX(foff) == m->pindex,
3831			    ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
3832			    (intmax_t)foff, (uintmax_t)m->pindex));
3833
3834			/*
3835			 * In the write case, the valid and clean bits are
3836			 * already changed correctly ( see bdwrite() ), so we
3837			 * only need to do this here in the read case.
3838			 */
3839			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
3840				KASSERT((m->dirty & vm_page_bits(foff &
3841				    PAGE_MASK, resid)) == 0, ("bufdone_finish:"
3842				    " page %p has unexpected dirty bits", m));
3843				vfs_page_set_valid(bp, foff, m);
3844			}
3845
3846			vm_page_sunbusy(m);
3847			vm_object_pip_subtract(obj, 1);
3848			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3849			iosize -= resid;
3850		}
3851		vm_object_pip_wakeupn(obj, 0);
3852		VM_OBJECT_WUNLOCK(obj);
3853		if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
3854			BUF_CHECK_MAPPED(bp);
3855			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3856			    bp->b_pages, bp->b_npages);
3857		}
3858	}
3859
3860	/*
3861	 * For asynchronous completions, release the buffer now. The brelse
3862	 * will do a wakeup there if necessary - so no need to do a wakeup
3863	 * here in the async case. The sync case always needs to do a wakeup.
3864	 */
3865
3866	if (bp->b_flags & B_ASYNC) {
3867		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
3868			brelse(bp);
3869		else
3870			bqrelse(bp);
3871	} else
3872		bdone(bp);
3873}
3874
3875/*
3876 * This routine is called in lieu of iodone in the case of
3877 * incomplete I/O.  This keeps the busy status for pages
3878 * consistent.
3879 */
3880void
3881vfs_unbusy_pages(struct buf *bp)
3882{
3883	int i;
3884	vm_object_t obj;
3885	vm_page_t m;
3886
3887	runningbufwakeup(bp);
3888	if (!(bp->b_flags & B_VMIO))
3889		return;
3890
3891	obj = bp->b_bufobj->bo_object;
3892	VM_OBJECT_WLOCK(obj);
3893	for (i = 0; i < bp->b_npages; i++) {
3894		m = bp->b_pages[i];
3895		if (m == bogus_page) {
3896			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3897			if (!m)
3898				panic("vfs_unbusy_pages: page missing\n");
3899			bp->b_pages[i] = m;
3900			if ((bp->b_flags & B_UNMAPPED) == 0) {
3901				BUF_CHECK_MAPPED(bp);
3902				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3903				    bp->b_pages, bp->b_npages);
3904			} else
3905				BUF_CHECK_UNMAPPED(bp);
3906		}
3907		vm_object_pip_subtract(obj, 1);
3908		vm_page_sunbusy(m);
3909	}
3910	vm_object_pip_wakeupn(obj, 0);
3911	VM_OBJECT_WUNLOCK(obj);
3912}
3913
3914/*
3915 * vfs_page_set_valid:
3916 *
3917 *	Set the valid bits in a page based on the supplied offset.   The
3918 *	range is restricted to the buffer's size.
3919 *
3920 *	This routine is typically called after a read completes.
3921 */
3922static void
3923vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3924{
3925	vm_ooffset_t eoff;
3926
3927	/*
3928	 * Compute the end offset, eoff, such that [off, eoff) does not span a
3929	 * page boundary and eoff is not greater than the end of the buffer.
3930	 * The end of the buffer, in this case, is our file EOF, not the
3931	 * allocation size of the buffer.
3932	 */
3933	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
3934	if (eoff > bp->b_offset + bp->b_bcount)
3935		eoff = bp->b_offset + bp->b_bcount;
3936
3937	/*
3938	 * Set valid range.  This is typically the entire buffer and thus the
3939	 * entire page.
3940	 */
3941	if (eoff > off)
3942		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
3943}
3944
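/*
 * Worked example for vfs_page_set_valid(), assuming PAGE_SIZE is 4096:
 * with bp->b_offset = 12800 and bp->b_bcount = 2048 the buffer covers
 * [12800, 14848).  For the first page, off = 12800, so eoff rounds up
 * to 16384 and is then clamped back to 14848.  The resulting call is
 * vm_page_set_valid_range(m, 512, 2048), i.e. the 2048 bytes starting
 * at offset 512 within the page.
 */
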
3945/*
3946 * vfs_page_set_validclean:
3947 *
3948 *	Set the valid bits and clear the dirty bits in a page based on the
3949 *	supplied offset.   The range is restricted to the buffer's size.
3950 */
3951static void
3952vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3953{
3954	vm_ooffset_t soff, eoff;
3955
3956	/*
3957	 * Start and end offsets in buffer.  eoff - soff may not cross a
3958	 * page boundary or cross the end of the buffer.  The end of the
3959	 * buffer, in this case, is our file EOF, not the allocation size
3960	 * of the buffer.
3961	 */
3962	soff = off;
3963	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3964	if (eoff > bp->b_offset + bp->b_bcount)
3965		eoff = bp->b_offset + bp->b_bcount;
3966
3967	/*
3968	 * Set valid range.  This is typically the entire buffer and thus the
3969	 * entire page.
3970	 */
3971	if (eoff > soff) {
3972		vm_page_set_validclean(
3973		    m,
3974		   (vm_offset_t) (soff & PAGE_MASK),
3975		   (vm_offset_t) (eoff - soff)
3976		);
3977	}
3978}
3979
3980/*
3981 * Ensure that all buffer pages are not exclusive busied.  If any page is
3982 * exclusive busy, drain it.
3983 */
3984void
3985vfs_drain_busy_pages(struct buf *bp)
3986{
3987	vm_page_t m;
3988	int i, last_busied;
3989
3990	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
3991	last_busied = 0;
3992	for (i = 0; i < bp->b_npages; i++) {
3993		m = bp->b_pages[i];
3994		if (vm_page_xbusied(m)) {
3995			for (; last_busied < i; last_busied++)
3996				vm_page_sbusy(bp->b_pages[last_busied]);
3997			while (vm_page_xbusied(m)) {
3998				vm_page_lock(m);
3999				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4000				vm_page_busy_sleep(m, "vbpage");
4001				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4002			}
4003		}
4004	}
4005	for (i = 0; i < last_busied; i++)
4006		vm_page_sunbusy(bp->b_pages[i]);
4007}
4008
4009/*
4010 * This routine is called before a device strategy routine.
4011 * It is used to tell the VM system that paging I/O is in
4012 * progress, and treat the pages associated with the buffer
4013 * almost as being exclusive busy.  Also the object paging_in_progress
4014 * flag is handled to make sure that the object doesn't become
4015 * inconsistent.
4016 *
4017 * Since I/O has not been initiated yet, certain buffer flags
4018 * such as BIO_ERROR or B_INVAL may be in an inconsistent state
4019 * and should be ignored.
4020 */
4021void
4022vfs_busy_pages(struct buf *bp, int clear_modify)
4023{
4024	int i, bogus;
4025	vm_object_t obj;
4026	vm_ooffset_t foff;
4027	vm_page_t m;
4028
4029	if (!(bp->b_flags & B_VMIO))
4030		return;
4031
4032	obj = bp->b_bufobj->bo_object;
4033	foff = bp->b_offset;
4034	KASSERT(bp->b_offset != NOOFFSET,
4035	    ("vfs_busy_pages: no buffer offset"));
4036	VM_OBJECT_WLOCK(obj);
4037	vfs_drain_busy_pages(bp);
4038	if (bp->b_bufsize != 0)
4039		vfs_setdirty_locked_object(bp);
4040	bogus = 0;
4041	for (i = 0; i < bp->b_npages; i++) {
4042		m = bp->b_pages[i];
4043
4044		if ((bp->b_flags & B_CLUSTER) == 0) {
4045			vm_object_pip_add(obj, 1);
4046			vm_page_sbusy(m);
4047		}
4048		/*
4049		 * When readying a buffer for a read ( i.e.
4050		 * clear_modify == 0 ), it is important to do
4051		 * bogus_page replacement for valid pages in
4052		 * partially instantiated buffers.  Partially
4053		 * instantiated buffers can, in turn, occur when
4054		 * reconstituting a buffer from its VM backing store
4055		 * base.  We only have to do this if B_CACHE is
4056		 * clear ( which causes the I/O to occur in the
4057		 * first place ).  The replacement prevents the read
4058		 * I/O from overwriting potentially dirty VM-backed
4059		 * pages.  XXX bogus page replacement is, uh, bogus.
4060		 * It may not work properly with small-block devices.
4061		 * We need to find a better way.
4062		 */
4063		if (clear_modify) {
4064			pmap_remove_write(m);
4065			vfs_page_set_validclean(bp, foff, m);
4066		} else if (m->valid == VM_PAGE_BITS_ALL &&
4067		    (bp->b_flags & B_CACHE) == 0) {
4068			bp->b_pages[i] = bogus_page;
4069			bogus++;
4070		}
4071		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4072	}
4073	VM_OBJECT_WUNLOCK(obj);
4074	if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
4075		BUF_CHECK_MAPPED(bp);
4076		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4077		    bp->b_pages, bp->b_npages);
4078	}
4079}
4080
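/*
 * Illustrative sketch, not compiled here: vfs_busy_pages() is called
 * immediately before handing a VMIO buffer to the strategy routine.  A
 * read passes clear_modify == 0, so valid pages may be swapped for
 * bogus_page; a write passes clear_modify == 1, so the pages are
 * write-protected and marked valid/clean (compare the bread() path and
 * bufwrite()):
 *
 *	bp->b_iocmd = BIO_READ;
 *	vfs_busy_pages(bp, 0);
 *	bp->b_iooffset = dbtob(bp->b_blkno);
 *	bstrategy(bp);
 */
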
4081/*
4082 *	vfs_bio_set_valid:
4083 *
4084 *	Set the range within the buffer to valid.  The range is
4085 *	relative to the beginning of the buffer, b_offset.  Note that
4086 *	b_offset itself may be offset from the beginning of the first
4087 *	page.
4088 */
4089void
4090vfs_bio_set_valid(struct buf *bp, int base, int size)
4091{
4092	int i, n;
4093	vm_page_t m;
4094
4095	if (!(bp->b_flags & B_VMIO))
4096		return;
4097
4098	/*
4099	 * Fixup base to be relative to beginning of first page.
4100	 * Set initial n to be the maximum number of bytes in the
4101	 * first page that can be validated.
4102	 */
4103	base += (bp->b_offset & PAGE_MASK);
4104	n = PAGE_SIZE - (base & PAGE_MASK);
4105
4106	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4107	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4108		m = bp->b_pages[i];
4109		if (n > size)
4110			n = size;
4111		vm_page_set_valid_range(m, base & PAGE_MASK, n);
4112		base += n;
4113		size -= n;
4114		n = PAGE_SIZE;
4115	}
4116	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4117}
4118
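/*
 * Worked example for vfs_bio_set_valid(), assuming PAGE_SIZE is 4096:
 * with bp->b_offset = 0x1200, base = 0x700 and size = 0x1000, base is
 * first adjusted to 0x900 (relative to the start of the first page).
 * The loop then validates bytes [0x900, 0x1000) of page 0 and bytes
 * [0, 0x900) of page 1, 0x1000 bytes in total.
 */
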
4119/*
4120 *	vfs_bio_clrbuf:
4121 *
4122 *	If the specified buffer is a non-VMIO buffer, clear the entire
4123 *	buffer.  If the specified buffer is a VMIO buffer, clear and
4124 *	validate only the previously invalid portions of the buffer.
4125 *	This routine essentially fakes an I/O, so we need to clear
4126 *	BIO_ERROR and B_INVAL.
4127 *
4128 *	Note that while we only theoretically need to clear through b_bcount,
4129 *	we go ahead and clear through b_bufsize.
4130 */
4131void
4132vfs_bio_clrbuf(struct buf *bp)
4133{
4134	int i, j, mask, sa, ea, slide;
4135
4136	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4137		clrbuf(bp);
4138		return;
4139	}
4140	bp->b_flags &= ~B_INVAL;
4141	bp->b_ioflags &= ~BIO_ERROR;
4142	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4143	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4144	    (bp->b_offset & PAGE_MASK) == 0) {
4145		if (bp->b_pages[0] == bogus_page)
4146			goto unlock;
4147		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4148		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4149		if ((bp->b_pages[0]->valid & mask) == mask)
4150			goto unlock;
4151		if ((bp->b_pages[0]->valid & mask) == 0) {
4152			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4153			bp->b_pages[0]->valid |= mask;
4154			goto unlock;
4155		}
4156	}
4157	sa = bp->b_offset & PAGE_MASK;
4158	slide = 0;
4159	for (i = 0; i < bp->b_npages; i++, sa = 0) {
4160		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4161		ea = slide & PAGE_MASK;
4162		if (ea == 0)
4163			ea = PAGE_SIZE;
4164		if (bp->b_pages[i] == bogus_page)
4165			continue;
4166		j = sa / DEV_BSIZE;
4167		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4168		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4169		if ((bp->b_pages[i]->valid & mask) == mask)
4170			continue;
4171		if ((bp->b_pages[i]->valid & mask) == 0)
4172			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4173		else {
4174			for (; sa < ea; sa += DEV_BSIZE, j++) {
4175				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4176					pmap_zero_page_area(bp->b_pages[i],
4177					    sa, DEV_BSIZE);
4178				}
4179			}
4180		}
4181		bp->b_pages[i]->valid |= mask;
4182	}
4183unlock:
4184	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4185	bp->b_resid = 0;
4186}
4187
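/*
 * Illustrative sketch, not compiled here: the typical vfs_bio_clrbuf()
 * caller is a block allocator that has just attached a newly allocated
 * block to a buffer and wants it zeroed and validated without doing
 * real I/O, e.g. the UFS balloc code.  "vp", "lbn", "size", "newblk",
 * "gbflags" and the fs pointer are placeholders:
 *
 *	bp = getblk(vp, lbn, size, 0, 0, gbflags);
 *	bp->b_blkno = fsbtodb(fs, newblk);
 *	vfs_bio_clrbuf(bp);
 */
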
4188void
4189vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4190{
4191	vm_page_t m;
4192	int i, n;
4193
4194	if ((bp->b_flags & B_UNMAPPED) == 0) {
4195		BUF_CHECK_MAPPED(bp);
4196		bzero(bp->b_data + base, size);
4197	} else {
4198		BUF_CHECK_UNMAPPED(bp);
4199		n = PAGE_SIZE - (base & PAGE_MASK);
4200		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4201			m = bp->b_pages[i];
4202			if (n > size)
4203				n = size;
4204			pmap_zero_page_area(m, base & PAGE_MASK, n);
4205			base += n;
4206			size -= n;
4207			n = PAGE_SIZE;
4208		}
4209	}
4210}
4211
4212/*
4213 * vm_hold_load_pages and vm_hold_free_pages get pages into and
4214 * out of a buffer's address space.  The pages are anonymous and
4215 * are not associated with a file object.
4216 */
4217static void
4218vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4219{
4220	vm_offset_t pg;
4221	vm_page_t p;
4222	int index;
4223
4224	BUF_CHECK_MAPPED(bp);
4225
4226	to = round_page(to);
4227	from = round_page(from);
4228	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4229
4230	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4231tryagain:
4232		/*
4233		 * note: must allocate system pages since blocking here
4234		 * could interfere with paging I/O, no matter which
4235		 * process we are running in.
4236		 */
4237		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4238		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4239		if (p == NULL) {
4240			VM_WAIT;
4241			goto tryagain;
4242		}
4243		pmap_qenter(pg, &p, 1);
4244		bp->b_pages[index] = p;
4245	}
4246	bp->b_npages = index;
4247}
4248
4249/* Return pages associated with this buf to the vm system */
4250static void
4251vm_hold_free_pages(struct buf *bp, int newbsize)
4252{
4253	vm_offset_t from;
4254	vm_page_t p;
4255	int index, newnpages;
4256
4257	BUF_CHECK_MAPPED(bp);
4258
4259	from = round_page((vm_offset_t)bp->b_data + newbsize);
4260	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4261	if (bp->b_npages > newnpages)
4262		pmap_qremove(from, bp->b_npages - newnpages);
4263	for (index = newnpages; index < bp->b_npages; index++) {
4264		p = bp->b_pages[index];
4265		bp->b_pages[index] = NULL;
4266		if (vm_page_sbusied(p))
4267			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4268			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4269		p->wire_count--;
4270		vm_page_free(p);
4271		atomic_subtract_int(&cnt.v_wire_count, 1);
4272	}
4273	bp->b_npages = newnpages;
4274}
4275
4276/*
4277 * Map an IO request into kernel virtual address space.
4278 *
4279 * All requests are (re)mapped into kernel VA space.
4280 * Notice that we use b_bufsize for the size of the buffer
4281 * to be mapped.  b_bcount might be modified by the driver.
4282 *
4283 * Note that even if the caller determines that the address space should
4284 * be valid, a race or a smaller file mapped into a larger space may
4285 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4286 * check the return value.
4287 */
4288int
4289vmapbuf(struct buf *bp, int mapbuf)
4290{
4291	caddr_t kva;
4292	vm_prot_t prot;
4293	int pidx;
4294
4295	if (bp->b_bufsize < 0)
4296		return (-1);
4297	prot = VM_PROT_READ;
4298	if (bp->b_iocmd == BIO_READ)
4299		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
4300	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4301	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4302	    btoc(MAXPHYS))) < 0)
4303		return (-1);
4304	bp->b_npages = pidx;
4305	if (mapbuf || !unmapped_buf_allowed) {
4306		pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
4307		kva = bp->b_saveaddr;
4308		bp->b_saveaddr = bp->b_data;
4309		bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
4310		bp->b_flags &= ~B_UNMAPPED;
4311	} else {
4312		bp->b_flags |= B_UNMAPPED;
4313		bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4314		bp->b_saveaddr = bp->b_data;
4315		bp->b_data = unmapped_buf;
4316	}
4317	return (0);
4318}
4319
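/*
 * Illustrative sketch, not compiled here: the physio()-style use of
 * vmapbuf()/vunmapbuf() around a raw device transfer.  "uaddr", "len",
 * "offset", "dev" and the error handling are placeholders; the return
 * value of vmapbuf() MUST be checked:
 *
 *	bp->b_iocmd = BIO_READ;
 *	bp->b_data = uaddr;
 *	bp->b_bcount = len;
 *	bp->b_bufsize = len;
 *	bp->b_iooffset = offset;
 *	if (vmapbuf(bp, 1) < 0)
 *		return (EFAULT);
 *	dev_strategy(dev, bp);
 *	error = bufwait(bp);
 *	vunmapbuf(bp);
 */
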
4320/*
4321 * Free the io map PTEs associated with this IO operation.
4322 * We also invalidate the TLB entries and restore the original b_addr.
4323 */
4324void
4325vunmapbuf(struct buf *bp)
4326{
4327	int npages;
4328
4329	npages = bp->b_npages;
4330	if (bp->b_flags & B_UNMAPPED)
4331		bp->b_flags &= ~B_UNMAPPED;
4332	else
4333		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4334	vm_page_unhold_pages(bp->b_pages, npages);
4335
4336	bp->b_data = bp->b_saveaddr;
4337}
4338
4339void
4340bdone(struct buf *bp)
4341{
4342	struct mtx *mtxp;
4343
4344	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4345	mtx_lock(mtxp);
4346	bp->b_flags |= B_DONE;
4347	wakeup(bp);
4348	mtx_unlock(mtxp);
4349}
4350
4351void
4352bwait(struct buf *bp, u_char pri, const char *wchan)
4353{
4354	struct mtx *mtxp;
4355
4356	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4357	mtx_lock(mtxp);
4358	while ((bp->b_flags & B_DONE) == 0)
4359		msleep(bp, mtxp, pri, wchan, 0);
4360	mtx_unlock(mtxp);
4361}
4362
4363int
4364bufsync(struct bufobj *bo, int waitfor)
4365{
4366
4367	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4368}
4369
4370void
4371bufstrategy(struct bufobj *bo, struct buf *bp)
4372{
4373	int i = 0;
4374	struct vnode *vp;
4375
4376	vp = bp->b_vp;
4377	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4378	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4379	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4380	i = VOP_STRATEGY(vp, bp);
4381	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4382}
4383
4384void
4385bufobj_wrefl(struct bufobj *bo)
4386{
4387
4388	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4389	ASSERT_BO_WLOCKED(bo);
4390	bo->bo_numoutput++;
4391}
4392
4393void
4394bufobj_wref(struct bufobj *bo)
4395{
4396
4397	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4398	BO_LOCK(bo);
4399	bo->bo_numoutput++;
4400	BO_UNLOCK(bo);
4401}
4402
4403void
4404bufobj_wdrop(struct bufobj *bo)
4405{
4406
4407	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4408	BO_LOCK(bo);
4409	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4410	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4411		bo->bo_flag &= ~BO_WWAIT;
4412		wakeup(&bo->bo_numoutput);
4413	}
4414	BO_UNLOCK(bo);
4415}
4416
4417int
4418bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4419{
4420	int error;
4421
4422	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4423	ASSERT_BO_WLOCKED(bo);
4424	error = 0;
4425	while (bo->bo_numoutput) {
4426		bo->bo_flag |= BO_WWAIT;
4427		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4428		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4429		if (error)
4430			break;
4431	}
4432	return (error);
4433}
4434
4435void
4436bpin(struct buf *bp)
4437{
4438	struct mtx *mtxp;
4439
4440	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4441	mtx_lock(mtxp);
4442	bp->b_pin_count++;
4443	mtx_unlock(mtxp);
4444}
4445
4446void
4447bunpin(struct buf *bp)
4448{
4449	struct mtx *mtxp;
4450
4451	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4452	mtx_lock(mtxp);
4453	if (--bp->b_pin_count == 0)
4454		wakeup(bp);
4455	mtx_unlock(mtxp);
4456}
4457
4458void
4459bunpin_wait(struct buf *bp)
4460{
4461	struct mtx *mtxp;
4462
4463	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4464	mtx_lock(mtxp);
4465	while (bp->b_pin_count > 0)
4466		msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4467	mtx_unlock(mtxp);
4468}
4469
4470/*
4471 * Set bio_data or bio_ma for struct bio from the struct buf.
4472 */
4473void
4474bdata2bio(struct buf *bp, struct bio *bip)
4475{
4476
4477	if ((bp->b_flags & B_UNMAPPED) != 0) {
4478		KASSERT(unmapped_buf_allowed, ("unmapped"));
4479		bip->bio_ma = bp->b_pages;
4480		bip->bio_ma_n = bp->b_npages;
4481		bip->bio_data = unmapped_buf;
4482		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4483		bip->bio_flags |= BIO_UNMAPPED;
4484		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4485		    PAGE_SIZE == bp->b_npages,
4486		    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
4487		    (long long)bip->bio_length, bip->bio_ma_n));
4488	} else {
4489		bip->bio_data = bp->b_data;
4490		bip->bio_ma = NULL;
4491	}
4492}
4493
4494#include "opt_ddb.h"
4495#ifdef DDB
4496#include <ddb/ddb.h>
4497
4498/* DDB command to show buffer data */
4499DB_SHOW_COMMAND(buffer, db_show_buffer)
4500{
4501	/* get args */
4502	struct buf *bp = (struct buf *)addr;
4503
4504	if (!have_addr) {
4505		db_printf("usage: show buffer <addr>\n");
4506		return;
4507	}
4508
4509	db_printf("buf at %p\n", bp);
4510	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4511	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4512	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4513	db_printf(
4514	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4515	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4516	    "b_dep = %p\n",
4517	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4518	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4519	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4520	if (bp->b_npages) {
4521		int i;
4522		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4523		for (i = 0; i < bp->b_npages; i++) {
4524			vm_page_t m;
4525			m = bp->b_pages[i];
4526			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4527			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4528			if ((i + 1) < bp->b_npages)
4529				db_printf(",");
4530		}
4531		db_printf("\n");
4532	}
4533	db_printf(" ");
4534	BUF_LOCKPRINTINFO(bp);
4535}
4536
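/*
 * Example invocation from the ddb prompt (the address shown is a
 * placeholder; any struct buf pointer will do):
 *
 *	db> show buffer 0xfffff80012345678
 */
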
4537DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4538{
4539	struct buf *bp;
4540	int i;
4541
4542	for (i = 0; i < nbuf; i++) {
4543		bp = &buf[i];
4544		if (BUF_ISLOCKED(bp)) {
4545			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4546			db_printf("\n");
4547		}
4548	}
4549}
4550
4551DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4552{
4553	struct vnode *vp;
4554	struct buf *bp;
4555
4556	if (!have_addr) {
4557		db_printf("usage: show vnodebufs <addr>\n");
4558		return;
4559	}
4560	vp = (struct vnode *)addr;
4561	db_printf("Clean buffers:\n");
4562	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4563		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4564		db_printf("\n");
4565	}
4566	db_printf("Dirty buffers:\n");
4567	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4568		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4569		db_printf("\n");
4570	}
4571}
4572
4573DB_COMMAND(countfreebufs, db_countfreebufs)
4574{
4575	struct buf *bp;
4576	int i, used = 0, nfree = 0;
4577
4578	if (have_addr) {
4579		db_printf("usage: countfreebufs\n");
4580		return;
4581	}
4582
4583	for (i = 0; i < nbuf; i++) {
4584		bp = &buf[i];
4585		if ((bp->b_flags & B_INFREECNT) != 0)
4586			nfree++;
4587		else
4588			used++;
4589	}
4590
4591	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4592	    nfree + used);
4593	db_printf("numfreebuffers is %d\n", numfreebuffers);
4594}
4595#endif /* DDB */
4596