vfs_bio.c revision 292299
1/*-
2 * Copyright (c) 2004 Poul-Henning Kamp
3 * Copyright (c) 1994,1997 John S. Dyson
4 * Copyright (c) 2013 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Konstantin Belousov
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * this file contains a new buffer I/O scheme implementing a coherent
34 * VM object and buffer cache scheme.  Pains have been taken to make
35 * sure that the performance degradation associated with schemes such
36 * as this is not realized.
37 *
38 * Author:  John S. Dyson
39 * Significant help during the development and debugging phases
40 * had been provided by David Greenman, also of the FreeBSD core team.
41 *
42 * see man buf(9) for more info.
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 292299 2015-12-16 00:13:16Z adrian $");
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/conf.h>
52#include <sys/buf.h>
53#include <sys/devicestat.h>
54#include <sys/eventhandler.h>
55#include <sys/fail.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/proc.h>
64#include <sys/resourcevar.h>
65#include <sys/rwlock.h>
66#include <sys/smp.h>
67#include <sys/sysctl.h>
68#include <sys/sysproto.h>
69#include <sys/vmem.h>
70#include <sys/vmmeter.h>
71#include <sys/vnode.h>
72#include <sys/watchdog.h>
73#include <geom/geom.h>
74#include <vm/vm.h>
75#include <vm/vm_param.h>
76#include <vm/vm_kern.h>
77#include <vm/vm_pageout.h>
78#include <vm/vm_page.h>
79#include <vm/vm_object.h>
80#include <vm/vm_extern.h>
81#include <vm/vm_map.h>
82#include <vm/swap_pager.h>
83#include "opt_compat.h"
84#include "opt_swap.h"
85
86static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
87
88struct	bio_ops bioops;		/* I/O operation notification */
89
90struct	buf_ops buf_ops_bio = {
91	.bop_name	=	"buf_ops_bio",
92	.bop_write	=	bufwrite,
93	.bop_strategy	=	bufstrategy,
94	.bop_sync	=	bufsync,
95	.bop_bdflush	=	bufbdflush,
96};
97
98static struct buf *buf;		/* buffer header pool */
99extern struct buf *swbuf;	/* Swap buffer header pool. */
100caddr_t unmapped_buf;
101
102/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
103struct proc *bufdaemonproc;
104struct proc *bufspacedaemonproc;
105
106static int inmem(struct vnode *vp, daddr_t blkno);
107static void vm_hold_free_pages(struct buf *bp, int newbsize);
108static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
109		vm_offset_t to);
110static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
111static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
112		vm_page_t m);
113static void vfs_clean_pages_dirty_buf(struct buf *bp);
114static void vfs_setdirty_locked_object(struct buf *bp);
115static void vfs_vmio_invalidate(struct buf *bp);
116static void vfs_vmio_truncate(struct buf *bp, int npages);
117static void vfs_vmio_extend(struct buf *bp, int npages, int size);
118static int vfs_bio_clcheck(struct vnode *vp, int size,
119		daddr_t lblkno, daddr_t blkno);
120static int buf_flush(struct vnode *vp, int);
121static int buf_recycle(bool);
122static int buf_scan(bool);
123static int flushbufqueues(struct vnode *, int, int);
124static void buf_daemon(void);
125static void bremfreel(struct buf *bp);
126static __inline void bd_wakeup(void);
127static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
128static void bufkva_reclaim(vmem_t *, int);
129static void bufkva_free(struct buf *);
130static int buf_import(void *, void **, int, int);
131static void buf_release(void *, void **, int);
132
133#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
134    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
135static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
136#endif
137
138int vmiodirenable = TRUE;
139SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
140    "Use the VM system for directory writes");
141long runningbufspace;
142SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
143    "Amount of presently outstanding async buffer io");
144static long bufspace;
145#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
146    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
147SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
148    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
149#else
150SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
151    "Physical memory used for buffers");
152#endif
153static long bufkvaspace;
154SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
155    "Kernel virtual memory used for buffers");
156static long maxbufspace;
157SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
158    "Maximum allowed value of bufspace (including metadata)");
159static long bufmallocspace;
160SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
161    "Amount of malloced memory for buffers");
162static long maxbufmallocspace;
163SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
164    0, "Maximum amount of malloced memory for buffers");
165static long lobufspace;
166SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
167    "Minimum amount of buffers we want to have");
168long hibufspace;
169SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
170    "Maximum allowed value of bufspace (excluding metadata)");
171long bufspacethresh;
172SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
173    0, "Bufspace consumed before waking the daemon to free some");
174static int buffreekvacnt;
175SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
176    "Number of times we have freed the KVA space from some buffer");
177static int bufdefragcnt;
178SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
179    "Number of times we have had to repeat buffer allocation to defragment");
180static long lorunningspace;
181SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
182    CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
183    "Minimum preferred space used for in-progress I/O");
184static long hirunningspace;
185SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
186    CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
187    "Maximum amount of space to use for in-progress I/O");
188int dirtybufferflushes;
189SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
190    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
191int bdwriteskip;
192SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
193    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
194int altbufferflushes;
195SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
196    0, "Number of fsync flushes to limit dirty buffers");
197static int recursiveflushes;
198SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
199    0, "Number of flushes skipped due to being recursive");
200static int numdirtybuffers;
201SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
202    "Number of buffers that are dirty (has unwritten changes) at the moment");
203static int lodirtybuffers;
204SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
205    "How many buffers we want to have free before bufdaemon can sleep");
206static int hidirtybuffers;
207SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
208    "When the number of dirty buffers is considered severe");
209int dirtybufthresh;
210SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
211    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
212static int numfreebuffers;
213SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
214    "Number of free buffers");
215static int lofreebuffers;
216SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
217   "Target number of free buffers");
218static int hifreebuffers;
219SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
220   "Threshold for clean buffer recycling");
221static int getnewbufcalls;
222SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
223   "Number of calls to getnewbuf");
224static int getnewbufrestarts;
225SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
226    "Number of times getnewbuf has had to restart a buffer aquisition");
227static int mappingrestarts;
228SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
229    "Number of times getblk has had to restart a buffer mapping for "
230    "unmapped buffer");
231static int numbufallocfails;
232SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
233    "Number of times buffer allocations failed");
234static int flushbufqtarget = 100;
235SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
236    "Amount of work to do in flushbufqueues when helping bufdaemon");
237static long notbufdflushes;
238SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
239    "Number of dirty buffer flushes done by the bufdaemon helpers");
240static long barrierwrites;
241SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
242    "Number of barrier writes");
243SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
244    &unmapped_buf_allowed, 0,
245    "Permit the use of the unmapped i/o");
246
247/*
248 * This lock synchronizes access to bd_request.
249 */
250static struct mtx_padalign bdlock;
251
252/*
253 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
254 * waitrunningbufspace().
255 */
256static struct mtx_padalign rbreqlock;
257
258/*
259 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
260 */
261static struct rwlock_padalign nblock;
262
263/*
264 * Lock that protects bdirtywait.
265 */
266static struct mtx_padalign bdirtylock;
267
268/*
269 * Wakeup point for bufdaemon, as well as indicator of whether it is already
270 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
271 * is idling.
272 */
273static int bd_request;
274
275/*
276 * Request/wakeup point for the bufspace daemon.
277 */
278static int bufspace_request;
279
280/*
281 * Request for the buf daemon to write more buffers than is indicated by
282 * lodirtybuf.  This may be necessary to push out excess dependencies or
283 * defragment the address space where a simple count of the number of dirty
284 * buffers is insufficient to characterize the demand for flushing them.
285 */
286static int bd_speedupreq;
287
288/*
289 * bogus page -- for I/O to/from partially complete buffers
290 * this is a temporary solution to the problem, but it is not
291 * really that bad.  it would be better to split the buffer
292 * for input in the case of buffers partially already in memory,
293 * but the code is intricate enough already.
294 */
295vm_page_t bogus_page;
296
297/*
298 * Synchronization (sleep/wakeup) variable for active buffer space requests.
299 * Set when wait starts, cleared prior to wakeup().
300 * Used in runningbufwakeup() and waitrunningbufspace().
301 */
302static int runningbufreq;
303
304/*
305 * Synchronization (sleep/wakeup) variable for buffer requests.
306 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
307 * by and/or.
308 * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
309 * getnewbuf(), and getblk().
310 */
311static volatile int needsbuffer;
312
313/*
314 * Synchronization for bwillwrite() waiters.
315 */
316static int bdirtywait;
317
318/*
319 * Definitions for the buffer free lists.
320 */
321#define QUEUE_NONE	0	/* on no queue */
322#define QUEUE_EMPTY	1	/* empty buffer headers */
323#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
324#define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
325#define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
326
327/* Maximum number of clean buffer queues. */
328#define	CLEAN_QUEUES	16
329
330/* Configured number of clean queues. */
331static int clean_queues;
332
333/* Maximum number of buffer queues. */
334#define BUFFER_QUEUES	(QUEUE_CLEAN + CLEAN_QUEUES)
335
336/* Queues for free buffers with various properties */
337static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
338#ifdef INVARIANTS
339static int bq_len[BUFFER_QUEUES];
340#endif
341
342/*
343 * Lock for each bufqueue
344 */
345static struct mtx_padalign bqlocks[BUFFER_QUEUES];
346
347/*
348 * per-cpu empty buffer cache.
349 */
350uma_zone_t buf_zone;
351
352/*
353 * Single global constant for BUF_WMESG, to avoid getting multiple references.
354 * buf_wmesg is referred from macros.
355 */
356const char *buf_wmesg = BUF_WMESG;
357
358static int
359sysctl_runningspace(SYSCTL_HANDLER_ARGS)
360{
361	long value;
362	int error;
363
364	value = *(long *)arg1;
365	error = sysctl_handle_long(oidp, &value, 0, req);
366	if (error != 0 || req->newptr == NULL)
367		return (error);
368	mtx_lock(&rbreqlock);
369	if (arg1 == &hirunningspace) {
370		if (value < lorunningspace)
371			error = EINVAL;
372		else
373			hirunningspace = value;
374	} else {
375		KASSERT(arg1 == &lorunningspace,
376		    ("%s: unknown arg1", __func__));
377		if (value > hirunningspace)
378			error = EINVAL;
379		else
380			lorunningspace = value;
381	}
382	mtx_unlock(&rbreqlock);
383	return (error);
384}
385
386#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
387    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
388static int
389sysctl_bufspace(SYSCTL_HANDLER_ARGS)
390{
391	long lvalue;
392	int ivalue;
393
394	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
395		return (sysctl_handle_long(oidp, arg1, arg2, req));
396	lvalue = *(long *)arg1;
397	if (lvalue > INT_MAX)
398		/* On overflow, still write out a long to trigger ENOMEM. */
399		return (sysctl_handle_long(oidp, &lvalue, 0, req));
400	ivalue = lvalue;
401	return (sysctl_handle_int(oidp, &ivalue, 0, req));
402}
403#endif
404
405static int
406bqcleanq(void)
407{
408	static int nextq;
409
410	return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
411}
412
413static int
414bqisclean(int qindex)
415{
416
417	return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
418}
419
420/*
421 *	bqlock:
422 *
423 *	Return the appropriate queue lock based on the index.
424 */
425static inline struct mtx *
426bqlock(int qindex)
427{
428
429	return (struct mtx *)&bqlocks[qindex];
430}
431
432/*
433 *	bdirtywakeup:
434 *
435 *	Wakeup any bwillwrite() waiters.
436 */
437static void
438bdirtywakeup(void)
439{
440	mtx_lock(&bdirtylock);
441	if (bdirtywait) {
442		bdirtywait = 0;
443		wakeup(&bdirtywait);
444	}
445	mtx_unlock(&bdirtylock);
446}
447
448/*
449 *	bdirtysub:
450 *
451 *	Decrement the numdirtybuffers count by one and wakeup any
452 *	threads blocked in bwillwrite().
453 */
454static void
455bdirtysub(void)
456{
457
458	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
459	    (lodirtybuffers + hidirtybuffers) / 2)
460		bdirtywakeup();
461}
462
463/*
464 *	bdirtyadd:
465 *
466 *	Increment the numdirtybuffers count by one and wakeup the buf
467 *	daemon if needed.
468 */
469static void
470bdirtyadd(void)
471{
472
473	/*
474	 * Only do the wakeup once as we cross the boundary.  The
475	 * buf daemon will keep running until the condition clears.
476	 */
477	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
478	    (lodirtybuffers + hidirtybuffers) / 2)
479		bd_wakeup();
480}
481
482/*
483 *	bufspace_wakeup:
484 *
485 *	Called when buffer space is potentially available for recovery.
486 *	getnewbuf() will block on this flag when it is unable to free
487 *	sufficient buffer space.  Buffer space becomes recoverable when
488 *	bp's get placed back in the queues.
489 */
490static void
491bufspace_wakeup(void)
492{
493
494	/*
495	 * If someone is waiting for bufspace, wake them up.
496	 *
497	 * Since needsbuffer is set prior to doing an additional queue
498	 * scan it is safe to check for the flag prior to acquiring the
499	 * lock.  The thread that is preparing to scan again before
500	 * blocking would discover the buf we released.
501	 */
502	if (needsbuffer) {
503		rw_rlock(&nblock);
504		if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
505			wakeup(__DEVOLATILE(void *, &needsbuffer));
506		rw_runlock(&nblock);
507	}
508}
509
510/*
511 *	bufspace_daemonwakeup:
512 *
513 *	Wakeup the daemon responsible for freeing clean bufs.
514 */
515static void
516bufspace_daemonwakeup(void)
517{
518	rw_rlock(&nblock);
519	if (bufspace_request == 0) {
520		bufspace_request = 1;
521		wakeup(&bufspace_request);
522	}
523	rw_runlock(&nblock);
524}
525
526/*
527 *	bufspace_adjust:
528 *
529 *	Adjust the reported bufspace for a KVA managed buffer, possibly
530 * 	waking any waiters.
531 */
532static void
533bufspace_adjust(struct buf *bp, int bufsize)
534{
535	long space;
536	int diff;
537
538	KASSERT((bp->b_flags & B_MALLOC) == 0,
539	    ("bufspace_adjust: malloc buf %p", bp));
540	diff = bufsize - bp->b_bufsize;
541	if (diff < 0) {
542		atomic_subtract_long(&bufspace, -diff);
543		bufspace_wakeup();
544	} else {
545		space = atomic_fetchadd_long(&bufspace, diff);
546		/* Wake up the daemon on the transition. */
547		if (space < bufspacethresh && space + diff >= bufspacethresh)
548			bufspace_daemonwakeup();
549	}
550	bp->b_bufsize = bufsize;
551}
552
553/*
554 *	bufspace_reserve:
555 *
556 *	Reserve bufspace before calling allocbuf().  metadata has a
557 *	different space limit than data.
558 */
559static int
560bufspace_reserve(int size, bool metadata)
561{
562	long limit;
563	long space;
564
565	if (metadata)
566		limit = maxbufspace;
567	else
568		limit = hibufspace;
569	do {
570		space = bufspace;
571		if (space + size > limit)
572			return (ENOSPC);
573	} while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
574
575	/* Wake up the daemon on the transition. */
576	if (space < bufspacethresh && space + size >= bufspacethresh)
577		bufspace_daemonwakeup();
578
579	return (0);
580}
581
582/*
583 *	bufspace_release:
584 *
585 *	Release reserved bufspace after bufspace_adjust() has consumed it.
586 */
587static void
588bufspace_release(int size)
589{
590	atomic_subtract_long(&bufspace, size);
591	bufspace_wakeup();
592}
593
594/*
595 *	bufspace_wait:
596 *
597 *	Wait for bufspace, acting as the buf daemon if a locked vnode is
598 *	supplied.  needsbuffer must be set in a safe fashion prior to
599 *	polling for space.  The operation must be re-tried on return.
600 */
601static void
602bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
603{
604	struct thread *td;
605	int error, fl, norunbuf;
606
607	if ((gbflags & GB_NOWAIT_BD) != 0)
608		return;
609
610	td = curthread;
611	rw_wlock(&nblock);
612	while (needsbuffer != 0) {
613		if (vp != NULL && vp->v_type != VCHR &&
614		    (td->td_pflags & TDP_BUFNEED) == 0) {
615			rw_wunlock(&nblock);
616			/*
617			 * getblk() is called with a vnode locked, and
618			 * some majority of the dirty buffers may as
619			 * well belong to the vnode.  Flushing the
620			 * buffers there would make a progress that
621			 * cannot be achieved by the buf_daemon, that
622			 * cannot lock the vnode.
623			 */
624			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
625			    (td->td_pflags & TDP_NORUNNINGBUF);
626
627			/*
628			 * Play bufdaemon.  The getnewbuf() function
629			 * may be called while the thread owns lock
630			 * for another dirty buffer for the same
631			 * vnode, which makes it impossible to use
632			 * VOP_FSYNC() there, due to the buffer lock
633			 * recursion.
634			 */
635			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
636			fl = buf_flush(vp, flushbufqtarget);
637			td->td_pflags &= norunbuf;
638			rw_wlock(&nblock);
639			if (fl != 0)
640				continue;
641			if (needsbuffer == 0)
642				break;
643		}
644		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
645		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
646		if (error != 0)
647			break;
648	}
649	rw_wunlock(&nblock);
650}
651
652
653/*
654 *	bufspace_daemon:
655 *
656 *	buffer space management daemon.  Tries to maintain some marginal
657 *	amount of free buffer space so that requesting processes neither
658 *	block nor work to reclaim buffers.
659 */
660static void
661bufspace_daemon(void)
662{
663	for (;;) {
664		kproc_suspend_check(bufspacedaemonproc);
665
666		/*
667		 * Free buffers from the clean queue until we meet our
668		 * targets.
669		 *
670		 * Theory of operation:  The buffer cache is most efficient
671		 * when some free buffer headers and space are always
672		 * available to getnewbuf().  This daemon attempts to prevent
673		 * the excessive blocking and synchronization associated
674		 * with shortfall.  It goes through three phases according
675		 * demand:
676		 *
677		 * 1)	The daemon wakes up voluntarily once per-second
678		 *	during idle periods when the counters are below
679		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
680		 *
681		 * 2)	The daemon wakes up as we cross the thresholds
682		 *	ahead of any potential blocking.  This may bounce
683		 *	slightly according to the rate of consumption and
684		 *	release.
685		 *
686		 * 3)	The daemon and consumers are starved for working
687		 *	clean buffers.  This is the 'bufspace' sleep below
688		 *	which will inefficiently trade bufs with bqrelse
689		 *	until we return to condition 2.
690		 */
691		while (bufspace > lobufspace ||
692		    numfreebuffers < hifreebuffers) {
693			if (buf_recycle(false) != 0) {
694				atomic_set_int(&needsbuffer, 1);
695				if (buf_recycle(false) != 0) {
696					rw_wlock(&nblock);
697					if (needsbuffer)
698						rw_sleep(__DEVOLATILE(void *,
699						    &needsbuffer), &nblock,
700						    PRIBIO|PDROP, "bufspace",
701						    hz/10);
702					else
703						rw_wunlock(&nblock);
704				}
705			}
706			maybe_yield();
707		}
708
709		/*
710		 * Re-check our limits under the exclusive nblock.
711		 */
712		rw_wlock(&nblock);
713		if (bufspace < bufspacethresh &&
714		    numfreebuffers > lofreebuffers) {
715			bufspace_request = 0;
716			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
717			    "-", hz);
718		} else
719			rw_wunlock(&nblock);
720	}
721}
722
723static struct kproc_desc bufspace_kp = {
724	"bufspacedaemon",
725	bufspace_daemon,
726	&bufspacedaemonproc
727};
728SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
729    &bufspace_kp);
730
731/*
732 *	bufmallocadjust:
733 *
734 *	Adjust the reported bufspace for a malloc managed buffer, possibly
735 *	waking any waiters.
736 */
737static void
738bufmallocadjust(struct buf *bp, int bufsize)
739{
740	int diff;
741
742	KASSERT((bp->b_flags & B_MALLOC) != 0,
743	    ("bufmallocadjust: non-malloc buf %p", bp));
744	diff = bufsize - bp->b_bufsize;
745	if (diff < 0)
746		atomic_subtract_long(&bufmallocspace, -diff);
747	else
748		atomic_add_long(&bufmallocspace, diff);
749	bp->b_bufsize = bufsize;
750}
751
752/*
753 *	runningwakeup:
754 *
755 *	Wake up processes that are waiting on asynchronous writes to fall
756 *	below lorunningspace.
757 */
758static void
759runningwakeup(void)
760{
761
762	mtx_lock(&rbreqlock);
763	if (runningbufreq) {
764		runningbufreq = 0;
765		wakeup(&runningbufreq);
766	}
767	mtx_unlock(&rbreqlock);
768}
769
770/*
771 *	runningbufwakeup:
772 *
773 *	Decrement the outstanding write count according.
774 */
775void
776runningbufwakeup(struct buf *bp)
777{
778	long space, bspace;
779
780	bspace = bp->b_runningbufspace;
781	if (bspace == 0)
782		return;
783	space = atomic_fetchadd_long(&runningbufspace, -bspace);
784	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
785	    space, bspace));
786	bp->b_runningbufspace = 0;
787	/*
788	 * Only acquire the lock and wakeup on the transition from exceeding
789	 * the threshold to falling below it.
790	 */
791	if (space < lorunningspace)
792		return;
793	if (space - bspace > lorunningspace)
794		return;
795	runningwakeup();
796}
797
798/*
799 *	waitrunningbufspace()
800 *
801 *	runningbufspace is a measure of the amount of I/O currently
802 *	running.  This routine is used in async-write situations to
803 *	prevent creating huge backups of pending writes to a device.
804 *	Only asynchronous writes are governed by this function.
805 *
806 *	This does NOT turn an async write into a sync write.  It waits
807 *	for earlier writes to complete and generally returns before the
808 *	caller's write has reached the device.
809 */
810void
811waitrunningbufspace(void)
812{
813
814	mtx_lock(&rbreqlock);
815	while (runningbufspace > hirunningspace) {
816		runningbufreq = 1;
817		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
818	}
819	mtx_unlock(&rbreqlock);
820}
821
822
823/*
824 *	vfs_buf_test_cache:
825 *
826 *	Called when a buffer is extended.  This function clears the B_CACHE
827 *	bit if the newly extended portion of the buffer does not contain
828 *	valid data.
829 */
830static __inline void
831vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
832    vm_offset_t size, vm_page_t m)
833{
834
835	VM_OBJECT_ASSERT_LOCKED(m->object);
836	if (bp->b_flags & B_CACHE) {
837		int base = (foff + off) & PAGE_MASK;
838		if (vm_page_is_valid(m, base, size) == 0)
839			bp->b_flags &= ~B_CACHE;
840	}
841}
842
843/* Wake up the buffer daemon if necessary */
844static __inline void
845bd_wakeup(void)
846{
847
848	mtx_lock(&bdlock);
849	if (bd_request == 0) {
850		bd_request = 1;
851		wakeup(&bd_request);
852	}
853	mtx_unlock(&bdlock);
854}
855
856/*
857 * bd_speedup - speedup the buffer cache flushing code
858 */
859void
860bd_speedup(void)
861{
862	int needwake;
863
864	mtx_lock(&bdlock);
865	needwake = 0;
866	if (bd_speedupreq == 0 || bd_request == 0)
867		needwake = 1;
868	bd_speedupreq = 1;
869	bd_request = 1;
870	if (needwake)
871		wakeup(&bd_request);
872	mtx_unlock(&bdlock);
873}
874
875#ifndef NSWBUF_MIN
876#define	NSWBUF_MIN	16
877#endif
878
879#ifdef __i386__
880#define	TRANSIENT_DENOM	5
881#else
882#define	TRANSIENT_DENOM 10
883#endif
884
885/*
886 * Calculating buffer cache scaling values and reserve space for buffer
887 * headers.  This is called during low level kernel initialization and
888 * may be called more then once.  We CANNOT write to the memory area
889 * being reserved at this time.
890 */
891caddr_t
892kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
893{
894	int tuned_nbuf;
895	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
896
897	/*
898	 * physmem_est is in pages.  Convert it to kilobytes (assumes
899	 * PAGE_SIZE is >= 1K)
900	 */
901	physmem_est = physmem_est * (PAGE_SIZE / 1024);
902
903	/*
904	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
905	 * For the first 64MB of ram nominally allocate sufficient buffers to
906	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
907	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
908	 * the buffer cache we limit the eventual kva reservation to
909	 * maxbcache bytes.
910	 *
911	 * factor represents the 1/4 x ram conversion.
912	 */
913	if (nbuf == 0) {
914		int factor = 4 * BKVASIZE / 1024;
915
916		nbuf = 50;
917		if (physmem_est > 4096)
918			nbuf += min((physmem_est - 4096) / factor,
919			    65536 / factor);
920		if (physmem_est > 65536)
921			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
922			    32 * 1024 * 1024 / (factor * 5));
923
924		if (maxbcache && nbuf > maxbcache / BKVASIZE)
925			nbuf = maxbcache / BKVASIZE;
926		tuned_nbuf = 1;
927	} else
928		tuned_nbuf = 0;
929
930	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
931	maxbuf = (LONG_MAX / 3) / BKVASIZE;
932	if (nbuf > maxbuf) {
933		if (!tuned_nbuf)
934			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
935			    maxbuf);
936		nbuf = maxbuf;
937	}
938
939	/*
940	 * Ideal allocation size for the transient bio submap is 10%
941	 * of the maximal space buffer map.  This roughly corresponds
942	 * to the amount of the buffer mapped for typical UFS load.
943	 *
944	 * Clip the buffer map to reserve space for the transient
945	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
946	 * maximum buffer map extent on the platform.
947	 *
948	 * The fall-back to the maxbuf in case of maxbcache unset,
949	 * allows to not trim the buffer KVA for the architectures
950	 * with ample KVA space.
951	 */
952	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
953		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
954		buf_sz = (long)nbuf * BKVASIZE;
955		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
956		    (TRANSIENT_DENOM - 1)) {
957			/*
958			 * There is more KVA than memory.  Do not
959			 * adjust buffer map size, and assign the rest
960			 * of maxbuf to transient map.
961			 */
962			biotmap_sz = maxbuf_sz - buf_sz;
963		} else {
964			/*
965			 * Buffer map spans all KVA we could afford on
966			 * this platform.  Give 10% (20% on i386) of
967			 * the buffer map to the transient bio map.
968			 */
969			biotmap_sz = buf_sz / TRANSIENT_DENOM;
970			buf_sz -= biotmap_sz;
971		}
972		if (biotmap_sz / INT_MAX > MAXPHYS)
973			bio_transient_maxcnt = INT_MAX;
974		else
975			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
976		/*
977		 * Artifically limit to 1024 simultaneous in-flight I/Os
978		 * using the transient mapping.
979		 */
980		if (bio_transient_maxcnt > 1024)
981			bio_transient_maxcnt = 1024;
982		if (tuned_nbuf)
983			nbuf = buf_sz / BKVASIZE;
984	}
985
986	/*
987	 * swbufs are used as temporary holders for I/O, such as paging I/O.
988	 * We have no less then 16 and no more then 256.
989	 */
990	nswbuf = min(nbuf / 4, 256);
991	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
992	if (nswbuf < NSWBUF_MIN)
993		nswbuf = NSWBUF_MIN;
994
995	/*
996	 * Reserve space for the buffer cache buffers
997	 */
998	swbuf = (void *)v;
999	v = (caddr_t)(swbuf + nswbuf);
1000	buf = (void *)v;
1001	v = (caddr_t)(buf + nbuf);
1002
1003	return(v);
1004}
1005
1006/* Initialize the buffer subsystem.  Called before use of any buffers. */
1007void
1008bufinit(void)
1009{
1010	struct buf *bp;
1011	int i;
1012
1013	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
1014	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
1015	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
1016	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
1017		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
1018	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
1019	rw_init(&nblock, "needsbuffer lock");
1020	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
1021	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
1022
1023	/* next, make a null set of free lists */
1024	for (i = 0; i < BUFFER_QUEUES; i++)
1025		TAILQ_INIT(&bufqueues[i]);
1026
1027	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
1028
1029	/* finally, initialize each buffer header and stick on empty q */
1030	for (i = 0; i < nbuf; i++) {
1031		bp = &buf[i];
1032		bzero(bp, sizeof *bp);
1033		bp->b_flags = B_INVAL;
1034		bp->b_rcred = NOCRED;
1035		bp->b_wcred = NOCRED;
1036		bp->b_qindex = QUEUE_EMPTY;
1037		bp->b_xflags = 0;
1038		bp->b_data = bp->b_kvabase = unmapped_buf;
1039		LIST_INIT(&bp->b_dep);
1040		BUF_LOCKINIT(bp);
1041		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
1042#ifdef INVARIANTS
1043		bq_len[QUEUE_EMPTY]++;
1044#endif
1045	}
1046
1047	/*
1048	 * maxbufspace is the absolute maximum amount of buffer space we are
1049	 * allowed to reserve in KVM and in real terms.  The absolute maximum
1050	 * is nominally used by metadata.  hibufspace is the nominal maximum
1051	 * used by most other requests.  The differential is required to
1052	 * ensure that metadata deadlocks don't occur.
1053	 *
1054	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
1055	 * this may result in KVM fragmentation which is not handled optimally
1056	 * by the system. XXX This is less true with vmem.  We could use
1057	 * PAGE_SIZE.
1058	 */
1059	maxbufspace = (long)nbuf * BKVASIZE;
1060	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
1061	lobufspace = (hibufspace / 20) * 19; /* 95% */
1062	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
1063
1064	/*
1065	 * Note: The 16 MiB upper limit for hirunningspace was chosen
1066	 * arbitrarily and may need further tuning. It corresponds to
1067	 * 128 outstanding write IO requests (if IO size is 128 KiB),
1068	 * which fits with many RAID controllers' tagged queuing limits.
1069	 * The lower 1 MiB limit is the historical upper limit for
1070	 * hirunningspace.
1071	 */
1072	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
1073	    16 * 1024 * 1024), 1024 * 1024);
1074	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
1075
1076	/*
1077	 * Limit the amount of malloc memory since it is wired permanently into
1078	 * the kernel space.  Even though this is accounted for in the buffer
1079	 * allocation, we don't want the malloced region to grow uncontrolled.
1080	 * The malloc scheme improves memory utilization significantly on
1081	 * average (small) directories.
1082	 */
1083	maxbufmallocspace = hibufspace / 20;
1084
1085	/*
1086	 * Reduce the chance of a deadlock occuring by limiting the number
1087	 * of delayed-write dirty buffers we allow to stack up.
1088	 */
1089	hidirtybuffers = nbuf / 4 + 20;
1090	dirtybufthresh = hidirtybuffers * 9 / 10;
1091	numdirtybuffers = 0;
1092	/*
1093	 * To support extreme low-memory systems, make sure hidirtybuffers
1094	 * cannot eat up all available buffer space.  This occurs when our
1095	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
1096	 * buffer space assuming BKVASIZE'd buffers.
1097	 */
1098	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
1099		hidirtybuffers >>= 1;
1100	}
1101	lodirtybuffers = hidirtybuffers / 2;
1102
1103	/*
1104	 * lofreebuffers should be sufficient to avoid stalling waiting on
1105	 * buf headers under heavy utilization.  The bufs in per-cpu caches
1106	 * are counted as free but will be unavailable to threads executing
1107	 * on other cpus.
1108	 *
1109	 * hifreebuffers is the free target for the bufspace daemon.  This
1110	 * should be set appropriately to limit work per-iteration.
1111	 */
1112	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
1113	hifreebuffers = (3 * lofreebuffers) / 2;
1114	numfreebuffers = nbuf;
1115
1116	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
1117	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1118
1119	/* Setup the kva and free list allocators. */
1120	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
1121	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
1122	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
1123
1124	/*
1125	 * Size the clean queue according to the amount of buffer space.
1126	 * One queue per-256mb up to the max.  More queues gives better
1127	 * concurrency but less accurate LRU.
1128	 */
1129	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
1130
1131}
1132
1133#ifdef INVARIANTS
1134static inline void
1135vfs_buf_check_mapped(struct buf *bp)
1136{
1137
1138	KASSERT(bp->b_kvabase != unmapped_buf,
1139	    ("mapped buf: b_kvabase was not updated %p", bp));
1140	KASSERT(bp->b_data != unmapped_buf,
1141	    ("mapped buf: b_data was not updated %p", bp));
1142	KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
1143	    MAXPHYS, ("b_data + b_offset unmapped %p", bp));
1144}
1145
1146static inline void
1147vfs_buf_check_unmapped(struct buf *bp)
1148{
1149
1150	KASSERT(bp->b_data == unmapped_buf,
1151	    ("unmapped buf: corrupted b_data %p", bp));
1152}
1153
1154#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
1155#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
1156#else
1157#define	BUF_CHECK_MAPPED(bp) do {} while (0)
1158#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
1159#endif
1160
1161static int
1162isbufbusy(struct buf *bp)
1163{
1164	if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 &&
1165	    BUF_ISLOCKED(bp)) ||
1166	    ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
1167		return (1);
1168	return (0);
1169}
1170
1171/*
1172 * Shutdown the system cleanly to prepare for reboot, halt, or power off.
1173 */
1174void
1175bufshutdown(int show_busybufs)
1176{
1177	static int first_buf_printf = 1;
1178	struct buf *bp;
1179	int iter, nbusy, pbusy;
1180#ifndef PREEMPTION
1181	int subiter;
1182#endif
1183
1184	/*
1185	 * Sync filesystems for shutdown
1186	 */
1187	wdog_kern_pat(WD_LASTVAL);
1188	sys_sync(curthread, NULL);
1189
1190	/*
1191	 * With soft updates, some buffers that are
1192	 * written will be remarked as dirty until other
1193	 * buffers are written.
1194	 */
1195	for (iter = pbusy = 0; iter < 20; iter++) {
1196		nbusy = 0;
1197		for (bp = &buf[nbuf]; --bp >= buf; )
1198			if (isbufbusy(bp))
1199				nbusy++;
1200		if (nbusy == 0) {
1201			if (first_buf_printf)
1202				printf("All buffers synced.");
1203			break;
1204		}
1205		if (first_buf_printf) {
1206			printf("Syncing disks, buffers remaining... ");
1207			first_buf_printf = 0;
1208		}
1209		printf("%d ", nbusy);
1210		if (nbusy < pbusy)
1211			iter = 0;
1212		pbusy = nbusy;
1213
1214		wdog_kern_pat(WD_LASTVAL);
1215		sys_sync(curthread, NULL);
1216
1217#ifdef PREEMPTION
1218		/*
1219		 * Drop Giant and spin for a while to allow
1220		 * interrupt threads to run.
1221		 */
1222		DROP_GIANT();
1223		DELAY(50000 * iter);
1224		PICKUP_GIANT();
1225#else
1226		/*
1227		 * Drop Giant and context switch several times to
1228		 * allow interrupt threads to run.
1229		 */
1230		DROP_GIANT();
1231		for (subiter = 0; subiter < 50 * iter; subiter++) {
1232			thread_lock(curthread);
1233			mi_switch(SW_VOL, NULL);
1234			thread_unlock(curthread);
1235			DELAY(1000);
1236		}
1237		PICKUP_GIANT();
1238#endif
1239	}
1240	printf("\n");
1241	/*
1242	 * Count only busy local buffers to prevent forcing
1243	 * a fsck if we're just a client of a wedged NFS server
1244	 */
1245	nbusy = 0;
1246	for (bp = &buf[nbuf]; --bp >= buf; ) {
1247		if (isbufbusy(bp)) {
1248#if 0
1249/* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
1250			if (bp->b_dev == NULL) {
1251				TAILQ_REMOVE(&mountlist,
1252				    bp->b_vp->v_mount, mnt_list);
1253				continue;
1254			}
1255#endif
1256			nbusy++;
1257			if (show_busybufs > 0) {
1258				printf(
1259	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
1260				    nbusy, bp, bp->b_vp, bp->b_flags,
1261				    (intmax_t)bp->b_blkno,
1262				    (intmax_t)bp->b_lblkno);
1263				BUF_LOCKPRINTINFO(bp);
1264				if (show_busybufs > 1)
1265					vn_printf(bp->b_vp,
1266					    "vnode content: ");
1267			}
1268		}
1269	}
1270	if (nbusy) {
1271		/*
1272		 * Failed to sync all blocks. Indicate this and don't
1273		 * unmount filesystems (thus forcing an fsck on reboot).
1274		 */
1275		printf("Giving up on %d buffers\n", nbusy);
1276		DELAY(5000000);	/* 5 seconds */
1277	} else {
1278		if (!first_buf_printf)
1279			printf("Final sync complete\n");
1280		/*
1281		 * Unmount filesystems
1282		 */
1283		if (panicstr == NULL)
1284			vfs_unmountall();
1285	}
1286	swapoff_all();
1287	DELAY(100000);		/* wait for console output to finish */
1288}
1289
1290static void
1291bpmap_qenter(struct buf *bp)
1292{
1293
1294	BUF_CHECK_MAPPED(bp);
1295
1296	/*
1297	 * bp->b_data is relative to bp->b_offset, but
1298	 * bp->b_offset may be offset into the first page.
1299	 */
1300	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
1301	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
1302	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
1303	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
1304}
1305
1306/*
1307 *	binsfree:
1308 *
1309 *	Insert the buffer into the appropriate free list.
1310 */
1311static void
1312binsfree(struct buf *bp, int qindex)
1313{
1314	struct mtx *olock, *nlock;
1315
1316	if (qindex != QUEUE_EMPTY) {
1317		BUF_ASSERT_XLOCKED(bp);
1318	}
1319
1320	/*
1321	 * Stick to the same clean queue for the lifetime of the buf to
1322	 * limit locking below.  Otherwise pick ont sequentially.
1323	 */
1324	if (qindex == QUEUE_CLEAN) {
1325		if (bqisclean(bp->b_qindex))
1326			qindex = bp->b_qindex;
1327		else
1328			qindex = bqcleanq();
1329	}
1330
1331	/*
1332	 * Handle delayed bremfree() processing.
1333	 */
1334	nlock = bqlock(qindex);
1335	if (bp->b_flags & B_REMFREE) {
1336		olock = bqlock(bp->b_qindex);
1337		mtx_lock(olock);
1338		bremfreel(bp);
1339		if (olock != nlock) {
1340			mtx_unlock(olock);
1341			mtx_lock(nlock);
1342		}
1343	} else
1344		mtx_lock(nlock);
1345
1346	if (bp->b_qindex != QUEUE_NONE)
1347		panic("binsfree: free buffer onto another queue???");
1348
1349	bp->b_qindex = qindex;
1350	if (bp->b_flags & B_AGE)
1351		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1352	else
1353		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1354#ifdef INVARIANTS
1355	bq_len[bp->b_qindex]++;
1356#endif
1357	mtx_unlock(nlock);
1358}
1359
1360/*
1361 * buf_free:
1362 *
1363 *	Free a buffer to the buf zone once it no longer has valid contents.
1364 */
1365static void
1366buf_free(struct buf *bp)
1367{
1368
1369	if (bp->b_flags & B_REMFREE)
1370		bremfreef(bp);
1371	if (bp->b_vflags & BV_BKGRDINPROG)
1372		panic("losing buffer 1");
1373	if (bp->b_rcred != NOCRED) {
1374		crfree(bp->b_rcred);
1375		bp->b_rcred = NOCRED;
1376	}
1377	if (bp->b_wcred != NOCRED) {
1378		crfree(bp->b_wcred);
1379		bp->b_wcred = NOCRED;
1380	}
1381	if (!LIST_EMPTY(&bp->b_dep))
1382		buf_deallocate(bp);
1383	bufkva_free(bp);
1384	BUF_UNLOCK(bp);
1385	uma_zfree(buf_zone, bp);
1386	atomic_add_int(&numfreebuffers, 1);
1387	bufspace_wakeup();
1388}
1389
1390/*
1391 * buf_import:
1392 *
1393 *	Import bufs into the uma cache from the buf list.  The system still
1394 *	expects a static array of bufs and much of the synchronization
1395 *	around bufs assumes type stable storage.  As a result, UMA is used
1396 *	only as a per-cpu cache of bufs still maintained on a global list.
1397 */
1398static int
1399buf_import(void *arg, void **store, int cnt, int flags)
1400{
1401	struct buf *bp;
1402	int i;
1403
1404	mtx_lock(&bqlocks[QUEUE_EMPTY]);
1405	for (i = 0; i < cnt; i++) {
1406		bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1407		if (bp == NULL)
1408			break;
1409		bremfreel(bp);
1410		store[i] = bp;
1411	}
1412	mtx_unlock(&bqlocks[QUEUE_EMPTY]);
1413
1414	return (i);
1415}
1416
1417/*
1418 * buf_release:
1419 *
1420 *	Release bufs from the uma cache back to the buffer queues.
1421 */
1422static void
1423buf_release(void *arg, void **store, int cnt)
1424{
1425        int i;
1426
1427        for (i = 0; i < cnt; i++)
1428		binsfree(store[i], QUEUE_EMPTY);
1429}
1430
1431/*
1432 * buf_alloc:
1433 *
1434 *	Allocate an empty buffer header.
1435 */
1436static struct buf *
1437buf_alloc(void)
1438{
1439	struct buf *bp;
1440
1441	bp = uma_zalloc(buf_zone, M_NOWAIT);
1442	if (bp == NULL) {
1443		bufspace_daemonwakeup();
1444		atomic_add_int(&numbufallocfails, 1);
1445		return (NULL);
1446	}
1447
1448	/*
1449	 * Wake-up the bufspace daemon on transition.
1450	 */
1451	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
1452		bufspace_daemonwakeup();
1453
1454	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1455		panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
1456
1457	KASSERT(bp->b_vp == NULL,
1458	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
1459	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
1460	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
1461	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
1462	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
1463	KASSERT(bp->b_npages == 0,
1464	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
1465	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
1466	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
1467
1468	bp->b_flags = 0;
1469	bp->b_ioflags = 0;
1470	bp->b_xflags = 0;
1471	bp->b_vflags = 0;
1472	bp->b_vp = NULL;
1473	bp->b_blkno = bp->b_lblkno = 0;
1474	bp->b_offset = NOOFFSET;
1475	bp->b_iodone = 0;
1476	bp->b_error = 0;
1477	bp->b_resid = 0;
1478	bp->b_bcount = 0;
1479	bp->b_npages = 0;
1480	bp->b_dirtyoff = bp->b_dirtyend = 0;
1481	bp->b_bufobj = NULL;
1482	bp->b_pin_count = 0;
1483	bp->b_data = bp->b_kvabase = unmapped_buf;
1484	bp->b_fsprivate1 = NULL;
1485	bp->b_fsprivate2 = NULL;
1486	bp->b_fsprivate3 = NULL;
1487	LIST_INIT(&bp->b_dep);
1488
1489	return (bp);
1490}
1491
1492/*
1493 *	buf_qrecycle:
1494 *
1495 *	Free a buffer from the given bufqueue.  kva controls whether the
1496 *	freed buf must own some kva resources.  This is used for
1497 *	defragmenting.
1498 */
1499static int
1500buf_qrecycle(int qindex, bool kva)
1501{
1502	struct buf *bp, *nbp;
1503
1504	if (kva)
1505		atomic_add_int(&bufdefragcnt, 1);
1506	nbp = NULL;
1507	mtx_lock(&bqlocks[qindex]);
1508	nbp = TAILQ_FIRST(&bufqueues[qindex]);
1509
1510	/*
1511	 * Run scan, possibly freeing data and/or kva mappings on the fly
1512	 * depending.
1513	 */
1514	while ((bp = nbp) != NULL) {
1515		/*
1516		 * Calculate next bp (we can only use it if we do not
1517		 * release the bqlock).
1518		 */
1519		nbp = TAILQ_NEXT(bp, b_freelist);
1520
1521		/*
1522		 * If we are defragging then we need a buffer with
1523		 * some kva to reclaim.
1524		 */
1525		if (kva && bp->b_kvasize == 0)
1526			continue;
1527
1528		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1529			continue;
1530
1531		/*
1532		 * Skip buffers with background writes in progress.
1533		 */
1534		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
1535			BUF_UNLOCK(bp);
1536			continue;
1537		}
1538
1539		KASSERT(bp->b_qindex == qindex,
1540		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
1541		/*
1542		 * NOTE:  nbp is now entirely invalid.  We can only restart
1543		 * the scan from this point on.
1544		 */
1545		bremfreel(bp);
1546		mtx_unlock(&bqlocks[qindex]);
1547
1548		/*
1549		 * Requeue the background write buffer with error and
1550		 * restart the scan.
1551		 */
1552		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
1553			bqrelse(bp);
1554			mtx_lock(&bqlocks[qindex]);
1555			nbp = TAILQ_FIRST(&bufqueues[qindex]);
1556			continue;
1557		}
1558		bp->b_flags |= B_INVAL;
1559		brelse(bp);
1560		return (0);
1561	}
1562	mtx_unlock(&bqlocks[qindex]);
1563
1564	return (ENOBUFS);
1565}
1566
1567/*
1568 *	buf_recycle:
1569 *
1570 *	Iterate through all clean queues until we find a buf to recycle or
1571 *	exhaust the search.
1572 */
1573static int
1574buf_recycle(bool kva)
1575{
1576	int qindex, first_qindex;
1577
1578	qindex = first_qindex = bqcleanq();
1579	do {
1580		if (buf_qrecycle(qindex, kva) == 0)
1581			return (0);
1582		if (++qindex == QUEUE_CLEAN + clean_queues)
1583			qindex = QUEUE_CLEAN;
1584	} while (qindex != first_qindex);
1585
1586	return (ENOBUFS);
1587}
1588
1589/*
1590 *	buf_scan:
1591 *
1592 *	Scan the clean queues looking for a buffer to recycle.  needsbuffer
1593 *	is set on failure so that the caller may optionally bufspace_wait()
1594 *	in a race-free fashion.
1595 */
1596static int
1597buf_scan(bool defrag)
1598{
1599	int error;
1600
1601	/*
1602	 * To avoid heavy synchronization and wakeup races we set
1603	 * needsbuffer and re-poll before failing.  This ensures that
1604	 * no frees can be missed between an unsuccessful poll and
1605	 * going to sleep in a synchronized fashion.
1606	 */
1607	if ((error = buf_recycle(defrag)) != 0) {
1608		atomic_set_int(&needsbuffer, 1);
1609		bufspace_daemonwakeup();
1610		error = buf_recycle(defrag);
1611	}
1612	if (error == 0)
1613		atomic_add_int(&getnewbufrestarts, 1);
1614	return (error);
1615}
1616
1617/*
1618 *	bremfree:
1619 *
1620 *	Mark the buffer for removal from the appropriate free list.
1621 *
1622 */
1623void
1624bremfree(struct buf *bp)
1625{
1626
1627	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1628	KASSERT((bp->b_flags & B_REMFREE) == 0,
1629	    ("bremfree: buffer %p already marked for delayed removal.", bp));
1630	KASSERT(bp->b_qindex != QUEUE_NONE,
1631	    ("bremfree: buffer %p not on a queue.", bp));
1632	BUF_ASSERT_XLOCKED(bp);
1633
1634	bp->b_flags |= B_REMFREE;
1635}
1636
1637/*
1638 *	bremfreef:
1639 *
1640 *	Force an immediate removal from a free list.  Used only in nfs when
1641 *	it abuses the b_freelist pointer.
1642 */
1643void
1644bremfreef(struct buf *bp)
1645{
1646	struct mtx *qlock;
1647
1648	qlock = bqlock(bp->b_qindex);
1649	mtx_lock(qlock);
1650	bremfreel(bp);
1651	mtx_unlock(qlock);
1652}
1653
1654/*
1655 *	bremfreel:
1656 *
1657 *	Removes a buffer from the free list, must be called with the
1658 *	correct qlock held.
1659 */
1660static void
1661bremfreel(struct buf *bp)
1662{
1663
1664	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1665	    bp, bp->b_vp, bp->b_flags);
1666	KASSERT(bp->b_qindex != QUEUE_NONE,
1667	    ("bremfreel: buffer %p not on a queue.", bp));
1668	if (bp->b_qindex != QUEUE_EMPTY) {
1669		BUF_ASSERT_XLOCKED(bp);
1670	}
1671	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1672
1673	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1674#ifdef INVARIANTS
1675	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1676	    bp->b_qindex));
1677	bq_len[bp->b_qindex]--;
1678#endif
1679	bp->b_qindex = QUEUE_NONE;
1680	bp->b_flags &= ~B_REMFREE;
1681}
1682
1683/*
1684 *	bufkva_free:
1685 *
1686 *	Free the kva allocation for a buffer.
1687 *
1688 */
1689static void
1690bufkva_free(struct buf *bp)
1691{
1692
1693#ifdef INVARIANTS
1694	if (bp->b_kvasize == 0) {
1695		KASSERT(bp->b_kvabase == unmapped_buf &&
1696		    bp->b_data == unmapped_buf,
1697		    ("Leaked KVA space on %p", bp));
1698	} else if (buf_mapped(bp))
1699		BUF_CHECK_MAPPED(bp);
1700	else
1701		BUF_CHECK_UNMAPPED(bp);
1702#endif
1703	if (bp->b_kvasize == 0)
1704		return;
1705
1706	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
1707	atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
1708	atomic_add_int(&buffreekvacnt, 1);
1709	bp->b_data = bp->b_kvabase = unmapped_buf;
1710	bp->b_kvasize = 0;
1711}
1712
1713/*
1714 *	bufkva_alloc:
1715 *
1716 *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
1717 */
1718static int
1719bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
1720{
1721	vm_offset_t addr;
1722	int error;
1723
1724	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
1725	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));
1726
1727	bufkva_free(bp);
1728
1729	addr = 0;
1730	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
1731	if (error != 0) {
1732		/*
1733		 * Buffer map is too fragmented.  Request the caller
1734		 * to defragment the map.
1735		 */
1736		return (error);
1737	}
1738	bp->b_kvabase = (caddr_t)addr;
1739	bp->b_kvasize = maxsize;
1740	atomic_add_long(&bufkvaspace, bp->b_kvasize);
1741	if ((gbflags & GB_UNMAPPED) != 0) {
1742		bp->b_data = unmapped_buf;
1743		BUF_CHECK_UNMAPPED(bp);
1744	} else {
1745		bp->b_data = bp->b_kvabase;
1746		BUF_CHECK_MAPPED(bp);
1747	}
1748	return (0);
1749}
1750
1751/*
1752 *	bufkva_reclaim:
1753 *
1754 *	Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
1755 *	callback that fires to avoid returning failure.
1756 */
1757static void
1758bufkva_reclaim(vmem_t *vmem, int flags)
1759{
1760	int i;
1761
1762	for (i = 0; i < 5; i++)
1763		if (buf_scan(true) != 0)
1764			break;
1765	return;
1766}
1767
1768
1769/*
1770 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
1771 * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
1772 * the buffer is valid and we do not have to do anything.
1773 */
1774void
1775breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1776    int cnt, struct ucred * cred)
1777{
1778	struct buf *rabp;
1779	int i;
1780
1781	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
1782		if (inmem(vp, *rablkno))
1783			continue;
1784		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
1785
1786		if ((rabp->b_flags & B_CACHE) == 0) {
1787			if (!TD_IS_IDLETHREAD(curthread))
1788				curthread->td_ru.ru_inblock++;
1789			rabp->b_flags |= B_ASYNC;
1790			rabp->b_flags &= ~B_INVAL;
1791			rabp->b_ioflags &= ~BIO_ERROR;
1792			rabp->b_iocmd = BIO_READ;
1793			if (rabp->b_rcred == NOCRED && cred != NOCRED)
1794				rabp->b_rcred = crhold(cred);
1795			vfs_busy_pages(rabp, 0);
1796			BUF_KERNPROC(rabp);
1797			rabp->b_iooffset = dbtob(rabp->b_blkno);
1798			bstrategy(rabp);
1799		} else {
1800			brelse(rabp);
1801		}
1802	}
1803}
1804
1805/*
1806 * Entry point for bread() and breadn() via #defines in sys/buf.h.
1807 *
1808 * Get a buffer with the specified data.  Look in the cache first.  We
1809 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
1810 * is set, the buffer is valid and we do not have to do anything, see
1811 * getblk(). Also starts asynchronous I/O on read-ahead blocks.
1812 */
1813int
1814breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
1815    int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
1816{
1817	struct buf *bp;
1818	int rv = 0, readwait = 0;
1819
1820	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
1821	/*
1822	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
1823	 */
1824	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
1825	if (bp == NULL)
1826		return (EBUSY);
1827
1828	/* if not found in cache, do some I/O */
1829	if ((bp->b_flags & B_CACHE) == 0) {
1830		if (!TD_IS_IDLETHREAD(curthread))
1831			curthread->td_ru.ru_inblock++;
1832		bp->b_iocmd = BIO_READ;
1833		bp->b_flags &= ~B_INVAL;
1834		bp->b_ioflags &= ~BIO_ERROR;
1835		if (bp->b_rcred == NOCRED && cred != NOCRED)
1836			bp->b_rcred = crhold(cred);
1837		vfs_busy_pages(bp, 0);
1838		bp->b_iooffset = dbtob(bp->b_blkno);
1839		bstrategy(bp);
1840		++readwait;
1841	}
1842
1843	breada(vp, rablkno, rabsize, cnt, cred);
1844
1845	if (readwait) {
1846		rv = bufwait(bp);
1847	}
1848	return (rv);
1849}
1850
1851/*
1852 * Write, release buffer on completion.  (Done by iodone
1853 * if async).  Do not bother writing anything if the buffer
1854 * is invalid.
1855 *
1856 * Note that we set B_CACHE here, indicating that buffer is
1857 * fully valid and thus cacheable.  This is true even of NFS
1858 * now so we set it generally.  This could be set either here
1859 * or in biodone() since the I/O is synchronous.  We put it
1860 * here.
1861 */
1862int
1863bufwrite(struct buf *bp)
1864{
1865	int oldflags;
1866	struct vnode *vp;
1867	long space;
1868	int vp_md;
1869
1870	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1871	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
1872		bp->b_flags |= B_INVAL | B_RELBUF;
1873		bp->b_flags &= ~B_CACHE;
1874		brelse(bp);
1875		return (ENXIO);
1876	}
1877	if (bp->b_flags & B_INVAL) {
1878		brelse(bp);
1879		return (0);
1880	}
1881
1882	if (bp->b_flags & B_BARRIER)
1883		barrierwrites++;
1884
1885	oldflags = bp->b_flags;
1886
1887	BUF_ASSERT_HELD(bp);
1888
1889	if (bp->b_pin_count > 0)
1890		bunpin_wait(bp);
1891
1892	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
1893	    ("FFS background buffer should not get here %p", bp));
1894
1895	vp = bp->b_vp;
1896	if (vp)
1897		vp_md = vp->v_vflag & VV_MD;
1898	else
1899		vp_md = 0;
1900
1901	/*
1902	 * Mark the buffer clean.  Increment the bufobj write count
1903	 * before bundirty() call, to prevent other thread from seeing
1904	 * empty dirty list and zero counter for writes in progress,
1905	 * falsely indicating that the bufobj is clean.
1906	 */
1907	bufobj_wref(bp->b_bufobj);
1908	bundirty(bp);
1909
1910	bp->b_flags &= ~B_DONE;
1911	bp->b_ioflags &= ~BIO_ERROR;
1912	bp->b_flags |= B_CACHE;
1913	bp->b_iocmd = BIO_WRITE;
1914
1915	vfs_busy_pages(bp, 1);
1916
1917	/*
1918	 * Normal bwrites pipeline writes
1919	 */
1920	bp->b_runningbufspace = bp->b_bufsize;
1921	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
1922
1923	if (!TD_IS_IDLETHREAD(curthread))
1924		curthread->td_ru.ru_oublock++;
1925	if (oldflags & B_ASYNC)
1926		BUF_KERNPROC(bp);
1927	bp->b_iooffset = dbtob(bp->b_blkno);
1928	bstrategy(bp);
1929
1930	if ((oldflags & B_ASYNC) == 0) {
1931		int rtval = bufwait(bp);
1932		brelse(bp);
1933		return (rtval);
1934	} else if (space > hirunningspace) {
1935		/*
1936		 * don't allow the async write to saturate the I/O
1937		 * system.  We will not deadlock here because
1938		 * we are blocking waiting for I/O that is already in-progress
1939		 * to complete. We do not block here if it is the update
1940		 * or syncer daemon trying to clean up as that can lead
1941		 * to deadlock.
1942		 */
1943		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
1944			waitrunningbufspace();
1945	}
1946
1947	return (0);
1948}
1949
1950void
1951bufbdflush(struct bufobj *bo, struct buf *bp)
1952{
1953	struct buf *nbp;
1954
1955	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
1956		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
1957		altbufferflushes++;
1958	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
1959		BO_LOCK(bo);
1960		/*
1961		 * Try to find a buffer to flush.
1962		 */
1963		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
1964			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
1965			    BUF_LOCK(nbp,
1966				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
1967				continue;
1968			if (bp == nbp)
1969				panic("bdwrite: found ourselves");
1970			BO_UNLOCK(bo);
1971			/* Don't countdeps with the bo lock held. */
1972			if (buf_countdeps(nbp, 0)) {
1973				BO_LOCK(bo);
1974				BUF_UNLOCK(nbp);
1975				continue;
1976			}
1977			if (nbp->b_flags & B_CLUSTEROK) {
1978				vfs_bio_awrite(nbp);
1979			} else {
1980				bremfree(nbp);
1981				bawrite(nbp);
1982			}
1983			dirtybufferflushes++;
1984			break;
1985		}
1986		if (nbp == NULL)
1987			BO_UNLOCK(bo);
1988	}
1989}
1990
1991/*
1992 * Delayed write. (Buffer is marked dirty).  Do not bother writing
1993 * anything if the buffer is marked invalid.
1994 *
1995 * Note that since the buffer must be completely valid, we can safely
1996 * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
1997 * biodone() in order to prevent getblk from writing the buffer
1998 * out synchronously.
1999 */
2000void
2001bdwrite(struct buf *bp)
2002{
2003	struct thread *td = curthread;
2004	struct vnode *vp;
2005	struct bufobj *bo;
2006
2007	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2008	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2009	KASSERT((bp->b_flags & B_BARRIER) == 0,
2010	    ("Barrier request in delayed write %p", bp));
2011	BUF_ASSERT_HELD(bp);
2012
2013	if (bp->b_flags & B_INVAL) {
2014		brelse(bp);
2015		return;
2016	}
2017
2018	/*
2019	 * If we have too many dirty buffers, don't create any more.
2020	 * If we are wildly over our limit, then force a complete
2021	 * cleanup. Otherwise, just keep the situation from getting
2022	 * out of control. Note that we have to avoid a recursive
2023	 * disaster and not try to clean up after our own cleanup!
2024	 */
2025	vp = bp->b_vp;
2026	bo = bp->b_bufobj;
2027	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
2028		td->td_pflags |= TDP_INBDFLUSH;
2029		BO_BDFLUSH(bo, bp);
2030		td->td_pflags &= ~TDP_INBDFLUSH;
2031	} else
2032		recursiveflushes++;
2033
2034	bdirty(bp);
2035	/*
2036	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
2037	 * true even of NFS now.
2038	 */
2039	bp->b_flags |= B_CACHE;
2040
2041	/*
2042	 * This bmap keeps the system from needing to do the bmap later,
2043	 * perhaps when the system is attempting to do a sync.  Since it
2044	 * is likely that the indirect block -- or whatever other datastructure
2045	 * that the filesystem needs is still in memory now, it is a good
2046	 * thing to do this.  Note also, that if the pageout daemon is
2047	 * requesting a sync -- there might not be enough memory to do
2048	 * the bmap then...  So, this is important to do.
2049	 */
2050	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
2051		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
2052	}
2053
2054	/*
2055	 * Set the *dirty* buffer range based upon the VM system dirty
2056	 * pages.
2057	 *
2058	 * Mark the buffer pages as clean.  We need to do this here to
2059	 * satisfy the vnode_pager and the pageout daemon, so that it
2060	 * thinks that the pages have been "cleaned".  Note that since
2061	 * the pages are in a delayed write buffer -- the VFS layer
2062	 * "will" see that the pages get written out on the next sync,
2063	 * or perhaps the cluster will be completed.
2064	 */
2065	vfs_clean_pages_dirty_buf(bp);
2066	bqrelse(bp);
2067
2068	/*
2069	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
2070	 * due to the softdep code.
2071	 */
2072}
2073
2074/*
2075 *	bdirty:
2076 *
2077 *	Turn buffer into delayed write request.  We must clear BIO_READ and
2078 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
2079 *	itself to properly update it in the dirty/clean lists.  We mark it
2080 *	B_DONE to ensure that any asynchronization of the buffer properly
2081 *	clears B_DONE ( else a panic will occur later ).
2082 *
2083 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
2084 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
2085 *	should only be called if the buffer is known-good.
2086 *
2087 *	Since the buffer is not on a queue, we do not update the numfreebuffers
2088 *	count.
2089 *
2090 *	The buffer must be on QUEUE_NONE.
2091 */
2092void
2093bdirty(struct buf *bp)
2094{
2095
2096	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
2097	    bp, bp->b_vp, bp->b_flags);
2098	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2099	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
2100	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
2101	BUF_ASSERT_HELD(bp);
2102	bp->b_flags &= ~(B_RELBUF);
2103	bp->b_iocmd = BIO_WRITE;
2104
2105	if ((bp->b_flags & B_DELWRI) == 0) {
2106		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
2107		reassignbuf(bp);
2108		bdirtyadd();
2109	}
2110}
2111
2112/*
2113 *	bundirty:
2114 *
2115 *	Clear B_DELWRI for buffer.
2116 *
2117 *	Since the buffer is not on a queue, we do not update the numfreebuffers
2118 *	count.
2119 *
2120 *	The buffer must be on QUEUE_NONE.
2121 */
2122
2123void
2124bundirty(struct buf *bp)
2125{
2126
2127	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2128	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2129	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
2130	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
2131	BUF_ASSERT_HELD(bp);
2132
2133	if (bp->b_flags & B_DELWRI) {
2134		bp->b_flags &= ~B_DELWRI;
2135		reassignbuf(bp);
2136		bdirtysub();
2137	}
2138	/*
2139	 * Since it is now being written, we can clear its deferred write flag.
2140	 */
2141	bp->b_flags &= ~B_DEFERRED;
2142}
2143
2144/*
2145 *	bawrite:
2146 *
2147 *	Asynchronous write.  Start output on a buffer, but do not wait for
2148 *	it to complete.  The buffer is released when the output completes.
2149 *
2150 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
2151 *	B_INVAL buffers.  Not us.
2152 */
2153void
2154bawrite(struct buf *bp)
2155{
2156
2157	bp->b_flags |= B_ASYNC;
2158	(void) bwrite(bp);
2159}
2160
2161/*
2162 *	babarrierwrite:
2163 *
2164 *	Asynchronous barrier write.  Start output on a buffer, but do not
2165 *	wait for it to complete.  Place a write barrier after this write so
2166 *	that this buffer and all buffers written before it are committed to
2167 *	the disk before any buffers written after this write are committed
2168 *	to the disk.  The buffer is released when the output completes.
2169 */
2170void
2171babarrierwrite(struct buf *bp)
2172{
2173
2174	bp->b_flags |= B_ASYNC | B_BARRIER;
2175	(void) bwrite(bp);
2176}
2177
2178/*
2179 *	bbarrierwrite:
2180 *
2181 *	Synchronous barrier write.  Start output on a buffer and wait for
2182 *	it to complete.  Place a write barrier after this write so that
2183 *	this buffer and all buffers written before it are committed to
2184 *	the disk before any buffers written after this write are committed
2185 *	to the disk.  The buffer is released when the output completes.
2186 */
2187int
2188bbarrierwrite(struct buf *bp)
2189{
2190
2191	bp->b_flags |= B_BARRIER;
2192	return (bwrite(bp));
2193}
2194
2195/*
2196 *	bwillwrite:
2197 *
2198 *	Called prior to the locking of any vnodes when we are expecting to
2199 *	write.  We do not want to starve the buffer cache with too many
2200 *	dirty buffers so we block here.  By blocking prior to the locking
2201 *	of any vnodes we attempt to avoid the situation where a locked vnode
2202 *	prevents the various system daemons from flushing related buffers.
2203 */
2204void
2205bwillwrite(void)
2206{
2207
2208	if (numdirtybuffers >= hidirtybuffers) {
2209		mtx_lock(&bdirtylock);
2210		while (numdirtybuffers >= hidirtybuffers) {
2211			bdirtywait = 1;
2212			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
2213			    "flswai", 0);
2214		}
2215		mtx_unlock(&bdirtylock);
2216	}
2217}
2218
2219/*
2220 * Return true if we have too many dirty buffers.
2221 */
2222int
2223buf_dirty_count_severe(void)
2224{
2225
2226	return(numdirtybuffers >= hidirtybuffers);
2227}
2228
2229/*
2230 *	brelse:
2231 *
2232 *	Release a busy buffer and, if requested, free its resources.  The
2233 *	buffer will be stashed in the appropriate bufqueue[] allowing it
2234 *	to be accessed later as a cache entity or reused for other purposes.
2235 */
2236void
2237brelse(struct buf *bp)
2238{
2239	int qindex;
2240
2241	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
2242	    bp, bp->b_vp, bp->b_flags);
2243	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
2244	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
2245	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
2246	    ("brelse: non-VMIO buffer marked NOREUSE"));
2247
2248	if (BUF_LOCKRECURSED(bp)) {
2249		/*
2250		 * Do not process, in particular, do not handle the
2251		 * B_INVAL/B_RELBUF and do not release to free list.
2252		 */
2253		BUF_UNLOCK(bp);
2254		return;
2255	}
2256
2257	if (bp->b_flags & B_MANAGED) {
2258		bqrelse(bp);
2259		return;
2260	}
2261
2262	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
2263		BO_LOCK(bp->b_bufobj);
2264		bp->b_vflags &= ~BV_BKGRDERR;
2265		BO_UNLOCK(bp->b_bufobj);
2266		bdirty(bp);
2267	}
2268	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
2269	    !(bp->b_flags & B_INVAL)) {
2270		/*
2271		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
2272		 * pages from being scrapped.
2273		 */
2274		bp->b_ioflags &= ~BIO_ERROR;
2275		bdirty(bp);
2276	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
2277	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
2278		/*
2279		 * Either a failed read I/O or we were asked to free or not
2280		 * cache the buffer.
2281		 */
2282		bp->b_flags |= B_INVAL;
2283		if (!LIST_EMPTY(&bp->b_dep))
2284			buf_deallocate(bp);
2285		if (bp->b_flags & B_DELWRI)
2286			bdirtysub();
2287		bp->b_flags &= ~(B_DELWRI | B_CACHE);
2288		if ((bp->b_flags & B_VMIO) == 0) {
2289			allocbuf(bp, 0);
2290			if (bp->b_vp)
2291				brelvp(bp);
2292		}
2293	}
2294
2295	/*
2296	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate()
2297	 * is called with B_DELWRI set, the underlying pages may wind up
2298	 * getting freed causing a previous write (bdwrite()) to get 'lost'
2299	 * because pages associated with a B_DELWRI bp are marked clean.
2300	 *
2301	 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
2302	 * if B_DELWRI is set.
2303	 */
2304	if (bp->b_flags & B_DELWRI)
2305		bp->b_flags &= ~B_RELBUF;
2306
2307	/*
2308	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
2309	 * constituted, not even NFS buffers now.  Two flags effect this.  If
2310	 * B_INVAL, the struct buf is invalidated but the VM object is kept
2311	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
2312	 *
2313	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
2314	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
2315	 * buffer is also B_INVAL because it hits the re-dirtying code above.
2316	 *
2317	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
2318	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
2319	 * the commit state and we cannot afford to lose the buffer. If the
2320	 * buffer has a background write in progress, we need to keep it
2321	 * around to prevent it from being reconstituted and starting a second
2322	 * background write.
2323	 */
2324	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
2325	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
2326	    !(bp->b_vp->v_mount != NULL &&
2327	    (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
2328	    !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) {
2329		vfs_vmio_invalidate(bp);
2330		allocbuf(bp, 0);
2331	}
2332
2333	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
2334	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
2335		allocbuf(bp, 0);
2336		bp->b_flags &= ~B_NOREUSE;
2337		if (bp->b_vp != NULL)
2338			brelvp(bp);
2339	}
2340
2341	/*
2342	 * If the buffer has junk contents signal it and eventually
2343	 * clean up B_DELWRI and diassociate the vnode so that gbincore()
2344	 * doesn't find it.
2345	 */
2346	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
2347	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
2348		bp->b_flags |= B_INVAL;
2349	if (bp->b_flags & B_INVAL) {
2350		if (bp->b_flags & B_DELWRI)
2351			bundirty(bp);
2352		if (bp->b_vp)
2353			brelvp(bp);
2354	}
2355
2356	/* buffers with no memory */
2357	if (bp->b_bufsize == 0) {
2358		buf_free(bp);
2359		return;
2360	}
2361	/* buffers with junk contents */
2362	if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
2363	    (bp->b_ioflags & BIO_ERROR)) {
2364		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
2365		if (bp->b_vflags & BV_BKGRDINPROG)
2366			panic("losing buffer 2");
2367		qindex = QUEUE_CLEAN;
2368		bp->b_flags |= B_AGE;
2369	/* remaining buffers */
2370	} else if (bp->b_flags & B_DELWRI)
2371		qindex = QUEUE_DIRTY;
2372	else
2373		qindex = QUEUE_CLEAN;
2374
2375	binsfree(bp, qindex);
2376
2377	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
2378	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
2379		panic("brelse: not dirty");
2380	/* unlock */
2381	BUF_UNLOCK(bp);
2382	if (qindex == QUEUE_CLEAN)
2383		bufspace_wakeup();
2384}
2385
2386/*
2387 * Release a buffer back to the appropriate queue but do not try to free
2388 * it.  The buffer is expected to be used again soon.
2389 *
2390 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
2391 * biodone() to requeue an async I/O on completion.  It is also used when
2392 * known good buffers need to be requeued but we think we may need the data
2393 * again soon.
2394 *
2395 * XXX we should be able to leave the B_RELBUF hint set on completion.
2396 */
2397void
2398bqrelse(struct buf *bp)
2399{
2400	int qindex;
2401
2402	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2403	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
2404	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
2405
2406	qindex = QUEUE_NONE;
2407	if (BUF_LOCKRECURSED(bp)) {
2408		/* do not release to free list */
2409		BUF_UNLOCK(bp);
2410		return;
2411	}
2412	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
2413
2414	if (bp->b_flags & B_MANAGED) {
2415		if (bp->b_flags & B_REMFREE)
2416			bremfreef(bp);
2417		goto out;
2418	}
2419
2420	/* buffers with stale but valid contents */
2421	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
2422	    BV_BKGRDERR)) == BV_BKGRDERR) {
2423		BO_LOCK(bp->b_bufobj);
2424		bp->b_vflags &= ~BV_BKGRDERR;
2425		BO_UNLOCK(bp->b_bufobj);
2426		qindex = QUEUE_DIRTY;
2427	} else {
2428		if ((bp->b_flags & B_DELWRI) == 0 &&
2429		    (bp->b_xflags & BX_VNDIRTY))
2430			panic("bqrelse: not dirty");
2431		if ((bp->b_flags & B_NOREUSE) != 0) {
2432			brelse(bp);
2433			return;
2434		}
2435		qindex = QUEUE_CLEAN;
2436	}
2437	binsfree(bp, qindex);
2438
2439out:
2440	/* unlock */
2441	BUF_UNLOCK(bp);
2442	if (qindex == QUEUE_CLEAN)
2443		bufspace_wakeup();
2444}
2445
2446/*
2447 * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
2448 * restore bogus pages.
2449 */
2450static void
2451vfs_vmio_iodone(struct buf *bp)
2452{
2453	vm_ooffset_t foff;
2454	vm_page_t m;
2455	vm_object_t obj;
2456	struct vnode *vp;
2457	int bogus, i, iosize;
2458
2459	obj = bp->b_bufobj->bo_object;
2460	KASSERT(obj->paging_in_progress >= bp->b_npages,
2461	    ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
2462	    obj->paging_in_progress, bp->b_npages));
2463
2464	vp = bp->b_vp;
2465	KASSERT(vp->v_holdcnt > 0,
2466	    ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
2467	KASSERT(vp->v_object != NULL,
2468	    ("vfs_vmio_iodone: vnode %p has no vm_object", vp));
2469
2470	foff = bp->b_offset;
2471	KASSERT(bp->b_offset != NOOFFSET,
2472	    ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
2473
2474	bogus = 0;
2475	iosize = bp->b_bcount - bp->b_resid;
2476	VM_OBJECT_WLOCK(obj);
2477	for (i = 0; i < bp->b_npages; i++) {
2478		int resid;
2479
2480		resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
2481		if (resid > iosize)
2482			resid = iosize;
2483
2484		/*
2485		 * cleanup bogus pages, restoring the originals
2486		 */
2487		m = bp->b_pages[i];
2488		if (m == bogus_page) {
2489			bogus = 1;
2490			m = vm_page_lookup(obj, OFF_TO_IDX(foff));
2491			if (m == NULL)
2492				panic("biodone: page disappeared!");
2493			bp->b_pages[i] = m;
2494		} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
2495			/*
2496			 * In the write case, the valid and clean bits are
2497			 * already changed correctly ( see bdwrite() ), so we
2498			 * only need to do this here in the read case.
2499			 */
2500			KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
2501			    resid)) == 0, ("vfs_vmio_iodone: page %p "
2502			    "has unexpected dirty bits", m));
2503			vfs_page_set_valid(bp, foff, m);
2504		}
2505		KASSERT(OFF_TO_IDX(foff) == m->pindex,
2506		    ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
2507		    (intmax_t)foff, (uintmax_t)m->pindex));
2508
2509		vm_page_sunbusy(m);
2510		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2511		iosize -= resid;
2512	}
2513	vm_object_pip_wakeupn(obj, bp->b_npages);
2514	VM_OBJECT_WUNLOCK(obj);
2515	if (bogus && buf_mapped(bp)) {
2516		BUF_CHECK_MAPPED(bp);
2517		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
2518		    bp->b_pages, bp->b_npages);
2519	}
2520}
2521
2522/*
2523 * Unwire a page held by a buf and place it on the appropriate vm queue.
2524 */
2525static void
2526vfs_vmio_unwire(struct buf *bp, vm_page_t m)
2527{
2528	bool freed;
2529
2530	vm_page_lock(m);
2531	if (vm_page_unwire(m, PQ_NONE)) {
2532		/*
2533		 * Determine if the page should be freed before adding
2534		 * it to the inactive queue.
2535		 */
2536		if (m->valid == 0) {
2537			freed = !vm_page_busied(m);
2538			if (freed)
2539				vm_page_free(m);
2540		} else if ((bp->b_flags & B_DIRECT) != 0)
2541			freed = vm_page_try_to_free(m);
2542		else
2543			freed = false;
2544		if (!freed) {
2545			/*
2546			 * If the page is unlikely to be reused, let the
2547			 * VM know.  Otherwise, maintain LRU page
2548			 * ordering and put the page at the tail of the
2549			 * inactive queue.
2550			 */
2551			if ((bp->b_flags & B_NOREUSE) != 0)
2552				vm_page_deactivate_noreuse(m);
2553			else
2554				vm_page_deactivate(m);
2555		}
2556	}
2557	vm_page_unlock(m);
2558}
2559
2560/*
2561 * Perform page invalidation when a buffer is released.  The fully invalid
2562 * pages will be reclaimed later in vfs_vmio_truncate().
2563 */
2564static void
2565vfs_vmio_invalidate(struct buf *bp)
2566{
2567	vm_object_t obj;
2568	vm_page_t m;
2569	int i, resid, poffset, presid;
2570
2571	if (buf_mapped(bp)) {
2572		BUF_CHECK_MAPPED(bp);
2573		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
2574	} else
2575		BUF_CHECK_UNMAPPED(bp);
2576	/*
2577	 * Get the base offset and length of the buffer.  Note that
2578	 * in the VMIO case if the buffer block size is not
2579	 * page-aligned then b_data pointer may not be page-aligned.
2580	 * But our b_pages[] array *IS* page aligned.
2581	 *
2582	 * block sizes less then DEV_BSIZE (usually 512) are not
2583	 * supported due to the page granularity bits (m->valid,
2584	 * m->dirty, etc...).
2585	 *
2586	 * See man buf(9) for more information
2587	 */
2588	obj = bp->b_bufobj->bo_object;
2589	resid = bp->b_bufsize;
2590	poffset = bp->b_offset & PAGE_MASK;
2591	VM_OBJECT_WLOCK(obj);
2592	for (i = 0; i < bp->b_npages; i++) {
2593		m = bp->b_pages[i];
2594		if (m == bogus_page)
2595			panic("vfs_vmio_invalidate: Unexpected bogus page.");
2596		bp->b_pages[i] = NULL;
2597
2598		presid = resid > (PAGE_SIZE - poffset) ?
2599		    (PAGE_SIZE - poffset) : resid;
2600		KASSERT(presid >= 0, ("brelse: extra page"));
2601		while (vm_page_xbusied(m)) {
2602			vm_page_lock(m);
2603			VM_OBJECT_WUNLOCK(obj);
2604			vm_page_busy_sleep(m, "mbncsh");
2605			VM_OBJECT_WLOCK(obj);
2606		}
2607		if (pmap_page_wired_mappings(m) == 0)
2608			vm_page_set_invalid(m, poffset, presid);
2609		vfs_vmio_unwire(bp, m);
2610		resid -= presid;
2611		poffset = 0;
2612	}
2613	VM_OBJECT_WUNLOCK(obj);
2614	bp->b_npages = 0;
2615}
2616
2617/*
2618 * Page-granular truncation of an existing VMIO buffer.
2619 */
2620static void
2621vfs_vmio_truncate(struct buf *bp, int desiredpages)
2622{
2623	vm_object_t obj;
2624	vm_page_t m;
2625	int i;
2626
2627	if (bp->b_npages == desiredpages)
2628		return;
2629
2630	if (buf_mapped(bp)) {
2631		BUF_CHECK_MAPPED(bp);
2632		pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
2633		    (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
2634	} else
2635		BUF_CHECK_UNMAPPED(bp);
2636	obj = bp->b_bufobj->bo_object;
2637	if (obj != NULL)
2638		VM_OBJECT_WLOCK(obj);
2639	for (i = desiredpages; i < bp->b_npages; i++) {
2640		m = bp->b_pages[i];
2641		KASSERT(m != bogus_page, ("allocbuf: bogus page found"));
2642		bp->b_pages[i] = NULL;
2643		vfs_vmio_unwire(bp, m);
2644	}
2645	if (obj != NULL)
2646		VM_OBJECT_WUNLOCK(obj);
2647	bp->b_npages = desiredpages;
2648}
2649
2650/*
2651 * Byte granular extension of VMIO buffers.
2652 */
2653static void
2654vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
2655{
2656	/*
2657	 * We are growing the buffer, possibly in a
2658	 * byte-granular fashion.
2659	 */
2660	vm_object_t obj;
2661	vm_offset_t toff;
2662	vm_offset_t tinc;
2663	vm_page_t m;
2664
2665	/*
2666	 * Step 1, bring in the VM pages from the object, allocating
2667	 * them if necessary.  We must clear B_CACHE if these pages
2668	 * are not valid for the range covered by the buffer.
2669	 */
2670	obj = bp->b_bufobj->bo_object;
2671	VM_OBJECT_WLOCK(obj);
2672	while (bp->b_npages < desiredpages) {
2673		/*
2674		 * We must allocate system pages since blocking
2675		 * here could interfere with paging I/O, no
2676		 * matter which process we are.
2677		 *
2678		 * Only exclusive busy can be tested here.
2679		 * Blocking on shared busy might lead to
2680		 * deadlocks once allocbuf() is called after
2681		 * pages are vfs_busy_pages().
2682		 */
2683		m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages,
2684		    VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM |
2685		    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
2686		    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
2687		if (m->valid == 0)
2688			bp->b_flags &= ~B_CACHE;
2689		bp->b_pages[bp->b_npages] = m;
2690		++bp->b_npages;
2691	}
2692
2693	/*
2694	 * Step 2.  We've loaded the pages into the buffer,
2695	 * we have to figure out if we can still have B_CACHE
2696	 * set.  Note that B_CACHE is set according to the
2697	 * byte-granular range ( bcount and size ), not the
2698	 * aligned range ( newbsize ).
2699	 *
2700	 * The VM test is against m->valid, which is DEV_BSIZE
2701	 * aligned.  Needless to say, the validity of the data
2702	 * needs to also be DEV_BSIZE aligned.  Note that this
2703	 * fails with NFS if the server or some other client
2704	 * extends the file's EOF.  If our buffer is resized,
2705	 * B_CACHE may remain set! XXX
2706	 */
2707	toff = bp->b_bcount;
2708	tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
2709	while ((bp->b_flags & B_CACHE) && toff < size) {
2710		vm_pindex_t pi;
2711
2712		if (tinc > (size - toff))
2713			tinc = size - toff;
2714		pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
2715		m = bp->b_pages[pi];
2716		vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
2717		toff += tinc;
2718		tinc = PAGE_SIZE;
2719	}
2720	VM_OBJECT_WUNLOCK(obj);
2721
2722	/*
2723	 * Step 3, fixup the KVA pmap.
2724	 */
2725	if (buf_mapped(bp))
2726		bpmap_qenter(bp);
2727	else
2728		BUF_CHECK_UNMAPPED(bp);
2729}
2730
2731/*
2732 * Check to see if a block at a particular lbn is available for a clustered
2733 * write.
2734 */
2735static int
2736vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
2737{
2738	struct buf *bpa;
2739	int match;
2740
2741	match = 0;
2742
2743	/* If the buf isn't in core skip it */
2744	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
2745		return (0);
2746
2747	/* If the buf is busy we don't want to wait for it */
2748	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2749		return (0);
2750
2751	/* Only cluster with valid clusterable delayed write buffers */
2752	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
2753	    (B_DELWRI | B_CLUSTEROK))
2754		goto done;
2755
2756	if (bpa->b_bufsize != size)
2757		goto done;
2758
2759	/*
2760	 * Check to see if it is in the expected place on disk and that the
2761	 * block has been mapped.
2762	 */
2763	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
2764		match = 1;
2765done:
2766	BUF_UNLOCK(bpa);
2767	return (match);
2768}
2769
2770/*
2771 *	vfs_bio_awrite:
2772 *
2773 *	Implement clustered async writes for clearing out B_DELWRI buffers.
2774 *	This is much better then the old way of writing only one buffer at
2775 *	a time.  Note that we may not be presented with the buffers in the
2776 *	correct order, so we search for the cluster in both directions.
2777 */
2778int
2779vfs_bio_awrite(struct buf *bp)
2780{
2781	struct bufobj *bo;
2782	int i;
2783	int j;
2784	daddr_t lblkno = bp->b_lblkno;
2785	struct vnode *vp = bp->b_vp;
2786	int ncl;
2787	int nwritten;
2788	int size;
2789	int maxcl;
2790	int gbflags;
2791
2792	bo = &vp->v_bufobj;
2793	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
2794	/*
2795	 * right now we support clustered writing only to regular files.  If
2796	 * we find a clusterable block we could be in the middle of a cluster
2797	 * rather then at the beginning.
2798	 */
2799	if ((vp->v_type == VREG) &&
2800	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
2801	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
2802
2803		size = vp->v_mount->mnt_stat.f_iosize;
2804		maxcl = MAXPHYS / size;
2805
2806		BO_RLOCK(bo);
2807		for (i = 1; i < maxcl; i++)
2808			if (vfs_bio_clcheck(vp, size, lblkno + i,
2809			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
2810				break;
2811
2812		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
2813			if (vfs_bio_clcheck(vp, size, lblkno - j,
2814			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
2815				break;
2816		BO_RUNLOCK(bo);
2817		--j;
2818		ncl = i + j;
2819		/*
2820		 * this is a possible cluster write
2821		 */
2822		if (ncl != 1) {
2823			BUF_UNLOCK(bp);
2824			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
2825			    gbflags);
2826			return (nwritten);
2827		}
2828	}
2829	bremfree(bp);
2830	bp->b_flags |= B_ASYNC;
2831	/*
2832	 * default (old) behavior, writing out only one block
2833	 *
2834	 * XXX returns b_bufsize instead of b_bcount for nwritten?
2835	 */
2836	nwritten = bp->b_bufsize;
2837	(void) bwrite(bp);
2838
2839	return (nwritten);
2840}
2841
2842/*
2843 *	getnewbuf_kva:
2844 *
2845 *	Allocate KVA for an empty buf header according to gbflags.
2846 */
2847static int
2848getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
2849{
2850
2851	if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
2852		/*
2853		 * In order to keep fragmentation sane we only allocate kva
2854		 * in BKVASIZE chunks.  XXX with vmem we can do page size.
2855		 */
2856		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2857
2858		if (maxsize != bp->b_kvasize &&
2859		    bufkva_alloc(bp, maxsize, gbflags))
2860			return (ENOSPC);
2861	}
2862	return (0);
2863}
2864
2865/*
2866 *	getnewbuf:
2867 *
2868 *	Find and initialize a new buffer header, freeing up existing buffers
2869 *	in the bufqueues as necessary.  The new buffer is returned locked.
2870 *
2871 *	We block if:
2872 *		We have insufficient buffer headers
2873 *		We have insufficient buffer space
2874 *		buffer_arena is too fragmented ( space reservation fails )
2875 *		If we have to flush dirty buffers ( but we try to avoid this )
2876 *
2877 *	The caller is responsible for releasing the reserved bufspace after
2878 *	allocbuf() is called.
2879 */
2880static struct buf *
2881getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
2882{
2883	struct buf *bp;
2884	bool metadata, reserved;
2885
2886	bp = NULL;
2887	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2888	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2889	if (!unmapped_buf_allowed)
2890		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2891
2892	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2893	    vp->v_type == VCHR)
2894		metadata = true;
2895	else
2896		metadata = false;
2897	atomic_add_int(&getnewbufcalls, 1);
2898	reserved = false;
2899	do {
2900		if (reserved == false &&
2901		    bufspace_reserve(maxsize, metadata) != 0)
2902			continue;
2903		reserved = true;
2904		if ((bp = buf_alloc()) == NULL)
2905			continue;
2906		if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
2907			return (bp);
2908		break;
2909	} while(buf_scan(false) == 0);
2910
2911	if (reserved)
2912		atomic_subtract_long(&bufspace, maxsize);
2913	if (bp != NULL) {
2914		bp->b_flags |= B_INVAL;
2915		brelse(bp);
2916	}
2917	bufspace_wait(vp, gbflags, slpflag, slptimeo);
2918
2919	return (NULL);
2920}
2921
2922/*
2923 *	buf_daemon:
2924 *
2925 *	buffer flushing daemon.  Buffers are normally flushed by the
2926 *	update daemon but if it cannot keep up this process starts to
2927 *	take the load in an attempt to prevent getnewbuf() from blocking.
2928 */
2929static struct kproc_desc buf_kp = {
2930	"bufdaemon",
2931	buf_daemon,
2932	&bufdaemonproc
2933};
2934SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2935
2936static int
2937buf_flush(struct vnode *vp, int target)
2938{
2939	int flushed;
2940
2941	flushed = flushbufqueues(vp, target, 0);
2942	if (flushed == 0) {
2943		/*
2944		 * Could not find any buffers without rollback
2945		 * dependencies, so just write the first one
2946		 * in the hopes of eventually making progress.
2947		 */
2948		if (vp != NULL && target > 2)
2949			target /= 2;
2950		flushbufqueues(vp, target, 1);
2951	}
2952	return (flushed);
2953}
2954
2955static void
2956buf_daemon()
2957{
2958	int lodirty;
2959
2960	/*
2961	 * This process needs to be suspended prior to shutdown sync.
2962	 */
2963	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2964	    SHUTDOWN_PRI_LAST);
2965
2966	/*
2967	 * This process is allowed to take the buffer cache to the limit
2968	 */
2969	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2970	mtx_lock(&bdlock);
2971	for (;;) {
2972		bd_request = 0;
2973		mtx_unlock(&bdlock);
2974
2975		kproc_suspend_check(bufdaemonproc);
2976		lodirty = lodirtybuffers;
2977		if (bd_speedupreq) {
2978			lodirty = numdirtybuffers / 2;
2979			bd_speedupreq = 0;
2980		}
2981		/*
2982		 * Do the flush.  Limit the amount of in-transit I/O we
2983		 * allow to build up, otherwise we would completely saturate
2984		 * the I/O system.
2985		 */
2986		while (numdirtybuffers > lodirty) {
2987			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
2988				break;
2989			kern_yield(PRI_USER);
2990		}
2991
2992		/*
2993		 * Only clear bd_request if we have reached our low water
2994		 * mark.  The buf_daemon normally waits 1 second and
2995		 * then incrementally flushes any dirty buffers that have
2996		 * built up, within reason.
2997		 *
2998		 * If we were unable to hit our low water mark and couldn't
2999		 * find any flushable buffers, we sleep for a short period
3000		 * to avoid endless loops on unlockable buffers.
3001		 */
3002		mtx_lock(&bdlock);
3003		if (numdirtybuffers <= lodirtybuffers) {
3004			/*
3005			 * We reached our low water mark, reset the
3006			 * request and sleep until we are needed again.
3007			 * The sleep is just so the suspend code works.
3008			 */
3009			bd_request = 0;
3010			/*
3011			 * Do an extra wakeup in case dirty threshold
3012			 * changed via sysctl and the explicit transition
3013			 * out of shortfall was missed.
3014			 */
3015			bdirtywakeup();
3016			if (runningbufspace <= lorunningspace)
3017				runningwakeup();
3018			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
3019		} else {
3020			/*
3021			 * We couldn't find any flushable dirty buffers but
3022			 * still have too many dirty buffers, we
3023			 * have to sleep and try again.  (rare)
3024			 */
3025			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
3026		}
3027	}
3028}
3029
3030/*
3031 *	flushbufqueues:
3032 *
3033 *	Try to flush a buffer in the dirty queue.  We must be careful to
3034 *	free up B_INVAL buffers instead of write them, which NFS is
3035 *	particularly sensitive to.
3036 */
3037static int flushwithdeps = 0;
3038SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
3039    0, "Number of buffers flushed with dependecies that require rollbacks");
3040
3041static int
3042flushbufqueues(struct vnode *lvp, int target, int flushdeps)
3043{
3044	struct buf *sentinel;
3045	struct vnode *vp;
3046	struct mount *mp;
3047	struct buf *bp;
3048	int hasdeps;
3049	int flushed;
3050	int queue;
3051	int error;
3052	bool unlock;
3053
3054	flushed = 0;
3055	queue = QUEUE_DIRTY;
3056	bp = NULL;
3057	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
3058	sentinel->b_qindex = QUEUE_SENTINEL;
3059	mtx_lock(&bqlocks[queue]);
3060	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
3061	mtx_unlock(&bqlocks[queue]);
3062	while (flushed != target) {
3063		maybe_yield();
3064		mtx_lock(&bqlocks[queue]);
3065		bp = TAILQ_NEXT(sentinel, b_freelist);
3066		if (bp != NULL) {
3067			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
3068			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
3069			    b_freelist);
3070		} else {
3071			mtx_unlock(&bqlocks[queue]);
3072			break;
3073		}
3074		/*
3075		 * Skip sentinels inserted by other invocations of the
3076		 * flushbufqueues(), taking care to not reorder them.
3077		 *
3078		 * Only flush the buffers that belong to the
3079		 * vnode locked by the curthread.
3080		 */
3081		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
3082		    bp->b_vp != lvp)) {
3083			mtx_unlock(&bqlocks[queue]);
3084 			continue;
3085		}
3086		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
3087		mtx_unlock(&bqlocks[queue]);
3088		if (error != 0)
3089			continue;
3090		if (bp->b_pin_count > 0) {
3091			BUF_UNLOCK(bp);
3092			continue;
3093		}
3094		/*
3095		 * BKGRDINPROG can only be set with the buf and bufobj
3096		 * locks both held.  We tolerate a race to clear it here.
3097		 */
3098		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
3099		    (bp->b_flags & B_DELWRI) == 0) {
3100			BUF_UNLOCK(bp);
3101			continue;
3102		}
3103		if (bp->b_flags & B_INVAL) {
3104			bremfreef(bp);
3105			brelse(bp);
3106			flushed++;
3107			continue;
3108		}
3109
3110		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
3111			if (flushdeps == 0) {
3112				BUF_UNLOCK(bp);
3113				continue;
3114			}
3115			hasdeps = 1;
3116		} else
3117			hasdeps = 0;
3118		/*
3119		 * We must hold the lock on a vnode before writing
3120		 * one of its buffers. Otherwise we may confuse, or
3121		 * in the case of a snapshot vnode, deadlock the
3122		 * system.
3123		 *
3124		 * The lock order here is the reverse of the normal
3125		 * of vnode followed by buf lock.  This is ok because
3126		 * the NOWAIT will prevent deadlock.
3127		 */
3128		vp = bp->b_vp;
3129		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
3130			BUF_UNLOCK(bp);
3131			continue;
3132		}
3133		if (lvp == NULL) {
3134			unlock = true;
3135			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
3136		} else {
3137			ASSERT_VOP_LOCKED(vp, "getbuf");
3138			unlock = false;
3139			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
3140			    vn_lock(vp, LK_TRYUPGRADE);
3141		}
3142		if (error == 0) {
3143			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
3144			    bp, bp->b_vp, bp->b_flags);
3145			if (curproc == bufdaemonproc) {
3146				vfs_bio_awrite(bp);
3147			} else {
3148				bremfree(bp);
3149				bwrite(bp);
3150				notbufdflushes++;
3151			}
3152			vn_finished_write(mp);
3153			if (unlock)
3154				VOP_UNLOCK(vp, 0);
3155			flushwithdeps += hasdeps;
3156			flushed++;
3157
3158			/*
3159			 * Sleeping on runningbufspace while holding
3160			 * vnode lock leads to deadlock.
3161			 */
3162			if (curproc == bufdaemonproc &&
3163			    runningbufspace > hirunningspace)
3164				waitrunningbufspace();
3165			continue;
3166		}
3167		vn_finished_write(mp);
3168		BUF_UNLOCK(bp);
3169	}
3170	mtx_lock(&bqlocks[queue]);
3171	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
3172	mtx_unlock(&bqlocks[queue]);
3173	free(sentinel, M_TEMP);
3174	return (flushed);
3175}
3176
3177/*
3178 * Check to see if a block is currently memory resident.
3179 */
3180struct buf *
3181incore(struct bufobj *bo, daddr_t blkno)
3182{
3183	struct buf *bp;
3184
3185	BO_RLOCK(bo);
3186	bp = gbincore(bo, blkno);
3187	BO_RUNLOCK(bo);
3188	return (bp);
3189}
3190
3191/*
3192 * Returns true if no I/O is needed to access the
3193 * associated VM object.  This is like incore except
3194 * it also hunts around in the VM system for the data.
3195 */
3196
3197static int
3198inmem(struct vnode * vp, daddr_t blkno)
3199{
3200	vm_object_t obj;
3201	vm_offset_t toff, tinc, size;
3202	vm_page_t m;
3203	vm_ooffset_t off;
3204
3205	ASSERT_VOP_LOCKED(vp, "inmem");
3206
3207	if (incore(&vp->v_bufobj, blkno))
3208		return 1;
3209	if (vp->v_mount == NULL)
3210		return 0;
3211	obj = vp->v_object;
3212	if (obj == NULL)
3213		return (0);
3214
3215	size = PAGE_SIZE;
3216	if (size > vp->v_mount->mnt_stat.f_iosize)
3217		size = vp->v_mount->mnt_stat.f_iosize;
3218	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
3219
3220	VM_OBJECT_RLOCK(obj);
3221	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
3222		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
3223		if (!m)
3224			goto notinmem;
3225		tinc = size;
3226		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
3227			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
3228		if (vm_page_is_valid(m,
3229		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
3230			goto notinmem;
3231	}
3232	VM_OBJECT_RUNLOCK(obj);
3233	return 1;
3234
3235notinmem:
3236	VM_OBJECT_RUNLOCK(obj);
3237	return (0);
3238}
3239
3240/*
3241 * Set the dirty range for a buffer based on the status of the dirty
3242 * bits in the pages comprising the buffer.  The range is limited
3243 * to the size of the buffer.
3244 *
3245 * Tell the VM system that the pages associated with this buffer
3246 * are clean.  This is used for delayed writes where the data is
3247 * going to go to disk eventually without additional VM intevention.
3248 *
3249 * Note that while we only really need to clean through to b_bcount, we
3250 * just go ahead and clean through to b_bufsize.
3251 */
3252static void
3253vfs_clean_pages_dirty_buf(struct buf *bp)
3254{
3255	vm_ooffset_t foff, noff, eoff;
3256	vm_page_t m;
3257	int i;
3258
3259	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
3260		return;
3261
3262	foff = bp->b_offset;
3263	KASSERT(bp->b_offset != NOOFFSET,
3264	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
3265
3266	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3267	vfs_drain_busy_pages(bp);
3268	vfs_setdirty_locked_object(bp);
3269	for (i = 0; i < bp->b_npages; i++) {
3270		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3271		eoff = noff;
3272		if (eoff > bp->b_offset + bp->b_bufsize)
3273			eoff = bp->b_offset + bp->b_bufsize;
3274		m = bp->b_pages[i];
3275		vfs_page_set_validclean(bp, foff, m);
3276		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
3277		foff = noff;
3278	}
3279	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3280}
3281
3282static void
3283vfs_setdirty_locked_object(struct buf *bp)
3284{
3285	vm_object_t object;
3286	int i;
3287
3288	object = bp->b_bufobj->bo_object;
3289	VM_OBJECT_ASSERT_WLOCKED(object);
3290
3291	/*
3292	 * We qualify the scan for modified pages on whether the
3293	 * object has been flushed yet.
3294	 */
3295	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
3296		vm_offset_t boffset;
3297		vm_offset_t eoffset;
3298
3299		/*
3300		 * test the pages to see if they have been modified directly
3301		 * by users through the VM system.
3302		 */
3303		for (i = 0; i < bp->b_npages; i++)
3304			vm_page_test_dirty(bp->b_pages[i]);
3305
3306		/*
3307		 * Calculate the encompassing dirty range, boffset and eoffset,
3308		 * (eoffset - boffset) bytes.
3309		 */
3310
3311		for (i = 0; i < bp->b_npages; i++) {
3312			if (bp->b_pages[i]->dirty)
3313				break;
3314		}
3315		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
3316
3317		for (i = bp->b_npages - 1; i >= 0; --i) {
3318			if (bp->b_pages[i]->dirty) {
3319				break;
3320			}
3321		}
3322		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
3323
3324		/*
3325		 * Fit it to the buffer.
3326		 */
3327
3328		if (eoffset > bp->b_bcount)
3329			eoffset = bp->b_bcount;
3330
3331		/*
3332		 * If we have a good dirty range, merge with the existing
3333		 * dirty range.
3334		 */
3335
3336		if (boffset < eoffset) {
3337			if (bp->b_dirtyoff > boffset)
3338				bp->b_dirtyoff = boffset;
3339			if (bp->b_dirtyend < eoffset)
3340				bp->b_dirtyend = eoffset;
3341		}
3342	}
3343}
3344
3345/*
3346 * Allocate the KVA mapping for an existing buffer.
3347 * If an unmapped buffer is provided but a mapped buffer is requested, take
3348 * also care to properly setup mappings between pages and KVA.
3349 */
3350static void
3351bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
3352{
3353	int bsize, maxsize, need_mapping, need_kva;
3354	off_t offset;
3355
3356	need_mapping = bp->b_data == unmapped_buf &&
3357	    (gbflags & GB_UNMAPPED) == 0;
3358	need_kva = bp->b_kvabase == unmapped_buf &&
3359	    bp->b_data == unmapped_buf &&
3360	    (gbflags & GB_KVAALLOC) != 0;
3361	if (!need_mapping && !need_kva)
3362		return;
3363
3364	BUF_CHECK_UNMAPPED(bp);
3365
3366	if (need_mapping && bp->b_kvabase != unmapped_buf) {
3367		/*
3368		 * Buffer is not mapped, but the KVA was already
3369		 * reserved at the time of the instantiation.  Use the
3370		 * allocated space.
3371		 */
3372		goto has_addr;
3373	}
3374
3375	/*
3376	 * Calculate the amount of the address space we would reserve
3377	 * if the buffer was mapped.
3378	 */
3379	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
3380	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
3381	offset = blkno * bsize;
3382	maxsize = size + (offset & PAGE_MASK);
3383	maxsize = imax(maxsize, bsize);
3384
3385	while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
3386		if ((gbflags & GB_NOWAIT_BD) != 0) {
3387			/*
3388			 * XXXKIB: defragmentation cannot
3389			 * succeed, not sure what else to do.
3390			 */
3391			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
3392		}
3393		atomic_add_int(&mappingrestarts, 1);
3394		bufspace_wait(bp->b_vp, gbflags, 0, 0);
3395	}
3396has_addr:
3397	if (need_mapping) {
3398		/* b_offset is handled by bpmap_qenter. */
3399		bp->b_data = bp->b_kvabase;
3400		BUF_CHECK_MAPPED(bp);
3401		bpmap_qenter(bp);
3402	}
3403}
3404
3405/*
3406 *	getblk:
3407 *
3408 *	Get a block given a specified block and offset into a file/device.
3409 *	The buffers B_DONE bit will be cleared on return, making it almost
3410 * 	ready for an I/O initiation.  B_INVAL may or may not be set on
3411 *	return.  The caller should clear B_INVAL prior to initiating a
3412 *	READ.
3413 *
3414 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
3415 *	an existing buffer.
3416 *
3417 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
3418 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
3419 *	and then cleared based on the backing VM.  If the previous buffer is
3420 *	non-0-sized but invalid, B_CACHE will be cleared.
3421 *
3422 *	If getblk() must create a new buffer, the new buffer is returned with
3423 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
3424 *	case it is returned with B_INVAL clear and B_CACHE set based on the
3425 *	backing VM.
3426 *
3427 *	getblk() also forces a bwrite() for any B_DELWRI buffer whos
3428 *	B_CACHE bit is clear.
3429 *
3430 *	What this means, basically, is that the caller should use B_CACHE to
3431 *	determine whether the buffer is fully valid or not and should clear
3432 *	B_INVAL prior to issuing a read.  If the caller intends to validate
3433 *	the buffer by loading its data area with something, the caller needs
3434 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
3435 *	the caller should set B_CACHE ( as an optimization ), else the caller
3436 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
3437 *	a write attempt or if it was a successfull read.  If the caller
3438 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
3439 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
3440 */
3441struct buf *
3442getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
3443    int flags)
3444{
3445	struct buf *bp;
3446	struct bufobj *bo;
3447	int bsize, error, maxsize, vmio;
3448	off_t offset;
3449
3450	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
3451	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
3452	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
3453	ASSERT_VOP_LOCKED(vp, "getblk");
3454	if (size > MAXBCACHEBUF)
3455		panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
3456		    MAXBCACHEBUF);
3457	if (!unmapped_buf_allowed)
3458		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3459
3460	bo = &vp->v_bufobj;
3461loop:
3462	BO_RLOCK(bo);
3463	bp = gbincore(bo, blkno);
3464	if (bp != NULL) {
3465		int lockflags;
3466		/*
3467		 * Buffer is in-core.  If the buffer is not busy nor managed,
3468		 * it must be on a queue.
3469		 */
3470		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
3471
3472		if (flags & GB_LOCK_NOWAIT)
3473			lockflags |= LK_NOWAIT;
3474
3475		error = BUF_TIMELOCK(bp, lockflags,
3476		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
3477
3478		/*
3479		 * If we slept and got the lock we have to restart in case
3480		 * the buffer changed identities.
3481		 */
3482		if (error == ENOLCK)
3483			goto loop;
3484		/* We timed out or were interrupted. */
3485		else if (error)
3486			return (NULL);
3487		/* If recursed, assume caller knows the rules. */
3488		else if (BUF_LOCKRECURSED(bp))
3489			goto end;
3490
3491		/*
3492		 * The buffer is locked.  B_CACHE is cleared if the buffer is
3493		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
3494		 * and for a VMIO buffer B_CACHE is adjusted according to the
3495		 * backing VM cache.
3496		 */
3497		if (bp->b_flags & B_INVAL)
3498			bp->b_flags &= ~B_CACHE;
3499		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
3500			bp->b_flags |= B_CACHE;
3501		if (bp->b_flags & B_MANAGED)
3502			MPASS(bp->b_qindex == QUEUE_NONE);
3503		else
3504			bremfree(bp);
3505
3506		/*
3507		 * check for size inconsistencies for non-VMIO case.
3508		 */
3509		if (bp->b_bcount != size) {
3510			if ((bp->b_flags & B_VMIO) == 0 ||
3511			    (size > bp->b_kvasize)) {
3512				if (bp->b_flags & B_DELWRI) {
3513					/*
3514					 * If buffer is pinned and caller does
3515					 * not want sleep  waiting for it to be
3516					 * unpinned, bail out
3517					 * */
3518					if (bp->b_pin_count > 0) {
3519						if (flags & GB_LOCK_NOWAIT) {
3520							bqrelse(bp);
3521							return (NULL);
3522						} else {
3523							bunpin_wait(bp);
3524						}
3525					}
3526					bp->b_flags |= B_NOCACHE;
3527					bwrite(bp);
3528				} else {
3529					if (LIST_EMPTY(&bp->b_dep)) {
3530						bp->b_flags |= B_RELBUF;
3531						brelse(bp);
3532					} else {
3533						bp->b_flags |= B_NOCACHE;
3534						bwrite(bp);
3535					}
3536				}
3537				goto loop;
3538			}
3539		}
3540
3541		/*
3542		 * Handle the case of unmapped buffer which should
3543		 * become mapped, or the buffer for which KVA
3544		 * reservation is requested.
3545		 */
3546		bp_unmapped_get_kva(bp, blkno, size, flags);
3547
3548		/*
3549		 * If the size is inconsistant in the VMIO case, we can resize
3550		 * the buffer.  This might lead to B_CACHE getting set or
3551		 * cleared.  If the size has not changed, B_CACHE remains
3552		 * unchanged from its previous state.
3553		 */
3554		allocbuf(bp, size);
3555
3556		KASSERT(bp->b_offset != NOOFFSET,
3557		    ("getblk: no buffer offset"));
3558
3559		/*
3560		 * A buffer with B_DELWRI set and B_CACHE clear must
3561		 * be committed before we can return the buffer in
3562		 * order to prevent the caller from issuing a read
3563		 * ( due to B_CACHE not being set ) and overwriting
3564		 * it.
3565		 *
3566		 * Most callers, including NFS and FFS, need this to
3567		 * operate properly either because they assume they
3568		 * can issue a read if B_CACHE is not set, or because
3569		 * ( for example ) an uncached B_DELWRI might loop due
3570		 * to softupdates re-dirtying the buffer.  In the latter
3571		 * case, B_CACHE is set after the first write completes,
3572		 * preventing further loops.
3573		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
3574		 * above while extending the buffer, we cannot allow the
3575		 * buffer to remain with B_CACHE set after the write
3576		 * completes or it will represent a corrupt state.  To
3577		 * deal with this we set B_NOCACHE to scrap the buffer
3578		 * after the write.
3579		 *
3580		 * We might be able to do something fancy, like setting
3581		 * B_CACHE in bwrite() except if B_DELWRI is already set,
3582		 * so the below call doesn't set B_CACHE, but that gets real
3583		 * confusing.  This is much easier.
3584		 */
3585
3586		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
3587			bp->b_flags |= B_NOCACHE;
3588			bwrite(bp);
3589			goto loop;
3590		}
3591		bp->b_flags &= ~B_DONE;
3592	} else {
3593		/*
3594		 * Buffer is not in-core, create new buffer.  The buffer
3595		 * returned by getnewbuf() is locked.  Note that the returned
3596		 * buffer is also considered valid (not marked B_INVAL).
3597		 */
3598		BO_RUNLOCK(bo);
3599		/*
3600		 * If the user does not want us to create the buffer, bail out
3601		 * here.
3602		 */
3603		if (flags & GB_NOCREAT)
3604			return NULL;
3605		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
3606			return NULL;
3607
3608		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
3609		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
3610		offset = blkno * bsize;
3611		vmio = vp->v_object != NULL;
3612		if (vmio) {
3613			maxsize = size + (offset & PAGE_MASK);
3614		} else {
3615			maxsize = size;
3616			/* Do not allow non-VMIO notmapped buffers. */
3617			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3618		}
3619		maxsize = imax(maxsize, bsize);
3620
3621		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
3622		if (bp == NULL) {
3623			if (slpflag || slptimeo)
3624				return NULL;
3625			/*
3626			 * XXX This is here until the sleep path is diagnosed
3627			 * enough to work under very low memory conditions.
3628			 *
3629			 * There's an issue on low memory, 4BSD+non-preempt
3630			 * systems (eg MIPS routers with 32MB RAM) where buffer
3631			 * exhaustion occurs without sleeping for buffer
3632			 * reclaimation.  This just sticks in a loop and
3633			 * constantly attempts to allocate a buffer, which
3634			 * hits exhaustion and tries to wakeup bufdaemon.
3635			 * This never happens because we never yield.
3636			 *
3637			 * The real solution is to identify and fix these cases
3638			 * so we aren't effectively busy-waiting in a loop
3639			 * until the reclaimation path has cycles to run.
3640			 */
3641			kern_yield(PRI_USER);
3642			goto loop;
3643		}
3644
3645		/*
3646		 * This code is used to make sure that a buffer is not
3647		 * created while the getnewbuf routine is blocked.
3648		 * This can be a problem whether the vnode is locked or not.
3649		 * If the buffer is created out from under us, we have to
3650		 * throw away the one we just created.
3651		 *
3652		 * Note: this must occur before we associate the buffer
3653		 * with the vp especially considering limitations in
3654		 * the splay tree implementation when dealing with duplicate
3655		 * lblkno's.
3656		 */
3657		BO_LOCK(bo);
3658		if (gbincore(bo, blkno)) {
3659			BO_UNLOCK(bo);
3660			bp->b_flags |= B_INVAL;
3661			brelse(bp);
3662			bufspace_release(maxsize);
3663			goto loop;
3664		}
3665
3666		/*
3667		 * Insert the buffer into the hash, so that it can
3668		 * be found by incore.
3669		 */
3670		bp->b_blkno = bp->b_lblkno = blkno;
3671		bp->b_offset = offset;
3672		bgetvp(vp, bp);
3673		BO_UNLOCK(bo);
3674
3675		/*
3676		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
3677		 * buffer size starts out as 0, B_CACHE will be set by
3678		 * allocbuf() for the VMIO case prior to it testing the
3679		 * backing store for validity.
3680		 */
3681
3682		if (vmio) {
3683			bp->b_flags |= B_VMIO;
3684			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
3685			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
3686			    bp, vp->v_object, bp->b_bufobj->bo_object));
3687		} else {
3688			bp->b_flags &= ~B_VMIO;
3689			KASSERT(bp->b_bufobj->bo_object == NULL,
3690			    ("ARGH! has b_bufobj->bo_object %p %p\n",
3691			    bp, bp->b_bufobj->bo_object));
3692			BUF_CHECK_MAPPED(bp);
3693		}
3694
3695		allocbuf(bp, size);
3696		bufspace_release(maxsize);
3697		bp->b_flags &= ~B_DONE;
3698	}
3699	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3700	BUF_ASSERT_HELD(bp);
3701end:
3702	KASSERT(bp->b_bufobj == bo,
3703	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3704	return (bp);
3705}
3706
3707/*
3708 * Get an empty, disassociated buffer of given size.  The buffer is initially
3709 * set to B_INVAL.
3710 */
3711struct buf *
3712geteblk(int size, int flags)
3713{
3714	struct buf *bp;
3715	int maxsize;
3716
3717	maxsize = (size + BKVAMASK) & ~BKVAMASK;
3718	while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
3719		if ((flags & GB_NOWAIT_BD) &&
3720		    (curthread->td_pflags & TDP_BUFNEED) != 0)
3721			return (NULL);
3722	}
3723	allocbuf(bp, size);
3724	bufspace_release(maxsize);
3725	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
3726	BUF_ASSERT_HELD(bp);
3727	return (bp);
3728}
3729
3730/*
3731 * Truncate the backing store for a non-vmio buffer.
3732 */
3733static void
3734vfs_nonvmio_truncate(struct buf *bp, int newbsize)
3735{
3736
3737	if (bp->b_flags & B_MALLOC) {
3738		/*
3739		 * malloced buffers are not shrunk
3740		 */
3741		if (newbsize == 0) {
3742			bufmallocadjust(bp, 0);
3743			free(bp->b_data, M_BIOBUF);
3744			bp->b_data = bp->b_kvabase;
3745			bp->b_flags &= ~B_MALLOC;
3746		}
3747		return;
3748	}
3749	vm_hold_free_pages(bp, newbsize);
3750	bufspace_adjust(bp, newbsize);
3751}
3752
3753/*
3754 * Extend the backing for a non-VMIO buffer.
3755 */
3756static void
3757vfs_nonvmio_extend(struct buf *bp, int newbsize)
3758{
3759	caddr_t origbuf;
3760	int origbufsize;
3761
3762	/*
3763	 * We only use malloced memory on the first allocation.
3764	 * and revert to page-allocated memory when the buffer
3765	 * grows.
3766	 *
3767	 * There is a potential smp race here that could lead
3768	 * to bufmallocspace slightly passing the max.  It
3769	 * is probably extremely rare and not worth worrying
3770	 * over.
3771	 */
3772	if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
3773	    bufmallocspace < maxbufmallocspace) {
3774		bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
3775		bp->b_flags |= B_MALLOC;
3776		bufmallocadjust(bp, newbsize);
3777		return;
3778	}
3779
3780	/*
3781	 * If the buffer is growing on its other-than-first
3782	 * allocation then we revert to the page-allocation
3783	 * scheme.
3784	 */
3785	origbuf = NULL;
3786	origbufsize = 0;
3787	if (bp->b_flags & B_MALLOC) {
3788		origbuf = bp->b_data;
3789		origbufsize = bp->b_bufsize;
3790		bp->b_data = bp->b_kvabase;
3791		bufmallocadjust(bp, 0);
3792		bp->b_flags &= ~B_MALLOC;
3793		newbsize = round_page(newbsize);
3794	}
3795	vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
3796	    (vm_offset_t) bp->b_data + newbsize);
3797	if (origbuf != NULL) {
3798		bcopy(origbuf, bp->b_data, origbufsize);
3799		free(origbuf, M_BIOBUF);
3800	}
3801	bufspace_adjust(bp, newbsize);
3802}
3803
3804/*
3805 * This code constitutes the buffer memory from either anonymous system
3806 * memory (in the case of non-VMIO operations) or from an associated
3807 * VM object (in the case of VMIO operations).  This code is able to
3808 * resize a buffer up or down.
3809 *
3810 * Note that this code is tricky, and has many complications to resolve
3811 * deadlock or inconsistant data situations.  Tread lightly!!!
3812 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3813 * the caller.  Calling this code willy nilly can result in the loss of data.
3814 *
3815 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3816 * B_CACHE for the non-VMIO case.
3817 */
3818int
3819allocbuf(struct buf *bp, int size)
3820{
3821	int newbsize;
3822
3823	BUF_ASSERT_HELD(bp);
3824
3825	if (bp->b_bcount == size)
3826		return (1);
3827
3828	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
3829		panic("allocbuf: buffer too small");
3830
3831	newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3832	if ((bp->b_flags & B_VMIO) == 0) {
3833		if ((bp->b_flags & B_MALLOC) == 0)
3834			newbsize = round_page(newbsize);
3835		/*
3836		 * Just get anonymous memory from the kernel.  Don't
3837		 * mess with B_CACHE.
3838		 */
3839		if (newbsize < bp->b_bufsize)
3840			vfs_nonvmio_truncate(bp, newbsize);
3841		else if (newbsize > bp->b_bufsize)
3842			vfs_nonvmio_extend(bp, newbsize);
3843	} else {
3844		int desiredpages;
3845
3846		desiredpages = (size == 0) ? 0 :
3847		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3848
3849		if (bp->b_flags & B_MALLOC)
3850			panic("allocbuf: VMIO buffer can't be malloced");
3851		/*
3852		 * Set B_CACHE initially if buffer is 0 length or will become
3853		 * 0-length.
3854		 */
3855		if (size == 0 || bp->b_bufsize == 0)
3856			bp->b_flags |= B_CACHE;
3857
3858		if (newbsize < bp->b_bufsize)
3859			vfs_vmio_truncate(bp, desiredpages);
3860		/* XXX This looks as if it should be newbsize > b_bufsize */
3861		else if (size > bp->b_bcount)
3862			vfs_vmio_extend(bp, desiredpages, size);
3863		bufspace_adjust(bp, newbsize);
3864	}
3865	bp->b_bcount = size;		/* requested buffer size. */
3866	return (1);
3867}
3868
3869extern int inflight_transient_maps;
3870
3871void
3872biodone(struct bio *bp)
3873{
3874	struct mtx *mtxp;
3875	void (*done)(struct bio *);
3876	vm_offset_t start, end;
3877
3878	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3879		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
3880		bp->bio_flags |= BIO_UNMAPPED;
3881		start = trunc_page((vm_offset_t)bp->bio_data);
3882		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3883		bp->bio_data = unmapped_buf;
3884		pmap_qremove(start, OFF_TO_IDX(end - start));
3885		vmem_free(transient_arena, start, end - start);
3886		atomic_add_int(&inflight_transient_maps, -1);
3887	}
3888	done = bp->bio_done;
3889	if (done == NULL) {
3890		mtxp = mtx_pool_find(mtxpool_sleep, bp);
3891		mtx_lock(mtxp);
3892		bp->bio_flags |= BIO_DONE;
3893		wakeup(bp);
3894		mtx_unlock(mtxp);
3895	} else {
3896		bp->bio_flags |= BIO_DONE;
3897		done(bp);
3898	}
3899}
3900
3901/*
3902 * Wait for a BIO to finish.
3903 */
3904int
3905biowait(struct bio *bp, const char *wchan)
3906{
3907	struct mtx *mtxp;
3908
3909	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3910	mtx_lock(mtxp);
3911	while ((bp->bio_flags & BIO_DONE) == 0)
3912		msleep(bp, mtxp, PRIBIO, wchan, 0);
3913	mtx_unlock(mtxp);
3914	if (bp->bio_error != 0)
3915		return (bp->bio_error);
3916	if (!(bp->bio_flags & BIO_ERROR))
3917		return (0);
3918	return (EIO);
3919}
3920
3921void
3922biofinish(struct bio *bp, struct devstat *stat, int error)
3923{
3924
3925	if (error) {
3926		bp->bio_error = error;
3927		bp->bio_flags |= BIO_ERROR;
3928	}
3929	if (stat != NULL)
3930		devstat_end_transaction_bio(stat, bp);
3931	biodone(bp);
3932}
3933
3934/*
3935 *	bufwait:
3936 *
3937 *	Wait for buffer I/O completion, returning error status.  The buffer
3938 *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3939 *	error and cleared.
3940 */
3941int
3942bufwait(struct buf *bp)
3943{
3944	if (bp->b_iocmd == BIO_READ)
3945		bwait(bp, PRIBIO, "biord");
3946	else
3947		bwait(bp, PRIBIO, "biowr");
3948	if (bp->b_flags & B_EINTR) {
3949		bp->b_flags &= ~B_EINTR;
3950		return (EINTR);
3951	}
3952	if (bp->b_ioflags & BIO_ERROR) {
3953		return (bp->b_error ? bp->b_error : EIO);
3954	} else {
3955		return (0);
3956	}
3957}
3958
3959/*
3960 *	bufdone:
3961 *
3962 *	Finish I/O on a buffer, optionally calling a completion function.
3963 *	This is usually called from an interrupt so process blocking is
3964 *	not allowed.
3965 *
3966 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
3967 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
3968 *	assuming B_INVAL is clear.
3969 *
3970 *	For the VMIO case, we set B_CACHE if the op was a read and no
3971 *	read error occured, or if the op was a write.  B_CACHE is never
3972 *	set if the buffer is invalid or otherwise uncacheable.
3973 *
3974 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
3975 *	initiator to leave B_INVAL set to brelse the buffer out of existance
3976 *	in the biodone routine.
3977 */
3978void
3979bufdone(struct buf *bp)
3980{
3981	struct bufobj *dropobj;
3982	void    (*biodone)(struct buf *);
3983
3984	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3985	dropobj = NULL;
3986
3987	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3988	BUF_ASSERT_HELD(bp);
3989
3990	runningbufwakeup(bp);
3991	if (bp->b_iocmd == BIO_WRITE)
3992		dropobj = bp->b_bufobj;
3993	/* call optional completion function if requested */
3994	if (bp->b_iodone != NULL) {
3995		biodone = bp->b_iodone;
3996		bp->b_iodone = NULL;
3997		(*biodone) (bp);
3998		if (dropobj)
3999			bufobj_wdrop(dropobj);
4000		return;
4001	}
4002
4003	bufdone_finish(bp);
4004
4005	if (dropobj)
4006		bufobj_wdrop(dropobj);
4007}
4008
4009void
4010bufdone_finish(struct buf *bp)
4011{
4012	BUF_ASSERT_HELD(bp);
4013
4014	if (!LIST_EMPTY(&bp->b_dep))
4015		buf_complete(bp);
4016
4017	if (bp->b_flags & B_VMIO) {
4018		/*
4019		 * Set B_CACHE if the op was a normal read and no error
4020		 * occured.  B_CACHE is set for writes in the b*write()
4021		 * routines.
4022		 */
4023		if (bp->b_iocmd == BIO_READ &&
4024		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
4025		    !(bp->b_ioflags & BIO_ERROR))
4026			bp->b_flags |= B_CACHE;
4027		vfs_vmio_iodone(bp);
4028	}
4029
4030	/*
4031	 * For asynchronous completions, release the buffer now. The brelse
4032	 * will do a wakeup there if necessary - so no need to do a wakeup
4033	 * here in the async case. The sync case always needs to do a wakeup.
4034	 */
4035	if (bp->b_flags & B_ASYNC) {
4036		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
4037		    (bp->b_ioflags & BIO_ERROR))
4038			brelse(bp);
4039		else
4040			bqrelse(bp);
4041	} else
4042		bdone(bp);
4043}
4044
4045/*
4046 * This routine is called in lieu of iodone in the case of
4047 * incomplete I/O.  This keeps the busy status for pages
4048 * consistant.
4049 */
4050void
4051vfs_unbusy_pages(struct buf *bp)
4052{
4053	int i;
4054	vm_object_t obj;
4055	vm_page_t m;
4056
4057	runningbufwakeup(bp);
4058	if (!(bp->b_flags & B_VMIO))
4059		return;
4060
4061	obj = bp->b_bufobj->bo_object;
4062	VM_OBJECT_WLOCK(obj);
4063	for (i = 0; i < bp->b_npages; i++) {
4064		m = bp->b_pages[i];
4065		if (m == bogus_page) {
4066			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
4067			if (!m)
4068				panic("vfs_unbusy_pages: page missing\n");
4069			bp->b_pages[i] = m;
4070			if (buf_mapped(bp)) {
4071				BUF_CHECK_MAPPED(bp);
4072				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4073				    bp->b_pages, bp->b_npages);
4074			} else
4075				BUF_CHECK_UNMAPPED(bp);
4076		}
4077		vm_page_sunbusy(m);
4078	}
4079	vm_object_pip_wakeupn(obj, bp->b_npages);
4080	VM_OBJECT_WUNLOCK(obj);
4081}
4082
4083/*
4084 * vfs_page_set_valid:
4085 *
4086 *	Set the valid bits in a page based on the supplied offset.   The
4087 *	range is restricted to the buffer's size.
4088 *
4089 *	This routine is typically called after a read completes.
4090 */
4091static void
4092vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
4093{
4094	vm_ooffset_t eoff;
4095
4096	/*
4097	 * Compute the end offset, eoff, such that [off, eoff) does not span a
4098	 * page boundary and eoff is not greater than the end of the buffer.
4099	 * The end of the buffer, in this case, is our file EOF, not the
4100	 * allocation size of the buffer.
4101	 */
4102	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
4103	if (eoff > bp->b_offset + bp->b_bcount)
4104		eoff = bp->b_offset + bp->b_bcount;
4105
4106	/*
4107	 * Set valid range.  This is typically the entire buffer and thus the
4108	 * entire page.
4109	 */
4110	if (eoff > off)
4111		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
4112}
4113
4114/*
4115 * vfs_page_set_validclean:
4116 *
4117 *	Set the valid bits and clear the dirty bits in a page based on the
4118 *	supplied offset.   The range is restricted to the buffer's size.
4119 */
4120static void
4121vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
4122{
4123	vm_ooffset_t soff, eoff;
4124
4125	/*
4126	 * Start and end offsets in buffer.  eoff - soff may not cross a
4127	 * page boundry or cross the end of the buffer.  The end of the
4128	 * buffer, in this case, is our file EOF, not the allocation size
4129	 * of the buffer.
4130	 */
4131	soff = off;
4132	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4133	if (eoff > bp->b_offset + bp->b_bcount)
4134		eoff = bp->b_offset + bp->b_bcount;
4135
4136	/*
4137	 * Set valid range.  This is typically the entire buffer and thus the
4138	 * entire page.
4139	 */
4140	if (eoff > soff) {
4141		vm_page_set_validclean(
4142		    m,
4143		   (vm_offset_t) (soff & PAGE_MASK),
4144		   (vm_offset_t) (eoff - soff)
4145		);
4146	}
4147}
4148
4149/*
4150 * Ensure that all buffer pages are not exclusive busied.  If any page is
4151 * exclusive busy, drain it.
4152 */
4153void
4154vfs_drain_busy_pages(struct buf *bp)
4155{
4156	vm_page_t m;
4157	int i, last_busied;
4158
4159	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
4160	last_busied = 0;
4161	for (i = 0; i < bp->b_npages; i++) {
4162		m = bp->b_pages[i];
4163		if (vm_page_xbusied(m)) {
4164			for (; last_busied < i; last_busied++)
4165				vm_page_sbusy(bp->b_pages[last_busied]);
4166			while (vm_page_xbusied(m)) {
4167				vm_page_lock(m);
4168				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4169				vm_page_busy_sleep(m, "vbpage");
4170				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4171			}
4172		}
4173	}
4174	for (i = 0; i < last_busied; i++)
4175		vm_page_sunbusy(bp->b_pages[i]);
4176}
4177
4178/*
4179 * This routine is called before a device strategy routine.
4180 * It is used to tell the VM system that paging I/O is in
4181 * progress, and treat the pages associated with the buffer
4182 * almost as being exclusive busy.  Also the object paging_in_progress
4183 * flag is handled to make sure that the object doesn't become
4184 * inconsistant.
4185 *
4186 * Since I/O has not been initiated yet, certain buffer flags
4187 * such as BIO_ERROR or B_INVAL may be in an inconsistant state
4188 * and should be ignored.
4189 */
4190void
4191vfs_busy_pages(struct buf *bp, int clear_modify)
4192{
4193	int i, bogus;
4194	vm_object_t obj;
4195	vm_ooffset_t foff;
4196	vm_page_t m;
4197
4198	if (!(bp->b_flags & B_VMIO))
4199		return;
4200
4201	obj = bp->b_bufobj->bo_object;
4202	foff = bp->b_offset;
4203	KASSERT(bp->b_offset != NOOFFSET,
4204	    ("vfs_busy_pages: no buffer offset"));
4205	VM_OBJECT_WLOCK(obj);
4206	vfs_drain_busy_pages(bp);
4207	if (bp->b_bufsize != 0)
4208		vfs_setdirty_locked_object(bp);
4209	bogus = 0;
4210	for (i = 0; i < bp->b_npages; i++) {
4211		m = bp->b_pages[i];
4212
4213		if ((bp->b_flags & B_CLUSTER) == 0) {
4214			vm_object_pip_add(obj, 1);
4215			vm_page_sbusy(m);
4216		}
4217		/*
4218		 * When readying a buffer for a read ( i.e
4219		 * clear_modify == 0 ), it is important to do
4220		 * bogus_page replacement for valid pages in
4221		 * partially instantiated buffers.  Partially
4222		 * instantiated buffers can, in turn, occur when
4223		 * reconstituting a buffer from its VM backing store
4224		 * base.  We only have to do this if B_CACHE is
4225		 * clear ( which causes the I/O to occur in the
4226		 * first place ).  The replacement prevents the read
4227		 * I/O from overwriting potentially dirty VM-backed
4228		 * pages.  XXX bogus page replacement is, uh, bogus.
4229		 * It may not work properly with small-block devices.
4230		 * We need to find a better way.
4231		 */
4232		if (clear_modify) {
4233			pmap_remove_write(m);
4234			vfs_page_set_validclean(bp, foff, m);
4235		} else if (m->valid == VM_PAGE_BITS_ALL &&
4236		    (bp->b_flags & B_CACHE) == 0) {
4237			bp->b_pages[i] = bogus_page;
4238			bogus++;
4239		}
4240		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4241	}
4242	VM_OBJECT_WUNLOCK(obj);
4243	if (bogus && buf_mapped(bp)) {
4244		BUF_CHECK_MAPPED(bp);
4245		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4246		    bp->b_pages, bp->b_npages);
4247	}
4248}
4249
4250/*
4251 *	vfs_bio_set_valid:
4252 *
4253 *	Set the range within the buffer to valid.  The range is
4254 *	relative to the beginning of the buffer, b_offset.  Note that
4255 *	b_offset itself may be offset from the beginning of the first
4256 *	page.
4257 */
4258void
4259vfs_bio_set_valid(struct buf *bp, int base, int size)
4260{
4261	int i, n;
4262	vm_page_t m;
4263
4264	if (!(bp->b_flags & B_VMIO))
4265		return;
4266
4267	/*
4268	 * Fixup base to be relative to beginning of first page.
4269	 * Set initial n to be the maximum number of bytes in the
4270	 * first page that can be validated.
4271	 */
4272	base += (bp->b_offset & PAGE_MASK);
4273	n = PAGE_SIZE - (base & PAGE_MASK);
4274
4275	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4276	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4277		m = bp->b_pages[i];
4278		if (n > size)
4279			n = size;
4280		vm_page_set_valid_range(m, base & PAGE_MASK, n);
4281		base += n;
4282		size -= n;
4283		n = PAGE_SIZE;
4284	}
4285	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4286}
4287
4288/*
4289 *	vfs_bio_clrbuf:
4290 *
4291 *	If the specified buffer is a non-VMIO buffer, clear the entire
4292 *	buffer.  If the specified buffer is a VMIO buffer, clear and
4293 *	validate only the previously invalid portions of the buffer.
4294 *	This routine essentially fakes an I/O, so we need to clear
4295 *	BIO_ERROR and B_INVAL.
4296 *
4297 *	Note that while we only theoretically need to clear through b_bcount,
4298 *	we go ahead and clear through b_bufsize.
4299 */
4300void
4301vfs_bio_clrbuf(struct buf *bp)
4302{
4303	int i, j, mask, sa, ea, slide;
4304
4305	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4306		clrbuf(bp);
4307		return;
4308	}
4309	bp->b_flags &= ~B_INVAL;
4310	bp->b_ioflags &= ~BIO_ERROR;
4311	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4312	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4313	    (bp->b_offset & PAGE_MASK) == 0) {
4314		if (bp->b_pages[0] == bogus_page)
4315			goto unlock;
4316		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4317		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4318		if ((bp->b_pages[0]->valid & mask) == mask)
4319			goto unlock;
4320		if ((bp->b_pages[0]->valid & mask) == 0) {
4321			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4322			bp->b_pages[0]->valid |= mask;
4323			goto unlock;
4324		}
4325	}
4326	sa = bp->b_offset & PAGE_MASK;
4327	slide = 0;
4328	for (i = 0; i < bp->b_npages; i++, sa = 0) {
4329		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4330		ea = slide & PAGE_MASK;
4331		if (ea == 0)
4332			ea = PAGE_SIZE;
4333		if (bp->b_pages[i] == bogus_page)
4334			continue;
4335		j = sa / DEV_BSIZE;
4336		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4337		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4338		if ((bp->b_pages[i]->valid & mask) == mask)
4339			continue;
4340		if ((bp->b_pages[i]->valid & mask) == 0)
4341			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4342		else {
4343			for (; sa < ea; sa += DEV_BSIZE, j++) {
4344				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4345					pmap_zero_page_area(bp->b_pages[i],
4346					    sa, DEV_BSIZE);
4347				}
4348			}
4349		}
4350		bp->b_pages[i]->valid |= mask;
4351	}
4352unlock:
4353	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4354	bp->b_resid = 0;
4355}
4356
4357void
4358vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4359{
4360	vm_page_t m;
4361	int i, n;
4362
4363	if (buf_mapped(bp)) {
4364		BUF_CHECK_MAPPED(bp);
4365		bzero(bp->b_data + base, size);
4366	} else {
4367		BUF_CHECK_UNMAPPED(bp);
4368		n = PAGE_SIZE - (base & PAGE_MASK);
4369		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4370			m = bp->b_pages[i];
4371			if (n > size)
4372				n = size;
4373			pmap_zero_page_area(m, base & PAGE_MASK, n);
4374			base += n;
4375			size -= n;
4376			n = PAGE_SIZE;
4377		}
4378	}
4379}
4380
4381/*
4382 * vm_hold_load_pages and vm_hold_free_pages get pages into
4383 * a buffers address space.  The pages are anonymous and are
4384 * not associated with a file object.
4385 */
4386static void
4387vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4388{
4389	vm_offset_t pg;
4390	vm_page_t p;
4391	int index;
4392
4393	BUF_CHECK_MAPPED(bp);
4394
4395	to = round_page(to);
4396	from = round_page(from);
4397	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4398
4399	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4400tryagain:
4401		/*
4402		 * note: must allocate system pages since blocking here
4403		 * could interfere with paging I/O, no matter which
4404		 * process we are.
4405		 */
4406		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4407		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4408		if (p == NULL) {
4409			VM_WAIT;
4410			goto tryagain;
4411		}
4412		pmap_qenter(pg, &p, 1);
4413		bp->b_pages[index] = p;
4414	}
4415	bp->b_npages = index;
4416}
4417
4418/* Return pages associated with this buf to the vm system */
4419static void
4420vm_hold_free_pages(struct buf *bp, int newbsize)
4421{
4422	vm_offset_t from;
4423	vm_page_t p;
4424	int index, newnpages;
4425
4426	BUF_CHECK_MAPPED(bp);
4427
4428	from = round_page((vm_offset_t)bp->b_data + newbsize);
4429	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4430	if (bp->b_npages > newnpages)
4431		pmap_qremove(from, bp->b_npages - newnpages);
4432	for (index = newnpages; index < bp->b_npages; index++) {
4433		p = bp->b_pages[index];
4434		bp->b_pages[index] = NULL;
4435		if (vm_page_sbusied(p))
4436			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4437			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4438		p->wire_count--;
4439		vm_page_free(p);
4440		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
4441	}
4442	bp->b_npages = newnpages;
4443}
4444
4445/*
4446 * Map an IO request into kernel virtual address space.
4447 *
4448 * All requests are (re)mapped into kernel VA space.
4449 * Notice that we use b_bufsize for the size of the buffer
4450 * to be mapped.  b_bcount might be modified by the driver.
4451 *
4452 * Note that even if the caller determines that the address space should
4453 * be valid, a race or a smaller-file mapped into a larger space may
4454 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4455 * check the return value.
4456 *
4457 * This function only works with pager buffers.
4458 */
4459int
4460vmapbuf(struct buf *bp, int mapbuf)
4461{
4462	vm_prot_t prot;
4463	int pidx;
4464
4465	if (bp->b_bufsize < 0)
4466		return (-1);
4467	prot = VM_PROT_READ;
4468	if (bp->b_iocmd == BIO_READ)
4469		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
4470	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4471	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4472	    btoc(MAXPHYS))) < 0)
4473		return (-1);
4474	bp->b_npages = pidx;
4475	bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4476	if (mapbuf || !unmapped_buf_allowed) {
4477		pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx);
4478		bp->b_data = bp->b_kvabase + bp->b_offset;
4479	} else
4480		bp->b_data = unmapped_buf;
4481	return(0);
4482}
4483
4484/*
4485 * Free the io map PTEs associated with this IO operation.
4486 * We also invalidate the TLB entries and restore the original b_addr.
4487 *
4488 * This function only works with pager buffers.
4489 */
4490void
4491vunmapbuf(struct buf *bp)
4492{
4493	int npages;
4494
4495	npages = bp->b_npages;
4496	if (buf_mapped(bp))
4497		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4498	vm_page_unhold_pages(bp->b_pages, npages);
4499
4500	bp->b_data = unmapped_buf;
4501}
4502
4503void
4504bdone(struct buf *bp)
4505{
4506	struct mtx *mtxp;
4507
4508	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4509	mtx_lock(mtxp);
4510	bp->b_flags |= B_DONE;
4511	wakeup(bp);
4512	mtx_unlock(mtxp);
4513}
4514
4515void
4516bwait(struct buf *bp, u_char pri, const char *wchan)
4517{
4518	struct mtx *mtxp;
4519
4520	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4521	mtx_lock(mtxp);
4522	while ((bp->b_flags & B_DONE) == 0)
4523		msleep(bp, mtxp, pri, wchan, 0);
4524	mtx_unlock(mtxp);
4525}
4526
4527int
4528bufsync(struct bufobj *bo, int waitfor)
4529{
4530
4531	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4532}
4533
4534void
4535bufstrategy(struct bufobj *bo, struct buf *bp)
4536{
4537	int i = 0;
4538	struct vnode *vp;
4539
4540	vp = bp->b_vp;
4541	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4542	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4543	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4544	i = VOP_STRATEGY(vp, bp);
4545	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4546}
4547
4548void
4549bufobj_wrefl(struct bufobj *bo)
4550{
4551
4552	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4553	ASSERT_BO_WLOCKED(bo);
4554	bo->bo_numoutput++;
4555}
4556
4557void
4558bufobj_wref(struct bufobj *bo)
4559{
4560
4561	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4562	BO_LOCK(bo);
4563	bo->bo_numoutput++;
4564	BO_UNLOCK(bo);
4565}
4566
4567void
4568bufobj_wdrop(struct bufobj *bo)
4569{
4570
4571	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4572	BO_LOCK(bo);
4573	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4574	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4575		bo->bo_flag &= ~BO_WWAIT;
4576		wakeup(&bo->bo_numoutput);
4577	}
4578	BO_UNLOCK(bo);
4579}
4580
4581int
4582bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4583{
4584	int error;
4585
4586	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4587	ASSERT_BO_WLOCKED(bo);
4588	error = 0;
4589	while (bo->bo_numoutput) {
4590		bo->bo_flag |= BO_WWAIT;
4591		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4592		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4593		if (error)
4594			break;
4595	}
4596	return (error);
4597}
4598
4599void
4600bpin(struct buf *bp)
4601{
4602	struct mtx *mtxp;
4603
4604	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4605	mtx_lock(mtxp);
4606	bp->b_pin_count++;
4607	mtx_unlock(mtxp);
4608}
4609
4610void
4611bunpin(struct buf *bp)
4612{
4613	struct mtx *mtxp;
4614
4615	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4616	mtx_lock(mtxp);
4617	if (--bp->b_pin_count == 0)
4618		wakeup(bp);
4619	mtx_unlock(mtxp);
4620}
4621
4622void
4623bunpin_wait(struct buf *bp)
4624{
4625	struct mtx *mtxp;
4626
4627	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4628	mtx_lock(mtxp);
4629	while (bp->b_pin_count > 0)
4630		msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4631	mtx_unlock(mtxp);
4632}
4633
4634/*
4635 * Set bio_data or bio_ma for struct bio from the struct buf.
4636 */
4637void
4638bdata2bio(struct buf *bp, struct bio *bip)
4639{
4640
4641	if (!buf_mapped(bp)) {
4642		KASSERT(unmapped_buf_allowed, ("unmapped"));
4643		bip->bio_ma = bp->b_pages;
4644		bip->bio_ma_n = bp->b_npages;
4645		bip->bio_data = unmapped_buf;
4646		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4647		bip->bio_flags |= BIO_UNMAPPED;
4648		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4649		    PAGE_SIZE == bp->b_npages,
4650		    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
4651		    (long long)bip->bio_length, bip->bio_ma_n));
4652	} else {
4653		bip->bio_data = bp->b_data;
4654		bip->bio_ma = NULL;
4655	}
4656}
4657
4658#include "opt_ddb.h"
4659#ifdef DDB
4660#include <ddb/ddb.h>
4661
4662/* DDB command to show buffer data */
4663DB_SHOW_COMMAND(buffer, db_show_buffer)
4664{
4665	/* get args */
4666	struct buf *bp = (struct buf *)addr;
4667
4668	if (!have_addr) {
4669		db_printf("usage: show buffer <addr>\n");
4670		return;
4671	}
4672
4673	db_printf("buf at %p\n", bp);
4674	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4675	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4676	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4677	db_printf(
4678	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4679	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4680	    "b_dep = %p\n",
4681	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4682	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4683	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4684	db_printf("b_kvabase = %p, b_kvasize = %d\n",
4685	    bp->b_kvabase, bp->b_kvasize);
4686	if (bp->b_npages) {
4687		int i;
4688		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4689		for (i = 0; i < bp->b_npages; i++) {
4690			vm_page_t m;
4691			m = bp->b_pages[i];
4692			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4693			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4694			if ((i + 1) < bp->b_npages)
4695				db_printf(",");
4696		}
4697		db_printf("\n");
4698	}
4699	db_printf(" ");
4700	BUF_LOCKPRINTINFO(bp);
4701}
4702
4703DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4704{
4705	struct buf *bp;
4706	int i;
4707
4708	for (i = 0; i < nbuf; i++) {
4709		bp = &buf[i];
4710		if (BUF_ISLOCKED(bp)) {
4711			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4712			db_printf("\n");
4713		}
4714	}
4715}
4716
4717DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4718{
4719	struct vnode *vp;
4720	struct buf *bp;
4721
4722	if (!have_addr) {
4723		db_printf("usage: show vnodebufs <addr>\n");
4724		return;
4725	}
4726	vp = (struct vnode *)addr;
4727	db_printf("Clean buffers:\n");
4728	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4729		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4730		db_printf("\n");
4731	}
4732	db_printf("Dirty buffers:\n");
4733	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4734		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4735		db_printf("\n");
4736	}
4737}
4738
4739DB_COMMAND(countfreebufs, db_coundfreebufs)
4740{
4741	struct buf *bp;
4742	int i, used = 0, nfree = 0;
4743
4744	if (have_addr) {
4745		db_printf("usage: countfreebufs\n");
4746		return;
4747	}
4748
4749	for (i = 0; i < nbuf; i++) {
4750		bp = &buf[i];
4751		if (bp->b_qindex == QUEUE_EMPTY)
4752			nfree++;
4753		else
4754			used++;
4755	}
4756
4757	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4758	    nfree + used);
4759	db_printf("numfreebuffers is %d\n", numfreebuffers);
4760}
4761#endif /* DDB */
4762