vfs_bio.c revision 30309
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17 *    is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 *    are met.
20 *
21 * $Id: vfs_bio.c,v 1.128 1997/09/21 22:00:25 gibbs Exp $
22 */
23
24/*
25 * this file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme.  Pains have been taken to make
27 * sure that the performance degradation associated with schemes such
28 * as this is not realized.
29 *
30 * Author:  John S. Dyson
31 * Significant help during the development and debugging phases
32 * had been provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#include "opt_bounce.h"
36
37#define VMIO
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/sysproto.h>
41#include <sys/kernel.h>
42#include <sys/sysctl.h>
43#include <sys/proc.h>
44#include <sys/vnode.h>
45#include <sys/vmmeter.h>
46#include <vm/vm.h>
47#include <vm/vm_param.h>
48#include <vm/vm_prot.h>
49#include <vm/vm_kern.h>
50#include <vm/vm_pageout.h>
51#include <vm/vm_page.h>
52#include <vm/vm_object.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_map.h>
55#include <sys/buf.h>
56#include <sys/mount.h>
57#include <sys/malloc.h>
58#include <sys/resourcevar.h>
59#include <sys/proc.h>
60
61#include <miscfs/specfs/specdev.h>
62
63MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
64
65static void vfs_update __P((void));
66static struct	proc *updateproc;
67static struct kproc_desc up_kp = {
68	"update",
69	vfs_update,
70	&updateproc
71};
72SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
73
74struct buf *buf;		/* buffer header pool */
75struct swqueue bswlist;
76
77int count_lock_queue __P((void));
78static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
79		vm_offset_t to);
80static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
81		vm_offset_t to);
82static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
83			      vm_offset_t off, vm_offset_t size,
84			      vm_page_t m);
85static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
86			       int pageno, vm_page_t m);
87static void vfs_clean_pages(struct buf * bp);
88static void vfs_setdirty(struct buf *bp);
89static void vfs_vmio_release(struct buf *bp);
90static void flushdirtybuffers(int slpflag, int slptimeo);
91
92int needsbuffer;
93
94/*
95 * Internal update daemon, process 3
96 *	The variable vfs_update_wakeup allows for internal syncs.
97 */
98int vfs_update_wakeup;
99
100
101/*
102 * buffers base kva
103 */
104
105/*
106 * bogus page -- for I/O to/from partially complete buffers
107 * this is a temporary solution to the problem, but it is not
108 * really that bad.  it would be better to split the buffer
109 * for input in the case of buffers partially already in memory,
110 * but the code is intricate enough already.
111 */
112vm_page_t bogus_page;
113static vm_offset_t bogus_offset;
114
115static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
116	bufmallocspace, maxbufmallocspace;
117int numdirtybuffers, lodirtybuffers, hidirtybuffers;
118static int numfreebuffers, lofreebuffers, hifreebuffers;
119
120SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
121	&numdirtybuffers, 0, "");
122SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
123	&lodirtybuffers, 0, "");
124SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
125	&hidirtybuffers, 0, "");
126SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
127	&numfreebuffers, 0, "");
128SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
129	&lofreebuffers, 0, "");
130SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
131	&hifreebuffers, 0, "");
132SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
133	&maxbufspace, 0, "");
134SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
135	&bufspace, 0, "");
136SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
137	&maxvmiobufspace, 0, "");
138SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
139	&vmiospace, 0, "");
140SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
141	&maxbufmallocspace, 0, "");
142SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
143	&bufmallocspace, 0, "");
144
145static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
146static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
147
148extern int vm_swap_size;
149
150#define BUF_MAXUSE 24
151
152#define VFS_BIO_NEED_ANY 1
153#define VFS_BIO_NEED_LOWLIMIT 2
154#define VFS_BIO_NEED_FREE 4
155
156/*
157 * Initialize buffer headers and related structures.
158 */
159void
160bufinit()
161{
162	struct buf *bp;
163	int i;
164
165	TAILQ_INIT(&bswlist);
166	LIST_INIT(&invalhash);
167
168	/* first, make a null hash table */
169	for (i = 0; i < BUFHSZ; i++)
170		LIST_INIT(&bufhashtbl[i]);
171
172	/* next, make a null set of free lists */
173	for (i = 0; i < BUFFER_QUEUES; i++)
174		TAILQ_INIT(&bufqueues[i]);
175
176	/* finally, initialize each buffer header and stick on empty q */
177	for (i = 0; i < nbuf; i++) {
178		bp = &buf[i];
179		bzero(bp, sizeof *bp);
180		bp->b_flags = B_INVAL;	/* we're just an empty header */
181		bp->b_dev = NODEV;
182		bp->b_rcred = NOCRED;
183		bp->b_wcred = NOCRED;
184		bp->b_qindex = QUEUE_EMPTY;
185		bp->b_vnbufs.le_next = NOLIST;
186		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
187		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
188	}
189/*
190 * maxbufspace is currently calculated to support all filesystem blocks
191 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
192 * cache is still the same as it would be for 8K filesystems.  This
193 * keeps the size of the buffer cache "in check" for big block filesystems.
194 */
195	maxbufspace = (nbuf + 8) * DFLTBSIZE;
196/*
197 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
198 */
199	maxvmiobufspace = 2 * maxbufspace / 3;
200/*
201 * Limit the amount of malloc memory since it is wired permanently into
202 * the kernel space.  Even though this is accounted for in the buffer
203 * allocation, we don't want the malloced region to grow uncontrolled.
204 * The malloc scheme improves memory utilization significantly for average
205 * (small) directories.
206 */
207	maxbufmallocspace = maxbufspace / 20;
208
209/*
210 * Reduce the probability of deadlock conditions by limiting the
211 * number of dirty buffers.
212 */
213	hidirtybuffers = nbuf / 6 + 20;
214	lodirtybuffers = nbuf / 12 + 10;
215	numdirtybuffers = 0;
216	lofreebuffers = nbuf / 18 + 5;
217	hifreebuffers = 2 * lofreebuffers;
218	numfreebuffers = nbuf;
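	/*
	 * Illustrative numbers only (the nbuf value is hypothetical): with
	 * nbuf = 1024 the expressions above give hidirtybuffers = 190,
	 * lodirtybuffers = 95, lofreebuffers = 61 and hifreebuffers = 122,
	 * so the dirty and free watermarks track a modest fraction of the
	 * total buffer headers.
	 */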
219
220	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
221	bogus_page = vm_page_alloc(kernel_object,
222			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
223			VM_ALLOC_NORMAL);
224
225}
226
227/*
228 * Free the kva allocation for a buffer.
229 * Must be called only at splbio or higher,
230 *  as this is the only locking for buffer_map.
231 */
232static void
233bfreekva(struct buf * bp)
234{
235	if (bp->b_kvasize == 0)
236		return;
237
238	vm_map_delete(buffer_map,
239		(vm_offset_t) bp->b_kvabase,
240		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
241
242	bp->b_kvasize = 0;
243
244}
245
246/*
247 * remove the buffer from the appropriate free list
248 */
249void
250bremfree(struct buf * bp)
251{
252	int s = splbio();
253
254	if (bp->b_qindex != QUEUE_NONE) {
255		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
256		bp->b_qindex = QUEUE_NONE;
257	} else {
258#if !defined(MAX_PERF)
259		panic("bremfree: removing a buffer when not on a queue");
260#endif
261	}
262	if ((bp->b_flags & B_INVAL) ||
263		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
264		--numfreebuffers;
265	splx(s);
266}
267
268/*
269 * Get a buffer with the specified data.  Look in the cache first.
270 */
271int
272bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
273    struct buf ** bpp)
274{
275	struct buf *bp;
276
277	bp = getblk(vp, blkno, size, 0, 0);
278	*bpp = bp;
279
280	/* if not found in cache, do some I/O */
281	if ((bp->b_flags & B_CACHE) == 0) {
282		if (curproc != NULL)
283			curproc->p_stats->p_ru.ru_inblock++;
284		bp->b_flags |= B_READ;
285		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
286		if (bp->b_rcred == NOCRED) {
287			if (cred != NOCRED)
288				crhold(cred);
289			bp->b_rcred = cred;
290		}
291		vfs_busy_pages(bp, 0);
292		VOP_STRATEGY(bp);
293		return (biowait(bp));
294	}
295	return (0);
296}
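/*
 * Illustrative caller sketch, not part of this file; lbn, bsize, boff,
 * len and uio are hypothetical.  A typical filesystem read path is:
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	error = uiomove(bp->b_data + boff, len, uio);
 *	bqrelse(bp);
 */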
297
298/*
299 * Operates like bread, but also starts asynchronous I/O on
300 * read-ahead blocks.
301 */
302int
303breadn(struct vnode * vp, daddr_t blkno, int size,
304    daddr_t * rablkno, int *rabsize,
305    int cnt, struct ucred * cred, struct buf ** bpp)
306{
307	struct buf *bp, *rabp;
308	int i;
309	int rv = 0, readwait = 0;
310
311	*bpp = bp = getblk(vp, blkno, size, 0, 0);
312
313	/* if not found in cache, do some I/O */
314	if ((bp->b_flags & B_CACHE) == 0) {
315		if (curproc != NULL)
316			curproc->p_stats->p_ru.ru_inblock++;
317		bp->b_flags |= B_READ;
318		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
319		if (bp->b_rcred == NOCRED) {
320			if (cred != NOCRED)
321				crhold(cred);
322			bp->b_rcred = cred;
323		}
324		vfs_busy_pages(bp, 0);
325		VOP_STRATEGY(bp);
326		++readwait;
327	}
328	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
329		if (inmem(vp, *rablkno))
330			continue;
331		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
332
333		if ((rabp->b_flags & B_CACHE) == 0) {
334			if (curproc != NULL)
335				curproc->p_stats->p_ru.ru_inblock++;
336			rabp->b_flags |= B_READ | B_ASYNC;
337			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
338			if (rabp->b_rcred == NOCRED) {
339				if (cred != NOCRED)
340					crhold(cred);
341				rabp->b_rcred = cred;
342			}
343			vfs_busy_pages(rabp, 0);
344			VOP_STRATEGY(rabp);
345		} else {
346			brelse(rabp);
347		}
348	}
349
350	if (readwait) {
351		rv = biowait(bp);
352	}
353	return (rv);
354}
355
356/*
357 * Write, release buffer on completion.  (Done by iodone
358 * if async.)
359 */
360int
361bwrite(struct buf * bp)
362{
363	int oldflags = bp->b_flags;
364
365	if (bp->b_flags & B_INVAL) {
366		brelse(bp);
367		return (0);
368	}
369#if !defined(MAX_PERF)
370	if (!(bp->b_flags & B_BUSY))
371		panic("bwrite: buffer is not busy???");
372#endif
373
374	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
375	bp->b_flags |= B_WRITEINPROG;
376
377	if ((oldflags & B_DELWRI) == B_DELWRI) {
378		--numdirtybuffers;
379		reassignbuf(bp, bp->b_vp);
380	}
381
382	bp->b_vp->v_numoutput++;
383	vfs_busy_pages(bp, 1);
384	if (curproc != NULL)
385		curproc->p_stats->p_ru.ru_oublock++;
386	VOP_STRATEGY(bp);
387
388	if ((oldflags & B_ASYNC) == 0) {
389		int rtval = biowait(bp);
390
391		if (oldflags & B_DELWRI) {
392			reassignbuf(bp, bp->b_vp);
393		}
394		brelse(bp);
395		return (rtval);
396	}
397	return (0);
398}
399
400int
401vn_bwrite(ap)
402	struct vop_bwrite_args *ap;
403{
404	return (bwrite(ap->a_bp));
405}
406
407void
408vfs_bio_need_satisfy(void) {
409	++numfreebuffers;
410	if (!needsbuffer)
411		return;
412	if (numdirtybuffers < lodirtybuffers) {
413		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
414	} else {
415		needsbuffer &= ~VFS_BIO_NEED_ANY;
416	}
417	if (numfreebuffers >= hifreebuffers) {
418		needsbuffer &= ~VFS_BIO_NEED_FREE;
419	}
420	wakeup(&needsbuffer);
421}
422
423/*
424 * Delayed write. (Buffer is marked dirty).
425 */
426void
427bdwrite(struct buf * bp)
428{
429
430#if !defined(MAX_PERF)
431	if ((bp->b_flags & B_BUSY) == 0) {
432		panic("bdwrite: buffer is not busy");
433	}
434#endif
435
436	if (bp->b_flags & B_INVAL) {
437		brelse(bp);
438		return;
439	}
440	if (bp->b_flags & B_TAPE) {
441		bawrite(bp);
442		return;
443	}
444	bp->b_flags &= ~(B_READ|B_RELBUF);
445	if ((bp->b_flags & B_DELWRI) == 0) {
446		bp->b_flags |= B_DONE | B_DELWRI;
447		reassignbuf(bp, bp->b_vp);
448		++numdirtybuffers;
449	}
450
451	/*
452	 * This bmap keeps the system from needing to do the bmap later,
453	 * perhaps when the system is attempting to do a sync.  Since it
454	 * is likely that the indirect block -- or whatever other data structure
455	 * the filesystem needs -- is still in memory now, it is a good
456	 * thing to do this.  Note also, that if the pageout daemon is
457	 * requesting a sync -- there might not be enough memory to do
458	 * the bmap then...  So, this is important to do.
459	 */
460	if (bp->b_lblkno == bp->b_blkno) {
461		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
462	}
463
464	/*
465	 * Set the *dirty* buffer range based upon the VM system dirty pages.
466	 */
467	vfs_setdirty(bp);
468
469	/*
470	 * We need to do this here to satisfy the vnode_pager and the
471	 * pageout daemon, so that it thinks that the pages have been
472	 * "cleaned".  Note that since the pages are in a delayed write
473	 * buffer -- the VFS layer "will" see that the pages get written
474	 * out on the next sync, or perhaps the cluster will be completed.
475	 */
476	vfs_clean_pages(bp);
477	bqrelse(bp);
478
479	if (numdirtybuffers >= hidirtybuffers)
480		flushdirtybuffers(0, 0);
481
482	return;
483}
484
485/*
486 * Asynchronous write.
487 * Start output on a buffer, but do not wait for it to complete.
488 * The buffer is released when the output completes.
489 */
490void
491bawrite(struct buf * bp)
492{
493	bp->b_flags |= B_ASYNC;
494	(void) VOP_BWRITE(bp);
495}
496
497/*
498 * Ordered write.
499 * Start output on a buffer, but only wait for it to complete if the
500 * output device cannot guarantee ordering in some other way.  Devices
501 * that can perform asynchronous ordered writes will set the B_ASYNC
502 * flag in their strategy routine.
503 * The buffer is released when the output completes.
504 */
505int
506bowrite(struct buf * bp)
507{
508	/*
509	 * XXX Add in B_ASYNC once the SCSI
510	 *     layer can deal with ordered
511	 *     writes properly.
512	 */
513	bp->b_flags |= B_ORDERED;
514	return (VOP_BWRITE(bp));
515}
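/*
 * Illustrative caller sketch, not part of this file; data, boff, len and
 * ioflag are hypothetical.  Once a buffer obtained via bread() or getblk()
 * has been modified, it is typically pushed out one of three ways:
 *
 *	bcopy(data, bp->b_data + boff, len);
 *	if (ioflag & IO_SYNC)
 *		error = bwrite(bp);		wait for completion
 *	else if (the block is complete)
 *		bawrite(bp);			start the write, don't wait
 *	else
 *		bdwrite(bp);			just mark the buffer dirty
 */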
516
517/*
518 * Release a buffer.
519 */
520void
521brelse(struct buf * bp)
522{
523	int s;
524
525	if (bp->b_flags & B_CLUSTER) {
526		relpbuf(bp);
527		return;
528	}
529	/* anyone need a "free" block? */
530	s = splbio();
531
532	/* anyone need this block? */
533	if (bp->b_flags & B_WANTED) {
534		bp->b_flags &= ~(B_WANTED | B_AGE);
535		wakeup(bp);
536	}
537
538	if (bp->b_flags & B_LOCKED)
539		bp->b_flags &= ~B_ERROR;
540
541	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
542	    (bp->b_bufsize <= 0)) {
543		bp->b_flags |= B_INVAL;
544		if (bp->b_flags & B_DELWRI)
545			--numdirtybuffers;
546		bp->b_flags &= ~(B_DELWRI | B_CACHE);
547		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
548			if (bp->b_bufsize)
549				allocbuf(bp, 0);
550			brelvp(bp);
551		}
552	}
553
554	/*
555	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
556	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
557	 * but the VM object is kept around.  The B_NOCACHE flag is used to
558	 * invalidate the pages in the VM object.
559	 *
560	 * If the buffer is a partially filled NFS buffer, keep it
561	 * since invalidating it now will lose information.  The valid
562	 * flags in the vm_pages have only DEV_BSIZE resolution but
563	 * the b_validoff, b_validend fields have byte resolution.
564	 * This can avoid unnecessary re-reads of the buffer.
565	 * XXX this seems to cause performance problems.
566	 */
567	if ((bp->b_flags & B_VMIO)
568	    && !(bp->b_vp->v_tag == VT_NFS &&
569		 bp->b_vp->v_type != VBLK &&
570		 (bp->b_flags & B_DELWRI) != 0)
571#ifdef notdef
572	    && (bp->b_vp->v_tag != VT_NFS
573		|| bp->b_vp->v_type == VBLK
574		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
575		|| bp->b_validend == 0
576		|| (bp->b_validoff == 0
577		    && bp->b_validend == bp->b_bufsize))
578#endif
579	    ) {
580		vm_ooffset_t foff;
581		vm_object_t obj;
582		int i, resid;
583		vm_page_t m;
584		struct vnode *vp;
585		int iototal = bp->b_bufsize;
586
587		vp = bp->b_vp;
588
589#if !defined(MAX_PERF)
590		if (!vp)
591			panic("brelse: missing vp");
592#endif
593
594		if (bp->b_npages) {
595			vm_pindex_t poff;
596			obj = (vm_object_t) vp->v_object;
597			if (vp->v_type == VBLK)
598				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
599			else
600				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
601			poff = OFF_TO_IDX(foff);
602			for (i = 0; i < bp->b_npages; i++) {
603				m = bp->b_pages[i];
604				if (m == bogus_page) {
605					m = vm_page_lookup(obj, poff + i);
606#if !defined(MAX_PERF)
607					if (!m) {
608						panic("brelse: page missing\n");
609					}
610#endif
611					bp->b_pages[i] = m;
612					pmap_qenter(trunc_page(bp->b_data),
613						bp->b_pages, bp->b_npages);
614				}
615				resid = IDX_TO_OFF(m->pindex+1) - foff;
616				if (resid > iototal)
617					resid = iototal;
618				if (resid > 0) {
619					/*
620					 * Don't invalidate the page if the local machine has already
621					 * modified it.  This is the lesser of two evils, and should
622					 * be fixed.
623					 */
624					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
625						vm_page_test_dirty(m);
626						if (m->dirty == 0) {
627							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
628							if (m->valid == 0)
629								vm_page_protect(m, VM_PROT_NONE);
630						}
631					}
632					if (resid >= PAGE_SIZE) {
633						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
634							bp->b_flags |= B_INVAL;
635						}
636					} else {
637						if (!vm_page_is_valid(m,
638							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
639							bp->b_flags |= B_INVAL;
640						}
641					}
642				}
643				foff += resid;
644				iototal -= resid;
645			}
646		}
647		if (bp->b_flags & (B_INVAL | B_RELBUF))
648			vfs_vmio_release(bp);
649	}
650#if !defined(MAX_PERF)
651	if (bp->b_qindex != QUEUE_NONE)
652		panic("brelse: free buffer onto another queue???");
653#endif
654
655	/* enqueue */
656	/* buffers with no memory */
657	if (bp->b_bufsize == 0) {
658		bp->b_flags |= B_INVAL;
659		bp->b_qindex = QUEUE_EMPTY;
660		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
661		LIST_REMOVE(bp, b_hash);
662		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
663		bp->b_dev = NODEV;
664		/*
665		 * Get rid of the kva allocation *now*
666		 */
667		bfreekva(bp);
668
669	/* buffers with junk contents */
670	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
671		bp->b_flags |= B_INVAL;
672		bp->b_qindex = QUEUE_AGE;
673		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
674		LIST_REMOVE(bp, b_hash);
675		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
676		bp->b_dev = NODEV;
677
678	/* buffers that are locked */
679	} else if (bp->b_flags & B_LOCKED) {
680		bp->b_qindex = QUEUE_LOCKED;
681		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
682
683	/* buffers with stale but valid contents */
684	} else if (bp->b_flags & B_AGE) {
685		bp->b_qindex = QUEUE_AGE;
686		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
687
688	/* buffers with valid and quite potentially reusable contents */
689	} else {
690		bp->b_qindex = QUEUE_LRU;
691		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
692	}
693
694	if ((bp->b_flags & B_INVAL) ||
695		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
696		if (bp->b_flags & B_DELWRI) {
697			--numdirtybuffers;
698			bp->b_flags &= ~B_DELWRI;
699		}
700		vfs_bio_need_satisfy();
701	}
702
703	/* unlock */
704	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
705				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
706	splx(s);
707}
708
709/*
710 * Release a buffer.
711 */
712void
713bqrelse(struct buf * bp)
714{
715	int s;
716
717	s = splbio();
718
719	/* anyone need this block? */
720	if (bp->b_flags & B_WANTED) {
721		bp->b_flags &= ~(B_WANTED | B_AGE);
722		wakeup(bp);
723	}
724
725#if !defined(MAX_PERF)
726	if (bp->b_qindex != QUEUE_NONE)
727		panic("bqrelse: free buffer onto another queue???");
728#endif
729
730	if (bp->b_flags & B_LOCKED) {
731		bp->b_flags &= ~B_ERROR;
732		bp->b_qindex = QUEUE_LOCKED;
733		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
734		/* buffers with stale but valid contents */
735	} else {
736		bp->b_qindex = QUEUE_LRU;
737		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
738	}
739
740	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
741		vfs_bio_need_satisfy();
742	}
743
744	/* unlock */
745	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
746		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
747	splx(s);
748}
749
750static void
751vfs_vmio_release(bp)
752	struct buf *bp;
753{
754	int i;
755	vm_page_t m;
756
757	for (i = 0; i < bp->b_npages; i++) {
758		m = bp->b_pages[i];
759		bp->b_pages[i] = NULL;
760		vm_page_unwire(m);
761		/*
762		 * We don't mess with busy pages, it is
763		 * the responsibility of the process that
764		 * busied the pages to deal with them.
765		 */
766		if ((m->flags & PG_BUSY) || (m->busy != 0))
767			continue;
768
769		if (m->wire_count == 0) {
770
771			if (m->flags & PG_WANTED) {
772				m->flags &= ~PG_WANTED;
773				wakeup(m);
774			}
775
776			/*
777			 * If this is an async free, we cannot place
778			 * pages onto the cache queue, so we don't
779			 * modify any queues.
780			 * This is probably in error (for perf reasons),
781			 * and we will eventually need to build
782			 * a more complete infrastructure to support I/O
783			 * rundown.
784			 */
785			if ((bp->b_flags & B_ASYNC) == 0) {
786
787			/*
788			 * In the case of sync buffer frees, we can do pretty much
789			 * anything to any of the memory queues.  Specifically,
790			 * the cache queue is okay to be modified.
791			 */
792				if (m->valid) {
793					if(m->dirty == 0)
794						vm_page_test_dirty(m);
795					/*
796					 * this keeps pressure off of the process memory
797					 */
798					if (m->dirty == 0 && m->hold_count == 0)
799						vm_page_cache(m);
800					else
801						vm_page_deactivate(m);
802				} else if (m->hold_count == 0) {
803					vm_page_protect(m, VM_PROT_NONE);
804					vm_page_free(m);
805				}
806			} else {
807				/*
808				 * If async, then at least we clear the
809				 * act_count.
810				 */
811				m->act_count = 0;
812			}
813		}
814	}
815	bufspace -= bp->b_bufsize;
816	vmiospace -= bp->b_bufsize;
817	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
818	bp->b_npages = 0;
819	bp->b_bufsize = 0;
820	bp->b_flags &= ~B_VMIO;
821	if (bp->b_vp)
822		brelvp(bp);
823}
824
825/*
826 * Check to see if a block is currently memory resident.
827 */
828struct buf *
829gbincore(struct vnode * vp, daddr_t blkno)
830{
831	struct buf *bp;
832	struct bufhashhdr *bh;
833
834	bh = BUFHASH(vp, blkno);
835	bp = bh->lh_first;
836
837	/* Search hash chain */
838	while (bp != NULL) {
839		/* hit */
840		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
841		    (bp->b_flags & B_INVAL) == 0) {
842			break;
843		}
844		bp = bp->b_hash.le_next;
845	}
846	return (bp);
847}
848
849/*
850 * this routine implements clustered async writes for
851 * clearing out B_DELWRI buffers...  This is much better
852 * than the old way of writing only one buffer at a time.
853 */
854int
855vfs_bio_awrite(struct buf * bp)
856{
857	int i;
858	daddr_t lblkno = bp->b_lblkno;
859	struct vnode *vp = bp->b_vp;
860	int s;
861	int ncl;
862	struct buf *bpa;
863	int nwritten;
864
865	s = splbio();
866	/*
867	 * right now we support clustered writing only to regular files
868	 */
869	if ((vp->v_type == VREG) &&
870	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
871	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
872		int size;
873		int maxcl;
874
875		size = vp->v_mount->mnt_stat.f_iosize;
876		maxcl = MAXPHYS / size;
877
878		for (i = 1; i < maxcl; i++) {
879			if ((bpa = gbincore(vp, lblkno + i)) &&
880			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
881			    (B_DELWRI | B_CLUSTEROK)) &&
882			    (bpa->b_bufsize == size)) {
883				if ((bpa->b_blkno == bpa->b_lblkno) ||
884				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
885					break;
886			} else {
887				break;
888			}
889		}
890		ncl = i;
891		/*
892		 * this is a possible cluster write
893		 */
894		if (ncl != 1) {
895			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
896			splx(s);
897			return nwritten;
898		}
899	}
900	bremfree(bp);
901	splx(s);
902	/*
903	 * default (old) behavior, writing out only one block
904	 */
905	bp->b_flags |= B_BUSY | B_ASYNC;
906	nwritten = bp->b_bufsize;
907	(void) VOP_BWRITE(bp);
908	return nwritten;
909}
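/*
 * Worked example, assuming an 8K f_iosize and DEV_BSHIFT == 9: the buffer
 * at lblkno + 1 can join the cluster only if its b_blkno equals
 * bp->b_blkno + ((1 * 8192) >> 9) == bp->b_blkno + 16, i.e. only if the
 * two blocks are physically contiguous in DEV_BSIZE units.
 */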
910
911
912/*
913 * Find a buffer header which is available for use.
914 */
915static struct buf *
916getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize)
917{
918	struct buf *bp;
919	int nbyteswritten = 0;
920	vm_offset_t addr;
921	static int writerecursion = 0;
922
923start:
924	if (bufspace >= maxbufspace)
925		goto trytofreespace;
926
927	/* can we constitute a new buffer? */
928	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
929#if !defined(MAX_PERF)
930		if (bp->b_qindex != QUEUE_EMPTY)
931			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
932			    bp->b_qindex);
933#endif
934		bp->b_flags |= B_BUSY;
935		bremfree(bp);
936		goto fillbuf;
937	}
938trytofreespace:
939	/*
940	 * We keep the file I/O from hogging metadata I/O.
941	 * This is desirable because file data is cached in the
942	 * VM/Buffer cache even if a buffer is freed.
943	 */
944	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
945#if !defined(MAX_PERF)
946		if (bp->b_qindex != QUEUE_AGE)
947			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
948			    bp->b_qindex);
949#endif
950	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
951#if !defined(MAX_PERF)
952		if (bp->b_qindex != QUEUE_LRU)
953			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
954			    bp->b_qindex);
955#endif
956	}
957	if (!bp) {
958		/* wait for a free buffer of any kind */
959		needsbuffer |= VFS_BIO_NEED_ANY;
960		do
961			tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf",
962			    slptimeo);
963		while (needsbuffer & VFS_BIO_NEED_ANY);
964		return (0);
965	}
966
967#if defined(DIAGNOSTIC)
968	if (bp->b_flags & B_BUSY) {
969		panic("getnewbuf: busy buffer on free list\n");
970	}
971#endif
972
973	/*
974	 * We are fairly aggressive about freeing VMIO buffers, but since
975	 * the buffering is intact without buffer headers, there is not
976	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
977	 */
978	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
979		if ((bp->b_flags & B_VMIO) == 0 ||
980			(vmiospace < maxvmiobufspace)) {
981			--bp->b_usecount;
982			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
983			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
984				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
985				goto start;
986			}
987			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
988		}
989	}
990
991
992	/* if we are a delayed write, convert to an async write */
993	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
994
995		if (writerecursion > 0) {
996			bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
997			while (bp) {
998				if ((bp->b_flags & B_DELWRI) == 0)
999					break;
1000				bp = TAILQ_NEXT(bp, b_freelist);
1001			}
1002			if (bp == NULL) {
1003				bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1004				while (bp) {
1005					if ((bp->b_flags & B_DELWRI) == 0)
1006						break;
1007					bp = TAILQ_NEXT(bp, b_freelist);
1008				}
1009			}
1010			if (bp == NULL)
1011				panic("getnewbuf: cannot get buffer, infinite recursion failure");
1012		} else {
1013			++writerecursion;
1014			nbyteswritten += vfs_bio_awrite(bp);
1015			--writerecursion;
1016			if (!slpflag && !slptimeo) {
1017				return (0);
1018			}
1019			goto start;
1020		}
1021	}
1022
1023	if (bp->b_flags & B_WANTED) {
1024		bp->b_flags &= ~B_WANTED;
1025		wakeup(bp);
1026	}
1027	bremfree(bp);
1028	bp->b_flags |= B_BUSY;
1029
1030	if (bp->b_flags & B_VMIO) {
1031		bp->b_flags &= ~B_ASYNC;
1032		vfs_vmio_release(bp);
1033	}
1034
1035	if (bp->b_vp)
1036		brelvp(bp);
1037
1038fillbuf:
1039	/* we are not free, nor do we contain interesting data */
1040	if (bp->b_rcred != NOCRED) {
1041		crfree(bp->b_rcred);
1042		bp->b_rcred = NOCRED;
1043	}
1044	if (bp->b_wcred != NOCRED) {
1045		crfree(bp->b_wcred);
1046		bp->b_wcred = NOCRED;
1047	}
1048
1049	LIST_REMOVE(bp, b_hash);
1050	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1051	if (bp->b_bufsize) {
1052		allocbuf(bp, 0);
1053	}
1054	bp->b_flags = B_BUSY;
1055	bp->b_dev = NODEV;
1056	bp->b_vp = NULL;
1057	bp->b_blkno = bp->b_lblkno = 0;
1058	bp->b_iodone = 0;
1059	bp->b_error = 0;
1060	bp->b_resid = 0;
1061	bp->b_bcount = 0;
1062	bp->b_npages = 0;
1063	bp->b_dirtyoff = bp->b_dirtyend = 0;
1064	bp->b_validoff = bp->b_validend = 0;
1065	bp->b_usecount = 4;
1066
1067	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1068
1069	/*
1070	 * we assume that buffer_map is not at address 0
1071	 */
1072	addr = 0;
1073	if (maxsize != bp->b_kvasize) {
1074		bfreekva(bp);
1075
1076		/*
1077		 * See if we have buffer kva space
1078		 */
1079		if (vm_map_findspace(buffer_map,
1080			vm_map_min(buffer_map), maxsize, &addr)) {
1081			bp->b_flags |= B_INVAL;
1082			brelse(bp);
1083			goto trytofreespace;
1084		}
1085	}
1086
1087	/*
1088	 * See if we have exceeded our allocated maximum
1089	 */
1090	if (bufspace >= (maxbufspace + nbyteswritten)) {
1091		bp->b_flags |= B_INVAL;
1092		brelse(bp);
1093		goto trytofreespace;
1094	}
1095
1096	/*
1097	 * create a map entry for the buffer -- in essence
1098	 * reserving the kva space.
1099	 */
1100	if (addr) {
1101		vm_map_insert(buffer_map, NULL, 0,
1102			addr, addr + maxsize,
1103			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1104
1105		bp->b_kvabase = (caddr_t) addr;
1106		bp->b_kvasize = maxsize;
1107	}
1108	bp->b_data = bp->b_kvabase;
1109
1110	return (bp);
1111}
1112
1113static void
1114waitfreebuffers(int slpflag, int slptimeo) {
1115	while (numfreebuffers < hifreebuffers) {
1116		flushdirtybuffers(slpflag, slptimeo);
1117		if (numfreebuffers >= hifreebuffers)
1118			break;
1119		needsbuffer |= VFS_BIO_NEED_FREE;
1120		if (tsleep(&needsbuffer, PRIBIO|slpflag, "biofre", slptimeo))
1121			break;
1122	}
1123}
1124
1125static void
1126flushdirtybuffers(int slpflag, int slptimeo) {
1127	int s;
1128	static pid_t flushing = 0;
1129
1130	s = splbio();
1131
1132	if (flushing) {
1133		if (flushing == curproc->p_pid) {
1134			splx(s);
1135			return;
1136		}
1137		while (flushing) {
1138			if (tsleep(&flushing, PRIBIO|slpflag, "biofls", slptimeo)) {
1139				splx(s);
1140				return;
1141			}
1142		}
1143	}
1144	flushing = curproc->p_pid;
1145
1146	while (numdirtybuffers > lodirtybuffers) {
1147		struct buf *bp;
1148		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1149		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1150		if (bp == NULL)
1151			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1152
1153		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1154			bp = TAILQ_NEXT(bp, b_freelist);
1155		}
1156
1157		if (bp) {
1158			splx(s);
1159			vfs_bio_awrite(bp);
1160			s = splbio();
1161			continue;
1162		}
1163		break;
1164	}
1165
1166	flushing = 0;
1167	wakeup(&flushing);
1168	splx(s);
1169}
1170
1171/*
1172 * Check to see if a block is currently memory resident.
1173 */
1174struct buf *
1175incore(struct vnode * vp, daddr_t blkno)
1176{
1177	struct buf *bp;
1178
1179	int s = splbio();
1180	bp = gbincore(vp, blkno);
1181	splx(s);
1182	return (bp);
1183}
1184
1185/*
1186 * Returns true if no I/O is needed to access the
1187 * associated VM object.  This is like incore except
1188 * it also hunts around in the VM system for the data.
1189 */
1190
1191int
1192inmem(struct vnode * vp, daddr_t blkno)
1193{
1194	vm_object_t obj;
1195	vm_offset_t toff, tinc;
1196	vm_page_t m;
1197	vm_ooffset_t off;
1198
1199	if (incore(vp, blkno))
1200		return 1;
1201	if (vp->v_mount == NULL)
1202		return 0;
1203	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
1204		return 0;
1205
1206	obj = vp->v_object;
1207	tinc = PAGE_SIZE;
1208	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1209		tinc = vp->v_mount->mnt_stat.f_iosize;
1210	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1211
1212	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1213
1214		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1215		if (!m)
1216			return 0;
1217		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
1218			return 0;
1219	}
1220	return 1;
1221}
1222
1223/*
1224 * now we set the dirty range for the buffer --
1225 * for NFS -- if the file is mapped and pages have
1226 * been written to, let it know.  We want the
1227 * entire range of the buffer to be marked dirty if
1228 * any of the pages have been written to for consistency
1229 * with the b_validoff, b_validend set in the nfs write
1230 * code, and used by the nfs read code.
1231 */
1232static void
1233vfs_setdirty(struct buf *bp) {
1234	int i;
1235	vm_object_t object;
1236	vm_offset_t boffset, offset;
1237	/*
1238	 * We qualify the scan for modified pages on whether the
1239	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1240	 * is not cleared simply by protecting pages off.
1241	 */
1242	if ((bp->b_flags & B_VMIO) &&
1243		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1244		/*
1245		 * test the pages to see if they have been modified directly
1246		 * by users through the VM system.
1247		 */
1248		for (i = 0; i < bp->b_npages; i++)
1249			vm_page_test_dirty(bp->b_pages[i]);
1250
1251		/*
1252		 * scan forwards for the first page modified
1253		 */
1254		for (i = 0; i < bp->b_npages; i++) {
1255			if (bp->b_pages[i]->dirty) {
1256				break;
1257			}
1258		}
1259		boffset = (i << PAGE_SHIFT);
1260		if (boffset < bp->b_dirtyoff) {
1261			bp->b_dirtyoff = boffset;
1262		}
1263
1264		/*
1265		 * scan backwards for the last page modified
1266		 */
1267		for (i = bp->b_npages - 1; i >= 0; --i) {
1268			if (bp->b_pages[i]->dirty) {
1269				break;
1270			}
1271		}
1272		boffset = (i + 1);
1273		offset = boffset + bp->b_pages[0]->pindex;
1274		if (offset >= object->size)
1275			boffset = object->size - bp->b_pages[0]->pindex;
1276		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1277			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1278	}
1279}
1280
1281/*
1282 * Get a block given a specified block and offset into a file/device.
1283 */
1284struct buf *
1285getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1286{
1287	struct buf *bp;
1288	int s;
1289	struct bufhashhdr *bh;
1290	int maxsize;
1291	static pid_t flushing = 0;
1292
1293	if (vp->v_mount) {
1294		maxsize = vp->v_mount->mnt_stat.f_iosize;
1295		/*
1296		 * This happens on mount points.
1297		 */
1298		if (maxsize < size)
1299			maxsize = size;
1300	} else {
1301		maxsize = size;
1302	}
1303
1304#if !defined(MAX_PERF)
1305	if (size > MAXBSIZE)
1306		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1307#endif
1308
1309	s = splbio();
1310loop:
1311	if (numfreebuffers < lofreebuffers) {
1312		waitfreebuffers(slpflag, slptimeo);
1313	}
1314
1315	if ((bp = gbincore(vp, blkno))) {
1316		if (bp->b_flags & B_BUSY) {
1317			bp->b_flags |= B_WANTED;
1318			if (bp->b_usecount < BUF_MAXUSE)
1319				++bp->b_usecount;
1320			if (!tsleep(bp,
1321				(PRIBIO + 1) | slpflag, "getblk", slptimeo))
1322				goto loop;
1323
1324			splx(s);
1325			return (struct buf *) NULL;
1326		}
1327		bp->b_flags |= B_BUSY | B_CACHE;
1328		bremfree(bp);
1329
1330		/*
1331		 * check for size inconsistencies (note that they shouldn't
1332		 * happen but do when filesystems don't handle the size changes
1333		 * correctly.) We are conservative on metadata and don't just
1334		 * extend the buffer but write and re-constitute it.
1335		 */
1336
1337		if (bp->b_bcount != size) {
1338			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1339				allocbuf(bp, size);
1340			} else {
1341				bp->b_flags |= B_NOCACHE;
1342				VOP_BWRITE(bp);
1343				goto loop;
1344			}
1345		}
1346
1347		if (bp->b_usecount < BUF_MAXUSE)
1348			++bp->b_usecount;
1349		splx(s);
1350		return (bp);
1351	} else {
1352		vm_object_t obj;
1353
1354		if ((bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize)) == 0) {
1355			if (slpflag || slptimeo) {
1356				splx(s);
1357				return NULL;
1358			}
1359			goto loop;
1360		}
1361
1362		/*
1363		 * This code is used to make sure that a buffer is not
1364		 * created while the getnewbuf routine is blocked.
1365		 * Normally the vnode is locked so this isn't a problem.
1366		 * VBLK type I/O requests, however, don't lock the vnode.
1367		 */
1368		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
1369			bp->b_flags |= B_INVAL;
1370			brelse(bp);
1371			goto loop;
1372		}
1373
1374		/*
1375		 * Insert the buffer into the hash, so that it can
1376		 * be found by incore.
1377		 */
1378		bp->b_blkno = bp->b_lblkno = blkno;
1379		bgetvp(vp, bp);
1380		LIST_REMOVE(bp, b_hash);
1381		bh = BUFHASH(vp, blkno);
1382		LIST_INSERT_HEAD(bh, bp, b_hash);
1383
1384		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
1385			bp->b_flags |= (B_VMIO | B_CACHE);
1386#if defined(VFS_BIO_DEBUG)
1387			if (vp->v_type != VREG && vp->v_type != VBLK)
1388				printf("getblk: vmioing file type %d???\n", vp->v_type);
1389#endif
1390		} else {
1391			bp->b_flags &= ~B_VMIO;
1392		}
1393		splx(s);
1394
1395		allocbuf(bp, size);
1396#ifdef	PC98
1397		/*
1398		 * 1024byte/sector support
1399		 */
1400#define B_XXX2 0x8000000
1401		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1402#endif
1403		return (bp);
1404	}
1405}
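/*
 * Illustrative caller sketch, not part of this file; lbn and bsize are
 * hypothetical.  Code that creates a brand new block, rather than reading
 * an existing one, commonly does:
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	vfs_bio_clrbuf(bp);		zero whatever is not already valid
 *	... fill in bp->b_data ...
 *	bdwrite(bp);
 */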
1406
1407/*
1408 * Get an empty, disassociated buffer of given size.
1409 */
1410struct buf *
1411geteblk(int size)
1412{
1413	struct buf *bp;
1414	int s;
1415
1416	s = splbio();
1417	while ((bp = getnewbuf(0, 0, 0, size, MAXBSIZE)) == 0);
1418	splx(s);
1419	allocbuf(bp, size);
1420	bp->b_flags |= B_INVAL;
1421	return (bp);
1422}
1423
1424
1425/*
1426 * This code constitutes the buffer memory from either anonymous system
1427 * memory (in the case of non-VMIO operations) or from an associated
1428 * VM object (in the case of VMIO operations).
1429 *
1430 * Note that this code is tricky, and has many complications to resolve
1431 * deadlock or inconsistent data situations.  Tread lightly!!!
1432 *
1433 * Modify the length of a buffer's underlying buffer storage without
1434 * destroying information (unless, of course the buffer is shrinking).
1435 */
1436int
1437allocbuf(struct buf * bp, int size)
1438{
1439
1440	int s;
1441	int newbsize, mbsize;
1442	int i;
1443
1444#if !defined(MAX_PERF)
1445	if (!(bp->b_flags & B_BUSY))
1446		panic("allocbuf: buffer not busy");
1447
1448	if (bp->b_kvasize < size)
1449		panic("allocbuf: buffer too small");
1450#endif
1451
1452	if ((bp->b_flags & B_VMIO) == 0) {
1453		caddr_t origbuf;
1454		int origbufsize;
1455		/*
1456		 * Just get anonymous memory from the kernel
1457		 */
1458		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1459#if !defined(NO_B_MALLOC)
1460		if (bp->b_flags & B_MALLOC)
1461			newbsize = mbsize;
1462		else
1463#endif
1464			newbsize = round_page(size);
1465
1466		if (newbsize < bp->b_bufsize) {
1467#if !defined(NO_B_MALLOC)
1468			/*
1469			 * malloced buffers are not shrunk
1470			 */
1471			if (bp->b_flags & B_MALLOC) {
1472				if (newbsize) {
1473					bp->b_bcount = size;
1474				} else {
1475					free(bp->b_data, M_BIOBUF);
1476					bufspace -= bp->b_bufsize;
1477					bufmallocspace -= bp->b_bufsize;
1478					bp->b_data = bp->b_kvabase;
1479					bp->b_bufsize = 0;
1480					bp->b_bcount = 0;
1481					bp->b_flags &= ~B_MALLOC;
1482				}
1483				return 1;
1484			}
1485#endif
1486			vm_hold_free_pages(
1487			    bp,
1488			    (vm_offset_t) bp->b_data + newbsize,
1489			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1490		} else if (newbsize > bp->b_bufsize) {
1491#if !defined(NO_B_MALLOC)
1492			/*
1493			 * We only use malloced memory on the first allocation,
1494			 * and revert to page-allocated memory when the buffer grows.
1495			 */
1496			if ( (bufmallocspace < maxbufmallocspace) &&
1497				(bp->b_bufsize == 0) &&
1498				(mbsize <= PAGE_SIZE/2)) {
1499
1500				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1501				bp->b_bufsize = mbsize;
1502				bp->b_bcount = size;
1503				bp->b_flags |= B_MALLOC;
1504				bufspace += mbsize;
1505				bufmallocspace += mbsize;
1506				return 1;
1507			}
1508#endif
1509			origbuf = NULL;
1510			origbufsize = 0;
1511#if !defined(NO_B_MALLOC)
1512			/*
1513			 * If the buffer is growing on its other-than-first allocation,
1514			 * then we revert to the page-allocation scheme.
1515			 */
1516			if (bp->b_flags & B_MALLOC) {
1517				origbuf = bp->b_data;
1518				origbufsize = bp->b_bufsize;
1519				bp->b_data = bp->b_kvabase;
1520				bufspace -= bp->b_bufsize;
1521				bufmallocspace -= bp->b_bufsize;
1522				bp->b_bufsize = 0;
1523				bp->b_flags &= ~B_MALLOC;
1524				newbsize = round_page(newbsize);
1525			}
1526#endif
1527			vm_hold_load_pages(
1528			    bp,
1529			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1530			    (vm_offset_t) bp->b_data + newbsize);
1531#if !defined(NO_B_MALLOC)
1532			if (origbuf) {
1533				bcopy(origbuf, bp->b_data, origbufsize);
1534				free(origbuf, M_BIOBUF);
1535			}
1536#endif
1537		}
1538	} else {
1539		vm_page_t m;
1540		int desiredpages;
1541
1542		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1543		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1544
1545#if !defined(NO_B_MALLOC)
1546		if (bp->b_flags & B_MALLOC)
1547			panic("allocbuf: VMIO buffer can't be malloced");
1548#endif
1549
1550		if (newbsize < bp->b_bufsize) {
1551			if (desiredpages < bp->b_npages) {
1552				for (i = desiredpages; i < bp->b_npages; i++) {
1553					/*
1554					 * the page is not freed here -- it
1555					 * is the responsibility of vnode_pager_setsize
1556					 */
1557					m = bp->b_pages[i];
1558#if defined(DIAGNOSTIC)
1559					if (m == bogus_page)
1560						panic("allocbuf: bogus page found");
1561#endif
1562					s = splvm();
1563					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1564						m->flags |= PG_WANTED;
1565						tsleep(m, PVM, "biodep", 0);
1566					}
1567					splx(s);
1568
1569					bp->b_pages[i] = NULL;
1570					vm_page_unwire(m);
1571				}
1572				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1573				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1574				bp->b_npages = desiredpages;
1575			}
1576		} else if (newbsize > bp->b_bufsize) {
1577			vm_object_t obj;
1578			vm_offset_t tinc, toff;
1579			vm_ooffset_t off;
1580			vm_pindex_t objoff;
1581			int pageindex, curbpnpages;
1582			struct vnode *vp;
1583			int bsize;
1584
1585			vp = bp->b_vp;
1586
1587			if (vp->v_type == VBLK)
1588				bsize = DEV_BSIZE;
1589			else
1590				bsize = vp->v_mount->mnt_stat.f_iosize;
1591
1592			if (bp->b_npages < desiredpages) {
1593				obj = vp->v_object;
1594				tinc = PAGE_SIZE;
1595				if (tinc > bsize)
1596					tinc = bsize;
1597				off = (vm_ooffset_t) bp->b_lblkno * bsize;
1598				curbpnpages = bp->b_npages;
1599		doretry:
1600				bp->b_flags |= B_CACHE;
1601				bp->b_validoff = bp->b_validend = 0;
1602				for (toff = 0; toff < newbsize; toff += tinc) {
1603					int bytesinpage;
1604
1605					pageindex = toff >> PAGE_SHIFT;
1606					objoff = OFF_TO_IDX(off + toff);
1607					if (pageindex < curbpnpages) {
1608
1609						m = bp->b_pages[pageindex];
1610#ifdef VFS_BIO_DIAG
1611						if (m->pindex != objoff)
1612							panic("allocbuf: page changed offset??!!!?");
1613#endif
1614						bytesinpage = tinc;
1615						if (tinc > (newbsize - toff))
1616							bytesinpage = newbsize - toff;
1617						if (bp->b_flags & B_CACHE)
1618							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1619						continue;
1620					}
1621					m = vm_page_lookup(obj, objoff);
1622					if (!m) {
1623						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1624						if (!m) {
1625							VM_WAIT;
1626							goto doretry;
1627						}
1628						/*
1629						 * Normally it is unwise to clear PG_BUSY without
1630						 * PAGE_WAKEUP -- but it is okay here, as there is
1631						 * no chance for blocking between here and vm_page_alloc
1632						 */
1633						m->flags &= ~PG_BUSY;
1634						vm_page_wire(m);
1635						bp->b_flags &= ~B_CACHE;
1636					} else if (m->flags & PG_BUSY) {
1637						s = splvm();
1638						if (m->flags & PG_BUSY) {
1639							m->flags |= PG_WANTED;
1640							tsleep(m, PVM, "pgtblk", 0);
1641						}
1642						splx(s);
1643						goto doretry;
1644					} else {
1645						if ((curproc != pageproc) &&
1646							((m->queue - m->pc) == PQ_CACHE) &&
1647						    ((cnt.v_free_count + cnt.v_cache_count) <
1648								(cnt.v_free_min + cnt.v_cache_min))) {
1649							pagedaemon_wakeup();
1650						}
1651						bytesinpage = tinc;
1652						if (tinc > (newbsize - toff))
1653							bytesinpage = newbsize - toff;
1654						if (bp->b_flags & B_CACHE)
1655							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1656						vm_page_wire(m);
1657					}
1658					bp->b_pages[pageindex] = m;
1659					curbpnpages = pageindex + 1;
1660				}
1661				if (vp->v_tag == VT_NFS &&
1662				    vp->v_type != VBLK) {
1663					if (bp->b_dirtyend > 0) {
1664						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1665						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1666					}
1667					if (bp->b_validend == 0)
1668						bp->b_flags &= ~B_CACHE;
1669				}
1670				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1671				bp->b_npages = curbpnpages;
1672				pmap_qenter((vm_offset_t) bp->b_data,
1673					bp->b_pages, bp->b_npages);
1674				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1675			}
1676		}
1677	}
1678	if (bp->b_flags & B_VMIO)
1679		vmiospace += (newbsize - bp->b_bufsize);
1680	bufspace += (newbsize - bp->b_bufsize);
1681	bp->b_bufsize = newbsize;
1682	bp->b_bcount = size;
1683	return 1;
1684}
1685
1686/*
1687 * Wait for buffer I/O completion, returning error status.
1688 */
1689int
1690biowait(register struct buf * bp)
1691{
1692	int s;
1693
1694	s = splbio();
1695	while ((bp->b_flags & B_DONE) == 0)
1696#if defined(NO_SCHEDULE_MODS)
1697		tsleep(bp, PRIBIO, "biowait", 0);
1698#else
1699		tsleep(bp, curproc->p_usrpri, "biowait", 0);
1700#endif
1701	splx(s);
1702	if (bp->b_flags & B_EINTR) {
1703		bp->b_flags &= ~B_EINTR;
1704		return (EINTR);
1705	}
1706	if (bp->b_flags & B_ERROR) {
1707		return (bp->b_error ? bp->b_error : EIO);
1708	} else {
1709		return (0);
1710	}
1711}
1712
1713/*
1714 * Finish I/O on a buffer, calling an optional function.
1715 * This is usually called from interrupt level, so process blocking
1716 * is not *a good idea*.
1717 */
1718void
1719biodone(register struct buf * bp)
1720{
1721	int s;
1722
1723	s = splbio();
1724
1725#if !defined(MAX_PERF)
1726	if (!(bp->b_flags & B_BUSY))
1727		panic("biodone: buffer not busy");
1728#endif
1729
1730	if (bp->b_flags & B_DONE) {
1731		splx(s);
1732#if !defined(MAX_PERF)
1733		printf("biodone: buffer already done\n");
1734#endif
1735		return;
1736	}
1737	bp->b_flags |= B_DONE;
1738
1739	if ((bp->b_flags & B_READ) == 0) {
1740		vwakeup(bp);
1741	}
1742#ifdef BOUNCE_BUFFERS
1743	if (bp->b_flags & B_BOUNCE)
1744		vm_bounce_free(bp);
1745#endif
1746
1747	/* call optional completion function if requested */
1748	if (bp->b_flags & B_CALL) {
1749		bp->b_flags &= ~B_CALL;
1750		(*bp->b_iodone) (bp);
1751		splx(s);
1752		return;
1753	}
1754	if (bp->b_flags & B_VMIO) {
1755		int i, resid;
1756		vm_ooffset_t foff;
1757		vm_page_t m;
1758		vm_object_t obj;
1759		int iosize;
1760		struct vnode *vp = bp->b_vp;
1761
1762		obj = vp->v_object;
1763
1764#if defined(VFS_BIO_DEBUG)
1765		if (vp->v_usecount == 0) {
1766			panic("biodone: zero vnode ref count");
1767		}
1768
1769		if (vp->v_object == NULL) {
1770			panic("biodone: missing VM object");
1771		}
1772
1773		if ((vp->v_flag & VVMIO) == 0) {
1774			panic("biodone: vnode is not setup for merged cache");
1775		}
1776#endif
1777
1778		if (vp->v_type == VBLK)
1779			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1780		else
1781			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1782#if !defined(MAX_PERF)
1783		if (!obj) {
1784			panic("biodone: no object");
1785		}
1786#endif
1787#if defined(VFS_BIO_DEBUG)
1788		if (obj->paging_in_progress < bp->b_npages) {
1789			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1790			    obj->paging_in_progress, bp->b_npages);
1791		}
1792#endif
1793		iosize = bp->b_bufsize;
1794		for (i = 0; i < bp->b_npages; i++) {
1795			int bogusflag = 0;
1796			m = bp->b_pages[i];
1797			if (m == bogus_page) {
1798				bogusflag = 1;
1799				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1800				if (!m) {
1801#if defined(VFS_BIO_DEBUG)
1802					printf("biodone: page disappeared\n");
1803#endif
1804					--obj->paging_in_progress;
1805					continue;
1806				}
1807				bp->b_pages[i] = m;
1808				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1809			}
1810#if defined(VFS_BIO_DEBUG)
1811			if (OFF_TO_IDX(foff) != m->pindex) {
1812				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1813			}
1814#endif
1815			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1816			if (resid > iosize)
1817				resid = iosize;
1818			/*
1819			 * In the write case, the valid and clean bits are
1820			 * already changed correctly, so we only need to do this
1821			 * here in the read case.
1822			 */
1823			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1824				vfs_page_set_valid(bp, foff, i, m);
1825			}
1826
1827			/*
1828			 * when debugging new filesystems or buffer I/O methods, this
1829			 * is the most common error that pops up.  if you see this, you
1830			 * have not set the page busy flag correctly!!!
1831			 */
1832			if (m->busy == 0) {
1833#if !defined(MAX_PERF)
1834				printf("biodone: page busy < 0, "
1835				    "pindex: %d, foff: 0x(%x,%x), "
1836				    "resid: %d, index: %d\n",
1837				    (int) m->pindex, (int)(foff >> 32),
1838						(int) foff & 0xffffffff, resid, i);
1839#endif
1840				if (vp->v_type != VBLK)
1841#if !defined(MAX_PERF)
1842					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1843					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1844					    (int) bp->b_lblkno,
1845					    bp->b_flags, bp->b_npages);
1846				else
1847					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1848					    (int) bp->b_lblkno,
1849					    bp->b_flags, bp->b_npages);
1850				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1851				    m->valid, m->dirty, m->wire_count);
1852#endif
1853				panic("biodone: page busy < 0\n");
1854			}
1855			--m->busy;
1856			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1857				m->flags &= ~PG_WANTED;
1858				wakeup(m);
1859			}
1860			--obj->paging_in_progress;
1861			foff += resid;
1862			iosize -= resid;
1863		}
1864		if (obj && obj->paging_in_progress == 0 &&
1865		    (obj->flags & OBJ_PIPWNT)) {
1866			obj->flags &= ~OBJ_PIPWNT;
1867			wakeup(obj);
1868		}
1869	}
1870	/*
1871	 * For asynchronous completions, release the buffer now. The brelse
1872	 * checks for B_WANTED and will do the wakeup there if necessary - so
1873	 * no need to do a wakeup here in the async case.
1874	 */
1875
1876	if (bp->b_flags & B_ASYNC) {
1877		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
1878			brelse(bp);
1879		else
1880			bqrelse(bp);
1881	} else {
1882		bp->b_flags &= ~B_WANTED;
1883		wakeup(bp);
1884	}
1885	splx(s);
1886}
1887
1888int
1889count_lock_queue()
1890{
1891	int count;
1892	struct buf *bp;
1893
1894	count = 0;
1895	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
1896	    bp != NULL;
1897	    bp = TAILQ_NEXT(bp, b_freelist))
1898		count++;
1899	return (count);
1900}
1901
1902int vfs_update_interval = 30;
1903
1904static void
1905vfs_update()
1906{
1907	while (1) {
1908		tsleep(&vfs_update_wakeup, PUSER, "update",
1909		    hz * vfs_update_interval);
1910		vfs_update_wakeup = 0;
1911		sync(curproc, NULL, NULL);
1912	}
1913}
1914
1915static int
1916sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1917{
1918	int error = sysctl_handle_int(oidp,
1919		oidp->oid_arg1, oidp->oid_arg2, req);
1920	if (!error)
1921		wakeup(&vfs_update_wakeup);
1922	return error;
1923}
1924
1925SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1926	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
1927
1928
1929/*
1930 * This routine is called in lieu of iodone in the case of
1931 * incomplete I/O.  This keeps the busy status for pages
1932 * consistent.
1933 */
1934void
1935vfs_unbusy_pages(struct buf * bp)
1936{
1937	int i;
1938
1939	if (bp->b_flags & B_VMIO) {
1940		struct vnode *vp = bp->b_vp;
1941		vm_object_t obj = vp->v_object;
1942		vm_ooffset_t foff;
1943
1944		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1945
1946		for (i = 0; i < bp->b_npages; i++) {
1947			vm_page_t m = bp->b_pages[i];
1948
1949			if (m == bogus_page) {
1950				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1951#if !defined(MAX_PERF)
1952				if (!m) {
1953					panic("vfs_unbusy_pages: page missing\n");
1954				}
1955#endif
1956				bp->b_pages[i] = m;
1957				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1958			}
1959			--obj->paging_in_progress;
1960			--m->busy;
1961			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1962				m->flags &= ~PG_WANTED;
1963				wakeup(m);
1964			}
1965		}
1966		if (obj->paging_in_progress == 0 &&
1967		    (obj->flags & OBJ_PIPWNT)) {
1968			obj->flags &= ~OBJ_PIPWNT;
1969			wakeup(obj);
1970		}
1971	}
1972}
1973
1974/*
1975 * Set NFS' b_validoff and b_validend fields from the valid bits
1976 * of a page.  If the consumer is not NFS, and the page is not
1977 * valid for the entire range, clear the B_CACHE flag to force
1978 * the consumer to re-read the page.
1979 */
1980static void
1981vfs_buf_set_valid(struct buf *bp,
1982		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
1983		  vm_page_t m)
1984{
1985	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
1986		vm_offset_t svalid, evalid;
1987		int validbits = m->valid;
1988
1989		/*
1990		 * This only bothers with the first valid range in the
1991		 * page.
1992		 */
1993		svalid = off;
1994		while (validbits && !(validbits & 1)) {
1995			svalid += DEV_BSIZE;
1996			validbits >>= 1;
1997		}
1998		evalid = svalid;
1999		while (validbits & 1) {
2000			evalid += DEV_BSIZE;
2001			validbits >>= 1;
2002		}
2003		/*
2004		 * Make sure this range is contiguous with the range
2005		 * built up from previous pages.  If not, then we will
2006		 * just use the range from the previous pages.
2007		 */
2008		if (svalid == bp->b_validend) {
2009			bp->b_validoff = min(bp->b_validoff, svalid);
2010			bp->b_validend = max(bp->b_validend, evalid);
2011		}
2012	} else if (!vm_page_is_valid(m,
2013				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2014				     size)) {
2015		bp->b_flags &= ~B_CACHE;
2016	}
2017}
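/*
 * Worked example, assuming 4K pages so m->valid has one bit per DEV_BSIZE
 * chunk: if only the first half of the page is valid, m->valid == 0x0f;
 * the scan above leaves svalid == off and advances evalid to
 * off + 4 * DEV_BSIZE == off + 2048, the first contiguous valid run.
 */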
2018
2019/*
2020 * Set the valid bits in a page, taking care of the b_validoff,
2021 * b_validend fields which NFS uses to optimise small reads.  Off is
2022 * the offset within the file and pageno is the page index within the buf.
2023 */
2024static void
2025vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2026{
2027	struct vnode *vp = bp->b_vp;
2028	vm_ooffset_t soff, eoff;
2029
2030	soff = off;
2031	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2032	vm_page_set_invalid(m,
2033			    (vm_offset_t) (soff & PAGE_MASK),
2034			    (vm_offset_t) (eoff - soff));
2035	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2036		vm_ooffset_t sv, ev;
2037		off = off - pageno * PAGE_SIZE;
2038		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2039		ev = off + (bp->b_validend & ~(DEV_BSIZE - 1));
2040		soff = max(sv, soff);
2041		eoff = min(ev, eoff);
2042	}
2043	if (eoff > soff)
2044		vm_page_set_validclean(m,
2045				       (vm_offset_t) (soff & PAGE_MASK),
2046				       (vm_offset_t) (eoff - soff));
2047}
2048
2049/*
2050 * This routine is called before a device strategy routine.
2051 * It is used to tell the VM system that paging I/O is in
2052 * progress, and treat the pages associated with the buffer
2053 * almost as being PG_BUSY.  Also the object paging_in_progress
2054 * count is handled to make sure that the object doesn't become
2055 * inconsistent.
2056 */
2057void
2058vfs_busy_pages(struct buf * bp, int clear_modify)
2059{
2060	int i;
2061
2062	if (bp->b_flags & B_VMIO) {
2063		struct vnode *vp = bp->b_vp;
2064		vm_object_t obj = vp->v_object;
2065		vm_ooffset_t foff;
2066
2067		if (vp->v_type == VBLK)
2068			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2069		else
2070			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2071		vfs_setdirty(bp);
2072		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2073			vm_page_t m = bp->b_pages[i];
2074
2075			if ((bp->b_flags & B_CLUSTER) == 0) {
2076				obj->paging_in_progress++;
2077				m->busy++;
2078			}
2079			vm_page_protect(m, VM_PROT_NONE);
2080			if (clear_modify)
2081				vfs_page_set_valid(bp, foff, i, m);
2082			else if (bp->b_bcount >= PAGE_SIZE) {
2083				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2084					bp->b_pages[i] = bogus_page;
2085					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2086				}
2087			}
2088		}
2089	}
2090}
2091
2092/*
2093 * Tell the VM system that the pages associated with this buffer
2094 * are clean.  This is used for delayed writes where the data is
2095 * going to go to disk eventually without additional VM intervention.
2096 */
2097void
2098vfs_clean_pages(struct buf * bp)
2099{
2100	int i;
2101
2102	if (bp->b_flags & B_VMIO) {
2103		struct vnode *vp = bp->b_vp;
2104		vm_object_t obj = vp->v_object;
2105		vm_ooffset_t foff;
2106
2107		if (vp->v_type == VBLK)
2108			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2109		else
2110			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2111		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2112			vm_page_t m = bp->b_pages[i];
2113
2114			vfs_page_set_valid(bp, foff, i, m);
2115		}
2116	}
2117}
2118
2119void
2120vfs_bio_clrbuf(struct buf *bp) {
2121	int i;
2122	if( bp->b_flags & B_VMIO) {
2123		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2124			int mask;
2125			mask = 0;
2126			for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
2127				mask |= (1 << (i/DEV_BSIZE));
2128			if( bp->b_pages[0]->valid != mask) {
2129				bzero(bp->b_data, bp->b_bufsize);
2130			}
2131			bp->b_pages[0]->valid = mask;
2132			bp->b_resid = 0;
2133			return;
2134		}
2135		for(i=0;i<bp->b_npages;i++) {
2136			if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2137				continue;
2138			if( bp->b_pages[i]->valid == 0) {
2139				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2140					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2141				}
2142			} else {
2143				int j;
2144				for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
2145					if( (bp->b_pages[i]->valid & (1<<j)) == 0)
2146						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2147				}
2148			}
2149			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
2150		}
2151		bp->b_resid = 0;
2152	} else {
2153		clrbuf(bp);
2154	}
2155}
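/*
 * Worked example with hypothetical sizes: for a 2048 byte buffer backed by
 * a single page, the mask built above is (1<<0)|(1<<1)|(1<<2)|(1<<3) == 0x0f,
 * so the bzero() of the whole buffer is skipped only when exactly those four
 * DEV_BSIZE chunks are already marked valid.
 */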
2156
2157/*
2158 * vm_hold_load_pages and vm_hold_free_pages get pages into
2159 * a buffer's address space.  The pages are anonymous and are
2160 * not associated with a file object.
2161 */
2162void
2163vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2164{
2165	vm_offset_t pg;
2166	vm_page_t p;
2167	int index;
2168
2169	to = round_page(to);
2170	from = round_page(from);
2171	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2172
2173	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2174
2175tryagain:
2176
2177		p = vm_page_alloc(kernel_object,
2178			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2179		    VM_ALLOC_NORMAL);
2180		if (!p) {
2181			VM_WAIT;
2182			goto tryagain;
2183		}
2184		vm_page_wire(p);
2185		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2186		bp->b_pages[index] = p;
2187		PAGE_WAKEUP(p);
2188	}
2189	bp->b_npages = index;
2190}
2191
2192void
2193vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2194{
2195	vm_offset_t pg;
2196	vm_page_t p;
2197	int index, newnpages;
2198
2199	from = round_page(from);
2200	to = round_page(to);
2201	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2202
2203	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2204		p = bp->b_pages[index];
2205		if (p && (index < bp->b_npages)) {
2206#if !defined(MAX_PERF)
2207			if (p->busy) {
2208				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2209					bp->b_blkno, bp->b_lblkno);
2210			}
2211#endif
2212			bp->b_pages[index] = NULL;
2213			pmap_kremove(pg);
2214			vm_page_unwire(p);
2215			vm_page_free(p);
2216		}
2217	}
2218	bp->b_npages = newnpages;
2219}
2220
2221
2222#include "opt_ddb.h"
2223#ifdef DDB
2224#include <ddb/ddb.h>
2225
2226DB_SHOW_COMMAND(buffer, db_show_buffer)
2227{
2228	/* get args */
2229	struct buf *bp = (struct buf *)addr;
2230
2231	if (!have_addr) {
2232		db_printf("usage: show buffer <addr>\n");
2233		return;
2234	}
2235
2236	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2237		  bp->b_flags, "\20\40bounce\37cluster\36vmio\35ram\34ordered"
2238		  "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26tape"
2239		  "\25read\24raw\23phys\22clusterok\21malloc\20nocache"
2240		  "\17locked\16inval\15gathered\14error\13eintr\12done\11dirty"
2241		  "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age");
2242	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2243		  "b_resid = %ld\nb_dev = 0x%x, b_un.b_addr = %p, "
2244		  "b_blkno = %d, b_pblkno = %d\n",
2245		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2246		  bp->b_dev, bp->b_un.b_addr, bp->b_blkno, bp->b_pblkno);
2247	if (bp->b_npages) {
2248		int i;
2249		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2250		for (i = 0; i < bp->b_npages; i++) {
2251			vm_page_t m;
2252			m = bp->b_pages[i];
2253			db_printf("(0x%x, 0x%x, 0x%x)", m->object, m->pindex,
2254				VM_PAGE_TO_PHYS(m));
2255			if ((i + 1) < bp->b_npages)
2256				db_printf(",");
2257		}
2258		db_printf("\n");
2259	}
2260}
2261#endif /* DDB */
2262