vfs_bio.c revision 9706
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
 *    is allowed if this notation is included.
 * 5. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.53 1995/07/24 03:16:41 davidg Exp $
 */

/*
 * This file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * has been provided by David Greenman, also of the FreeBSD core team.
 */

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* buffer header pool */
int nbuf;			/* number of buffer headers calculated
				 * elsewhere */
struct swqueue bswlist;

void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;


/*
 * buffers base kva
 */
caddr_t buffers_kva;

/*
 * bogus page -- for I/O to/from partially complete buffers.
 * This is a temporary solution to the problem, but it is not
 * really that bad.  It would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
vm_offset_t bogus_offset;

int bufspace, maxbufspace;

/*
 * advisory minimum for size of LRU queue or VMIO queue
 */
int minbuf;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = buffers_kva + i * MAXBSIZE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
/*
 * maxbufspace is currently calculated to support all filesystem blocks
 * being 8K.  If you happen to use a 16K filesystem, the size of the buffer
 * cache is still the same as it would be for 8K filesystems.  This
 * keeps the size of the buffer cache "in check" for big block filesystems.
 */
	minbuf = nbuf / 3;
	maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);

}
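
/*
 * Sizing example (informational only): maxbufspace allows an average of
 * two pages of cache per buffer header.  With nbuf = 512 and 4K pages,
 * maxbufspace = 2 * (512 + 8) * 4096 = 4,259,840 bytes, i.e. roughly 8K
 * per header, which is where the "all filesystem blocks being 8K"
 * assumption above comes from.
 */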

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}
	return (0);
}
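
/*
 * Illustrative sketch only: the typical filesystem read path built on
 * bread().  "lbn" and "fs_bsize" are hypothetical names for the logical
 * block number and filesystem block size, not identifiers from this file.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, fs_bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use the data at bp->b_data...
 *	brelse(bp);
 */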

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		++readwait;
	}
	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != NULL)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf * bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}
	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if ((oldflags & (B_ASYNC | B_DELWRI)) == (B_ASYNC | B_DELWRI)) {
		reassignbuf(bp, bp->b_vp);
	}

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		}
		brelse(bp);
		return (rtval);
	}
	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf * bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}
	bp->b_flags &= ~(B_READ | B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}

	/*
	 * Doing the bmap here keeps the system from needing to do it
	 * later, perhaps when the system is attempting to do a sync.
	 * Since it is likely that the indirect block -- or whatever
	 * other data structure the filesystem needs -- is still in
	 * memory now, it is a good thing to do this.  Note also that
	 * if the pageout daemon is requesting a sync, there might not
	 * be enough memory to do the bmap then, so it is important
	 * to do it now.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that they think the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	brelse(bp);
	return;
}
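
/*
 * Illustrative sketch only: the read-modify-delayed-write pattern that
 * bdwrite() serves, e.g. repeated small updates to the same metadata
 * block.  "lbn" and "fs_bsize" are hypothetical names.
 *
 *	error = bread(vp, lbn, fs_bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...modify a few bytes at bp->b_data...
 *	bdwrite(bp);
 *
 * The buffer is marked B_DELWRI and released; the actual disk write is
 * deferred until a later sync, a bawrite(), or buffer reclamation in
 * getnewbuf().
 */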

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf * bp)
{
	int s;

	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp);
		return;
	}
	/* anyone need a "free" block? */
	s = splbio();

	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t) &needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup((caddr_t) bp);
	} else if (bp->b_flags & B_VMIO) {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
			brelvp(bp);
	}

	/*
	 * VMIO buffer rundown.  It is not necessary to keep a VMIO buffer
	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
	 * but the VM object is kept around.  The B_NOCACHE flag is used to
	 * invalidate the pages in the VM object.
	 */
	if (bp->b_flags & B_VMIO) {
		vm_offset_t foff;
		vm_object_t obj;
		int i, resid;
		vm_page_t m;
		int iototal = bp->b_bufsize;

		foff = 0;
		obj = 0;
		if (bp->b_npages) {
			if (bp->b_vp && bp->b_vp->v_mount) {
				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
			} else {
				/*
				 * vnode pointer has been ripped away --
				 * probably file gone...
				 */
				foff = bp->b_pages[0]->offset;
			}
		}
		for (i = 0; i < bp->b_npages; i++) {
			m = bp->b_pages[i];
			if (m == bogus_page) {
				m = vm_page_lookup(obj, foff);
				if (!m) {
					panic("brelse: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
			resid = (m->offset + PAGE_SIZE) - foff;
			if (resid > iototal)
				resid = iototal;
			if (resid > 0) {
				/*
				 * Don't invalidate the page if the local machine has already
				 * modified it.  This is the lesser of two evils, and should
				 * be fixed.
				 */
				if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
					vm_page_test_dirty(m);
					if (m->dirty == 0) {
						vm_page_set_invalid(m, foff, resid);
						if (m->valid == 0)
							vm_page_protect(m, VM_PROT_NONE);
					}
				}
			}
			foff += resid;
			iototal -= resid;
		}

		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				--m->bmapped;
				if (m->bmapped == 0) {
					if (m->flags & PG_WANTED) {
						wakeup((caddr_t) m);
						m->flags &= ~PG_WANTED;
					}
					vm_page_test_dirty(m);
					if ((m->dirty & m->valid) == 0 &&
					    (m->flags & PG_REFERENCED) == 0 &&
					    !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
						vm_page_cache(m);
					} else if ((m->flags & PG_ACTIVE) == 0) {
						vm_page_activate(m);
						m->act_count = 0;
					}
				}
			}
			bufspace -= bp->b_bufsize;
			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
			bp->b_npages = 0;
			bp->b_bufsize = 0;
			bp->b_flags &= ~B_VMIO;
			if (bp->b_vp)
				brelvp(bp);
		}
	}
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
		/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
		/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}
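
/*
 * Summary of the queue selection above (informational only):
 *
 *	b_bufsize == 0                           -> QUEUE_EMPTY (header only)
 *	B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF -> head of QUEUE_AGE (reuse first)
 *	B_LOCKED                                 -> QUEUE_LOCKED (not reclaimable)
 *	B_AGE                                    -> tail of QUEUE_AGE (stale but valid)
 *	otherwise                                -> QUEUE_LRU (likely to be reused)
 */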

/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
 */
void
vfs_bio_awrite(struct buf * bp)
{
	int i;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;

	s = splbio();
	if (vp->v_mount && (vp->v_flag & VVMIO) &&
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
		int size = vp->v_mount->mnt_stat.f_iosize;

		for (i = 1; i < MAXPHYS / size; i++) {
			if ((bpa = incore(vp, lblkno + i)) &&
			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
					break;
			} else {
				break;
			}
		}
		ncl = i;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			bremfree(bp);
			cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
			splx(s);
			return;
		}
	}
	/*
	 * default (old) behavior, writing out only one block
	 */
	bremfree(bp);
	bp->b_flags |= B_BUSY | B_ASYNC;
	(void) VOP_BWRITE(bp);
	splx(s);
}
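
/*
 * Informational note on the scan above: starting at lblkno + 1, each
 * successive buffer joins the cluster only if it is (a) resident, (b)
 * marked B_DELWRI | B_CLUSTEROK and neither busy nor invalid, (c) the
 * same size, and (d) physically contiguous, i.e. its b_blkno equals
 * bp->b_blkno + (i * size) / DEV_BSIZE.  The first buffer failing any
 * test ends the cluster, and the ncl buffers found are written in a
 * single pass by cluster_wbuild().
 */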


/*
 * Find a buffer header which is available for use.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int doingvmio)
{
	struct buf *bp;
	int s;

	s = splbio();
start:
	if (bufspace >= maxbufspace)
		goto trytofreespace;

	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}
trytofreespace:
	/*
	 * We keep file I/O from hogging metadata I/O.
	 * This is desirable because file data is cached in the
	 * VM/Buffer cache even if a buffer is freed.
	 */
	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
	}
	if (!bp) {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
		vfs_bio_awrite(bp);
		if (!slpflag && !slptimeo) {
			splx(s);
			return (0);
		}
		goto start;
	}

	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	bremfree(bp);

	if (bp->b_flags & B_VMIO) {
		bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
		brelse(bp);
		bremfree(bp);
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags |= B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	if (bp->b_bufsize) {
		allocbuf(bp, 0);
	}
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_npages = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	if (bufspace >= maxbufspace) {
		s = splbio();
		bp->b_flags |= B_INVAL;
		brelse(bp);
		goto trytofreespace;
	}
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t off, toff, tinc;
	vm_page_t m;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == 0)
		return 0;
	if ((vp->v_object == 0) || (vp->v_flag & VVMIO) == 0)
		return 0;

	obj = vp->v_object;
	tinc = PAGE_SIZE;
	if (tinc > vp->v_mount->mnt_stat.f_iosize)
		tinc = vp->v_mount->mnt_stat.f_iosize;
	off = blkno * vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
		m = vm_page_lookup(obj, trunc_page(toff + off));
		if (!m)
			return 0;
		if (vm_page_is_valid(m, toff + off, tinc) == 0)
			return 0;
	}
	return 1;
}
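
/*
 * Worked example (informational only): with f_iosize = 8192 and 4K
 * pages, tinc = PAGE_SIZE and off = blkno * 8192, so the loop above
 * probes both pages backing the block; with f_iosize = 512, tinc drops
 * to 512 and a single page is probed once at sub-page granularity
 * through vm_page_is_valid().
 */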

/*
 * Now we set the dirty range for the buffer --
 * for NFS -- if the file is mapped and pages have
 * been written to, let it know.  We want the
 * entire range of the buffer to be marked dirty if
 * any of the pages have been written to for consistency
 * with the b_validoff, b_validend set in the nfs write
 * code, and used by the nfs read code.
 */
static void
vfs_setdirty(struct buf *bp)
{
	int i;
	vm_object_t object;
	vm_offset_t boffset, offset;

	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) &&
	    ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++)
			vm_page_test_dirty(bp->b_pages[i]);

		/*
		 * scan forwards for the first page modified
		 */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = i * PAGE_SIZE;
		if (boffset < bp->b_dirtyoff) {
			bp->b_dirtyoff = boffset;
		}

		/*
		 * scan backwards for the last page modified
		 */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i + 1) * PAGE_SIZE;
		offset = boffset + bp->b_pages[0]->offset;
		if (offset >= object->size) {
			boffset = object->size - bp->b_pages[0]->offset;
		}
		if (bp->b_dirtyend < boffset) {
			bp->b_dirtyend = boffset;
		}
	}
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
				goto loop;

			splx(s);
			return ((struct buf *) NULL);
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			if (bp->b_flags & B_VMIO) {
				allocbuf(bp, size);
			} else {
				bp->b_flags |= B_NOCACHE;
				VOP_BWRITE(bp);
				goto loop;
			}
		}
		splx(s);
		return (bp);
	} else {
		vm_object_t obj;
		int doingvmio;

		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
			doingvmio = 1;
		} else {
			doingvmio = 0;
		}
		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
			if (slpflag || slptimeo)
				return NULL;
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * Normally the vnode is locked so this isn't a problem.
		 * VBLK type I/O requests, however, don't lock the vnode.
		 */
		if (!VOP_ISLOCKED(vp) && incore(vp, blkno)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);

		if (doingvmio) {
			bp->b_flags |= (B_VMIO | B_CACHE);
#if defined(VFS_BIO_DEBUG)
			if (vp->v_type != VREG)
				printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
		} else {
			bp->b_flags &= ~B_VMIO;
		}
		splx(s);

		allocbuf(bp, size);
		return (bp);
	}
}
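
/*
 * Informational note: getblk() always returns the buffer with B_BUSY
 * set, so the caller owns it exclusively until it is released with
 * brelse(), bdwrite(), or VOP_BWRITE().  If B_CACHE is set the
 * contents are already valid; bread() above is exactly this test plus
 * the vfs_busy_pages()/VOP_STRATEGY()/biowait() read path.
 */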

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 *
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
int
allocbuf(struct buf * bp, int size)
{

	int s;
	int newbsize, mbsize;
	int i;

	if (!(bp->b_flags & B_BUSY))
		panic("allocbuf: buffer not busy");

	if ((bp->b_flags & B_VMIO) == 0) {
		/*
		 * Just get anonymous memory from the kernel
		 */
		mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
		newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
		desiredpages = round_page(newbsize) / PAGE_SIZE;

		if (newbsize < bp->b_bufsize) {
			if (desiredpages < bp->b_npages) {
				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
				for (i = desiredpages; i < bp->b_npages; i++) {
					m = bp->b_pages[i];
					s = splhigh();
					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
						m->flags |= PG_WANTED;
						tsleep(m, PVM, "biodep", 0);
					}
					splx(s);

					if (m->bmapped == 0) {
						printf("allocbuf: bmapped is zero for page %d\n", i);
						panic("allocbuf: error");
					}
					--m->bmapped;
					if (m->bmapped == 0) {
						vm_page_protect(m, VM_PROT_NONE);
						vm_page_free(m);
					}
					bp->b_pages[i] = NULL;
				}
				bp->b_npages = desiredpages;
			}
		} else if (newbsize > bp->b_bufsize) {
			vm_object_t obj;
			vm_offset_t tinc, off, toff, objoff;
			int pageindex, curbpnpages;
			struct vnode *vp;
			int bsize;

			vp = bp->b_vp;
			bsize = vp->v_mount->mnt_stat.f_iosize;

			if (bp->b_npages < desiredpages) {
				obj = vp->v_object;
				tinc = PAGE_SIZE;
				if (tinc > bsize)
					tinc = bsize;
				off = bp->b_lblkno * bsize;
		doretry:
				curbpnpages = bp->b_npages;
				bp->b_flags |= B_CACHE;
				for (toff = 0; toff < newbsize; toff += tinc) {
					int bytesinpage;

					pageindex = toff / PAGE_SIZE;
					objoff = trunc_page(toff + off);
					if (pageindex < curbpnpages) {
						m = bp->b_pages[pageindex];
						if (m->offset != objoff)
							panic("allocbuf: page changed offset??!!!?");
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						continue;
					}
					m = vm_page_lookup(obj, objoff);
					if (!m) {
						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
						if (!m) {
							int j;

							for (j = bp->b_npages; j < pageindex; j++) {
								PAGE_WAKEUP(bp->b_pages[j]);
							}
							VM_WAIT;
							goto doretry;
						}
						vm_page_activate(m);
						m->act_count = 0;
						m->valid = 0;
						bp->b_flags &= ~B_CACHE;
					} else if (m->flags & PG_BUSY) {
						int j;

						for (j = bp->b_npages; j < pageindex; j++) {
							PAGE_WAKEUP(bp->b_pages[j]);
						}

						s = splbio();
						m->flags |= PG_WANTED;
						tsleep(m, PRIBIO, "pgtblk", 0);
						splx(s);

						goto doretry;
					} else {
						if ((curproc != pageproc) &&
						    (m->flags & PG_CACHE) &&
						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
							pagedaemon_wakeup();
						}
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						m->flags |= PG_BUSY;
					}
					bp->b_pages[pageindex] = m;
					curbpnpages = pageindex + 1;
				}
				for (i = bp->b_npages; i < curbpnpages; i++) {
					m = bp->b_pages[i];
					m->bmapped++;
					PAGE_WAKEUP(m);
				}
				bp->b_npages = curbpnpages;
				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
				bp->b_data += off % PAGE_SIZE;
			}
		}
	}
	bufspace += (newbsize - bp->b_bufsize);
	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
	return 1;
}
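
/*
 * Worked example (informational only): growing an empty VMIO buffer to
 * an 8K block with 4K pages gives newbsize = 8192 and desiredpages = 2.
 * The loop above walks the two backing pages, looking each up in the
 * vnode's VM object, allocating any that are absent (which clears
 * B_CACHE, since a fresh page holds no valid data), and busying them;
 * both pages are then entered into the buffer's KVA window with
 * pmap_qenter() and b_bufsize becomes 8192.
 */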

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & B_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf * bp)
{
	int s;

	s = splbio();
	if (!(bp->b_flags & B_BUSY))
		panic("biodone: buffer not busy");

	if (bp->b_flags & B_DONE) {
		splx(s);
		printf("biodone: buffer already done\n");
		return;
	}
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}
#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone) (bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_VMIO) {
		int i, resid;
		vm_offset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		obj = vp->v_object;
		if (!obj) {
			panic("biodone: no object");
		}
#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif
		iosize = bp->b_bufsize;
		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;

			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, foff);
				if (!m) {
#if defined(VFS_BIO_DEBUG)
					printf("biodone: page disappeared\n");
#endif
					--obj->paging_in_progress;
					continue;
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (trunc_page(foff) != m->offset) {
				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
			}
#endif
			resid = (m->offset + PAGE_SIZE) - foff;
			if (resid > iosize)
				resid = iosize;
			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly, so we only need to do this
			 * here in the read case.
			 */
			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
				vm_page_set_valid(m, foff & (PAGE_SIZE - 1), resid);
				vm_page_set_clean(m, foff & (PAGE_SIZE - 1), resid);
			}

			/*
			 * when debugging new filesystems or buffer I/O methods, this
			 * is the most common error that pops up.  if you see this, you
			 * have not set the page busy flag correctly!!!
			 */
			if (m->busy == 0) {
				printf("biodone: page busy < 0, "
				    "off: %ld, foff: %ld, "
				    "resid: %d, index: %d\n",
				    m->offset, foff, resid, i);
				printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
				    bp->b_vp->v_mount->mnt_stat.f_iosize,
				    bp->b_lblkno, bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
				    m->valid, m->dirty, m->bmapped);
				panic("biodone: page busy < 0\n");
			}
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup((caddr_t) m);
			}
			--obj->paging_in_progress;
			foff += resid;
			iosize -= resid;
		}
		if (obj && obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup((caddr_t) obj);
		}
	}
	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary - so
	 * no need to do a wakeup here in the async case.
	 */

	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}
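
/*
 * Illustrative sketch only: how a device driver's interrupt handler
 * typically completes a transfer.  "xx_intr" and "xx_error" are
 * hypothetical driver names, not interfaces defined here.
 *
 *	void
 *	xx_intr(struct buf *bp)
 *	{
 *		if (xx_error()) {
 *			bp->b_flags |= B_ERROR;
 *			bp->b_error = EIO;
 *		}
 *		biodone(bp);
 *	}
 *
 * biodone() then either wakes the biowait() sleeper or, if B_CALL was
 * set, fires the b_iodone callback instead.
 */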

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

int vfs_update_interval = 30;

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_offset_t foff;

		foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
			--obj->paging_in_progress;
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup((caddr_t) m);
			}
		}
		if (obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup((caddr_t) obj);
		}
	}
}

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_object_t obj = bp->b_vp->v_object;
		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		int iocount = bp->b_bufsize;

		vfs_setdirty(bp);
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = (m->offset + PAGE_SIZE) - foff;

			if (resid > iocount)
				resid = iocount;
			obj->paging_in_progress++;
			m->busy++;
			if (clear_modify) {
				vm_page_protect(m, VM_PROT_READ);
				vm_page_set_valid(m,
					foff & (PAGE_SIZE - 1), resid);
				vm_page_set_clean(m,
					foff & (PAGE_SIZE - 1), resid);
			} else if (bp->b_bcount >= PAGE_SIZE) {
				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
					bp->b_pages[i] = bogus_page;
					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
				}
			}
			foff += resid;
			iocount -= resid;
		}
	}
}
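
/*
 * Informational note: vfs_busy_pages() and biodone()/vfs_unbusy_pages()
 * bracket a strategy call.  bread() and bwrite() above call
 * vfs_busy_pages() just before VOP_STRATEGY(); on completion biodone()
 * drops the per-page busy counts and the object's paging_in_progress,
 * while vfs_unbusy_pages() is called in lieu of biodone() when the I/O
 * could not be completed, so the accounting always balances.
 */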

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 */
void
vfs_clean_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_offset_t foff =
			bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		int iocount = bp->b_bufsize;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = (m->offset + PAGE_SIZE) - foff;

			if (resid > iocount)
				resid = iocount;
			if (resid > 0) {
				vm_page_set_valid(m,
					foff & (PAGE_SIZE - 1), resid);
				vm_page_set_clean(m,
					foff & (PAGE_SIZE - 1), resid);
			}
			foff += resid;
			iocount -= resid;
		}
	}
}

void
vfs_bio_clrbuf(struct buf *bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
			int j;

			if (bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
				for (j = 0; j < bp->b_bufsize / DEV_BSIZE; j++) {
					bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			bp->b_resid = 0;
			return;
		}
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
				continue;
			if (bp->b_pages[i]->valid == 0) {
				bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
			} else {
				int j;

				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
						bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
		}
		bp->b_resid = 0;
	} else {
		clrbuf(bp);
	}
}

/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {

tryagain:

		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
		    VM_ALLOC_NORMAL);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
		PAGE_WAKEUP(p);
		bp->b_npages++;
	}
}

void
vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
		pmap_kremove(pg);
		vm_page_free(p);
		--bp->b_npages;
	}
}