vfs_bio.c revision 9706
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
 *    is allowed if this notation is included.
 * 5. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.53 1995/07/24 03:16:41 davidg Exp $
 */

/*
 * This file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * has been provided by David Greenman, also of the FreeBSD core team.
 */

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* buffer header pool */
int nbuf;			/* number of buffer headers calculated
				 * elsewhere */
struct swqueue bswlist;

void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;


/*
 * buffers base kva
 */
caddr_t buffers_kva;

/*
 * bogus page -- for I/O to/from partially complete buffers.
 * This is a temporary solution to the problem, but it is not
 * really that bad.  It would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
vm_offset_t bogus_offset;

int bufspace, maxbufspace;

/*
 * advisory minimum for size of LRU queue or VMIO queue
 */
int minbuf;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = buffers_kva + i * MAXBSIZE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
/*
 * maxbufspace is currently calculated to support all filesystem blocks
 * being 8K.  If you happen to use a 16K filesystem, the size of the buffer
 * cache is still the same as it would be for 8K filesystems.  This
 * keeps the size of the buffer cache "in check" for big block filesystems.
 */
	minbuf = nbuf / 3;
	maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);

}
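
/*
 * Sizing example (informational only): maxbufspace allows an average of
 * two pages of cache per buffer header.  With nbuf = 512 and 4K pages,
 * maxbufspace = 2 * (512 + 8) * 4096 = 4,259,840 bytes, i.e. roughly 8K
 * per header, which is where the "all filesystem blocks being 8K"
 * assumption above comes from.
 */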

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}
	return (0);
}
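
/*
 * Illustrative sketch only: the typical filesystem read path built on
 * bread().  "lbn" and "fs_bsize" are hypothetical names for the logical
 * block number and filesystem block size, not identifiers from this file.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, fs_bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use the data at bp->b_data...
 *	brelse(bp);
 */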

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		++readwait;
	}
	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != NULL)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf * bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}
	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if ((oldflags & (B_ASYNC | B_DELWRI)) == (B_ASYNC | B_DELWRI)) {
		reassignbuf(bp, bp->b_vp);
	}

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		}
		brelse(bp);
		return (rtval);
	}
	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf * bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}
	bp->b_flags &= ~(B_READ | B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}

	/*
	 * Doing the bmap here keeps the system from needing to do it
	 * later, perhaps when the system is attempting to do a sync.
	 * Since it is likely that the indirect block -- or whatever
	 * other data structure the filesystem needs -- is still in
	 * memory now, it is a good thing to do this.  Note also that
	 * if the pageout daemon is requesting a sync, there might not
	 * be enough memory to do the bmap then, so it is important
	 * to do it now.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that they think the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	brelse(bp);
	return;
}
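
/*
 * Illustrative sketch only: the read-modify-delayed-write pattern that
 * bdwrite() serves, e.g. repeated small updates to the same metadata
 * block.  "lbn" and "fs_bsize" are hypothetical names.
 *
 *	error = bread(vp, lbn, fs_bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...modify a few bytes at bp->b_data...
 *	bdwrite(bp);
 *
 * The buffer is marked B_DELWRI and released; the actual disk write is
 * deferred until a later sync, a bawrite(), or buffer reclamation in
 * getnewbuf().
 */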

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf * bp)
{
	int s;

	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp);
		return;
	}
	/* anyone need a "free" block? */
	s = splbio();

	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t) &needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup((caddr_t) bp);
	} else if (bp->b_flags & B_VMIO) {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
			brelvp(bp);
	}

	/*
	 * VMIO buffer rundown.  It is not necessary to keep a VMIO buffer
	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
	 * but the VM object is kept around.  The B_NOCACHE flag is used to
	 * invalidate the pages in the VM object.
	 */
	if (bp->b_flags & B_VMIO) {
		vm_offset_t foff;
		vm_object_t obj;
		int i, resid;
		vm_page_t m;
		int iototal = bp->b_bufsize;

		foff = 0;
		obj = 0;
		if (bp->b_npages) {
			if (bp->b_vp && bp->b_vp->v_mount) {
				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
			} else {
				/*
				 * vnode pointer has been ripped away --
				 * probably file gone...
				 */
				foff = bp->b_pages[0]->offset;
			}
		}
		for (i = 0; i < bp->b_npages; i++) {
			m = bp->b_pages[i];
			if (m == bogus_page) {
				m = vm_page_lookup(obj, foff);
				if (!m) {
					panic("brelse: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
			resid = (m->offset + PAGE_SIZE) - foff;
			if (resid > iototal)
				resid = iototal;
			if (resid > 0) {
				/*
				 * Don't invalidate the page if the local machine has already
				 * modified it.  This is the lesser of two evils, and should
				 * be fixed.
				 */
				if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
					vm_page_test_dirty(m);
					if (m->dirty == 0) {
						vm_page_set_invalid(m, foff, resid);
						if (m->valid == 0)
							vm_page_protect(m, VM_PROT_NONE);
					}
				}
			}
			foff += resid;
			iototal -= resid;
		}

		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				--m->bmapped;
				if (m->bmapped == 0) {
					if (m->flags & PG_WANTED) {
						wakeup((caddr_t) m);
						m->flags &= ~PG_WANTED;
					}
					vm_page_test_dirty(m);
					if ((m->dirty & m->valid) == 0 &&
					    (m->flags & PG_REFERENCED) == 0 &&
					    !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
						vm_page_cache(m);
					} else if ((m->flags & PG_ACTIVE) == 0) {
						vm_page_activate(m);
						m->act_count = 0;
					}
				}
			}
			bufspace -= bp->b_bufsize;
			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
			bp->b_npages = 0;
			bp->b_bufsize = 0;
			bp->b_flags &= ~B_VMIO;
			if (bp->b_vp)
				brelvp(bp);
		}
	}
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
		/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
		/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}
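
/*
 * Summary of the queue selection above (informational only):
 *
 *	b_bufsize == 0                           -> QUEUE_EMPTY (header only)
 *	B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF -> head of QUEUE_AGE (reuse first)
 *	B_LOCKED                                 -> QUEUE_LOCKED (not reclaimable)
 *	B_AGE                                    -> tail of QUEUE_AGE (stale but valid)
 *	otherwise                                -> QUEUE_LRU (likely to be reused)
 */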

/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
 */
void
vfs_bio_awrite(struct buf * bp)
{
	int i;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;

	s = splbio();
	if (vp->v_mount && (vp->v_flag & VVMIO) &&
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
		int size = vp->v_mount->mnt_stat.f_iosize;

		for (i = 1; i < MAXPHYS / size; i++) {
			if ((bpa = incore(vp, lblkno + i)) &&
			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
					break;
			} else {
				break;
			}
		}
		ncl = i;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			bremfree(bp);
			cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
			splx(s);
			return;
		}
	}
	/*
	 * default (old) behavior, writing out only one block
	 */
	bremfree(bp);
	bp->b_flags |= B_BUSY | B_ASYNC;
	(void) VOP_BWRITE(bp);
	splx(s);
}
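
/*
 * Informational note on the scan above: starting at lblkno + 1, each
 * successive buffer joins the cluster only if it is (a) resident, (b)
 * marked B_DELWRI | B_CLUSTEROK and neither busy nor invalid, (c) the
 * same size, and (d) physically contiguous, i.e. its b_blkno equals
 * bp->b_blkno + (i * size) / DEV_BSIZE.  The first buffer failing any
 * test ends the cluster, and the ncl buffers found are written in a
 * single pass by cluster_wbuild().
 */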


/*
 * Find a buffer header which is available for use.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int doingvmio)
{
	struct buf *bp;
	int s;

	s = splbio();
start:
	if (bufspace >= maxbufspace)
		goto trytofreespace;

	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}
trytofreespace:
	/*
	 * We keep file I/O from hogging metadata I/O.
	 * This is desirable because file data is cached in the
	 * VM/Buffer cache even if a buffer is freed.
	 */
	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
	}
	if (!bp) {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
		vfs_bio_awrite(bp);
		if (!slpflag && !slptimeo) {
			splx(s);
			return (0);
		}
		goto start;
	}

	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	bremfree(bp);

	if (bp->b_flags & B_VMIO) {
		bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
		brelse(bp);
		bremfree(bp);
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags |= B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	if (bp->b_bufsize) {
		allocbuf(bp, 0);
	}
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_npages = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	if (bufspace >= maxbufspace) {
		s = splbio();
		bp->b_flags |= B_INVAL;
		brelse(bp);
		goto trytofreespace;
	}
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t off, toff, tinc;
	vm_page_t m;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == 0)
		return 0;
	if ((vp->v_object == 0) || (vp->v_flag & VVMIO) == 0)
		return 0;

	obj = vp->v_object;
	tinc = PAGE_SIZE;
	if (tinc > vp->v_mount->mnt_stat.f_iosize)
		tinc = vp->v_mount->mnt_stat.f_iosize;
	off = blkno * vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
		m = vm_page_lookup(obj, trunc_page(toff + off));
		if (!m)
			return 0;
		if (vm_page_is_valid(m, toff + off, tinc) == 0)
			return 0;
	}
	return 1;
}
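
/*
 * Worked example (informational only): with f_iosize = 8192 and 4K
 * pages, tinc = PAGE_SIZE and off = blkno * 8192, so the loop above
 * probes both pages backing the block; with f_iosize = 512, tinc drops
 * to 512 and a single page is probed once at sub-page granularity
 * through vm_page_is_valid().
 */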

/*
 * Now we set the dirty range for the buffer --
 * for NFS -- if the file is mapped and pages have
 * been written to, let it know.  We want the
 * entire range of the buffer to be marked dirty if
 * any of the pages have been written to for consistency
 * with the b_validoff, b_validend set in the nfs write
 * code, and used by the nfs read code.
 */
static void
vfs_setdirty(struct buf *bp)
{
	int i;
	vm_object_t object;
	vm_offset_t boffset, offset;

	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) &&
	    ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++)
			vm_page_test_dirty(bp->b_pages[i]);

		/*
		 * scan forwards for the first page modified
		 */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = i * PAGE_SIZE;
		if (boffset < bp->b_dirtyoff) {
			bp->b_dirtyoff = boffset;
		}

		/*
		 * scan backwards for the last page modified
		 */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i + 1) * PAGE_SIZE;
		offset = boffset + bp->b_pages[0]->offset;
		if (offset >= object->size) {
			boffset = object->size - bp->b_pages[0]->offset;
		}
		if (bp->b_dirtyend < boffset) {
			bp->b_dirtyend = boffset;
		}
	}
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
				goto loop;

			splx(s);
			return ((struct buf *) NULL);
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			if (bp->b_flags & B_VMIO) {
				allocbuf(bp, size);
			} else {
				bp->b_flags |= B_NOCACHE;
				VOP_BWRITE(bp);
				goto loop;
			}
		}
		splx(s);
		return (bp);
	} else {
		vm_object_t obj;
		int doingvmio;

		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
			doingvmio = 1;
		} else {
			doingvmio = 0;
		}
		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
			if (slpflag || slptimeo)
				return NULL;
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * Normally the vnode is locked so this isn't a problem.
		 * VBLK type I/O requests, however, don't lock the vnode.
		 */
		if (!VOP_ISLOCKED(vp) && incore(vp, blkno)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);

		if (doingvmio) {
			bp->b_flags |= (B_VMIO | B_CACHE);
#if defined(VFS_BIO_DEBUG)
			if (vp->v_type != VREG)
				printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
		} else {
			bp->b_flags &= ~B_VMIO;
		}
		splx(s);

		allocbuf(bp, size);
		return (bp);
	}
}
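
/*
 * Informational note: getblk() always returns the buffer with B_BUSY
 * set, so the caller owns it exclusively until it is released with
 * brelse(), bdwrite(), or VOP_BWRITE().  If B_CACHE is set the
 * contents are already valid; bread() above is exactly this test plus
 * the vfs_busy_pages()/VOP_STRATEGY()/biowait() read path.
 */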

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 *
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
int
allocbuf(struct buf * bp, int size)
{

	int s;
	int newbsize, mbsize;
	int i;

	if (!(bp->b_flags & B_BUSY))
		panic("allocbuf: buffer not busy");

	if ((bp->b_flags & B_VMIO) == 0) {
		/*
		 * Just get anonymous memory from the kernel
		 */
		mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
		newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
		desiredpages = round_page(newbsize) / PAGE_SIZE;

		if (newbsize < bp->b_bufsize) {
			if (desiredpages < bp->b_npages) {
				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
				for (i = desiredpages; i < bp->b_npages; i++) {
					m = bp->b_pages[i];
					s = splhigh();
					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
						m->flags |= PG_WANTED;
						tsleep(m, PVM, "biodep", 0);
					}
					splx(s);

					if (m->bmapped == 0) {
						printf("allocbuf: bmapped is zero for page %d\n", i);
						panic("allocbuf: error");
					}
					--m->bmapped;
					if (m->bmapped == 0) {
						vm_page_protect(m, VM_PROT_NONE);
						vm_page_free(m);
					}
					bp->b_pages[i] = NULL;
				}
				bp->b_npages = desiredpages;
			}
		} else if (newbsize > bp->b_bufsize) {
			vm_object_t obj;
			vm_offset_t tinc, off, toff, objoff;
			int pageindex, curbpnpages;
			struct vnode *vp;
			int bsize;

			vp = bp->b_vp;
			bsize = vp->v_mount->mnt_stat.f_iosize;

			if (bp->b_npages < desiredpages) {
				obj = vp->v_object;
				tinc = PAGE_SIZE;
				if (tinc > bsize)
					tinc = bsize;
				off = bp->b_lblkno * bsize;
		doretry:
				curbpnpages = bp->b_npages;
				bp->b_flags |= B_CACHE;
				for (toff = 0; toff < newbsize; toff += tinc) {
					int bytesinpage;

					pageindex = toff / PAGE_SIZE;
					objoff = trunc_page(toff + off);
					if (pageindex < curbpnpages) {
						m = bp->b_pages[pageindex];
						if (m->offset != objoff)
							panic("allocbuf: page changed offset??!!!?");
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						continue;
					}
					m = vm_page_lookup(obj, objoff);
					if (!m) {
						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
						if (!m) {
							int j;

							for (j = bp->b_npages; j < pageindex; j++) {
								PAGE_WAKEUP(bp->b_pages[j]);
							}
							VM_WAIT;
							goto doretry;
						}
						vm_page_activate(m);
						m->act_count = 0;
						m->valid = 0;
						bp->b_flags &= ~B_CACHE;
					} else if (m->flags & PG_BUSY) {
						int j;

						for (j = bp->b_npages; j < pageindex; j++) {
							PAGE_WAKEUP(bp->b_pages[j]);
						}

						s = splbio();
						m->flags |= PG_WANTED;
						tsleep(m, PRIBIO, "pgtblk", 0);
						splx(s);

						goto doretry;
					} else {
						if ((curproc != pageproc) &&
						    (m->flags & PG_CACHE) &&
						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
							pagedaemon_wakeup();
						}
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						m->flags |= PG_BUSY;
					}
					bp->b_pages[pageindex] = m;
					curbpnpages = pageindex + 1;
				}
				for (i = bp->b_npages; i < curbpnpages; i++) {
					m = bp->b_pages[i];
					m->bmapped++;
					PAGE_WAKEUP(m);
				}
				bp->b_npages = curbpnpages;
				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
				bp->b_data += off % PAGE_SIZE;
			}
		}
	}
	bufspace += (newbsize - bp->b_bufsize);
	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
	return 1;
}
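
/*
 * Worked example (informational only): growing an empty VMIO buffer to
 * an 8K block with 4K pages gives newbsize = 8192 and desiredpages = 2.
 * The loop above walks the two backing pages, looking each up in the
 * vnode's VM object, allocating any that are absent (which clears
 * B_CACHE, since a fresh page holds no valid data), and busying them;
 * both pages are then entered into the buffer's KVA window with
 * pmap_qenter() and b_bufsize becomes 8192.
 */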

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & B_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf * bp)
{
	int s;

	s = splbio();
	if (!(bp->b_flags & B_BUSY))
		panic("biodone: buffer not busy");

	if (bp->b_flags & B_DONE) {
		splx(s);
		printf("biodone: buffer already done\n");
		return;
	}
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}
#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone) (bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_VMIO) {
		int i, resid;
		vm_offset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		obj = vp->v_object;
		if (!obj) {
			panic("biodone: no object");
		}
#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif
		iosize = bp->b_bufsize;
		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;

			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, foff);
				if (!m) {
#if defined(VFS_BIO_DEBUG)
					printf("biodone: page disappeared\n");
#endif
					--obj->paging_in_progress;
					continue;
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (trunc_page(foff) != m->offset) {
				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
			}
#endif
			resid = (m->offset + PAGE_SIZE) - foff;
			if (resid > iosize)
				resid = iosize;
			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly, so we only need to do this
			 * here in the read case.
			 */
			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
				vm_page_set_valid(m, foff & (PAGE_SIZE - 1), resid);
				vm_page_set_clean(m, foff & (PAGE_SIZE - 1), resid);
			}

			/*
			 * when debugging new filesystems or buffer I/O methods, this
			 * is the most common error that pops up.  if you see this, you
			 * have not set the page busy flag correctly!!!
			 */
			if (m->busy == 0) {
				printf("biodone: page busy < 0, "
				    "off: %ld, foff: %ld, "
				    "resid: %d, index: %d\n",
				    m->offset, foff, resid, i);
				printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
				    bp->b_vp->v_mount->mnt_stat.f_iosize,
				    bp->b_lblkno, bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
				    m->valid, m->dirty, m->bmapped);
				panic("biodone: page busy < 0\n");
			}
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup((caddr_t) m);
			}
			--obj->paging_in_progress;
			foff += resid;
			iosize -= resid;
		}
		if (obj && obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup((caddr_t) obj);
		}
	}
	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary - so
	 * no need to do a wakeup here in the async case.
	 */

	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}
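
/*
 * Illustrative sketch only: how a device driver's interrupt handler
 * typically completes a transfer.  "xx_intr" and "xx_error" are
 * hypothetical driver names, not interfaces defined here.
 *
 *	void
 *	xx_intr(struct buf *bp)
 *	{
 *		if (xx_error()) {
 *			bp->b_flags |= B_ERROR;
 *			bp->b_error = EIO;
 *		}
 *		biodone(bp);
 *	}
 *
 * biodone() then either wakes the biowait() sleeper or, if B_CALL was
 * set, fires the b_iodone callback instead.
 */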

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

int vfs_update_interval = 30;

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_offset_t foff;

		foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
			--obj->paging_in_progress;
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup((caddr_t) m);
			}
		}
		if (obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup((caddr_t) obj);
		}
	}
}

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_object_t obj = bp->b_vp->v_object;
		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		int iocount = bp->b_bufsize;

		vfs_setdirty(bp);
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = (m->offset + PAGE_SIZE) - foff;

			if (resid > iocount)
				resid = iocount;
			obj->paging_in_progress++;
			m->busy++;
			if (clear_modify) {
				vm_page_protect(m, VM_PROT_READ);
				vm_page_set_valid(m,
					foff & (PAGE_SIZE - 1), resid);
				vm_page_set_clean(m,
					foff & (PAGE_SIZE - 1), resid);
			} else if (bp->b_bcount >= PAGE_SIZE) {
				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
					bp->b_pages[i] = bogus_page;
					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
				}
			}
			foff += resid;
			iocount -= resid;
		}
	}
}
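
/*
 * Informational note: vfs_busy_pages() and biodone()/vfs_unbusy_pages()
 * bracket a strategy call.  bread() and bwrite() above call
 * vfs_busy_pages() just before VOP_STRATEGY(); on completion biodone()
 * drops the per-page busy counts and the object's paging_in_progress,
 * while vfs_unbusy_pages() is called in lieu of biodone() when the I/O
 * could not be completed, so the accounting always balances.
 */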

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 */
void
vfs_clean_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_offset_t foff =
			bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		int iocount = bp->b_bufsize;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = (m->offset + PAGE_SIZE) - foff;

			if (resid > iocount)
				resid = iocount;
			if (resid > 0) {
				vm_page_set_valid(m,
					foff & (PAGE_SIZE - 1), resid);
				vm_page_set_clean(m,
					foff & (PAGE_SIZE - 1), resid);
			}
			foff += resid;
			iocount -= resid;
		}
	}
}

void
vfs_bio_clrbuf(struct buf *bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
			int j;

			if (bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
				for (j = 0; j < bp->b_bufsize / DEV_BSIZE; j++) {
					bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			bp->b_resid = 0;
			return;
		}
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
				continue;
			if (bp->b_pages[i]->valid == 0) {
				bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
			} else {
				int j;

				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
						bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
		}
		bp->b_resid = 0;
	} else {
		clrbuf(bp);
	}
}

/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {

tryagain:

		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
		    VM_ALLOC_NORMAL);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
		PAGE_WAKEUP(p);
		bp->b_npages++;
	}
}

void
vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
		pmap_kremove(pg);
		vm_page_free(p);
		--bp->b_npages;
	}
}