vfs_cluster.c revision 6837
11541Srgrimes/*-
21541Srgrimes * Copyright (c) 1993
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
45455Sdg * Modifications/enhancements:
55455Sdg * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
61541Srgrimes *
71541Srgrimes * Redistribution and use in source and binary forms, with or without
81541Srgrimes * modification, are permitted provided that the following conditions
91541Srgrimes * are met:
101541Srgrimes * 1. Redistributions of source code must retain the above copyright
111541Srgrimes *    notice, this list of conditions and the following disclaimer.
121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
131541Srgrimes *    notice, this list of conditions and the following disclaimer in the
141541Srgrimes *    documentation and/or other materials provided with the distribution.
151541Srgrimes * 3. All advertising materials mentioning features or use of this software
161541Srgrimes *    must display the following acknowledgement:
171541Srgrimes *	This product includes software developed by the University of
181541Srgrimes *	California, Berkeley and its contributors.
191541Srgrimes * 4. Neither the name of the University nor the names of its contributors
201541Srgrimes *    may be used to endorse or promote products derived from this software
211541Srgrimes *    without specific prior written permission.
221541Srgrimes *
231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
261541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
331541Srgrimes * SUCH DAMAGE.
341541Srgrimes *
351541Srgrimes *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
366837Sdg * $Id: vfs_cluster.c,v 1.10 1995/02/22 09:39:20 davidg Exp $
371541Srgrimes */
381541Srgrimes
391541Srgrimes#include <sys/param.h>
401549Srgrimes#include <sys/systm.h>
411541Srgrimes#include <sys/proc.h>
421541Srgrimes#include <sys/buf.h>
431541Srgrimes#include <sys/vnode.h>
441541Srgrimes#include <sys/mount.h>
451541Srgrimes#include <sys/trace.h>
461541Srgrimes#include <sys/malloc.h>
471541Srgrimes#include <sys/resourcevar.h>
485455Sdg#include <sys/vmmeter.h>
495455Sdg#include <miscfs/specfs/specdev.h>
506621Sdg#include <vm/vm.h>
516621Sdg#include <vm/vm_pageout.h>
521541Srgrimes
531541Srgrimes#ifdef DEBUG
541541Srgrimes#include <vm/vm.h>
551541Srgrimes#include <sys/sysctl.h>
563055Sdgint doreallocblks = 0;
575455Sdgstruct ctldebug debug13 = {"doreallocblks", &doreallocblks};
585455Sdg
591541Srgrimes#else
601541Srgrimes/* XXX for cluster_write */
613055Sdg#define doreallocblks 0
621541Srgrimes#endif
631541Srgrimes
/*
 * Local declarations
 */
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
    daddr_t, daddr_t, long, int, long));
void cluster_wbuild __P((struct vnode *, struct buf *, long, daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

/*
 * Global read-clustering statistics: number of cluster reads issued and
 * total blocks read through them (incremented in cluster_read()).
 */
int totreads;
int totreadblocks;

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
	int doclusterraz = 0;

#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
/*
 * Non-DIAGNOSTIC variant: a read is "sequential" when the requested block
 * is at, or immediately after, the last block read on this vnode
 * (v_lastr).  The "(blk) != 0" test is deliberately commented out, so a
 * read of block zero also counts as sequential here.
 */
#define ISSEQREAD(vp, blk) \
	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
971541Srgrimes
981541Srgrimes/*
991541Srgrimes * This replaces bread.  If this is a bread at the beginning of a file and
1001541Srgrimes * lastr is 0, we assume this is the first read and we'll read up to two
1011541Srgrimes * blocks if they are sequential.  After that, we'll do regular read ahead
1021541Srgrimes * in clustered chunks.
1031541Srgrimes * 	bp is the block requested.
1041541Srgrimes *	rbp is the read-ahead block.
1051541Srgrimes *	If either is NULL, then you don't have to do the I/O.
1061541Srgrimes */
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;	/* vnode being read */
	u_quad_t filesize;	/* file size; bounds the read-ahead */
	daddr_t lblkno;		/* requested logical block */
	long size;		/* filesystem block size */
	struct ucred *cred;	/* NOTE(review): unused in this revision */
	struct buf **bpp;	/* OUT: buffer for the requested block */
{
	struct buf *bp, *rbp;
	daddr_t blkno, rablkno, origlblkno;
	long flags;		/* NOTE(review): set nowhere; unused local */
	int error, num_ra, alreadyincore;

	origlblkno = lblkno;
	error = 0;
	/*
	 * get the requested block
	 */
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		int i;		/* NOTE(review): unused in this branch */

		if (!ISSEQREAD(vp, origlblkno)) {
			/* non-sequential: remember high-water mark, halve window */
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			vp->v_ralen >>= 1;
			return 0;
		} else if( vp->v_maxra >= origlblkno) {
			/*
			 * Read-ahead already covers this block: grow the
			 * window (capped at MAXPHYS worth of blocks) and,
			 * if the window is still fully covered, do nothing.
			 * Otherwise continue read-ahead from v_maxra.
			 */
			if ((vp->v_ralen + 1) < (MAXPHYS / size))
				vp->v_ralen++;
			if ( vp->v_maxra >= (origlblkno + vp->v_ralen))
				return 0;
			lblkno = vp->v_maxra;
		}
		/* cached hit: no synchronous I/O needed, only read-ahead */
		bp = NULL;
	} else {
		/*
		 * if it isn't in the cache, then get a chunk from disk if
		 * sequential, otherwise just get the block.
		 */
		bp->b_flags |= B_READ;
		lblkno += 1;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
	}
	/*
	 * if ralen is "none", then try a little
	 */
	if (vp->v_ralen == 0)
		vp->v_ralen = 1;
	/*
	 * assume no read-ahead
	 */
	alreadyincore = 1;
	rablkno = lblkno;

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	if (ISSEQREAD(vp, origlblkno)) {
		int i;

		/*
		 * this code makes sure that the stuff that we have read-ahead
		 * is still in the cache.  If it isn't, we have been reading
		 * ahead too much, and we need to back-off, otherwise we might
		 * try to read more.
		 */
		for (i = 0; i < vp->v_ralen; i++) {
			rablkno = lblkno + i;
			alreadyincore = (int) incore(vp, rablkno);
			if (!alreadyincore) {
				if (rablkno < vp->v_maxra) {
					/*
					 * A previously read-ahead block fell
					 * out of the cache before use: we were
					 * too aggressive.  Shrink the window.
					 */
					vp->v_maxra = rablkno;
					vp->v_ralen >>= 1;
					alreadyincore = 1;
				} else {
					/*
					 * Block has no buffer header but its
					 * pages may still be in VM (inmem);
					 * then just advance the high-water
					 * mark.  Otherwise widen the window
					 * and start read-ahead here.
					 */
					if (inmem(vp, rablkno)) {
						if( vp->v_maxra < rablkno)
							vp->v_maxra = rablkno + 1;
						continue;
					}
					if ((vp->v_ralen + 1) < MAXPHYS / size)
						vp->v_ralen++;
				}
				break;
			} else if( vp->v_maxra < rablkno) {
				vp->v_maxra = rablkno + 1;
			}
		}
	}
	/*
	 * we now build the read-ahead buffer if it is desirable.
	 * Conditions: target not cached, inside the file, mappable via
	 * VOP_BMAP, and backed by a real disk block (blkno != -1, i.e.
	 * not a hole).
	 */
	rbp = NULL;
	if (!alreadyincore &&
	    (rablkno + 1) * size <= filesize &&
	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		if ((vp->v_ralen + 1) < MAXPHYS / size)
			vp->v_ralen++;
		if (num_ra > vp->v_ralen)
			num_ra = vp->v_ralen;

		if (num_ra) {
			/* contiguous on disk: build one clustered transfer */
			rbp = cluster_rbuild(vp, filesize,
			    NULL, rablkno, blkno, size, num_ra, B_READ | B_ASYNC);
		} else {
			/* single-block asynchronous read-ahead */
			rbp = getblk(vp, rablkno, size, 0, 0);
			rbp->b_flags |= B_READ | B_ASYNC;
			rbp->b_blkno = blkno;
		}
	}

	/*
	 * if the synchronous read is a cluster, handle it, otherwise do a
	 * simple, non-clustered read.
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else {
			vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			totreads++;
			totreadblocks += bp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
		if (error || (rbp->b_flags & B_CACHE)) {
			/* sync read failed or block appeared: drop read-ahead */
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else {
			vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			totreads++;
			totreadblocks += rbp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/* wait only for the synchronous portion; read-ahead completes async */
	if (bp && ((bp->b_flags & B_ASYNC) == 0))
		return (biowait(bp));
	return (error);
}
2611541Srgrimes
2621541Srgrimes/*
2631541Srgrimes * If blocks are contiguous on disk, use this to provide clustered
2641541Srgrimes * read ahead.  We will read as many blocks as possible sequentially
2651541Srgrimes * and then parcel them up into logical blocks in the buffer hash table.
2661541Srgrimes */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;	/* vnode being read */
	u_quad_t filesize;	/* file size; clips the run at EOF */
	struct buf *bp;		/* first buffer, or NULL to allocate one */
	daddr_t lbn;		/* starting logical block */
	daddr_t blkno;		/* starting physical (device) block */
	long size;		/* filesystem block size */
	int run;		/* number of contiguous blocks after lbn */
	long flags;		/* B_* flags applied to component buffers */
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/* don't let the cluster extend past end of file */
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		/* degenerate cluster: fall back to a single-block buffer */
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return (bp);
	}
	tbp = bp;
	if (!tbp) {
		tbp = getblk(vp, lbn, size, 0, 0);
	}
	if (tbp->b_flags & B_CACHE) {
		/* first block already valid: no cluster I/O needed */
		return (tbp);
	} else if (bp == NULL) {
		tbp->b_flags |= B_ASYNC;
	}
	/*
	 * Use a pseudo-buffer (pbuf) as the master transfer header; its
	 * completion routine (cluster_callback) fans the result back out
	 * to the component buffers saved in b_saveaddr.
	 */
	bp = getpbuf();
	bp->b_flags = flags | B_CALL | B_BUSY | B_CLUSTER;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	/* single allocation: cluster_save header + child pointer array */
	b_save = malloc(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **) (b_save + 1);
	bp->b_saveaddr = b_save;

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	if (tbp->b_flags & B_VMIO)
		bp->b_flags |= B_VMIO;

	/* device blocks per filesystem block */
	inc = btodb(size);
	for (bn = blkno, i = 0; i <= run; ++i, bn += inc) {
		if (i != 0) {
			tbp = getblk(vp, lbn + i, size, 0, 0);
			/*
			 * Stop extending the cluster at the first block that
			 * is already cached or whose VMIO-ness differs from
			 * the master buffer's.
			 */
			if ((tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO)) {
				brelse(tbp);
				break;
			}
			tbp->b_blkno = bn;
			tbp->b_flags |= flags | B_READ | B_ASYNC;
		} else {
			tbp->b_flags |= flags | B_READ;
		}
		++b_save->bs_nchildren;
		b_save->bs_children[i] = tbp;
		/* append the component buffer's pages to the master buffer */
		for (j = 0; j < tbp->b_npages; j += 1) {
			bp->b_pages[j + bp->b_npages] = tbp->b_pages[j];
		}
		bp->b_npages += tbp->b_npages;
		bp->b_bcount += size;
		bp->b_bufsize += size;
	}
	/* map the gathered pages into the pbuf's KVA for the transfer */
	pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}
3531541Srgrimes
3541541Srgrimes/*
3551541Srgrimes * Cleanup after a clustered read or write.
3561541Srgrimes * This is complicated by the fact that any of the buffers might have
3571541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time)
3581541Srgrimes * that we will need to shift around.
3591541Srgrimes */
void
cluster_callback(bp)
	struct buf *bp;		/* master cluster pbuf being completed */
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	caddr_t cp;		/* NOTE(review): unused local */
	int error = 0;

	/*
	 * Must propogate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *) (bp->b_saveaddr);
	/* unmap the cluster's pages from the pbuf's KVA */
	pmap_qremove((vm_offset_t) bp->b_data, bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
	}
	free(b_save, M_SEGMENT);
	/* return the pseudo-buffer allocated by getpbuf()/trypbuf() */
	relpbuf(bp);
}
3921541Srgrimes
3931541Srgrimes/*
3941541Srgrimes * Do clustered write for FFS.
3951541Srgrimes *
3961541Srgrimes * Three cases:
3971541Srgrimes *	1. Write is not sequential (write asynchronously)
3981541Srgrimes *	Write is sequential:
3991541Srgrimes *	2.	beginning of cluster - begin cluster
4001541Srgrimes *	3.	middle of a cluster - add to cluster
4011541Srgrimes *	4.	end of a cluster - asynchronously write cluster
4021541Srgrimes */
void
cluster_write(bp, filesize)
	struct buf *bp;		/* delayed-write buffer just filled */
	u_quad_t filesize;	/* current file size */
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;		/* filesystem block size */

	vp = bp->b_vp;
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	/*
	 * Case 1: no cluster in progress, write is not logically
	 * sequential, or not physically contiguous with the last block.
	 */
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			cluster_wbuild(vp, NULL, lblocksize,
			    vp->v_cstart, cursize, lbn);
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((lbn + 1) * lblocksize != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
			bp->b_blkno == -1)) {
			/* unmappable or a hole: write it now, reset state */
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	/* remember last logical and physical block written */
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
4771541Srgrimes
4781541Srgrimes
4791541Srgrimes/*
4801541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined.
4811541Srgrimes * The last lbn argument is the current block on which I/O is being
4821541Srgrimes * performed.  Check to see that it doesn't fall in the middle of
4831541Srgrimes * the current block (if last_bp == NULL).
4841541Srgrimes */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;	/* vnode being written */
	struct buf *last_bp;	/* final buffer of the cluster, or NULL */
	long size;		/* filesystem block size */
	daddr_t start_lbn;	/* first logical block of the cluster */
	int len;		/* number of blocks in the cluster */
	daddr_t lbn;		/* block currently being written (see above) */
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp, *pb;
	caddr_t cp;		/* NOTE(review): unused local */
	int i, j, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	if( (lbn != -1) || (last_bp == 0)) {
		/*
		 * Skip leading blocks that are missing from the cache, busy,
		 * or are the in-progress block (lbn), shrinking the range.
		 */
		while ((!(tbp = incore(vp, start_lbn)) || (tbp->b_flags & B_BUSY)
			|| (start_lbn == lbn)) && len) {
			++start_lbn;
			--len;
		}

		/* non-blocking pbuf allocation; 0 if none available */
		pb = (struct buf *) trypbuf();
		/* Get more memory for current buffer */
		if (len <= 1 || pb == 0) {
			/*
			 * Nothing left to cluster (or no pbuf): write the
			 * remaining block(s) singly and bail out.
			 */
			relpbuf(pb);
			if (last_bp) {
				bawrite(last_bp);
			} else if (len) {
				bp = getblk(vp, start_lbn, size, 0, 0);
				bawrite(bp);
			}
			return;
		}
		tbp = getblk(vp, start_lbn, size, 0, 0);
	} else {
		/* lbn == -1 && last_bp != NULL: start from the passed buffer */
		tbp = last_bp;
		if( tbp->b_flags & B_BUSY) {
			printf("vfs_cluster: warning: buffer already busy\n");
		}
		tbp->b_flags |= B_BUSY;
		last_bp = 0;
		pb = (struct buf *) trypbuf();
		if( pb == 0) {
			bawrite(tbp);
			return;
		}
	}

	/* first block has no dirty data: skip it and retry */
	if (!(tbp->b_flags & B_DELWRI)) {
		relpbuf(pb);
		++start_lbn;
		--len;
		brelse(tbp);
		goto redo;
	}
	/*
	 * Extra memory in the buffer, punt on this buffer. XXX we could
	 * handle this in most cases, but we would have to push the extra
	 * memory down to after our max possible cluster size and then
	 * potentially pull it back up if the cluster was terminated
	 * prematurely--too much hassle.
	 */
	if (tbp->b_bcount != tbp->b_bufsize) {
		relpbuf(pb);
		++start_lbn;
		--len;
		bawrite(tbp);
		goto redo;
	}
	/* the pbuf becomes the master transfer header for the cluster */
	bp = pb;
	/* single allocation: cluster_save header + child pointer array */
	b_save = malloc(sizeof(struct buf *) * (len + 1) + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **) (b_save + 1);
	bp->b_saveaddr = b_save;
	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	if (tbp->b_flags & B_VMIO)
		bp->b_flags |= B_VMIO;

	bp->b_blkno = tbp->b_blkno;
	bp->b_lblkno = tbp->b_lblkno;
	bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER;
	bp->b_iodone = cluster_callback;
	pbgetvp(vp, bp);

	for (i = 0; i < len; ++i, ++start_lbn) {
		if (i != 0) {
			/*
			 * Block is not in core or the non-sequential block
			 * ending our cluster was part of the cluster (in
			 * which case we don't want to write it twice).
			 */
			if (!(tbp = incore(vp, start_lbn)) ||
			    (last_bp == NULL && start_lbn == lbn))
				break;

			/* only take clean-to-cluster, valid buffers */
			if ((tbp->b_flags & (B_INVAL | B_CLUSTEROK)) != B_CLUSTEROK)
				break;

			/* don't exceed the pbuf's page capacity */
			if ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))
				break;

			/*
			 * Get the desired block buffer (unless it is the
			 * final sequential block whose buffer was passed in
			 * explictly as last_bp).
			 */
			if (last_bp == NULL || start_lbn != lbn) {
				if( tbp->b_flags & B_BUSY)
					break;
				tbp = getblk(vp, start_lbn, size, 0, 0);
				if (!(tbp->b_flags & B_DELWRI) ||
				    ((tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO))) {
					brelse(tbp);
					break;
				}
			} else
				tbp = last_bp;
		}
		/* append the component buffer's pages to the master buffer */
		for (j = 0; j < tbp->b_npages; j += 1) {
			bp->b_pages[j + bp->b_npages] = tbp->b_pages[j];
		}
		bp->b_npages += tbp->b_npages;
		bp->b_bcount += size;
		bp->b_bufsize += size;

		/* component is now being written: clear dirty state, go async */
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();		/* block I/O interrupts while requeueing */
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;
	}
	b_save->bs_nchildren = i;
	/* map the gathered pages into the pbuf's KVA and issue the write */
	pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *) bp->b_pages, bp->b_npages);
	bawrite(bp);

	/* cluster ended early: retry with the remaining blocks */
	if (i < len) {
		len -= i;
		goto redo;
	}
}
6371541Srgrimes
6381541Srgrimes/*
6391541Srgrimes * Collect together all the buffers in a cluster.
6401541Srgrimes * Plus add one additional buffer.
6411541Srgrimes */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;	/* vnode whose pending cluster is collected */
	struct buf *last_bp;	/* final buffer, appended after the cluster */
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	/* blocks from cluster start through the last one written */
	len = vp->v_lastw - vp->v_cstart + 1;
	/* single allocation: header + (len + 1) child buffer pointers */
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	/*
	 * NOTE(review): bread() errors are discarded here; a failed read
	 * leaves that child entry as whatever bread stored.  Caller owns
	 * the returned list and the buffers in it.
	 */
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
663