vfs_cluster.c revision 10541
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.17 1995/06/28 12:31:47 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <miscfs/specfs/specdev.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#ifdef DEBUG
#include <sys/sysctl.h>
int doreallocblks = 0;
struct ctldebug debug13 = {"doreallocblks", &doreallocblks};

#else
/* XXX for cluster_write */
#define doreallocblks 0
#endif
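
/*
 * When doreallocblks is nonzero, cluster_write() may attempt
 * VOP_REALLOCBLKS() to relocate a nearly complete cluster so that it
 * ends up physically contiguous on disk; when zero, partial clusters
 * are simply pushed out as they stand.
 */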

/*
 * Local declarations
 */
static struct buf *cluster_rbuild __P((struct vnode *, u_quad_t,
    daddr_t, daddr_t, long, int));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

int totreads;
int totreadblocks;
extern vm_page_t bogus_page;

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int doclusterraz = 0;

#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * allow for three entire read-aheads...  The system will
 * adjust downwards rapidly if needed...
 */
#define RA_MULTIPLE_FAST	2
#define RA_MULTIPLE_SLOW	3
#define RA_SHIFTDOWN	1	/* approx lg2(RA_MULTIPLE) */
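
/*
 * Worked example (illustrative values, not normative): with 8K
 * filesystem blocks and a 64K MAXPHYS, the read-ahead window v_ralen
 * may grow one block per sequential read up to
 * RA_MULTIPLE_SLOW * (MAXPHYS / size) = 3 * 8 = 24 blocks, and each
 * read-ahead validation miss below shrinks it by half via RA_SHIFTDOWN.
 */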
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 * 	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
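/*
 * Per-vnode state used below (as this code uses it): v_lastr is the
 * last logical block read, v_ralen is the current read-ahead window in
 * blocks, and v_maxra is one past the highest block read ahead so far.
 */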
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, rablkno, origlblkno;
	int error, num_ra, alreadyincore;
	int i;
	int seq;

	error = 0;
	/*
	 * get the requested block
	 */
	origlblkno = lblkno;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	seq = ISSEQREAD(vp, lblkno);
	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seq) {
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			vp->v_ralen >>= RA_SHIFTDOWN;
			return 0;
		} else if (vp->v_maxra > lblkno) {
			if ((vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >=
			    (lblkno + vp->v_ralen)) {
				if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
					++vp->v_ralen;
				return 0;
			}
			lblkno = vp->v_maxra;
		} else {
			lblkno += 1;
		}
		bp = NULL;
	} else {
		/*
		 * if it isn't in the cache, then get a chunk from disk if
		 * sequential, otherwise just get the block.
		 */
		bp->b_flags |= B_READ;
		lblkno += 1;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		vp->v_ralen = 0;
	}
	/*
	 * assume no read-ahead
	 */
	alreadyincore = 1;
	rablkno = lblkno;

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	if (seq) {
		/*
		 * bump ralen a bit...
		 */
		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW * (MAXPHYS / size))
			++vp->v_ralen;
		/*
		 * this code makes sure that the stuff that we have read-ahead
		 * is still in the cache.  If it isn't, we have been reading
		 * ahead too much, and we need to back-off, otherwise we might
		 * try to read more.
		 */
		for (i = 0; i < vp->v_ralen; i++) {
			rablkno = lblkno + i;
			alreadyincore = (int) incore(vp, rablkno);
			if (!alreadyincore) {
				if (inmem(vp, rablkno)) {
					if (vp->v_maxra < rablkno)
						vp->v_maxra = rablkno + 1;
					continue;
				}
				if (rablkno < vp->v_maxra) {
					vp->v_maxra = rablkno;
					vp->v_ralen >>= RA_SHIFTDOWN;
					alreadyincore = 1;
				}
				break;
			} else if (vp->v_maxra < rablkno) {
				vp->v_maxra = rablkno + 1;
			}
		}
	}
	/*
	 * we now build the read-ahead buffer if it is desirable.
	 */
	rbp = NULL;
	if (!alreadyincore &&
	    (rablkno + 1) * size <= filesize &&
	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
	    blkno != -1) {
		if (num_ra > vp->v_ralen)
			num_ra = vp->v_ralen;

		if (num_ra) {
			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
				num_ra + 1);
		} else {
			rbp = getblk(vp, rablkno, size, 0, 0);
			rbp->b_flags |= B_READ | B_ASYNC;
			rbp->b_blkno = blkno;
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else {
			vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			totreads++;
			totreadblocks += bp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
		if (error || (rbp->b_flags & B_CACHE)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else {
			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			totreads++;
			totreadblocks += rbp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (bp && ((bp->b_flags & B_ASYNC) == 0))
		return (biowait(bp));
	return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %ld != iosize %ld\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;

	tbp = getblk(vp, lbn, size, 0, 0);
	if (tbp->b_flags & B_CACHE)
		return tbp;

	tbp->b_blkno = blkno;
	tbp->b_flags |= B_ASYNC | B_READ;
	if (((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

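	/*
	 * Try for a pbuf to act as a throwaway header for the whole
	 * cluster; if none is available, settle for reading the single
	 * block set up above.
	 */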
	bp = trypbuf();
	if (bp == NULL)
		return tbp;

	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
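	/*
	 * The cast-as-lvalue assignment above gives the pbuf's data
	 * pointer the same offset within a page as the first component
	 * buffer, so the pmap_qenter() of the gathered pages at the end
	 * of this function lines up with b_data.
	 */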
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **) (b_save + 1);
	bp->b_saveaddr = b_save;

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
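	/*
	 * inc is the number of device (DEV_BSIZE) blocks per filesystem
	 * block; e.g. with the usual 512-byte DEV_BSIZE and 8K filesystem
	 * blocks, inc is 16 and bn advances by 16 for each block read.
	 */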
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) + size > MAXPHYS)
				break;
			if (incore(vp, lbn + i))
				break;
			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				brelse(tbp);
				break;
			}

			for (j = 0; j < tbp->b_npages; j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				brelse(tbp);
				break;
			}

			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		++b_save->bs_nchildren;
		b_save->bs_children[i] = tbp;
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
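			/*
			 * A page that is already fully valid is entered
			 * into the composite mapping as bogus_page, so
			 * the device transfer cannot clobber its contents;
			 * the component buffer keeps the real page.
			 */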
			if (m->valid == VM_PAGE_BITS_ALL) {
				m = bogus_page;
			}
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *) bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *) (bp->b_saveaddr);
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
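	/*
	 * The pmap_qremove() above tears down the composite KVA mapping
	 * set up by pmap_qenter() in cluster_rbuild()/cluster_wbuild();
	 * the component buffers retain their own mappings.
	 */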
	/*
	 * Mark I/O done on each component buffer, propagating any error.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
	}
	free(b_save, M_SEGMENT);
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;

	vp = bp->b_vp;
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

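	/*
	 * Cluster state kept in the vnode, as used here: v_cstart is the
	 * first block of the cluster being built, v_lastw and v_lasta are
	 * the last logical and physical blocks written, and v_clen is the
	 * number of blocks the current cluster may still grow beyond
	 * v_cstart.
	 */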
	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, lblocksize,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, lblocksize,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((lbn + 1) * lblocksize != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp, *pb;
	int i, j, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %ld != iosize %ld\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
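	/*
	 * The redo: loop peels clusters off the [start_lbn, start_lbn + len)
	 * range one at a time: skip blocks that cannot be clustered, gather
	 * as many contiguous delayed-write buffers as one pbuf can carry,
	 * issue the composite write, and come back for whatever remains.
	 */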
redo:
	if ((lbn != -1) || (last_bp == 0)) {
		while ((!(tbp = incore(vp, start_lbn)) || (tbp->b_flags & B_BUSY)
			|| (start_lbn == lbn)) && len) {
			++start_lbn;
			--len;
		}

		pb = trypbuf();
		/* Cannot cluster; just write out what we have. */
		if (len <= 1 || pb == NULL) {
			if (pb != NULL)
				relpbuf(pb);
			if (last_bp) {
				bawrite(last_bp);
			} else if (len) {
				bp = getblk(vp, start_lbn, size, 0, 0);
				bawrite(bp);
			}
			return;
		}
		tbp = getblk(vp, start_lbn, size, 0, 0);
	} else {
		tbp = last_bp;
		if (tbp->b_flags & B_BUSY) {
			printf("vfs_cluster: warning: buffer already busy\n");
		}
		tbp->b_flags |= B_BUSY;
		last_bp = 0;
		pb = trypbuf();
		if (pb == NULL) {
			bawrite(tbp);
			return;
		}
	}

	if (!(tbp->b_flags & B_DELWRI)) {
		relpbuf(pb);
		++start_lbn;
		--len;
		brelse(tbp);
		goto redo;
	}
	/*
	 * Extra memory in the buffer, punt on this buffer. XXX we could
	 * handle this in most cases, but we would have to push the extra
	 * memory down to after our max possible cluster size and then
	 * potentially pull it back up if the cluster was terminated
	 * prematurely--too much hassle.
	 */
	if (tbp->b_bcount != tbp->b_bufsize) {
		relpbuf(pb);
		++start_lbn;
		--len;
		bawrite(tbp);
		goto redo;
	}
	bp = pb;
	b_save = malloc(sizeof(struct buf *) * (len + 1) + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **) (b_save + 1);
	bp->b_saveaddr = b_save;
	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	if (tbp->b_flags & B_VMIO)
		bp->b_flags |= B_VMIO;

	bp->b_blkno = tbp->b_blkno;
	bp->b_lblkno = tbp->b_lblkno;
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER;
	bp->b_iodone = cluster_callback;
	pbgetvp(vp, bp);

	for (i = 0; i < len; ++i, ++start_lbn) {
		if (i != 0) {
			/*
			 * Block is not in core or the non-sequential block
			 * ending our cluster was part of the cluster (in
			 * which case we don't want to write it twice).
			 */
			if (!(tbp = incore(vp, start_lbn)) ||
			    (last_bp == NULL && start_lbn == lbn))
				break;

			if ((tbp->b_flags & (B_INVAL | B_CLUSTEROK)) != B_CLUSTEROK)
				break;

			if ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))
				break;

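			/*
			 * A buffer still at its logical address (not yet
			 * mapped to disk) is acceptable; one that already
			 * has a disk address must be physically contiguous
			 * with the cluster built so far.
			 */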
			if ((tbp->b_blkno != tbp->b_lblkno) &&
			    ((bp->b_blkno + btodb(size) * i) != tbp->b_blkno))
				break;

			/*
			 * Get the desired block buffer (unless it is the
			 * final sequential block whose buffer was passed in
			 * explicitly as last_bp).
			 */
			if (last_bp == NULL || start_lbn != lbn) {
				if (tbp->b_flags & B_BUSY)
					break;
				tbp = getblk(vp, start_lbn, size, 0, 0);
				if (!(tbp->b_flags & B_DELWRI) ||
				    ((tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO))) {
					brelse(tbp);
					break;
				}
			} else
				tbp = last_bp;
		}
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
		}
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;
	}
	b_save->bs_nchildren = i;
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *) bp->b_pages, bp->b_npages);
	bawrite(bp);

	if (i < len) {
		len -= i;
		goto redo;
	}
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}