/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 *	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.27 1995/11/20 03:55:48 dyson Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <miscfs/specfs/specdev.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#ifdef DEBUG
#include <sys/sysctl.h>
int doreallocblks = 0;
SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");

#else
/* XXX for cluster_write */
#define doreallocblks 0
#endif

/*
 * Local declarations
 */
static struct buf *cluster_rbuild __P((struct vnode *, u_quad_t,
    daddr_t, daddr_t, long, int));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

int totreads;
int totreadblocks;
extern vm_page_t bogus_page;

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int doclusterraz = 0;

#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * allow for three entire read-aheads...  The system will
 * adjust downwards rapidly if needed...
 */
#define RA_MULTIPLE_FAST	2
#define RA_MULTIPLE_SLOW	3
#define RA_SHIFTDOWN	1	/* approx lg2(RA_MULTIPLE) */
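
/*
 * Per-vnode read-ahead state as used below (descriptive note; v_lastr
 * is maintained by the file system's read path, not in this file):
 *	v_lastr	- last logical block read; ISSEQREAD() compares against
 *		  it to detect sequential access.
 *	v_ralen	- current read-ahead length in filesystem blocks; grown
 *		  while reads stay sequential, shifted down by
 *		  RA_SHIFTDOWN when read-ahead overshoots.
 *	v_maxra	- roughly one past the last block read ahead; a cache
 *		  hit resumes read-ahead from here.
 */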
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
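/*
 * A minimal caller sketch (hypothetical; the fs/ip/uio names are
 * illustrative only, loosely modeled on a ufs-style read loop):
 *
 *	lbn = lblkno(fs, uio->uio_offset);
 *	size = blksize(fs, ip, lbn);
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 */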
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, rablkno, origlblkno;
	int error, num_ra, alreadyincore;
	int i;
	int seq;

	error = 0;
	/*
	 * get the requested block
	 */
	origlblkno = lblkno;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	seq = ISSEQREAD(vp, lblkno);
	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seq) {
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			vp->v_ralen >>= RA_SHIFTDOWN;
			return 0;
		} else if (vp->v_maxra > lblkno) {
			if ((vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >=
			    (lblkno + vp->v_ralen)) {
				if ((vp->v_ralen + 1) <
				    RA_MULTIPLE_FAST * (MAXPHYS / size))
					++vp->v_ralen;
				return 0;
			}
			lblkno = vp->v_maxra;
		} else {
			lblkno += 1;
		}
		bp = NULL;
	} else {
		/*
		 * if it isn't in the cache, then get a chunk from disk if
		 * sequential, otherwise just get the block.
		 */
		bp->b_flags |= B_READ;
		lblkno += 1;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		vp->v_ralen = 0;
	}
	/*
	 * assume no read-ahead
	 */
	alreadyincore = 1;
	rablkno = lblkno;

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	if (seq) {
		/*
		 * bump ralen a bit...
		 */
		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW * (MAXPHYS / size))
			++vp->v_ralen;
		/*
		 * this code makes sure that the stuff that we have read-ahead
		 * is still in the cache.  If it isn't, we have been reading
		 * ahead too much, and we need to back-off, otherwise we might
		 * try to read more.
		 */
		for (i = 0; i < vp->v_ralen; i++) {
			rablkno = lblkno + i;
			alreadyincore = (int) incore(vp, rablkno);
			if (!alreadyincore) {
				if (inmem(vp, rablkno)) {
					if (vp->v_maxra < rablkno)
						vp->v_maxra = rablkno + 1;
					continue;
				}
				if (rablkno < vp->v_maxra) {
					vp->v_maxra = rablkno;
					vp->v_ralen >>= RA_SHIFTDOWN;
					alreadyincore = 1;
				}
				break;
			} else if (vp->v_maxra < rablkno) {
				vp->v_maxra = rablkno + 1;
			}
		}
	}
	/*
	 * we now build the read-ahead buffer if it is desirable.
	 */
	rbp = NULL;
	if (!alreadyincore &&
	    (rablkno + 1) * size <= filesize &&
	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
	    blkno != -1) {
		if (num_ra > vp->v_ralen)
			num_ra = vp->v_ralen;

		if (num_ra) {
			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
			    num_ra + 1);
		} else {
			rbp = getblk(vp, rablkno, size, 0, 0);
			rbp->b_flags |= B_READ | B_ASYNC;
			rbp->b_blkno = blkno;
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else {
			vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			totreads++;
			totreadblocks += bp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
		if (error || (rbp->b_flags & B_CACHE)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else {
			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			totreads++;
			totreadblocks += rbp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (bp && ((bp->b_flags & B_ASYNC) == 0))
		return (biowait(bp));
	return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
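/*
 * The cluster is carried in a pageable buffer ("pbuf") obtained from
 * trypbuf().  The component buffers' VM pages are mapped contiguously
 * into the pbuf's address space with pmap_qenter(), the component
 * buffers themselves are chained on b_cluster.cluster_head, and
 * cluster_callback() completes each of them when the single large
 * transfer finishes.
 */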
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != f_iosize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run) > filesize)
		--run;

	tbp = getblk(vp, lbn, size, 0, 0);
	if (tbp->b_flags & B_CACHE)
		return tbp;

	tbp->b_blkno = blkno;
	tbp->b_flags |= B_ASYNC | B_READ;
	if (((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf();
	if (bp == NULL)
		return tbp;

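	/*
	 * Carry the first buffer's intra-page offset over to the pbuf's
	 * data pointer so that the pages mapped by pmap_qenter() below
	 * line up with the addresses the buffer code expects.
	 */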
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
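	/*
	 * Walk the following logical blocks, absorbing each one into the
	 * cluster until something disqualifies it: the MAXPHYS page
	 * budget is exhausted, the block is already cached or non-VMIO,
	 * one of its pages is already valid, or it is not physically
	 * contiguous with its predecessor.
	 */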
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) + size > MAXPHYS)
				break;

			if (incore(vp, lbn + i))
				break;
			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				brelse(tbp);
				break;
			}

			for (j = 0; j < tbp->b_npages; j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
		    tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
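			/*
			 * A fully valid page must not be overwritten by
			 * the device transfer, so map the throw-away
			 * bogus_page in its place in the cluster; the
			 * component buffer keeps the real page.
			 */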
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
				m = bogus_page;
			}
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	    (vm_page_t *) bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * Unmap the pages from the cluster buffer, propagate any error to the
 * component buffers, complete each of them with biodone(), and release
 * the cluster buffer itself.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Mark I/O as done on the component buffers; with VMIO the pages
	 * are shared with the cluster buffer, so no data needs to move.
	 */
	for (tbp = bp->b_cluster.cluster_head.tqh_first;
	    tbp; tbp = nbp) {
		nbp = tbp->b_cluster.cluster_entry.tqe_next;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
	}
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
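/*
 * A minimal caller sketch (hypothetical, modeled on an FFS-style write
 * path; doclusterwrite, ip and ioflag are illustrative names only):
 *
 *	if (doclusterwrite)
 *		cluster_write(bp, ip->i_size);
 *	else if (ioflag & IO_SYNC)
 *		(void) bwrite(bp);
 *	else
 *		bawrite(bp);
 */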
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC));
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#if 1
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 ||
			    vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
			}
#else
			if (!doreallocblks ||
			    (lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((lbn + 1) * lblocksize != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart,
		    vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Gather up to len delayed-write buffers starting at logical block
 * start_lbn on vp and push them out, clustering physically contiguous
 * buffers into single large writes where possible.
 */
void
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int dbsize = btodb(size);
	int origlen = len;

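	/*
	 * Scan the range; any leading block that is not a clusterable
	 * delayed write is skipped or written on its own, and the scan
	 * restarts with the remainder of the range.
	 */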
redo:
	if (len == 0)
		return;
	if (((tbp = incore(vp, start_lbn)) == NULL) ||
	    ((tbp->b_flags & (B_INVAL | B_BUSY | B_DELWRI)) != B_DELWRI)) {
		++start_lbn;
		--len;
		goto redo;
	}

	tbp = getblk(vp, start_lbn, size, 0, 0);
	if ((tbp->b_flags & B_DELWRI) == 0) {
		++start_lbn;
		--len;
		brelse(tbp);
		goto redo;
	}
	/*
	 * Extra memory in the buffer, punt on this buffer. XXX we could
	 * handle this in most cases, but we would have to push the extra
	 * memory down to after our max possible cluster size and then
	 * potentially pull it back up if the cluster was terminated
	 * prematurely--too much hassle.
	 */
	if (((tbp->b_flags & (B_VMIO | B_CLUSTEROK)) != (B_VMIO | B_CLUSTEROK)) ||
	    (tbp->b_bcount != tbp->b_bufsize) ||
	    len == 1) {
		bawrite(tbp);
		++start_lbn;
		--len;
		goto redo;
	}

	bp = trypbuf();
	if (bp == NULL) {
		bawrite(tbp);
		++start_lbn;
		--len;
		goto redo;
	}

	TAILQ_INIT(&bp->b_cluster.cluster_head);
	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	bp->b_blkno = tbp->b_blkno;
	bp->b_lblkno = tbp->b_lblkno;
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER;
	bp->b_iodone = cluster_callback;
	pbgetvp(vp, bp);

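	/*
	 * Gather the following delayed-write buffers into the cluster
	 * while they remain eligible: dirty, B_CLUSTEROK, the right
	 * size, physically contiguous, and within the MAXPHYS page
	 * budget.
	 */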
	for (i = 0; i < len; ++i, ++start_lbn) {
		if (i != 0) {
			s = splbio();
			if ((tbp = incore(vp, start_lbn)) == NULL) {
				splx(s);
				break;
			}

			if ((tbp->b_flags & (B_CLUSTEROK | B_INVAL | B_BUSY |
			    B_DELWRI)) != (B_DELWRI | B_CLUSTEROK)) {
				splx(s);
				break;
			}

			if ((tbp->b_bcount != size) ||
			    ((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
			    ((tbp->b_npages + bp->b_npages) >
			     (MAXPHYS / PAGE_SIZE))) {
				splx(s);
				break;
			}
			bremfree(tbp);
			tbp->b_flags |= B_BUSY;
			tbp->b_flags &= ~B_DONE;
			splx(s);
		}
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
		}
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
		    tbp, b_cluster.cluster_entry);
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	    (vm_page_t *) bp->b_pages, bp->b_npages);
	bawrite(bp);

	len -= i;
	goto redo;
}

#if 0
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif