/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/vfs_cluster.c 352947 2019-10-01 23:28:22Z mckusick $");

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#if defined(CLUSTERDEBUG)
static int	rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
    "Debug VFS clustering code");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");

static struct cluster_save *cluster_collectbufs(struct vnode *vp,
	    struct buf *last_bp, int gbflags);
static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
	    daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
	    struct buf *fbp);
static void cluster_callback(struct buf *);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind; 0: disable, 1: enable, 2: backed off");

static int read_max = 64;
SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
    "Cluster read-ahead max block count");

static int read_min = 1;
SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
    "Cluster read min block count");

/* Page expended to mark partially backed buffers */
extern vm_page_t	bogus_page;

/*
 * Read data to a buf, including read-ahead if we find this to be beneficial.
 * cluster_read replaces bread.
 */
int
cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
    struct ucred *cred, long totread, int seqcount, int gbflags,
    struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	struct bufobj *bo;
	daddr_t blkno, origblkno;
	int maxra, racluster;
	int error, ncontig;
	int i;

	error = 0;
	bo = &vp->v_bufobj;
	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
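	/*
	 * The read-ahead window below is bounded by the sequential-access
	 * heuristic (seqcount), the vfs.read_max sysctl, a fraction of the
	 * buffer cache (nbuf/8), and the end of the file.
	 */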
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = seqcount;
	maxra = min(read_max, maxra);
	maxra = min(nbuf/8, maxra);
	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
		maxra = (filesize / size) - lblkno;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
	if (bp == NULL)
		return (EBUSY);
	origblkno = lblkno;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			bp->b_flags &= ~B_RAM;
			BO_RLOCK(bo);
			for (i = 1; i < maxra; i++) {
				/*
				 * Stop if the buffer does not exist or it
				 * is invalid (about to go away?)
				 */
				rbp = gbincore(&vp->v_bufobj, lblkno+i);
				if (rbp == NULL || (rbp->b_flags & B_INVAL))
					break;

				/*
				 * Set another read-ahead mark so we know
				 * to check again. (If we can lock the
				 * buffer without waiting)
				 */
				if ((((i % racluster) == (racluster - 1)) ||
				    (i == (maxra - 1)))
				    && (0 == BUF_LOCK(rbp,
					LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
					rbp->b_flags |= B_RAM;
					BUF_UNLOCK(rbp);
				}
			}
			BO_RUNLOCK(bo);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
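		/*
		 * The requested buffer is already valid and is handed back
		 * via *bpp as-is; clear the local pointers so that no I/O
		 * is issued or waited on for it below.
		 */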
		reqbp = bp = NULL;
	/*
	 * If it isn't in the cache, then get a chunk from
	 * disk if sequential, otherwise just get the block.
	 */
	} else {
		off_t firstread = bp->b_offset;
		int nblks;
		long minread;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));

		ncontig = 0;

		/*
		 * Adjust totread if needed
		 */
		minread = read_min * size;
		if (minread > totread)
			totread = minread;

		/*
		 * Compute the total number of blocks that we should read
		 * synchronously.
		 */
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		nblks = howmany(totread, size);
		if (nblks > racluster)
			nblks = racluster;

		/*
		 * Now compute the number of contiguous blocks.
		 */
		if (nblks > 1) {
			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontig, NULL);
			/*
			 * If this failed to map just do the original block.
			 */
			if (error || blkno == -1)
				ncontig = 0;
		}

		/*
		 * If we have contiguous data available do a cluster
		 * otherwise just read the requested block.
		 */
		if (ncontig) {
			/* Account for our first block. */
			ncontig = min(ncontig + 1, nblks);
			if (ncontig < nblks)
				nblks = ncontig;
			bp = cluster_rbuild(vp, filesize, lblkno,
			    blkno, size, nblks, gbflags, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * handle the synchronous read so that it is available ASAP.
	 */
	if (bp) {
		if ((bp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(bp, 0);
		}
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(curproc);
			racct_add_buf(curproc, bp, 0);
			PROC_UNLOCK(curproc);
		}
#endif /* RACCT */
		curthread->td_ru.ru_inblock++;
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 */
	while (lblkno < (origblkno + maxra)) {
		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
		if (error)
			break;

		if (blkno == -1)
			break;

		/*
		 * We could throttle ncontig here by maxra but we might as
		 * well read the data if it is contiguous.  We're throttled
		 * by racluster anyway.
		 */
		if (ncontig) {
			ncontig = min(ncontig + 1, racluster);
			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
			    size, ncontig, gbflags, NULL);
			lblkno += (rbp->b_bufsize / size);
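			/*
			 * A delayed-write buffer holds data newer than the
			 * copy on disk; do not schedule a read over it,
			 * just release it and move on.
			 */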
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
		} else {
			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
			lblkno += 1;
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
			rbp->b_flags |= B_ASYNC | B_RAM;
			rbp->b_iocmd = BIO_READ;
			rbp->b_blkno = blkno;
		}
		if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
			continue;
		}
		if ((rbp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(rbp, 0);
		}
		rbp->b_flags &= ~B_INVAL;
		rbp->b_ioflags &= ~BIO_ERROR;
		if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
			BUF_KERNPROC(rbp);
		rbp->b_iooffset = dbtob(rbp->b_blkno);
		bstrategy(rbp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(curproc);
			racct_add_buf(curproc, rbp, 0);
			PROC_UNLOCK(curproc);
		}
#endif /* RACCT */
		curthread->td_ru.ru_inblock++;
	}

	if (reqbp) {
		/*
		 * Like bread, always brelse() the buffer when
		 * returning an error.
		 */
		error = bufwait(reqbp);
		if (error != 0) {
			brelse(reqbp);
			*bpp = NULL;
		}
	}
	return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
    daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
{
	struct buf *bp, *tbp;
	daddr_t bn;
	off_t off;
	long tinc, tsize;
	int i, inc, j, k, toff;

	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	    ("cluster_rbuild: size %ld != f_iosize %jd\n",
	    size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));

	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_iocmd = BIO_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0, gbflags);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_RAM;
		tbp->b_iocmd = BIO_READ;
	}
	tbp->b_blkno = blkno;
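	/*
	 * Clustering only works for VMIO-backed buffers and when more than
	 * one block is wanted; otherwise hand back the single buffer.
	 */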
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf(&cluster_pbuf_freecnt);
	if (bp == NULL)
		return tbp;

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
	if ((gbflags & GB_UNMAPPED) != 0) {
		bp->b_data = unmapped_buf;
	} else {
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	}
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
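	/*
	 * Walk the run of contiguous blocks, shared-busying each component
	 * buffer's pages and appending them to the cluster pbuf.
	 */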
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i == 0) {
			VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
			vfs_drain_busy_pages(tbp);
			vm_object_pip_add(tbp->b_bufobj->bo_object,
			    tbp->b_npages);
			for (k = 0; k < tbp->b_npages; k++)
				vm_page_sbusy(tbp->b_pages[k]);
			VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
		} else {
			if ((bp->b_npages * PAGE_SIZE) +
			    round_page(size) > vp->v_mount->mnt_iosize_max) {
				break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
			    (gbflags & GB_UNMAPPED));

			/* Don't wait around for locked bufs. */
			if (tbp == NULL)
				break;

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.  The bo lock is not
			 * required for the BKGRDINPROG check since it
			 * can not be set without the buf lock.
			 */
			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
			    (tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			off = tbp->b_offset;
			tsize = size;
			VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
			for (j = 0; tsize > 0; j++) {
				toff = off & PAGE_MASK;
				tinc = tsize;
				if (toff + tinc > PAGE_SIZE)
					tinc = PAGE_SIZE - toff;
				VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object);
				if ((tbp->b_pages[j]->valid &
				    vm_page_bits(toff, tinc)) != 0)
					break;
				if (vm_page_xbusied(tbp->b_pages[j]))
					break;
				vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
				vm_page_sbusy(tbp->b_pages[j]);
				off += tinc;
				tsize -= tinc;
			}
			if (tsize > 0) {
clean_sbusy:
				vm_object_pip_add(tbp->b_bufobj->bo_object, -j);
				for (k = 0; k < j; k++)
					vm_page_sunbusy(tbp->b_pages[k]);
				VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
				bqrelse(tbp);
				break;
			}
			VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);

			/*
			 * Set a read-ahead mark as appropriate
			 */
			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;

			/*
			 * Set the buffer up for an async read (XXX should
			 * we do this only if we do not wind up brelse()ing?).
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
			 */
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_READ;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
				goto clean_sbusy;
			}
		}
		/*
		 * XXX fbp from caller may not be B_ASYNC, but we are going
		 * to biodone() it in cluster_callback() anyway
		 */
		BUF_KERNPROC(tbp);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if (m->valid == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
		/*
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != size)
			printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
		if (tbp->b_bufsize != size)
			printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page
	 */
	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
	for (j = 0; j < bp->b_npages; j++) {
		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object);
		if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);

	if (buf_mapped(bp)) {
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		    (vm_page_t *)bp->b_pages, bp->b_npages);
	}
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
static void
cluster_callback(struct buf *bp)
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	if (buf_mapped(bp)) {
		pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
		    bp->b_npages);
	}
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_ioflags |= BIO_ERROR;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~B_INVAL;
			tbp->b_ioflags &= ~BIO_ERROR;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
		}
		bufdone(tbp);
	}
	pbrelvp(bp);
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 *	cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
 *
 *		write_behind = 0	write behind disabled
 *		write_behind = 1	write behind normal (default)
 *		write_behind = 2	write behind backed-off
 */

static __inline int
cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
    int gbflags)
{
	int r = 0;

	switch (write_behind) {
	case 2:
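		/*
		 * Backed-off mode: shift the window back one cluster length
		 * so the write covers the previously completed range.
		 */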
		if (start_lbn < len)
			break;
		start_lbn -= len;
		/* FALLTHROUGH */
	case 1:
		r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
		/* FALLTHROUGH */
	default:
		/* FALLTHROUGH */
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
    int gbflags)
{
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	if (vp->v_type == VREG) {
		async = DOINGASYNC(vp);
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, lblocksize,
					    vp->v_cstart, cursize, gbflags);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp, gbflags);
				if (buflist == NULL) {
					/*
					 * Cluster build failed so just write
					 * it now.
					 */
					bawrite(bp);
					return;
				}
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    lblocksize, vp->v_cstart,
						    cursize, gbflags);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
			((u_quad_t) bp->b_offset + lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1) {
			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
			    vp->v_clen + 1, gbflags);
		}
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Gather up to 'len' delayed-write buffers beginning at 'start_lbn' and,
 * where they are contiguous on disk, issue them as clustered writes.
 * Returns the total number of bytes of buffers issued.
 */
int
cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
    int gbflags)
{
	struct buf *bp, *tbp;
	struct bufobj *bo;
	int i, j;
	int totalwritten = 0;
	int dbsize = btodb(size);

	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	bo = &vp->v_bufobj;
	while (len > 0) {
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		BO_LOCK(bo);
		if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
		    (tbp->b_vflags & BV_BKGRDINPROG)) {
			BO_UNLOCK(bo);
			++start_lbn;
			--len;
			continue;
		}
		if (BUF_LOCK(tbp,
		    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
			++start_lbn;
			--len;
			continue;
		}
		if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
			BUF_UNLOCK(tbp);
			++start_lbn;
			--len;
			continue;
		}
		if (tbp->b_pin_count > 0) {
			BUF_UNLOCK(tbp);
			++start_lbn;
			--len;
			continue;
		}
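		/*
		 * The buffer qualifies: take it off its queue and clear
		 * B_DONE so it can be dispatched again.
		 */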
		bremfree(tbp);
		tbp->b_flags &= ~B_DONE;

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
		     (B_CLUSTEROK | B_VMIO)) ||
		  (tbp->b_bcount != tbp->b_bufsize) ||
		  (tbp->b_bcount != size) ||
		  (len == 1) ||
		  ((bp = (vp->v_vflag & VV_MD) != 0 ?
		  trypbuf(&cluster_pbuf_freecnt) :
		  getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}
		/*
		 * We got a pbuf to make the cluster in,
		 * so initialise it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED)
			bp->b_wcred = crhold(tbp->b_wcred);

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		if ((gbflags & GB_UNMAPPED) == 0 ||
		    (tbp->b_flags & B_VMIO) == 0) {
			bp->b_data = (char *)((vm_offset_t)bp->b_data |
			    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		} else {
			bp->b_data = unmapped_buf;
		}
		bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
		    B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				BO_LOCK(bo);
				if ((tbp = gbincore(bo, start_lbn)) == NULL ||
				    (tbp->b_vflags & BV_BKGRDINPROG)) {
					BO_UNLOCK(bo);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, or is locked (which
				 * means it could be undergoing a background
				 * I/O or be in a weird state), then don't
				 * cluster with it.
				 */
				if (BUF_LOCK(tbp,
				    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
				    BO_LOCKPTR(bo)))
					break;

				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    tbp->b_wcred != bp->b_wcred) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != size) ||
				  ((bp->b_blkno + (dbsize * i)) !=
				    tbp->b_blkno) ||
				  ((tbp->b_npages + bp->b_npages) >
				    (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Do not pull in pinned buffers.
				 */
				if (tbp->b_pin_count > 0) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags &= ~B_DONE;
			} /* end of code for non-first buffers only */
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
				if (i == 0) {
					vfs_drain_busy_pages(tbp);
				} else { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (vm_page_xbusied(m)) {
							VM_OBJECT_WUNLOCK(
							    tbp->b_bufobj->bo_object);
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}
				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					vm_page_sbusy(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_npages == 0) ||
					  (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
				VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;
			/*
			 * If any of the clustered buffers have their
			 * B_BARRIER flag set, transfer that request to
			 * the cluster.
			 */
			bp->b_flags |= (tbp->b_flags & B_BARRIER);
			tbp->b_flags &= ~(B_DONE | B_BARRIER);
			tbp->b_flags |= B_ASYNC;
			tbp->b_ioflags &= ~BIO_ERROR;
			tbp->b_iocmd = BIO_WRITE;
			bundirty(tbp);
			reassignbuf(tbp);		/* put on clean list */
			bufobj_wref(tbp->b_bufobj);
			BUF_KERNPROC(tbp);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		if (buf_mapped(bp)) {
			pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			    (vm_page_t *)bp->b_pages, bp->b_npages);
		}
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
{
	struct cluster_save *buflist;
	struct buf *bp;
	daddr_t lbn;
	int i, j, len, error;

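	/*
	 * A single allocation holds the cluster_save header followed by
	 * the array of child buffer pointers.
	 */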
	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
		error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
		    gbflags, &bp);
		if (error != 0) {
			/*
			 * If read fails, release collected buffers
			 * and return failure.
			 */
			for (j = 0; j < i; j++)
				brelse(buflist->bs_children[j]);
			free(buflist, M_SEGMENT);
			return (NULL);
		}
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
				NULL, NULL);
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);
}