vfs_cluster.c revision 33108
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.53 1998/01/31 11:56:01 dyson Exp $
 */

#include "opt_debug_cluster.h"
#include "opt_diagnostic.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
#include <sys/kernel.h>
static int	rcluster= 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

#ifdef notyet_block_reallocation_enabled
static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
#endif
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
			    daddr_t blkno, long size, int run, struct buf *fbp));

extern vm_page_t	bogus_page;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 */
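/*
 * Hedged usage sketch (not part of this file): a filesystem read routine
 * would typically call this in place of bread(), roughly as below.  The
 * names ip, lbn, blkoffset, xfersize and uio here are illustrative only.
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
 *	    blkoffset + uio->uio_resid, seqcount, &bp);
 *	if (error == 0)
 *		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
 */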
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
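	/*
	 * racluster is the number of logical blocks that fit in one maximal
	 * physical I/O; maxra bounds the total read-ahead and is clamped to
	 * MAXRA and to an eighth of the buffer headers (nbuf/8).
	 */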
	racluster = vp->v_maxio/size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			for(i=1;i<maxra;i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know to check
				 * again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;

#if 0
				if (tbp->b_usecount == 0) {
					/*
					 * Make sure that the soon-to-be used readaheads
					 * are still there.  The getblk/bqrelse pair will
					 * boost the priority of the buffer.
					 */
					tbp = getblk(vp, lblkno+i, size, 0, 0);
					bqrelse(tbp);
				}
#endif
			}
			splx(s);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	} else {
		u_quad_t firstread;
		firstread = (u_quad_t) lblkno * size;
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
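			/*
			 * Count how many whole blocks the request spans
			 * (done with a loop rather than a division).
			 */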
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

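			/*
			 * Ask the filesystem where the block lives on disk
			 * and how many logical blocks follow it contiguously.
			 */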
			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += nblks;
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	/* if (seqcount && (lblkno < (origblkno + maxra))) { */
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
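			/*
			 * If the following blocks are contiguous on disk,
			 * issue the read-ahead as one clustered I/O;
			 * otherwise just start an asynchronous read of the
			 * next block.
			 */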
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			panic("cluster_read: DONE bp");
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster)
				printf("S(%d,%d,%d) ",
					bp->b_lblkno, bp->b_bcount, seqcount);
#endif
			if ((bp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
				else
					printf("A(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_flags |= B_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
	}

	tbp->b_blkno = blkno;
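	/*
	 * A buffer backed by malloc'ed memory or one not under VMIO cannot
	 * be combined with others, nor can a run of a single block; hand
	 * back the plain buffer in those cases.
	 */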
	if( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

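	/*
	 * Grab a pbuf to describe the whole cluster; if none is available,
	 * fall back to reading just this one block.
	 */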
	bp = trypbuf();
	if (bp == 0)
		return tbp;

	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > vp->v_maxio)
				break;

			if (incore(vp, lbn + i))
				break;

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			for (j=0;j<tbp->b_npages;j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

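	/*
	 * Pages that are already fully valid are redirected to bogus_page so
	 * that the disk transfer cannot overwrite up-to-date data; whatever
	 * is read into the placeholder page is simply thrown away.
	 */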
	for(j=0;j<bp->b_npages;j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
			VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else
		    tbp->b_dirtyoff = tbp->b_dirtyend = 0;
		biodone(tbp);
	}
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;
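	/*
	 * The vnode carries the write-clustering state: v_cstart is the
	 * logical block at which the current cluster began, v_lastw the last
	 * logical block written, v_lasta the last physical block assigned,
	 * and v_clen the length of the cluster being accumulated.
	 */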

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_maxio / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
				lbn != vp->v_lastw + 1 ||
				vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			}
#else
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif /* notyet_block_reallocation_enabled */
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
			((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Write out any dirty buffers in the range of len blocks starting at
 * start_lbn, gathering contiguous delayed writes into clustered I/Os.
 * Returns the total number of bytes queued for writing.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);
	while (len > 0) {
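		/*
		 * Find a buffer that can head a cluster: it must be in core
		 * and be a delayed write that is neither busy nor invalid.
		 */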
		s = splbio();
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

	/*
	 * Extra memory in the buffer, punt on this buffer. XXX we could
	 * handle this in most cases, but we would have to push the extra
	 * memory down to after our max possible cluster size and then
	 * potentially pull it back up if the cluster was terminated
	 * prematurely--too much hassle.
	 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
			(tbp->b_bcount != tbp->b_bufsize) ||
			(tbp->b_bcount != size) ||
			len == 1) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		bp = trypbuf();
		if (bp == NULL) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
		    bp->b_wcred = tbp->b_wcred;
		    crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER |
						(tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);

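		/*
		 * Absorb the following delayed-write buffers into the
		 * cluster as long as they are contiguous on disk, share the
		 * same write credentials and VMIO/commit state, and fit
		 * within the vnode's maximum transfer size.
		 */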
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) {
				s = splbio();
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				if ((tbp->b_bcount != size) ||
					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
					((tbp->b_npages + bp->b_npages) > (vp->v_maxio / PAGE_SIZE))) {
					splx(s);
					break;
				}
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			}

			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) {
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->flags & PG_BUSY)
							goto finishcluster;
					}
				}

				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					++m->busy;
					++m->object->paging_in_progress;
					if ((bp->b_npages == 0) ||
						(bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			--numdirtybuffers;
			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			s = splbio();
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
				bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif /* notyet_block_reallocation_enabled */