/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.47 1997/06/15 17:56:49 dyson Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
#include <sys/kernel.h>
static int	rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

#ifdef notyet_block_reallocation_enabled
static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
#endif
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
			    daddr_t blkno, long size, int run, struct buf *fbp));

extern vm_page_t	bogus_page;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * Read a block of a file, doing clustered read-ahead when the access
 * pattern looks sequential.  This replaces bread().
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, rablkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = MAXPHYS/size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			for (i = 1; i < maxra; i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know to check
				 * again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;

#if 0
				if (tbp->b_usecount == 0) {
					/*
					 * Make sure that the soon-to-be used readaheads
					 * are still there.  The getblk/bqrelse pair will
					 * boost the priority of the buffer.
					 */
					tbp = getblk(vp, lblkno+i, size, 0, 0);
					bqrelse(tbp);
				}
#endif
			}
			splx(s);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	} else {
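		/*
		 * The block was not in the cache.  If the request spans more
		 * than one block, find out how many of them are physically
		 * contiguous and read them as a single cluster; otherwise
		 * fall through to a plain single-block read below.
		 */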
		u_quad_t firstread;
		firstread = (u_quad_t) lblkno * size;
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += nblks;
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	/* if (seqcount && (lblkno < (origblkno + maxra))) { */
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			panic("cluster_read: DONE bp");
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster)
				printf("S(%d,%d,%d) ",
					bp->b_lblkno, bp->b_bcount, seqcount);
#endif
			if ((bp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
				else
					printf("A(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != f_iosize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_flags |= B_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
	}

	tbp->b_blkno = blkno;
	if ((tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf();
	if (bp == 0)
		return tbp;

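	/*
	 * Carry the page offset of the first buffer's data over to the
	 * cluster buffer so that pmap_qenter() below maps the gathered
	 * pages at the correct virtual addresses.
	 */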
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

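	/*
	 * Walk the run of contiguous blocks, gathering each component
	 * buffer and its pages into the cluster, and stop at the first
	 * block that cannot be clustered.
	 */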
	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > MAXPHYS)
				break;

			if (incore(vp, lbn + i))
				break;

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			for (j = 0; j < tbp->b_npages; j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
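			/*
			 * An unmapped buffer (b_blkno == b_lblkno) takes the
			 * computed disk address; one already mapped to a
			 * different address is not contiguous, so stop here.
			 */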
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
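			/*
			 * Pages that are already fully valid must not be
			 * overwritten by the transfer, so substitute
			 * bogus_page for them.
			 */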
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

	for (j = 0; j < bp->b_npages; j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
			VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else
		    tbp->b_dirtyoff = tbp->b_dirtyend = 0;
		biodone(tbp);
	}
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

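	/*
	 * The write does not extend the current cluster if no cluster is
	 * being built, the logical block does not follow the last block
	 * written, or the physical block does not follow the last one
	 * allocated.
	 */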
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
				lbn != vp->v_lastw + 1 ||
				vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			}
#else
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif /* notyet_block_reallocation_enabled */
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Starting at start_lbn, write out up to len delayed-write buffers,
 * combining those that are contiguous on disk into clustered writes,
 * and return the number of bytes written.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);
	while (len > 0) {
		s = splbio();
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer. XXX we could
		 * handle this in most cases, but we would have to push the extra
		 * memory down to after our max possible cluster size and then
		 * potentially pull it back up if the cluster was terminated
		 * prematurely--too much hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
			(tbp->b_bcount != tbp->b_bufsize) ||
			(tbp->b_bcount != size) ||
			len == 1) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		bp = trypbuf();
		if (bp == NULL) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
		    bp->b_wcred = tbp->b_wcred;
		    crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
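		/*
		 * As in cluster_rbuild(), carry over the page offset of the
		 * first component buffer's data so that pmap_qenter() below
		 * maps the pages at the correct addresses.
		 */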
		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);

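		/*
		 * Gather the delayed-write buffers that follow, as long as
		 * they remain contiguous on disk and clusterable.
		 */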
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) {
				s = splbio();
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

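				/*
				 * The buffer must be clusterable and dirty
				 * (B_DELWRI), not busy or invalid, and its
				 * B_VMIO/B_NEEDCOMMIT state must match that of
				 * the cluster being built.
				 */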
				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				if ((tbp->b_bcount != size) ||
					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
					((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
					splx(s);
					break;
				}
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			}
			if (tbp->b_flags & B_VMIO) {
				for (j = 0; j < tbp->b_npages; j += 1) {
					vm_page_t m;
					m = tbp->b_pages[j];
					++m->busy;
					++m->object->paging_in_progress;
					if ((bp->b_npages == 0) ||
						(bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

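			/*
			 * The buffer is about to be written as part of the
			 * cluster: clear its delayed-write state, account for
			 * one less dirty buffer, and move it to the vnode's
			 * clean buffer list.
			 */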
			--numdirtybuffers;
			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			s = splbio();
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
				bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif /* notyet_block_reallocation_enabled */
