vfs_cluster.c revision 34266
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.56 1998/03/07 21:35:28 dyson Exp $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
#include <sys/kernel.h>
static int	rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

#ifdef notyet_block_reallocation_enabled
static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
#endif
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
			    daddr_t blkno, long size, int run, struct buf *fbp));

extern vm_page_t	bogus_page;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_maxio/size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			for(i=1;i<maxra;i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know to check
				 * again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;

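				/*
				 * Buffers we expect to read again shortly are
				 * requeued at the tail of the LRU so that they
				 * are less likely to be reclaimed before they
				 * are used.
				 */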
				if ((tbp->b_usecount < 5) &&
					((tbp->b_flags & B_BUSY) == 0) &&
					(tbp->b_qindex == QUEUE_LRU)) {
					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
				}
			}
			splx(s);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	} else {
		u_quad_t firstread;
		firstread = (u_quad_t) lblkno * size;
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

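			/*
			 * Ask the filesystem where the block lives on disk
			 * and how many blocks follow it contiguously.
			 */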
			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += nblks;
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			panic("cluster_read: DONE bp");
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster)
				printf("S(%d,%d,%d) ",
					bp->b_lblkno, bp->b_bcount, seqcount);
#endif
			if ((bp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
				else
					printf("A(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_flags |= B_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
	}

	tbp->b_blkno = blkno;
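	/*
	 * Malloc'ed buffers and buffers without VM backing cannot be
	 * clustered; neither can a run of a single block.  Just hand the
	 * plain buffer back in those cases.
	 */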
	if( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

	bp = trypbuf();
	if (bp == 0)
		return tbp;

	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > vp->v_maxio)
				break;

			if (incore(vp, lbn + i))
				break;

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			for (j=0;j<tbp->b_npages;j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
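			/*
			 * If the buffer has no disk address assigned yet,
			 * give it the one we expect; if it already maps to
			 * a different address the run is not contiguous,
			 * so stop extending the cluster.
			 */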
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		/* check for latent dependencies to be handled */
		if ((LIST_FIRST(&tbp->b_dep)) != NULL && bioops.io_start)
			(*bioops.io_start)(tbp);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
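		/*
		 * Add this buffer's pages to the master cluster buffer and
		 * hold them busy for the duration of the I/O.  Pages that
		 * are already fully valid are covered with bogus_page so
		 * the read does not overwrite their contents.
		 */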
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

	for(j=0;j<bp->b_npages;j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
			VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else
		    tbp->b_dirtyoff = tbp->b_dirtyend = 0;
		biodone(tbp);
	}
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

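	/*
	 * If there is no cluster in progress, or this block is not the
	 * logical and physical successor of the last one written, we may
	 * have to push the current cluster and start a new one.
	 */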
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_maxio / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
				lbn != vp->v_lastw + 1 ||
				vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			}
#else
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif /* notyet_block_reallocation_enabled */
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
			((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Gather up to "len" delayed-write buffers starting at "start_lbn" on vp
 * and issue them to the driver as a single clustered write.  Returns the
 * number of bytes of buffer data actually pushed.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);
	while (len > 0) {
		s = splbio();
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer. XXX we could
		 * handle this in most cases, but we would have to push the extra
		 * memory down to after our max possible cluster size and then
		 * potentially pull it back up if the cluster was terminated
		 * prematurely--too much hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
			(tbp->b_bcount != tbp->b_bufsize) ||
			(tbp->b_bcount != size) ||
			len == 1) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		bp = trypbuf();
		if (bp == NULL) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
		    bp->b_wcred = tbp->b_wcred;
		    crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER |
						(tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
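		/*
		 * Gather as many of the following delayed-write buffers as
		 * will fit in this cluster and are physically contiguous.
		 */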
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) {
				s = splbio();
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

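				/*
				 * The candidate must be delayed-write and
				 * clusterable, must not be busy or invalid,
				 * and must match the cluster's VMIO and
				 * NEEDCOMMIT state.
				 */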
				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|
				    B_BUSY|B_DELWRI|B_NEEDCOMMIT)) !=
				    (B_DELWRI|B_CLUSTEROK|
				     (bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				if ((tbp->b_bcount != size) ||
					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
					((tbp->b_npages + bp->b_npages) > (vp->v_maxio / PAGE_SIZE))) {
					splx(s);
					break;
				}
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			}
			/* check for latent dependencies to be handled */
			if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
			    bioops.io_start)
				(*bioops.io_start)(tbp);
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) {
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->flags & PG_BUSY)
							goto finishcluster;
					}
				}

				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					++m->busy;
					++m->object->paging_in_progress;
					if ((bp->b_npages == 0) ||
						(bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			--numdirtybuffers;
			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			s = splbio();
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
				bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif /* notyet_block_reallocation_enabled */
