vfs_cluster.c revision 31016
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.48 1997/08/02 14:31:43 bde Exp $
 */
38270096Strasz
39270096Strasz#include <sys/param.h>
40270096Strasz#include <sys/systm.h>
41270096Strasz#include <sys/proc.h>
42270096Strasz#include <sys/buf.h>
43270096Strasz#include <sys/vnode.h>
44296718Strasz#include <sys/mount.h>
45270096Strasz#include <sys/resourcevar.h>
46272403Strasz#include <vm/vm.h>
47297236Strasz#include <vm/vm_prot.h>
48270096Strasz#include <vm/vm_object.h>
49270096Strasz#include <vm/vm_page.h>
50270096Strasz
51270096Strasz#if defined(CLUSTERDEBUG)
52270281Strasz#include <sys/sysctl.h>
53270096Strasz#include <sys/kernel.h>
54270096Straszstatic int	rcluster= 0;
55270096StraszSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
56270096Strasz#endif
57270402Strasz
58270402Strasz#ifdef notyet_block_reallocation_enabled
59270096Straszstatic struct cluster_save *
60270096Strasz	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
61270096Strasz#endif
62270096Straszstatic struct buf *
63270096Strasz	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
64270096Strasz			    daddr_t blkno, long size, int run, struct buf *fbp));
65270096Strasz
66270096Straszextern vm_page_t	bogus_page;
67270096Strasz
68270096Strasz/*
69270096Strasz * Maximum number of blocks for read-ahead.
70270096Strasz */
71270096Strasz#define MAXRA 32
72270096Strasz
/*
 * This replaces bread.
 *
 * Read the logical block lblkno of vp into *bpp, and — when the access
 * pattern looks sequential (seqcount != 0 and/or a B_RAM read-ahead mark
 * is seen) — cluster contiguous on-disk blocks into one large transfer
 * and issue additional asynchronous read-ahead.
 *
 *	vp	 - vnode being read
 *	filesize - current file size; clustering/read-ahead never goes past it
 *	lblkno	 - requested logical block number
 *	size	 - logical block size (must match the mount's f_iosize)
 *	cred	 - caller's credentials (not referenced in this function)
 *	totread	 - total number of bytes the caller intends to read
 *	seqcount - caller's sequential-access heuristic count
 *	bpp	 - out: buffer containing the requested block
 *
 * Returns 0, or an error from VOP_STRATEGY()/biowait().
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 *
	 * racluster: blocks per maximal physical transfer; maxra: cap on
	 * read-ahead blocks, further bounded by MAXRA and 1/8 of nbuf.
	 */
	racluster = MAXPHYS/size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			/* not sequential: no read-ahead, cached hit is done */
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			/* no read-ahead mark reached yet; nothing more to do */
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			/*
			 * Scan forward over blocks already in core; stop at
			 * the first hole, which is where new read-ahead
			 * should begin.
			 */
			for(i=1;i<maxra;i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know to check
				 * again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;

#if 0
				if (tbp->b_usecount == 0) {
					/*
					 * Make sure that the soon-to-be used readaheads
					 * are still there.  The getblk/bqrelse pair will
					 * boost the priority of the buffer.
					 */
					tbp = getblk(vp, lblkno+i, size, 0, 0);
					bqrelse(tbp);
				}
#endif
			}
			splx(s);
			if (i >= maxra) {
				/* everything within the window is cached */
				return 0;
			}
			/* advance to the first block not in core */
			lblkno += i;
		}
		/* requested block was cached; only read-ahead remains */
		reqbp = bp = NULL;
	} else {
		/*
		 * Cache miss: clamp the request to the file size, then try
		 * to build a single clustered read covering as many of the
		 * requested blocks as are contiguous on disk.
		 */
		u_quad_t firstread;
		firstread = (u_quad_t) lblkno * size;
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			/* count blocks spanned by the request (avoids a division) */
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

			/* translate to a physical block and contiguity count */
	    		error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += nblks;
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	/* if (seqcount && (lblkno < (origblkno + maxra))) { */
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			/* round the original request up to whole blocks */
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				/* single-block asynchronous read-ahead */
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			panic("cluster_read: DONE bp");
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster)
				printf("S(%d,%d,%d) ",
					bp->b_lblkno, bp->b_bcount, seqcount);
#endif
			/* cluster buffers have had their pages busied already */
			if ((bp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			/* abandon read-ahead after a hard error */
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			/* raced into the cache; no I/O needed */
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
				else
					printf("A(%d,%d,%d,%d) ",
					rbp->b_lblkno, rbp->b_bcount,
					rbp->b_lblkno - origblkno,
					seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}
295270096Strasz
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * Returns either the component buffer for lbn alone (when clustering is
 * not possible: cached, malloc-backed, non-VMIO, run <= 1, or no pbuf
 * available), or a B_CLUSTER pbuf covering up to "run" blocks starting
 * at physical block blkno; cluster_callback() later splits the cluster
 * back into its component buffers.
 *
 *	fbp - optional first buffer (the caller's synchronous request);
 *	      NULL when building a pure read-ahead cluster.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_flags |= B_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
	}

	tbp->b_blkno = blkno;
	/* clustering requires VMIO page-backed buffers and more than one block */
	if( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

	bp = trypbuf();
	if (bp == 0)
		return tbp;

	/*
	 * Align the pbuf's data pointer to the same intra-page offset as the
	 * first component buffer (old-style cast-as-lvalue idiom).
	 */
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	/*
	 * Walk forward collecting component buffers while they stay within
	 * MAXPHYS, are not already cached/in core, are VMIO, have no valid
	 * pages, and map to the expected contiguous physical block.
	 */
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > MAXPHYS)
				break;

			if (incore(vp, lbn + i))
				break;

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			/* any valid page means partially-current data; stop here */
			for (j=0;j<tbp->b_npages;j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				/* not yet translated; assign the physical block */
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				/* not physically contiguous after all */
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		/* busy the component's pages and add each distinct page once */
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			/* fully valid pages must not be overwritten by the I/O */
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

	/* substitute bogus_page for any fully valid page in the cluster map */
	for(j=0;j<bp->b_npages;j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
			VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	/* map the collected pages into the pbuf's KVA */
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}
435296798Strasz
/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * Runs as the b_iodone handler (B_CALL) of a B_CLUSTER pbuf: unmaps the
 * cluster's KVA, propagates any error to every component buffer, marks
 * each component's I/O done, and releases the pbuf.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		/* fetch the next link before biodone() may free tbp */
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else
		    tbp->b_dirtyoff = tbp->b_dirtyend = 0;
		biodone(tbp);
	}
	relpbuf(bp);
}
472270096Strasz
/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 *
 * Tracks the in-progress cluster in the vnode fields v_cstart (first
 * block), v_clen (maximum cluster length), v_lastw (last logical block
 * written) and v_lasta (last physical block assigned).
 *
 *	bp	 - delayed-write buffer being written
 *	filesize - current file size, used to detect writes at EOF
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	/*
	 * Case 1/2: no cluster in progress, or this write is not logically
	 * and physically contiguous with the previous one.
	 */
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
				lbn != vp->v_lastw + 1 ||
				vp->v_clen <= cursize) {
				/* async mounts just leave it delayed-dirty */
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			}
#else
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif /* notyet_block_reallocation_enabled */
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			/* cannot map the block: just write it asynchronously */
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
603270096Strasz
604270096Strasz
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 *
 * Gather up to "len" delayed-write, cluster-eligible (B_CLUSTEROK)
 * buffers of vp starting at start_lbn and write them as large
 * asynchronous cluster transfers; buffers that cannot be clustered are
 * written individually with bawrite().  Returns the total number of
 * bytes issued for write.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);
	while (len > 0) {
		s = splbio();
		/* skip blocks that are absent, busy, invalid, or not dirty */
		if ( ((tbp = gbincore(vp, start_lbn)) == NULL) ||
			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		/* claim the buffer before dropping the spl */
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

	/*
	 * Extra memory in the buffer, punt on this buffer. XXX we could
	 * handle this in most cases, but we would have to push the extra
	 * memory down to after our max possible cluster size and then
	 * potentially pull it back up if the cluster was terminated
	 * prematurely--too much hassle.
	 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
			(tbp->b_bcount != tbp->b_bufsize) ||
			(tbp->b_bcount != size) ||
			len == 1) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		/* need a pbuf to head the cluster; fall back to a plain write */
		bp = trypbuf();
		if (bp == NULL) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
		    bp->b_wcred = tbp->b_wcred;
		    crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		/* match the pbuf's intra-page data offset to the first buffer */
		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);

		/*
		 * Collect additional component buffers while they remain
		 * dirty, clusterable, credential-compatible, physically
		 * contiguous, and within MAXPHYS.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) {
				s = splbio();
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				if ((tbp->b_bcount != size) ||
					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
					((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
					splx(s);
					break;
				}
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			}
			if (tbp->b_flags & B_VMIO) {
				/* busy pages and add each distinct page once */
				for (j = 0; j < tbp->b_npages; j += 1) {
					vm_page_t m;
					m = tbp->b_pages[j];
					++m->busy;
					++m->object->paging_in_progress;
					if ((bp->b_npages == 0) ||
						(bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			/* component is no longer delayed-dirty */
			--numdirtybuffers;
			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			s = splbio();
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
		/* map the collected pages into the pbuf's KVA and issue it */
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
				bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}
749
750#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 *
 * Builds a cluster_save describing the blocks v_cstart..v_lastw of vp
 * (read via bread; errors are deliberately ignored here) with last_bp
 * appended as the final child.  Caller owns the malloc'd list and is
 * responsible for releasing the children and freeing it (M_SEGMENT).
 * Only compiled under notyet_block_reallocation_enabled.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	/* single allocation: header followed by the children pointer array */
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
776#endif /* notyet_block_reallocation_enabled */
777