/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 * $FreeBSD: head/sys/ufs/ffs/ffs_balloc.c 105422 2002-10-18 22:52:41Z dillon $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 */
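/*
 * Usage sketch (assumes the ufsmount dispatch used elsewhere in UFS):
 * callers normally reach this routine through the UFS_BALLOC() hook
 * rather than calling it directly, e.g. from the write path roughly as
 *
 *	error = UFS_BALLOC(vp, uio->uio_offset, xfersize, cred, flags, &bp);
 *
 * with UFS_BALLOC() resolving to the UFS1 or UFS2 variant for the mount.
 */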
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	int unwindidx = -1;
	struct thread *td = curthread;	/* XXX */

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ip->i_fs;
	lbn = lblkno(fs, startoffset);
	size = blkoff(fs, startoffset) + size;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment,
	 * this fragment has to be extended to be a full block.
	 */
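	/*
	 * Worked example (sketch): with 8K blocks and 1K fragments, a file
	 * whose last block is a 3K fragment must have that fragment grown
	 * to a full 8K block here before any later block is allocated, so
	 * that only the last block of a file is ever a partial block.
	 */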
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef DIAGNOSTIC
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
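	/*
	 * Sketch of what ufs_getlbns() has filled in: indirs[0].in_off names
	 * the di_ib[] slot of the outermost indirect block, each following
	 * indirs[i] carries the logical block number of an indirect block
	 * and the offset of the pointer to follow within it, and num counts
	 * those entries (decremented below to the indirection level, 1-3).
	 */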
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	if (nb == 0) {
		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    cred, &newb)) != 0)
			return (error);
		nb = newb;
		*allocblk++ = nb;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
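	/*
	 * Loop sketch: bp holds the level-i indirect block.  If the pointer
	 * at indirs[i].in_off is missing, a new block is allocated, zeroed
	 * through a fresh buffer, written out (or handed to soft updates),
	 * and only then linked into bp, so the on-disk parent never points
	 * at an uninitialized child.
	 */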
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
		if ((error =
		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
		error = ffs_alloc(ip,
		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
		if (error) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, &nbp);
		} else {
			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	*bpp = nbp;
	return (0);
fail:
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 */
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 */
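/*
 * Sketch of how this differs from the UFS1 routine above: the logic is
 * the same, but it operates on the 64-bit ufs2_daddr_t block pointers of
 * the UFS2 dinode and additionally handles allocation in the external
 * attribute area when IO_EXT is requested.
 */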
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	struct thread *td = curthread;	/* XXX */

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ip->i_fs;
	lbn = lblkno(fs, startoffset);
	size = blkoff(fs, startoffset) + size;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);

	/*
	 * Check for allocating external data.
	 */
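	/*
	 * Layout sketch: the external attribute area is covered only by the
	 * NXADDR direct pointers in di_extb[], and its buffers are named by
	 * negative logical block numbers (-1 - lbn), apparently so that they
	 * never collide with the regular data blocks of the file in the
	 * buffer cache.
	 */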
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment,
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment,
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
				    cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				       &dp->di_db[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				&dp->di_db[0]), nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef DIAGNOSTIC
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	if (nb == 0) {
		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    cred, &newb)) != 0)
			return (error);
		nb = newb;
		*allocblk++ = nb;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
		if ((error =
		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
		error = ffs_alloc(ip,
		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
		if (error) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested, clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, &nbp);
		} else {
			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	*bpp = nbp;
	return (0);
fail:
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 */
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	return (error);
}
