/* ffs_balloc.c revision 110837 */
1/*
2 * Copyright (c) 2002 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Copyright (c) 1982, 1986, 1989, 1993
12 *	The Regents of the University of California.  All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 *    must display the following acknowledgement:
24 *	This product includes software developed by the University of
25 *	California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 *    may be used to endorse or promote products derived from this software
28 *    without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
43 * $FreeBSD: head/sys/ufs/ffs/ffs_balloc.c 110837 2003-02-14 00:31:06Z mckusick $
44 */
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/lock.h>
51#include <sys/mount.h>
52#include <sys/vnode.h>
53
54#include <ufs/ufs/quota.h>
55#include <ufs/ufs/inode.h>
56#include <ufs/ufs/ufs_extern.h>
57
58#include <ufs/ffs/fs.h>
59#include <ufs/ffs/ffs_extern.h>
60
61/*
62 * Balloc defines the structure of filesystem storage
63 * by allocating the physical blocks on a device given
64 * the inode and the logical block number in a file.
65 * This is the allocation strategy for UFS1. Below is
66 * the allocation strategy for UFS2.
67 */
68int
69ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
70    struct ucred *cred, int flags, struct buf **bpp)
71{
72	struct inode *ip;
73	struct ufs1_dinode *dp;
74	ufs_lbn_t lbn, lastlbn;
75	struct fs *fs;
76	ufs1_daddr_t nb;
77	struct buf *bp, *nbp;
78	struct indir indirs[NIADDR + 2];
79	int deallocated, osize, nsize, num, i, error;
80	ufs2_daddr_t newb;
81	ufs1_daddr_t *bap, pref;
82	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
83	int unwindidx = -1;
84	struct thread *td = curthread;	/* XXX */
85
86	ip = VTOI(vp);
87	dp = ip->i_din1;
88	fs = ip->i_fs;
89	lbn = lblkno(fs, startoffset);
90	size = blkoff(fs, startoffset) + size;
91	if (size > fs->fs_bsize)
92		panic("ffs_balloc_ufs1: blk too big");
93	*bpp = NULL;
94	if (flags & IO_EXT)
95		return (EOPNOTSUPP);
96	if (lbn < 0)
97		return (EFBIG);
98
99	/*
100	 * If the next write will extend the file into a new block,
101	 * and the file is currently composed of a fragment
102	 * this fragment has to be extended to be a full block.
103	 */
104	lastlbn = lblkno(fs, ip->i_size);
105	if (lastlbn < NDADDR && lastlbn < lbn) {
106		nb = lastlbn;
107		osize = blksize(fs, ip, nb);
108		if (osize < fs->fs_bsize && osize > 0) {
109			error = ffs_realloccg(ip, nb, dp->di_db[nb],
110			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
111			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
112			if (error)
113				return (error);
114			if (DOINGSOFTDEP(vp))
115				softdep_setup_allocdirect(ip, nb,
116				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
117				    fs->fs_bsize, osize, bp);
118			ip->i_size = smalllblktosize(fs, nb + 1);
119			dp->di_size = ip->i_size;
120			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
121			ip->i_flag |= IN_CHANGE | IN_UPDATE;
122			if (flags & IO_SYNC)
123				bwrite(bp);
124			else
125				bawrite(bp);
126		}
127	}
128	/*
129	 * The first NDADDR blocks are direct blocks
130	 */
131	if (lbn < NDADDR) {
132		if (flags & BA_METAONLY)
133			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
134		nb = dp->di_db[lbn];
135		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
136			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
137			if (error) {
138				brelse(bp);
139				return (error);
140			}
141			bp->b_blkno = fsbtodb(fs, nb);
142			*bpp = bp;
143			return (0);
144		}
145		if (nb != 0) {
146			/*
147			 * Consider need to reallocate a fragment.
148			 */
149			osize = fragroundup(fs, blkoff(fs, ip->i_size));
150			nsize = fragroundup(fs, size);
151			if (nsize <= osize) {
152				error = bread(vp, lbn, osize, NOCRED, &bp);
153				if (error) {
154					brelse(bp);
155					return (error);
156				}
157				bp->b_blkno = fsbtodb(fs, nb);
158			} else {
159				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
160				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
161				    &dp->di_db[0]), osize, nsize, cred, &bp);
162				if (error)
163					return (error);
164				if (DOINGSOFTDEP(vp))
165					softdep_setup_allocdirect(ip, lbn,
166					    dbtofsb(fs, bp->b_blkno), nb,
167					    nsize, osize, bp);
168			}
169		} else {
170			if (ip->i_size < smalllblktosize(fs, lbn + 1))
171				nsize = fragroundup(fs, size);
172			else
173				nsize = fs->fs_bsize;
174			error = ffs_alloc(ip, lbn,
175			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
176			    nsize, cred, &newb);
177			if (error)
178				return (error);
179			bp = getblk(vp, lbn, nsize, 0, 0);
180			bp->b_blkno = fsbtodb(fs, newb);
181			if (flags & BA_CLRBUF)
182				vfs_bio_clrbuf(bp);
183			if (DOINGSOFTDEP(vp))
184				softdep_setup_allocdirect(ip, lbn, newb, 0,
185				    nsize, 0, bp);
186		}
187		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
188		ip->i_flag |= IN_CHANGE | IN_UPDATE;
189		*bpp = bp;
190		return (0);
191	}
192	/*
193	 * Determine the number of levels of indirection.
194	 */
195	pref = 0;
196	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
197		return(error);
198#ifdef DIAGNOSTIC
199	if (num < 1)
200		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
201#endif
202	/*
203	 * Fetch the first indirect block allocating if necessary.
204	 */
205	--num;
206	nb = dp->di_ib[indirs[0].in_off];
207	allocib = NULL;
208	allocblk = allociblk;
209	if (nb == 0) {
210		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
211	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
212		    cred, &newb)) != 0)
213			return (error);
214		nb = newb;
215		*allocblk++ = nb;
216		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
217		bp->b_blkno = fsbtodb(fs, nb);
218		vfs_bio_clrbuf(bp);
219		if (DOINGSOFTDEP(vp)) {
220			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
221			    newb, 0, fs->fs_bsize, 0, bp);
222			bdwrite(bp);
223		} else {
224			/*
225			 * Write synchronously so that indirect blocks
226			 * never point at garbage.
227			 */
228			if (DOINGASYNC(vp))
229				bdwrite(bp);
230			else if ((error = bwrite(bp)) != 0)
231				goto fail;
232		}
233		allocib = &dp->di_ib[indirs[0].in_off];
234		*allocib = nb;
235		ip->i_flag |= IN_CHANGE | IN_UPDATE;
236	}
237	/*
238	 * Fetch through the indirect blocks, allocating as necessary.
239	 */
240	for (i = 1;;) {
241		error = bread(vp,
242		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
243		if (error) {
244			brelse(bp);
245			goto fail;
246		}
247		bap = (ufs1_daddr_t *)bp->b_data;
248		nb = bap[indirs[i].in_off];
249		if (i == num)
250			break;
251		i += 1;
252		if (nb != 0) {
253			bqrelse(bp);
254			continue;
255		}
256		if (pref == 0)
257			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
258		if ((error =
259		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
260			brelse(bp);
261			goto fail;
262		}
263		nb = newb;
264		*allocblk++ = nb;
265		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
266		nbp->b_blkno = fsbtodb(fs, nb);
267		vfs_bio_clrbuf(nbp);
268		if (DOINGSOFTDEP(vp)) {
269			softdep_setup_allocindir_meta(nbp, ip, bp,
270			    indirs[i - 1].in_off, nb);
271			bdwrite(nbp);
272		} else {
273			/*
274			 * Write synchronously so that indirect blocks
275			 * never point at garbage.
276			 */
277			if ((error = bwrite(nbp)) != 0) {
278				brelse(bp);
279				goto fail;
280			}
281		}
282		bap[indirs[i - 1].in_off] = nb;
283		if (allocib == NULL && unwindidx < 0)
284			unwindidx = i - 1;
285		/*
286		 * If required, write synchronously, otherwise use
287		 * delayed write.
288		 */
289		if (flags & IO_SYNC) {
290			bwrite(bp);
291		} else {
292			if (bp->b_bufsize == fs->fs_bsize)
293				bp->b_flags |= B_CLUSTEROK;
294			bdwrite(bp);
295		}
296	}
297	/*
298	 * If asked only for the indirect block, then return it.
299	 */
300	if (flags & BA_METAONLY) {
301		*bpp = bp;
302		return (0);
303	}
304	/*
305	 * Get the data block, allocating if necessary.
306	 */
307	if (nb == 0) {
308		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
309		error = ffs_alloc(ip,
310		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
311		if (error) {
312			brelse(bp);
313			goto fail;
314		}
315		nb = newb;
316		*allocblk++ = nb;
317		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
318		nbp->b_blkno = fsbtodb(fs, nb);
319		if (flags & BA_CLRBUF)
320			vfs_bio_clrbuf(nbp);
321		if (DOINGSOFTDEP(vp))
322			softdep_setup_allocindir_page(ip, lbn, bp,
323			    indirs[i].in_off, nb, 0, nbp);
324		bap[indirs[i].in_off] = nb;
325		/*
326		 * If required, write synchronously, otherwise use
327		 * delayed write.
328		 */
329		if (flags & IO_SYNC) {
330			bwrite(bp);
331		} else {
332			if (bp->b_bufsize == fs->fs_bsize)
333				bp->b_flags |= B_CLUSTEROK;
334			bdwrite(bp);
335		}
336		*bpp = nbp;
337		return (0);
338	}
339	brelse(bp);
340	if (flags & BA_CLRBUF) {
341		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
342		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
343			error = cluster_read(vp, ip->i_size, lbn,
344			    (int)fs->fs_bsize, NOCRED,
345			    MAXBSIZE, seqcount, &nbp);
346		} else {
347			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
348		}
349		if (error) {
350			brelse(nbp);
351			goto fail;
352		}
353	} else {
354		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
355		nbp->b_blkno = fsbtodb(fs, nb);
356	}
357	*bpp = nbp;
358	return (0);
359fail:
360	/*
361	 * If we have failed to allocate any blocks, simply return the error.
362	 * This is the usual case and avoids the need to fsync the file.
363	 */
364	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
365		return (error);
366	/*
367	 * If we have failed part way through block allocation, we
368	 * have to deallocate any indirect blocks that we have allocated.
369	 * We have to fsync the file before we start to get rid of all
370	 * of its dependencies so that we do not leave them dangling.
371	 * We have to sync it at the end so that the soft updates code
372	 * does not find any untracked changes. Although this is really
373	 * slow, running out of disk space is not expected to be a common
374	 * occurence. The error return from fsync is ignored as we already
375	 * have an error to return to the user.
376	 */
377	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
378	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
379		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
380		deallocated += fs->fs_bsize;
381	}
382	if (allocib != NULL) {
383		*allocib = 0;
384	} else if (unwindidx >= 0) {
385		int r;
386
387		r = bread(vp, indirs[unwindidx].in_lbn,
388		    (int)fs->fs_bsize, NOCRED, &bp);
389		if (r) {
390			panic("Could not unwind indirect block, error %d", r);
391			brelse(bp);
392		} else {
393			bap = (ufs1_daddr_t *)bp->b_data;
394			bap[indirs[unwindidx].in_off] = 0;
395			if (flags & IO_SYNC) {
396				bwrite(bp);
397			} else {
398				if (bp->b_bufsize == fs->fs_bsize)
399					bp->b_flags |= B_CLUSTEROK;
400				bdwrite(bp);
401			}
402		}
403	}
404	if (deallocated) {
405#ifdef QUOTA
406		/*
407		 * Restore user's disk quota because allocation failed.
408		 */
409		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
410#endif
411		dp->di_blocks -= btodb(deallocated);
412		ip->i_flag |= IN_CHANGE | IN_UPDATE;
413	}
414	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
415	return (error);
416}
417
418/*
419 * Balloc defines the structure of file system storage
420 * by allocating the physical blocks on a device given
421 * the inode and the logical block number in a file.
422 * This is the allocation strategy for UFS2. Above is
423 * the allocation strategy for UFS1.
424 */
425int
426ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
427    struct ucred *cred, int flags, struct buf **bpp)
428{
429	struct inode *ip;
430	struct ufs2_dinode *dp;
431	ufs_lbn_t lbn, lastlbn;
432	struct fs *fs;
433	struct buf *bp, *nbp;
434	struct indir indirs[NIADDR + 2];
435	ufs2_daddr_t nb, newb, *bap, pref;
436	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
437	int deallocated, osize, nsize, num, i, error;
438	int unwindidx = -1;
439	struct thread *td = curthread;	/* XXX */
440
441	ip = VTOI(vp);
442	dp = ip->i_din2;
443	fs = ip->i_fs;
444	lbn = lblkno(fs, startoffset);
445	size = blkoff(fs, startoffset) + size;
446	if (size > fs->fs_bsize)
447		panic("ffs_balloc_ufs2: blk too big");
448	*bpp = NULL;
449	if (lbn < 0)
450		return (EFBIG);
451
452	/*
453	 * Check for allocating external data.
454	 */
455	if (flags & IO_EXT) {
456		if (lbn >= NXADDR)
457			return (EFBIG);
458		/*
459		 * If the next write will extend the data into a new block,
460		 * and the data is currently composed of a fragment
461		 * this fragment has to be extended to be a full block.
462		 */
463		lastlbn = lblkno(fs, dp->di_extsize);
464		if (lastlbn < lbn) {
465			nb = lastlbn;
466			osize = sblksize(fs, dp->di_extsize, nb);
467			if (osize < fs->fs_bsize && osize > 0) {
468				error = ffs_realloccg(ip, -1 - nb,
469				    dp->di_extb[nb],
470				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
471				    &dp->di_extb[0]), osize,
472				    (int)fs->fs_bsize, cred, &bp);
473				if (error)
474					return (error);
475				if (DOINGSOFTDEP(vp))
476					softdep_setup_allocext(ip, nb,
477					    dbtofsb(fs, bp->b_blkno),
478					    dp->di_extb[nb],
479					    fs->fs_bsize, osize, bp);
480				dp->di_extsize = smalllblktosize(fs, nb + 1);
481				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
482				bp->b_xflags |= BX_ALTDATA;
483				ip->i_flag |= IN_CHANGE | IN_UPDATE;
484				if (flags & IO_SYNC)
485					bwrite(bp);
486				else
487					bawrite(bp);
488			}
489		}
490		/*
491		 * All blocks are direct blocks
492		 */
493		if (flags & BA_METAONLY)
494			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
495		nb = dp->di_extb[lbn];
496		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
497			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
498			if (error) {
499				brelse(bp);
500				return (error);
501			}
502			bp->b_blkno = fsbtodb(fs, nb);
503			bp->b_xflags |= BX_ALTDATA;
504			*bpp = bp;
505			return (0);
506		}
507		if (nb != 0) {
508			/*
509			 * Consider need to reallocate a fragment.
510			 */
511			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
512			nsize = fragroundup(fs, size);
513			if (nsize <= osize) {
514				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
515				if (error) {
516					brelse(bp);
517					return (error);
518				}
519				bp->b_blkno = fsbtodb(fs, nb);
520				bp->b_xflags |= BX_ALTDATA;
521			} else {
522				error = ffs_realloccg(ip, -1 - lbn,
523				    dp->di_extb[lbn],
524				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
525				    &dp->di_extb[0]), osize, nsize, cred, &bp);
526				if (error)
527					return (error);
528				bp->b_xflags |= BX_ALTDATA;
529				if (DOINGSOFTDEP(vp))
530					softdep_setup_allocext(ip, lbn,
531					    dbtofsb(fs, bp->b_blkno), nb,
532					    nsize, osize, bp);
533			}
534		} else {
535			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
536				nsize = fragroundup(fs, size);
537			else
538				nsize = fs->fs_bsize;
539			error = ffs_alloc(ip, lbn,
540			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
541			   nsize, cred, &newb);
542			if (error)
543				return (error);
544			bp = getblk(vp, -1 - lbn, nsize, 0, 0);
545			bp->b_blkno = fsbtodb(fs, newb);
546			bp->b_xflags |= BX_ALTDATA;
547			if (flags & BA_CLRBUF)
548				vfs_bio_clrbuf(bp);
549			if (DOINGSOFTDEP(vp))
550				softdep_setup_allocext(ip, lbn, newb, 0,
551				    nsize, 0, bp);
552		}
553		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
554		ip->i_flag |= IN_CHANGE | IN_UPDATE;
555		*bpp = bp;
556		return (0);
557	}
558	/*
559	 * If the next write will extend the file into a new block,
560	 * and the file is currently composed of a fragment
561	 * this fragment has to be extended to be a full block.
562	 */
563	lastlbn = lblkno(fs, ip->i_size);
564	if (lastlbn < NDADDR && lastlbn < lbn) {
565		nb = lastlbn;
566		osize = blksize(fs, ip, nb);
567		if (osize < fs->fs_bsize && osize > 0) {
568			error = ffs_realloccg(ip, nb, dp->di_db[nb],
569				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
570				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
571				    cred, &bp);
572			if (error)
573				return (error);
574			if (DOINGSOFTDEP(vp))
575				softdep_setup_allocdirect(ip, nb,
576				    dbtofsb(fs, bp->b_blkno),
577				    dp->di_db[nb],
578				    fs->fs_bsize, osize, bp);
579			ip->i_size = smalllblktosize(fs, nb + 1);
580			dp->di_size = ip->i_size;
581			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
582			ip->i_flag |= IN_CHANGE | IN_UPDATE;
583			if (flags & IO_SYNC)
584				bwrite(bp);
585			else
586				bawrite(bp);
587		}
588	}
589	/*
590	 * The first NDADDR blocks are direct blocks
591	 */
592	if (lbn < NDADDR) {
593		if (flags & BA_METAONLY)
594			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
595		nb = dp->di_db[lbn];
596		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
597			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
598			if (error) {
599				brelse(bp);
600				return (error);
601			}
602			bp->b_blkno = fsbtodb(fs, nb);
603			*bpp = bp;
604			return (0);
605		}
606		if (nb != 0) {
607			/*
608			 * Consider need to reallocate a fragment.
609			 */
610			osize = fragroundup(fs, blkoff(fs, ip->i_size));
611			nsize = fragroundup(fs, size);
612			if (nsize <= osize) {
613				error = bread(vp, lbn, osize, NOCRED, &bp);
614				if (error) {
615					brelse(bp);
616					return (error);
617				}
618				bp->b_blkno = fsbtodb(fs, nb);
619			} else {
620				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
621				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
622				       &dp->di_db[0]), osize, nsize, cred, &bp);
623				if (error)
624					return (error);
625				if (DOINGSOFTDEP(vp))
626					softdep_setup_allocdirect(ip, lbn,
627					    dbtofsb(fs, bp->b_blkno), nb,
628					    nsize, osize, bp);
629			}
630		} else {
631			if (ip->i_size < smalllblktosize(fs, lbn + 1))
632				nsize = fragroundup(fs, size);
633			else
634				nsize = fs->fs_bsize;
635			error = ffs_alloc(ip, lbn,
636			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
637				&dp->di_db[0]), nsize, cred, &newb);
638			if (error)
639				return (error);
640			bp = getblk(vp, lbn, nsize, 0, 0);
641			bp->b_blkno = fsbtodb(fs, newb);
642			if (flags & BA_CLRBUF)
643				vfs_bio_clrbuf(bp);
644			if (DOINGSOFTDEP(vp))
645				softdep_setup_allocdirect(ip, lbn, newb, 0,
646				    nsize, 0, bp);
647		}
648		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
649		ip->i_flag |= IN_CHANGE | IN_UPDATE;
650		*bpp = bp;
651		return (0);
652	}
653	/*
654	 * Determine the number of levels of indirection.
655	 */
656	pref = 0;
657	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
658		return(error);
659#ifdef DIAGNOSTIC
660	if (num < 1)
661		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
662#endif
663	/*
664	 * Fetch the first indirect block allocating if necessary.
665	 */
666	--num;
667	nb = dp->di_ib[indirs[0].in_off];
668	allocib = NULL;
669	allocblk = allociblk;
670	if (nb == 0) {
671		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
672	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
673		    cred, &newb)) != 0)
674			return (error);
675		nb = newb;
676		*allocblk++ = nb;
677		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
678		bp->b_blkno = fsbtodb(fs, nb);
679		vfs_bio_clrbuf(bp);
680		if (DOINGSOFTDEP(vp)) {
681			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
682			    newb, 0, fs->fs_bsize, 0, bp);
683			bdwrite(bp);
684		} else {
685			/*
686			 * Write synchronously so that indirect blocks
687			 * never point at garbage.
688			 */
689			if (DOINGASYNC(vp))
690				bdwrite(bp);
691			else if ((error = bwrite(bp)) != 0)
692				goto fail;
693		}
694		allocib = &dp->di_ib[indirs[0].in_off];
695		*allocib = nb;
696		ip->i_flag |= IN_CHANGE | IN_UPDATE;
697	}
698	/*
699	 * Fetch through the indirect blocks, allocating as necessary.
700	 */
701	for (i = 1;;) {
702		error = bread(vp,
703		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
704		if (error) {
705			brelse(bp);
706			goto fail;
707		}
708		bap = (ufs2_daddr_t *)bp->b_data;
709		nb = bap[indirs[i].in_off];
710		if (i == num)
711			break;
712		i += 1;
713		if (nb != 0) {
714			bqrelse(bp);
715			continue;
716		}
717		if (pref == 0)
718			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
719		if ((error =
720		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
721			brelse(bp);
722			goto fail;
723		}
724		nb = newb;
725		*allocblk++ = nb;
726		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
727		nbp->b_blkno = fsbtodb(fs, nb);
728		vfs_bio_clrbuf(nbp);
729		if (DOINGSOFTDEP(vp)) {
730			softdep_setup_allocindir_meta(nbp, ip, bp,
731			    indirs[i - 1].in_off, nb);
732			bdwrite(nbp);
733		} else {
734			/*
735			 * Write synchronously so that indirect blocks
736			 * never point at garbage.
737			 */
738			if ((error = bwrite(nbp)) != 0) {
739				brelse(bp);
740				goto fail;
741			}
742		}
743		bap[indirs[i - 1].in_off] = nb;
744		if (allocib == NULL && unwindidx < 0)
745			unwindidx = i - 1;
746		/*
747		 * If required, write synchronously, otherwise use
748		 * delayed write.
749		 */
750		if (flags & IO_SYNC) {
751			bwrite(bp);
752		} else {
753			if (bp->b_bufsize == fs->fs_bsize)
754				bp->b_flags |= B_CLUSTEROK;
755			bdwrite(bp);
756		}
757	}
758	/*
759	 * If asked only for the indirect block, then return it.
760	 */
761	if (flags & BA_METAONLY) {
762		*bpp = bp;
763		return (0);
764	}
765	/*
766	 * Get the data block, allocating if necessary.
767	 */
768	if (nb == 0) {
769		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
770		error = ffs_alloc(ip,
771		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
772		if (error) {
773			brelse(bp);
774			goto fail;
775		}
776		nb = newb;
777		*allocblk++ = nb;
778		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
779		nbp->b_blkno = fsbtodb(fs, nb);
780		if (flags & BA_CLRBUF)
781			vfs_bio_clrbuf(nbp);
782		if (DOINGSOFTDEP(vp))
783			softdep_setup_allocindir_page(ip, lbn, bp,
784			    indirs[i].in_off, nb, 0, nbp);
785		bap[indirs[i].in_off] = nb;
786		/*
787		 * If required, write synchronously, otherwise use
788		 * delayed write.
789		 */
790		if (flags & IO_SYNC) {
791			bwrite(bp);
792		} else {
793			if (bp->b_bufsize == fs->fs_bsize)
794				bp->b_flags |= B_CLUSTEROK;
795			bdwrite(bp);
796		}
797		*bpp = nbp;
798		return (0);
799	}
800	brelse(bp);
801	/*
802	 * If requested clear invalid portions of the buffer.  If we
803	 * have to do a read-before-write (typical if BA_CLRBUF is set),
804	 * try to do some read-ahead in the sequential case to reduce
805	 * the number of I/O transactions.
806	 */
807	if (flags & BA_CLRBUF) {
808		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
809		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
810			error = cluster_read(vp, ip->i_size, lbn,
811			    (int)fs->fs_bsize, NOCRED,
812			    MAXBSIZE, seqcount, &nbp);
813		} else {
814			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
815		}
816		if (error) {
817			brelse(nbp);
818			goto fail;
819		}
820	} else {
821		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
822		nbp->b_blkno = fsbtodb(fs, nb);
823	}
824	*bpp = nbp;
825	return (0);
826fail:
827	/*
828	 * If we have failed to allocate any blocks, simply return the error.
829	 * This is the usual case and avoids the need to fsync the file.
830	 */
831	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
832		return (error);
833	/*
834	 * If we have failed part way through block allocation, we
835	 * have to deallocate any indirect blocks that we have allocated.
836	 * We have to fsync the file before we start to get rid of all
837	 * of its dependencies so that we do not leave them dangling.
838	 * We have to sync it at the end so that the soft updates code
839	 * does not find any untracked changes. Although this is really
840	 * slow, running out of disk space is not expected to be a common
841	 * occurence. The error return from fsync is ignored as we already
842	 * have an error to return to the user.
843	 */
844	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
845	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
846		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
847		deallocated += fs->fs_bsize;
848	}
849	if (allocib != NULL) {
850		*allocib = 0;
851	} else if (unwindidx >= 0) {
852		int r;
853
854		r = bread(vp, indirs[unwindidx].in_lbn,
855		    (int)fs->fs_bsize, NOCRED, &bp);
856		if (r) {
857			panic("Could not unwind indirect block, error %d", r);
858			brelse(bp);
859		} else {
860			bap = (ufs2_daddr_t *)bp->b_data;
861			bap[indirs[unwindidx].in_off] = 0;
862			if (flags & IO_SYNC) {
863				bwrite(bp);
864			} else {
865				if (bp->b_bufsize == fs->fs_bsize)
866					bp->b_flags |= B_CLUSTEROK;
867				bdwrite(bp);
868			}
869		}
870	}
871	if (deallocated) {
872#ifdef QUOTA
873		/*
874		 * Restore user's disk quota because allocation failed.
875		 */
876		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
877#endif
878		dp->di_blocks -= btodb(deallocated);
879		ip->i_flag |= IN_CHANGE | IN_UPDATE;
880	}
881	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
882	return (error);
883}
884