ffs_balloc.c revision 173464
1/*-
2 * Copyright (c) 2002 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * Copyright (c) 1982, 1986, 1989, 1993
33 *	The Regents of the University of California.  All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 *    notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 *    notice, this list of conditions and the following disclaimer in the
42 *    documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 *    may be used to endorse or promote products derived from this software
45 *    without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
60 */
61
62#include <sys/cdefs.h>
63__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_balloc.c 173464 2007-11-08 17:21:51Z obrien $");
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/bio.h>
68#include <sys/buf.h>
69#include <sys/lock.h>
70#include <sys/mount.h>
71#include <sys/vnode.h>
72
73#include <ufs/ufs/quota.h>
74#include <ufs/ufs/inode.h>
75#include <ufs/ufs/ufs_extern.h>
76#include <ufs/ufs/extattr.h>
77#include <ufs/ufs/ufsmount.h>
78
79#include <ufs/ffs/fs.h>
80#include <ufs/ffs/ffs_extern.h>
81
82/*
83 * Balloc defines the structure of filesystem storage
84 * by allocating the physical blocks on a device given
85 * the inode and the logical block number in a file.
86 * This is the allocation strategy for UFS1. Below is
87 * the allocation strategy for UFS2.
88 */
89int
90ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91    struct ucred *cred, int flags, struct buf **bpp)
92{
93	struct inode *ip;
94	struct ufs1_dinode *dp;
95	ufs_lbn_t lbn, lastlbn;
96	struct fs *fs;
97	ufs1_daddr_t nb;
98	struct buf *bp, *nbp;
99	struct ufsmount *ump;
100	struct indir indirs[NIADDR + 2];
101	int deallocated, osize, nsize, num, i, error;
102	ufs2_daddr_t newb;
103	ufs1_daddr_t *bap, pref;
104	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105	int unwindidx = -1;
106
107	ip = VTOI(vp);
108	dp = ip->i_din1;
109	fs = ip->i_fs;
110	ump = ip->i_ump;
111	lbn = lblkno(fs, startoffset);
112	size = blkoff(fs, startoffset) + size;
113	if (size > fs->fs_bsize)
114		panic("ffs_balloc_ufs1: blk too big");
115	*bpp = NULL;
116	if (flags & IO_EXT)
117		return (EOPNOTSUPP);
118	if (lbn < 0)
119		return (EFBIG);
120
121	/*
122	 * If the next write will extend the file into a new block,
123	 * and the file is currently composed of a fragment
124	 * this fragment has to be extended to be a full block.
125	 */
126	lastlbn = lblkno(fs, ip->i_size);
127	if (lastlbn < NDADDR && lastlbn < lbn) {
128		nb = lastlbn;
129		osize = blksize(fs, ip, nb);
130		if (osize < fs->fs_bsize && osize > 0) {
131			UFS_LOCK(ump);
132			error = ffs_realloccg(ip, nb, dp->di_db[nb],
133			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
134			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
135			if (error)
136				return (error);
137			if (DOINGSOFTDEP(vp))
138				softdep_setup_allocdirect(ip, nb,
139				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
140				    fs->fs_bsize, osize, bp);
141			ip->i_size = smalllblktosize(fs, nb + 1);
142			dp->di_size = ip->i_size;
143			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
144			ip->i_flag |= IN_CHANGE | IN_UPDATE;
145			if (flags & IO_SYNC)
146				bwrite(bp);
147			else
148				bawrite(bp);
149		}
150	}
151	/*
152	 * The first NDADDR blocks are direct blocks
153	 */
154	if (lbn < NDADDR) {
155		if (flags & BA_METAONLY)
156			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
157		nb = dp->di_db[lbn];
158		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
159			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
160			if (error) {
161				brelse(bp);
162				return (error);
163			}
164			bp->b_blkno = fsbtodb(fs, nb);
165			*bpp = bp;
166			return (0);
167		}
168		if (nb != 0) {
169			/*
170			 * Consider need to reallocate a fragment.
171			 */
172			osize = fragroundup(fs, blkoff(fs, ip->i_size));
173			nsize = fragroundup(fs, size);
174			if (nsize <= osize) {
175				error = bread(vp, lbn, osize, NOCRED, &bp);
176				if (error) {
177					brelse(bp);
178					return (error);
179				}
180				bp->b_blkno = fsbtodb(fs, nb);
181			} else {
182				UFS_LOCK(ump);
183				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
184				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
185				    &dp->di_db[0]), osize, nsize, cred, &bp);
186				if (error)
187					return (error);
188				if (DOINGSOFTDEP(vp))
189					softdep_setup_allocdirect(ip, lbn,
190					    dbtofsb(fs, bp->b_blkno), nb,
191					    nsize, osize, bp);
192			}
193		} else {
194			if (ip->i_size < smalllblktosize(fs, lbn + 1))
195				nsize = fragroundup(fs, size);
196			else
197				nsize = fs->fs_bsize;
198			UFS_LOCK(ump);
199			error = ffs_alloc(ip, lbn,
200			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
201			    nsize, cred, &newb);
202			if (error)
203				return (error);
204			bp = getblk(vp, lbn, nsize, 0, 0, 0);
205			bp->b_blkno = fsbtodb(fs, newb);
206			if (flags & BA_CLRBUF)
207				vfs_bio_clrbuf(bp);
208			if (DOINGSOFTDEP(vp))
209				softdep_setup_allocdirect(ip, lbn, newb, 0,
210				    nsize, 0, bp);
211		}
212		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
213		ip->i_flag |= IN_CHANGE | IN_UPDATE;
214		*bpp = bp;
215		return (0);
216	}
217	/*
218	 * Determine the number of levels of indirection.
219	 */
220	pref = 0;
221	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
222		return(error);
223#ifdef INVARIANTS
224	if (num < 1)
225		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
226#endif
227	/*
228	 * Fetch the first indirect block allocating if necessary.
229	 */
230	--num;
231	nb = dp->di_ib[indirs[0].in_off];
232	allocib = NULL;
233	allocblk = allociblk;
234	if (nb == 0) {
235		UFS_LOCK(ump);
236		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
237	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
238		    cred, &newb)) != 0)
239			return (error);
240		nb = newb;
241		*allocblk++ = nb;
242		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
243		bp->b_blkno = fsbtodb(fs, nb);
244		vfs_bio_clrbuf(bp);
245		if (DOINGSOFTDEP(vp)) {
246			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
247			    newb, 0, fs->fs_bsize, 0, bp);
248			bdwrite(bp);
249		} else {
250			/*
251			 * Write synchronously so that indirect blocks
252			 * never point at garbage.
253			 */
254			if (DOINGASYNC(vp))
255				bdwrite(bp);
256			else if ((error = bwrite(bp)) != 0)
257				goto fail;
258		}
259		allocib = &dp->di_ib[indirs[0].in_off];
260		*allocib = nb;
261		ip->i_flag |= IN_CHANGE | IN_UPDATE;
262	}
263	/*
264	 * Fetch through the indirect blocks, allocating as necessary.
265	 */
266	for (i = 1;;) {
267		error = bread(vp,
268		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
269		if (error) {
270			brelse(bp);
271			goto fail;
272		}
273		bap = (ufs1_daddr_t *)bp->b_data;
274		nb = bap[indirs[i].in_off];
275		if (i == num)
276			break;
277		i += 1;
278		if (nb != 0) {
279			bqrelse(bp);
280			continue;
281		}
282		UFS_LOCK(ump);
283		if (pref == 0)
284			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
285		if ((error =
286		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
287			brelse(bp);
288			goto fail;
289		}
290		nb = newb;
291		*allocblk++ = nb;
292		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
293		nbp->b_blkno = fsbtodb(fs, nb);
294		vfs_bio_clrbuf(nbp);
295		if (DOINGSOFTDEP(vp)) {
296			softdep_setup_allocindir_meta(nbp, ip, bp,
297			    indirs[i - 1].in_off, nb);
298			bdwrite(nbp);
299		} else {
300			/*
301			 * Write synchronously so that indirect blocks
302			 * never point at garbage.
303			 */
304			if ((error = bwrite(nbp)) != 0) {
305				brelse(bp);
306				goto fail;
307			}
308		}
309		bap[indirs[i - 1].in_off] = nb;
310		if (allocib == NULL && unwindidx < 0)
311			unwindidx = i - 1;
312		/*
313		 * If required, write synchronously, otherwise use
314		 * delayed write.
315		 */
316		if (flags & IO_SYNC) {
317			bwrite(bp);
318		} else {
319			if (bp->b_bufsize == fs->fs_bsize)
320				bp->b_flags |= B_CLUSTEROK;
321			bdwrite(bp);
322		}
323	}
324	/*
325	 * If asked only for the indirect block, then return it.
326	 */
327	if (flags & BA_METAONLY) {
328		*bpp = bp;
329		return (0);
330	}
331	/*
332	 * Get the data block, allocating if necessary.
333	 */
334	if (nb == 0) {
335		UFS_LOCK(ump);
336		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
337		error = ffs_alloc(ip,
338		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
339		if (error) {
340			brelse(bp);
341			goto fail;
342		}
343		nb = newb;
344		*allocblk++ = nb;
345		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
346		nbp->b_blkno = fsbtodb(fs, nb);
347		if (flags & BA_CLRBUF)
348			vfs_bio_clrbuf(nbp);
349		if (DOINGSOFTDEP(vp))
350			softdep_setup_allocindir_page(ip, lbn, bp,
351			    indirs[i].in_off, nb, 0, nbp);
352		bap[indirs[i].in_off] = nb;
353		/*
354		 * If required, write synchronously, otherwise use
355		 * delayed write.
356		 */
357		if (flags & IO_SYNC) {
358			bwrite(bp);
359		} else {
360			if (bp->b_bufsize == fs->fs_bsize)
361				bp->b_flags |= B_CLUSTEROK;
362			bdwrite(bp);
363		}
364		*bpp = nbp;
365		return (0);
366	}
367	brelse(bp);
368	if (flags & BA_CLRBUF) {
369		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
370		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
371			error = cluster_read(vp, ip->i_size, lbn,
372			    (int)fs->fs_bsize, NOCRED,
373			    MAXBSIZE, seqcount, &nbp);
374		} else {
375			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
376		}
377		if (error) {
378			brelse(nbp);
379			goto fail;
380		}
381	} else {
382		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
383		nbp->b_blkno = fsbtodb(fs, nb);
384	}
385	*bpp = nbp;
386	return (0);
387fail:
388	/*
389	 * If we have failed to allocate any blocks, simply return the error.
390	 * This is the usual case and avoids the need to fsync the file.
391	 */
392	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
393		return (error);
394	/*
395	 * If we have failed part way through block allocation, we
396	 * have to deallocate any indirect blocks that we have allocated.
397	 * We have to fsync the file before we start to get rid of all
398	 * of its dependencies so that we do not leave them dangling.
399	 * We have to sync it at the end so that the soft updates code
400	 * does not find any untracked changes. Although this is really
401	 * slow, running out of disk space is not expected to be a common
402	 * occurence. The error return from fsync is ignored as we already
403	 * have an error to return to the user.
404	 */
405	(void) ffs_syncvnode(vp, MNT_WAIT);
406	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
407		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
408		    ip->i_number);
409		deallocated += fs->fs_bsize;
410	}
411	if (allocib != NULL) {
412		*allocib = 0;
413	} else if (unwindidx >= 0) {
414		int r;
415
416		r = bread(vp, indirs[unwindidx].in_lbn,
417		    (int)fs->fs_bsize, NOCRED, &bp);
418		if (r) {
419			panic("Could not unwind indirect block, error %d", r);
420			brelse(bp);
421		} else {
422			bap = (ufs1_daddr_t *)bp->b_data;
423			bap[indirs[unwindidx].in_off] = 0;
424			if (flags & IO_SYNC) {
425				bwrite(bp);
426			} else {
427				if (bp->b_bufsize == fs->fs_bsize)
428					bp->b_flags |= B_CLUSTEROK;
429				bdwrite(bp);
430			}
431		}
432	}
433	if (deallocated) {
434#ifdef QUOTA
435		/*
436		 * Restore user's disk quota because allocation failed.
437		 */
438		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
439#endif
440		dp->di_blocks -= btodb(deallocated);
441		ip->i_flag |= IN_CHANGE | IN_UPDATE;
442	}
443	(void) ffs_syncvnode(vp, MNT_WAIT);
444	return (error);
445}
446
447/*
448 * Balloc defines the structure of file system storage
449 * by allocating the physical blocks on a device given
450 * the inode and the logical block number in a file.
451 * This is the allocation strategy for UFS2. Above is
452 * the allocation strategy for UFS1.
453 */
454int
455ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
456    struct ucred *cred, int flags, struct buf **bpp)
457{
458	struct inode *ip;
459	struct ufs2_dinode *dp;
460	ufs_lbn_t lbn, lastlbn;
461	struct fs *fs;
462	struct buf *bp, *nbp;
463	struct ufsmount *ump;
464	struct indir indirs[NIADDR + 2];
465	ufs2_daddr_t nb, newb, *bap, pref;
466	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
467	int deallocated, osize, nsize, num, i, error;
468	int unwindidx = -1;
469
470	ip = VTOI(vp);
471	dp = ip->i_din2;
472	fs = ip->i_fs;
473	ump = ip->i_ump;
474	lbn = lblkno(fs, startoffset);
475	size = blkoff(fs, startoffset) + size;
476	if (size > fs->fs_bsize)
477		panic("ffs_balloc_ufs2: blk too big");
478	*bpp = NULL;
479	if (lbn < 0)
480		return (EFBIG);
481
482	/*
483	 * Check for allocating external data.
484	 */
485	if (flags & IO_EXT) {
486		if (lbn >= NXADDR)
487			return (EFBIG);
488		/*
489		 * If the next write will extend the data into a new block,
490		 * and the data is currently composed of a fragment
491		 * this fragment has to be extended to be a full block.
492		 */
493		lastlbn = lblkno(fs, dp->di_extsize);
494		if (lastlbn < lbn) {
495			nb = lastlbn;
496			osize = sblksize(fs, dp->di_extsize, nb);
497			if (osize < fs->fs_bsize && osize > 0) {
498				UFS_LOCK(ump);
499				error = ffs_realloccg(ip, -1 - nb,
500				    dp->di_extb[nb],
501				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
502				    &dp->di_extb[0]), osize,
503				    (int)fs->fs_bsize, cred, &bp);
504				if (error)
505					return (error);
506				if (DOINGSOFTDEP(vp))
507					softdep_setup_allocext(ip, nb,
508					    dbtofsb(fs, bp->b_blkno),
509					    dp->di_extb[nb],
510					    fs->fs_bsize, osize, bp);
511				dp->di_extsize = smalllblktosize(fs, nb + 1);
512				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
513				bp->b_xflags |= BX_ALTDATA;
514				ip->i_flag |= IN_CHANGE | IN_UPDATE;
515				if (flags & IO_SYNC)
516					bwrite(bp);
517				else
518					bawrite(bp);
519			}
520		}
521		/*
522		 * All blocks are direct blocks
523		 */
524		if (flags & BA_METAONLY)
525			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
526		nb = dp->di_extb[lbn];
527		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
528			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
529			if (error) {
530				brelse(bp);
531				return (error);
532			}
533			bp->b_blkno = fsbtodb(fs, nb);
534			bp->b_xflags |= BX_ALTDATA;
535			*bpp = bp;
536			return (0);
537		}
538		if (nb != 0) {
539			/*
540			 * Consider need to reallocate a fragment.
541			 */
542			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
543			nsize = fragroundup(fs, size);
544			if (nsize <= osize) {
545				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
546				if (error) {
547					brelse(bp);
548					return (error);
549				}
550				bp->b_blkno = fsbtodb(fs, nb);
551				bp->b_xflags |= BX_ALTDATA;
552			} else {
553				UFS_LOCK(ump);
554				error = ffs_realloccg(ip, -1 - lbn,
555				    dp->di_extb[lbn],
556				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
557				    &dp->di_extb[0]), osize, nsize, cred, &bp);
558				if (error)
559					return (error);
560				bp->b_xflags |= BX_ALTDATA;
561				if (DOINGSOFTDEP(vp))
562					softdep_setup_allocext(ip, lbn,
563					    dbtofsb(fs, bp->b_blkno), nb,
564					    nsize, osize, bp);
565			}
566		} else {
567			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
568				nsize = fragroundup(fs, size);
569			else
570				nsize = fs->fs_bsize;
571			UFS_LOCK(ump);
572			error = ffs_alloc(ip, lbn,
573			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
574			   nsize, cred, &newb);
575			if (error)
576				return (error);
577			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
578			bp->b_blkno = fsbtodb(fs, newb);
579			bp->b_xflags |= BX_ALTDATA;
580			if (flags & BA_CLRBUF)
581				vfs_bio_clrbuf(bp);
582			if (DOINGSOFTDEP(vp))
583				softdep_setup_allocext(ip, lbn, newb, 0,
584				    nsize, 0, bp);
585		}
586		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
587		ip->i_flag |= IN_CHANGE | IN_UPDATE;
588		*bpp = bp;
589		return (0);
590	}
591	/*
592	 * If the next write will extend the file into a new block,
593	 * and the file is currently composed of a fragment
594	 * this fragment has to be extended to be a full block.
595	 */
596	lastlbn = lblkno(fs, ip->i_size);
597	if (lastlbn < NDADDR && lastlbn < lbn) {
598		nb = lastlbn;
599		osize = blksize(fs, ip, nb);
600		if (osize < fs->fs_bsize && osize > 0) {
601			UFS_LOCK(ump);
602			error = ffs_realloccg(ip, nb, dp->di_db[nb],
603				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
604				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
605				    cred, &bp);
606			if (error)
607				return (error);
608			if (DOINGSOFTDEP(vp))
609				softdep_setup_allocdirect(ip, nb,
610				    dbtofsb(fs, bp->b_blkno),
611				    dp->di_db[nb],
612				    fs->fs_bsize, osize, bp);
613			ip->i_size = smalllblktosize(fs, nb + 1);
614			dp->di_size = ip->i_size;
615			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
616			ip->i_flag |= IN_CHANGE | IN_UPDATE;
617			if (flags & IO_SYNC)
618				bwrite(bp);
619			else
620				bawrite(bp);
621		}
622	}
623	/*
624	 * The first NDADDR blocks are direct blocks
625	 */
626	if (lbn < NDADDR) {
627		if (flags & BA_METAONLY)
628			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
629		nb = dp->di_db[lbn];
630		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
631			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
632			if (error) {
633				brelse(bp);
634				return (error);
635			}
636			bp->b_blkno = fsbtodb(fs, nb);
637			*bpp = bp;
638			return (0);
639		}
640		if (nb != 0) {
641			/*
642			 * Consider need to reallocate a fragment.
643			 */
644			osize = fragroundup(fs, blkoff(fs, ip->i_size));
645			nsize = fragroundup(fs, size);
646			if (nsize <= osize) {
647				error = bread(vp, lbn, osize, NOCRED, &bp);
648				if (error) {
649					brelse(bp);
650					return (error);
651				}
652				bp->b_blkno = fsbtodb(fs, nb);
653			} else {
654				UFS_LOCK(ump);
655				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
656				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
657				       &dp->di_db[0]), osize, nsize, cred, &bp);
658				if (error)
659					return (error);
660				if (DOINGSOFTDEP(vp))
661					softdep_setup_allocdirect(ip, lbn,
662					    dbtofsb(fs, bp->b_blkno), nb,
663					    nsize, osize, bp);
664			}
665		} else {
666			if (ip->i_size < smalllblktosize(fs, lbn + 1))
667				nsize = fragroundup(fs, size);
668			else
669				nsize = fs->fs_bsize;
670			UFS_LOCK(ump);
671			error = ffs_alloc(ip, lbn,
672			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
673				&dp->di_db[0]), nsize, cred, &newb);
674			if (error)
675				return (error);
676			bp = getblk(vp, lbn, nsize, 0, 0, 0);
677			bp->b_blkno = fsbtodb(fs, newb);
678			if (flags & BA_CLRBUF)
679				vfs_bio_clrbuf(bp);
680			if (DOINGSOFTDEP(vp))
681				softdep_setup_allocdirect(ip, lbn, newb, 0,
682				    nsize, 0, bp);
683		}
684		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
685		ip->i_flag |= IN_CHANGE | IN_UPDATE;
686		*bpp = bp;
687		return (0);
688	}
689	/*
690	 * Determine the number of levels of indirection.
691	 */
692	pref = 0;
693	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
694		return(error);
695#ifdef INVARIANTS
696	if (num < 1)
697		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
698#endif
699	/*
700	 * Fetch the first indirect block allocating if necessary.
701	 */
702	--num;
703	nb = dp->di_ib[indirs[0].in_off];
704	allocib = NULL;
705	allocblk = allociblk;
706	if (nb == 0) {
707		UFS_LOCK(ump);
708		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
709	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
710		    cred, &newb)) != 0)
711			return (error);
712		nb = newb;
713		*allocblk++ = nb;
714		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
715		bp->b_blkno = fsbtodb(fs, nb);
716		vfs_bio_clrbuf(bp);
717		if (DOINGSOFTDEP(vp)) {
718			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
719			    newb, 0, fs->fs_bsize, 0, bp);
720			bdwrite(bp);
721		} else {
722			/*
723			 * Write synchronously so that indirect blocks
724			 * never point at garbage.
725			 */
726			if (DOINGASYNC(vp))
727				bdwrite(bp);
728			else if ((error = bwrite(bp)) != 0)
729				goto fail;
730		}
731		allocib = &dp->di_ib[indirs[0].in_off];
732		*allocib = nb;
733		ip->i_flag |= IN_CHANGE | IN_UPDATE;
734	}
735	/*
736	 * Fetch through the indirect blocks, allocating as necessary.
737	 */
738	for (i = 1;;) {
739		error = bread(vp,
740		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
741		if (error) {
742			brelse(bp);
743			goto fail;
744		}
745		bap = (ufs2_daddr_t *)bp->b_data;
746		nb = bap[indirs[i].in_off];
747		if (i == num)
748			break;
749		i += 1;
750		if (nb != 0) {
751			bqrelse(bp);
752			continue;
753		}
754		UFS_LOCK(ump);
755		if (pref == 0)
756			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
757		if ((error =
758		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
759			brelse(bp);
760			goto fail;
761		}
762		nb = newb;
763		*allocblk++ = nb;
764		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
765		nbp->b_blkno = fsbtodb(fs, nb);
766		vfs_bio_clrbuf(nbp);
767		if (DOINGSOFTDEP(vp)) {
768			softdep_setup_allocindir_meta(nbp, ip, bp,
769			    indirs[i - 1].in_off, nb);
770			bdwrite(nbp);
771		} else {
772			/*
773			 * Write synchronously so that indirect blocks
774			 * never point at garbage.
775			 */
776			if ((error = bwrite(nbp)) != 0) {
777				brelse(bp);
778				goto fail;
779			}
780		}
781		bap[indirs[i - 1].in_off] = nb;
782		if (allocib == NULL && unwindidx < 0)
783			unwindidx = i - 1;
784		/*
785		 * If required, write synchronously, otherwise use
786		 * delayed write.
787		 */
788		if (flags & IO_SYNC) {
789			bwrite(bp);
790		} else {
791			if (bp->b_bufsize == fs->fs_bsize)
792				bp->b_flags |= B_CLUSTEROK;
793			bdwrite(bp);
794		}
795	}
796	/*
797	 * If asked only for the indirect block, then return it.
798	 */
799	if (flags & BA_METAONLY) {
800		*bpp = bp;
801		return (0);
802	}
803	/*
804	 * Get the data block, allocating if necessary.
805	 */
806	if (nb == 0) {
807		UFS_LOCK(ump);
808		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
809		error = ffs_alloc(ip,
810		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
811		if (error) {
812			brelse(bp);
813			goto fail;
814		}
815		nb = newb;
816		*allocblk++ = nb;
817		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
818		nbp->b_blkno = fsbtodb(fs, nb);
819		if (flags & BA_CLRBUF)
820			vfs_bio_clrbuf(nbp);
821		if (DOINGSOFTDEP(vp))
822			softdep_setup_allocindir_page(ip, lbn, bp,
823			    indirs[i].in_off, nb, 0, nbp);
824		bap[indirs[i].in_off] = nb;
825		/*
826		 * If required, write synchronously, otherwise use
827		 * delayed write.
828		 */
829		if (flags & IO_SYNC) {
830			bwrite(bp);
831		} else {
832			if (bp->b_bufsize == fs->fs_bsize)
833				bp->b_flags |= B_CLUSTEROK;
834			bdwrite(bp);
835		}
836		*bpp = nbp;
837		return (0);
838	}
839	brelse(bp);
840	/*
841	 * If requested clear invalid portions of the buffer.  If we
842	 * have to do a read-before-write (typical if BA_CLRBUF is set),
843	 * try to do some read-ahead in the sequential case to reduce
844	 * the number of I/O transactions.
845	 */
846	if (flags & BA_CLRBUF) {
847		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
848		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
849			error = cluster_read(vp, ip->i_size, lbn,
850			    (int)fs->fs_bsize, NOCRED,
851			    MAXBSIZE, seqcount, &nbp);
852		} else {
853			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
854		}
855		if (error) {
856			brelse(nbp);
857			goto fail;
858		}
859	} else {
860		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
861		nbp->b_blkno = fsbtodb(fs, nb);
862	}
863	*bpp = nbp;
864	return (0);
865fail:
866	/*
867	 * If we have failed to allocate any blocks, simply return the error.
868	 * This is the usual case and avoids the need to fsync the file.
869	 */
870	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
871		return (error);
872	/*
873	 * If we have failed part way through block allocation, we
874	 * have to deallocate any indirect blocks that we have allocated.
875	 * We have to fsync the file before we start to get rid of all
876	 * of its dependencies so that we do not leave them dangling.
877	 * We have to sync it at the end so that the soft updates code
878	 * does not find any untracked changes. Although this is really
879	 * slow, running out of disk space is not expected to be a common
880	 * occurence. The error return from fsync is ignored as we already
881	 * have an error to return to the user.
882	 */
883	(void) ffs_syncvnode(vp, MNT_WAIT);
884	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
885		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
886		    ip->i_number);
887		deallocated += fs->fs_bsize;
888	}
889	if (allocib != NULL) {
890		*allocib = 0;
891	} else if (unwindidx >= 0) {
892		int r;
893
894		r = bread(vp, indirs[unwindidx].in_lbn,
895		    (int)fs->fs_bsize, NOCRED, &bp);
896		if (r) {
897			panic("Could not unwind indirect block, error %d", r);
898			brelse(bp);
899		} else {
900			bap = (ufs2_daddr_t *)bp->b_data;
901			bap[indirs[unwindidx].in_off] = 0;
902			if (flags & IO_SYNC) {
903				bwrite(bp);
904			} else {
905				if (bp->b_bufsize == fs->fs_bsize)
906					bp->b_flags |= B_CLUSTEROK;
907				bdwrite(bp);
908			}
909		}
910	}
911	if (deallocated) {
912#ifdef QUOTA
913		/*
914		 * Restore user's disk quota because allocation failed.
915		 */
916		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
917#endif
918		dp->di_blocks -= btodb(deallocated);
919		ip->i_flag |= IN_CHANGE | IN_UPDATE;
920	}
921	(void) ffs_syncvnode(vp, MNT_WAIT);
922	return (error);
923}
924