ffs_balloc.c revision 175068
1/*-
2 * Copyright (c) 2002 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * Copyright (c) 1982, 1986, 1989, 1993
33 *	The Regents of the University of California.  All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 *    notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 *    notice, this list of conditions and the following disclaimer in the
42 *    documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 *    may be used to endorse or promote products derived from this software
45 *    without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
60 */
61
62#include <sys/cdefs.h>
63__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_balloc.c 175068 2008-01-03 12:28:57Z kib $");
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/bio.h>
68#include <sys/buf.h>
69#include <sys/lock.h>
70#include <sys/mount.h>
71#include <sys/vnode.h>
72
73#include <ufs/ufs/quota.h>
74#include <ufs/ufs/inode.h>
75#include <ufs/ufs/ufs_extern.h>
76#include <ufs/ufs/extattr.h>
77#include <ufs/ufs/ufsmount.h>
78
79#include <ufs/ffs/fs.h>
80#include <ufs/ffs/ffs_extern.h>
81
82/*
83 * Balloc defines the structure of filesystem storage
84 * by allocating the physical blocks on a device given
85 * the inode and the logical block number in a file.
86 * This is the allocation strategy for UFS1. Below is
87 * the allocation strategy for UFS2.
 *
 * Arguments:
 *	vp		vnode of the file being allocated into
 *	startoffset	byte offset in the file where the write begins
 *	size		bytes needed; combined with the offset within the
 *			block it may not exceed one filesystem block
 *	cred		credentials charged for the new blocks (quota)
 *	flags		IO_SYNC, IO_EXT, BA_CLRBUF, BA_METAONLY and the
 *			BA_SEQMASK sequential read-ahead hint
 *	bpp		on success *bpp is the buffer for the data block
 *			(or the indirect block itself when BA_METAONLY)
 *
 * Returns 0 on success or an errno value; on failure every block
 * allocated by this call is released again in the "fail:" section.
88 */
89int
90ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91    struct ucred *cred, int flags, struct buf **bpp)
92{
93	struct inode *ip;
94	struct ufs1_dinode *dp;
95	ufs_lbn_t lbn, lastlbn;
96	struct fs *fs;
97	ufs1_daddr_t nb;
98	struct buf *bp, *nbp;
99	struct ufsmount *ump;
100	struct indir indirs[NIADDR + 2];
101	int deallocated, osize, nsize, num, i, error;
102	ufs2_daddr_t newb;
103	ufs1_daddr_t *bap, pref;
104	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106	int unwindidx = -1;
107
108	ip = VTOI(vp);
109	dp = ip->i_din1;
110	fs = ip->i_fs;
111	ump = ip->i_ump;
112	lbn = lblkno(fs, startoffset);
	/* Fold the offset within the block into the request size. */
113	size = blkoff(fs, startoffset) + size;
114	if (size > fs->fs_bsize)
115		panic("ffs_balloc_ufs1: blk too big");
116	*bpp = NULL;
	/* The extended attribute area exists only on UFS2 (see below). */
117	if (flags & IO_EXT)
118		return (EOPNOTSUPP);
119	if (lbn < 0)
120		return (EFBIG);
121
122	/*
123	 * If the next write will extend the file into a new block,
124	 * and the file is currently composed of a fragment
125	 * this fragment has to be extended to be a full block.
126	 */
127	lastlbn = lblkno(fs, ip->i_size);
128	if (lastlbn < NDADDR && lastlbn < lbn) {
129		nb = lastlbn;
130		osize = blksize(fs, ip, nb);
131		if (osize < fs->fs_bsize && osize > 0) {
132			UFS_LOCK(ump);
			/*
			 * NOTE(review): ffs_realloccg()/ffs_alloc() are
			 * entered with the UFS lock held and presumably
			 * release it internally — confirm in ffs_alloc.c.
			 */
133			error = ffs_realloccg(ip, nb, dp->di_db[nb],
134			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
135			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
136			if (error)
137				return (error);
138			if (DOINGSOFTDEP(vp))
139				softdep_setup_allocdirect(ip, nb,
140				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
141				    fs->fs_bsize, osize, bp);
142			ip->i_size = smalllblktosize(fs, nb + 1);
143			dp->di_size = ip->i_size;
144			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
145			ip->i_flag |= IN_CHANGE | IN_UPDATE;
146			if (flags & IO_SYNC)
147				bwrite(bp);
148			else
149				bawrite(bp);
150		}
151	}
152	/*
153	 * The first NDADDR blocks are direct blocks
154	 */
155	if (lbn < NDADDR) {
156		if (flags & BA_METAONLY)
157			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
158		nb = dp->di_db[lbn];
		/*
		 * Block already allocated at full size; just read it in.
		 */
159		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
160			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
161			if (error) {
162				brelse(bp);
163				return (error);
164			}
165			bp->b_blkno = fsbtodb(fs, nb);
166			*bpp = bp;
167			return (0);
168		}
169		if (nb != 0) {
170			/*
171			 * Consider need to reallocate a fragment.
172			 */
173			osize = fragroundup(fs, blkoff(fs, ip->i_size));
174			nsize = fragroundup(fs, size);
175			if (nsize <= osize) {
				/* The existing fragment is big enough. */
176				error = bread(vp, lbn, osize, NOCRED, &bp);
177				if (error) {
178					brelse(bp);
179					return (error);
180				}
181				bp->b_blkno = fsbtodb(fs, nb);
182			} else {
183				UFS_LOCK(ump);
184				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
185				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
186				    &dp->di_db[0]), osize, nsize, cred, &bp);
187				if (error)
188					return (error);
189				if (DOINGSOFTDEP(vp))
190					softdep_setup_allocdirect(ip, lbn,
191					    dbtofsb(fs, bp->b_blkno), nb,
192					    nsize, osize, bp);
193			}
194		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last block of the file, otherwise a full block.
			 */
195			if (ip->i_size < smalllblktosize(fs, lbn + 1))
196				nsize = fragroundup(fs, size);
197			else
198				nsize = fs->fs_bsize;
199			UFS_LOCK(ump);
200			error = ffs_alloc(ip, lbn,
201			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
202			    nsize, cred, &newb);
203			if (error)
204				return (error);
205			bp = getblk(vp, lbn, nsize, 0, 0, 0);
206			bp->b_blkno = fsbtodb(fs, newb);
207			if (flags & BA_CLRBUF)
208				vfs_bio_clrbuf(bp);
209			if (DOINGSOFTDEP(vp))
210				softdep_setup_allocdirect(ip, lbn, newb, 0,
211				    nsize, 0, bp);
212		}
213		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
214		ip->i_flag |= IN_CHANGE | IN_UPDATE;
215		*bpp = bp;
216		return (0);
217	}
218	/*
219	 * Determine the number of levels of indirection.
220	 */
221	pref = 0;
222	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
223		return(error);
224#ifdef INVARIANTS
225	if (num < 1)
226		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
227#endif
228	/*
229	 * Fetch the first indirect block allocating if necessary.
230	 */
231	--num;
232	nb = dp->di_ib[indirs[0].in_off];
233	allocib = NULL;
234	allocblk = allociblk;
235	lbns_remfree = lbns;
236	if (nb == 0) {
237		UFS_LOCK(ump);
238		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
239	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
240		    cred, &newb)) != 0)
241			return (error);
242		nb = newb;
		/* Record block and lbn for the unwind in the fail path. */
243		*allocblk++ = nb;
244		*lbns_remfree++ = indirs[1].in_lbn;
245		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
246		bp->b_blkno = fsbtodb(fs, nb);
247		vfs_bio_clrbuf(bp);
248		if (DOINGSOFTDEP(vp)) {
249			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
250			    newb, 0, fs->fs_bsize, 0, bp);
251			bdwrite(bp);
252		} else {
253			/*
254			 * Write synchronously so that indirect blocks
255			 * never point at garbage.
256			 */
257			if (DOINGASYNC(vp))
258				bdwrite(bp);
259			else if ((error = bwrite(bp)) != 0)
260				goto fail;
261		}
262		allocib = &dp->di_ib[indirs[0].in_off];
263		*allocib = nb;
264		ip->i_flag |= IN_CHANGE | IN_UPDATE;
265	}
266	/*
267	 * Fetch through the indirect blocks, allocating as necessary.
268	 */
269	for (i = 1;;) {
270		error = bread(vp,
271		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
272		if (error) {
273			brelse(bp);
274			goto fail;
275		}
276		bap = (ufs1_daddr_t *)bp->b_data;
277		nb = bap[indirs[i].in_off];
278		if (i == num)
279			break;
280		i += 1;
281		if (nb != 0) {
282			bqrelse(bp);
283			continue;
284		}
285		UFS_LOCK(ump);
		/* Compute the placement preference only once per call. */
286		if (pref == 0)
287			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
288		if ((error =
289		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
290			brelse(bp);
291			goto fail;
292		}
293		nb = newb;
294		*allocblk++ = nb;
295		*lbns_remfree++ = indirs[i].in_lbn;
296		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
297		nbp->b_blkno = fsbtodb(fs, nb);
298		vfs_bio_clrbuf(nbp);
299		if (DOINGSOFTDEP(vp)) {
300			softdep_setup_allocindir_meta(nbp, ip, bp,
301			    indirs[i - 1].in_off, nb);
302			bdwrite(nbp);
303		} else {
304			/*
305			 * Write synchronously so that indirect blocks
306			 * never point at garbage.
307			 */
308			if ((error = bwrite(nbp)) != 0) {
309				brelse(bp);
310				goto fail;
311			}
312		}
313		bap[indirs[i - 1].in_off] = nb;
		/*
		 * Remember the first indirect block modified so the fail
		 * path can clear the pointer stored above.
		 */
314		if (allocib == NULL && unwindidx < 0)
315			unwindidx = i - 1;
316		/*
317		 * If required, write synchronously, otherwise use
318		 * delayed write.
319		 */
320		if (flags & IO_SYNC) {
321			bwrite(bp);
322		} else {
323			if (bp->b_bufsize == fs->fs_bsize)
324				bp->b_flags |= B_CLUSTEROK;
325			bdwrite(bp);
326		}
327	}
328	/*
329	 * If asked only for the indirect block, then return it.
330	 */
331	if (flags & BA_METAONLY) {
332		*bpp = bp;
333		return (0);
334	}
335	/*
336	 * Get the data block, allocating if necessary.
337	 */
338	if (nb == 0) {
339		UFS_LOCK(ump);
340		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
341		error = ffs_alloc(ip,
342		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
343		if (error) {
344			brelse(bp);
345			goto fail;
346		}
347		nb = newb;
348		*allocblk++ = nb;
349		*lbns_remfree++ = lbn;
350		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
351		nbp->b_blkno = fsbtodb(fs, nb);
352		if (flags & BA_CLRBUF)
353			vfs_bio_clrbuf(nbp);
354		if (DOINGSOFTDEP(vp))
355			softdep_setup_allocindir_page(ip, lbn, bp,
356			    indirs[i].in_off, nb, 0, nbp);
357		bap[indirs[i].in_off] = nb;
358		/*
359		 * If required, write synchronously, otherwise use
360		 * delayed write.
361		 */
362		if (flags & IO_SYNC) {
363			bwrite(bp);
364		} else {
365			if (bp->b_bufsize == fs->fs_bsize)
366				bp->b_flags |= B_CLUSTEROK;
367			bdwrite(bp);
368		}
369		*bpp = nbp;
370		return (0);
371	}
372	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
373	if (flags & BA_CLRBUF) {
374		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
375		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
376			error = cluster_read(vp, ip->i_size, lbn,
377			    (int)fs->fs_bsize, NOCRED,
378			    MAXBSIZE, seqcount, &nbp);
379		} else {
380			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
381		}
382		if (error) {
383			brelse(nbp);
384			goto fail;
385		}
386	} else {
387		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
388		nbp->b_blkno = fsbtodb(fs, nb);
389	}
390	*bpp = nbp;
391	return (0);
392fail:
393	/*
394	 * If we have failed to allocate any blocks, simply return the error.
395	 * This is the usual case and avoids the need to fsync the file.
396	 */
397	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
398		return (error);
399	/*
400	 * If we have failed part way through block allocation, we
401	 * have to deallocate any indirect blocks that we have allocated.
402	 * We have to fsync the file before we start to get rid of all
403	 * of its dependencies so that we do not leave them dangling.
404	 * We have to sync it at the end so that the soft updates code
405	 * does not find any untracked changes. Although this is really
406	 * slow, running out of disk space is not expected to be a common
407	 * occurrence. The error return from fsync is ignored as we already
408	 * have an error to return to the user.
409	 */
410	(void) ffs_syncvnode(vp, MNT_WAIT);
411	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
412	     blkp < allocblk; blkp++, lbns_remfree++) {
413		/*
414		 * We shall not leave the freed blocks on the vnode
415		 * buffer object lists.
416		 */
417		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
418		if (bp != NULL) {
419			bp->b_flags |= (B_INVAL | B_RELBUF);
420			bp->b_flags &= ~B_ASYNC;
421			brelse(bp);
422		}
423		deallocated += fs->fs_bsize;
424	}
425	if (allocib != NULL) {
		/* A new top-level indirect was installed; detach it. */
426		*allocib = 0;
427	} else if (unwindidx >= 0) {
428		int r;
429
430		r = bread(vp, indirs[unwindidx].in_lbn,
431		    (int)fs->fs_bsize, NOCRED, &bp);
432		if (r) {
			/* XXX the brelse() below is unreachable after panic(). */
433			panic("Could not unwind indirect block, error %d", r);
434			brelse(bp);
435		} else {
436			bap = (ufs1_daddr_t *)bp->b_data;
437			bap[indirs[unwindidx].in_off] = 0;
438			if (flags & IO_SYNC) {
439				bwrite(bp);
440			} else {
441				if (bp->b_bufsize == fs->fs_bsize)
442					bp->b_flags |= B_CLUSTEROK;
443				bdwrite(bp);
444			}
445		}
446	}
447	if (deallocated) {
448#ifdef QUOTA
449		/*
450		 * Restore user's disk quota because allocation failed.
451		 */
452		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
453#endif
454		dp->di_blocks -= btodb(deallocated);
455		ip->i_flag |= IN_CHANGE | IN_UPDATE;
456	}
457	(void) ffs_syncvnode(vp, MNT_WAIT);
458	/*
459	 * After the buffers are invalidated and on-disk pointers are
460	 * cleared, free the blocks.
461	 */
462	for (blkp = allociblk; blkp < allocblk; blkp++) {
463		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
464		    ip->i_number);
465	}
466	return (error);
467}
468
469/*
470 * Balloc defines the structure of file system storage
471 * by allocating the physical blocks on a device given
472 * the inode and the logical block number in a file.
473 * This is the allocation strategy for UFS2. Above is
474 * the allocation strategy for UFS1.
 *
 * Arguments and return semantics match ffs_balloc_ufs1() above,
 * except that IO_EXT is supported here: it allocates into the
 * inode's extended attribute area (di_extb[]) instead of the file
 * data.  Extended attribute buffers are addressed on the vnode with
 * negated logical block numbers (-1 - lbn) and carry BX_ALTDATA.
475 */
476int
477ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
478    struct ucred *cred, int flags, struct buf **bpp)
479{
480	struct inode *ip;
481	struct ufs2_dinode *dp;
482	ufs_lbn_t lbn, lastlbn;
483	struct fs *fs;
484	struct buf *bp, *nbp;
485	struct ufsmount *ump;
486	struct indir indirs[NIADDR + 2];
487	ufs2_daddr_t nb, newb, *bap, pref;
488	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
489	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
490	int deallocated, osize, nsize, num, i, error;
491	int unwindidx = -1;
492
493	ip = VTOI(vp);
494	dp = ip->i_din2;
495	fs = ip->i_fs;
496	ump = ip->i_ump;
497	lbn = lblkno(fs, startoffset);
	/* Fold the offset within the block into the request size. */
498	size = blkoff(fs, startoffset) + size;
499	if (size > fs->fs_bsize)
500		panic("ffs_balloc_ufs2: blk too big");
501	*bpp = NULL;
502	if (lbn < 0)
503		return (EFBIG);
504
505	/*
506	 * Check for allocating external data.
507	 */
508	if (flags & IO_EXT) {
509		if (lbn >= NXADDR)
510			return (EFBIG);
511		/*
512		 * If the next write will extend the data into a new block,
513		 * and the data is currently composed of a fragment
514		 * this fragment has to be extended to be a full block.
515		 */
516		lastlbn = lblkno(fs, dp->di_extsize);
517		if (lastlbn < lbn) {
518			nb = lastlbn;
519			osize = sblksize(fs, dp->di_extsize, nb);
520			if (osize < fs->fs_bsize && osize > 0) {
521				UFS_LOCK(ump);
				/*
				 * NOTE(review): the allocator routines are
				 * entered with the UFS lock held and
				 * presumably release it internally —
				 * confirm in ffs_alloc.c.
				 */
522				error = ffs_realloccg(ip, -1 - nb,
523				    dp->di_extb[nb],
524				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
525				    &dp->di_extb[0]), osize,
526				    (int)fs->fs_bsize, cred, &bp);
527				if (error)
528					return (error);
529				if (DOINGSOFTDEP(vp))
530					softdep_setup_allocext(ip, nb,
531					    dbtofsb(fs, bp->b_blkno),
532					    dp->di_extb[nb],
533					    fs->fs_bsize, osize, bp);
534				dp->di_extsize = smalllblktosize(fs, nb + 1);
535				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
536				bp->b_xflags |= BX_ALTDATA;
537				ip->i_flag |= IN_CHANGE | IN_UPDATE;
538				if (flags & IO_SYNC)
539					bwrite(bp);
540				else
541					bawrite(bp);
542			}
543		}
544		/*
545		 * All blocks are direct blocks
546		 */
547		if (flags & BA_METAONLY)
548			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
549		nb = dp->di_extb[lbn];
		/*
		 * Block already allocated at full size; just read it in.
		 */
550		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
551			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
552			if (error) {
553				brelse(bp);
554				return (error);
555			}
556			bp->b_blkno = fsbtodb(fs, nb);
557			bp->b_xflags |= BX_ALTDATA;
558			*bpp = bp;
559			return (0);
560		}
561		if (nb != 0) {
562			/*
563			 * Consider need to reallocate a fragment.
564			 */
565			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
566			nsize = fragroundup(fs, size);
567			if (nsize <= osize) {
				/* The existing fragment is big enough. */
568				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
569				if (error) {
570					brelse(bp);
571					return (error);
572				}
573				bp->b_blkno = fsbtodb(fs, nb);
574				bp->b_xflags |= BX_ALTDATA;
575			} else {
576				UFS_LOCK(ump);
577				error = ffs_realloccg(ip, -1 - lbn,
578				    dp->di_extb[lbn],
579				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
580				    &dp->di_extb[0]), osize, nsize, cred, &bp);
581				if (error)
582					return (error);
583				bp->b_xflags |= BX_ALTDATA;
584				if (DOINGSOFTDEP(vp))
585					softdep_setup_allocext(ip, lbn,
586					    dbtofsb(fs, bp->b_blkno), nb,
587					    nsize, osize, bp);
588			}
589		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last block of the area, otherwise a full block.
			 */
590			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
591				nsize = fragroundup(fs, size);
592			else
593				nsize = fs->fs_bsize;
594			UFS_LOCK(ump);
595			error = ffs_alloc(ip, lbn,
596			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
597			   nsize, cred, &newb);
598			if (error)
599				return (error);
600			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
601			bp->b_blkno = fsbtodb(fs, newb);
602			bp->b_xflags |= BX_ALTDATA;
603			if (flags & BA_CLRBUF)
604				vfs_bio_clrbuf(bp);
605			if (DOINGSOFTDEP(vp))
606				softdep_setup_allocext(ip, lbn, newb, 0,
607				    nsize, 0, bp);
608		}
609		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
610		ip->i_flag |= IN_CHANGE | IN_UPDATE;
611		*bpp = bp;
612		return (0);
613	}
614	/*
615	 * If the next write will extend the file into a new block,
616	 * and the file is currently composed of a fragment
617	 * this fragment has to be extended to be a full block.
618	 */
619	lastlbn = lblkno(fs, ip->i_size);
620	if (lastlbn < NDADDR && lastlbn < lbn) {
621		nb = lastlbn;
622		osize = blksize(fs, ip, nb);
623		if (osize < fs->fs_bsize && osize > 0) {
624			UFS_LOCK(ump);
625			error = ffs_realloccg(ip, nb, dp->di_db[nb],
626				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
627				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
628				    cred, &bp);
629			if (error)
630				return (error);
631			if (DOINGSOFTDEP(vp))
632				softdep_setup_allocdirect(ip, nb,
633				    dbtofsb(fs, bp->b_blkno),
634				    dp->di_db[nb],
635				    fs->fs_bsize, osize, bp);
636			ip->i_size = smalllblktosize(fs, nb + 1);
637			dp->di_size = ip->i_size;
638			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
639			ip->i_flag |= IN_CHANGE | IN_UPDATE;
640			if (flags & IO_SYNC)
641				bwrite(bp);
642			else
643				bawrite(bp);
644		}
645	}
646	/*
647	 * The first NDADDR blocks are direct blocks
648	 */
649	if (lbn < NDADDR) {
650		if (flags & BA_METAONLY)
651			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
652		nb = dp->di_db[lbn];
		/*
		 * Block already allocated at full size; just read it in.
		 */
653		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
654			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
655			if (error) {
656				brelse(bp);
657				return (error);
658			}
659			bp->b_blkno = fsbtodb(fs, nb);
660			*bpp = bp;
661			return (0);
662		}
663		if (nb != 0) {
664			/*
665			 * Consider need to reallocate a fragment.
666			 */
667			osize = fragroundup(fs, blkoff(fs, ip->i_size));
668			nsize = fragroundup(fs, size);
669			if (nsize <= osize) {
				/* The existing fragment is big enough. */
670				error = bread(vp, lbn, osize, NOCRED, &bp);
671				if (error) {
672					brelse(bp);
673					return (error);
674				}
675				bp->b_blkno = fsbtodb(fs, nb);
676			} else {
677				UFS_LOCK(ump);
678				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
679				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
680				       &dp->di_db[0]), osize, nsize, cred, &bp);
681				if (error)
682					return (error);
683				if (DOINGSOFTDEP(vp))
684					softdep_setup_allocdirect(ip, lbn,
685					    dbtofsb(fs, bp->b_blkno), nb,
686					    nsize, osize, bp);
687			}
688		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last block of the file, otherwise a full block.
			 */
689			if (ip->i_size < smalllblktosize(fs, lbn + 1))
690				nsize = fragroundup(fs, size);
691			else
692				nsize = fs->fs_bsize;
693			UFS_LOCK(ump);
694			error = ffs_alloc(ip, lbn,
695			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
696				&dp->di_db[0]), nsize, cred, &newb);
697			if (error)
698				return (error);
699			bp = getblk(vp, lbn, nsize, 0, 0, 0);
700			bp->b_blkno = fsbtodb(fs, newb);
701			if (flags & BA_CLRBUF)
702				vfs_bio_clrbuf(bp);
703			if (DOINGSOFTDEP(vp))
704				softdep_setup_allocdirect(ip, lbn, newb, 0,
705				    nsize, 0, bp);
706		}
707		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
708		ip->i_flag |= IN_CHANGE | IN_UPDATE;
709		*bpp = bp;
710		return (0);
711	}
712	/*
713	 * Determine the number of levels of indirection.
714	 */
715	pref = 0;
716	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
717		return(error);
718#ifdef INVARIANTS
719	if (num < 1)
720		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
721#endif
722	/*
723	 * Fetch the first indirect block allocating if necessary.
724	 */
725	--num;
726	nb = dp->di_ib[indirs[0].in_off];
727	allocib = NULL;
728	allocblk = allociblk;
729	lbns_remfree = lbns;
730	if (nb == 0) {
731		UFS_LOCK(ump);
732		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
733	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
734		    cred, &newb)) != 0)
735			return (error);
736		nb = newb;
		/* Record block and lbn for the unwind in the fail path. */
737		*allocblk++ = nb;
738		*lbns_remfree++ = indirs[1].in_lbn;
739		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
740		bp->b_blkno = fsbtodb(fs, nb);
741		vfs_bio_clrbuf(bp);
742		if (DOINGSOFTDEP(vp)) {
743			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
744			    newb, 0, fs->fs_bsize, 0, bp);
745			bdwrite(bp);
746		} else {
747			/*
748			 * Write synchronously so that indirect blocks
749			 * never point at garbage.
750			 */
751			if (DOINGASYNC(vp))
752				bdwrite(bp);
753			else if ((error = bwrite(bp)) != 0)
754				goto fail;
755		}
756		allocib = &dp->di_ib[indirs[0].in_off];
757		*allocib = nb;
758		ip->i_flag |= IN_CHANGE | IN_UPDATE;
759	}
760	/*
761	 * Fetch through the indirect blocks, allocating as necessary.
762	 */
763	for (i = 1;;) {
764		error = bread(vp,
765		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
766		if (error) {
767			brelse(bp);
768			goto fail;
769		}
770		bap = (ufs2_daddr_t *)bp->b_data;
771		nb = bap[indirs[i].in_off];
772		if (i == num)
773			break;
774		i += 1;
775		if (nb != 0) {
776			bqrelse(bp);
777			continue;
778		}
779		UFS_LOCK(ump);
		/* Compute the placement preference only once per call. */
780		if (pref == 0)
781			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
782		if ((error =
783		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
784			brelse(bp);
785			goto fail;
786		}
787		nb = newb;
788		*allocblk++ = nb;
789		*lbns_remfree++ = indirs[i].in_lbn;
790		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
791		nbp->b_blkno = fsbtodb(fs, nb);
792		vfs_bio_clrbuf(nbp);
793		if (DOINGSOFTDEP(vp)) {
794			softdep_setup_allocindir_meta(nbp, ip, bp,
795			    indirs[i - 1].in_off, nb);
796			bdwrite(nbp);
797		} else {
798			/*
799			 * Write synchronously so that indirect blocks
800			 * never point at garbage.
801			 */
802			if ((error = bwrite(nbp)) != 0) {
803				brelse(bp);
804				goto fail;
805			}
806		}
807		bap[indirs[i - 1].in_off] = nb;
		/*
		 * Remember the first indirect block modified so the fail
		 * path can clear the pointer stored above.
		 */
808		if (allocib == NULL && unwindidx < 0)
809			unwindidx = i - 1;
810		/*
811		 * If required, write synchronously, otherwise use
812		 * delayed write.
813		 */
814		if (flags & IO_SYNC) {
815			bwrite(bp);
816		} else {
817			if (bp->b_bufsize == fs->fs_bsize)
818				bp->b_flags |= B_CLUSTEROK;
819			bdwrite(bp);
820		}
821	}
822	/*
823	 * If asked only for the indirect block, then return it.
824	 */
825	if (flags & BA_METAONLY) {
826		*bpp = bp;
827		return (0);
828	}
829	/*
830	 * Get the data block, allocating if necessary.
831	 */
832	if (nb == 0) {
833		UFS_LOCK(ump);
834		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
835		error = ffs_alloc(ip,
836		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
837		if (error) {
838			brelse(bp);
839			goto fail;
840		}
841		nb = newb;
842		*allocblk++ = nb;
843		*lbns_remfree++ = lbn;
844		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
845		nbp->b_blkno = fsbtodb(fs, nb);
846		if (flags & BA_CLRBUF)
847			vfs_bio_clrbuf(nbp);
848		if (DOINGSOFTDEP(vp))
849			softdep_setup_allocindir_page(ip, lbn, bp,
850			    indirs[i].in_off, nb, 0, nbp);
851		bap[indirs[i].in_off] = nb;
852		/*
853		 * If required, write synchronously, otherwise use
854		 * delayed write.
855		 */
856		if (flags & IO_SYNC) {
857			bwrite(bp);
858		} else {
859			if (bp->b_bufsize == fs->fs_bsize)
860				bp->b_flags |= B_CLUSTEROK;
861			bdwrite(bp);
862		}
863		*bpp = nbp;
864		return (0);
865	}
866	brelse(bp);
867	/*
868	 * If requested clear invalid portions of the buffer.  If we
869	 * have to do a read-before-write (typical if BA_CLRBUF is set),
870	 * try to do some read-ahead in the sequential case to reduce
871	 * the number of I/O transactions.
872	 */
873	if (flags & BA_CLRBUF) {
874		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
875		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
876			error = cluster_read(vp, ip->i_size, lbn,
877			    (int)fs->fs_bsize, NOCRED,
878			    MAXBSIZE, seqcount, &nbp);
879		} else {
880			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
881		}
882		if (error) {
883			brelse(nbp);
884			goto fail;
885		}
886	} else {
887		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
888		nbp->b_blkno = fsbtodb(fs, nb);
889	}
890	*bpp = nbp;
891	return (0);
892fail:
893	/*
894	 * If we have failed to allocate any blocks, simply return the error.
895	 * This is the usual case and avoids the need to fsync the file.
896	 */
897	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
898		return (error);
899	/*
900	 * If we have failed part way through block allocation, we
901	 * have to deallocate any indirect blocks that we have allocated.
902	 * We have to fsync the file before we start to get rid of all
903	 * of its dependencies so that we do not leave them dangling.
904	 * We have to sync it at the end so that the soft updates code
905	 * does not find any untracked changes. Although this is really
906	 * slow, running out of disk space is not expected to be a common
907	 * occurrence. The error return from fsync is ignored as we already
908	 * have an error to return to the user.
909	 */
910	(void) ffs_syncvnode(vp, MNT_WAIT);
911	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
912	     blkp < allocblk; blkp++, lbns_remfree++) {
913		/*
914		 * We shall not leave the freed blocks on the vnode
915		 * buffer object lists.
916		 */
917		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
918		if (bp != NULL) {
919			bp->b_flags |= (B_INVAL | B_RELBUF);
920			bp->b_flags &= ~B_ASYNC;
921			brelse(bp);
922		}
923		deallocated += fs->fs_bsize;
924	}
925	if (allocib != NULL) {
		/* A new top-level indirect was installed; detach it. */
926		*allocib = 0;
927	} else if (unwindidx >= 0) {
928		int r;
929
930		r = bread(vp, indirs[unwindidx].in_lbn,
931		    (int)fs->fs_bsize, NOCRED, &bp);
932		if (r) {
			/* XXX the brelse() below is unreachable after panic(). */
933			panic("Could not unwind indirect block, error %d", r);
934			brelse(bp);
935		} else {
936			bap = (ufs2_daddr_t *)bp->b_data;
937			bap[indirs[unwindidx].in_off] = 0;
938			if (flags & IO_SYNC) {
939				bwrite(bp);
940			} else {
941				if (bp->b_bufsize == fs->fs_bsize)
942					bp->b_flags |= B_CLUSTEROK;
943				bdwrite(bp);
944			}
945		}
946	}
947	if (deallocated) {
948#ifdef QUOTA
949		/*
950		 * Restore user's disk quota because allocation failed.
951		 */
952		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
953#endif
954		dp->di_blocks -= btodb(deallocated);
955		ip->i_flag |= IN_CHANGE | IN_UPDATE;
956	}
957	(void) ffs_syncvnode(vp, MNT_WAIT);
958	/*
959	 * After the buffers are invalidated and on-disk pointers are
960	 * cleared, free the blocks.
961	 */
962	for (blkp = allociblk; blkp < allocblk; blkp++) {
963		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
964		    ip->i_number);
965	}
966	return (error);
967}
968