ffs_balloc.c revision 248282
1/*-
2 * Copyright (c) 2002 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * Copyright (c) 1982, 1986, 1989, 1993
33 *	The Regents of the University of California.  All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 *    notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 *    notice, this list of conditions and the following disclaimer in the
42 *    documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 *    may be used to endorse or promote products derived from this software
45 *    without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
60 */
61
62#include <sys/cdefs.h>
63__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_balloc.c 248282 2013-03-14 20:28:26Z kib $");
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/bio.h>
68#include <sys/buf.h>
69#include <sys/lock.h>
70#include <sys/mount.h>
71#include <sys/vnode.h>
72
73#include <ufs/ufs/quota.h>
74#include <ufs/ufs/inode.h>
75#include <ufs/ufs/ufs_extern.h>
76#include <ufs/ufs/extattr.h>
77#include <ufs/ufs/ufsmount.h>
78
79#include <ufs/ffs/fs.h>
80#include <ufs/ffs/ffs_extern.h>
81
82/*
83 * Balloc defines the structure of filesystem storage
84 * by allocating the physical blocks on a device given
85 * the inode and the logical block number in a file.
86 * This is the allocation strategy for UFS1. Below is
87 * the allocation strategy for UFS2.
88 */
89int
90ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91    struct ucred *cred, int flags, struct buf **bpp)
92{
93	struct inode *ip;
94	struct ufs1_dinode *dp;
95	ufs_lbn_t lbn, lastlbn;
96	struct fs *fs;
97	ufs1_daddr_t nb;
98	struct buf *bp, *nbp;
99	struct ufsmount *ump;
100	struct indir indirs[NIADDR + 2];
101	int deallocated, osize, nsize, num, i, error;
102	ufs2_daddr_t newb;
103	ufs1_daddr_t *bap, pref;
104	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106	int unwindidx = -1;
107	int saved_inbdflush;
108	static struct timeval lastfail;
109	static int curfail;
110	int reclaimed;
111
112	ip = VTOI(vp);
113	dp = ip->i_din1;
114	fs = ip->i_fs;
115	ump = ip->i_ump;
116	lbn = lblkno(fs, startoffset);
117	size = blkoff(fs, startoffset) + size;
118	reclaimed = 0;
119	if (size > fs->fs_bsize)
120		panic("ffs_balloc_ufs1: blk too big");
121	*bpp = NULL;
122	if (flags & IO_EXT)
123		return (EOPNOTSUPP);
124	if (lbn < 0)
125		return (EFBIG);
126
127	if (DOINGSOFTDEP(vp))
128		softdep_prealloc(vp, MNT_WAIT);
129	/*
130	 * If the next write will extend the file into a new block,
131	 * and the file is currently composed of a fragment
132	 * this fragment has to be extended to be a full block.
133	 */
134	lastlbn = lblkno(fs, ip->i_size);
135	if (lastlbn < NDADDR && lastlbn < lbn) {
136		nb = lastlbn;
137		osize = blksize(fs, ip, nb);
138		if (osize < fs->fs_bsize && osize > 0) {
139			UFS_LOCK(ump);
140			error = ffs_realloccg(ip, nb, dp->di_db[nb],
141			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
142			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
143			   cred, &bp);
144			if (error)
145				return (error);
146			if (DOINGSOFTDEP(vp))
147				softdep_setup_allocdirect(ip, nb,
148				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
149				    fs->fs_bsize, osize, bp);
150			ip->i_size = smalllblktosize(fs, nb + 1);
151			dp->di_size = ip->i_size;
152			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
153			ip->i_flag |= IN_CHANGE | IN_UPDATE;
154			if (flags & IO_SYNC)
155				bwrite(bp);
156			else
157				bawrite(bp);
158		}
159	}
160	/*
161	 * The first NDADDR blocks are direct blocks
162	 */
163	if (lbn < NDADDR) {
164		if (flags & BA_METAONLY)
165			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
166		nb = dp->di_db[lbn];
167		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
168			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
169			if (error) {
170				brelse(bp);
171				return (error);
172			}
173			bp->b_blkno = fsbtodb(fs, nb);
174			*bpp = bp;
175			return (0);
176		}
177		if (nb != 0) {
178			/*
179			 * Consider need to reallocate a fragment.
180			 */
181			osize = fragroundup(fs, blkoff(fs, ip->i_size));
182			nsize = fragroundup(fs, size);
183			if (nsize <= osize) {
184				error = bread(vp, lbn, osize, NOCRED, &bp);
185				if (error) {
186					brelse(bp);
187					return (error);
188				}
189				bp->b_blkno = fsbtodb(fs, nb);
190			} else {
191				UFS_LOCK(ump);
192				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
193				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
194				    &dp->di_db[0]), osize, nsize, flags,
195				    cred, &bp);
196				if (error)
197					return (error);
198				if (DOINGSOFTDEP(vp))
199					softdep_setup_allocdirect(ip, lbn,
200					    dbtofsb(fs, bp->b_blkno), nb,
201					    nsize, osize, bp);
202			}
203		} else {
204			if (ip->i_size < smalllblktosize(fs, lbn + 1))
205				nsize = fragroundup(fs, size);
206			else
207				nsize = fs->fs_bsize;
208			UFS_LOCK(ump);
209			error = ffs_alloc(ip, lbn,
210			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
211			    nsize, flags, cred, &newb);
212			if (error)
213				return (error);
214			bp = getblk(vp, lbn, nsize, 0, 0, 0);
215			bp->b_blkno = fsbtodb(fs, newb);
216			if (flags & BA_CLRBUF)
217				vfs_bio_clrbuf(bp);
218			if (DOINGSOFTDEP(vp))
219				softdep_setup_allocdirect(ip, lbn, newb, 0,
220				    nsize, 0, bp);
221		}
222		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
223		ip->i_flag |= IN_CHANGE | IN_UPDATE;
224		*bpp = bp;
225		return (0);
226	}
227	/*
228	 * Determine the number of levels of indirection.
229	 */
230	pref = 0;
231	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
232		return(error);
233#ifdef INVARIANTS
234	if (num < 1)
235		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
236#endif
237	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
238	/*
239	 * Fetch the first indirect block allocating if necessary.
240	 */
241	--num;
242	nb = dp->di_ib[indirs[0].in_off];
243	allocib = NULL;
244	allocblk = allociblk;
245	lbns_remfree = lbns;
246	if (nb == 0) {
247		UFS_LOCK(ump);
248		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
249	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
250		    flags, cred, &newb)) != 0) {
251			curthread_pflags_restore(saved_inbdflush);
252			return (error);
253		}
254		pref = newb + fs->fs_frag;
255		nb = newb;
256		*allocblk++ = nb;
257		*lbns_remfree++ = indirs[1].in_lbn;
258		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
259		bp->b_blkno = fsbtodb(fs, nb);
260		vfs_bio_clrbuf(bp);
261		if (DOINGSOFTDEP(vp)) {
262			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
263			    newb, 0, fs->fs_bsize, 0, bp);
264			bdwrite(bp);
265		} else {
266			/*
267			 * Write synchronously so that indirect blocks
268			 * never point at garbage.
269			 */
270			if (DOINGASYNC(vp))
271				bdwrite(bp);
272			else if ((error = bwrite(bp)) != 0)
273				goto fail;
274		}
275		allocib = &dp->di_ib[indirs[0].in_off];
276		*allocib = nb;
277		ip->i_flag |= IN_CHANGE | IN_UPDATE;
278	}
279	/*
280	 * Fetch through the indirect blocks, allocating as necessary.
281	 */
282retry:
283	for (i = 1;;) {
284		error = bread(vp,
285		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
286		if (error) {
287			brelse(bp);
288			goto fail;
289		}
290		bap = (ufs1_daddr_t *)bp->b_data;
291		nb = bap[indirs[i].in_off];
292		if (i == num)
293			break;
294		i += 1;
295		if (nb != 0) {
296			bqrelse(bp);
297			continue;
298		}
299		UFS_LOCK(ump);
300		if (pref == 0)
301			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
302		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
303		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
304			brelse(bp);
305			if (++reclaimed == 1) {
306				UFS_LOCK(ump);
307				softdep_request_cleanup(fs, vp, cred,
308				    FLUSH_BLOCKS_WAIT);
309				UFS_UNLOCK(ump);
310				goto retry;
311			}
312			if (ppsratecheck(&lastfail, &curfail, 1)) {
313				ffs_fserr(fs, ip->i_number, "filesystem full");
314				uprintf("\n%s: write failed, filesystem "
315				    "is full\n", fs->fs_fsmnt);
316			}
317			goto fail;
318		}
319		pref = newb + fs->fs_frag;
320		nb = newb;
321		*allocblk++ = nb;
322		*lbns_remfree++ = indirs[i].in_lbn;
323		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
324		nbp->b_blkno = fsbtodb(fs, nb);
325		vfs_bio_clrbuf(nbp);
326		if (DOINGSOFTDEP(vp)) {
327			softdep_setup_allocindir_meta(nbp, ip, bp,
328			    indirs[i - 1].in_off, nb);
329			bdwrite(nbp);
330		} else {
331			/*
332			 * Write synchronously so that indirect blocks
333			 * never point at garbage.
334			 */
335			if ((error = bwrite(nbp)) != 0) {
336				brelse(bp);
337				goto fail;
338			}
339		}
340		bap[indirs[i - 1].in_off] = nb;
341		if (allocib == NULL && unwindidx < 0)
342			unwindidx = i - 1;
343		/*
344		 * If required, write synchronously, otherwise use
345		 * delayed write.
346		 */
347		if (flags & IO_SYNC) {
348			bwrite(bp);
349		} else {
350			if (bp->b_bufsize == fs->fs_bsize)
351				bp->b_flags |= B_CLUSTEROK;
352			bdwrite(bp);
353		}
354	}
355	/*
356	 * If asked only for the indirect block, then return it.
357	 */
358	if (flags & BA_METAONLY) {
359		curthread_pflags_restore(saved_inbdflush);
360		*bpp = bp;
361		return (0);
362	}
363	/*
364	 * Get the data block, allocating if necessary.
365	 */
366	if (nb == 0) {
367		UFS_LOCK(ump);
368		if (pref == 0)
369			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
370			    &bap[0]);
371		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
372		    flags | IO_BUFLOCKED, cred, &newb);
373		if (error) {
374			brelse(bp);
375			if (++reclaimed == 1) {
376				UFS_LOCK(ump);
377				softdep_request_cleanup(fs, vp, cred,
378				    FLUSH_BLOCKS_WAIT);
379				UFS_UNLOCK(ump);
380				goto retry;
381			}
382			if (ppsratecheck(&lastfail, &curfail, 1)) {
383				ffs_fserr(fs, ip->i_number, "filesystem full");
384				uprintf("\n%s: write failed, filesystem "
385				    "is full\n", fs->fs_fsmnt);
386			}
387			goto fail;
388		}
389		nb = newb;
390		*allocblk++ = nb;
391		*lbns_remfree++ = lbn;
392		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
393		nbp->b_blkno = fsbtodb(fs, nb);
394		if (flags & BA_CLRBUF)
395			vfs_bio_clrbuf(nbp);
396		if (DOINGSOFTDEP(vp))
397			softdep_setup_allocindir_page(ip, lbn, bp,
398			    indirs[i].in_off, nb, 0, nbp);
399		bap[indirs[i].in_off] = nb;
400		/*
401		 * If required, write synchronously, otherwise use
402		 * delayed write.
403		 */
404		if (flags & IO_SYNC) {
405			bwrite(bp);
406		} else {
407			if (bp->b_bufsize == fs->fs_bsize)
408				bp->b_flags |= B_CLUSTEROK;
409			bdwrite(bp);
410		}
411		curthread_pflags_restore(saved_inbdflush);
412		*bpp = nbp;
413		return (0);
414	}
415	brelse(bp);
416	if (flags & BA_CLRBUF) {
417		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
418		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
419			error = cluster_read(vp, ip->i_size, lbn,
420			    (int)fs->fs_bsize, NOCRED,
421			    MAXBSIZE, seqcount, 0, &nbp);
422		} else {
423			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
424		}
425		if (error) {
426			brelse(nbp);
427			goto fail;
428		}
429	} else {
430		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
431		nbp->b_blkno = fsbtodb(fs, nb);
432	}
433	curthread_pflags_restore(saved_inbdflush);
434	*bpp = nbp;
435	return (0);
436fail:
437	curthread_pflags_restore(saved_inbdflush);
438	/*
439	 * If we have failed to allocate any blocks, simply return the error.
440	 * This is the usual case and avoids the need to fsync the file.
441	 */
442	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
443		return (error);
444	/*
445	 * If we have failed part way through block allocation, we
446	 * have to deallocate any indirect blocks that we have allocated.
447	 * We have to fsync the file before we start to get rid of all
448	 * of its dependencies so that we do not leave them dangling.
449	 * We have to sync it at the end so that the soft updates code
450	 * does not find any untracked changes. Although this is really
451	 * slow, running out of disk space is not expected to be a common
452	 * occurence. The error return from fsync is ignored as we already
453	 * have an error to return to the user.
454	 *
455	 * XXX Still have to journal the free below
456	 */
457	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
458	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
459	     blkp < allocblk; blkp++, lbns_remfree++) {
460		/*
461		 * We shall not leave the freed blocks on the vnode
462		 * buffer object lists.
463		 */
464		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
465		if (bp != NULL) {
466			bp->b_flags |= (B_INVAL | B_RELBUF);
467			bp->b_flags &= ~B_ASYNC;
468			brelse(bp);
469		}
470		deallocated += fs->fs_bsize;
471	}
472	if (allocib != NULL) {
473		*allocib = 0;
474	} else if (unwindidx >= 0) {
475		int r;
476
477		r = bread(vp, indirs[unwindidx].in_lbn,
478		    (int)fs->fs_bsize, NOCRED, &bp);
479		if (r) {
480			panic("Could not unwind indirect block, error %d", r);
481			brelse(bp);
482		} else {
483			bap = (ufs1_daddr_t *)bp->b_data;
484			bap[indirs[unwindidx].in_off] = 0;
485			if (flags & IO_SYNC) {
486				bwrite(bp);
487			} else {
488				if (bp->b_bufsize == fs->fs_bsize)
489					bp->b_flags |= B_CLUSTEROK;
490				bdwrite(bp);
491			}
492		}
493	}
494	if (deallocated) {
495#ifdef QUOTA
496		/*
497		 * Restore user's disk quota because allocation failed.
498		 */
499		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
500#endif
501		dp->di_blocks -= btodb(deallocated);
502		ip->i_flag |= IN_CHANGE | IN_UPDATE;
503	}
504	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
505	/*
506	 * After the buffers are invalidated and on-disk pointers are
507	 * cleared, free the blocks.
508	 */
509	for (blkp = allociblk; blkp < allocblk; blkp++) {
510		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
511		    ip->i_number, vp->v_type, NULL);
512	}
513	return (error);
514}
515
516/*
517 * Balloc defines the structure of file system storage
518 * by allocating the physical blocks on a device given
519 * the inode and the logical block number in a file.
520 * This is the allocation strategy for UFS2. Above is
521 * the allocation strategy for UFS1.
522 */
523int
524ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
525    struct ucred *cred, int flags, struct buf **bpp)
526{
527	struct inode *ip;
528	struct ufs2_dinode *dp;
529	ufs_lbn_t lbn, lastlbn;
530	struct fs *fs;
531	struct buf *bp, *nbp;
532	struct ufsmount *ump;
533	struct indir indirs[NIADDR + 2];
534	ufs2_daddr_t nb, newb, *bap, pref;
535	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
536	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
537	int deallocated, osize, nsize, num, i, error;
538	int unwindidx = -1;
539	int saved_inbdflush;
540	static struct timeval lastfail;
541	static int curfail;
542	int reclaimed;
543
544	ip = VTOI(vp);
545	dp = ip->i_din2;
546	fs = ip->i_fs;
547	ump = ip->i_ump;
548	lbn = lblkno(fs, startoffset);
549	size = blkoff(fs, startoffset) + size;
550	reclaimed = 0;
551	if (size > fs->fs_bsize)
552		panic("ffs_balloc_ufs2: blk too big");
553	*bpp = NULL;
554	if (lbn < 0)
555		return (EFBIG);
556
557	if (DOINGSOFTDEP(vp))
558		softdep_prealloc(vp, MNT_WAIT);
559
560	/*
561	 * Check for allocating external data.
562	 */
563	if (flags & IO_EXT) {
564		if (lbn >= NXADDR)
565			return (EFBIG);
566		/*
567		 * If the next write will extend the data into a new block,
568		 * and the data is currently composed of a fragment
569		 * this fragment has to be extended to be a full block.
570		 */
571		lastlbn = lblkno(fs, dp->di_extsize);
572		if (lastlbn < lbn) {
573			nb = lastlbn;
574			osize = sblksize(fs, dp->di_extsize, nb);
575			if (osize < fs->fs_bsize && osize > 0) {
576				UFS_LOCK(ump);
577				error = ffs_realloccg(ip, -1 - nb,
578				    dp->di_extb[nb],
579				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
580				    &dp->di_extb[0]), osize,
581				    (int)fs->fs_bsize, flags, cred, &bp);
582				if (error)
583					return (error);
584				if (DOINGSOFTDEP(vp))
585					softdep_setup_allocext(ip, nb,
586					    dbtofsb(fs, bp->b_blkno),
587					    dp->di_extb[nb],
588					    fs->fs_bsize, osize, bp);
589				dp->di_extsize = smalllblktosize(fs, nb + 1);
590				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
591				bp->b_xflags |= BX_ALTDATA;
592				ip->i_flag |= IN_CHANGE;
593				if (flags & IO_SYNC)
594					bwrite(bp);
595				else
596					bawrite(bp);
597			}
598		}
599		/*
600		 * All blocks are direct blocks
601		 */
602		if (flags & BA_METAONLY)
603			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
604		nb = dp->di_extb[lbn];
605		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
606			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
607			if (error) {
608				brelse(bp);
609				return (error);
610			}
611			bp->b_blkno = fsbtodb(fs, nb);
612			bp->b_xflags |= BX_ALTDATA;
613			*bpp = bp;
614			return (0);
615		}
616		if (nb != 0) {
617			/*
618			 * Consider need to reallocate a fragment.
619			 */
620			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
621			nsize = fragroundup(fs, size);
622			if (nsize <= osize) {
623				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
624				if (error) {
625					brelse(bp);
626					return (error);
627				}
628				bp->b_blkno = fsbtodb(fs, nb);
629				bp->b_xflags |= BX_ALTDATA;
630			} else {
631				UFS_LOCK(ump);
632				error = ffs_realloccg(ip, -1 - lbn,
633				    dp->di_extb[lbn],
634				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
635				    &dp->di_extb[0]), osize, nsize, flags,
636				    cred, &bp);
637				if (error)
638					return (error);
639				bp->b_xflags |= BX_ALTDATA;
640				if (DOINGSOFTDEP(vp))
641					softdep_setup_allocext(ip, lbn,
642					    dbtofsb(fs, bp->b_blkno), nb,
643					    nsize, osize, bp);
644			}
645		} else {
646			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
647				nsize = fragroundup(fs, size);
648			else
649				nsize = fs->fs_bsize;
650			UFS_LOCK(ump);
651			error = ffs_alloc(ip, lbn,
652			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
653			   nsize, flags, cred, &newb);
654			if (error)
655				return (error);
656			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
657			bp->b_blkno = fsbtodb(fs, newb);
658			bp->b_xflags |= BX_ALTDATA;
659			if (flags & BA_CLRBUF)
660				vfs_bio_clrbuf(bp);
661			if (DOINGSOFTDEP(vp))
662				softdep_setup_allocext(ip, lbn, newb, 0,
663				    nsize, 0, bp);
664		}
665		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
666		ip->i_flag |= IN_CHANGE;
667		*bpp = bp;
668		return (0);
669	}
670	/*
671	 * If the next write will extend the file into a new block,
672	 * and the file is currently composed of a fragment
673	 * this fragment has to be extended to be a full block.
674	 */
675	lastlbn = lblkno(fs, ip->i_size);
676	if (lastlbn < NDADDR && lastlbn < lbn) {
677		nb = lastlbn;
678		osize = blksize(fs, ip, nb);
679		if (osize < fs->fs_bsize && osize > 0) {
680			UFS_LOCK(ump);
681			error = ffs_realloccg(ip, nb, dp->di_db[nb],
682				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
683				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
684				    flags, cred, &bp);
685			if (error)
686				return (error);
687			if (DOINGSOFTDEP(vp))
688				softdep_setup_allocdirect(ip, nb,
689				    dbtofsb(fs, bp->b_blkno),
690				    dp->di_db[nb],
691				    fs->fs_bsize, osize, bp);
692			ip->i_size = smalllblktosize(fs, nb + 1);
693			dp->di_size = ip->i_size;
694			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
695			ip->i_flag |= IN_CHANGE | IN_UPDATE;
696			if (flags & IO_SYNC)
697				bwrite(bp);
698			else
699				bawrite(bp);
700		}
701	}
702	/*
703	 * The first NDADDR blocks are direct blocks
704	 */
705	if (lbn < NDADDR) {
706		if (flags & BA_METAONLY)
707			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
708		nb = dp->di_db[lbn];
709		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
710			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
711			if (error) {
712				brelse(bp);
713				return (error);
714			}
715			bp->b_blkno = fsbtodb(fs, nb);
716			*bpp = bp;
717			return (0);
718		}
719		if (nb != 0) {
720			/*
721			 * Consider need to reallocate a fragment.
722			 */
723			osize = fragroundup(fs, blkoff(fs, ip->i_size));
724			nsize = fragroundup(fs, size);
725			if (nsize <= osize) {
726				error = bread(vp, lbn, osize, NOCRED, &bp);
727				if (error) {
728					brelse(bp);
729					return (error);
730				}
731				bp->b_blkno = fsbtodb(fs, nb);
732			} else {
733				UFS_LOCK(ump);
734				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
735				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
736				       &dp->di_db[0]), osize, nsize, flags,
737				    cred, &bp);
738				if (error)
739					return (error);
740				if (DOINGSOFTDEP(vp))
741					softdep_setup_allocdirect(ip, lbn,
742					    dbtofsb(fs, bp->b_blkno), nb,
743					    nsize, osize, bp);
744			}
745		} else {
746			if (ip->i_size < smalllblktosize(fs, lbn + 1))
747				nsize = fragroundup(fs, size);
748			else
749				nsize = fs->fs_bsize;
750			UFS_LOCK(ump);
751			error = ffs_alloc(ip, lbn,
752			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
753				&dp->di_db[0]), nsize, flags, cred, &newb);
754			if (error)
755				return (error);
756			bp = getblk(vp, lbn, nsize, 0, 0, 0);
757			bp->b_blkno = fsbtodb(fs, newb);
758			if (flags & BA_CLRBUF)
759				vfs_bio_clrbuf(bp);
760			if (DOINGSOFTDEP(vp))
761				softdep_setup_allocdirect(ip, lbn, newb, 0,
762				    nsize, 0, bp);
763		}
764		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
765		ip->i_flag |= IN_CHANGE | IN_UPDATE;
766		*bpp = bp;
767		return (0);
768	}
769	/*
770	 * Determine the number of levels of indirection.
771	 */
772	pref = 0;
773	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
774		return(error);
775#ifdef INVARIANTS
776	if (num < 1)
777		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
778#endif
779	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
780	/*
781	 * Fetch the first indirect block allocating if necessary.
782	 */
783	--num;
784	nb = dp->di_ib[indirs[0].in_off];
785	allocib = NULL;
786	allocblk = allociblk;
787	lbns_remfree = lbns;
788	if (nb == 0) {
789		UFS_LOCK(ump);
790		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
791	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
792		    flags, cred, &newb)) != 0) {
793			curthread_pflags_restore(saved_inbdflush);
794			return (error);
795		}
796		pref = newb + fs->fs_frag;
797		nb = newb;
798		*allocblk++ = nb;
799		*lbns_remfree++ = indirs[1].in_lbn;
800		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
801		bp->b_blkno = fsbtodb(fs, nb);
802		vfs_bio_clrbuf(bp);
803		if (DOINGSOFTDEP(vp)) {
804			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
805			    newb, 0, fs->fs_bsize, 0, bp);
806			bdwrite(bp);
807		} else {
808			/*
809			 * Write synchronously so that indirect blocks
810			 * never point at garbage.
811			 */
812			if (DOINGASYNC(vp))
813				bdwrite(bp);
814			else if ((error = bwrite(bp)) != 0)
815				goto fail;
816		}
817		allocib = &dp->di_ib[indirs[0].in_off];
818		*allocib = nb;
819		ip->i_flag |= IN_CHANGE | IN_UPDATE;
820	}
821	/*
822	 * Fetch through the indirect blocks, allocating as necessary.
823	 */
824retry:
825	for (i = 1;;) {
826		error = bread(vp,
827		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
828		if (error) {
829			brelse(bp);
830			goto fail;
831		}
832		bap = (ufs2_daddr_t *)bp->b_data;
833		nb = bap[indirs[i].in_off];
834		if (i == num)
835			break;
836		i += 1;
837		if (nb != 0) {
838			bqrelse(bp);
839			continue;
840		}
841		UFS_LOCK(ump);
842		if (pref == 0)
843			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
844		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
845		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
846			brelse(bp);
847			if (++reclaimed == 1) {
848				UFS_LOCK(ump);
849				softdep_request_cleanup(fs, vp, cred,
850				    FLUSH_BLOCKS_WAIT);
851				UFS_UNLOCK(ump);
852				goto retry;
853			}
854			if (ppsratecheck(&lastfail, &curfail, 1)) {
855				ffs_fserr(fs, ip->i_number, "filesystem full");
856				uprintf("\n%s: write failed, filesystem "
857				    "is full\n", fs->fs_fsmnt);
858			}
859			goto fail;
860		}
861		pref = newb + fs->fs_frag;
862		nb = newb;
863		*allocblk++ = nb;
864		*lbns_remfree++ = indirs[i].in_lbn;
865		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
866		nbp->b_blkno = fsbtodb(fs, nb);
867		vfs_bio_clrbuf(nbp);
868		if (DOINGSOFTDEP(vp)) {
869			softdep_setup_allocindir_meta(nbp, ip, bp,
870			    indirs[i - 1].in_off, nb);
871			bdwrite(nbp);
872		} else {
873			/*
874			 * Write synchronously so that indirect blocks
875			 * never point at garbage.
876			 */
877			if ((error = bwrite(nbp)) != 0) {
878				brelse(bp);
879				goto fail;
880			}
881		}
882		bap[indirs[i - 1].in_off] = nb;
883		if (allocib == NULL && unwindidx < 0)
884			unwindidx = i - 1;
885		/*
886		 * If required, write synchronously, otherwise use
887		 * delayed write.
888		 */
889		if (flags & IO_SYNC) {
890			bwrite(bp);
891		} else {
892			if (bp->b_bufsize == fs->fs_bsize)
893				bp->b_flags |= B_CLUSTEROK;
894			bdwrite(bp);
895		}
896	}
897	/*
898	 * If asked only for the indirect block, then return it.
899	 */
900	if (flags & BA_METAONLY) {
901		curthread_pflags_restore(saved_inbdflush);
902		*bpp = bp;
903		return (0);
904	}
905	/*
906	 * Get the data block, allocating if necessary.
907	 */
908	if (nb == 0) {
909		UFS_LOCK(ump);
910		if (pref == 0)
911			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
912			    &bap[0]);
913		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
914		    flags | IO_BUFLOCKED, cred, &newb);
915		if (error) {
916			brelse(bp);
917			if (++reclaimed == 1) {
918				UFS_LOCK(ump);
919				softdep_request_cleanup(fs, vp, cred,
920				    FLUSH_BLOCKS_WAIT);
921				UFS_UNLOCK(ump);
922				goto retry;
923			}
924			if (ppsratecheck(&lastfail, &curfail, 1)) {
925				ffs_fserr(fs, ip->i_number, "filesystem full");
926				uprintf("\n%s: write failed, filesystem "
927				    "is full\n", fs->fs_fsmnt);
928			}
929			goto fail;
930		}
931		nb = newb;
932		*allocblk++ = nb;
933		*lbns_remfree++ = lbn;
934		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
935		nbp->b_blkno = fsbtodb(fs, nb);
936		if (flags & BA_CLRBUF)
937			vfs_bio_clrbuf(nbp);
938		if (DOINGSOFTDEP(vp))
939			softdep_setup_allocindir_page(ip, lbn, bp,
940			    indirs[i].in_off, nb, 0, nbp);
941		bap[indirs[i].in_off] = nb;
942		/*
943		 * If required, write synchronously, otherwise use
944		 * delayed write.
945		 */
946		if (flags & IO_SYNC) {
947			bwrite(bp);
948		} else {
949			if (bp->b_bufsize == fs->fs_bsize)
950				bp->b_flags |= B_CLUSTEROK;
951			bdwrite(bp);
952		}
953		curthread_pflags_restore(saved_inbdflush);
954		*bpp = nbp;
955		return (0);
956	}
957	brelse(bp);
958	/*
959	 * If requested clear invalid portions of the buffer.  If we
960	 * have to do a read-before-write (typical if BA_CLRBUF is set),
961	 * try to do some read-ahead in the sequential case to reduce
962	 * the number of I/O transactions.
963	 */
964	if (flags & BA_CLRBUF) {
965		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
966		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
967			error = cluster_read(vp, ip->i_size, lbn,
968			    (int)fs->fs_bsize, NOCRED,
969			    MAXBSIZE, seqcount, 0, &nbp);
970		} else {
971			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
972		}
973		if (error) {
974			brelse(nbp);
975			goto fail;
976		}
977	} else {
978		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
979		nbp->b_blkno = fsbtodb(fs, nb);
980	}
981	curthread_pflags_restore(saved_inbdflush);
982	*bpp = nbp;
983	return (0);
984fail:
985	curthread_pflags_restore(saved_inbdflush);
986	/*
987	 * If we have failed to allocate any blocks, simply return the error.
988	 * This is the usual case and avoids the need to fsync the file.
989	 */
990	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
991		return (error);
992	/*
993	 * If we have failed part way through block allocation, we
994	 * have to deallocate any indirect blocks that we have allocated.
995	 * We have to fsync the file before we start to get rid of all
996	 * of its dependencies so that we do not leave them dangling.
997	 * We have to sync it at the end so that the soft updates code
998	 * does not find any untracked changes. Although this is really
999	 * slow, running out of disk space is not expected to be a common
1000	 * occurence. The error return from fsync is ignored as we already
1001	 * have an error to return to the user.
1002	 *
1003	 * XXX Still have to journal the free below
1004	 */
1005	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1006	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
1007	     blkp < allocblk; blkp++, lbns_remfree++) {
1008		/*
1009		 * We shall not leave the freed blocks on the vnode
1010		 * buffer object lists.
1011		 */
1012		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
1013		if (bp != NULL) {
1014			bp->b_flags |= (B_INVAL | B_RELBUF);
1015			bp->b_flags &= ~B_ASYNC;
1016			brelse(bp);
1017		}
1018		deallocated += fs->fs_bsize;
1019	}
1020	if (allocib != NULL) {
1021		*allocib = 0;
1022	} else if (unwindidx >= 0) {
1023		int r;
1024
1025		r = bread(vp, indirs[unwindidx].in_lbn,
1026		    (int)fs->fs_bsize, NOCRED, &bp);
1027		if (r) {
1028			panic("Could not unwind indirect block, error %d", r);
1029			brelse(bp);
1030		} else {
1031			bap = (ufs2_daddr_t *)bp->b_data;
1032			bap[indirs[unwindidx].in_off] = 0;
1033			if (flags & IO_SYNC) {
1034				bwrite(bp);
1035			} else {
1036				if (bp->b_bufsize == fs->fs_bsize)
1037					bp->b_flags |= B_CLUSTEROK;
1038				bdwrite(bp);
1039			}
1040		}
1041	}
1042	if (deallocated) {
1043#ifdef QUOTA
1044		/*
1045		 * Restore user's disk quota because allocation failed.
1046		 */
1047		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
1048#endif
1049		dp->di_blocks -= btodb(deallocated);
1050		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1051	}
1052	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1053	/*
1054	 * After the buffers are invalidated and on-disk pointers are
1055	 * cleared, free the blocks.
1056	 */
1057	for (blkp = allociblk; blkp < allocblk; blkp++) {
1058		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
1059		    ip->i_number, vp->v_type, NULL);
1060	}
1061	return (error);
1062}
1063