/* ffs_balloc.c — FreeBSD revision 250576 */
/*-
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_balloc.c 250576 2013-05-12 16:43:26Z eadler $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 */
/*
 * ffs_balloc_ufs1:
 *	vp	- vnode of the file being written
 *	startoffset - byte offset at which the coming write starts
 *	size	- number of bytes to be written (at most one fs block)
 *	cred	- credentials charged for any new allocation
 *	flags	- BA_* / IO_* control flags (BA_CLRBUF, BA_METAONLY,
 *		  BA_UNMAPPED, IO_SYNC, ...)
 *	bpp	- out: locked buffer covering the (possibly new) block
 *
 * Returns 0 with *bpp valid on success; on failure returns an errno
 * with *bpp == NULL and any partially-made allocations unwound.
 * UFS1 has no external attribute area, so IO_EXT yields EOPNOTSUPP.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ip->i_fs;
	ump = ip->i_ump;
	lbn = lblkno(fs, startoffset);
	/* Extend the request to cover from the start of the block. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	/* Map BA_UNMAPPED to GB_UNMAPPED for the buffer cache calls below. */
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * UFS_LOCK is handed to the allocator; there is no
			 * matching UFS_UNLOCK here because ffs_realloccg()
			 * (and ffs_alloc() below) release it on all paths.
			 */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and fully covered by the file:
		 * just read it in and return it.
		 */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment; lock released by callee. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: a fragment suffices at end of file. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	/*
	 * allociblk[]/lbns[] record every block allocated from here on so
	 * the fail: path can invalidate and free them if we error out.
	 */
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		/* Prefer the block following the one just allocated. */
		pref = newb + fs->fs_frag;
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 * indirs[] was filled in by ufs_getlbns(); we may restart here
	 * once after asking soft updates to reclaim space (see below).
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * First failure: ask soft updates to flush
			 * reclaimable blocks, then retry the whole walk
			 * once before reporting ENOSPC.
			 */
			if (++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		/* Remember the first parent entry to clear when unwinding. */
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			/* Same one-shot reclaim-and-retry as above. */
			if (++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * Data block already exists: read it, clustering in the
	 * sequential case when BA_CLRBUF forces a read-before-write.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
		if (bp != NULL) {
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): panic() does not return, so the
			 * brelse() below is dead code kept for symmetry.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 */
/*
 * ffs_balloc_ufs2:
 *	vp	- vnode of the file being written
 *	startoffset - byte offset at which the coming write starts
 *	size	- number of bytes to be written (at most one fs block)
 *	cred	- credentials charged for any new allocation
 *	flags	- BA_* / IO_* control flags; IO_EXT selects the inode's
 *		  external attribute area (di_extb[]) instead of file data
 *	bpp	- out: locked buffer covering the (possibly new) block
 *
 * Returns 0 with *bpp valid on success; on failure returns an errno
 * with *bpp == NULL and any partially-made allocations unwound.
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ip->i_fs;
	ump = ip->i_ump;
	lbn = lblkno(fs, startoffset);
	/* Extend the request to cover from the start of the block. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);
	/* Map BA_UNMAPPED to GB_UNMAPPED for the buffer cache calls below. */
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				/*
				 * UFS_LOCK is handed to the allocator; the
				 * callee releases it (no UFS_UNLOCK here).
				 * Ext-area lbns are negative: -1 - lbn.
				 */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, flags, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		/* Block already allocated and fully covered: just read it. */
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				/* Grow the fragment; lock released by callee. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: a fragment suffices at end of data. */
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
			    flags, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and fully covered: just read it. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread_gb(vp, lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment; lock released by callee. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: a fragment suffices at end of file. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				&dp->di_db[0]), nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	/*
	 * allociblk[]/lbns[] record every block allocated from here on so
	 * the fail: path can invalidate and free them if we error out.
	 */
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
		    (ufs2_daddr_t *)0);
	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		/* Prefer the block following the one just allocated. */
		pref = newb + fs->fs_frag;
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 * indirs[] was filled in by ufs_getlbns(); we may restart here
	 * once after asking soft updates to reclaim space (see below).
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
			    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * First failure: ask soft updates to flush
			 * reclaimable blocks, then retry the whole walk
			 * once before reporting ENOSPC.
			 */
			if (++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		/* Remember the first parent entry to clear when unwinding. */
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			/* Same one-shot reclaim-and-retry as above. */
			if (++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
			    NOCRED, gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
		if (bp != NULL) {
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): panic() does not return, so the
			 * brelse() below is dead code kept for symmetry.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}
1077