/* ffs_balloc.c revision 304670 */
/*-
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */
61
62#include <sys/cdefs.h>
63__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_balloc.c 304670 2016-08-23 07:54:14Z kib $");
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/bio.h>
68#include <sys/buf.h>
69#include <sys/lock.h>
70#include <sys/mount.h>
71#include <sys/vnode.h>
72
73#include <ufs/ufs/quota.h>
74#include <ufs/ufs/inode.h>
75#include <ufs/ufs/ufs_extern.h>
76#include <ufs/ufs/extattr.h>
77#include <ufs/ufs/ufsmount.h>
78
79#include <ufs/ffs/fs.h>
80#include <ufs/ffs/ffs_extern.h>
81
82/*
83 * Balloc defines the structure of filesystem storage
84 * by allocating the physical blocks on a device given
85 * the inode and the logical block number in a file.
86 * This is the allocation strategy for UFS1. Below is
87 * the allocation strategy for UFS2.
88 */
89int
90ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91    struct ucred *cred, int flags, struct buf **bpp)
92{
93	struct inode *ip;
94	struct ufs1_dinode *dp;
95	ufs_lbn_t lbn, lastlbn;
96	struct fs *fs;
97	ufs1_daddr_t nb;
98	struct buf *bp, *nbp;
99	struct ufsmount *ump;
100	struct indir indirs[NIADDR + 2];
101	int deallocated, osize, nsize, num, i, error;
102	ufs2_daddr_t newb;
103	ufs1_daddr_t *bap, pref;
104	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106	int unwindidx = -1;
107	int saved_inbdflush;
108	static struct timeval lastfail;
109	static int curfail;
110	int gbflags, reclaimed;
111
112	ip = VTOI(vp);
113	dp = ip->i_din1;
114	fs = ip->i_fs;
115	ump = ip->i_ump;
116	lbn = lblkno(fs, startoffset);
117	size = blkoff(fs, startoffset) + size;
118	reclaimed = 0;
119	if (size > fs->fs_bsize)
120		panic("ffs_balloc_ufs1: blk too big");
121	*bpp = NULL;
122	if (flags & IO_EXT)
123		return (EOPNOTSUPP);
124	if (lbn < 0)
125		return (EFBIG);
126	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
127
128	if (DOINGSOFTDEP(vp))
129		softdep_prealloc(vp, MNT_WAIT);
130	/*
131	 * If the next write will extend the file into a new block,
132	 * and the file is currently composed of a fragment
133	 * this fragment has to be extended to be a full block.
134	 */
135	lastlbn = lblkno(fs, ip->i_size);
136	if (lastlbn < NDADDR && lastlbn < lbn) {
137		nb = lastlbn;
138		osize = blksize(fs, ip, nb);
139		if (osize < fs->fs_bsize && osize > 0) {
140			UFS_LOCK(ump);
141			error = ffs_realloccg(ip, nb, dp->di_db[nb],
142			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
143			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
144			   cred, &bp);
145			if (error)
146				return (error);
147			if (DOINGSOFTDEP(vp))
148				softdep_setup_allocdirect(ip, nb,
149				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
150				    fs->fs_bsize, osize, bp);
151			ip->i_size = smalllblktosize(fs, nb + 1);
152			dp->di_size = ip->i_size;
153			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
154			ip->i_flag |= IN_CHANGE | IN_UPDATE;
155			if (flags & IO_SYNC)
156				bwrite(bp);
157			else
158				bawrite(bp);
159		}
160	}
161	/*
162	 * The first NDADDR blocks are direct blocks
163	 */
164	if (lbn < NDADDR) {
165		if (flags & BA_METAONLY)
166			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
167		nb = dp->di_db[lbn];
168		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
169			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
170			if (error) {
171				brelse(bp);
172				return (error);
173			}
174			bp->b_blkno = fsbtodb(fs, nb);
175			*bpp = bp;
176			return (0);
177		}
178		if (nb != 0) {
179			/*
180			 * Consider need to reallocate a fragment.
181			 */
182			osize = fragroundup(fs, blkoff(fs, ip->i_size));
183			nsize = fragroundup(fs, size);
184			if (nsize <= osize) {
185				error = bread(vp, lbn, osize, NOCRED, &bp);
186				if (error) {
187					brelse(bp);
188					return (error);
189				}
190				bp->b_blkno = fsbtodb(fs, nb);
191			} else {
192				UFS_LOCK(ump);
193				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
194				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
195				    &dp->di_db[0]), osize, nsize, flags,
196				    cred, &bp);
197				if (error)
198					return (error);
199				if (DOINGSOFTDEP(vp))
200					softdep_setup_allocdirect(ip, lbn,
201					    dbtofsb(fs, bp->b_blkno), nb,
202					    nsize, osize, bp);
203			}
204		} else {
205			if (ip->i_size < smalllblktosize(fs, lbn + 1))
206				nsize = fragroundup(fs, size);
207			else
208				nsize = fs->fs_bsize;
209			UFS_LOCK(ump);
210			error = ffs_alloc(ip, lbn,
211			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
212			    nsize, flags, cred, &newb);
213			if (error)
214				return (error);
215			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
216			bp->b_blkno = fsbtodb(fs, newb);
217			if (flags & BA_CLRBUF)
218				vfs_bio_clrbuf(bp);
219			if (DOINGSOFTDEP(vp))
220				softdep_setup_allocdirect(ip, lbn, newb, 0,
221				    nsize, 0, bp);
222		}
223		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
224		ip->i_flag |= IN_CHANGE | IN_UPDATE;
225		*bpp = bp;
226		return (0);
227	}
228	/*
229	 * Determine the number of levels of indirection.
230	 */
231	pref = 0;
232	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
233		return(error);
234#ifdef INVARIANTS
235	if (num < 1)
236		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
237#endif
238	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
239	/*
240	 * Fetch the first indirect block allocating if necessary.
241	 */
242	--num;
243	nb = dp->di_ib[indirs[0].in_off];
244	allocib = NULL;
245	allocblk = allociblk;
246	lbns_remfree = lbns;
247	if (nb == 0) {
248		UFS_LOCK(ump);
249		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
250		    (ufs1_daddr_t *)0);
251		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
252		    flags, cred, &newb)) != 0) {
253			curthread_pflags_restore(saved_inbdflush);
254			return (error);
255		}
256		pref = newb + fs->fs_frag;
257		nb = newb;
258		MPASS(allocblk < allociblk + nitems(allociblk));
259		MPASS(lbns_remfree < lbns + nitems(lbns));
260		*allocblk++ = nb;
261		*lbns_remfree++ = indirs[1].in_lbn;
262		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
263		bp->b_blkno = fsbtodb(fs, nb);
264		vfs_bio_clrbuf(bp);
265		if (DOINGSOFTDEP(vp)) {
266			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
267			    newb, 0, fs->fs_bsize, 0, bp);
268			bdwrite(bp);
269		} else {
270			/*
271			 * Write synchronously so that indirect blocks
272			 * never point at garbage.
273			 */
274			if (DOINGASYNC(vp))
275				bdwrite(bp);
276			else if ((error = bwrite(bp)) != 0)
277				goto fail;
278		}
279		allocib = &dp->di_ib[indirs[0].in_off];
280		*allocib = nb;
281		ip->i_flag |= IN_CHANGE | IN_UPDATE;
282	}
283	/*
284	 * Fetch through the indirect blocks, allocating as necessary.
285	 */
286retry:
287	for (i = 1;;) {
288		error = bread(vp,
289		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
290		if (error) {
291			brelse(bp);
292			goto fail;
293		}
294		bap = (ufs1_daddr_t *)bp->b_data;
295		nb = bap[indirs[i].in_off];
296		if (i == num)
297			break;
298		i += 1;
299		if (nb != 0) {
300			bqrelse(bp);
301			continue;
302		}
303		UFS_LOCK(ump);
304		/*
305		 * If parent indirect has just been allocated, try to cluster
306		 * immediately following it.
307		 */
308		if (pref == 0)
309			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
310			    (ufs1_daddr_t *)0);
311		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
312		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
313			brelse(bp);
314			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
315				UFS_LOCK(ump);
316				softdep_request_cleanup(fs, vp, cred,
317				    FLUSH_BLOCKS_WAIT);
318				UFS_UNLOCK(ump);
319				goto retry;
320			}
321			if (ppsratecheck(&lastfail, &curfail, 1)) {
322				ffs_fserr(fs, ip->i_number, "filesystem full");
323				uprintf("\n%s: write failed, filesystem "
324				    "is full\n", fs->fs_fsmnt);
325			}
326			goto fail;
327		}
328		pref = newb + fs->fs_frag;
329		nb = newb;
330		MPASS(allocblk < allociblk + nitems(allociblk));
331		MPASS(lbns_remfree < lbns + nitems(lbns));
332		*allocblk++ = nb;
333		*lbns_remfree++ = indirs[i].in_lbn;
334		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
335		nbp->b_blkno = fsbtodb(fs, nb);
336		vfs_bio_clrbuf(nbp);
337		if (DOINGSOFTDEP(vp)) {
338			softdep_setup_allocindir_meta(nbp, ip, bp,
339			    indirs[i - 1].in_off, nb);
340			bdwrite(nbp);
341		} else {
342			/*
343			 * Write synchronously so that indirect blocks
344			 * never point at garbage.
345			 */
346			if ((error = bwrite(nbp)) != 0) {
347				brelse(bp);
348				goto fail;
349			}
350		}
351		bap[indirs[i - 1].in_off] = nb;
352		if (allocib == NULL && unwindidx < 0)
353			unwindidx = i - 1;
354		/*
355		 * If required, write synchronously, otherwise use
356		 * delayed write.
357		 */
358		if (flags & IO_SYNC) {
359			bwrite(bp);
360		} else {
361			if (bp->b_bufsize == fs->fs_bsize)
362				bp->b_flags |= B_CLUSTEROK;
363			bdwrite(bp);
364		}
365	}
366	/*
367	 * If asked only for the indirect block, then return it.
368	 */
369	if (flags & BA_METAONLY) {
370		curthread_pflags_restore(saved_inbdflush);
371		*bpp = bp;
372		return (0);
373	}
374	/*
375	 * Get the data block, allocating if necessary.
376	 */
377	if (nb == 0) {
378		UFS_LOCK(ump);
379		/*
380		 * If allocating metadata at the front of the cylinder
381		 * group and parent indirect block has just been allocated,
382		 * then cluster next to it if it is the first indirect in
383		 * the file. Otherwise it has been allocated in the metadata
384		 * area, so we want to find our own place out in the data area.
385		 */
386		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
387			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
388			    &bap[0]);
389		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
390		    flags | IO_BUFLOCKED, cred, &newb);
391		if (error) {
392			brelse(bp);
393			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
394				UFS_LOCK(ump);
395				softdep_request_cleanup(fs, vp, cred,
396				    FLUSH_BLOCKS_WAIT);
397				UFS_UNLOCK(ump);
398				goto retry;
399			}
400			if (ppsratecheck(&lastfail, &curfail, 1)) {
401				ffs_fserr(fs, ip->i_number, "filesystem full");
402				uprintf("\n%s: write failed, filesystem "
403				    "is full\n", fs->fs_fsmnt);
404			}
405			goto fail;
406		}
407		nb = newb;
408		MPASS(allocblk < allociblk + nitems(allociblk));
409		MPASS(lbns_remfree < lbns + nitems(lbns));
410		*allocblk++ = nb;
411		*lbns_remfree++ = lbn;
412		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
413		nbp->b_blkno = fsbtodb(fs, nb);
414		if (flags & BA_CLRBUF)
415			vfs_bio_clrbuf(nbp);
416		if (DOINGSOFTDEP(vp))
417			softdep_setup_allocindir_page(ip, lbn, bp,
418			    indirs[i].in_off, nb, 0, nbp);
419		bap[indirs[i].in_off] = nb;
420		/*
421		 * If required, write synchronously, otherwise use
422		 * delayed write.
423		 */
424		if (flags & IO_SYNC) {
425			bwrite(bp);
426		} else {
427			if (bp->b_bufsize == fs->fs_bsize)
428				bp->b_flags |= B_CLUSTEROK;
429			bdwrite(bp);
430		}
431		curthread_pflags_restore(saved_inbdflush);
432		*bpp = nbp;
433		return (0);
434	}
435	brelse(bp);
436	if (flags & BA_CLRBUF) {
437		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
438		if (seqcount != 0 &&
439		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
440		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
441			error = cluster_read(vp, ip->i_size, lbn,
442			    (int)fs->fs_bsize, NOCRED,
443			    MAXBSIZE, seqcount, gbflags, &nbp);
444		} else {
445			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
446			    gbflags, &nbp);
447		}
448		if (error) {
449			brelse(nbp);
450			goto fail;
451		}
452	} else {
453		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
454		nbp->b_blkno = fsbtodb(fs, nb);
455	}
456	curthread_pflags_restore(saved_inbdflush);
457	*bpp = nbp;
458	return (0);
459fail:
460	curthread_pflags_restore(saved_inbdflush);
461	/*
462	 * If we have failed to allocate any blocks, simply return the error.
463	 * This is the usual case and avoids the need to fsync the file.
464	 */
465	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
466		return (error);
467	/*
468	 * If we have failed part way through block allocation, we
469	 * have to deallocate any indirect blocks that we have allocated.
470	 * We have to fsync the file before we start to get rid of all
471	 * of its dependencies so that we do not leave them dangling.
472	 * We have to sync it at the end so that the soft updates code
473	 * does not find any untracked changes. Although this is really
474	 * slow, running out of disk space is not expected to be a common
475	 * occurrence. The error return from fsync is ignored as we already
476	 * have an error to return to the user.
477	 *
478	 * XXX Still have to journal the free below
479	 */
480	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
481	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
482	     blkp < allocblk; blkp++, lbns_remfree++) {
483		/*
484		 * We shall not leave the freed blocks on the vnode
485		 * buffer object lists.
486		 */
487		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
488		    GB_NOCREAT | GB_UNMAPPED);
489		if (bp != NULL) {
490			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
491			    ("mismatch1 l %jd %jd b %ju %ju",
492			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
493			    (uintmax_t)bp->b_blkno,
494			    (uintmax_t)fsbtodb(fs, *blkp)));
495			bp->b_flags |= (B_INVAL | B_RELBUF);
496			bp->b_flags &= ~B_ASYNC;
497			brelse(bp);
498		}
499		deallocated += fs->fs_bsize;
500	}
501	if (allocib != NULL) {
502		*allocib = 0;
503	} else if (unwindidx >= 0) {
504		int r;
505
506		r = bread(vp, indirs[unwindidx].in_lbn,
507		    (int)fs->fs_bsize, NOCRED, &bp);
508		if (r) {
509			panic("Could not unwind indirect block, error %d", r);
510			brelse(bp);
511		} else {
512			bap = (ufs1_daddr_t *)bp->b_data;
513			bap[indirs[unwindidx].in_off] = 0;
514			if (flags & IO_SYNC) {
515				bwrite(bp);
516			} else {
517				if (bp->b_bufsize == fs->fs_bsize)
518					bp->b_flags |= B_CLUSTEROK;
519				bdwrite(bp);
520			}
521		}
522	}
523	if (deallocated) {
524#ifdef QUOTA
525		/*
526		 * Restore user's disk quota because allocation failed.
527		 */
528		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
529#endif
530		dp->di_blocks -= btodb(deallocated);
531		ip->i_flag |= IN_CHANGE | IN_UPDATE;
532	}
533	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
534	/*
535	 * After the buffers are invalidated and on-disk pointers are
536	 * cleared, free the blocks.
537	 */
538	for (blkp = allociblk; blkp < allocblk; blkp++) {
539#ifdef INVARIANTS
540		if (blkp == allociblk)
541			lbns_remfree = lbns;
542		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
543		    GB_NOCREAT | GB_UNMAPPED);
544		if (bp != NULL) {
545			panic("zombie1 %jd %ju %ju",
546			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
547			    (uintmax_t)fsbtodb(fs, *blkp));
548		}
549		lbns_remfree++;
550#endif
551		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
552		    ip->i_number, vp->v_type, NULL);
553	}
554	return (error);
555}
556
/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 */
564int
565ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
566    struct ucred *cred, int flags, struct buf **bpp)
567{
568	struct inode *ip;
569	struct ufs2_dinode *dp;
570	ufs_lbn_t lbn, lastlbn;
571	struct fs *fs;
572	struct buf *bp, *nbp;
573	struct ufsmount *ump;
574	struct indir indirs[NIADDR + 2];
575	ufs2_daddr_t nb, newb, *bap, pref;
576	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
577	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
578	int deallocated, osize, nsize, num, i, error;
579	int unwindidx = -1;
580	int saved_inbdflush;
581	static struct timeval lastfail;
582	static int curfail;
583	int gbflags, reclaimed;
584
585	ip = VTOI(vp);
586	dp = ip->i_din2;
587	fs = ip->i_fs;
588	ump = ip->i_ump;
589	lbn = lblkno(fs, startoffset);
590	size = blkoff(fs, startoffset) + size;
591	reclaimed = 0;
592	if (size > fs->fs_bsize)
593		panic("ffs_balloc_ufs2: blk too big");
594	*bpp = NULL;
595	if (lbn < 0)
596		return (EFBIG);
597	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
598
599	if (DOINGSOFTDEP(vp))
600		softdep_prealloc(vp, MNT_WAIT);
601
602	/*
603	 * Check for allocating external data.
604	 */
605	if (flags & IO_EXT) {
606		if (lbn >= NXADDR)
607			return (EFBIG);
608		/*
609		 * If the next write will extend the data into a new block,
610		 * and the data is currently composed of a fragment
611		 * this fragment has to be extended to be a full block.
612		 */
613		lastlbn = lblkno(fs, dp->di_extsize);
614		if (lastlbn < lbn) {
615			nb = lastlbn;
616			osize = sblksize(fs, dp->di_extsize, nb);
617			if (osize < fs->fs_bsize && osize > 0) {
618				UFS_LOCK(ump);
619				error = ffs_realloccg(ip, -1 - nb,
620				    dp->di_extb[nb],
621				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
622				    &dp->di_extb[0]), osize,
623				    (int)fs->fs_bsize, flags, cred, &bp);
624				if (error)
625					return (error);
626				if (DOINGSOFTDEP(vp))
627					softdep_setup_allocext(ip, nb,
628					    dbtofsb(fs, bp->b_blkno),
629					    dp->di_extb[nb],
630					    fs->fs_bsize, osize, bp);
631				dp->di_extsize = smalllblktosize(fs, nb + 1);
632				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
633				bp->b_xflags |= BX_ALTDATA;
634				ip->i_flag |= IN_CHANGE;
635				if (flags & IO_SYNC)
636					bwrite(bp);
637				else
638					bawrite(bp);
639			}
640		}
641		/*
642		 * All blocks are direct blocks
643		 */
644		if (flags & BA_METAONLY)
645			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
646		nb = dp->di_extb[lbn];
647		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
648			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
649			    gbflags, &bp);
650			if (error) {
651				brelse(bp);
652				return (error);
653			}
654			bp->b_blkno = fsbtodb(fs, nb);
655			bp->b_xflags |= BX_ALTDATA;
656			*bpp = bp;
657			return (0);
658		}
659		if (nb != 0) {
660			/*
661			 * Consider need to reallocate a fragment.
662			 */
663			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
664			nsize = fragroundup(fs, size);
665			if (nsize <= osize) {
666				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
667				    gbflags, &bp);
668				if (error) {
669					brelse(bp);
670					return (error);
671				}
672				bp->b_blkno = fsbtodb(fs, nb);
673				bp->b_xflags |= BX_ALTDATA;
674			} else {
675				UFS_LOCK(ump);
676				error = ffs_realloccg(ip, -1 - lbn,
677				    dp->di_extb[lbn],
678				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
679				    &dp->di_extb[0]), osize, nsize, flags,
680				    cred, &bp);
681				if (error)
682					return (error);
683				bp->b_xflags |= BX_ALTDATA;
684				if (DOINGSOFTDEP(vp))
685					softdep_setup_allocext(ip, lbn,
686					    dbtofsb(fs, bp->b_blkno), nb,
687					    nsize, osize, bp);
688			}
689		} else {
690			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
691				nsize = fragroundup(fs, size);
692			else
693				nsize = fs->fs_bsize;
694			UFS_LOCK(ump);
695			error = ffs_alloc(ip, lbn,
696			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
697			   nsize, flags, cred, &newb);
698			if (error)
699				return (error);
700			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
701			bp->b_blkno = fsbtodb(fs, newb);
702			bp->b_xflags |= BX_ALTDATA;
703			if (flags & BA_CLRBUF)
704				vfs_bio_clrbuf(bp);
705			if (DOINGSOFTDEP(vp))
706				softdep_setup_allocext(ip, lbn, newb, 0,
707				    nsize, 0, bp);
708		}
709		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
710		ip->i_flag |= IN_CHANGE;
711		*bpp = bp;
712		return (0);
713	}
714	/*
715	 * If the next write will extend the file into a new block,
716	 * and the file is currently composed of a fragment
717	 * this fragment has to be extended to be a full block.
718	 */
719	lastlbn = lblkno(fs, ip->i_size);
720	if (lastlbn < NDADDR && lastlbn < lbn) {
721		nb = lastlbn;
722		osize = blksize(fs, ip, nb);
723		if (osize < fs->fs_bsize && osize > 0) {
724			UFS_LOCK(ump);
725			error = ffs_realloccg(ip, nb, dp->di_db[nb],
726			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
727			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
728			    flags, cred, &bp);
729			if (error)
730				return (error);
731			if (DOINGSOFTDEP(vp))
732				softdep_setup_allocdirect(ip, nb,
733				    dbtofsb(fs, bp->b_blkno),
734				    dp->di_db[nb],
735				    fs->fs_bsize, osize, bp);
736			ip->i_size = smalllblktosize(fs, nb + 1);
737			dp->di_size = ip->i_size;
738			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
739			ip->i_flag |= IN_CHANGE | IN_UPDATE;
740			if (flags & IO_SYNC)
741				bwrite(bp);
742			else
743				bawrite(bp);
744		}
745	}
746	/*
747	 * The first NDADDR blocks are direct blocks
748	 */
749	if (lbn < NDADDR) {
750		if (flags & BA_METAONLY)
751			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
752		nb = dp->di_db[lbn];
753		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
754			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
755			    gbflags, &bp);
756			if (error) {
757				brelse(bp);
758				return (error);
759			}
760			bp->b_blkno = fsbtodb(fs, nb);
761			*bpp = bp;
762			return (0);
763		}
764		if (nb != 0) {
765			/*
766			 * Consider need to reallocate a fragment.
767			 */
768			osize = fragroundup(fs, blkoff(fs, ip->i_size));
769			nsize = fragroundup(fs, size);
770			if (nsize <= osize) {
771				error = bread_gb(vp, lbn, osize, NOCRED,
772				    gbflags, &bp);
773				if (error) {
774					brelse(bp);
775					return (error);
776				}
777				bp->b_blkno = fsbtodb(fs, nb);
778			} else {
779				UFS_LOCK(ump);
780				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
781				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
782				    &dp->di_db[0]), osize, nsize, flags,
783				    cred, &bp);
784				if (error)
785					return (error);
786				if (DOINGSOFTDEP(vp))
787					softdep_setup_allocdirect(ip, lbn,
788					    dbtofsb(fs, bp->b_blkno), nb,
789					    nsize, osize, bp);
790			}
791		} else {
792			if (ip->i_size < smalllblktosize(fs, lbn + 1))
793				nsize = fragroundup(fs, size);
794			else
795				nsize = fs->fs_bsize;
796			UFS_LOCK(ump);
797			error = ffs_alloc(ip, lbn,
798			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
799				&dp->di_db[0]), nsize, flags, cred, &newb);
800			if (error)
801				return (error);
802			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
803			bp->b_blkno = fsbtodb(fs, newb);
804			if (flags & BA_CLRBUF)
805				vfs_bio_clrbuf(bp);
806			if (DOINGSOFTDEP(vp))
807				softdep_setup_allocdirect(ip, lbn, newb, 0,
808				    nsize, 0, bp);
809		}
810		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
811		ip->i_flag |= IN_CHANGE | IN_UPDATE;
812		*bpp = bp;
813		return (0);
814	}
815	/*
816	 * Determine the number of levels of indirection.
817	 */
818	pref = 0;
819	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
820		return(error);
821#ifdef INVARIANTS
822	if (num < 1)
823		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
824#endif
825	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
826	/*
827	 * Fetch the first indirect block allocating if necessary.
828	 */
829	--num;
830	nb = dp->di_ib[indirs[0].in_off];
831	allocib = NULL;
832	allocblk = allociblk;
833	lbns_remfree = lbns;
834	if (nb == 0) {
835		UFS_LOCK(ump);
836		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
837		    (ufs2_daddr_t *)0);
838		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
839		    flags, cred, &newb)) != 0) {
840			curthread_pflags_restore(saved_inbdflush);
841			return (error);
842		}
843		pref = newb + fs->fs_frag;
844		nb = newb;
845		MPASS(allocblk < allociblk + nitems(allociblk));
846		MPASS(lbns_remfree < lbns + nitems(lbns));
847		*allocblk++ = nb;
848		*lbns_remfree++ = indirs[1].in_lbn;
849		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
850		    GB_UNMAPPED);
851		bp->b_blkno = fsbtodb(fs, nb);
852		vfs_bio_clrbuf(bp);
853		if (DOINGSOFTDEP(vp)) {
854			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
855			    newb, 0, fs->fs_bsize, 0, bp);
856			bdwrite(bp);
857		} else {
858			/*
859			 * Write synchronously so that indirect blocks
860			 * never point at garbage.
861			 */
862			if (DOINGASYNC(vp))
863				bdwrite(bp);
864			else if ((error = bwrite(bp)) != 0)
865				goto fail;
866		}
867		allocib = &dp->di_ib[indirs[0].in_off];
868		*allocib = nb;
869		ip->i_flag |= IN_CHANGE | IN_UPDATE;
870	}
871	/*
872	 * Fetch through the indirect blocks, allocating as necessary.
873	 */
874retry:
875	for (i = 1;;) {
876		error = bread(vp,
877		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
878		if (error) {
879			brelse(bp);
880			goto fail;
881		}
882		bap = (ufs2_daddr_t *)bp->b_data;
883		nb = bap[indirs[i].in_off];
884		if (i == num)
885			break;
886		i += 1;
887		if (nb != 0) {
888			bqrelse(bp);
889			continue;
890		}
891		UFS_LOCK(ump);
892		/*
893		 * If parent indirect has just been allocated, try to cluster
894		 * immediately following it.
895		 */
896		if (pref == 0)
897			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
898			    (ufs2_daddr_t *)0);
899		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
900		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
901			brelse(bp);
902			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
903				UFS_LOCK(ump);
904				softdep_request_cleanup(fs, vp, cred,
905				    FLUSH_BLOCKS_WAIT);
906				UFS_UNLOCK(ump);
907				goto retry;
908			}
909			if (ppsratecheck(&lastfail, &curfail, 1)) {
910				ffs_fserr(fs, ip->i_number, "filesystem full");
911				uprintf("\n%s: write failed, filesystem "
912				    "is full\n", fs->fs_fsmnt);
913			}
914			goto fail;
915		}
916		pref = newb + fs->fs_frag;
917		nb = newb;
918		MPASS(allocblk < allociblk + nitems(allociblk));
919		MPASS(lbns_remfree < lbns + nitems(lbns));
920		*allocblk++ = nb;
921		*lbns_remfree++ = indirs[i].in_lbn;
922		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
923		    GB_UNMAPPED);
924		nbp->b_blkno = fsbtodb(fs, nb);
925		vfs_bio_clrbuf(nbp);
926		if (DOINGSOFTDEP(vp)) {
927			softdep_setup_allocindir_meta(nbp, ip, bp,
928			    indirs[i - 1].in_off, nb);
929			bdwrite(nbp);
930		} else {
931			/*
932			 * Write synchronously so that indirect blocks
933			 * never point at garbage.
934			 */
935			if ((error = bwrite(nbp)) != 0) {
936				brelse(bp);
937				goto fail;
938			}
939		}
940		bap[indirs[i - 1].in_off] = nb;
941		if (allocib == NULL && unwindidx < 0)
942			unwindidx = i - 1;
943		/*
944		 * If required, write synchronously, otherwise use
945		 * delayed write.
946		 */
947		if (flags & IO_SYNC) {
948			bwrite(bp);
949		} else {
950			if (bp->b_bufsize == fs->fs_bsize)
951				bp->b_flags |= B_CLUSTEROK;
952			bdwrite(bp);
953		}
954	}
955	/*
956	 * If asked only for the indirect block, then return it.
957	 */
958	if (flags & BA_METAONLY) {
959		curthread_pflags_restore(saved_inbdflush);
960		*bpp = bp;
961		return (0);
962	}
963	/*
964	 * Get the data block, allocating if necessary.
965	 */
966	if (nb == 0) {
967		UFS_LOCK(ump);
968		/*
969		 * If allocating metadata at the front of the cylinder
970		 * group and parent indirect block has just been allocated,
971		 * then cluster next to it if it is the first indirect in
972		 * the file. Otherwise it has been allocated in the metadata
973		 * area, so we want to find our own place out in the data area.
974		 */
975		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
976			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
977			    &bap[0]);
978		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
979		    flags | IO_BUFLOCKED, cred, &newb);
980		if (error) {
981			brelse(bp);
982			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
983				UFS_LOCK(ump);
984				softdep_request_cleanup(fs, vp, cred,
985				    FLUSH_BLOCKS_WAIT);
986				UFS_UNLOCK(ump);
987				goto retry;
988			}
989			if (ppsratecheck(&lastfail, &curfail, 1)) {
990				ffs_fserr(fs, ip->i_number, "filesystem full");
991				uprintf("\n%s: write failed, filesystem "
992				    "is full\n", fs->fs_fsmnt);
993			}
994			goto fail;
995		}
996		nb = newb;
997		MPASS(allocblk < allociblk + nitems(allociblk));
998		MPASS(lbns_remfree < lbns + nitems(lbns));
999		*allocblk++ = nb;
1000		*lbns_remfree++ = lbn;
1001		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1002		nbp->b_blkno = fsbtodb(fs, nb);
1003		if (flags & BA_CLRBUF)
1004			vfs_bio_clrbuf(nbp);
1005		if (DOINGSOFTDEP(vp))
1006			softdep_setup_allocindir_page(ip, lbn, bp,
1007			    indirs[i].in_off, nb, 0, nbp);
1008		bap[indirs[i].in_off] = nb;
1009		/*
1010		 * If required, write synchronously, otherwise use
1011		 * delayed write.
1012		 */
1013		if (flags & IO_SYNC) {
1014			bwrite(bp);
1015		} else {
1016			if (bp->b_bufsize == fs->fs_bsize)
1017				bp->b_flags |= B_CLUSTEROK;
1018			bdwrite(bp);
1019		}
1020		curthread_pflags_restore(saved_inbdflush);
1021		*bpp = nbp;
1022		return (0);
1023	}
1024	brelse(bp);
1025	/*
1026	 * If requested clear invalid portions of the buffer.  If we
1027	 * have to do a read-before-write (typical if BA_CLRBUF is set),
1028	 * try to do some read-ahead in the sequential case to reduce
1029	 * the number of I/O transactions.
1030	 */
1031	if (flags & BA_CLRBUF) {
1032		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
1033		if (seqcount != 0 &&
1034		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
1035		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
1036			error = cluster_read(vp, ip->i_size, lbn,
1037			    (int)fs->fs_bsize, NOCRED,
1038			    MAXBSIZE, seqcount, gbflags, &nbp);
1039		} else {
1040			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
1041			    NOCRED, gbflags, &nbp);
1042		}
1043		if (error) {
1044			brelse(nbp);
1045			goto fail;
1046		}
1047	} else {
1048		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1049		nbp->b_blkno = fsbtodb(fs, nb);
1050	}
1051	curthread_pflags_restore(saved_inbdflush);
1052	*bpp = nbp;
1053	return (0);
1054fail:
1055	curthread_pflags_restore(saved_inbdflush);
1056	/*
1057	 * If we have failed to allocate any blocks, simply return the error.
1058	 * This is the usual case and avoids the need to fsync the file.
1059	 */
1060	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
1061		return (error);
1062	/*
1063	 * If we have failed part way through block allocation, we
1064	 * have to deallocate any indirect blocks that we have allocated.
1065	 * We have to fsync the file before we start to get rid of all
1066	 * of its dependencies so that we do not leave them dangling.
1067	 * We have to sync it at the end so that the soft updates code
1068	 * does not find any untracked changes. Although this is really
1069	 * slow, running out of disk space is not expected to be a common
1070	 * occurrence. The error return from fsync is ignored as we already
1071	 * have an error to return to the user.
1072	 *
1073	 * XXX Still have to journal the free below
1074	 */
1075	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1076	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
1077	     blkp < allocblk; blkp++, lbns_remfree++) {
1078		/*
1079		 * We shall not leave the freed blocks on the vnode
1080		 * buffer object lists.
1081		 */
1082		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1083		    GB_NOCREAT | GB_UNMAPPED);
1084		if (bp != NULL) {
1085			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
1086			    ("mismatch2 l %jd %jd b %ju %ju",
1087			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
1088			    (uintmax_t)bp->b_blkno,
1089			    (uintmax_t)fsbtodb(fs, *blkp)));
1090			bp->b_flags |= (B_INVAL | B_RELBUF);
1091			bp->b_flags &= ~B_ASYNC;
1092			brelse(bp);
1093		}
1094		deallocated += fs->fs_bsize;
1095	}
1096	if (allocib != NULL) {
1097		*allocib = 0;
1098	} else if (unwindidx >= 0) {
1099		int r;
1100
1101		r = bread(vp, indirs[unwindidx].in_lbn,
1102		    (int)fs->fs_bsize, NOCRED, &bp);
1103		if (r) {
1104			panic("Could not unwind indirect block, error %d", r);
1105			brelse(bp);
1106		} else {
1107			bap = (ufs2_daddr_t *)bp->b_data;
1108			bap[indirs[unwindidx].in_off] = 0;
1109			if (flags & IO_SYNC) {
1110				bwrite(bp);
1111			} else {
1112				if (bp->b_bufsize == fs->fs_bsize)
1113					bp->b_flags |= B_CLUSTEROK;
1114				bdwrite(bp);
1115			}
1116		}
1117	}
1118	if (deallocated) {
1119#ifdef QUOTA
1120		/*
1121		 * Restore user's disk quota because allocation failed.
1122		 */
1123		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
1124#endif
1125		dp->di_blocks -= btodb(deallocated);
1126		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1127	}
1128	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1129	/*
1130	 * After the buffers are invalidated and on-disk pointers are
1131	 * cleared, free the blocks.
1132	 */
1133	for (blkp = allociblk; blkp < allocblk; blkp++) {
1134#ifdef INVARIANTS
1135		if (blkp == allociblk)
1136			lbns_remfree = lbns;
1137		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1138		    GB_NOCREAT | GB_UNMAPPED);
1139		if (bp != NULL) {
1140			panic("zombie2 %jd %ju %ju",
1141			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
1142			    (uintmax_t)fsbtodb(fs, *blkp));
1143		}
1144		lbns_remfree++;
1145#endif
1146		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
1147		    ip->i_number, vp->v_type, NULL);
1148	}
1149	return (error);
1150}
1151