1/*-
2 * Copyright (c) 2002 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * Copyright (c) 1982, 1986, 1989, 1993
33 *	The Regents of the University of California.  All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 *    notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 *    notice, this list of conditions and the following disclaimer in the
42 *    documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 *    may be used to endorse or promote products derived from this software
45 *    without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
60 */
61
62#include <sys/cdefs.h>
63__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_balloc.c 362050 2020-06-11 11:45:30Z kib $");
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/bio.h>
68#include <sys/buf.h>
69#include <sys/lock.h>
70#include <sys/mount.h>
71#include <sys/vnode.h>
72#include <sys/vmmeter.h>
73
74#include <ufs/ufs/quota.h>
75#include <ufs/ufs/inode.h>
76#include <ufs/ufs/ufs_extern.h>
77#include <ufs/ufs/extattr.h>
78#include <ufs/ufs/ufsmount.h>
79
80#include <ufs/ffs/fs.h>
81#include <ufs/ffs/ffs_extern.h>
82
83/*
84 * Balloc defines the structure of filesystem storage
85 * by allocating the physical blocks on a device given
86 * the inode and the logical block number in a file.
87 * This is the allocation strategy for UFS1. Below is
88 * the allocation strategy for UFS2.
89 */
/*
 * ffs_balloc_ufs1() --
 *	Ensure that the UFS1 file backing "vp" has a physical block
 *	allocated for the logical block containing byte offset
 *	"startoffset", growing a trailing fragment or allocating
 *	indirect blocks along the way as needed.
 *
 *	Parameters:
 *	  vp          - locked vnode of the file being extended/written
 *	  startoffset - byte offset of the start of the write
 *	  size        - number of bytes to be written at startoffset;
 *	                must not cross a block boundary (panics if the
 *	                span exceeds fs_bsize)
 *	  cred        - credentials charged for the allocation (quota)
 *	  flags       - IO_SYNC, IO_EXT, BA_CLRBUF, BA_METAONLY,
 *	                BA_UNMAPPED, BA_SEQMASK as handled below
 *	  bpp         - on success (return 0), *bpp is the locked buffer
 *	                for the data block (or the indirect block when
 *	                BA_METAONLY is set); set to NULL on entry
 *
 *	Returns 0 on success or an errno (EOPNOTSUPP, EFBIG, ENOSPC
 *	propagated from ffs_alloc, etc.).  On failure after partial
 *	allocation, all newly allocated blocks are unwound and freed
 *	(see the "fail:" label).
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	/*
	 * allociblk[]/lbns[] record every block allocated during this
	 * call (block number and logical block number) so that the
	 * "fail:" path can invalidate and free them if the overall
	 * allocation cannot be completed.
	 */
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/*
	 * Widen "size" to span from the start of the block to the end
	 * of the write, so the sanity check below catches writes that
	 * would cross a block boundary.
	 */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	/* UFS1 inodes have no external attribute area. */
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	/* Honor the caller's request for an unmapped buffer. */
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/* ffs_realloccg() consumes the UFS lock. */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_UPDATE |
			    IN_IBLKDATA;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and fully covered by the file
		 * size: just read it in and return it.
		 */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment in place or move it. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last (partial) block of the file, else a full
			 * block.
			 */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		/* Try to cluster the next allocation right after this one. */
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		/* Hook the new indirect into the inode. */
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		/* Innermost level reached: nb is (or will be) the data block. */
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * On the first ENOSPC-style failure with soft
			 * updates, ask softdep to flush freed blocks to
			 * disk and retry once.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			/* Rate-limited "filesystem full" console message. */
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Enter the new block into its parent indirect. */
		bap[indirs[i - 1].in_off] = nb;
		/*
		 * Remember the first indirect we modified in place so the
		 * fail path can zero that entry back out (only needed when
		 * the top-level inode pointer was not newly allocated).
		 */
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			/* Same one-shot softdep reclaim/retry as above. */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * Data block already exists.  If the caller wants the buffer
	 * cleared (read-before-write), read it in, clustering with
	 * read-ahead when the access pattern is sequential and memory
	 * pressure permits; otherwise just get an empty buffer.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		/* Clear the newly set inode indirect pointer. */
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): the brelse() below is unreachable
			 * after panic(); kept as in the original source.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			/* Zero the entry we wrote into the indirect block. */
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}
558
559/*
560 * Balloc defines the structure of file system storage
561 * by allocating the physical blocks on a device given
562 * the inode and the logical block number in a file.
563 * This is the allocation strategy for UFS2. Above is
564 * the allocation strategy for UFS1.
565 */
566int
567ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
568    struct ucred *cred, int flags, struct buf **bpp)
569{
570	struct inode *ip;
571	struct ufs2_dinode *dp;
572	ufs_lbn_t lbn, lastlbn;
573	struct fs *fs;
574	struct buf *bp, *nbp;
575	struct ufsmount *ump;
576	struct indir indirs[NIADDR + 2];
577	ufs2_daddr_t nb, newb, *bap, pref;
578	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
579	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
580	int deallocated, osize, nsize, num, i, error;
581	int unwindidx = -1;
582	int saved_inbdflush;
583	static struct timeval lastfail;
584	static int curfail;
585	int gbflags, reclaimed;
586
587	ip = VTOI(vp);
588	dp = ip->i_din2;
589	fs = ITOFS(ip);
590	ump = ITOUMP(ip);
591	lbn = lblkno(fs, startoffset);
592	size = blkoff(fs, startoffset) + size;
593	reclaimed = 0;
594	if (size > fs->fs_bsize)
595		panic("ffs_balloc_ufs2: blk too big");
596	*bpp = NULL;
597	if (lbn < 0)
598		return (EFBIG);
599	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
600
601	if (DOINGSOFTDEP(vp))
602		softdep_prealloc(vp, MNT_WAIT);
603
604	/*
605	 * Check for allocating external data.
606	 */
607	if (flags & IO_EXT) {
608		if (lbn >= NXADDR)
609			return (EFBIG);
610		/*
611		 * If the next write will extend the data into a new block,
612		 * and the data is currently composed of a fragment
613		 * this fragment has to be extended to be a full block.
614		 */
615		lastlbn = lblkno(fs, dp->di_extsize);
616		if (lastlbn < lbn) {
617			nb = lastlbn;
618			osize = sblksize(fs, dp->di_extsize, nb);
619			if (osize < fs->fs_bsize && osize > 0) {
620				UFS_LOCK(ump);
621				error = ffs_realloccg(ip, -1 - nb,
622				    dp->di_extb[nb],
623				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
624				    &dp->di_extb[0]), osize,
625				    (int)fs->fs_bsize, flags, cred, &bp);
626				if (error)
627					return (error);
628				if (DOINGSOFTDEP(vp))
629					softdep_setup_allocext(ip, nb,
630					    dbtofsb(fs, bp->b_blkno),
631					    dp->di_extb[nb],
632					    fs->fs_bsize, osize, bp);
633				dp->di_extsize = smalllblktosize(fs, nb + 1);
634				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
635				bp->b_xflags |= BX_ALTDATA;
636				ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_IBLKDATA;
637				if (flags & IO_SYNC)
638					bwrite(bp);
639				else
640					bawrite(bp);
641			}
642		}
643		/*
644		 * All blocks are direct blocks
645		 */
646		if (flags & BA_METAONLY)
647			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
648		nb = dp->di_extb[lbn];
649		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
650			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
651			    gbflags, &bp);
652			if (error) {
653				brelse(bp);
654				return (error);
655			}
656			bp->b_blkno = fsbtodb(fs, nb);
657			bp->b_xflags |= BX_ALTDATA;
658			*bpp = bp;
659			return (0);
660		}
661		if (nb != 0) {
662			/*
663			 * Consider need to reallocate a fragment.
664			 */
665			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
666			nsize = fragroundup(fs, size);
667			if (nsize <= osize) {
668				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
669				    gbflags, &bp);
670				if (error) {
671					brelse(bp);
672					return (error);
673				}
674				bp->b_blkno = fsbtodb(fs, nb);
675				bp->b_xflags |= BX_ALTDATA;
676			} else {
677				UFS_LOCK(ump);
678				error = ffs_realloccg(ip, -1 - lbn,
679				    dp->di_extb[lbn],
680				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
681				    &dp->di_extb[0]), osize, nsize, flags,
682				    cred, &bp);
683				if (error)
684					return (error);
685				bp->b_xflags |= BX_ALTDATA;
686				if (DOINGSOFTDEP(vp))
687					softdep_setup_allocext(ip, lbn,
688					    dbtofsb(fs, bp->b_blkno), nb,
689					    nsize, osize, bp);
690			}
691		} else {
692			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
693				nsize = fragroundup(fs, size);
694			else
695				nsize = fs->fs_bsize;
696			UFS_LOCK(ump);
697			error = ffs_alloc(ip, lbn,
698			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
699			   nsize, flags, cred, &newb);
700			if (error)
701				return (error);
702			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
703			bp->b_blkno = fsbtodb(fs, newb);
704			bp->b_xflags |= BX_ALTDATA;
705			if (flags & BA_CLRBUF)
706				vfs_bio_clrbuf(bp);
707			if (DOINGSOFTDEP(vp))
708				softdep_setup_allocext(ip, lbn, newb, 0,
709				    nsize, 0, bp);
710		}
711		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
712		ip->i_flag |= IN_CHANGE | IN_IBLKDATA;
713		*bpp = bp;
714		return (0);
715	}
716	/*
717	 * If the next write will extend the file into a new block,
718	 * and the file is currently composed of a fragment
719	 * this fragment has to be extended to be a full block.
720	 */
721	lastlbn = lblkno(fs, ip->i_size);
722	if (lastlbn < NDADDR && lastlbn < lbn) {
723		nb = lastlbn;
724		osize = blksize(fs, ip, nb);
725		if (osize < fs->fs_bsize && osize > 0) {
726			UFS_LOCK(ump);
727			error = ffs_realloccg(ip, nb, dp->di_db[nb],
728			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
729			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
730			    flags, cred, &bp);
731			if (error)
732				return (error);
733			if (DOINGSOFTDEP(vp))
734				softdep_setup_allocdirect(ip, nb,
735				    dbtofsb(fs, bp->b_blkno),
736				    dp->di_db[nb],
737				    fs->fs_bsize, osize, bp);
738			ip->i_size = smalllblktosize(fs, nb + 1);
739			dp->di_size = ip->i_size;
740			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
741			ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_UPDATE |
742			    IN_IBLKDATA;
743			if (flags & IO_SYNC)
744				bwrite(bp);
745			else
746				bawrite(bp);
747		}
748	}
749	/*
750	 * The first NDADDR blocks are direct blocks
751	 */
752	if (lbn < NDADDR) {
753		if (flags & BA_METAONLY)
754			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
755		nb = dp->di_db[lbn];
756		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
757			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
758			    gbflags, &bp);
759			if (error) {
760				brelse(bp);
761				return (error);
762			}
763			bp->b_blkno = fsbtodb(fs, nb);
764			*bpp = bp;
765			return (0);
766		}
767		if (nb != 0) {
768			/*
769			 * Consider need to reallocate a fragment.
770			 */
771			osize = fragroundup(fs, blkoff(fs, ip->i_size));
772			nsize = fragroundup(fs, size);
773			if (nsize <= osize) {
774				error = bread_gb(vp, lbn, osize, NOCRED,
775				    gbflags, &bp);
776				if (error) {
777					brelse(bp);
778					return (error);
779				}
780				bp->b_blkno = fsbtodb(fs, nb);
781			} else {
782				UFS_LOCK(ump);
783				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
784				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
785				    &dp->di_db[0]), osize, nsize, flags,
786				    cred, &bp);
787				if (error)
788					return (error);
789				if (DOINGSOFTDEP(vp))
790					softdep_setup_allocdirect(ip, lbn,
791					    dbtofsb(fs, bp->b_blkno), nb,
792					    nsize, osize, bp);
793			}
794		} else {
795			if (ip->i_size < smalllblktosize(fs, lbn + 1))
796				nsize = fragroundup(fs, size);
797			else
798				nsize = fs->fs_bsize;
799			UFS_LOCK(ump);
800			error = ffs_alloc(ip, lbn,
801			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
802				&dp->di_db[0]), nsize, flags, cred, &newb);
803			if (error)
804				return (error);
805			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
806			bp->b_blkno = fsbtodb(fs, newb);
807			if (flags & BA_CLRBUF)
808				vfs_bio_clrbuf(bp);
809			if (DOINGSOFTDEP(vp))
810				softdep_setup_allocdirect(ip, lbn, newb, 0,
811				    nsize, 0, bp);
812		}
813		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
814		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
815		*bpp = bp;
816		return (0);
817	}
818	/*
819	 * Determine the number of levels of indirection.
820	 */
821	pref = 0;
822	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
823		return(error);
824#ifdef INVARIANTS
825	if (num < 1)
826		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
827#endif
828	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
829	/*
830	 * Fetch the first indirect block allocating if necessary.
831	 */
832	--num;
833	nb = dp->di_ib[indirs[0].in_off];
834	allocib = NULL;
835	allocblk = allociblk;
836	lbns_remfree = lbns;
837	if (nb == 0) {
838		UFS_LOCK(ump);
839		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
840		    (ufs2_daddr_t *)0);
841		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
842		    flags, cred, &newb)) != 0) {
843			curthread_pflags_restore(saved_inbdflush);
844			return (error);
845		}
846		pref = newb + fs->fs_frag;
847		nb = newb;
848		MPASS(allocblk < allociblk + nitems(allociblk));
849		MPASS(lbns_remfree < lbns + nitems(lbns));
850		*allocblk++ = nb;
851		*lbns_remfree++ = indirs[1].in_lbn;
852		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
853		    GB_UNMAPPED);
854		bp->b_blkno = fsbtodb(fs, nb);
855		vfs_bio_clrbuf(bp);
856		if (DOINGSOFTDEP(vp)) {
857			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
858			    newb, 0, fs->fs_bsize, 0, bp);
859			bdwrite(bp);
860		} else {
861			/*
862			 * Write synchronously so that indirect blocks
863			 * never point at garbage.
864			 */
865			if (DOINGASYNC(vp))
866				bdwrite(bp);
867			else if ((error = bwrite(bp)) != 0)
868				goto fail;
869		}
870		allocib = &dp->di_ib[indirs[0].in_off];
871		*allocib = nb;
872		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
873	}
874	/*
875	 * Fetch through the indirect blocks, allocating as necessary.
876	 */
877retry:
878	for (i = 1;;) {
879		error = bread(vp,
880		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
881		if (error) {
882			brelse(bp);
883			goto fail;
884		}
885		bap = (ufs2_daddr_t *)bp->b_data;
886		nb = bap[indirs[i].in_off];
887		if (i == num)
888			break;
889		i += 1;
890		if (nb != 0) {
891			bqrelse(bp);
892			continue;
893		}
894		UFS_LOCK(ump);
895		/*
896		 * If parent indirect has just been allocated, try to cluster
897		 * immediately following it.
898		 */
899		if (pref == 0)
900			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
901			    (ufs2_daddr_t *)0);
902		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
903		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
904			brelse(bp);
905			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
906				UFS_LOCK(ump);
907				softdep_request_cleanup(fs, vp, cred,
908				    FLUSH_BLOCKS_WAIT);
909				UFS_UNLOCK(ump);
910				goto retry;
911			}
912			if (ppsratecheck(&lastfail, &curfail, 1)) {
913				ffs_fserr(fs, ip->i_number, "filesystem full");
914				uprintf("\n%s: write failed, filesystem "
915				    "is full\n", fs->fs_fsmnt);
916			}
917			goto fail;
918		}
919		pref = newb + fs->fs_frag;
920		nb = newb;
921		MPASS(allocblk < allociblk + nitems(allociblk));
922		MPASS(lbns_remfree < lbns + nitems(lbns));
923		*allocblk++ = nb;
924		*lbns_remfree++ = indirs[i].in_lbn;
925		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
926		    GB_UNMAPPED);
927		nbp->b_blkno = fsbtodb(fs, nb);
928		vfs_bio_clrbuf(nbp);
929		if (DOINGSOFTDEP(vp)) {
930			softdep_setup_allocindir_meta(nbp, ip, bp,
931			    indirs[i - 1].in_off, nb);
932			bdwrite(nbp);
933		} else {
934			/*
935			 * Write synchronously so that indirect blocks
936			 * never point at garbage.
937			 */
938			if ((error = bwrite(nbp)) != 0) {
939				brelse(bp);
940				goto fail;
941			}
942		}
943		bap[indirs[i - 1].in_off] = nb;
944		if (allocib == NULL && unwindidx < 0)
945			unwindidx = i - 1;
946		/*
947		 * If required, write synchronously, otherwise use
948		 * delayed write.
949		 */
950		if (flags & IO_SYNC) {
951			bwrite(bp);
952		} else {
953			if (bp->b_bufsize == fs->fs_bsize)
954				bp->b_flags |= B_CLUSTEROK;
955			bdwrite(bp);
956		}
957	}
958	/*
959	 * If asked only for the indirect block, then return it.
960	 */
961	if (flags & BA_METAONLY) {
962		curthread_pflags_restore(saved_inbdflush);
963		*bpp = bp;
964		return (0);
965	}
966	/*
967	 * Get the data block, allocating if necessary.
968	 */
969	if (nb == 0) {
970		UFS_LOCK(ump);
971		/*
972		 * If allocating metadata at the front of the cylinder
973		 * group and parent indirect block has just been allocated,
974		 * then cluster next to it if it is the first indirect in
975		 * the file. Otherwise it has been allocated in the metadata
976		 * area, so we want to find our own place out in the data area.
977		 */
978		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
979			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
980			    &bap[0]);
981		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
982		    flags | IO_BUFLOCKED, cred, &newb);
983		if (error) {
984			brelse(bp);
985			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
986				UFS_LOCK(ump);
987				softdep_request_cleanup(fs, vp, cred,
988				    FLUSH_BLOCKS_WAIT);
989				UFS_UNLOCK(ump);
990				goto retry;
991			}
992			if (ppsratecheck(&lastfail, &curfail, 1)) {
993				ffs_fserr(fs, ip->i_number, "filesystem full");
994				uprintf("\n%s: write failed, filesystem "
995				    "is full\n", fs->fs_fsmnt);
996			}
997			goto fail;
998		}
999		nb = newb;
1000		MPASS(allocblk < allociblk + nitems(allociblk));
1001		MPASS(lbns_remfree < lbns + nitems(lbns));
1002		*allocblk++ = nb;
1003		*lbns_remfree++ = lbn;
1004		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1005		nbp->b_blkno = fsbtodb(fs, nb);
1006		if (flags & BA_CLRBUF)
1007			vfs_bio_clrbuf(nbp);
1008		if (DOINGSOFTDEP(vp))
1009			softdep_setup_allocindir_page(ip, lbn, bp,
1010			    indirs[i].in_off, nb, 0, nbp);
1011		bap[indirs[i].in_off] = nb;
1012		/*
1013		 * If required, write synchronously, otherwise use
1014		 * delayed write.
1015		 */
1016		if (flags & IO_SYNC) {
1017			bwrite(bp);
1018		} else {
1019			if (bp->b_bufsize == fs->fs_bsize)
1020				bp->b_flags |= B_CLUSTEROK;
1021			bdwrite(bp);
1022		}
1023		curthread_pflags_restore(saved_inbdflush);
1024		*bpp = nbp;
1025		return (0);
1026	}
1027	brelse(bp);
1028	/*
1029	 * If requested clear invalid portions of the buffer.  If we
1030	 * have to do a read-before-write (typical if BA_CLRBUF is set),
1031	 * try to do some read-ahead in the sequential case to reduce
1032	 * the number of I/O transactions.
1033	 */
1034	if (flags & BA_CLRBUF) {
1035		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
1036		if (seqcount != 0 &&
1037		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
1038		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
1039			error = cluster_read(vp, ip->i_size, lbn,
1040			    (int)fs->fs_bsize, NOCRED,
1041			    MAXBSIZE, seqcount, gbflags, &nbp);
1042		} else {
1043			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
1044			    NOCRED, gbflags, &nbp);
1045		}
1046		if (error) {
1047			brelse(nbp);
1048			goto fail;
1049		}
1050	} else {
1051		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1052		nbp->b_blkno = fsbtodb(fs, nb);
1053	}
1054	curthread_pflags_restore(saved_inbdflush);
1055	*bpp = nbp;
1056	return (0);
1057fail:
1058	curthread_pflags_restore(saved_inbdflush);
1059	/*
1060	 * If we have failed to allocate any blocks, simply return the error.
1061	 * This is the usual case and avoids the need to fsync the file.
1062	 */
1063	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
1064		return (error);
1065	/*
1066	 * If we have failed part way through block allocation, we
1067	 * have to deallocate any indirect blocks that we have allocated.
1068	 * We have to fsync the file before we start to get rid of all
1069	 * of its dependencies so that we do not leave them dangling.
1070	 * We have to sync it at the end so that the soft updates code
1071	 * does not find any untracked changes. Although this is really
1072	 * slow, running out of disk space is not expected to be a common
1073	 * occurrence. The error return from fsync is ignored as we already
1074	 * have an error to return to the user.
1075	 *
1076	 * XXX Still have to journal the free below
1077	 */
1078	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1079	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
1080	     blkp < allocblk; blkp++, lbns_remfree++) {
1081		/*
1082		 * We shall not leave the freed blocks on the vnode
1083		 * buffer object lists.
1084		 */
1085		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1086		    GB_NOCREAT | GB_UNMAPPED);
1087		if (bp != NULL) {
1088			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
1089			    ("mismatch2 l %jd %jd b %ju %ju",
1090			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
1091			    (uintmax_t)bp->b_blkno,
1092			    (uintmax_t)fsbtodb(fs, *blkp)));
1093			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
1094			bp->b_flags &= ~(B_ASYNC | B_CACHE);
1095			brelse(bp);
1096		}
1097		deallocated += fs->fs_bsize;
1098	}
1099	if (allocib != NULL) {
1100		*allocib = 0;
1101	} else if (unwindidx >= 0) {
1102		int r;
1103
1104		r = bread(vp, indirs[unwindidx].in_lbn,
1105		    (int)fs->fs_bsize, NOCRED, &bp);
1106		if (r) {
1107			panic("Could not unwind indirect block, error %d", r);
1108			brelse(bp);
1109		} else {
1110			bap = (ufs2_daddr_t *)bp->b_data;
1111			bap[indirs[unwindidx].in_off] = 0;
1112			if (flags & IO_SYNC) {
1113				bwrite(bp);
1114			} else {
1115				if (bp->b_bufsize == fs->fs_bsize)
1116					bp->b_flags |= B_CLUSTEROK;
1117				bdwrite(bp);
1118			}
1119		}
1120	}
1121	if (deallocated) {
1122#ifdef QUOTA
1123		/*
1124		 * Restore user's disk quota because allocation failed.
1125		 */
1126		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
1127#endif
1128		dp->di_blocks -= btodb(deallocated);
1129		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1130	}
1131	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1132	/*
1133	 * After the buffers are invalidated and on-disk pointers are
1134	 * cleared, free the blocks.
1135	 */
1136	for (blkp = allociblk; blkp < allocblk; blkp++) {
1137#ifdef INVARIANTS
1138		if (blkp == allociblk)
1139			lbns_remfree = lbns;
1140		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1141		    GB_NOCREAT | GB_UNMAPPED);
1142		if (bp != NULL) {
1143			panic("zombie2 %jd %ju %ju",
1144			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
1145			    (uintmax_t)fsbtodb(fs, *blkp));
1146		}
1147		lbns_remfree++;
1148#endif
1149		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
1150		    ip->i_number, vp->v_type, NULL);
1151	}
1152	return (error);
1153}
1154