1/*	$NetBSD: ffs_alloc.c,v 1.173 2024/05/13 00:24:19 msaitoh Exp $	*/
2
3/*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * Copyright (c) 2002 Networks Associates Technology, Inc.
34 * All rights reserved.
35 *
36 * This software was developed for the FreeBSD Project by Marshall
37 * Kirk McKusick and Network Associates Laboratories, the Security
38 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
39 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
40 * research program
41 *
42 * Copyright (c) 1982, 1986, 1989, 1993
43 *	The Regents of the University of California.  All rights reserved.
44 *
45 * Redistribution and use in source and binary forms, with or without
46 * modification, are permitted provided that the following conditions
47 * are met:
48 * 1. Redistributions of source code must retain the above copyright
49 *    notice, this list of conditions and the following disclaimer.
50 * 2. Redistributions in binary form must reproduce the above copyright
51 *    notice, this list of conditions and the following disclaimer in the
52 *    documentation and/or other materials provided with the distribution.
53 * 3. Neither the name of the University nor the names of its contributors
54 *    may be used to endorse or promote products derived from this software
55 *    without specific prior written permission.
56 *
57 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67 * SUCH DAMAGE.
68 *
69 *	@(#)ffs_alloc.c	8.19 (Berkeley) 7/13/95
70 */
71
72#include <sys/cdefs.h>
73__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.173 2024/05/13 00:24:19 msaitoh Exp $");
74
75#if defined(_KERNEL_OPT)
76#include "opt_ffs.h"
77#include "opt_quota.h"
78#include "opt_uvm_page_trkown.h"
79#endif
80
81#include <sys/param.h>
82#include <sys/systm.h>
83#include <sys/buf.h>
84#include <sys/cprng.h>
85#include <sys/kauth.h>
86#include <sys/kernel.h>
87#include <sys/mount.h>
88#include <sys/proc.h>
89#include <sys/syslog.h>
90#include <sys/vnode.h>
91#include <sys/wapbl.h>
93
94#include <miscfs/specfs/specdev.h>
95#include <ufs/ufs/quota.h>
96#include <ufs/ufs/ufsmount.h>
97#include <ufs/ufs/inode.h>
98#include <ufs/ufs/ufs_extern.h>
99#include <ufs/ufs/ufs_bswap.h>
100#include <ufs/ufs/ufs_wapbl.h>
101
102#include <ufs/ffs/fs.h>
103#include <ufs/ffs/ffs_extern.h>
104
105#ifdef UVM_PAGE_TRKOWN
106#include <uvm/uvm_object.h>
107#include <uvm/uvm_page.h>
108#endif
109
110static daddr_t ffs_alloccg(struct inode *, u_int, daddr_t, int, int, int);
111static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int);
112static ino_t ffs_dirpref(struct inode *);
113static daddr_t ffs_fragextend(struct inode *, u_int, daddr_t, int, int);
114static void ffs_fserr(struct fs *, kauth_cred_t, const char *);
115static daddr_t ffs_hashalloc(struct inode *, u_int, daddr_t, int, int, int,
116    daddr_t (*)(struct inode *, u_int, daddr_t, int, int, int));
117static daddr_t ffs_nodealloccg(struct inode *, u_int, daddr_t, int, int, int);
118static int32_t ffs_mapsearch(struct fs *, struct cg *,
119				      daddr_t, int);
120static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
121    daddr_t, long, bool);
122static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
123    int, bool);
124
/* if 1, changes in optimization strategy are logged */
126int ffs_log_changeopt = 0;
127
128/* in ffs_tables.c */
129extern const int inside[], around[];
130extern const u_char * const fragtbl[];
131
132/* Basic consistency check for block allocations */
133static int
134ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
135    long size, dev_t dev, ino_t inum)
136{
137	if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 ||
138	    ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
139		panic("%s: bad size: dev = 0x%llx, bno = %" PRId64
140		    " bsize = %d, size = %ld, fs = %s", func,
141		    (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
142	}
143
144	if (bno >= fs->fs_size) {
145		printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno,
146		    (unsigned long long)inum);
147		ffs_fserr(fs, NOCRED, "bad block");
148		return EINVAL;
149	}
150	return 0;
151}
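/*
 * Note: an impossible size is treated as a kernel bug and panics, while an
 * out-of-range block number is reported via ffs_fserr() and returned as
 * EINVAL so callers can fail the operation gracefully.
 */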
152
153/*
154 * Allocate a block in the file system.
155 *
156 * The size of the requested block is given, which must be some
157 * multiple of fs_fsize and <= fs_bsize.
158 * A preference may be optionally specified. If a preference is given
159 * the following hierarchy is used to allocate a block:
160 *   1) allocate the requested block.
161 *   2) allocate a rotationally optimal block in the same cylinder.
162 *   3) allocate a block in the same cylinder group.
 *   4) quadratically rehash into other cylinder groups, until an
164 *      available block is located.
165 * If no block preference is given the following hierarchy is used
166 * to allocate a block:
167 *   1) allocate a block in the cylinder group that contains the
168 *      inode for the file.
 *   2) quadratically rehash into other cylinder groups, until an
170 *      available block is located.
171 *
172 * => called with um_lock held
173 * => releases um_lock before returning
174 */
175int
176ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
177    int flags, kauth_cred_t cred, daddr_t *bnp)
178{
179	struct ufsmount *ump;
180	struct fs *fs;
181	daddr_t bno;
182	u_int cg;
183#if defined(QUOTA) || defined(QUOTA2)
184	int error;
185#endif
186
187	fs = ip->i_fs;
188	ump = ip->i_ump;
189
190	KASSERT(mutex_owned(&ump->um_lock));
191
192#ifdef UVM_PAGE_TRKOWN
193
194	/*
195	 * Sanity-check that allocations within the file size
196	 * do not allow other threads to read the stale contents
197	 * of newly allocated blocks.
198	 * Usually pages will exist to cover the new allocation.
199	 * There is an optimization in ffs_write() where we skip
200	 * creating pages if several conditions are met:
201	 *  - the file must not be mapped (in any user address space).
202	 *  - the write must cover whole pages and whole blocks.
203	 * If those conditions are not met then pages must exist and
204	 * be locked by the current thread.
205	 */
206
207	struct vnode *vp = ITOV(ip);
208	if (vp->v_type == VREG && (flags & IO_EXT) == 0 &&
209	    ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) &&
210	    ((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 ||
211	     ffs_blkoff(fs, size) != 0)) {
212		struct vm_page *pg __diagused;
213		struct uvm_object *uobj = &vp->v_uobj;
214		voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
215		voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);
216
217		rw_enter(uobj->vmobjlock, RW_WRITER);
218		while (off < endoff) {
219			pg = uvm_pagelookup(uobj, off);
220			KASSERT((pg != NULL && pg->owner_tag != NULL &&
221				 pg->owner == curproc->p_pid &&
222				 pg->lowner == curlwp->l_lid));
223			off += PAGE_SIZE;
224		}
225		rw_exit(uobj->vmobjlock);
226	}
227#endif
228
229	*bnp = 0;
230
231	KASSERTMSG((cred != NOCRED), "missing credential");
232	KASSERTMSG(((u_int)size <= fs->fs_bsize),
233	    "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
234	    (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
235	KASSERTMSG((ffs_fragoff(fs, size) == 0),
236	    "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
237	    (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
238
239	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
240		goto nospace;
241	if (freespace(fs, fs->fs_minfree) <= 0 &&
242	    kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
243	    NULL, NULL) != 0)
244		goto nospace;
245#if defined(QUOTA) || defined(QUOTA2)
246	mutex_exit(&ump->um_lock);
247	if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
248		return (error);
249	mutex_enter(&ump->um_lock);
250#endif
251
252	if (bpref >= fs->fs_size)
253		bpref = 0;
254	if (bpref == 0)
255		cg = ino_to_cg(fs, ip->i_number);
256	else
257		cg = dtog(fs, bpref);
258	bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg);
259	if (bno > 0) {
260		DIP_ADD(ip, blocks, btodb(size));
261		if (flags & IO_EXT)
262			ip->i_flag |= IN_CHANGE;
263		else
264			ip->i_flag |= IN_CHANGE | IN_UPDATE;
265		*bnp = bno;
266		return (0);
267	}
268#if defined(QUOTA) || defined(QUOTA2)
269	/*
270	 * Restore user's disk quota because allocation failed.
271	 */
272	(void) chkdq(ip, -btodb(size), cred, FORCE);
273#endif
274	if (flags & B_CONTIG) {
275		/*
276		 * XXX ump->um_lock handling is "suspect" at best.
277		 * For the case where ffs_hashalloc() fails early
278		 * in the B_CONTIG case we reach here with um_lock
279		 * already unlocked, so we can't release it again
280		 * like in the normal error path.  See kern/39206.
		 *
283		 * Fail silently - it's up to our caller to report
284		 * errors.
285		 */
286		return (ENOSPC);
287	}
288nospace:
289	mutex_exit(&ump->um_lock);
290	ffs_fserr(fs, cred, "file system full");
291	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
292	return (ENOSPC);
293}
294
295/*
296 * Reallocate a fragment to a bigger size
297 *
298 * The number and size of the old block is given, and a preference
299 * and new size is also specified. The allocator attempts to extend
300 * the original block. Failing that, the regular block allocator is
301 * invoked to get an appropriate block.
302 *
303 * => called with um_lock held
304 * => return with um_lock released
305 */
306int
307ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bprev, daddr_t bpref,
308    int osize, int nsize, int flags, kauth_cred_t cred, struct buf **bpp,
309    daddr_t *blknop)
310{
311	struct ufsmount *ump;
312	struct fs *fs;
313	struct buf *bp;
314	u_int cg, request;
315	int error;
316	daddr_t bno;
317
318	fs = ip->i_fs;
319	ump = ip->i_ump;
320
321	KASSERT(mutex_owned(&ump->um_lock));
322
323#ifdef UVM_PAGE_TRKOWN
324
325	/*
326	 * Sanity-check that allocations within the file size
327	 * do not allow other threads to read the stale contents
328	 * of newly allocated blocks.
329	 * Unlike in ffs_alloc(), here pages must always exist
330	 * for such allocations, because only the last block of a file
331	 * can be a fragment and ffs_write() will reallocate the
332	 * fragment to the new size using ufs_balloc_range(),
333	 * which always creates pages to cover blocks it allocates.
334	 */
335
336	if (ITOV(ip)->v_type == VREG) {
337		struct vm_page *pg __diagused;
338		struct uvm_object *uobj = &ITOV(ip)->v_uobj;
339		voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
340		voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);
341
342		rw_enter(uobj->vmobjlock, RW_WRITER);
343		while (off < endoff) {
344			pg = uvm_pagelookup(uobj, off);
345			KASSERT(pg->owner == curproc->p_pid &&
346				pg->lowner == curlwp->l_lid);
347			off += PAGE_SIZE;
348		}
349		rw_exit(uobj->vmobjlock);
350	}
351#endif
352
353	KASSERTMSG((cred != NOCRED), "missing credential");
354	KASSERTMSG(((u_int)osize <= fs->fs_bsize),
355	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
356	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
357	    fs->fs_fsmnt);
358	KASSERTMSG((ffs_fragoff(fs, osize) == 0),
359	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
360	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
361	    fs->fs_fsmnt);
362	KASSERTMSG(((u_int)nsize <= fs->fs_bsize),
363	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
364	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
365	    fs->fs_fsmnt);
366	KASSERTMSG((ffs_fragoff(fs, nsize) == 0),
367	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
368	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
369	    fs->fs_fsmnt);
370
371	if (freespace(fs, fs->fs_minfree) <= 0 &&
372	    kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
373	    NULL, NULL) != 0) {
374		mutex_exit(&ump->um_lock);
375		goto nospace;
376	}
377
378	if (bprev == 0) {
379		panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %"
380		    PRId64 ", fs = %s", __func__,
381		    (unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
382		    fs->fs_fsmnt);
383	}
384	mutex_exit(&ump->um_lock);
385
386	/*
387	 * Allocate the extra space in the buffer.
388	 */
389	if (bpp != NULL &&
390	    (error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) {
391		return (error);
392	}
393#if defined(QUOTA) || defined(QUOTA2)
394	if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
395		if (bpp != NULL) {
396			brelse(bp, 0);
397		}
398		return (error);
399	}
400#endif
401	/*
402	 * Check for extension in the existing location.
403	 */
404	cg = dtog(fs, bprev);
405	mutex_enter(&ump->um_lock);
406	if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
407		DIP_ADD(ip, blocks, btodb(nsize - osize));
408		if (flags & IO_EXT)
409			ip->i_flag |= IN_CHANGE;
410		else
411			ip->i_flag |= IN_CHANGE | IN_UPDATE;
412
413		if (bpp != NULL) {
414			if (bp->b_blkno != FFS_FSBTODB(fs, bno)) {
415				panic("%s: bad blockno %#llx != %#llx",
416				    __func__, (unsigned long long) bp->b_blkno,
417				    (unsigned long long)FFS_FSBTODB(fs, bno));
418			}
419			allocbuf(bp, nsize, 1);
420			memset((char *)bp->b_data + osize, 0, nsize - osize);
421			mutex_enter(bp->b_objlock);
422			KASSERT(!cv_has_waiters(&bp->b_done));
423			bp->b_oflags |= BO_DONE;
424			mutex_exit(bp->b_objlock);
425			*bpp = bp;
426		}
427		if (blknop != NULL) {
428			*blknop = bno;
429		}
430		return (0);
431	}
432	/*
433	 * Allocate a new disk location.
434	 */
435	if (bpref >= fs->fs_size)
436		bpref = 0;
437	switch ((int)fs->fs_optim) {
438	case FS_OPTSPACE:
439		/*
440		 * Allocate an exact sized fragment. Although this makes
441		 * best use of space, we will waste time relocating it if
442		 * the file continues to grow. If the fragmentation is
443		 * less than half of the minimum free reserve, we choose
444		 * to begin optimizing for time.
445		 */
446		request = nsize;
447		if (fs->fs_minfree < 5 ||
448		    fs->fs_cstotal.cs_nffree >
449		    fs->fs_dsize * fs->fs_minfree / (2 * 100))
450			break;
451
452		if (ffs_log_changeopt) {
453			log(LOG_NOTICE,
454				"%s: optimization changed from SPACE to TIME\n",
455				fs->fs_fsmnt);
456		}
457
458		fs->fs_optim = FS_OPTTIME;
459		break;
460	case FS_OPTTIME:
461		/*
462		 * At this point we have discovered a file that is trying to
463		 * grow a small fragment to a larger fragment. To save time,
464		 * we allocate a full sized block, then free the unused portion.
465		 * If the file continues to grow, the `ffs_fragextend' call
466		 * above will be able to grow it in place without further
467		 * copying. If aberrant programs cause disk fragmentation to
468		 * grow within 2% of the free reserve, we choose to begin
469		 * optimizing for space.
470		 */
471		request = fs->fs_bsize;
472		if (fs->fs_cstotal.cs_nffree <
473		    fs->fs_dsize * (fs->fs_minfree - 2) / 100)
474			break;
475
476		if (ffs_log_changeopt) {
477			log(LOG_NOTICE,
478				"%s: optimization changed from TIME to SPACE\n",
479				fs->fs_fsmnt);
480		}
481
482		fs->fs_optim = FS_OPTSPACE;
483		break;
484	default:
485		panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s",
486		    __func__, (unsigned long long)ip->i_dev, fs->fs_optim,
487		    fs->fs_fsmnt);
488		/* NOTREACHED */
489	}
490	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg);
491	if (bno > 0) {
492		/*
		 * Use forced deallocation registration; we can't handle
		 * failure here. This is safe, as this place is hit at most
		 * once per write operation, when a fragment is extended to
		 * a longer fragment or a full block.
497		 */
498		if ((ip->i_ump->um_mountp->mnt_wapbl) &&
499		    (ITOV(ip)->v_type != VREG)) {
500			/* this should never fail */
501			error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(
502			    ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
503			    osize);
504			if (error)
505				panic("ffs_realloccg: dealloc registration failed");
506		} else {
507			ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
508			    ip->i_number);
509		}
510		DIP_ADD(ip, blocks, btodb(nsize - osize));
511		if (flags & IO_EXT)
512			ip->i_flag |= IN_CHANGE;
513		else
514			ip->i_flag |= IN_CHANGE | IN_UPDATE;
515		if (bpp != NULL) {
516			bp->b_blkno = FFS_FSBTODB(fs, bno);
517			allocbuf(bp, nsize, 1);
518			memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
519			mutex_enter(bp->b_objlock);
520			KASSERT(!cv_has_waiters(&bp->b_done));
521			bp->b_oflags |= BO_DONE;
522			mutex_exit(bp->b_objlock);
523			*bpp = bp;
524		}
525		if (blknop != NULL) {
526			*blknop = bno;
527		}
528		return (0);
529	}
530	mutex_exit(&ump->um_lock);
531
532#if defined(QUOTA) || defined(QUOTA2)
533	/*
534	 * Restore user's disk quota because allocation failed.
535	 */
536	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
537#endif
538	if (bpp != NULL) {
539		brelse(bp, 0);
540	}
541
542nospace:
543	/*
544	 * no space available
545	 */
546	ffs_fserr(fs, cred, "file system full");
547	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
548	return (ENOSPC);
549}
550
551/*
552 * Allocate an inode in the file system.
553 *
554 * If allocating a directory, use ffs_dirpref to select the inode.
555 * If allocating in a directory, the following hierarchy is followed:
556 *   1) allocate the preferred inode.
557 *   2) allocate an inode in the same cylinder group.
 *   3) quadratically rehash into other cylinder groups, until an
559 *      available inode is located.
560 * If no inode preference is given the following hierarchy is used
561 * to allocate an inode:
562 *   1) allocate an inode in cylinder group 0.
 *   2) quadratically rehash into other cylinder groups, until an
564 *      available inode is located.
565 *
566 * => um_lock not held upon entry or return
567 */
568int
569ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop)
570{
571	struct ufsmount *ump;
572	struct inode *pip;
573	struct fs *fs;
574	ino_t ino, ipref;
575	u_int cg;
576	int error;
577
578	UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
579
580	pip = VTOI(pvp);
581	fs = pip->i_fs;
582	ump = pip->i_ump;
583
584	error = UFS_WAPBL_BEGIN(pvp->v_mount);
585	if (error) {
586		return error;
587	}
588	mutex_enter(&ump->um_lock);
589	if (fs->fs_cstotal.cs_nifree == 0)
590		goto noinodes;
591
592	if ((mode & IFMT) == IFDIR)
593		ipref = ffs_dirpref(pip);
594	else
595		ipref = pip->i_number;
596	if (ipref >= fs->fs_ncg * fs->fs_ipg)
597		ipref = 0;
598	cg = ino_to_cg(fs, ipref);
599	/*
	 * Track the number of dirs created one after another
	 * in the same cg without intervening files.
602	 */
603	if ((mode & IFMT) == IFDIR) {
604		if (fs->fs_contigdirs[cg] < 255)
605			fs->fs_contigdirs[cg]++;
606	} else {
607		if (fs->fs_contigdirs[cg] > 0)
608			fs->fs_contigdirs[cg]--;
609	}
610	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg);
611	if (ino == 0)
612		goto noinodes;
613	UFS_WAPBL_END(pvp->v_mount);
614	*inop = ino;
615	return 0;
616
617noinodes:
618	mutex_exit(&ump->um_lock);
619	UFS_WAPBL_END(pvp->v_mount);
620	ffs_fserr(fs, cred, "out of inodes");
621	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
622	return ENOSPC;
623}
624
625/*
626 * Find a cylinder group in which to place a directory.
627 *
628 * The policy implemented by this algorithm is to allocate a
629 * directory inode in the same cylinder group as its parent
 * directory, but also to reserve space for its files' inodes
631 * and data. Restrict the number of directories which may be
632 * allocated one after another in the same cylinder group
633 * without intervening allocation of files.
634 *
635 * If we allocate a first level directory then force allocation
636 * in another cylinder group.
637 */
638static ino_t
639ffs_dirpref(struct inode *pip)
640{
641	register struct fs *fs;
642	u_int cg, prefcg;
643	uint64_t dirsize, cgsize, curdsz;
644	u_int avgifree, avgbfree, avgndir;
645	u_int minifree, minbfree, maxndir;
646	u_int mincg, minndir;
647	u_int maxcontigdirs;
648
649	KASSERT(mutex_owned(&pip->i_ump->um_lock));
650
651	fs = pip->i_fs;
652
653	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
654	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
655	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
656
657	/*
658	 * Force allocation in another cg if creating a first level dir.
659	 */
660	if (ITOV(pip)->v_vflag & VV_ROOT) {
661		prefcg = cprng_fast32() % fs->fs_ncg;
662		mincg = prefcg;
663		minndir = fs->fs_ipg;
664		for (cg = prefcg; cg < fs->fs_ncg; cg++)
665			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
666			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
667			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
668				mincg = cg;
669				minndir = fs->fs_cs(fs, cg).cs_ndir;
670			}
671		for (cg = 0; cg < prefcg; cg++)
672			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
673			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
674			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
675				mincg = cg;
676				minndir = fs->fs_cs(fs, cg).cs_ndir;
677			}
678		return ((ino_t)(fs->fs_ipg * mincg));
679	}
680
681	/*
	 * Compute various limits used for
	 * optimal allocation of a directory inode.
684	 * Try cylinder groups with >75% avgifree and avgbfree.
685	 * Avoid cylinder groups with no free blocks or inodes as that
686	 * triggers an I/O-expensive cylinder group scan.
687	 */
688	maxndir = uimin(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
689	minifree = avgifree - avgifree / 4;
690	if (minifree < 1)
691		minifree = 1;
692	minbfree = avgbfree - avgbfree / 4;
693	if (minbfree < 1)
694		minbfree = 1;
695	cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
696	dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
697	if (avgndir != 0) {
698		curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
699		if (dirsize < curdsz)
700			dirsize = curdsz;
701	}
702	if (cgsize < dirsize * 255)
703		maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
704	else
705		maxcontigdirs = 255;
706	if (fs->fs_avgfpdir > 0)
707		maxcontigdirs = uimin(maxcontigdirs,
708				    fs->fs_ipg / fs->fs_avgfpdir);
709	if (maxcontigdirs == 0)
710		maxcontigdirs = 1;
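	/*
	 * maxcontigdirs now estimates how many directories, each with
	 * roughly "dirsize" bytes of data, the free blocks of an average
	 * cylinder group can still accommodate; it is compared against
	 * fs_contigdirs[cg] below.
	 */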
711
712	/*
713	 * Limit number of dirs in one cg and reserve space for
714	 * regular files, but only if we have no deficit in
715	 * inodes or space.
716	 */
717	prefcg = ino_to_cg(fs, pip->i_number);
718	for (cg = prefcg; cg < fs->fs_ncg; cg++)
719		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
720		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
721	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
722			if (fs->fs_contigdirs[cg] < maxcontigdirs)
723				return ((ino_t)(fs->fs_ipg * cg));
724		}
725	for (cg = 0; cg < prefcg; cg++)
726		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
727		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
728	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
729			if (fs->fs_contigdirs[cg] < maxcontigdirs)
730				return ((ino_t)(fs->fs_ipg * cg));
731		}
732	/*
733	 * This is a backstop when we are deficient in space.
734	 */
735	for (cg = prefcg; cg < fs->fs_ncg; cg++)
736		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
737			return ((ino_t)(fs->fs_ipg * cg));
738	for (cg = 0; cg < prefcg; cg++)
739		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
740			break;
741	return ((ino_t)(fs->fs_ipg * cg));
742}
743
744/*
745 * Select the desired position for the next block in a file.  The file is
746 * logically divided into sections. The first section is composed of the
747 * direct blocks. Each additional section contains fs_maxbpg blocks.
748 *
749 * If no blocks have been allocated in the first section, the policy is to
750 * request a block in the same cylinder group as the inode that describes
751 * the file. If no blocks have been allocated in any other section, the
752 * policy is to place the section in a cylinder group with a greater than
753 * average number of free blocks.  An appropriate cylinder group is found
754 * by using a rotor that sweeps the cylinder groups. When a new group of
755 * blocks is needed, the sweep begins in the cylinder group following the
756 * cylinder group from which the previous allocation was made. The sweep
757 * continues until a cylinder group with greater than the average number
758 * of free blocks is found. If the allocation is for the first block in an
759 * indirect block, the information on the previous allocation is unavailable;
760 * here a best guess is made based upon the logical block number being
761 * allocated.
762 *
763 * If a section is already partially allocated, the policy is to
764 * contiguously allocate fs_maxcontig blocks.  The end of one of these
765 * contiguous blocks and the beginning of the next is laid out
766 * contiguously if possible.
767 *
768 * => um_lock held on entry and exit
769 */
770daddr_t
771ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
772    int32_t *bap /* XXX ondisk32 */)
773{
774	struct fs *fs;
775	u_int cg;
776	u_int avgbfree, startcg;
777
778	KASSERT(mutex_owned(&ip->i_ump->um_lock));
779
780	fs = ip->i_fs;
781
782	/*
783	 * If allocating a contiguous file with B_CONTIG, use the hints
784	 * in the inode extensions to return the desired block.
785	 *
786	 * For metadata (indirect blocks) return the address of where
787	 * the first indirect block resides - we'll scan for the next
788	 * available slot if we need to allocate more than one indirect
789	 * block.  For data, return the address of the actual block
790	 * relative to the address of the first data block.
791	 */
792	if (flags & B_CONTIG) {
793		KASSERT(ip->i_ffs_first_data_blk != 0);
794		KASSERT(ip->i_ffs_first_indir_blk != 0);
795		if (flags & B_METAONLY)
796			return ip->i_ffs_first_indir_blk;
797		else
798			return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
799	}
800
801	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
802		if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
803			cg = ino_to_cg(fs, ip->i_number);
804			return (cgbase(fs, cg) + fs->fs_frag);
805		}
806		/*
		 * Find a cylinder group with greater than average number of
808		 * unused data blocks.
809		 */
810		if (indx == 0 || bap[indx - 1] == 0)
811			startcg =
812			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
813		else
814			startcg = dtog(fs,
815				ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
816		startcg %= fs->fs_ncg;
817		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
818		for (cg = startcg; cg < fs->fs_ncg; cg++)
819			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
820				return (cgbase(fs, cg) + fs->fs_frag);
821			}
822		for (cg = 0; cg < startcg; cg++)
823			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
824				return (cgbase(fs, cg) + fs->fs_frag);
825			}
826		return (0);
827	}
828	/*
829	 * We just always try to lay things out contiguously.
830	 */
831	return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
832}
833
834daddr_t
835ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
836    int64_t *bap)
837{
838	struct fs *fs;
839	u_int cg;
840	u_int avgbfree, startcg;
841
842	KASSERT(mutex_owned(&ip->i_ump->um_lock));
843
844	fs = ip->i_fs;
845
846	/*
847	 * If allocating a contiguous file with B_CONTIG, use the hints
848	 * in the inode extensions to return the desired block.
849	 *
850	 * For metadata (indirect blocks) return the address of where
851	 * the first indirect block resides - we'll scan for the next
852	 * available slot if we need to allocate more than one indirect
853	 * block.  For data, return the address of the actual block
854	 * relative to the address of the first data block.
855	 */
856	if (flags & B_CONTIG) {
857		KASSERT(ip->i_ffs_first_data_blk != 0);
858		KASSERT(ip->i_ffs_first_indir_blk != 0);
859		if (flags & B_METAONLY)
860			return ip->i_ffs_first_indir_blk;
861		else
862			return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
863	}
864
865	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
866		if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
867			cg = ino_to_cg(fs, ip->i_number);
868			return (cgbase(fs, cg) + fs->fs_frag);
869		}
870		/*
		 * Find a cylinder group with greater than average number of
872		 * unused data blocks.
873		 */
874		if (indx == 0 || bap[indx - 1] == 0)
875			startcg =
876			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
877		else
878			startcg = dtog(fs,
879				ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
880		startcg %= fs->fs_ncg;
881		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
882		for (cg = startcg; cg < fs->fs_ncg; cg++)
883			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
884				return (cgbase(fs, cg) + fs->fs_frag);
885			}
886		for (cg = 0; cg < startcg; cg++)
887			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
888				return (cgbase(fs, cg) + fs->fs_frag);
889			}
890		return (0);
891	}
892	/*
893	 * We just always try to lay things out contiguously.
894	 */
895	return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
896}
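/*
 * ffs_blkpref_ufs1() and ffs_blkpref_ufs2() implement the same policy and
 * differ only in the width (and byte-swapping) of the block pointers in
 * the bap array they examine.
 */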
897
898
899/*
900 * Implement the cylinder overflow algorithm.
901 *
902 * The policy implemented by this algorithm is:
903 *   1) allocate the block in its requested cylinder group.
 *   2) quadratically rehash on the cylinder group number.
905 *   3) brute force search for a free block.
906 *
907 * => called with um_lock held
908 * => returns with um_lock released on success, held on failure
909 *    (*allocator releases lock on success, retains lock on failure)
910 */
911/*VARARGS5*/
912static daddr_t
913ffs_hashalloc(struct inode *ip, u_int cg, daddr_t pref,
914    int size /* size for data blocks, mode for inodes */,
915    int realsize,
916    int flags,
917    daddr_t (*allocator)(struct inode *, u_int, daddr_t, int, int, int))
918{
919	struct fs *fs;
920	daddr_t result;
921	u_int i, icg = cg;
922
923	fs = ip->i_fs;
924	/*
925	 * 1: preferred cylinder group
926	 */
927	result = (*allocator)(ip, cg, pref, size, realsize, flags);
928	if (result)
929		return (result);
930
931	if (flags & B_CONTIG)
932		return (result);
933	/*
934	 * 2: quadratic rehash
935	 */
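	/*
	 * The increment i doubles on each pass, so starting from the
	 * preferred group icg the groups probed are icg + 1, icg + 3,
	 * icg + 7, ... (mod fs_ncg).
	 */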
936	for (i = 1; i < fs->fs_ncg; i *= 2) {
937		cg += i;
938		if (cg >= fs->fs_ncg)
939			cg -= fs->fs_ncg;
940		result = (*allocator)(ip, cg, 0, size, realsize, flags);
941		if (result)
942			return (result);
943	}
944	/*
945	 * 3: brute force search
946	 * Note that we start at i == 2, since 0 was checked initially,
947	 * and 1 is always checked in the quadratic rehash.
948	 */
949	cg = (icg + 2) % fs->fs_ncg;
950	for (i = 2; i < fs->fs_ncg; i++) {
951		result = (*allocator)(ip, cg, 0, size, realsize, flags);
952		if (result)
953			return (result);
954		cg++;
955		if (cg == fs->fs_ncg)
956			cg = 0;
957	}
958	return (0);
959}
960
961/*
962 * Determine whether a fragment can be extended.
963 *
964 * Check to see if the necessary fragments are available, and
965 * if they are, allocate them.
966 *
967 * => called with um_lock held
968 * => returns with um_lock released on success, held on failure
969 */
970static daddr_t
971ffs_fragextend(struct inode *ip, u_int cg, daddr_t bprev, int osize, int nsize)
972{
973	struct ufsmount *ump;
974	struct fs *fs;
975	struct cg *cgp;
976	struct buf *bp;
977	daddr_t bno;
978	int frags, bbase;
979	int i, error;
980	u_int8_t *blksfree;
981
982	fs = ip->i_fs;
983	ump = ip->i_ump;
984
985	KASSERT(mutex_owned(&ump->um_lock));
986
987	if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
988		return (0);
989	frags = ffs_numfrags(fs, nsize);
990	bbase = ffs_fragnum(fs, bprev);
991	if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) {
992		/* cannot extend across a block boundary */
993		return (0);
994	}
995	mutex_exit(&ump->um_lock);
996	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
997		(int)fs->fs_cgsize, B_MODIFY, &bp);
998	if (error)
999		goto fail;
1000	cgp = (struct cg *)bp->b_data;
1001	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
1002		goto fail;
1003	cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
1004	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1005	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
1006		cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
1007	bno = dtogd(fs, bprev);
1008	blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
1009	for (i = ffs_numfrags(fs, osize); i < frags; i++)
1010		if (isclr(blksfree, bno + i))
1011			goto fail;
1012	/*
1013	 * the current fragment can be extended
1014	 * deduct the count on fragment being extended into
1015	 * increase the count on the remaining fragment (if any)
1016	 * allocate the extended piece
1017	 */
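	/*
	 * cg_frsum[n] counts the free runs of exactly n fragments in this
	 * cylinder group; the run that follows the old fragment is moved
	 * from its old length bucket to the (shorter) one left over after
	 * the extension, if any fragments remain free.
	 */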
1018	for (i = frags; i < fs->fs_frag - bbase; i++)
1019		if (isclr(blksfree, bno + i))
1020			break;
1021	ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
1022	if (i != frags)
1023		ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
1024	mutex_enter(&ump->um_lock);
1025	for (i = ffs_numfrags(fs, osize); i < frags; i++) {
1026		clrbit(blksfree, bno + i);
1027		ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
1028		fs->fs_cstotal.cs_nffree--;
1029		fs->fs_cs(fs, cg).cs_nffree--;
1030	}
1031	fs->fs_fmod = 1;
1032	ACTIVECG_CLR(fs, cg);
1033	mutex_exit(&ump->um_lock);
1034	bdwrite(bp);
1035	return (bprev);
1036
1037 fail:
1038 	if (bp != NULL)
1039		brelse(bp, 0);
1040 	mutex_enter(&ump->um_lock);
1041 	return (0);
1042}
1043
1044/*
1045 * Determine whether a block can be allocated.
1046 *
1047 * Check to see if a block of the appropriate size is available,
1048 * and if it is, allocate it.
1049 */
1050static daddr_t
1051ffs_alloccg(struct inode *ip, u_int cg, daddr_t bpref, int size, int realsize,
1052    int flags)
1053{
1054	struct ufsmount *ump;
1055	struct fs *fs = ip->i_fs;
1056	struct cg *cgp;
1057	struct buf *bp;
1058	int32_t bno;
1059	daddr_t blkno;
1060	int error, frags, allocsiz, i;
1061	u_int8_t *blksfree;
1062	const int needswap = UFS_FSNEEDSWAP(fs);
1063
1064	ump = ip->i_ump;
1065
1066	KASSERT(mutex_owned(&ump->um_lock));
1067
1068	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
1069		return (0);
1070	mutex_exit(&ump->um_lock);
1071	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1072		(int)fs->fs_cgsize, B_MODIFY, &bp);
1073	if (error)
1074		goto fail;
1075	cgp = (struct cg *)bp->b_data;
1076	if (!cg_chkmagic(cgp, needswap) ||
1077	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
1078		goto fail;
1079	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1080	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1081	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
1082		cgp->cg_time = ufs_rw64(time_second, needswap);
1083	if (size == fs->fs_bsize) {
1084		mutex_enter(&ump->um_lock);
1085		blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
1086		ACTIVECG_CLR(fs, cg);
1087		mutex_exit(&ump->um_lock);
1088
1089		/*
		 * If the actually needed size is lower, free the extra blocks now.
1091		 * This is safe to call here, there is no outside reference
1092		 * to this block yet. It is not necessary to keep um_lock
1093		 * locked.
1094		 */
1095		if (realsize != 0 && realsize < size) {
1096			ffs_blkfree_common(ip->i_ump, ip->i_fs,
1097			    ip->i_devvp->v_rdev,
1098			    bp, blkno + ffs_numfrags(fs, realsize),
1099			    (long)(size - realsize), false);
1100		}
1101
1102		bdwrite(bp);
1103		return (blkno);
1104	}
1105	/*
1106	 * check to see if any fragments are already available
1107	 * allocsiz is the size which will be allocated, hacking
1108	 * it down to a smaller size if necessary
1109	 */
1110	blksfree = cg_blksfree(cgp, needswap);
1111	frags = ffs_numfrags(fs, size);
1112	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
1113		if (cgp->cg_frsum[allocsiz] != 0)
1114			break;
1115	if (allocsiz == fs->fs_frag) {
1116		/*
1117		 * no fragments were available, so a block will be
1118		 * allocated, and hacked up
1119		 */
1120		if (cgp->cg_cs.cs_nbfree == 0)
1121			goto fail;
1122		mutex_enter(&ump->um_lock);
1123		blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
1124		bno = dtogd(fs, blkno);
1125		for (i = frags; i < fs->fs_frag; i++)
1126			setbit(blksfree, bno + i);
1127		i = fs->fs_frag - frags;
1128		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1129		fs->fs_cstotal.cs_nffree += i;
1130		fs->fs_cs(fs, cg).cs_nffree += i;
1131		fs->fs_fmod = 1;
1132		ufs_add32(cgp->cg_frsum[i], 1, needswap);
1133		ACTIVECG_CLR(fs, cg);
1134		mutex_exit(&ump->um_lock);
1135		bdwrite(bp);
1136		return (blkno);
1137	}
1138	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
1139#if 0
1140	/*
1141	 * XXX fvdl mapsearch will panic, and never return -1
1142	 *          also: returning NULL as daddr_t ?
1143	 */
1144	if (bno < 0)
1145		goto fail;
1146#endif
1147	for (i = 0; i < frags; i++)
1148		clrbit(blksfree, bno + i);
1149	mutex_enter(&ump->um_lock);
1150	ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
1151	fs->fs_cstotal.cs_nffree -= frags;
1152	fs->fs_cs(fs, cg).cs_nffree -= frags;
1153	fs->fs_fmod = 1;
1154	ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
1155	if (frags != allocsiz)
1156		ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
1157	blkno = cgbase(fs, cg) + bno;
1158	ACTIVECG_CLR(fs, cg);
1159	mutex_exit(&ump->um_lock);
1160	bdwrite(bp);
1161	return blkno;
1162
1163 fail:
1164 	if (bp != NULL)
1165		brelse(bp, 0);
1166 	mutex_enter(&ump->um_lock);
1167 	return (0);
1168}
1169
1170/*
1171 * Allocate a block in a cylinder group.
1172 *
1173 * This algorithm implements the following policy:
1174 *   1) allocate the requested block.
1175 *   2) allocate a rotationally optimal block in the same cylinder.
1176 *   3) allocate the next available block on the block rotor for the
1177 *      specified cylinder group.
1178 * Note that this routine only allocates fs_bsize blocks; these
1179 * blocks may be fragmented by the routine that allocates them.
1180 */
1181static daddr_t
1182ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize,
1183    int flags)
1184{
1185	struct fs *fs = ip->i_fs;
1186	struct cg *cgp;
1187	int cg;
1188	daddr_t blkno;
1189	int32_t bno;
1190	u_int8_t *blksfree;
1191	const int needswap = UFS_FSNEEDSWAP(fs);
1192
1193	KASSERT(mutex_owned(&ip->i_ump->um_lock));
1194
1195	cgp = (struct cg *)bp->b_data;
1196	blksfree = cg_blksfree(cgp, needswap);
1197	if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
1198		bpref = ufs_rw32(cgp->cg_rotor, needswap);
1199	} else {
1200		bpref = ffs_blknum(fs, bpref);
1201		bno = dtogd(fs, bpref);
1202		/*
1203		 * if the requested block is available, use it
1204		 */
1205		if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
1206			goto gotit;
1207		/*
1208		 * if the requested data block isn't available and we are
1209		 * trying to allocate a contiguous file, return an error.
1210		 */
1211		if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
1212			return (0);
1213	}
1214
1215	/*
1216	 * Take the next available block in this cylinder group.
1217	 */
1218	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
1219#if 0
1220	/*
1221	 * XXX jdolecek ffs_mapsearch() succeeds or panics
1222	 */
1223	if (bno < 0)
1224		return (0);
1225#endif
1226	cgp->cg_rotor = ufs_rw32(bno, needswap);
1227gotit:
1228	blkno = ffs_fragstoblks(fs, bno);
1229	ffs_clrblock(fs, blksfree, blkno);
1230	ffs_clusteracct(fs, cgp, blkno, -1);
1231	ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
1232	fs->fs_cstotal.cs_nbfree--;
1233	fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
1234	if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1235	    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1236		int cylno;
1237		cylno = old_cbtocylno(fs, bno);
1238		KASSERT(cylno >= 0);
1239		KASSERT(cylno < fs->fs_old_ncyl);
1240		KASSERT(old_cbtorpos(fs, bno) >= 0);
1241		KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
1242		ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
1243		    needswap);
1244		ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
1245	}
1246	fs->fs_fmod = 1;
1247	cg = ufs_rw32(cgp->cg_cgx, needswap);
1248	blkno = cgbase(fs, cg) + bno;
1249	return (blkno);
1250}
1251
1252/*
1253 * Determine whether an inode can be allocated.
1254 *
1255 * Check to see if an inode is available, and if it is,
1256 * allocate it using the following policy:
1257 *   1) allocate the requested inode.
1258 *   2) allocate the next available inode after the requested
1259 *      inode in the specified cylinder group.
1260 */
1261static daddr_t
1262ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode, int realsize,
1263    int flags)
1264{
1265	struct ufsmount *ump = ip->i_ump;
1266	struct fs *fs = ip->i_fs;
1267	struct cg *cgp;
1268	struct buf *bp, *ibp;
1269	u_int8_t *inosused;
1270	int error, start, len, loc, map, i;
1271	int32_t initediblk, maxiblk, irotor;
1272	daddr_t nalloc;
1273	struct ufs2_dinode *dp2;
1274	const int needswap = UFS_FSNEEDSWAP(fs);
1275
1276	KASSERT(mutex_owned(&ump->um_lock));
1277	UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
1278
1279	if (fs->fs_cs(fs, cg).cs_nifree == 0)
1280		return (0);
1281	mutex_exit(&ump->um_lock);
1282	ibp = NULL;
1283	if (fs->fs_magic == FS_UFS2_MAGIC) {
1284		initediblk = -1;
1285	} else {
1286		initediblk = fs->fs_ipg;
1287	}
1288	maxiblk = initediblk;
1289
1290retry:
1291	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1292		(int)fs->fs_cgsize, B_MODIFY, &bp);
1293	if (error)
1294		goto fail;
1295	cgp = (struct cg *)bp->b_data;
1296	if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
1297		goto fail;
1298
1299	if (ibp != NULL &&
1300	    initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
1301		/* Another thread allocated more inodes so we retry the test. */
1302		brelse(ibp, 0);
1303		ibp = NULL;
1304	}
1305	/*
1306	 * Check to see if we need to initialize more inodes.
1307	 */
1308	if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
1309	        initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
1310		maxiblk = initediblk;
1311		nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
1312		if (nalloc + FFS_INOPB(fs) > initediblk &&
1313		    initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
1314			/*
			 * We have to release the cg buffer here to prevent
			 * a deadlock when reading the inode block runs a
			 * copy-on-write that might use this cg.
1318			 */
1319			brelse(bp, 0);
1320			bp = NULL;
1321			error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
1322			    ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
1323			    FFS_NOBLK, fs->fs_bsize, false, &ibp);
1324			if (error)
1325				goto fail;
1326
1327			maxiblk += FFS_INOPB(fs);
1328
1329			goto retry;
1330		}
1331	}
1332
1333	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1334	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1335	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
1336		cgp->cg_time = ufs_rw64(time_second, needswap);
1337	inosused = cg_inosused(cgp, needswap);
1338
1339	if (ipref) {
1340		ipref %= fs->fs_ipg;
1341		/* safeguard to stay in (to be) allocated range */
1342		if (ipref < maxiblk && isclr(inosused, ipref))
1343			goto gotit;
1344	}
1345
1346	irotor = ufs_rw32(cgp->cg_irotor, needswap);
1347
1348	KASSERTMSG(irotor < initediblk, "%s: allocation botch: cg=%d, irotor %d"
1349		   " out of bounds, initediblk=%d",
1350		   __func__, cg, irotor, initediblk);
1351
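	/*
	 * Scan the inode-used bitmap for a byte with at least one clear
	 * bit.  skpc() skips leading 0xff bytes and returns the number of
	 * bytes left in the range, so the first candidate byte is
	 * inosused[start + len - loc]; a return of 0 means every byte in
	 * the range was fully allocated.
	 */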
1352	start = irotor / NBBY;
1353	len = howmany(maxiblk - irotor, NBBY);
1354	loc = skpc(0xff, len, &inosused[start]);
1355	if (loc == 0) {
1356		len = start + 1;
1357		start = 0;
1358		loc = skpc(0xff, len, &inosused[0]);
1359		if (loc == 0) {
1360			panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s",
1361			    __func__, cg, ufs_rw32(cgp->cg_irotor, needswap),
1362			    fs->fs_fsmnt);
1363			/* NOTREACHED */
1364		}
1365	}
1366	i = start + len - loc;
1367	map = inosused[i] ^ 0xff;
1368	if (map == 0) {
1369		panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt);
1370	}
1371
1372	ipref = i * NBBY + ffs(map) - 1;
1373
1374	cgp->cg_irotor = ufs_rw32(ipref, needswap);
1375
1376gotit:
1377	KASSERTMSG(ipref < maxiblk, "%s: allocation botch: cg=%d attempt to "
1378		   "allocate inode index %d beyond max allocated index %d"
1379		   " of %d inodes/cg",
1380		   __func__, cg, (int)ipref, maxiblk, cgp->cg_niblk);
1381
1382	UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
1383	    mode);
1384	/*
1385	 * Check to see if we need to initialize more inodes.
1386	 */
1387	if (ibp != NULL) {
1388		KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
1389		memset(ibp->b_data, 0, fs->fs_bsize);
1390		dp2 = (struct ufs2_dinode *)(ibp->b_data);
1391		for (i = 0; i < FFS_INOPB(fs); i++) {
1392			/*
1393			 * Don't bother to swap, it's supposed to be
1394			 * random, after all.
1395			 */
1396			dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
1397			dp2++;
1398		}
1399		initediblk += FFS_INOPB(fs);
1400		cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
1401	}
1402
1403	mutex_enter(&ump->um_lock);
1404	ACTIVECG_CLR(fs, cg);
1405	setbit(inosused, ipref);
1406	ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
1407	fs->fs_cstotal.cs_nifree--;
1408	fs->fs_cs(fs, cg).cs_nifree--;
1409	fs->fs_fmod = 1;
1410	if ((mode & IFMT) == IFDIR) {
1411		ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
1412		fs->fs_cstotal.cs_ndir++;
1413		fs->fs_cs(fs, cg).cs_ndir++;
1414	}
1415	mutex_exit(&ump->um_lock);
1416	if (ibp != NULL) {
1417		bwrite(ibp);
1418		bwrite(bp);
1419	} else
1420		bdwrite(bp);
1421	return ((ino_t)(cg * fs->fs_ipg + ipref));
1422 fail:
1423	if (bp != NULL)
1424		brelse(bp, 0);
1425	if (ibp != NULL)
1426		brelse(ibp, 0);
1427	mutex_enter(&ump->um_lock);
1428	return (0);
1429}
1430
1431/*
1432 * Allocate a block or fragment.
1433 *
1434 * The specified block or fragment is removed from the
1435 * free map, possibly fragmenting a block in the process.
1436 *
 * This implementation should mirror ffs_blkfree()
1438 *
1439 * => um_lock not held on entry or exit
1440 */
1441int
1442ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
1443{
1444	int error;
1445
1446	error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
1447	    ip->i_dev, ip->i_uid);
1448	if (error)
1449		return error;
1450
1451	return ffs_blkalloc_ump(ip->i_ump, bno, size);
1452}
1453
1454int
1455ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
1456{
1457	struct fs *fs = ump->um_fs;
1458	struct cg *cgp;
1459	struct buf *bp;
1460	int32_t fragno, cgbno;
1461	int i, error, blk, frags, bbase;
1462	u_int cg;
1463	u_int8_t *blksfree;
1464	const int needswap = UFS_FSNEEDSWAP(fs);
1465
1466	KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 &&
1467	    ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
1468	KASSERT(bno < fs->fs_size);
1469
1470	cg = dtog(fs, bno);
1471	error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1472		(int)fs->fs_cgsize, B_MODIFY, &bp);
1473	if (error) {
1474		return error;
1475	}
1476	cgp = (struct cg *)bp->b_data;
1477	if (!cg_chkmagic(cgp, needswap)) {
1478		brelse(bp, 0);
1479		return EIO;
1480	}
1481	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1482	cgp->cg_time = ufs_rw64(time_second, needswap);
1483	cgbno = dtogd(fs, bno);
1484	blksfree = cg_blksfree(cgp, needswap);
1485
1486	mutex_enter(&ump->um_lock);
1487	if (size == fs->fs_bsize) {
1488		fragno = ffs_fragstoblks(fs, cgbno);
1489		if (!ffs_isblock(fs, blksfree, fragno)) {
1490			mutex_exit(&ump->um_lock);
1491			brelse(bp, 0);
1492			return EBUSY;
1493		}
1494		ffs_clrblock(fs, blksfree, fragno);
1495		ffs_clusteracct(fs, cgp, fragno, -1);
1496		ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
1497		fs->fs_cstotal.cs_nbfree--;
1498		fs->fs_cs(fs, cg).cs_nbfree--;
1499	} else {
1500		bbase = cgbno - ffs_fragnum(fs, cgbno);
1501
1502		frags = ffs_numfrags(fs, size);
1503		for (i = 0; i < frags; i++) {
1504			if (isclr(blksfree, cgbno + i)) {
1505				mutex_exit(&ump->um_lock);
1506				brelse(bp, 0);
1507				return EBUSY;
1508			}
1509		}
1510		/*
1511		 * if a complete block is being split, account for it
1512		 */
1513		fragno = ffs_fragstoblks(fs, bbase);
1514		if (ffs_isblock(fs, blksfree, fragno)) {
1515			ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
1516			fs->fs_cstotal.cs_nffree += fs->fs_frag;
1517			fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
1518			ffs_clusteracct(fs, cgp, fragno, -1);
1519			ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
1520			fs->fs_cstotal.cs_nbfree--;
1521			fs->fs_cs(fs, cg).cs_nbfree--;
1522		}
1523		/*
1524		 * decrement the counts associated with the old frags
1525		 */
1526		blk = blkmap(fs, blksfree, bbase);
1527		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
1528		/*
1529		 * allocate the fragment
1530		 */
1531		for (i = 0; i < frags; i++) {
1532			clrbit(blksfree, cgbno + i);
1533		}
1534		ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
1535		fs->fs_cstotal.cs_nffree -= i;
1536		fs->fs_cs(fs, cg).cs_nffree -= i;
1537		/*
1538		 * add back in counts associated with the new frags
1539		 */
1540		blk = blkmap(fs, blksfree, bbase);
1541		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
1542	}
1543	fs->fs_fmod = 1;
1544	ACTIVECG_CLR(fs, cg);
1545	mutex_exit(&ump->um_lock);
1546	bdwrite(bp);
1547	return 0;
1548}
1549
1550/*
1551 * Free a block or fragment.
1552 *
1553 * The specified block or fragment is placed back in the
1554 * free map. If a fragment is deallocated, a possible
1555 * block reassembly is checked.
1556 *
1557 * => um_lock not held on entry or exit
1558 */
1559static void
1560ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
1561{
1562	struct cg *cgp;
1563	struct buf *bp;
1564	struct ufsmount *ump;
1565	daddr_t cgblkno;
1566	int error;
1567	u_int cg;
1568	dev_t dev;
1569	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
1570	const int needswap = UFS_FSNEEDSWAP(fs);
1571
1572	KASSERT(!devvp_is_snapshot);
1573
1574	cg = dtog(fs, bno);
1575	dev = devvp->v_rdev;
1576	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
1577	KASSERT(fs == ump->um_fs);
1578	cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));
1579
1580	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
1581	    B_MODIFY, &bp);
1582	if (error) {
1583		return;
1584	}
1585	cgp = (struct cg *)bp->b_data;
1586	if (!cg_chkmagic(cgp, needswap)) {
1587		brelse(bp, 0);
1588		return;
1589	}
1590
1591	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
1592
1593	bdwrite(bp);
1594}
1595
1596struct discardopdata {
1597	struct work wk; /* must be first */
1598	struct vnode *devvp;
1599	daddr_t bno;
1600	long size;
1601};
1602
1603struct discarddata {
1604	struct fs *fs;
1605	struct discardopdata *entry;
1606	long maxsize;
1607	kmutex_t entrylk;
1608	struct workqueue *wq;
1609	int wqcnt, wqdraining;
1610	kmutex_t wqlk;
1611	kcondvar_t wqcv;
1612	/* timer for flush? */
1613};
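/*
 * Discard (TRIM) handling: rather than telling the device about every
 * individual free, ffs_blkfree() coalesces adjacent frees into the single
 * pending "entry" above (prepend only, since ffs deallocates backwards).
 * Once the entry reaches maxsize, or a non-adjacent free arrives, it is
 * handed to the workqueue; the callback issues VOP_FDISCARD() and only
 * then returns the blocks to the cylinder group maps.
 */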
1614
1615static void
1616ffs_blkfree_td(struct fs *fs, struct discardopdata *td)
1617{
1618	struct mount *mp = spec_node_getmountedfs(td->devvp);
1619	long todo;
1620	int error;
1621
1622	while (td->size) {
1623		todo = uimin(td->size,
1624		  ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno))));
1625		error = UFS_WAPBL_BEGIN(mp);
1626		if (error) {
1627			printf("ffs: failed to begin wapbl transaction"
1628			    " for discard: %d\n", error);
1629			break;
1630		}
1631		ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
1632		UFS_WAPBL_END(mp);
1633		td->bno += ffs_numfrags(fs, todo);
1634		td->size -= todo;
1635	}
1636}
1637
1638static void
1639ffs_discardcb(struct work *wk, void *arg)
1640{
1641	struct discardopdata *td = (void *)wk;
1642	struct discarddata *ts = arg;
1643	struct fs *fs = ts->fs;
1644	off_t start, len;
1645#ifdef TRIMDEBUG
1646	int error;
1647#endif
1648
1649/* like FSBTODB but emits bytes; XXX move to fs.h */
1650#ifndef FFS_FSBTOBYTES
1651#define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift)
1652#endif
1653
1654	start = FFS_FSBTOBYTES(fs, td->bno);
1655	len = td->size;
1656	vn_lock(td->devvp, LK_EXCLUSIVE | LK_RETRY);
1657#ifdef TRIMDEBUG
1658	error =
1659#endif
1660		VOP_FDISCARD(td->devvp, start, len);
1661	VOP_UNLOCK(td->devvp);
1662#ifdef TRIMDEBUG
1663	printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
1664#endif
1665
1666	ffs_blkfree_td(fs, td);
1667	kmem_free(td, sizeof(*td));
1668	mutex_enter(&ts->wqlk);
1669	ts->wqcnt--;
1670	if (ts->wqdraining && !ts->wqcnt)
1671		cv_signal(&ts->wqcv);
1672	mutex_exit(&ts->wqlk);
1673}
1674
1675void *
1676ffs_discard_init(struct vnode *devvp, struct fs *fs)
1677{
1678	struct discarddata *ts;
1679	int error;
1680
1681	ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
1682	error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
1683				 PRI_USER, IPL_NONE, 0);
1684	if (error) {
1685		kmem_free(ts, sizeof (*ts));
1686		return NULL;
1687	}
1688	mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
1689	mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
1690	cv_init(&ts->wqcv, "trimwqcv");
1691	ts->maxsize = 100*1024; /* XXX */
1692	ts->fs = fs;
1693	return ts;
1694}
1695
1696void
1697ffs_discard_finish(void *vts, int flags)
1698{
1699	struct discarddata *ts = vts;
1700	struct discardopdata *td = NULL;
1701
1702	/* wait for workqueue to drain */
1703	mutex_enter(&ts->wqlk);
1704	if (ts->wqcnt) {
1705		ts->wqdraining = 1;
1706		cv_wait(&ts->wqcv, &ts->wqlk);
1707	}
1708	mutex_exit(&ts->wqlk);
1709
1710	mutex_enter(&ts->entrylk);
1711	if (ts->entry) {
1712		td = ts->entry;
1713		ts->entry = NULL;
1714	}
1715	mutex_exit(&ts->entrylk);
1716	if (td) {
		/* XXX don't tell the disk, it's optional */
1718		ffs_blkfree_td(ts->fs, td);
1719#ifdef TRIMDEBUG
1720		printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
1721#endif
1722		kmem_free(td, sizeof(*td));
1723	}
1724
1725	cv_destroy(&ts->wqcv);
1726	mutex_destroy(&ts->entrylk);
1727	mutex_destroy(&ts->wqlk);
1728	workqueue_destroy(ts->wq);
1729	kmem_free(ts, sizeof(*ts));
1730}
1731
1732void
1733ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
1734    ino_t inum)
1735{
1736	struct ufsmount *ump;
1737	int error;
1738	dev_t dev;
1739	struct discarddata *ts;
1740	struct discardopdata *td;
1741
1742	dev = devvp->v_rdev;
1743	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
1744	if (ffs_snapblkfree(fs, devvp, bno, size, inum))
1745		return;
1746
1747	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1748	if (error)
1749		return;
1750
1751	if (!ump->um_discarddata) {
1752		ffs_blkfree_cg(fs, devvp, bno, size);
1753		return;
1754	}
1755
1756#ifdef TRIMDEBUG
1757	printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
1758#endif
1759	ts = ump->um_discarddata;
1760	td = NULL;
1761
1762	mutex_enter(&ts->entrylk);
1763	if (ts->entry) {
1764		td = ts->entry;
1765		/* ffs deallocates backwards, so check for prepend only */
1766		if (td->bno == bno + ffs_numfrags(fs, size)
1767		    && td->size + size <= ts->maxsize) {
1768			td->bno = bno;
1769			td->size += size;
1770			if (td->size < ts->maxsize) {
1771#ifdef TRIMDEBUG
1772				printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1773#endif
1774				mutex_exit(&ts->entrylk);
1775				return;
1776			}
1777			size = 0; /* mark done */
1778		}
1779		ts->entry = NULL;
1780	}
1781	mutex_exit(&ts->entrylk);
1782
1783	if (td) {
1784#ifdef TRIMDEBUG
1785		printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
1786#endif
1787		mutex_enter(&ts->wqlk);
1788		ts->wqcnt++;
1789		mutex_exit(&ts->wqlk);
1790		workqueue_enqueue(ts->wq, &td->wk, NULL);
1791	}
1792	if (!size)
1793		return;
1794
1795	td = kmem_alloc(sizeof(*td), KM_SLEEP);
1796	td->devvp = devvp;
1797	td->bno = bno;
1798	td->size = size;
1799
1800	if (td->size < ts->maxsize) { /* XXX always the case */
1801		mutex_enter(&ts->entrylk);
1802		if (!ts->entry) { /* unless another free raced in meanwhile */
1803#ifdef TRIMDEBUG
1804			printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1805#endif
1806			ts->entry = td;
1807			td = NULL;
1808		}
1809		mutex_exit(&ts->entrylk);
1810	}
1811	if (td) {
1812#ifdef TRIMDEBUG
1813		printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
1814#endif
1815		mutex_enter(&ts->wqlk);
1816		ts->wqcnt++;
1817		mutex_exit(&ts->wqlk);
1818		workqueue_enqueue(ts->wq, &td->wk, NULL);
1819	}
1820}
1821
1822/*
1823 * Free a block or fragment from a snapshot cg copy.
1824 *
1825 * The specified block or fragment is placed back in the
1826 * free map. If a fragment is deallocated, a possible
1827 * block reassembly is checked.
1828 *
1829 * => um_lock not held on entry or exit
1830 */
1831void
1832ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
1833    ino_t inum)
1834{
1835	struct cg *cgp;
1836	struct buf *bp;
1837	struct ufsmount *ump;
1838	daddr_t cgblkno;
1839	int error, cg;
1840	dev_t dev;
1841	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
1842	const int needswap = UFS_FSNEEDSWAP(fs);
1843
1844	KASSERT(devvp_is_snapshot);
1845
1846	cg = dtog(fs, bno);
1847	dev = VTOI(devvp)->i_devvp->v_rdev;
1848	ump = VFSTOUFS(devvp->v_mount);
1849	cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));
1850
1851	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1852	if (error)
1853		return;
1854
1855	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
1856	    B_MODIFY, &bp);
1857	if (error) {
1858		return;
1859	}
1860	cgp = (struct cg *)bp->b_data;
1861	if (!cg_chkmagic(cgp, needswap)) {
1862		brelse(bp, 0);
1863		return;
1864	}
1865
1866	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
1867
1868	bdwrite(bp);
1869}
1870
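/*
 * Mark a block or a run of fragments free in the cylinder group whose
 * buffer is passed in: update the free block/fragment counts, the
 * cluster summaries and, for old-style UFS1, the rotational-position
 * tables, reassembling a full block when its last fragment is freed.
 * The caller is responsible for writing out the cg buffer.
 */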
1871static void
1872ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
1873    struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
1874{
1875	struct cg *cgp;
1876	int32_t fragno, cgbno;
1877	int i, blk, frags, bbase;
1878	u_int cg;
1879	u_int8_t *blksfree;
1880	const int needswap = UFS_FSNEEDSWAP(fs);
1881
1882	cg = dtog(fs, bno);
1883	cgp = (struct cg *)bp->b_data;
1884	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1885	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
1886	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
1887		cgp->cg_time = ufs_rw64(time_second, needswap);
1888	cgbno = dtogd(fs, bno);
1889	blksfree = cg_blksfree(cgp, needswap);
1890	mutex_enter(&ump->um_lock);
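	/*
	 * A full block is freed in one step; fragments are freed
	 * individually and may reassemble into a free block below.
	 */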
1891	if (size == fs->fs_bsize) {
1892		fragno = ffs_fragstoblks(fs, cgbno);
1893		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
1894			if (devvp_is_snapshot) {
1895				mutex_exit(&ump->um_lock);
1896				return;
1897			}
1898			panic("%s: freeing free block: dev = 0x%llx, block = %"
1899			    PRId64 ", fs = %s", __func__,
1900			    (unsigned long long)dev, bno, fs->fs_fsmnt);
1901		}
1902		ffs_setblock(fs, blksfree, fragno);
1903		ffs_clusteracct(fs, cgp, fragno, 1);
1904		ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1905		fs->fs_cstotal.cs_nbfree++;
1906		fs->fs_cs(fs, cg).cs_nbfree++;
1907		if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1908		    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1909			i = old_cbtocylno(fs, cgbno);
1910			KASSERT(i >= 0);
1911			KASSERT(i < fs->fs_old_ncyl);
1912			KASSERT(old_cbtorpos(fs, cgbno) >= 0);
1913			KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
1914			ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
1915			    needswap);
1916			ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
1917		}
1918	} else {
1919		bbase = cgbno - ffs_fragnum(fs, cgbno);
1920		/*
1921		 * decrement the counts associated with the old frags
1922		 */
1923		blk = blkmap(fs, blksfree, bbase);
1924		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
1925		/*
1926		 * deallocate the fragment
1927		 */
1928		frags = ffs_numfrags(fs, size);
1929		for (i = 0; i < frags; i++) {
1930			if (isset(blksfree, cgbno + i)) {
1931				panic("%s: freeing free frag: "
1932				    "dev = 0x%llx, block = %" PRId64
1933				    ", fs = %s", __func__,
1934				    (unsigned long long)dev, bno + i,
1935				    fs->fs_fsmnt);
1936			}
1937			setbit(blksfree, cgbno + i);
1938		}
1939		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1940		fs->fs_cstotal.cs_nffree += i;
1941		fs->fs_cs(fs, cg).cs_nffree += i;
1942		/*
1943		 * add back in counts associated with the new frags
1944		 */
1945		blk = blkmap(fs, blksfree, bbase);
1946		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
1947		/*
1948		 * if a complete block has been reassembled, account for it
1949		 */
1950		fragno = ffs_fragstoblks(fs, bbase);
1951		if (ffs_isblock(fs, blksfree, fragno)) {
1952			ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
1953			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
1954			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
1955			ffs_clusteracct(fs, cgp, fragno, 1);
1956			ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1957			fs->fs_cstotal.cs_nbfree++;
1958			fs->fs_cs(fs, cg).cs_nbfree++;
1959			if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1960			    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
1961				i = old_cbtocylno(fs, bbase);
1962				KASSERT(i >= 0);
1963				KASSERT(i < fs->fs_old_ncyl);
1964				KASSERT(old_cbtorpos(fs, bbase) >= 0);
1965				KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
1966				ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
1967				    bbase)], 1, needswap);
1968				ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
1969			}
1970		}
1971	}
1972	fs->fs_fmod = 1;
1973	ACTIVECG_CLR(fs, cg);
1974	mutex_exit(&ump->um_lock);
1975}
1976
1977/*
1978 * Free an inode.
1979 */
1980int
1981ffs_vfree(struct vnode *vp, ino_t ino, int mode)
1982{
1983
1984	return ffs_freefile(vp->v_mount, ino, mode);
1985}
1986
1987/*
1988 * Do the actual free operation.
1989 * The specified inode is placed back in the free map.
1990 *
1991 * => um_lock not held on entry or exit
1992 */
1993int
1994ffs_freefile(struct mount *mp, ino_t ino, int mode)
1995{
1996	struct ufsmount *ump = VFSTOUFS(mp);
1997	struct fs *fs = ump->um_fs;
1998	struct vnode *devvp;
1999	struct cg *cgp;
2000	struct buf *bp;
2001	int error;
2002	u_int cg;
2003	daddr_t cgbno;
2004	dev_t dev;
2005	const int needswap = UFS_FSNEEDSWAP(fs);
2006
2007	cg = ino_to_cg(fs, ino);
2008	devvp = ump->um_devvp;
2009	dev = devvp->v_rdev;
2010	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
2011
2012	if (ino >= fs->fs_ipg * fs->fs_ncg)
2013		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
2014		    (unsigned long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
2015	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
2016	    B_MODIFY, &bp);
2017	if (error) {
2018		return (error);
2019	}
2020	cgp = (struct cg *)bp->b_data;
2021	if (!cg_chkmagic(cgp, needswap)) {
2022		brelse(bp, 0);
2023		return (0);
2024	}
2025
2026	ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
2027
2028	bdwrite(bp);
2029
2030	return 0;
2031}
2032
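/*
 * As ffs_freefile(), but free the inode in a snapshot's copy of the
 * cylinder group: devvp is the snapshot vnode, so the cg is read via
 * the snapshot's block mapping instead of the raw device.
 */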
2033int
2034ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
2035{
2036	struct ufsmount *ump;
2037	struct cg *cgp;
2038	struct buf *bp;
2039	int error, cg;
2040	daddr_t cgbno;
2041	dev_t dev;
2042	const int needswap = UFS_FSNEEDSWAP(fs);
2043
2044	KASSERT(devvp->v_type != VBLK);
2045
2046	cg = ino_to_cg(fs, ino);
2047	dev = VTOI(devvp)->i_devvp->v_rdev;
2048	ump = VFSTOUFS(devvp->v_mount);
2049	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2050	if (ino >= fs->fs_ipg * fs->fs_ncg)
2051		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
2052		    (unsigned long long)dev, (unsigned long long)ino,
2053		    fs->fs_fsmnt);
2054	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
2055	    B_MODIFY, &bp);
2056	if (error) {
2057		return (error);
2058	}
2059	cgp = (struct cg *)bp->b_data;
2060	if (!cg_chkmagic(cgp, needswap)) {
2061		brelse(bp, 0);
2062		return (0);
2063	}
2064	ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
2065
2066	bdwrite(bp);
2067
2068	return 0;
2069}
2070
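/*
 * Common bookkeeping for freeing an inode: clear its bit in the cg's
 * inode map, update the free inode (and, for directories, directory)
 * counts, and pull the inode allocation rotor back if the freed inode
 * precedes it.  The caller writes out the cg buffer.
 */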
2071static void
2072ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
2073    struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
2074{
2075	u_int cg;
2076	struct cg *cgp;
2077	u_int8_t *inosused;
2078	const int needswap = UFS_FSNEEDSWAP(fs);
2079	ino_t cgino;
2080
2081	cg = ino_to_cg(fs, ino);
2082	cgp = (struct cg *)bp->b_data;
2083	cgp->cg_old_time = ufs_rw32(time_second, needswap);
2084	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
2085	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
2086		cgp->cg_time = ufs_rw64(time_second, needswap);
2087	inosused = cg_inosused(cgp, needswap);
2088	cgino = ino % fs->fs_ipg;
2089	if (isclr(inosused, cgino)) {
2090		printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
2091		    (unsigned long long)dev, (unsigned long long)ino,
2092		    fs->fs_fsmnt);
2093		if (fs->fs_ronly == 0)
2094			panic("%s: freeing free inode", __func__);
2095	}
2096	clrbit(inosused, cgino);
2097	if (!devvp_is_snapshot)
2098		UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, ino, mode);
2099	if (cgino < ufs_rw32(cgp->cg_irotor, needswap))
2100		cgp->cg_irotor = ufs_rw32(cgino, needswap);
2101	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
2102	mutex_enter(&ump->um_lock);
2103	fs->fs_cstotal.cs_nifree++;
2104	fs->fs_cs(fs, cg).cs_nifree++;
2105	if ((mode & IFMT) == IFDIR) {
2106		ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
2107		fs->fs_cstotal.cs_ndir--;
2108		fs->fs_cs(fs, cg).cs_ndir--;
2109	}
2110	fs->fs_fmod = 1;
2111	ACTIVECG_CLR(fs, cg);
2112	mutex_exit(&ump->um_lock);
2113}
2114
2115/*
2116 * Check to see if a file is free.
2117 */
2118int
2119ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
2120{
2121	struct cg *cgp;
2122	struct buf *bp;
2123	daddr_t cgbno;
2124	int ret;
2125	u_int cg;
2126	u_int8_t *inosused;
2127	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
2128
2129	KASSERT(devvp_is_snapshot);
2130
2131	cg = ino_to_cg(fs, ino);
2132	if (devvp_is_snapshot)
2133		cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2134	else
2135		cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
2136	if (ino >= fs->fs_ipg * fs->fs_ncg)
2137		return 1;
2138	if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
2139		return 1;
2140	}
2141	cgp = (struct cg *)bp->b_data;
2142	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
2143		brelse(bp, 0);
2144		return 1;
2145	}
2146	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
2147	ino %= fs->fs_ipg;
2148	ret = isclr(inosused, ino);
2149	brelse(bp, 0);
2150	return ret;
2151}
2152
2153/*
2154 * Find a block of the specified size in the specified cylinder group.
2155 *
2156 * It is a panic if a request is made to find a block when none are
2157 * available.
2158 */
2159static int32_t
2160ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
2161{
2162	int32_t bno;
2163	int start, len, loc, i;
2164	int blk, field, subfield, pos;
2165	int ostart, olen;
2166	u_int8_t *blksfree;
2167	const int needswap = UFS_FSNEEDSWAP(fs);
2168
2169	/* KASSERT(mutex_owned(&ump->um_lock)); */
2170
2171	/*
2172	 * find the fragment by searching through the free block
2173	 * map for an appropriate bit pattern
2174	 */
2175	if (bpref)
2176		start = dtogd(fs, bpref) / NBBY;
2177	else
2178		start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
2179	blksfree = cg_blksfree(cgp, needswap);
2180	len = howmany(fs->fs_fpg, NBBY) - start;
2181	ostart = start;
2182	olen = len;
2183	loc = scanc((u_int)len,
2184		(const u_char *)&blksfree[start],
2185		(const u_char *)fragtbl[fs->fs_frag],
2186		(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
2187	if (loc == 0) {
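		/* nothing past 'start'; wrap and scan the front of the map */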
2188		len = start + 1;
2189		start = 0;
2190		loc = scanc((u_int)len,
2191			(const u_char *)&blksfree[0],
2192			(const u_char *)fragtbl[fs->fs_frag],
2193			(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
2194		if (loc == 0) {
2195			panic("%s: map corrupted: start=%d, len=%d, "
2196			    "fs = %s, offset=%d/%ld, cg %d", __func__,
2197			    ostart, olen, fs->fs_fsmnt,
2198			    ufs_rw32(cgp->cg_freeoff, needswap),
2199			    (long)blksfree - (long)cgp, cgp->cg_cgx);
2200			/* NOTREACHED */
2201		}
2202	}
2203	bno = (start + len - loc) * NBBY;
2204	cgp->cg_frotor = ufs_rw32(bno, needswap);
2205	/*
2206	 * found the byte in the map
2207	 * sift through the bits to find the selected frag
2208	 */
2209	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
2210		blk = blkmap(fs, blksfree, bno);
2211		blk <<= 1;
2212		field = around[allocsiz];
2213		subfield = inside[allocsiz];
2214		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
2215			if ((blk & field) == subfield)
2216				return (bno + pos);
2217			field <<= 1;
2218			subfield <<= 1;
2219		}
2220	}
2221	panic("%s: block not in map: bno=%d, fs=%s", __func__,
2222	    bno, fs->fs_fsmnt);
2223	/* return (-1); */
2224}
2225
2226/*
2227 * Fserr prints the name of a file system with an error diagnostic.
2228 *
2229 * The form of the error message is:
2230 *	fs: error message
2231 */
2232static void
2233ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
2234{
2235	KASSERT(cred != NULL);
2236
2237	if (cred == NOCRED || cred == FSCRED) {
2238		log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
2239		    curproc->p_pid, curproc->p_comm,
2240		    fs->fs_fsmnt, cp);
2241	} else {
2242		log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
2243		    kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
2244		    fs->fs_fsmnt, cp);
2245	}
2246}
2247