/*	$NetBSD: ffs_alloc.c,v 1.173 2024/05/13 00:24:19 msaitoh Exp $	*/

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_alloc.c	8.19 (Berkeley) 7/13/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.173 2024/05/13 00:24:19 msaitoh Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_uvm_page_trkown.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#ifdef UVM_PAGE_TRKOWN
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#endif

static daddr_t ffs_alloccg(struct inode *, u_int, daddr_t, int, int, int);
static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int);
static ino_t ffs_dirpref(struct inode *);
static daddr_t ffs_fragextend(struct inode *, u_int, daddr_t, int, int);
static void ffs_fserr(struct fs *, kauth_cred_t, const char *);
static daddr_t ffs_hashalloc(struct inode *, u_int, daddr_t, int, int, int,
    daddr_t (*)(struct inode *, u_int, daddr_t, int, int, int));
static daddr_t ffs_nodealloccg(struct inode *, u_int, daddr_t, int, int, int);
static int32_t ffs_mapsearch(struct fs *, struct cg *, daddr_t, int);
static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t,
    struct buf *, daddr_t, long, bool);
static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t,
    struct buf *, ino_t, int, bool);

/* if 1, changes in optimization strategy are logged */
int ffs_log_changeopt = 0;

/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];

/* Basic consistency check for block allocations */
static int
ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
    long size, dev_t dev, ino_t inum)
{
	if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 ||
	    ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
		panic("%s: bad size: dev = 0x%llx, bno = %" PRId64
		    " bsize = %d, size = %ld, fs = %s", func,
		    (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
	}

	if (bno >= fs->fs_size) {
		printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno,
		    (unsigned long long)inum);
		ffs_fserr(fs, NOCRED, "bad block");
		return EINVAL;
	}
	return 0;
}
/*
 * Allocate a block in the file system.
 *
 * The size of the requested block is given, which must be some
 * multiple of fs_fsize and <= fs_bsize.
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate a block:
 * 1) allocate the requested block.
 * 2) allocate a rotationally optimal block in the same cylinder.
 * 3) allocate a block in the same cylinder group.
 * 4) quadratically rehash into other cylinder groups, until an
 *    available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 * 1) allocate a block in the cylinder group that contains the
 *    inode for the file.
 * 2) quadratically rehash into other cylinder groups, until an
 *    available block is located.
 *
 * => called with um_lock held
 * => releases um_lock before returning
 */
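/*
 * For instance, on a file system with fs_fsize = 2 KB and
 * fs_bsize = 16 KB, the legal values of "size" here are
 * 2K, 4K, ..., 16K: whole fragments, at most one full block.
 * The KASSERTMSGs at the top of ffs_alloc() enforce exactly this.
 */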
int
ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
    int flags, kauth_cred_t cred, daddr_t *bnp)
{
	struct ufsmount *ump;
	struct fs *fs;
	daddr_t bno;
	u_int cg;
#if defined(QUOTA) || defined(QUOTA2)
	int error;
#endif

	fs = ip->i_fs;
	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

#ifdef UVM_PAGE_TRKOWN

	/*
	 * Sanity-check that allocations within the file size
	 * do not allow other threads to read the stale contents
	 * of newly allocated blocks.
	 * Usually pages will exist to cover the new allocation.
	 * There is an optimization in ffs_write() where we skip
	 * creating pages if several conditions are met:
	 *  - the file must not be mapped (in any user address space).
	 *  - the write must cover whole pages and whole blocks.
	 * If those conditions are not met then pages must exist and
	 * be locked by the current thread.
	 */

	struct vnode *vp = ITOV(ip);
	if (vp->v_type == VREG && (flags & IO_EXT) == 0 &&
	    ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) &&
	    ((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 ||
	    ffs_blkoff(fs, size) != 0)) {
		struct vm_page *pg __diagused;
		struct uvm_object *uobj = &vp->v_uobj;
		voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
		voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);

		rw_enter(uobj->vmobjlock, RW_WRITER);
		while (off < endoff) {
			pg = uvm_pagelookup(uobj, off);
			KASSERT((pg != NULL && pg->owner_tag != NULL &&
			    pg->owner == curproc->p_pid &&
			    pg->lowner == curlwp->l_lid));
			off += PAGE_SIZE;
		}
		rw_exit(uobj->vmobjlock);
	}
#endif

	*bnp = 0;

	KASSERTMSG((cred != NOCRED), "missing credential");
	KASSERTMSG(((u_int)size <= fs->fs_bsize),
	    "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
	KASSERTMSG((ffs_fragoff(fs, size) == 0),
	    "bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);

	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
		goto nospace;
	if (freespace(fs, fs->fs_minfree) <= 0 &&
	    kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
	    NULL, NULL) != 0)
		goto nospace;
#if defined(QUOTA) || defined(QUOTA2)
	mutex_exit(&ump->um_lock);
	if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
		return (error);
	mutex_enter(&ump->um_lock);
#endif

	if (bpref >= fs->fs_size)
		bpref = 0;
	if (bpref == 0)
		cg = ino_to_cg(fs, ip->i_number);
	else
		cg = dtog(fs, bpref);
	bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg);
	if (bno > 0) {
		DIP_ADD(ip, blocks, btodb(size));
		if (flags & IO_EXT)
			ip->i_flag |= IN_CHANGE;
		else
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bnp = bno;
		return (0);
	}
#if defined(QUOTA) || defined(QUOTA2)
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(size), cred, FORCE);
#endif
	if (flags & B_CONTIG) {
		/*
		 * XXX ump->um_lock handling is "suspect" at best.
		 * For the case where ffs_hashalloc() fails early
		 * in the B_CONTIG case we reach here with um_lock
		 * already unlocked, so we can't release it again
		 * like in the normal error path.  See kern/39206.
		 *
		 * Fail silently - it's up to our caller to report
		 * errors.
		 */
		return (ENOSPC);
	}
nospace:
	mutex_exit(&ump->um_lock);
	ffs_fserr(fs, cred, "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Reallocate a fragment to a bigger size.
 *
 * The number and size of the old block is given, and a preference
 * and new size is also specified. The allocator attempts to extend
 * the original block. Failing that, the regular block allocator is
 * invoked to get an appropriate block.
 *
 * => called with um_lock held
 * => return with um_lock released
 */
int
ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bprev, daddr_t bpref,
    int osize, int nsize, int flags, kauth_cred_t cred, struct buf **bpp,
    daddr_t *blknop)
{
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
	u_int cg, request;
	int error;
	daddr_t bno;

	fs = ip->i_fs;
	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

#ifdef UVM_PAGE_TRKOWN

	/*
	 * Sanity-check that allocations within the file size
	 * do not allow other threads to read the stale contents
	 * of newly allocated blocks.
	 * Unlike in ffs_alloc(), here pages must always exist
	 * for such allocations, because only the last block of a file
	 * can be a fragment and ffs_write() will reallocate the
	 * fragment to the new size using ufs_balloc_range(),
	 * which always creates pages to cover blocks it allocates.
	 */

	if (ITOV(ip)->v_type == VREG) {
		struct vm_page *pg __diagused;
		struct uvm_object *uobj = &ITOV(ip)->v_uobj;
		voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
		voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);

		rw_enter(uobj->vmobjlock, RW_WRITER);
		while (off < endoff) {
			pg = uvm_pagelookup(uobj, off);
			KASSERT(pg->owner == curproc->p_pid &&
			    pg->lowner == curlwp->l_lid);
			off += PAGE_SIZE;
		}
		rw_exit(uobj->vmobjlock);
	}
#endif

	KASSERTMSG((cred != NOCRED), "missing credential");
	KASSERTMSG(((u_int)osize <= fs->fs_bsize),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);
	KASSERTMSG((ffs_fragoff(fs, osize) == 0),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);
	KASSERTMSG(((u_int)nsize <= fs->fs_bsize),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);
	KASSERTMSG((ffs_fragoff(fs, nsize) == 0),
	    "bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
	    (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
	    fs->fs_fsmnt);

	if (freespace(fs, fs->fs_minfree) <= 0 &&
	    kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
	    NULL, NULL) != 0) {
		mutex_exit(&ump->um_lock);
		goto nospace;
	}

	if (bprev == 0) {
		panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %"
		    PRId64 ", fs = %s", __func__,
		    (unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
		    fs->fs_fsmnt);
	}
	mutex_exit(&ump->um_lock);

	/*
	 * Allocate the extra space in the buffer.
	 */
	if (bpp != NULL &&
	    (error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) {
		return (error);
	}
#if defined(QUOTA) || defined(QUOTA2)
	if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
		if (bpp != NULL) {
			brelse(bp, 0);
		}
		return (error);
	}
#endif
	/*
	 * Check for extension in the existing location.
	 */
	cg = dtog(fs, bprev);
	mutex_enter(&ump->um_lock);
	if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
		DIP_ADD(ip, blocks, btodb(nsize - osize));
		if (flags & IO_EXT)
			ip->i_flag |= IN_CHANGE;
		else
			ip->i_flag |= IN_CHANGE | IN_UPDATE;

		if (bpp != NULL) {
			if (bp->b_blkno != FFS_FSBTODB(fs, bno)) {
				panic("%s: bad blockno %#llx != %#llx",
				    __func__, (unsigned long long) bp->b_blkno,
				    (unsigned long long)FFS_FSBTODB(fs, bno));
			}
			allocbuf(bp, nsize, 1);
			memset((char *)bp->b_data + osize, 0, nsize - osize);
			mutex_enter(bp->b_objlock);
			KASSERT(!cv_has_waiters(&bp->b_done));
			bp->b_oflags |= BO_DONE;
			mutex_exit(bp->b_objlock);
			*bpp = bp;
		}
		if (blknop != NULL) {
			*blknop = bno;
		}
		return (0);
	}
	/*
	 * Allocate a new disk location.
	 */
	if (bpref >= fs->fs_size)
		bpref = 0;
	switch ((int)fs->fs_optim) {
	case FS_OPTSPACE:
		/*
		 * Allocate an exact sized fragment. Although this makes
		 * best use of space, we will waste time relocating it if
		 * the file continues to grow. If the fragmentation is
		 * less than half of the minimum free reserve, we choose
		 * to begin optimizing for time.
		 */
		request = nsize;
		if (fs->fs_minfree < 5 ||
		    fs->fs_cstotal.cs_nffree >
		    fs->fs_dsize * fs->fs_minfree / (2 * 100))
			break;

		if (ffs_log_changeopt) {
			log(LOG_NOTICE,
			    "%s: optimization changed from SPACE to TIME\n",
			    fs->fs_fsmnt);
		}

		fs->fs_optim = FS_OPTTIME;
		break;
	case FS_OPTTIME:
		/*
		 * At this point we have discovered a file that is trying to
		 * grow a small fragment to a larger fragment. To save time,
		 * we allocate a full sized block, then free the unused portion.
		 * If the file continues to grow, the `ffs_fragextend' call
		 * above will be able to grow it in place without further
		 * copying. If aberrant programs cause disk fragmentation to
		 * grow within 2% of the free reserve, we choose to begin
		 * optimizing for space.
		 */
		request = fs->fs_bsize;
		if (fs->fs_cstotal.cs_nffree <
		    fs->fs_dsize * (fs->fs_minfree - 2) / 100)
			break;

		if (ffs_log_changeopt) {
			log(LOG_NOTICE,
			    "%s: optimization changed from TIME to SPACE\n",
			    fs->fs_fsmnt);
		}

		fs->fs_optim = FS_OPTSPACE;
		break;
	default:
		panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s",
		    __func__, (unsigned long long)ip->i_dev, fs->fs_optim,
		    fs->fs_fsmnt);
		/* NOTREACHED */
	}
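	/*
	 * Worked example of the switch above: with fs_minfree = 5% and
	 * fs_dsize = 1,000,000 fragments, SPACE flips to TIME once free
	 * fragments drop to 25,000 or fewer (half of the 5% reserve),
	 * and TIME flips back to SPACE once free fragments climb to
	 * 30,000 (minfree - 2 = 3% of fs_dsize).
	 */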
	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg);
	if (bno > 0) {
		/*
		 * Use forced deallocation registration, we can't handle
		 * failure here. This is safe, as this place is hit at
		 * most once per write operation, when a fragment is
		 * extended to a longer fragment, or to a full block.
		 */
		if ((ip->i_ump->um_mountp->mnt_wapbl) &&
		    (ITOV(ip)->v_type != VREG)) {
			/* this should never fail */
			error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(
			    ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
			    osize);
			if (error)
				panic("ffs_realloccg: dealloc registration failed");
		} else {
			ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
			    ip->i_number);
		}
		DIP_ADD(ip, blocks, btodb(nsize - osize));
		if (flags & IO_EXT)
			ip->i_flag |= IN_CHANGE;
		else
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		if (bpp != NULL) {
			bp->b_blkno = FFS_FSBTODB(fs, bno);
			allocbuf(bp, nsize, 1);
			memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
			mutex_enter(bp->b_objlock);
			KASSERT(!cv_has_waiters(&bp->b_done));
			bp->b_oflags |= BO_DONE;
			mutex_exit(bp->b_objlock);
			*bpp = bp;
		}
		if (blknop != NULL) {
			*blknop = bno;
		}
		return (0);
	}
	mutex_exit(&ump->um_lock);

#if defined(QUOTA) || defined(QUOTA2)
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
#endif
	if (bpp != NULL) {
		brelse(bp, 0);
	}

nospace:
	/*
	 * no space available
	 */
	ffs_fserr(fs, cred, "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Allocate an inode in the file system.
 *
 * If allocating a directory, use ffs_dirpref to select the inode.
 * If allocating in a directory, the following hierarchy is followed:
 * 1) allocate the preferred inode.
 * 2) allocate an inode in the same cylinder group.
 * 3) quadratically rehash into other cylinder groups, until an
 *    available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 * 1) allocate an inode in cylinder group 0.
 * 2) quadratically rehash into other cylinder groups, until an
 *    available inode is located.
 *
 * => um_lock not held upon entry or return
 */
int
ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop)
{
	struct ufsmount *ump;
	struct inode *pip;
	struct fs *fs;
	ino_t ino, ipref;
	u_int cg;
	int error;

	UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);

	pip = VTOI(pvp);
	fs = pip->i_fs;
	ump = pip->i_ump;

	error = UFS_WAPBL_BEGIN(pvp->v_mount);
	if (error) {
		return error;
	}
	mutex_enter(&ump->um_lock);
	if (fs->fs_cstotal.cs_nifree == 0)
		goto noinodes;

	if ((mode & IFMT) == IFDIR)
		ipref = ffs_dirpref(pip);
	else
		ipref = pip->i_number;
	if (ipref >= fs->fs_ncg * fs->fs_ipg)
		ipref = 0;
	cg = ino_to_cg(fs, ipref);
	/*
	 * Track the number of dirs created one after another
	 * in the same cg without intervening files.
	 */
	if ((mode & IFMT) == IFDIR) {
		if (fs->fs_contigdirs[cg] < 255)
			fs->fs_contigdirs[cg]++;
	} else {
		if (fs->fs_contigdirs[cg] > 0)
			fs->fs_contigdirs[cg]--;
	}
	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg);
	if (ino == 0)
		goto noinodes;
	UFS_WAPBL_END(pvp->v_mount);
	*inop = ino;
	return 0;

noinodes:
	mutex_exit(&ump->um_lock);
	UFS_WAPBL_END(pvp->v_mount);
	ffs_fserr(fs, cred, "out of inodes");
	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
	return ENOSPC;
}

/*
 * Find a cylinder group in which to place a directory.
 *
 * The policy implemented by this algorithm is to allocate a
 * directory inode in the same cylinder group as its parent
 * directory, but also to reserve space for its files' inodes
 * and data. Restrict the number of directories which may be
 * allocated one after another in the same cylinder group
 * without intervening allocation of files.
 *
 * If we allocate a first level directory then force allocation
 * in another cylinder group.
 */
static ino_t
ffs_dirpref(struct inode *pip)
{
	register struct fs *fs;
	u_int cg, prefcg;
	uint64_t dirsize, cgsize, curdsz;
	u_int avgifree, avgbfree, avgndir;
	u_int minifree, minbfree, maxndir;
	u_int mincg, minndir;
	u_int maxcontigdirs;

	KASSERT(mutex_owned(&pip->i_ump->um_lock));

	fs = pip->i_fs;

	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;

	/*
	 * Force allocation in another cg if creating a first level dir.
	 */
	if (ITOV(pip)->v_vflag & VV_ROOT) {
		prefcg = cprng_fast32() % fs->fs_ncg;
		mincg = prefcg;
		minndir = fs->fs_ipg;
		for (cg = prefcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		for (cg = 0; cg < prefcg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		return ((ino_t)(fs->fs_ipg * mincg));
	}

	/*
	 * Count various limits which are used for
	 * optimal allocation of a directory inode.
	 * Try cylinder groups with >75% avgifree and avgbfree.
	 * Avoid cylinder groups with no free blocks or inodes as that
	 * triggers an I/O-expensive cylinder group scan.
	 */
	maxndir = uimin(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
	minifree = avgifree - avgifree / 4;
	if (minifree < 1)
		minifree = 1;
	minbfree = avgbfree - avgbfree / 4;
	if (minbfree < 1)
		minbfree = 1;
	cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
	dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
	if (avgndir != 0) {
		curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
		if (dirsize < curdsz)
			dirsize = curdsz;
	}
	if (cgsize < dirsize * 255)
		maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
	else
		maxcontigdirs = 255;
	if (fs->fs_avgfpdir > 0)
		maxcontigdirs = uimin(maxcontigdirs,
		    fs->fs_ipg / fs->fs_avgfpdir);
	if (maxcontigdirs == 0)
		maxcontigdirs = 1;

	/*
	 * Limit number of dirs in one cg and reserve space for
	 * regular files, but only if we have no deficit in
	 * inodes or space.
	 */
	prefcg = ino_to_cg(fs, pip->i_number);
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	/*
	 * This is a backstop when we are deficient in space.
	 */
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			return ((ino_t)(fs->fs_ipg * cg));
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			break;
	return ((ino_t)(fs->fs_ipg * cg));
}
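/*
 * Example of the maxcontigdirs clamp above: with avgbfree = 1000
 * free blocks of 16 KB and an estimated directory footprint
 * (dirsize) of 512 KB, maxcontigdirs = (1000 * 16K) / 512K = 31,
 * so after 31 consecutive directory creations in one cylinder
 * group (tracked in fs_contigdirs[]) the preference moves on.
 */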

/*
 * Select the desired position for the next block in a file. The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
 *
 * If no blocks have been allocated in the first section, the policy is to
 * request a block in the same cylinder group as the inode that describes
 * the file. If no blocks have been allocated in any other section, the
 * policy is to place the section in a cylinder group with a greater than
 * average number of free blocks. An appropriate cylinder group is found
 * by using a rotor that sweeps the cylinder groups. When a new group of
 * blocks is needed, the sweep begins in the cylinder group following the
 * cylinder group from which the previous allocation was made. The sweep
 * continues until a cylinder group with greater than the average number
 * of free blocks is found. If the allocation is for the first block in an
 * indirect block, the information on the previous allocation is unavailable;
 * here a best guess is made based upon the logical block number being
 * allocated.
 *
 * If a section is already partially allocated, the policy is to
 * contiguously allocate fs_maxcontig blocks. The end of one of these
 * contiguous blocks and the beginning of the next is laid out
 * contiguously if possible.
 *
 * => um_lock held on entry and exit
 */
daddr_t
ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
    int32_t *bap /* XXX ondisk32 */)
{
	struct fs *fs;
	u_int cg;
	u_int avgbfree, startcg;

	KASSERT(mutex_owned(&ip->i_ump->um_lock));

	fs = ip->i_fs;

	/*
	 * If allocating a contiguous file with B_CONTIG, use the hints
	 * in the inode extensions to return the desired block.
	 *
	 * For metadata (indirect blocks) return the address of where
	 * the first indirect block resides - we'll scan for the next
	 * available slot if we need to allocate more than one indirect
	 * block.  For data, return the address of the actual block
	 * relative to the address of the first data block.
	 */
	if (flags & B_CONTIG) {
		KASSERT(ip->i_ffs_first_data_blk != 0);
		KASSERT(ip->i_ffs_first_indir_blk != 0);
		if (flags & B_METAONLY)
			return ip->i_ffs_first_indir_blk;
		else
			return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
	}

	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
			cg = ino_to_cg(fs, ip->i_number);
			return (cgbase(fs, cg) + fs->fs_frag);
		}
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			startcg =
			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs,
			    ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		for (cg = 0; cg < startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		return (0);
	}
	/*
	 * We just always try to lay things out contiguously.
	 */
	return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}
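/*
 * For example, with fs_maxbpg = 2048 and 16 KB blocks, each 32 MB
 * "section" of a large file starts a fresh sweep of the cylinder
 * groups, which is what spreads big files across the disk instead
 * of exhausting one cylinder group.  ffs_blkpref_ufs2() below is
 * the same algorithm for 64-bit (UFS2) block pointers.
 */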

daddr_t
ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
    int64_t *bap)
{
	struct fs *fs;
	u_int cg;
	u_int avgbfree, startcg;

	KASSERT(mutex_owned(&ip->i_ump->um_lock));

	fs = ip->i_fs;

	/*
	 * If allocating a contiguous file with B_CONTIG, use the hints
	 * in the inode extensions to return the desired block.
	 *
	 * For metadata (indirect blocks) return the address of where
	 * the first indirect block resides - we'll scan for the next
	 * available slot if we need to allocate more than one indirect
	 * block.  For data, return the address of the actual block
	 * relative to the address of the first data block.
	 */
	if (flags & B_CONTIG) {
		KASSERT(ip->i_ffs_first_data_blk != 0);
		KASSERT(ip->i_ffs_first_indir_blk != 0);
		if (flags & B_METAONLY)
			return ip->i_ffs_first_indir_blk;
		else
			return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
	}

	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
			cg = ino_to_cg(fs, ip->i_number);
			return (cgbase(fs, cg) + fs->fs_frag);
		}
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			startcg =
			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs,
			    ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		for (cg = 0; cg < startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				return (cgbase(fs, cg) + fs->fs_frag);
			}
		return (0);
	}
	/*
	 * We just always try to lay things out contiguously.
	 */
	return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}

/*
 * Implement the cylinder overflow algorithm.
 *
 * The policy implemented by this algorithm is:
 * 1) allocate the block in its requested cylinder group.
 * 2) quadratically rehash on the cylinder group number.
 * 3) brute force search for a free block.
 *
 * => called with um_lock held
 * => returns with um_lock released on success, held on failure
 *    (*allocator releases lock on success, retains lock on failure)
 */
/*VARARGS5*/
static daddr_t
ffs_hashalloc(struct inode *ip, u_int cg, daddr_t pref,
    int size /* size for data blocks, mode for inodes */,
    int realsize,
    int flags,
    daddr_t (*allocator)(struct inode *, u_int, daddr_t, int, int, int))
{
	struct fs *fs;
	daddr_t result;
	u_int i, icg = cg;

	fs = ip->i_fs;
	/*
	 * 1: preferred cylinder group
	 */
	result = (*allocator)(ip, cg, pref, size, realsize, flags);
	if (result)
		return (result);

	if (flags & B_CONTIG)
		return (result);
	/*
	 * 2: quadratic rehash
	 */
	for (i = 1; i < fs->fs_ncg; i *= 2) {
		cg += i;
		if (cg >= fs->fs_ncg)
			cg -= fs->fs_ncg;
		result = (*allocator)(ip, cg, 0, size, realsize, flags);
		if (result)
			return (result);
	}
	/*
	 * 3: brute force search
	 * Note that we start at i == 2, since 0 was checked initially,
	 * and 1 is always checked in the quadratic rehash.
	 */
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size, realsize, flags);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	return (0);
}
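/*
 * The rehash above probes cumulative offsets 1, 3, 7, 15, ... from
 * the starting group: e.g. for icg = 5 and fs_ncg = 32 it visits
 * cgs 6, 8, 12, 20 and 4 (wrapping mod 32) before falling back to
 * the linear scan in step 3.
 */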

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 *
 * => called with um_lock held
 * => returns with um_lock released on success, held on failure
 */
static daddr_t
ffs_fragextend(struct inode *ip, u_int cg, daddr_t bprev, int osize, int nsize)
{
	struct ufsmount *ump;
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	daddr_t bno;
	int frags, bbase;
	int i, error;
	u_int8_t *blksfree;

	fs = ip->i_fs;
	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

	if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
		return (0);
	frags = ffs_numfrags(fs, nsize);
	bbase = ffs_fragnum(fs, bprev);
	if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (0);
	}
	mutex_exit(&ump->um_lock);
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
		goto fail;
	cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
	bno = dtogd(fs, bprev);
	blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
	for (i = ffs_numfrags(fs, osize); i < frags; i++)
		if (isclr(blksfree, bno + i))
			goto fail;
	/*
	 * the current fragment can be extended
	 * deduct the count on fragment being extended into
	 * increase the count on the remaining fragment (if any)
	 * allocate the extended piece
	 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1,
	    UFS_FSNEEDSWAP(fs));
	if (i != frags)
		ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
	mutex_enter(&ump->um_lock);
	for (i = ffs_numfrags(fs, osize); i < frags; i++) {
		clrbit(blksfree, bno + i);
		ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
		fs->fs_cstotal.cs_nffree--;
		fs->fs_cs(fs, cg).cs_nffree--;
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
	bdwrite(bp);
	return (bprev);

 fail:
	if (bp != NULL)
		brelse(bp, 0);
	mutex_enter(&ump->um_lock);
	return (0);
}
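/*
 * Example of the cg_frsum accounting above: growing a 2-fragment
 * tail to 4 fragments when the 3 fragments following it are free
 * removes one free run of length 3 (cg_frsum[3]--) and leaves one
 * run of length 1 after the new tail (cg_frsum[1]++); the two
 * claimed fragments are then cleared from blksfree one at a time.
 */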

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size is available,
 * and if it is, allocate it.
 */
static daddr_t
ffs_alloccg(struct inode *ip, u_int cg, daddr_t bpref, int size, int realsize,
    int flags)
{
	struct ufsmount *ump;
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	struct buf *bp;
	int32_t bno;
	daddr_t blkno;
	int error, frags, allocsiz, i;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	ump = ip->i_ump;

	KASSERT(mutex_owned(&ump->um_lock));

	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (0);
	mutex_exit(&ump->um_lock);
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
		goto fail;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	if (size == fs->fs_bsize) {
		mutex_enter(&ump->um_lock);
		blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
		ACTIVECG_CLR(fs, cg);
		mutex_exit(&ump->um_lock);

		/*
		 * If the actually needed size is lower, free the extra
		 * blocks now.  This is safe to call here, there is no
		 * outside reference to this block yet.  It is not
		 * necessary to keep um_lock locked.
		 */
		if (realsize != 0 && realsize < size) {
			ffs_blkfree_common(ip->i_ump, ip->i_fs,
			    ip->i_devvp->v_rdev,
			    bp, blkno + ffs_numfrags(fs, realsize),
			    (long)(size - realsize), false);
		}

		bdwrite(bp);
		return (blkno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
	blksfree = cg_blksfree(cgp, needswap);
	frags = ffs_numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0)
			goto fail;
		mutex_enter(&ump->um_lock);
		blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
		bno = dtogd(fs, blkno);
		for (i = frags; i < fs->fs_frag; i++)
			setbit(blksfree, bno + i);
		i = fs->fs_frag - frags;
		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		fs->fs_fmod = 1;
		ufs_add32(cgp->cg_frsum[i], 1, needswap);
		ACTIVECG_CLR(fs, cg);
		mutex_exit(&ump->um_lock);
		bdwrite(bp);
		return (blkno);
	}
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
#if 0
	/*
	 * XXX fvdl mapsearch will panic, and never return -1
	 * also: returning NULL as daddr_t ?
	 */
	if (bno < 0)
		goto fail;
#endif
	for (i = 0; i < frags; i++)
		clrbit(blksfree, bno + i);
	mutex_enter(&ump->um_lock);
	ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
	if (frags != allocsiz)
		ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
	blkno = cgbase(fs, cg) + bno;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
	bdwrite(bp);
	return blkno;

 fail:
	if (bp != NULL)
		brelse(bp, 0);
	mutex_enter(&ump->um_lock);
	return (0);
}
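/*
 * Example of the block carve-up above, with fs_frag = 8: a request
 * for 3 fragments in a cg with no suitable partial blocks takes a
 * full 8-fragment block via ffs_alloccgblk(), marks the trailing
 * 5 fragments free again (nffree += 5, cg_frsum[5]++), and returns
 * the first 3 fragments to the caller.
 */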

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 * 1) allocate the requested block.
 * 2) allocate a rotationally optimal block in the same cylinder.
 * 3) allocate the next available block on the block rotor for the
 *    specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 */
static daddr_t
ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize,
    int flags)
{
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	int cg;
	daddr_t blkno;
	int32_t bno;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(mutex_owned(&ip->i_ump->um_lock));

	cgp = (struct cg *)bp->b_data;
	blksfree = cg_blksfree(cgp, needswap);
	if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
		bpref = ufs_rw32(cgp->cg_rotor, needswap);
	} else {
		bpref = ffs_blknum(fs, bpref);
		bno = dtogd(fs, bpref);
		/*
		 * if the requested block is available, use it
		 */
		if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
			goto gotit;
		/*
		 * if the requested data block isn't available and we are
		 * trying to allocate a contiguous file, return an error.
		 */
		if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
			return (0);
	}

	/*
	 * Take the next available block in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
#if 0
	/*
	 * XXX jdolecek ffs_mapsearch() succeeds or panics
	 */
	if (bno < 0)
		return (0);
#endif
	cgp->cg_rotor = ufs_rw32(bno, needswap);
gotit:
	blkno = ffs_fragstoblks(fs, bno);
	ffs_clrblock(fs, blksfree, blkno);
	ffs_clusteracct(fs, cgp, blkno, -1);
	ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
	if ((fs->fs_magic == FS_UFS1_MAGIC) &&
	    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
		int cylno;
		cylno = old_cbtocylno(fs, bno);
		KASSERT(cylno >= 0);
		KASSERT(cylno < fs->fs_old_ncyl);
		KASSERT(old_cbtorpos(fs, bno) >= 0);
		KASSERT(fs->fs_old_nrpos == 0 ||
		    old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
		ufs_add16(old_cg_blks(fs, cgp, cylno,
		    needswap)[old_cbtorpos(fs, bno)], -1, needswap);
		ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
	}
	fs->fs_fmod = 1;
	cg = ufs_rw32(cgp->cg_cgx, needswap);
	blkno = cgbase(fs, cg) + bno;
	return (blkno);
}

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 * 1) allocate the requested inode.
 * 2) allocate the next available inode after the requested
 *    inode in the specified cylinder group.
 */
static daddr_t
ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode,
    int realsize, int flags)
{
	struct ufsmount *ump = ip->i_ump;
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	struct buf *bp, *ibp;
	u_int8_t *inosused;
	int error, start, len, loc, map, i;
	int32_t initediblk, maxiblk, irotor;
	daddr_t nalloc;
	struct ufs2_dinode *dp2;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(mutex_owned(&ump->um_lock));
	UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);

	if (fs->fs_cs(fs, cg).cs_nifree == 0)
		return (0);
	mutex_exit(&ump->um_lock);
	ibp = NULL;
	if (fs->fs_magic == FS_UFS2_MAGIC) {
		initediblk = -1;
	} else {
		initediblk = fs->fs_ipg;
	}
	maxiblk = initediblk;

retry:
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
		goto fail;

	if (ibp != NULL &&
	    initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
		/* Another thread allocated more inodes so we retry the test. */
		brelse(ibp, 0);
		ibp = NULL;
	}
	/*
	 * Check to see if we need to initialize more inodes.
	 */
	if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
		initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
		maxiblk = initediblk;
		nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
		if (nalloc + FFS_INOPB(fs) > initediblk &&
		    initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
			/*
			 * We have to release the cg buffer here to prevent
			 * a deadlock when reading the inode block will
			 * run a copy-on-write that might use this cg.
			 */
			brelse(bp, 0);
			bp = NULL;
			error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
			    ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
			    FFS_NOBLK, fs->fs_bsize, false, &ibp);
			if (error)
				goto fail;

			maxiblk += FFS_INOPB(fs);

			goto retry;
		}
	}

	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	inosused = cg_inosused(cgp, needswap);

	if (ipref) {
		ipref %= fs->fs_ipg;
		/* safeguard to stay in (to be) allocated range */
		if (ipref < maxiblk && isclr(inosused, ipref))
			goto gotit;
	}

	irotor = ufs_rw32(cgp->cg_irotor, needswap);

	KASSERTMSG(irotor < initediblk, "%s: allocation botch: cg=%d, irotor %d"
	    " out of bounds, initediblk=%d",
	    __func__, cg, irotor, initediblk);

	start = irotor / NBBY;
	len = howmany(maxiblk - irotor, NBBY);
	loc = skpc(0xff, len, &inosused[start]);
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = skpc(0xff, len, &inosused[0]);
		if (loc == 0) {
			panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s",
			    __func__, cg, ufs_rw32(cgp->cg_irotor, needswap),
			    fs->fs_fsmnt);
			/* NOTREACHED */
		}
	}
	i = start + len - loc;
	map = inosused[i] ^ 0xff;
	if (map == 0) {
		panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt);
	}

	ipref = i * NBBY + ffs(map) - 1;

	cgp->cg_irotor = ufs_rw32(ipref, needswap);

gotit:
	KASSERTMSG(ipref < maxiblk, "%s: allocation botch: cg=%d attempt to "
	    "allocate inode index %d beyond max allocated index %d"
	    " of %d inodes/cg",
	    __func__, cg, (int)ipref, maxiblk, cgp->cg_niblk);

	UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
	    mode);
	/*
	 * Check to see if we need to initialize more inodes.
	 */
	if (ibp != NULL) {
		KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
		memset(ibp->b_data, 0, fs->fs_bsize);
		dp2 = (struct ufs2_dinode *)(ibp->b_data);
		for (i = 0; i < FFS_INOPB(fs); i++) {
			/*
			 * Don't bother to swap, it's supposed to be
			 * random, after all.
			 */
			dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
			dp2++;
		}
		initediblk += FFS_INOPB(fs);
		cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
	}

	mutex_enter(&ump->um_lock);
	ACTIVECG_CLR(fs, cg);
	setbit(inosused, ipref);
	ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
	fs->fs_cstotal.cs_nifree--;
	fs->fs_cs(fs, cg).cs_nifree--;
	fs->fs_fmod = 1;
	if ((mode & IFMT) == IFDIR) {
		ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
		fs->fs_cstotal.cs_ndir++;
		fs->fs_cs(fs, cg).cs_ndir++;
	}
	mutex_exit(&ump->um_lock);
	if (ibp != NULL) {
		bwrite(ibp);
		bwrite(bp);
	} else
		bdwrite(bp);
	return ((ino_t)(cg * fs->fs_ipg + ipref));
 fail:
	if (bp != NULL)
		brelse(bp, 0);
	if (ibp != NULL)
		brelse(ibp, 0);
	mutex_enter(&ump->um_lock);
	return (0);
}
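/*
 * Example of the bitmap scan above: skpc() skips bytes that are
 * 0xff (all eight inodes in use).  If it stops at inosused[i] =
 * 0xe7 (binary 11100111), then map = 0xe7 ^ 0xff = 0x18 and
 * ffs(0x18) = 4, so ipref = i * NBBY + 3 - the first free inode
 * is bit 3 of that byte.
 */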

/*
 * Allocate a block or fragment.
 *
 * The specified block or fragment is removed from the
 * free map, possibly fragmenting a block in the process.
 *
 * This implementation should mirror ffs_blkfree
 *
 * => um_lock not held on entry or exit
 */
int
ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
{
	int error;

	error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
	    ip->i_dev, ip->i_uid);
	if (error)
		return error;

	return ffs_blkalloc_ump(ip->i_ump, bno, size);
}

int
ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
{
	struct fs *fs = ump->um_fs;
	struct cg *cgp;
	struct buf *bp;
	int32_t fragno, cgbno;
	int i, error, blk, frags, bbase;
	u_int cg;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 &&
	    ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
	KASSERT(bno < fs->fs_size);

	cg = dtog(fs, bno);
	error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, B_MODIFY, &bp);
	if (error) {
		return error;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return EIO;
	}
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	cgp->cg_time = ufs_rw64(time_second, needswap);
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp, needswap);

	mutex_enter(&ump->um_lock);
	if (size == fs->fs_bsize) {
		fragno = ffs_fragstoblks(fs, cgbno);
		if (!ffs_isblock(fs, blksfree, fragno)) {
			mutex_exit(&ump->um_lock);
			brelse(bp, 0);
			return EBUSY;
		}
		ffs_clrblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, -1);
		ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
		fs->fs_cstotal.cs_nbfree--;
		fs->fs_cs(fs, cg).cs_nbfree--;
	} else {
		bbase = cgbno - ffs_fragnum(fs, cgbno);

		frags = ffs_numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isclr(blksfree, cgbno + i)) {
				mutex_exit(&ump->um_lock);
				brelse(bp, 0);
				return EBUSY;
			}
		}
		/*
		 * if a complete block is being split, account for it
		 */
		fragno = ffs_fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
			fs->fs_cstotal.cs_nffree += fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, -1);
			ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
			fs->fs_cstotal.cs_nbfree--;
			fs->fs_cs(fs, cg).cs_nbfree--;
		}
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
		/*
		 * allocate the fragment
		 */
		for (i = 0; i < frags; i++) {
			clrbit(blksfree, cgbno + i);
		}
		ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
		fs->fs_cstotal.cs_nffree -= i;
		fs->fs_cs(fs, cg).cs_nffree -= i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
	bdwrite(bp);
	return 0;
}

/*
 * Free a block or fragment.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 *
 * => um_lock not held on entry or exit
 */
static void
ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
{
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	daddr_t cgblkno;
	int error;
	u_int cg;
	dev_t dev;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(!devvp_is_snapshot);

	cg = dtog(fs, bno);
	dev = devvp->v_rdev;
	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
	KASSERT(fs == ump->um_fs);
	cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));

	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return;
	}

	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);

	bdwrite(bp);
}

struct discardopdata {
	struct work wk; /* must be first */
	struct vnode *devvp;
	daddr_t bno;
	long size;
};

struct discarddata {
	struct fs *fs;
	struct discardopdata *entry;
	long maxsize;
	kmutex_t entrylk;
	struct workqueue *wq;
	int wqcnt, wqdraining;
	kmutex_t wqlk;
	kcondvar_t wqcv;
	/* timer for flush? */
};

static void
ffs_blkfree_td(struct fs *fs, struct discardopdata *td)
{
	struct mount *mp = spec_node_getmountedfs(td->devvp);
	long todo;
	int error;

	while (td->size) {
		todo = uimin(td->size,
		    ffs_lfragtosize(fs, (fs->fs_frag -
		    ffs_fragnum(fs, td->bno))));
		error = UFS_WAPBL_BEGIN(mp);
		if (error) {
			printf("ffs: failed to begin wapbl transaction"
			    " for discard: %d\n", error);
			break;
		}
		ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
		UFS_WAPBL_END(mp);
		td->bno += ffs_numfrags(fs, todo);
		td->size -= todo;
	}
}

static void
ffs_discardcb(struct work *wk, void *arg)
{
	struct discardopdata *td = (void *)wk;
	struct discarddata *ts = arg;
	struct fs *fs = ts->fs;
	off_t start, len;
#ifdef TRIMDEBUG
	int error;
#endif

/* like FSBTODB but emits bytes; XXX move to fs.h */
#ifndef FFS_FSBTOBYTES
#define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift)
#endif

	start = FFS_FSBTOBYTES(fs, td->bno);
	len = td->size;
	vn_lock(td->devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef TRIMDEBUG
	error =
#endif
	VOP_FDISCARD(td->devvp, start, len);
	VOP_UNLOCK(td->devvp);
#ifdef TRIMDEBUG
	printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
#endif

	ffs_blkfree_td(fs, td);
	kmem_free(td, sizeof(*td));
	mutex_enter(&ts->wqlk);
	ts->wqcnt--;
	if (ts->wqdraining && !ts->wqcnt)
		cv_signal(&ts->wqcv);
	mutex_exit(&ts->wqlk);
}

void *
ffs_discard_init(struct vnode *devvp, struct fs *fs)
{
	struct discarddata *ts;
	int error;

	ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
	error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
	    PRI_USER, IPL_NONE, 0);
	if (error) {
		kmem_free(ts, sizeof (*ts));
		return NULL;
	}
	mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ts->wqcv, "trimwqcv");
	ts->maxsize = 100*1024; /* XXX */
	ts->fs = fs;
	return ts;
}

void
ffs_discard_finish(void *vts, int flags)
{
	struct discarddata *ts = vts;
	struct discardopdata *td = NULL;

	/* wait for workqueue to drain */
	mutex_enter(&ts->wqlk);
	if (ts->wqcnt) {
		ts->wqdraining = 1;
		cv_wait(&ts->wqcv, &ts->wqlk);
	}
	mutex_exit(&ts->wqlk);

	mutex_enter(&ts->entrylk);
	if (ts->entry) {
		td = ts->entry;
		ts->entry = NULL;
	}
	mutex_exit(&ts->entrylk);
	if (td) {
		/* XXX don't tell the disk, it's optional */
		ffs_blkfree_td(ts->fs, td);
#ifdef TRIMDEBUG
		printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
		kmem_free(td, sizeof(*td));
	}

	cv_destroy(&ts->wqcv);
	mutex_destroy(&ts->entrylk);
	mutex_destroy(&ts->wqlk);
	workqueue_destroy(ts->wq);
	kmem_free(ts, sizeof(*ts));
}

void
ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
    ino_t inum)
{
	struct ufsmount *ump;
	int error;
	dev_t dev;
	struct discarddata *ts;
	struct discardopdata *td;

	dev = devvp->v_rdev;
	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
	if (ffs_snapblkfree(fs, devvp, bno, size, inum))
		return;

	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
	if (error)
		return;

	if (!ump->um_discarddata) {
		ffs_blkfree_cg(fs, devvp, bno, size);
		return;
	}

#ifdef TRIMDEBUG
	printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
#endif
	ts = ump->um_discarddata;
	td = NULL;

	mutex_enter(&ts->entrylk);
	if (ts->entry) {
		td = ts->entry;
		/* ffs deallocs backwards, check for prepend only */
		if (td->bno == bno + ffs_numfrags(fs, size)
		    && td->size + size <= ts->maxsize) {
			td->bno = bno;
			td->size += size;
			if (td->size < ts->maxsize) {
#ifdef TRIMDEBUG
				printf("defer(%" PRId64 ",%ld)\n",
				    td->bno, td->size);
#endif
				mutex_exit(&ts->entrylk);
				return;
			}
			size = 0; /* mark done */
		}
		ts->entry = NULL;
	}
	mutex_exit(&ts->entrylk);

	if (td) {
#ifdef TRIMDEBUG
		printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
		mutex_enter(&ts->wqlk);
		ts->wqcnt++;
		mutex_exit(&ts->wqlk);
		workqueue_enqueue(ts->wq, &td->wk, NULL);
	}
	if (!size)
		return;

	td = kmem_alloc(sizeof(*td), KM_SLEEP);
	td->devvp = devvp;
	td->bno = bno;
	td->size = size;

	if (td->size < ts->maxsize) { /* XXX always the case */
		mutex_enter(&ts->entrylk);
		if (!ts->entry) { /* possible race? */
#ifdef TRIMDEBUG
			printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
			ts->entry = td;
			td = NULL;
		}
		mutex_exit(&ts->entrylk);
	}
	if (td) {
#ifdef TRIMDEBUG
		printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
		mutex_enter(&ts->wqlk);
		ts->wqcnt++;
		mutex_exit(&ts->wqlk);
		workqueue_enqueue(ts->wq, &td->wk, NULL);
	}
}
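/*
 * Coalescing example for the discard path above: truncation frees a
 * file's blocks from the end towards the start, so if the pending
 * entry covers [bno + size, bno + size + td->size) the newly freed
 * range [bno, bno + size) is glued onto its front.  Once the merged
 * extent reaches ts->maxsize it is handed to the trim workqueue,
 * which issues one VOP_FDISCARD() for the whole extent and only
 * then returns the blocks to the free map.
 */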

/*
 * Free a block or fragment from a snapshot cg copy.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 *
 * => um_lock not held on entry or exit
 */
void
ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
    ino_t inum)
{
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	daddr_t cgblkno;
	int error, cg;
	dev_t dev;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(devvp_is_snapshot);

	cg = dtog(fs, bno);
	dev = VTOI(devvp)->i_devvp->v_rdev;
	ump = VFSTOUFS(devvp->v_mount);
	cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));

	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
	if (error)
		return;

	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return;
	}

	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);

	bdwrite(bp);
}
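
/*
 * Toy model of the fragment bookkeeping performed by
 * ffs_blkfree_common() below, as an illustrative sketch (not
 * compiled; plain C stands in for the cg bitmap macros).  With
 * fs_frag == 8, freeing frags 2..5 of a block whose frags 0..1 and
 * 6..7 are already free leaves all eight bits set, at which point the
 * code converts eight per-frag free counts into one full free block.
 */
#if 0
static void
example_frag_reassembly(void)
{
	unsigned char blksfree = 0xC3;	/* frags 0,1,6,7 free: 11000011 */
	int i;

	for (i = 2; i <= 5; i++)	/* free frags 2..5, cf. setbit() */
		blksfree |= 1 << i;

	if (blksfree == 0xFF) {
		/*
		 * cf. ffs_isblock(): the whole block has been
		 * reassembled, so cs_nffree -= fs_frag and
		 * cs_nbfree += 1 in the real code.
		 */
	}
}
#endif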

static void
ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
    struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
{
	struct cg *cgp;
	int32_t fragno, cgbno;
	int i, blk, frags, bbase;
	u_int cg;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = dtog(fs, bno);
	cgp = (struct cg *)bp->b_data;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp, needswap);
	mutex_enter(&ump->um_lock);
	if (size == fs->fs_bsize) {
		fragno = ffs_fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp_is_snapshot) {
				mutex_exit(&ump->um_lock);
				return;
			}
			panic("%s: freeing free block: dev = 0x%llx, block = %"
			    PRId64 ", fs = %s", __func__,
			    (unsigned long long)dev, bno, fs->fs_fsmnt);
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, 1);
		ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
		if ((fs->fs_magic == FS_UFS1_MAGIC) &&
		    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
			i = old_cbtocylno(fs, cgbno);
			KASSERT(i >= 0);
			KASSERT(i < fs->fs_old_ncyl);
			KASSERT(old_cbtorpos(fs, cgbno) >= 0);
			KASSERT(fs->fs_old_nrpos == 0 ||
			    old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
			ufs_add16(old_cg_blks(fs, cgp, i,
			    needswap)[old_cbtorpos(fs, cgbno)], 1, needswap);
			ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
		}
	} else {
		bbase = cgbno - ffs_fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
		/*
		 * deallocate the fragment
		 */
		frags = ffs_numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				panic("%s: freeing free frag: "
				    "dev = 0x%llx, block = %" PRId64
				    ", fs = %s", __func__,
				    (unsigned long long)dev, bno + i,
				    fs->fs_fsmnt);
			}
			setbit(blksfree, cgbno + i);
		}
		ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = ffs_fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
			if ((fs->fs_magic == FS_UFS1_MAGIC) &&
			    ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
				i = old_cbtocylno(fs, bbase);
				KASSERT(i >= 0);
				KASSERT(i < fs->fs_old_ncyl);
				KASSERT(old_cbtorpos(fs, bbase) >= 0);
				KASSERT(fs->fs_old_nrpos == 0 ||
				    old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
				ufs_add16(old_cg_blks(fs, cgp, i,
				    needswap)[old_cbtorpos(fs, bbase)], 1,
				    needswap);
				ufs_add32(old_cg_blktot(cgp, needswap)[i], 1,
				    needswap);
			}
		}
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Free an inode.
 */
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{

	return ffs_freefile(vp->v_mount, ino, mode);
}

/*
 * Do the actual free operation.
 * The specified inode is placed back in the free map.
 *
 * => um_lock not held on entry or exit
 */
int
ffs_freefile(struct mount *mp, ino_t ino, int mode)
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct vnode *devvp;
	struct cg *cgp;
	struct buf *bp;
	int error;
	u_int cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	cg = ino_to_cg(fs, ino);
	devvp = ump->um_devvp;
	dev = devvp->v_rdev;
	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));

	if (ino >= fs->fs_ipg * fs->fs_ncg)
		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
		    (unsigned long long)dev, (unsigned long long)ino,
		    fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}

	ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);

	bdwrite(bp);

	return 0;
}

int
ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
{
	struct ufsmount *ump;
	struct cg *cgp;
	struct buf *bp;
	int error, cg;
	daddr_t cgbno;
	dev_t dev;
	const int needswap = UFS_FSNEEDSWAP(fs);

	KASSERT(devvp->v_type != VBLK);

	cg = ino_to_cg(fs, ino);
	dev = VTOI(devvp)->i_devvp->v_rdev;
	ump = VFSTOUFS(devvp->v_mount);
	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
		    (unsigned long long)dev, (unsigned long long)ino,
		    fs->fs_fsmnt);
	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
	    B_MODIFY, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, needswap)) {
		brelse(bp, 0);
		return (0);
	}
	ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);

	bdwrite(bp);

	return 0;
}
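
/*
 * Minimal sketch of the inode-map update done by ffs_freefile_common()
 * below (not compiled; plain C stands in for the cg macros).  An inode
 * is located by cylinder group (ino_to_cg, i.e. ino / fs_ipg under the
 * usual layout) and by its offset within the group (ino % fs_ipg), and
 * freeing it is a single clrbit() in cg_inosused plus count updates.
 */
#if 0
static void
example_inode_free(unsigned char *inosused, unsigned ino, unsigned ipg)
{
	unsigned cgino = ino % ipg;	/* offset within the group */

	/* cf. isclr(): freeing an already-free inode is an error */
	if ((inosused[cgino / 8] & (1 << (cgino % 8))) == 0)
		return;
	/* cf. clrbit(): mark the inode free in the map */
	inosused[cgino / 8] &= ~(1 << (cgino % 8));
}
#endif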

static void
ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
    struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
{
	u_int cg;
	struct cg *cgp;
	u_int8_t *inosused;
	const int needswap = UFS_FSNEEDSWAP(fs);
	ino_t cgino;

	cg = ino_to_cg(fs, ino);
	cgp = (struct cg *)bp->b_data;
	cgp->cg_old_time = ufs_rw32(time_second, needswap);
	if ((fs->fs_magic != FS_UFS1_MAGIC) ||
	    (fs->fs_old_flags & FS_FLAGS_UPDATED))
		cgp->cg_time = ufs_rw64(time_second, needswap);
	inosused = cg_inosused(cgp, needswap);
	cgino = ino % fs->fs_ipg;
	if (isclr(inosused, cgino)) {
		printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
		    (unsigned long long)dev, (unsigned long long)ino,
		    fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("%s: freeing free inode", __func__);
	}
	clrbit(inosused, cgino);
	if (!devvp_is_snapshot)
		UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, ino, mode);
	if (cgino < ufs_rw32(cgp->cg_irotor, needswap))
		cgp->cg_irotor = ufs_rw32(cgino, needswap);
	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
	mutex_enter(&ump->um_lock);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECG_CLR(fs, cg);
	mutex_exit(&ump->um_lock);
}

/*
 * Check to see if a file is free.
 */
int
ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
{
	struct cg *cgp;
	struct buf *bp;
	daddr_t cgbno;
	int ret;
	u_int cg;
	u_int8_t *inosused;
	const bool devvp_is_snapshot = (devvp->v_type != VBLK);

	KASSERT(devvp_is_snapshot);

	cg = ino_to_cg(fs, ino);
	if (devvp_is_snapshot)
		cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
	else
		cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		return 1;
	if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
		return 1;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
		brelse(bp, 0);
		return 1;
	}
	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
	ino %= fs->fs_ipg;
	ret = isclr(inosused, ino);
	brelse(bp, 0);
	return ret;
}
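
/*
 * Hypothetical usage sketch for ffs_checkfreefile() (not compiled; the
 * wrapper name is illustrative).  The function answers "is this inode
 * free in the snapshot's copy of the cylinder group?", and folds read
 * errors, out-of-range inode numbers and bad cg magic into the "free"
 * answer, so callers can only treat the result as advisory.
 */
#if 0
static bool
example_inode_was_freed(struct fs *fs, struct vnode *snapvp, ino_t ino)
{
	/* nonzero: free (or the cg copy could not be read/validated) */
	return ffs_checkfreefile(fs, snapvp, ino) != 0;
}
#endif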

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block when none are
 * available.
 */
static int32_t
ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
{
	int32_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;
	int ostart, olen;
	u_int8_t *blksfree;
	const int needswap = UFS_FSNEEDSWAP(fs);

	/* KASSERT(mutex_owned(&ump->um_lock)); */

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
	blksfree = cg_blksfree(cgp, needswap);
	len = howmany(fs->fs_fpg, NBBY) - start;
	ostart = start;
	olen = len;
	loc = scanc((u_int)len,
	    (const u_char *)&blksfree[start],
	    (const u_char *)fragtbl[fs->fs_frag],
	    (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = scanc((u_int)len,
		    (const u_char *)&blksfree[0],
		    (const u_char *)fragtbl[fs->fs_frag],
		    (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
		if (loc == 0) {
			panic("%s: map corrupted: start=%d, len=%d, "
			    "fs = %s, offset=%d/%ld, cg %d", __func__,
			    ostart, olen, fs->fs_fsmnt,
			    ufs_rw32(cgp->cg_freeoff, needswap),
			    (long)blksfree - (long)cgp, cgp->cg_cgx);
			/* NOTREACHED */
		}
	}
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = ufs_rw32(bno, needswap);
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, blksfree, bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	panic("%s: block not in map: bno=%d, fs=%s", __func__,
	    bno, fs->fs_fsmnt);
	/* return (-1); */
}

/*
 * Fserr prints the name of a file system with an error diagnostic.
 *
 * The form of the error message is:
 *	fs: error message
 */
static void
ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
{
	KASSERT(cred != NULL);

	if (cred == NOCRED || cred == FSCRED) {
		log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
		    curproc->p_pid, curproc->p_comm,
		    fs->fs_fsmnt, cp);
	} else {
		log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
		    kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
		    fs->fs_fsmnt, cp);
	}
}
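
/*
 * Worked example of the bit-pattern search in ffs_mapsearch() above,
 * as a sketch (not compiled).  The around[]/inside[] walk looks for a
 * run of exactly allocsiz free frags bounded by allocated frags or a
 * block edge (the blk <<= 1 shift supplies the low edge).  The toy
 * below redoes that search with explicit shifts for fs_frag == 8 and
 * allocsiz == 2; the mask arithmetic is illustrative of the technique,
 * not a copy of ffs_tables.c.
 */
#if 0
static int
example_find_run(unsigned blk)	/* free map for one block, bit set = free */
{
	int pos;

	/* slide a 2-frag window across the 8-frag block */
	for (pos = 0; pos <= 8 - 2; pos++) {
		unsigned run = (blk >> pos) & 0x3;	/* the two candidates */
		unsigned below = pos > 0 ? (blk >> (pos - 1)) & 0x1 : 0;
		unsigned above = pos < 6 ? (blk >> (pos + 2)) & 0x1 : 0;

		/* both frags free, neighbours allocated or block edge */
		if (run == 0x3 && below == 0 && above == 0)
			return pos;	/* exact-length run found */
	}
	return -1;	/* cf. the "block not in map" panic above */
}
#endif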