1/*- 2 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 60 * 61 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 62 */ 63 64#include <sys/cdefs.h> 65__FBSDID("$FreeBSD$"); 66 67#include "opt_quota.h" 68 69#include <sys/param.h> 70#include <sys/capsicum.h> 71#include <sys/systm.h> 72#include <sys/bio.h> 73#include <sys/buf.h> 74#include <sys/conf.h> 75#include <sys/fcntl.h> 76#include <sys/file.h> 77#include <sys/filedesc.h> 78#include <sys/priv.h> 79#include <sys/proc.h> 80#include <sys/vnode.h> 81#include <sys/mount.h> 82#include <sys/kernel.h> 83#include <sys/syscallsubr.h> 84#include <sys/sysctl.h> 85#include <sys/syslog.h> 86#include <sys/taskqueue.h> 87 88#include <security/audit/audit.h> 89 90#include <geom/geom.h> 91#include <geom/geom_vfs.h> 92 93#include <ufs/ufs/dir.h> 94#include <ufs/ufs/extattr.h> 95#include <ufs/ufs/quota.h> 96#include <ufs/ufs/inode.h> 97#include <ufs/ufs/ufs_extern.h> 98#include <ufs/ufs/ufsmount.h> 99 100#include <ufs/ffs/fs.h> 101#include <ufs/ffs/ffs_extern.h> 102#include <ufs/ffs/softdep.h> 103 104typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref, 105 int size, int rsize); 106 107static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int); 108static ufs2_daddr_t 109 ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 110static void ffs_blkfree_cg(struct ufsmount *, struct fs *, 111 struct vnode *, ufs2_daddr_t, long, ino_t, 112 struct workhead *); 113#ifdef INVARIANTS 114static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); 115#endif 116static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int); 117static ino_t ffs_dirpref(struct inode *); 118static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, 119 int, int); 120static ufs2_daddr_t ffs_hashalloc 121 (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); 122static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, 123 int); 124static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 125static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 126static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 127static void ffs_ckhash_cg(struct buf *); 128 129/* 130 * Allocate a block in the filesystem. 131 * 132 * The size of the requested block is given, which must be some 133 * multiple of fs_fsize and <= fs_bsize. 134 * A preference may be optionally specified. If a preference is given 135 * the following hierarchy is used to allocate a block: 136 * 1) allocate the requested block. 137 * 2) allocate a rotationally optimal block in the same cylinder. 138 * 3) allocate a block in the same cylinder group. 139 * 4) quadradically rehash into other cylinder groups, until an 140 * available block is located. 141 * If no block preference is given the following hierarchy is used 142 * to allocate a block: 143 * 1) allocate a block in the cylinder group that contains the 144 * inode for the file. 145 * 2) quadradically rehash into other cylinder groups, until an 146 * available block is located. 147 */ 148int 149ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp) 150 struct inode *ip; 151 ufs2_daddr_t lbn, bpref; 152 int size, flags; 153 struct ucred *cred; 154 ufs2_daddr_t *bnp; 155{ 156 struct fs *fs; 157 struct ufsmount *ump; 158 ufs2_daddr_t bno; 159 u_int cg, reclaimed; 160 int64_t delta; 161#ifdef QUOTA 162 int error; 163#endif 164 165 *bnp = 0; 166 ump = ITOUMP(ip); 167 fs = ump->um_fs; 168 mtx_assert(UFS_MTX(ump), MA_OWNED); 169#ifdef INVARIANTS 170 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 171 printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 172 devtoname(ump->um_dev), (long)fs->fs_bsize, size, 173 fs->fs_fsmnt); 174 panic("ffs_alloc: bad size"); 175 } 176 if (cred == NOCRED) 177 panic("ffs_alloc: missing credential"); 178#endif /* INVARIANTS */ 179 reclaimed = 0; 180retry: 181#ifdef QUOTA 182 UFS_UNLOCK(ump); 183 error = chkdq(ip, btodb(size), cred, 0); 184 if (error) 185 return (error); 186 UFS_LOCK(ump); 187#endif 188 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 189 goto nospace; 190 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && 191 freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 192 goto nospace; 193 if (bpref >= fs->fs_size) 194 bpref = 0; 195 if (bpref == 0) 196 cg = ino_to_cg(fs, ip->i_number); 197 else 198 cg = dtog(fs, bpref); 199 bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); 200 if (bno > 0) { 201 delta = btodb(size); 202 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 203 if (flags & IO_EXT) 204 ip->i_flag |= IN_CHANGE; 205 else 206 ip->i_flag |= IN_CHANGE | IN_UPDATE; 207 *bnp = bno; 208 return (0); 209 } 210nospace: 211#ifdef QUOTA 212 UFS_UNLOCK(ump); 213 /* 214 * Restore user's disk quota because allocation failed. 215 */ 216 (void) chkdq(ip, -btodb(size), cred, FORCE); 217 UFS_LOCK(ump); 218#endif 219 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 220 reclaimed = 1; 221 softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); 222 goto retry; 223 } 224 if (reclaimed > 0 && 225 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 226 UFS_UNLOCK(ump); 227 ffs_fserr(fs, ip->i_number, "filesystem full"); 228 uprintf("\n%s: write failed, filesystem is full\n", 229 fs->fs_fsmnt); 230 } else { 231 UFS_UNLOCK(ump); 232 } 233 return (ENOSPC); 234} 235 236/* 237 * Reallocate a fragment to a bigger size 238 * 239 * The number and size of the old block is given, and a preference 240 * and new size is also specified. The allocator attempts to extend 241 * the original block. Failing that, the regular block allocator is 242 * invoked to get an appropriate block. 243 */ 244int 245ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) 246 struct inode *ip; 247 ufs2_daddr_t lbprev; 248 ufs2_daddr_t bprev; 249 ufs2_daddr_t bpref; 250 int osize, nsize, flags; 251 struct ucred *cred; 252 struct buf **bpp; 253{ 254 struct vnode *vp; 255 struct fs *fs; 256 struct buf *bp; 257 struct ufsmount *ump; 258 u_int cg, request, reclaimed; 259 int error, gbflags; 260 ufs2_daddr_t bno; 261 int64_t delta; 262 263 vp = ITOV(ip); 264 ump = ITOUMP(ip); 265 fs = ump->um_fs; 266 bp = NULL; 267 gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 268 269 mtx_assert(UFS_MTX(ump), MA_OWNED); 270#ifdef INVARIANTS 271 if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 272 panic("ffs_realloccg: allocation on suspended filesystem"); 273 if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 274 (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 275 printf( 276 "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 277 devtoname(ump->um_dev), (long)fs->fs_bsize, osize, 278 nsize, fs->fs_fsmnt); 279 panic("ffs_realloccg: bad size"); 280 } 281 if (cred == NOCRED) 282 panic("ffs_realloccg: missing credential"); 283#endif /* INVARIANTS */ 284 reclaimed = 0; 285retry: 286 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && 287 freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { 288 goto nospace; 289 } 290 if (bprev == 0) { 291 printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 292 devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, 293 fs->fs_fsmnt); 294 panic("ffs_realloccg: bad bprev"); 295 } 296 UFS_UNLOCK(ump); 297 /* 298 * Allocate the extra space in the buffer. 299 */ 300 error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); 301 if (error) { 302 brelse(bp); 303 return (error); 304 } 305 306 if (bp->b_blkno == bp->b_lblkno) { 307 if (lbprev >= UFS_NDADDR) 308 panic("ffs_realloccg: lbprev out of range"); 309 bp->b_blkno = fsbtodb(fs, bprev); 310 } 311 312#ifdef QUOTA 313 error = chkdq(ip, btodb(nsize - osize), cred, 0); 314 if (error) { 315 brelse(bp); 316 return (error); 317 } 318#endif 319 /* 320 * Check for extension in the existing location. 321 */ 322 *bpp = NULL; 323 cg = dtog(fs, bprev); 324 UFS_LOCK(ump); 325 bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 326 if (bno) { 327 if (bp->b_blkno != fsbtodb(fs, bno)) 328 panic("ffs_realloccg: bad blockno"); 329 delta = btodb(nsize - osize); 330 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 331 if (flags & IO_EXT) 332 ip->i_flag |= IN_CHANGE; 333 else 334 ip->i_flag |= IN_CHANGE | IN_UPDATE; 335 allocbuf(bp, nsize); 336 bp->b_flags |= B_DONE; 337 vfs_bio_bzero_buf(bp, osize, nsize - osize); 338 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 339 vfs_bio_set_valid(bp, osize, nsize - osize); 340 *bpp = bp; 341 return (0); 342 } 343 /* 344 * Allocate a new disk location. 345 */ 346 if (bpref >= fs->fs_size) 347 bpref = 0; 348 switch ((int)fs->fs_optim) { 349 case FS_OPTSPACE: 350 /* 351 * Allocate an exact sized fragment. Although this makes 352 * best use of space, we will waste time relocating it if 353 * the file continues to grow. If the fragmentation is 354 * less than half of the minimum free reserve, we choose 355 * to begin optimizing for time. 356 */ 357 request = nsize; 358 if (fs->fs_minfree <= 5 || 359 fs->fs_cstotal.cs_nffree > 360 (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 361 break; 362 log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", 363 fs->fs_fsmnt); 364 fs->fs_optim = FS_OPTTIME; 365 break; 366 case FS_OPTTIME: 367 /* 368 * At this point we have discovered a file that is trying to 369 * grow a small fragment to a larger fragment. To save time, 370 * we allocate a full sized block, then free the unused portion. 371 * If the file continues to grow, the `ffs_fragextend' call 372 * above will be able to grow it in place without further 373 * copying. If aberrant programs cause disk fragmentation to 374 * grow within 2% of the free reserve, we choose to begin 375 * optimizing for space. 376 */ 377 request = fs->fs_bsize; 378 if (fs->fs_cstotal.cs_nffree < 379 (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 380 break; 381 log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", 382 fs->fs_fsmnt); 383 fs->fs_optim = FS_OPTSPACE; 384 break; 385 default: 386 printf("dev = %s, optim = %ld, fs = %s\n", 387 devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); 388 panic("ffs_realloccg: bad optim"); 389 /* NOTREACHED */ 390 } 391 bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); 392 if (bno > 0) { 393 bp->b_blkno = fsbtodb(fs, bno); 394 if (!DOINGSOFTDEP(vp)) 395 /* 396 * The usual case is that a smaller fragment that 397 * was just allocated has been replaced with a bigger 398 * fragment or a full-size block. If it is marked as 399 * B_DELWRI, the current contents have not been written 400 * to disk. It is possible that the block was written 401 * earlier, but very uncommon. If the block has never 402 * been written, there is no need to send a BIO_DELETE 403 * for it when it is freed. The gain from avoiding the 404 * TRIMs for the common case of unwritten blocks far 405 * exceeds the cost of the write amplification for the 406 * uncommon case of failing to send a TRIM for a block 407 * that had been written. 408 */ 409 ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, 410 ip->i_number, vp->v_type, NULL, 411 (bp->b_flags & B_DELWRI) != 0 ? 412 NOTRIM_KEY : SINGLETON_KEY); 413 delta = btodb(nsize - osize); 414 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 415 if (flags & IO_EXT) 416 ip->i_flag |= IN_CHANGE; 417 else 418 ip->i_flag |= IN_CHANGE | IN_UPDATE; 419 allocbuf(bp, nsize); 420 bp->b_flags |= B_DONE; 421 vfs_bio_bzero_buf(bp, osize, nsize - osize); 422 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 423 vfs_bio_set_valid(bp, osize, nsize - osize); 424 *bpp = bp; 425 return (0); 426 } 427#ifdef QUOTA 428 UFS_UNLOCK(ump); 429 /* 430 * Restore user's disk quota because allocation failed. 431 */ 432 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 433 UFS_LOCK(ump); 434#endif 435nospace: 436 /* 437 * no space available 438 */ 439 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 440 reclaimed = 1; 441 UFS_UNLOCK(ump); 442 if (bp) { 443 brelse(bp); 444 bp = NULL; 445 } 446 UFS_LOCK(ump); 447 softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); 448 goto retry; 449 } 450 if (reclaimed > 0 && 451 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 452 UFS_UNLOCK(ump); 453 ffs_fserr(fs, ip->i_number, "filesystem full"); 454 uprintf("\n%s: write failed, filesystem is full\n", 455 fs->fs_fsmnt); 456 } else { 457 UFS_UNLOCK(ump); 458 } 459 if (bp) 460 brelse(bp); 461 return (ENOSPC); 462} 463 464/* 465 * Reallocate a sequence of blocks into a contiguous sequence of blocks. 466 * 467 * The vnode and an array of buffer pointers for a range of sequential 468 * logical blocks to be made contiguous is given. The allocator attempts 469 * to find a range of sequential blocks starting as close as possible 470 * from the end of the allocation for the logical block immediately 471 * preceding the current range. If successful, the physical block numbers 472 * in the buffer pointers and in the inode are changed to reflect the new 473 * allocation. If unsuccessful, the allocation is left unchanged. The 474 * success in doing the reallocation is returned. Note that the error 475 * return is not reflected back to the user. Rather the previous block 476 * allocation will be used. 477 */ 478 479SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); 480 481static int doasyncfree = 1; 482SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, 483"do not force synchronous writes when blocks are reallocated"); 484 485static int doreallocblks = 1; 486SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, 487"enable block reallocation"); 488 489static int dotrimcons = 1; 490SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, 491"enable BIO_DELETE / TRIM consolidation"); 492 493static int maxclustersearch = 10; 494SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 4950, "max number of cylinder group to search for contigous blocks"); 496 497#ifdef DEBUG 498static volatile int prtrealloc = 0; 499#endif 500 501int 502ffs_reallocblks(ap) 503 struct vop_reallocblks_args /* { 504 struct vnode *a_vp; 505 struct cluster_save *a_buflist; 506 } */ *ap; 507{ 508 struct ufsmount *ump; 509 510 /* 511 * We used to skip reallocating the blocks of a file into a 512 * contiguous sequence if the underlying flash device requested 513 * BIO_DELETE notifications, because devices that benefit from 514 * BIO_DELETE also benefit from not moving the data. However, 515 * the destination for the data is usually moved before the data 516 * is written to the initially allocated location, so we rarely 517 * suffer the penalty of extra writes. With the addition of the 518 * consolidation of contiguous blocks into single BIO_DELETE 519 * operations, having fewer but larger contiguous blocks reduces 520 * the number of (slow and expensive) BIO_DELETE operations. So 521 * when doing BIO_DELETE consolidation, we do block reallocation. 522 * 523 * Skip if reallocblks has been disabled globally. 524 */ 525 ump = ap->a_vp->v_mount->mnt_data; 526 if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || 527 doreallocblks == 0) 528 return (ENOSPC); 529 530 /* 531 * We can't wait in softdep prealloc as it may fsync and recurse 532 * here. Instead we simply fail to reallocate blocks if this 533 * rare condition arises. 534 */ 535 if (DOINGSOFTDEP(ap->a_vp)) 536 if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 537 return (ENOSPC); 538 if (ump->um_fstype == UFS1) 539 return (ffs_reallocblks_ufs1(ap)); 540 return (ffs_reallocblks_ufs2(ap)); 541} 542 543static int 544ffs_reallocblks_ufs1(ap) 545 struct vop_reallocblks_args /* { 546 struct vnode *a_vp; 547 struct cluster_save *a_buflist; 548 } */ *ap; 549{ 550 struct fs *fs; 551 struct inode *ip; 552 struct vnode *vp; 553 struct buf *sbp, *ebp, *bp; 554 ufs1_daddr_t *bap, *sbap, *ebap; 555 struct cluster_save *buflist; 556 struct ufsmount *ump; 557 ufs_lbn_t start_lbn, end_lbn; 558 ufs1_daddr_t soff, newblk, blkno; 559 ufs2_daddr_t pref; 560 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 561 int i, cg, len, start_lvl, end_lvl, ssize; 562 563 vp = ap->a_vp; 564 ip = VTOI(vp); 565 ump = ITOUMP(ip); 566 fs = ump->um_fs; 567 /* 568 * If we are not tracking block clusters or if we have less than 4% 569 * free blocks left, then do not attempt to cluster. Running with 570 * less than 5% free block reserve is not recommended and those that 571 * choose to do so do not expect to have good file layout. 572 */ 573 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 574 return (ENOSPC); 575 buflist = ap->a_buflist; 576 len = buflist->bs_nchildren; 577 start_lbn = buflist->bs_children[0]->b_lblkno; 578 end_lbn = start_lbn + len - 1; 579#ifdef INVARIANTS 580 for (i = 0; i < len; i++) 581 if (!ffs_checkblk(ip, 582 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 583 panic("ffs_reallocblks: unallocated block 1"); 584 for (i = 1; i < len; i++) 585 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 586 panic("ffs_reallocblks: non-logical cluster"); 587 blkno = buflist->bs_children[0]->b_blkno; 588 ssize = fsbtodb(fs, fs->fs_frag); 589 for (i = 1; i < len - 1; i++) 590 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 591 panic("ffs_reallocblks: non-physical cluster %d", i); 592#endif 593 /* 594 * If the cluster crosses the boundary for the first indirect 595 * block, leave space for the indirect block. Indirect blocks 596 * are initially laid out in a position after the last direct 597 * block. Block reallocation would usually destroy locality by 598 * moving the indirect block out of the way to make room for 599 * data blocks if we didn't compensate here. We should also do 600 * this for other indirect block boundaries, but it is only 601 * important for the first one. 602 */ 603 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 604 return (ENOSPC); 605 /* 606 * If the latest allocation is in a new cylinder group, assume that 607 * the filesystem has decided to move and do not force it back to 608 * the previous cylinder group. 609 */ 610 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 611 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 612 return (ENOSPC); 613 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 614 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 615 return (ENOSPC); 616 /* 617 * Get the starting offset and block map for the first block. 618 */ 619 if (start_lvl == 0) { 620 sbap = &ip->i_din1->di_db[0]; 621 soff = start_lbn; 622 } else { 623 idp = &start_ap[start_lvl - 1]; 624 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 625 brelse(sbp); 626 return (ENOSPC); 627 } 628 sbap = (ufs1_daddr_t *)sbp->b_data; 629 soff = idp->in_off; 630 } 631 /* 632 * If the block range spans two block maps, get the second map. 633 */ 634 ebap = NULL; 635 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 636 ssize = len; 637 } else { 638#ifdef INVARIANTS 639 if (start_lvl > 0 && 640 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 641 panic("ffs_reallocblk: start == end"); 642#endif 643 ssize = len - (idp->in_off + 1); 644 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 645 goto fail; 646 ebap = (ufs1_daddr_t *)ebp->b_data; 647 } 648 /* 649 * Find the preferred location for the cluster. If we have not 650 * previously failed at this endeavor, then follow our standard 651 * preference calculation. If we have failed at it, then pick up 652 * where we last ended our search. 653 */ 654 UFS_LOCK(ump); 655 if (ip->i_nextclustercg == -1) 656 pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 657 else 658 pref = cgdata(fs, ip->i_nextclustercg); 659 /* 660 * Search the block map looking for an allocation of the desired size. 661 * To avoid wasting too much time, we limit the number of cylinder 662 * groups that we will search. 663 */ 664 cg = dtog(fs, pref); 665 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 666 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 667 break; 668 cg += 1; 669 if (cg >= fs->fs_ncg) 670 cg = 0; 671 } 672 /* 673 * If we have failed in our search, record where we gave up for 674 * next time. Otherwise, fall back to our usual search citerion. 675 */ 676 if (newblk == 0) { 677 ip->i_nextclustercg = cg; 678 UFS_UNLOCK(ump); 679 goto fail; 680 } 681 ip->i_nextclustercg = -1; 682 /* 683 * We have found a new contiguous block. 684 * 685 * First we have to replace the old block pointers with the new 686 * block pointers in the inode and indirect blocks associated 687 * with the file. 688 */ 689#ifdef DEBUG 690 if (prtrealloc) 691 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", 692 (uintmax_t)ip->i_number, 693 (intmax_t)start_lbn, (intmax_t)end_lbn); 694#endif 695 blkno = newblk; 696 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 697 if (i == ssize) { 698 bap = ebap; 699 soff = -i; 700 } 701#ifdef INVARIANTS 702 if (!ffs_checkblk(ip, 703 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 704 panic("ffs_reallocblks: unallocated block 2"); 705 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 706 panic("ffs_reallocblks: alloc mismatch"); 707#endif 708#ifdef DEBUG 709 if (prtrealloc) 710 printf(" %d,", *bap); 711#endif 712 if (DOINGSOFTDEP(vp)) { 713 if (sbap == &ip->i_din1->di_db[0] && i < ssize) 714 softdep_setup_allocdirect(ip, start_lbn + i, 715 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 716 buflist->bs_children[i]); 717 else 718 softdep_setup_allocindir_page(ip, start_lbn + i, 719 i < ssize ? sbp : ebp, soff + i, blkno, 720 *bap, buflist->bs_children[i]); 721 } 722 *bap++ = blkno; 723 } 724 /* 725 * Next we must write out the modified inode and indirect blocks. 726 * For strict correctness, the writes should be synchronous since 727 * the old block values may have been written to disk. In practise 728 * they are almost never written, but if we are concerned about 729 * strict correctness, the `doasyncfree' flag should be set to zero. 730 * 731 * The test on `doasyncfree' should be changed to test a flag 732 * that shows whether the associated buffers and inodes have 733 * been written. The flag should be set when the cluster is 734 * started and cleared whenever the buffer or inode is flushed. 735 * We can then check below to see if it is set, and do the 736 * synchronous write only when it has been cleared. 737 */ 738 if (sbap != &ip->i_din1->di_db[0]) { 739 if (doasyncfree) 740 bdwrite(sbp); 741 else 742 bwrite(sbp); 743 } else { 744 ip->i_flag |= IN_CHANGE | IN_UPDATE; 745 if (!doasyncfree) 746 ffs_update(vp, 1); 747 } 748 if (ssize < len) { 749 if (doasyncfree) 750 bdwrite(ebp); 751 else 752 bwrite(ebp); 753 } 754 /* 755 * Last, free the old blocks and assign the new blocks to the buffers. 756 */ 757#ifdef DEBUG 758 if (prtrealloc) 759 printf("\n\tnew:"); 760#endif 761 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 762 bp = buflist->bs_children[i]; 763 if (!DOINGSOFTDEP(vp)) 764 /* 765 * The usual case is that a set of N-contiguous blocks 766 * that was just allocated has been replaced with a 767 * set of N+1-contiguous blocks. If they are marked as 768 * B_DELWRI, the current contents have not been written 769 * to disk. It is possible that the blocks were written 770 * earlier, but very uncommon. If the blocks have never 771 * been written, there is no need to send a BIO_DELETE 772 * for them when they are freed. The gain from avoiding 773 * the TRIMs for the common case of unwritten blocks 774 * far exceeds the cost of the write amplification for 775 * the uncommon case of failing to send a TRIM for the 776 * blocks that had been written. 777 */ 778 ffs_blkfree(ump, fs, ump->um_devvp, 779 dbtofsb(fs, bp->b_blkno), 780 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 781 (bp->b_flags & B_DELWRI) != 0 ? 782 NOTRIM_KEY : SINGLETON_KEY); 783 bp->b_blkno = fsbtodb(fs, blkno); 784#ifdef INVARIANTS 785 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 786 panic("ffs_reallocblks: unallocated block 3"); 787#endif 788#ifdef DEBUG 789 if (prtrealloc) 790 printf(" %d,", blkno); 791#endif 792 } 793#ifdef DEBUG 794 if (prtrealloc) { 795 prtrealloc--; 796 printf("\n"); 797 } 798#endif 799 return (0); 800 801fail: 802 if (ssize < len) 803 brelse(ebp); 804 if (sbap != &ip->i_din1->di_db[0]) 805 brelse(sbp); 806 return (ENOSPC); 807} 808 809static int 810ffs_reallocblks_ufs2(ap) 811 struct vop_reallocblks_args /* { 812 struct vnode *a_vp; 813 struct cluster_save *a_buflist; 814 } */ *ap; 815{ 816 struct fs *fs; 817 struct inode *ip; 818 struct vnode *vp; 819 struct buf *sbp, *ebp, *bp; 820 ufs2_daddr_t *bap, *sbap, *ebap; 821 struct cluster_save *buflist; 822 struct ufsmount *ump; 823 ufs_lbn_t start_lbn, end_lbn; 824 ufs2_daddr_t soff, newblk, blkno, pref; 825 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 826 int i, cg, len, start_lvl, end_lvl, ssize; 827 828 vp = ap->a_vp; 829 ip = VTOI(vp); 830 ump = ITOUMP(ip); 831 fs = ump->um_fs; 832 /* 833 * If we are not tracking block clusters or if we have less than 4% 834 * free blocks left, then do not attempt to cluster. Running with 835 * less than 5% free block reserve is not recommended and those that 836 * choose to do so do not expect to have good file layout. 837 */ 838 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 839 return (ENOSPC); 840 buflist = ap->a_buflist; 841 len = buflist->bs_nchildren; 842 start_lbn = buflist->bs_children[0]->b_lblkno; 843 end_lbn = start_lbn + len - 1; 844#ifdef INVARIANTS 845 for (i = 0; i < len; i++) 846 if (!ffs_checkblk(ip, 847 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 848 panic("ffs_reallocblks: unallocated block 1"); 849 for (i = 1; i < len; i++) 850 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 851 panic("ffs_reallocblks: non-logical cluster"); 852 blkno = buflist->bs_children[0]->b_blkno; 853 ssize = fsbtodb(fs, fs->fs_frag); 854 for (i = 1; i < len - 1; i++) 855 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 856 panic("ffs_reallocblks: non-physical cluster %d", i); 857#endif 858 /* 859 * If the cluster crosses the boundary for the first indirect 860 * block, do not move anything in it. Indirect blocks are 861 * usually initially laid out in a position between the data 862 * blocks. Block reallocation would usually destroy locality by 863 * moving the indirect block out of the way to make room for 864 * data blocks if we didn't compensate here. We should also do 865 * this for other indirect block boundaries, but it is only 866 * important for the first one. 867 */ 868 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 869 return (ENOSPC); 870 /* 871 * If the latest allocation is in a new cylinder group, assume that 872 * the filesystem has decided to move and do not force it back to 873 * the previous cylinder group. 874 */ 875 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 876 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 877 return (ENOSPC); 878 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 879 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 880 return (ENOSPC); 881 /* 882 * Get the starting offset and block map for the first block. 883 */ 884 if (start_lvl == 0) { 885 sbap = &ip->i_din2->di_db[0]; 886 soff = start_lbn; 887 } else { 888 idp = &start_ap[start_lvl - 1]; 889 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 890 brelse(sbp); 891 return (ENOSPC); 892 } 893 sbap = (ufs2_daddr_t *)sbp->b_data; 894 soff = idp->in_off; 895 } 896 /* 897 * If the block range spans two block maps, get the second map. 898 */ 899 ebap = NULL; 900 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 901 ssize = len; 902 } else { 903#ifdef INVARIANTS 904 if (start_lvl > 0 && 905 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 906 panic("ffs_reallocblk: start == end"); 907#endif 908 ssize = len - (idp->in_off + 1); 909 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 910 goto fail; 911 ebap = (ufs2_daddr_t *)ebp->b_data; 912 } 913 /* 914 * Find the preferred location for the cluster. If we have not 915 * previously failed at this endeavor, then follow our standard 916 * preference calculation. If we have failed at it, then pick up 917 * where we last ended our search. 918 */ 919 UFS_LOCK(ump); 920 if (ip->i_nextclustercg == -1) 921 pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 922 else 923 pref = cgdata(fs, ip->i_nextclustercg); 924 /* 925 * Search the block map looking for an allocation of the desired size. 926 * To avoid wasting too much time, we limit the number of cylinder 927 * groups that we will search. 928 */ 929 cg = dtog(fs, pref); 930 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 931 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 932 break; 933 cg += 1; 934 if (cg >= fs->fs_ncg) 935 cg = 0; 936 } 937 /* 938 * If we have failed in our search, record where we gave up for 939 * next time. Otherwise, fall back to our usual search citerion. 940 */ 941 if (newblk == 0) { 942 ip->i_nextclustercg = cg; 943 UFS_UNLOCK(ump); 944 goto fail; 945 } 946 ip->i_nextclustercg = -1; 947 /* 948 * We have found a new contiguous block. 949 * 950 * First we have to replace the old block pointers with the new 951 * block pointers in the inode and indirect blocks associated 952 * with the file. 953 */ 954#ifdef DEBUG 955 if (prtrealloc) 956 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, 957 (intmax_t)start_lbn, (intmax_t)end_lbn); 958#endif 959 blkno = newblk; 960 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 961 if (i == ssize) { 962 bap = ebap; 963 soff = -i; 964 } 965#ifdef INVARIANTS 966 if (!ffs_checkblk(ip, 967 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 968 panic("ffs_reallocblks: unallocated block 2"); 969 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 970 panic("ffs_reallocblks: alloc mismatch"); 971#endif 972#ifdef DEBUG 973 if (prtrealloc) 974 printf(" %jd,", (intmax_t)*bap); 975#endif 976 if (DOINGSOFTDEP(vp)) { 977 if (sbap == &ip->i_din2->di_db[0] && i < ssize) 978 softdep_setup_allocdirect(ip, start_lbn + i, 979 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 980 buflist->bs_children[i]); 981 else 982 softdep_setup_allocindir_page(ip, start_lbn + i, 983 i < ssize ? sbp : ebp, soff + i, blkno, 984 *bap, buflist->bs_children[i]); 985 } 986 *bap++ = blkno; 987 } 988 /* 989 * Next we must write out the modified inode and indirect blocks. 990 * For strict correctness, the writes should be synchronous since 991 * the old block values may have been written to disk. In practise 992 * they are almost never written, but if we are concerned about 993 * strict correctness, the `doasyncfree' flag should be set to zero. 994 * 995 * The test on `doasyncfree' should be changed to test a flag 996 * that shows whether the associated buffers and inodes have 997 * been written. The flag should be set when the cluster is 998 * started and cleared whenever the buffer or inode is flushed. 999 * We can then check below to see if it is set, and do the 1000 * synchronous write only when it has been cleared. 1001 */ 1002 if (sbap != &ip->i_din2->di_db[0]) { 1003 if (doasyncfree) 1004 bdwrite(sbp); 1005 else 1006 bwrite(sbp); 1007 } else { 1008 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1009 if (!doasyncfree) 1010 ffs_update(vp, 1); 1011 } 1012 if (ssize < len) { 1013 if (doasyncfree) 1014 bdwrite(ebp); 1015 else 1016 bwrite(ebp); 1017 } 1018 /* 1019 * Last, free the old blocks and assign the new blocks to the buffers. 1020 */ 1021#ifdef DEBUG 1022 if (prtrealloc) 1023 printf("\n\tnew:"); 1024#endif 1025 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 1026 bp = buflist->bs_children[i]; 1027 if (!DOINGSOFTDEP(vp)) 1028 /* 1029 * The usual case is that a set of N-contiguous blocks 1030 * that was just allocated has been replaced with a 1031 * set of N+1-contiguous blocks. If they are marked as 1032 * B_DELWRI, the current contents have not been written 1033 * to disk. It is possible that the blocks were written 1034 * earlier, but very uncommon. If the blocks have never 1035 * been written, there is no need to send a BIO_DELETE 1036 * for them when they are freed. The gain from avoiding 1037 * the TRIMs for the common case of unwritten blocks 1038 * far exceeds the cost of the write amplification for 1039 * the uncommon case of failing to send a TRIM for the 1040 * blocks that had been written. 1041 */ 1042 ffs_blkfree(ump, fs, ump->um_devvp, 1043 dbtofsb(fs, bp->b_blkno), 1044 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 1045 (bp->b_flags & B_DELWRI) != 0 ? 1046 NOTRIM_KEY : SINGLETON_KEY); 1047 bp->b_blkno = fsbtodb(fs, blkno); 1048#ifdef INVARIANTS 1049 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 1050 panic("ffs_reallocblks: unallocated block 3"); 1051#endif 1052#ifdef DEBUG 1053 if (prtrealloc) 1054 printf(" %jd,", (intmax_t)blkno); 1055#endif 1056 } 1057#ifdef DEBUG 1058 if (prtrealloc) { 1059 prtrealloc--; 1060 printf("\n"); 1061 } 1062#endif 1063 return (0); 1064 1065fail: 1066 if (ssize < len) 1067 brelse(ebp); 1068 if (sbap != &ip->i_din2->di_db[0]) 1069 brelse(sbp); 1070 return (ENOSPC); 1071} 1072 1073/* 1074 * Allocate an inode in the filesystem. 1075 * 1076 * If allocating a directory, use ffs_dirpref to select the inode. 1077 * If allocating in a directory, the following hierarchy is followed: 1078 * 1) allocate the preferred inode. 1079 * 2) allocate an inode in the same cylinder group. 1080 * 3) quadradically rehash into other cylinder groups, until an 1081 * available inode is located. 1082 * If no inode preference is given the following hierarchy is used 1083 * to allocate an inode: 1084 * 1) allocate an inode in cylinder group 0. 1085 * 2) quadradically rehash into other cylinder groups, until an 1086 * available inode is located. 1087 */ 1088int 1089ffs_valloc(pvp, mode, cred, vpp) 1090 struct vnode *pvp; 1091 int mode; 1092 struct ucred *cred; 1093 struct vnode **vpp; 1094{ 1095 struct inode *pip; 1096 struct fs *fs; 1097 struct inode *ip; 1098 struct timespec ts; 1099 struct ufsmount *ump; 1100 ino_t ino, ipref; 1101 u_int cg; 1102 int error, error1, reclaimed; 1103 1104 *vpp = NULL; 1105 pip = VTOI(pvp); 1106 ump = ITOUMP(pip); 1107 fs = ump->um_fs; 1108 1109 UFS_LOCK(ump); 1110 reclaimed = 0; 1111retry: 1112 if (fs->fs_cstotal.cs_nifree == 0) 1113 goto noinodes; 1114 1115 if ((mode & IFMT) == IFDIR) 1116 ipref = ffs_dirpref(pip); 1117 else 1118 ipref = pip->i_number; 1119 if (ipref >= fs->fs_ncg * fs->fs_ipg) 1120 ipref = 0; 1121 cg = ino_to_cg(fs, ipref); 1122 /* 1123 * Track number of dirs created one after another 1124 * in a same cg without intervening by files. 1125 */ 1126 if ((mode & IFMT) == IFDIR) { 1127 if (fs->fs_contigdirs[cg] < 255) 1128 fs->fs_contigdirs[cg]++; 1129 } else { 1130 if (fs->fs_contigdirs[cg] > 0) 1131 fs->fs_contigdirs[cg]--; 1132 } 1133 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 1134 (allocfcn_t *)ffs_nodealloccg); 1135 if (ino == 0) 1136 goto noinodes; 1137 error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp); 1138 if (error) { 1139 error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, 1140 FFSV_FORCEINSMQ); 1141 ffs_vfree(pvp, ino, mode); 1142 if (error1 == 0) { 1143 ip = VTOI(*vpp); 1144 if (ip->i_mode) 1145 goto dup_alloc; 1146 ip->i_flag |= IN_MODIFIED; 1147 vput(*vpp); 1148 } 1149 return (error); 1150 } 1151 ip = VTOI(*vpp); 1152 if (ip->i_mode) { 1153dup_alloc: 1154 printf("mode = 0%o, inum = %ju, fs = %s\n", 1155 ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); 1156 panic("ffs_valloc: dup alloc"); 1157 } 1158 if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ 1159 printf("free inode %s/%lu had %ld blocks\n", 1160 fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); 1161 DIP_SET(ip, i_blocks, 0); 1162 } 1163 ip->i_flags = 0; 1164 DIP_SET(ip, i_flags, 0); 1165 /* 1166 * Set up a new generation number for this inode. 1167 */ 1168 while (ip->i_gen == 0 || ++ip->i_gen == 0) 1169 ip->i_gen = arc4random(); 1170 DIP_SET(ip, i_gen, ip->i_gen); 1171 if (fs->fs_magic == FS_UFS2_MAGIC) { 1172 vfs_timestamp(&ts); 1173 ip->i_din2->di_birthtime = ts.tv_sec; 1174 ip->i_din2->di_birthnsec = ts.tv_nsec; 1175 } 1176 ufs_prepare_reclaim(*vpp); 1177 ip->i_flag = 0; 1178 (*vpp)->v_vflag = 0; 1179 (*vpp)->v_type = VNON; 1180 if (fs->fs_magic == FS_UFS2_MAGIC) { 1181 (*vpp)->v_op = &ffs_vnodeops2; 1182 ip->i_flag |= IN_UFS2; 1183 } else { 1184 (*vpp)->v_op = &ffs_vnodeops1; 1185 } 1186 return (0); 1187noinodes: 1188 if (reclaimed == 0) { 1189 reclaimed = 1; 1190 softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); 1191 goto retry; 1192 } 1193 if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 1194 UFS_UNLOCK(ump); 1195 ffs_fserr(fs, pip->i_number, "out of inodes"); 1196 uprintf("\n%s: create/symlink failed, no inodes free\n", 1197 fs->fs_fsmnt); 1198 } else { 1199 UFS_UNLOCK(ump); 1200 } 1201 return (ENOSPC); 1202} 1203 1204/* 1205 * Find a cylinder group to place a directory. 1206 * 1207 * The policy implemented by this algorithm is to allocate a 1208 * directory inode in the same cylinder group as its parent 1209 * directory, but also to reserve space for its files inodes 1210 * and data. Restrict the number of directories which may be 1211 * allocated one after another in the same cylinder group 1212 * without intervening allocation of files. 1213 * 1214 * If we allocate a first level directory then force allocation 1215 * in another cylinder group. 1216 */ 1217static ino_t 1218ffs_dirpref(pip) 1219 struct inode *pip; 1220{ 1221 struct fs *fs; 1222 int cg, prefcg, dirsize, cgsize; 1223 u_int avgifree, avgbfree, avgndir, curdirsize; 1224 u_int minifree, minbfree, maxndir; 1225 u_int mincg, minndir; 1226 u_int maxcontigdirs; 1227 1228 mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); 1229 fs = ITOFS(pip); 1230 1231 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 1232 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1233 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 1234 1235 /* 1236 * Force allocation in another cg if creating a first level dir. 1237 */ 1238 ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref"); 1239 if (ITOV(pip)->v_vflag & VV_ROOT) { 1240 prefcg = arc4random() % fs->fs_ncg; 1241 mincg = prefcg; 1242 minndir = fs->fs_ipg; 1243 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1244 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 1245 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 1246 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1247 mincg = cg; 1248 minndir = fs->fs_cs(fs, cg).cs_ndir; 1249 } 1250 for (cg = 0; cg < prefcg; cg++) 1251 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 1252 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 1253 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1254 mincg = cg; 1255 minndir = fs->fs_cs(fs, cg).cs_ndir; 1256 } 1257 return ((ino_t)(fs->fs_ipg * mincg)); 1258 } 1259 1260 /* 1261 * Count various limits which used for 1262 * optimal allocation of a directory inode. 1263 */ 1264 maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); 1265 minifree = avgifree - avgifree / 4; 1266 if (minifree < 1) 1267 minifree = 1; 1268 minbfree = avgbfree - avgbfree / 4; 1269 if (minbfree < 1) 1270 minbfree = 1; 1271 cgsize = fs->fs_fsize * fs->fs_fpg; 1272 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 1273 curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; 1274 if (dirsize < curdirsize) 1275 dirsize = curdirsize; 1276 if (dirsize <= 0) 1277 maxcontigdirs = 0; /* dirsize overflowed */ 1278 else 1279 maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 1280 if (fs->fs_avgfpdir > 0) 1281 maxcontigdirs = min(maxcontigdirs, 1282 fs->fs_ipg / fs->fs_avgfpdir); 1283 if (maxcontigdirs == 0) 1284 maxcontigdirs = 1; 1285 1286 /* 1287 * Limit number of dirs in one cg and reserve space for 1288 * regular files, but only if we have no deficit in 1289 * inodes or space. 1290 * 1291 * We are trying to find a suitable cylinder group nearby 1292 * our preferred cylinder group to place a new directory. 1293 * We scan from our preferred cylinder group forward looking 1294 * for a cylinder group that meets our criterion. If we get 1295 * to the final cylinder group and do not find anything, 1296 * we start scanning forwards from the beginning of the 1297 * filesystem. While it might seem sensible to start scanning 1298 * backwards or even to alternate looking forward and backward, 1299 * this approach fails badly when the filesystem is nearly full. 1300 * Specifically, we first search all the areas that have no space 1301 * and finally try the one preceding that. We repeat this on 1302 * every request and in the case of the final block end up 1303 * searching the entire filesystem. By jumping to the front 1304 * of the filesystem, our future forward searches always look 1305 * in new cylinder groups so finds every possible block after 1306 * one pass over the filesystem. 1307 */ 1308 prefcg = ino_to_cg(fs, pip->i_number); 1309 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1310 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1311 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1312 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1313 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1314 return ((ino_t)(fs->fs_ipg * cg)); 1315 } 1316 for (cg = 0; cg < prefcg; cg++) 1317 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1318 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1319 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1320 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1321 return ((ino_t)(fs->fs_ipg * cg)); 1322 } 1323 /* 1324 * This is a backstop when we have deficit in space. 1325 */ 1326 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1327 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1328 return ((ino_t)(fs->fs_ipg * cg)); 1329 for (cg = 0; cg < prefcg; cg++) 1330 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1331 break; 1332 return ((ino_t)(fs->fs_ipg * cg)); 1333} 1334 1335/* 1336 * Select the desired position for the next block in a file. The file is 1337 * logically divided into sections. The first section is composed of the 1338 * direct blocks and the next fs_maxbpg blocks. Each additional section 1339 * contains fs_maxbpg blocks. 1340 * 1341 * If no blocks have been allocated in the first section, the policy is to 1342 * request a block in the same cylinder group as the inode that describes 1343 * the file. The first indirect is allocated immediately following the last 1344 * direct block and the data blocks for the first indirect immediately 1345 * follow it. 1346 * 1347 * If no blocks have been allocated in any other section, the indirect 1348 * block(s) are allocated in the same cylinder group as its inode in an 1349 * area reserved immediately following the inode blocks. The policy for 1350 * the data blocks is to place them in a cylinder group with a greater than 1351 * average number of free blocks. An appropriate cylinder group is found 1352 * by using a rotor that sweeps the cylinder groups. When a new group of 1353 * blocks is needed, the sweep begins in the cylinder group following the 1354 * cylinder group from which the previous allocation was made. The sweep 1355 * continues until a cylinder group with greater than the average number 1356 * of free blocks is found. If the allocation is for the first block in an 1357 * indirect block or the previous block is a hole, then the information on 1358 * the previous allocation is unavailable; here a best guess is made based 1359 * on the logical block number being allocated. 1360 * 1361 * If a section is already partially allocated, the policy is to 1362 * allocate blocks contiguously within the section if possible. 1363 */ 1364ufs2_daddr_t 1365ffs_blkpref_ufs1(ip, lbn, indx, bap) 1366 struct inode *ip; 1367 ufs_lbn_t lbn; 1368 int indx; 1369 ufs1_daddr_t *bap; 1370{ 1371 struct fs *fs; 1372 u_int cg, inocg; 1373 u_int avgbfree, startcg; 1374 ufs2_daddr_t pref, prevbn; 1375 1376 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1377 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1378 fs = ITOFS(ip); 1379 /* 1380 * Allocation of indirect blocks is indicated by passing negative 1381 * values in indx: -1 for single indirect, -2 for double indirect, 1382 * -3 for triple indirect. As noted below, we attempt to allocate 1383 * the first indirect inline with the file data. For all later 1384 * indirect blocks, the data is often allocated in other cylinder 1385 * groups. However to speed random file access and to speed up 1386 * fsck, the filesystem reserves the first fs_metaspace blocks 1387 * (typically half of fs_minfree) of the data area of each cylinder 1388 * group to hold these later indirect blocks. 1389 */ 1390 inocg = ino_to_cg(fs, ip->i_number); 1391 if (indx < 0) { 1392 /* 1393 * Our preference for indirect blocks is the zone at the 1394 * beginning of the inode's cylinder group data area that 1395 * we try to reserve for indirect blocks. 1396 */ 1397 pref = cgmeta(fs, inocg); 1398 /* 1399 * If we are allocating the first indirect block, try to 1400 * place it immediately following the last direct block. 1401 */ 1402 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1403 ip->i_din1->di_db[UFS_NDADDR - 1] != 0) 1404 pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1405 return (pref); 1406 } 1407 /* 1408 * If we are allocating the first data block in the first indirect 1409 * block and the indirect has been allocated in the data block area, 1410 * try to place it immediately following the indirect block. 1411 */ 1412 if (lbn == UFS_NDADDR) { 1413 pref = ip->i_din1->di_ib[0]; 1414 if (pref != 0 && pref >= cgdata(fs, inocg) && 1415 pref < cgbase(fs, inocg + 1)) 1416 return (pref + fs->fs_frag); 1417 } 1418 /* 1419 * If we are at the beginning of a file, or we have already allocated 1420 * the maximum number of blocks per cylinder group, or we do not 1421 * have a block allocated immediately preceding us, then we need 1422 * to decide where to start allocating new blocks. 1423 */ 1424 if (indx == 0) { 1425 prevbn = 0; 1426 } else { 1427 prevbn = bap[indx - 1]; 1428 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1429 fs->fs_bsize) != 0) 1430 prevbn = 0; 1431 } 1432 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1433 /* 1434 * If we are allocating a directory data block, we want 1435 * to place it in the metadata area. 1436 */ 1437 if ((ip->i_mode & IFMT) == IFDIR) 1438 return (cgmeta(fs, inocg)); 1439 /* 1440 * Until we fill all the direct and all the first indirect's 1441 * blocks, we try to allocate in the data area of the inode's 1442 * cylinder group. 1443 */ 1444 if (lbn < UFS_NDADDR + NINDIR(fs)) 1445 return (cgdata(fs, inocg)); 1446 /* 1447 * Find a cylinder with greater than average number of 1448 * unused data blocks. 1449 */ 1450 if (indx == 0 || prevbn == 0) 1451 startcg = inocg + lbn / fs->fs_maxbpg; 1452 else 1453 startcg = dtog(fs, prevbn) + 1; 1454 startcg %= fs->fs_ncg; 1455 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1456 for (cg = startcg; cg < fs->fs_ncg; cg++) 1457 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1458 fs->fs_cgrotor = cg; 1459 return (cgdata(fs, cg)); 1460 } 1461 for (cg = 0; cg <= startcg; cg++) 1462 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1463 fs->fs_cgrotor = cg; 1464 return (cgdata(fs, cg)); 1465 } 1466 return (0); 1467 } 1468 /* 1469 * Otherwise, we just always try to lay things out contiguously. 1470 */ 1471 return (prevbn + fs->fs_frag); 1472} 1473 1474/* 1475 * Same as above, but for UFS2 1476 */ 1477ufs2_daddr_t 1478ffs_blkpref_ufs2(ip, lbn, indx, bap) 1479 struct inode *ip; 1480 ufs_lbn_t lbn; 1481 int indx; 1482 ufs2_daddr_t *bap; 1483{ 1484 struct fs *fs; 1485 u_int cg, inocg; 1486 u_int avgbfree, startcg; 1487 ufs2_daddr_t pref, prevbn; 1488 1489 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1490 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1491 fs = ITOFS(ip); 1492 /* 1493 * Allocation of indirect blocks is indicated by passing negative 1494 * values in indx: -1 for single indirect, -2 for double indirect, 1495 * -3 for triple indirect. As noted below, we attempt to allocate 1496 * the first indirect inline with the file data. For all later 1497 * indirect blocks, the data is often allocated in other cylinder 1498 * groups. However to speed random file access and to speed up 1499 * fsck, the filesystem reserves the first fs_metaspace blocks 1500 * (typically half of fs_minfree) of the data area of each cylinder 1501 * group to hold these later indirect blocks. 1502 */ 1503 inocg = ino_to_cg(fs, ip->i_number); 1504 if (indx < 0) { 1505 /* 1506 * Our preference for indirect blocks is the zone at the 1507 * beginning of the inode's cylinder group data area that 1508 * we try to reserve for indirect blocks. 1509 */ 1510 pref = cgmeta(fs, inocg); 1511 /* 1512 * If we are allocating the first indirect block, try to 1513 * place it immediately following the last direct block. 1514 */ 1515 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1516 ip->i_din2->di_db[UFS_NDADDR - 1] != 0) 1517 pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1518 return (pref); 1519 } 1520 /* 1521 * If we are allocating the first data block in the first indirect 1522 * block and the indirect has been allocated in the data block area, 1523 * try to place it immediately following the indirect block. 1524 */ 1525 if (lbn == UFS_NDADDR) { 1526 pref = ip->i_din2->di_ib[0]; 1527 if (pref != 0 && pref >= cgdata(fs, inocg) && 1528 pref < cgbase(fs, inocg + 1)) 1529 return (pref + fs->fs_frag); 1530 } 1531 /* 1532 * If we are at the beginning of a file, or we have already allocated 1533 * the maximum number of blocks per cylinder group, or we do not 1534 * have a block allocated immediately preceding us, then we need 1535 * to decide where to start allocating new blocks. 1536 */ 1537 if (indx == 0) { 1538 prevbn = 0; 1539 } else { 1540 prevbn = bap[indx - 1]; 1541 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1542 fs->fs_bsize) != 0) 1543 prevbn = 0; 1544 } 1545 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1546 /* 1547 * If we are allocating a directory data block, we want 1548 * to place it in the metadata area. 1549 */ 1550 if ((ip->i_mode & IFMT) == IFDIR) 1551 return (cgmeta(fs, inocg)); 1552 /* 1553 * Until we fill all the direct and all the first indirect's 1554 * blocks, we try to allocate in the data area of the inode's 1555 * cylinder group. 1556 */ 1557 if (lbn < UFS_NDADDR + NINDIR(fs)) 1558 return (cgdata(fs, inocg)); 1559 /* 1560 * Find a cylinder with greater than average number of 1561 * unused data blocks. 1562 */ 1563 if (indx == 0 || prevbn == 0) 1564 startcg = inocg + lbn / fs->fs_maxbpg; 1565 else 1566 startcg = dtog(fs, prevbn) + 1; 1567 startcg %= fs->fs_ncg; 1568 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1569 for (cg = startcg; cg < fs->fs_ncg; cg++) 1570 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1571 fs->fs_cgrotor = cg; 1572 return (cgdata(fs, cg)); 1573 } 1574 for (cg = 0; cg <= startcg; cg++) 1575 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1576 fs->fs_cgrotor = cg; 1577 return (cgdata(fs, cg)); 1578 } 1579 return (0); 1580 } 1581 /* 1582 * Otherwise, we just always try to lay things out contiguously. 1583 */ 1584 return (prevbn + fs->fs_frag); 1585} 1586 1587/* 1588 * Implement the cylinder overflow algorithm. 1589 * 1590 * The policy implemented by this algorithm is: 1591 * 1) allocate the block in its requested cylinder group. 1592 * 2) quadradically rehash on the cylinder group number. 1593 * 3) brute force search for a free block. 1594 * 1595 * Must be called with the UFS lock held. Will release the lock on success 1596 * and return with it held on failure. 1597 */ 1598/*VARARGS5*/ 1599static ufs2_daddr_t 1600ffs_hashalloc(ip, cg, pref, size, rsize, allocator) 1601 struct inode *ip; 1602 u_int cg; 1603 ufs2_daddr_t pref; 1604 int size; /* Search size for data blocks, mode for inodes */ 1605 int rsize; /* Real allocated size. */ 1606 allocfcn_t *allocator; 1607{ 1608 struct fs *fs; 1609 ufs2_daddr_t result; 1610 u_int i, icg = cg; 1611 1612 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1613#ifdef INVARIANTS 1614 if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 1615 panic("ffs_hashalloc: allocation on suspended filesystem"); 1616#endif 1617 fs = ITOFS(ip); 1618 /* 1619 * 1: preferred cylinder group 1620 */ 1621 result = (*allocator)(ip, cg, pref, size, rsize); 1622 if (result) 1623 return (result); 1624 /* 1625 * 2: quadratic rehash 1626 */ 1627 for (i = 1; i < fs->fs_ncg; i *= 2) { 1628 cg += i; 1629 if (cg >= fs->fs_ncg) 1630 cg -= fs->fs_ncg; 1631 result = (*allocator)(ip, cg, 0, size, rsize); 1632 if (result) 1633 return (result); 1634 } 1635 /* 1636 * 3: brute force search 1637 * Note that we start at i == 2, since 0 was checked initially, 1638 * and 1 is always checked in the quadratic rehash. 1639 */ 1640 cg = (icg + 2) % fs->fs_ncg; 1641 for (i = 2; i < fs->fs_ncg; i++) { 1642 result = (*allocator)(ip, cg, 0, size, rsize); 1643 if (result) 1644 return (result); 1645 cg++; 1646 if (cg == fs->fs_ncg) 1647 cg = 0; 1648 } 1649 return (0); 1650} 1651 1652/* 1653 * Determine whether a fragment can be extended. 1654 * 1655 * Check to see if the necessary fragments are available, and 1656 * if they are, allocate them. 1657 */ 1658static ufs2_daddr_t 1659ffs_fragextend(ip, cg, bprev, osize, nsize) 1660 struct inode *ip; 1661 u_int cg; 1662 ufs2_daddr_t bprev; 1663 int osize, nsize; 1664{ 1665 struct fs *fs; 1666 struct cg *cgp; 1667 struct buf *bp; 1668 struct ufsmount *ump; 1669 int nffree; 1670 long bno; 1671 int frags, bbase; 1672 int i, error; 1673 u_int8_t *blksfree; 1674 1675 ump = ITOUMP(ip); 1676 fs = ump->um_fs; 1677 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 1678 return (0); 1679 frags = numfrags(fs, nsize); 1680 bbase = fragnum(fs, bprev); 1681 if (bbase > fragnum(fs, (bprev + frags - 1))) { 1682 /* cannot extend across a block boundary */ 1683 return (0); 1684 } 1685 UFS_UNLOCK(ump); 1686 if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) 1687 goto fail; 1688 bno = dtogd(fs, bprev); 1689 blksfree = cg_blksfree(cgp); 1690 for (i = numfrags(fs, osize); i < frags; i++) 1691 if (isclr(blksfree, bno + i)) 1692 goto fail; 1693 /* 1694 * the current fragment can be extended 1695 * deduct the count on fragment being extended into 1696 * increase the count on the remaining fragment (if any) 1697 * allocate the extended piece 1698 */ 1699 for (i = frags; i < fs->fs_frag - bbase; i++) 1700 if (isclr(blksfree, bno + i)) 1701 break; 1702 cgp->cg_frsum[i - numfrags(fs, osize)]--; 1703 if (i != frags) 1704 cgp->cg_frsum[i - frags]++; 1705 for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { 1706 clrbit(blksfree, bno + i); 1707 cgp->cg_cs.cs_nffree--; 1708 nffree++; 1709 } 1710 UFS_LOCK(ump); 1711 fs->fs_cstotal.cs_nffree -= nffree; 1712 fs->fs_cs(fs, cg).cs_nffree -= nffree; 1713 fs->fs_fmod = 1; 1714 ACTIVECLEAR(fs, cg); 1715 UFS_UNLOCK(ump); 1716 if (DOINGSOFTDEP(ITOV(ip))) 1717 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, 1718 frags, numfrags(fs, osize)); 1719 bdwrite(bp); 1720 return (bprev); 1721 1722fail: 1723 brelse(bp); 1724 UFS_LOCK(ump); 1725 return (0); 1726 1727} 1728 1729/* 1730 * Determine whether a block can be allocated. 1731 * 1732 * Check to see if a block of the appropriate size is available, 1733 * and if it is, allocate it. 1734 */ 1735static ufs2_daddr_t 1736ffs_alloccg(ip, cg, bpref, size, rsize) 1737 struct inode *ip; 1738 u_int cg; 1739 ufs2_daddr_t bpref; 1740 int size; 1741 int rsize; 1742{ 1743 struct fs *fs; 1744 struct cg *cgp; 1745 struct buf *bp; 1746 struct ufsmount *ump; 1747 ufs1_daddr_t bno; 1748 ufs2_daddr_t blkno; 1749 int i, allocsiz, error, frags; 1750 u_int8_t *blksfree; 1751 1752 ump = ITOUMP(ip); 1753 fs = ump->um_fs; 1754 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1755 return (0); 1756 UFS_UNLOCK(ump); 1757 if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0 || 1758 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) 1759 goto fail; 1760 if (size == fs->fs_bsize) { 1761 UFS_LOCK(ump); 1762 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1763 ACTIVECLEAR(fs, cg); 1764 UFS_UNLOCK(ump); 1765 bdwrite(bp); 1766 return (blkno); 1767 } 1768 /* 1769 * check to see if any fragments are already available 1770 * allocsiz is the size which will be allocated, hacking 1771 * it down to a smaller size if necessary 1772 */ 1773 blksfree = cg_blksfree(cgp); 1774 frags = numfrags(fs, size); 1775 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1776 if (cgp->cg_frsum[allocsiz] != 0) 1777 break; 1778 if (allocsiz == fs->fs_frag) { 1779 /* 1780 * no fragments were available, so a block will be 1781 * allocated, and hacked up 1782 */ 1783 if (cgp->cg_cs.cs_nbfree == 0) 1784 goto fail; 1785 UFS_LOCK(ump); 1786 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1787 ACTIVECLEAR(fs, cg); 1788 UFS_UNLOCK(ump); 1789 bdwrite(bp); 1790 return (blkno); 1791 } 1792 KASSERT(size == rsize, 1793 ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); 1794 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1795 if (bno < 0) 1796 goto fail; 1797 for (i = 0; i < frags; i++) 1798 clrbit(blksfree, bno + i); 1799 cgp->cg_cs.cs_nffree -= frags; 1800 cgp->cg_frsum[allocsiz]--; 1801 if (frags != allocsiz) 1802 cgp->cg_frsum[allocsiz - frags]++; 1803 UFS_LOCK(ump); 1804 fs->fs_cstotal.cs_nffree -= frags; 1805 fs->fs_cs(fs, cg).cs_nffree -= frags; 1806 fs->fs_fmod = 1; 1807 blkno = cgbase(fs, cg) + bno; 1808 ACTIVECLEAR(fs, cg); 1809 UFS_UNLOCK(ump); 1810 if (DOINGSOFTDEP(ITOV(ip))) 1811 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); 1812 bdwrite(bp); 1813 return (blkno); 1814 1815fail: 1816 brelse(bp); 1817 UFS_LOCK(ump); 1818 return (0); 1819} 1820 1821/* 1822 * Allocate a block in a cylinder group. 1823 * 1824 * This algorithm implements the following policy: 1825 * 1) allocate the requested block. 1826 * 2) allocate a rotationally optimal block in the same cylinder. 1827 * 3) allocate the next available block on the block rotor for the 1828 * specified cylinder group. 1829 * Note that this routine only allocates fs_bsize blocks; these 1830 * blocks may be fragmented by the routine that allocates them. 1831 */ 1832static ufs2_daddr_t 1833ffs_alloccgblk(ip, bp, bpref, size) 1834 struct inode *ip; 1835 struct buf *bp; 1836 ufs2_daddr_t bpref; 1837 int size; 1838{ 1839 struct fs *fs; 1840 struct cg *cgp; 1841 struct ufsmount *ump; 1842 ufs1_daddr_t bno; 1843 ufs2_daddr_t blkno; 1844 u_int8_t *blksfree; 1845 int i, cgbpref; 1846 1847 ump = ITOUMP(ip); 1848 fs = ump->um_fs; 1849 mtx_assert(UFS_MTX(ump), MA_OWNED); 1850 cgp = (struct cg *)bp->b_data; 1851 blksfree = cg_blksfree(cgp); 1852 if (bpref == 0) { 1853 bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; 1854 } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { 1855 /* map bpref to correct zone in this cg */ 1856 if (bpref < cgdata(fs, cgbpref)) 1857 bpref = cgmeta(fs, cgp->cg_cgx); 1858 else 1859 bpref = cgdata(fs, cgp->cg_cgx); 1860 } 1861 /* 1862 * if the requested block is available, use it 1863 */ 1864 bno = dtogd(fs, blknum(fs, bpref)); 1865 if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) 1866 goto gotit; 1867 /* 1868 * Take the next available block in this cylinder group. 1869 */ 1870 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1871 if (bno < 0) 1872 return (0); 1873 /* Update cg_rotor only if allocated from the data zone */ 1874 if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) 1875 cgp->cg_rotor = bno; 1876gotit: 1877 blkno = fragstoblks(fs, bno); 1878 ffs_clrblock(fs, blksfree, (long)blkno); 1879 ffs_clusteracct(fs, cgp, blkno, -1); 1880 cgp->cg_cs.cs_nbfree--; 1881 fs->fs_cstotal.cs_nbfree--; 1882 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1883 fs->fs_fmod = 1; 1884 blkno = cgbase(fs, cgp->cg_cgx) + bno; 1885 /* 1886 * If the caller didn't want the whole block free the frags here. 1887 */ 1888 size = numfrags(fs, size); 1889 if (size != fs->fs_frag) { 1890 bno = dtogd(fs, blkno); 1891 for (i = size; i < fs->fs_frag; i++) 1892 setbit(blksfree, bno + i); 1893 i = fs->fs_frag - size; 1894 cgp->cg_cs.cs_nffree += i; 1895 fs->fs_cstotal.cs_nffree += i; 1896 fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; 1897 fs->fs_fmod = 1; 1898 cgp->cg_frsum[i]++; 1899 } 1900 /* XXX Fixme. */ 1901 UFS_UNLOCK(ump); 1902 if (DOINGSOFTDEP(ITOV(ip))) 1903 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); 1904 UFS_LOCK(ump); 1905 return (blkno); 1906} 1907 1908/* 1909 * Determine whether a cluster can be allocated. 1910 * 1911 * We do not currently check for optimal rotational layout if there 1912 * are multiple choices in the same cylinder group. Instead we just 1913 * take the first one that we find following bpref. 1914 */ 1915static ufs2_daddr_t 1916ffs_clusteralloc(ip, cg, bpref, len) 1917 struct inode *ip; 1918 u_int cg; 1919 ufs2_daddr_t bpref; 1920 int len; 1921{ 1922 struct fs *fs; 1923 struct cg *cgp; 1924 struct buf *bp; 1925 struct ufsmount *ump; 1926 int i, run, bit, map, got, error; 1927 ufs2_daddr_t bno; 1928 u_char *mapp; 1929 int32_t *lp; 1930 u_int8_t *blksfree; 1931 1932 ump = ITOUMP(ip); 1933 fs = ump->um_fs; 1934 if (fs->fs_maxcluster[cg] < len) 1935 return (0); 1936 UFS_UNLOCK(ump); 1937 if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) { 1938 UFS_LOCK(ump); 1939 return (0); 1940 } 1941 /* 1942 * Check to see if a cluster of the needed size (or bigger) is 1943 * available in this cylinder group. 1944 */ 1945 lp = &cg_clustersum(cgp)[len]; 1946 for (i = len; i <= fs->fs_contigsumsize; i++) 1947 if (*lp++ > 0) 1948 break; 1949 if (i > fs->fs_contigsumsize) { 1950 /* 1951 * This is the first time looking for a cluster in this 1952 * cylinder group. Update the cluster summary information 1953 * to reflect the true maximum sized cluster so that 1954 * future cluster allocation requests can avoid reading 1955 * the cylinder group map only to find no clusters. 1956 */ 1957 lp = &cg_clustersum(cgp)[len - 1]; 1958 for (i = len - 1; i > 0; i--) 1959 if (*lp-- > 0) 1960 break; 1961 UFS_LOCK(ump); 1962 fs->fs_maxcluster[cg] = i; 1963 brelse(bp); 1964 return (0); 1965 } 1966 /* 1967 * Search the cluster map to find a big enough cluster. 1968 * We take the first one that we find, even if it is larger 1969 * than we need as we prefer to get one close to the previous 1970 * block allocation. We do not search before the current 1971 * preference point as we do not want to allocate a block 1972 * that is allocated before the previous one (as we will 1973 * then have to wait for another pass of the elevator 1974 * algorithm before it will be read). We prefer to fail and 1975 * be recalled to try an allocation in the next cylinder group. 1976 */ 1977 if (dtog(fs, bpref) != cg) 1978 bpref = cgdata(fs, cg); 1979 else 1980 bpref = blknum(fs, bpref); 1981 bpref = fragstoblks(fs, dtogd(fs, bpref)); 1982 mapp = &cg_clustersfree(cgp)[bpref / NBBY]; 1983 map = *mapp++; 1984 bit = 1 << (bpref % NBBY); 1985 for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { 1986 if ((map & bit) == 0) { 1987 run = 0; 1988 } else { 1989 run++; 1990 if (run == len) 1991 break; 1992 } 1993 if ((got & (NBBY - 1)) != (NBBY - 1)) { 1994 bit <<= 1; 1995 } else { 1996 map = *mapp++; 1997 bit = 1; 1998 } 1999 } 2000 if (got >= cgp->cg_nclusterblks) { 2001 UFS_LOCK(ump); 2002 brelse(bp); 2003 return (0); 2004 } 2005 /* 2006 * Allocate the cluster that we have found. 2007 */ 2008 blksfree = cg_blksfree(cgp); 2009 for (i = 1; i <= len; i++) 2010 if (!ffs_isblock(fs, blksfree, got - run + i)) 2011 panic("ffs_clusteralloc: map mismatch"); 2012 bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); 2013 if (dtog(fs, bno) != cg) 2014 panic("ffs_clusteralloc: allocated out of group"); 2015 len = blkstofrags(fs, len); 2016 UFS_LOCK(ump); 2017 for (i = 0; i < len; i += fs->fs_frag) 2018 if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) 2019 panic("ffs_clusteralloc: lost block"); 2020 ACTIVECLEAR(fs, cg); 2021 UFS_UNLOCK(ump); 2022 bdwrite(bp); 2023 return (bno); 2024} 2025 2026static inline struct buf * 2027getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags) 2028{ 2029 struct fs *fs; 2030 2031 fs = ITOFS(ip); 2032 return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, 2033 cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, 2034 gbflags)); 2035} 2036 2037/* 2038 * Synchronous inode initialization is needed only when barrier writes do not 2039 * work as advertised, and will impose a heavy cost on file creation in a newly 2040 * created filesystem. 2041 */ 2042static int doasyncinodeinit = 1; 2043SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, 2044 &doasyncinodeinit, 0, 2045 "Perform inode block initialization using asynchronous writes"); 2046 2047/* 2048 * Determine whether an inode can be allocated. 2049 * 2050 * Check to see if an inode is available, and if it is, 2051 * allocate it using the following policy: 2052 * 1) allocate the requested inode. 2053 * 2) allocate the next available inode after the requested 2054 * inode in the specified cylinder group. 2055 */ 2056static ufs2_daddr_t 2057ffs_nodealloccg(ip, cg, ipref, mode, unused) 2058 struct inode *ip; 2059 u_int cg; 2060 ufs2_daddr_t ipref; 2061 int mode; 2062 int unused; 2063{ 2064 struct fs *fs; 2065 struct cg *cgp; 2066 struct buf *bp, *ibp; 2067 struct ufsmount *ump; 2068 u_int8_t *inosused, *loc; 2069 struct ufs2_dinode *dp2; 2070 int error, start, len, i; 2071 u_int32_t old_initediblk; 2072 2073 ump = ITOUMP(ip); 2074 fs = ump->um_fs; 2075check_nifree: 2076 if (fs->fs_cs(fs, cg).cs_nifree == 0) 2077 return (0); 2078 UFS_UNLOCK(ump); 2079 if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) { 2080 UFS_LOCK(ump); 2081 return (0); 2082 } 2083restart: 2084 if (cgp->cg_cs.cs_nifree == 0) { 2085 brelse(bp); 2086 UFS_LOCK(ump); 2087 return (0); 2088 } 2089 inosused = cg_inosused(cgp); 2090 if (ipref) { 2091 ipref %= fs->fs_ipg; 2092 if (isclr(inosused, ipref)) 2093 goto gotit; 2094 } 2095 start = cgp->cg_irotor / NBBY; 2096 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 2097 loc = memcchr(&inosused[start], 0xff, len); 2098 if (loc == NULL) { 2099 len = start + 1; 2100 start = 0; 2101 loc = memcchr(&inosused[start], 0xff, len); 2102 if (loc == NULL) { 2103 printf("cg = %d, irotor = %ld, fs = %s\n", 2104 cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 2105 panic("ffs_nodealloccg: map corrupted"); 2106 /* NOTREACHED */ 2107 } 2108 } 2109 ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; 2110gotit: 2111 /* 2112 * Check to see if we need to initialize more inodes. 2113 */ 2114 if (fs->fs_magic == FS_UFS2_MAGIC && 2115 ipref + INOPB(fs) > cgp->cg_initediblk && 2116 cgp->cg_initediblk < cgp->cg_niblk) { 2117 old_initediblk = cgp->cg_initediblk; 2118 2119 /* 2120 * Free the cylinder group lock before writing the 2121 * initialized inode block. Entering the 2122 * babarrierwrite() with the cylinder group lock 2123 * causes lock order violation between the lock and 2124 * snaplk. 2125 * 2126 * Another thread can decide to initialize the same 2127 * inode block, but whichever thread first gets the 2128 * cylinder group lock after writing the newly 2129 * allocated inode block will update it and the other 2130 * will realize that it has lost and leave the 2131 * cylinder group unchanged. 2132 */ 2133 ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); 2134 brelse(bp); 2135 if (ibp == NULL) { 2136 /* 2137 * The inode block buffer is already owned by 2138 * another thread, which must initialize it. 2139 * Wait on the buffer to allow another thread 2140 * to finish the updates, with dropped cg 2141 * buffer lock, then retry. 2142 */ 2143 ibp = getinobuf(ip, cg, old_initediblk, 0); 2144 brelse(ibp); 2145 UFS_LOCK(ump); 2146 goto check_nifree; 2147 } 2148 bzero(ibp->b_data, (int)fs->fs_bsize); 2149 dp2 = (struct ufs2_dinode *)(ibp->b_data); 2150 for (i = 0; i < INOPB(fs); i++) { 2151 while (dp2->di_gen == 0) 2152 dp2->di_gen = arc4random(); 2153 dp2++; 2154 } 2155 2156 /* 2157 * Rather than adding a soft updates dependency to ensure 2158 * that the new inode block is written before it is claimed 2159 * by the cylinder group map, we just do a barrier write 2160 * here. The barrier write will ensure that the inode block 2161 * gets written before the updated cylinder group map can be 2162 * written. The barrier write should only slow down bulk 2163 * loading of newly created filesystems. 2164 */ 2165 if (doasyncinodeinit) 2166 babarrierwrite(ibp); 2167 else 2168 bwrite(ibp); 2169 2170 /* 2171 * After the inode block is written, try to update the 2172 * cg initediblk pointer. If another thread beat us 2173 * to it, then leave it unchanged as the other thread 2174 * has already set it correctly. 2175 */ 2176 error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp); 2177 UFS_LOCK(ump); 2178 ACTIVECLEAR(fs, cg); 2179 UFS_UNLOCK(ump); 2180 if (error != 0) 2181 return (error); 2182 if (cgp->cg_initediblk == old_initediblk) 2183 cgp->cg_initediblk += INOPB(fs); 2184 goto restart; 2185 } 2186 cgp->cg_irotor = ipref; 2187 UFS_LOCK(ump); 2188 ACTIVECLEAR(fs, cg); 2189 setbit(inosused, ipref); 2190 cgp->cg_cs.cs_nifree--; 2191 fs->fs_cstotal.cs_nifree--; 2192 fs->fs_cs(fs, cg).cs_nifree--; 2193 fs->fs_fmod = 1; 2194 if ((mode & IFMT) == IFDIR) { 2195 cgp->cg_cs.cs_ndir++; 2196 fs->fs_cstotal.cs_ndir++; 2197 fs->fs_cs(fs, cg).cs_ndir++; 2198 } 2199 UFS_UNLOCK(ump); 2200 if (DOINGSOFTDEP(ITOV(ip))) 2201 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); 2202 bdwrite(bp); 2203 return ((ino_t)(cg * fs->fs_ipg + ipref)); 2204} 2205 2206/* 2207 * Free a block or fragment. 2208 * 2209 * The specified block or fragment is placed back in the 2210 * free map. If a fragment is deallocated, a possible 2211 * block reassembly is checked. 2212 */ 2213static void 2214ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) 2215 struct ufsmount *ump; 2216 struct fs *fs; 2217 struct vnode *devvp; 2218 ufs2_daddr_t bno; 2219 long size; 2220 ino_t inum; 2221 struct workhead *dephd; 2222{ 2223 struct mount *mp; 2224 struct cg *cgp; 2225 struct buf *bp; 2226 ufs1_daddr_t fragno, cgbno; 2227 int i, blk, frags, bbase, error; 2228 u_int cg; 2229 u_int8_t *blksfree; 2230 struct cdev *dev; 2231 2232 cg = dtog(fs, bno); 2233 if (devvp->v_type == VREG) { 2234 /* devvp is a snapshot */ 2235 MPASS(devvp->v_mount->mnt_data == ump); 2236 dev = ump->um_devvp->v_rdev; 2237 } else if (devvp->v_type == VCHR) { 2238 /* devvp is a normal disk device */ 2239 dev = devvp->v_rdev; 2240 ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg"); 2241 } else 2242 return; 2243#ifdef INVARIANTS 2244 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || 2245 fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { 2246 printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", 2247 devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, 2248 size, fs->fs_fsmnt); 2249 panic("ffs_blkfree_cg: bad size"); 2250 } 2251#endif 2252 if ((u_int)bno >= fs->fs_size) { 2253 printf("bad block %jd, ino %lu\n", (intmax_t)bno, 2254 (u_long)inum); 2255 ffs_fserr(fs, inum, "bad block"); 2256 return; 2257 } 2258 if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0) 2259 return; 2260 cgbno = dtogd(fs, bno); 2261 blksfree = cg_blksfree(cgp); 2262 UFS_LOCK(ump); 2263 if (size == fs->fs_bsize) { 2264 fragno = fragstoblks(fs, cgbno); 2265 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 2266 if (devvp->v_type == VREG) { 2267 UFS_UNLOCK(ump); 2268 /* devvp is a snapshot */ 2269 brelse(bp); 2270 return; 2271 } 2272 printf("dev = %s, block = %jd, fs = %s\n", 2273 devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); 2274 panic("ffs_blkfree_cg: freeing free block"); 2275 } 2276 ffs_setblock(fs, blksfree, fragno); 2277 ffs_clusteracct(fs, cgp, fragno, 1); 2278 cgp->cg_cs.cs_nbfree++; 2279 fs->fs_cstotal.cs_nbfree++; 2280 fs->fs_cs(fs, cg).cs_nbfree++; 2281 } else { 2282 bbase = cgbno - fragnum(fs, cgbno); 2283 /* 2284 * decrement the counts associated with the old frags 2285 */ 2286 blk = blkmap(fs, blksfree, bbase); 2287 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 2288 /* 2289 * deallocate the fragment 2290 */ 2291 frags = numfrags(fs, size); 2292 for (i = 0; i < frags; i++) { 2293 if (isset(blksfree, cgbno + i)) { 2294 printf("dev = %s, block = %jd, fs = %s\n", 2295 devtoname(dev), (intmax_t)(bno + i), 2296 fs->fs_fsmnt); 2297 panic("ffs_blkfree_cg: freeing free frag"); 2298 } 2299 setbit(blksfree, cgbno + i); 2300 } 2301 cgp->cg_cs.cs_nffree += i; 2302 fs->fs_cstotal.cs_nffree += i; 2303 fs->fs_cs(fs, cg).cs_nffree += i; 2304 /* 2305 * add back in counts associated with the new frags 2306 */ 2307 blk = blkmap(fs, blksfree, bbase); 2308 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 2309 /* 2310 * if a complete block has been reassembled, account for it 2311 */ 2312 fragno = fragstoblks(fs, bbase); 2313 if (ffs_isblock(fs, blksfree, fragno)) { 2314 cgp->cg_cs.cs_nffree -= fs->fs_frag; 2315 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 2316 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 2317 ffs_clusteracct(fs, cgp, fragno, 1); 2318 cgp->cg_cs.cs_nbfree++; 2319 fs->fs_cstotal.cs_nbfree++; 2320 fs->fs_cs(fs, cg).cs_nbfree++; 2321 } 2322 } 2323 fs->fs_fmod = 1; 2324 ACTIVECLEAR(fs, cg); 2325 UFS_UNLOCK(ump); 2326 mp = UFSTOVFS(ump); 2327 if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) 2328 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2329 numfrags(fs, size), dephd); 2330 bdwrite(bp); 2331} 2332 2333/* 2334 * Structures and routines associated with trim management. 2335 * 2336 * The following requests are passed to trim_lookup to indicate 2337 * the actions that should be taken. 2338 */ 2339#define NEW 1 /* if found, error else allocate and hash it */ 2340#define OLD 2 /* if not found, error, else return it */ 2341#define REPLACE 3 /* if not found, error else unhash and reallocate it */ 2342#define DONE 4 /* if not found, error else unhash and return it */ 2343#define SINGLE 5 /* don't look up, just allocate it and don't hash it */ 2344 2345MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); 2346 2347#define TRIMLIST_HASH(ump, key) \ 2348 (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) 2349 2350/* 2351 * These structures describe each of the block free requests aggregated 2352 * together to make up a trim request. 2353 */ 2354struct trim_blkreq { 2355 TAILQ_ENTRY(trim_blkreq) blkreqlist; 2356 ufs2_daddr_t bno; 2357 long size; 2358 struct workhead *pdephd; 2359 struct workhead dephd; 2360}; 2361 2362/* 2363 * Description of a trim request. 2364 */ 2365struct ffs_blkfree_trim_params { 2366 TAILQ_HEAD(, trim_blkreq) blklist; 2367 LIST_ENTRY(ffs_blkfree_trim_params) hashlist; 2368 struct task task; 2369 struct ufsmount *ump; 2370 struct vnode *devvp; 2371 ino_t inum; 2372 ufs2_daddr_t bno; 2373 long size; 2374 long key; 2375}; 2376 2377static void ffs_blkfree_trim_completed(struct buf *); 2378static void ffs_blkfree_trim_task(void *ctx, int pending __unused); 2379static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, 2380 struct vnode *, ufs2_daddr_t, long, ino_t, u_long, int); 2381static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); 2382 2383/* 2384 * Called on trim completion to start a task to free the associated block(s). 2385 */ 2386static void 2387ffs_blkfree_trim_completed(bp) 2388 struct buf *bp; 2389{ 2390 struct ffs_blkfree_trim_params *tp; 2391 2392 tp = bp->b_fsprivate1; 2393 free(bp, M_TRIM); 2394 TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); 2395 taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); 2396} 2397 2398/* 2399 * Trim completion task that free associated block(s). 2400 */ 2401static void 2402ffs_blkfree_trim_task(ctx, pending) 2403 void *ctx; 2404 int pending; 2405{ 2406 struct ffs_blkfree_trim_params *tp; 2407 struct trim_blkreq *blkelm; 2408 struct ufsmount *ump; 2409 2410 tp = ctx; 2411 ump = tp->ump; 2412 while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { 2413 ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, 2414 blkelm->size, tp->inum, blkelm->pdephd); 2415 TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); 2416 free(blkelm, M_TRIM); 2417 } 2418 vn_finished_secondary_write(UFSTOVFS(ump)); 2419 UFS_LOCK(ump); 2420 ump->um_trim_inflight -= 1; 2421 ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); 2422 UFS_UNLOCK(ump); 2423 free(tp, M_TRIM); 2424} 2425 2426/* 2427 * Lookup a trim request by inode number. 2428 * Allocate if requested (NEW, REPLACE, SINGLE). 2429 */ 2430static struct ffs_blkfree_trim_params * 2431trim_lookup(ump, devvp, bno, size, inum, key, alloctype) 2432 struct ufsmount *ump; 2433 struct vnode *devvp; 2434 ufs2_daddr_t bno; 2435 long size; 2436 ino_t inum; 2437 u_long key; 2438 int alloctype; 2439{ 2440 struct trimlist_hashhead *tphashhead; 2441 struct ffs_blkfree_trim_params *tp, *ntp; 2442 2443 ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); 2444 if (alloctype != SINGLE) { 2445 KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); 2446 UFS_LOCK(ump); 2447 tphashhead = TRIMLIST_HASH(ump, key); 2448 LIST_FOREACH(tp, tphashhead, hashlist) 2449 if (key == tp->key) 2450 break; 2451 } 2452 switch (alloctype) { 2453 case NEW: 2454 KASSERT(tp == NULL, ("trim_lookup: found trim")); 2455 break; 2456 case OLD: 2457 KASSERT(tp != NULL, 2458 ("trim_lookup: missing call to ffs_blkrelease_start()")); 2459 UFS_UNLOCK(ump); 2460 free(ntp, M_TRIM); 2461 return (tp); 2462 case REPLACE: 2463 KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); 2464 LIST_REMOVE(tp, hashlist); 2465 /* tp will be freed by caller */ 2466 break; 2467 case DONE: 2468 KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); 2469 LIST_REMOVE(tp, hashlist); 2470 UFS_UNLOCK(ump); 2471 free(ntp, M_TRIM); 2472 return (tp); 2473 } 2474 TAILQ_INIT(&ntp->blklist); 2475 ntp->ump = ump; 2476 ntp->devvp = devvp; 2477 ntp->bno = bno; 2478 ntp->size = size; 2479 ntp->inum = inum; 2480 ntp->key = key; 2481 if (alloctype != SINGLE) { 2482 LIST_INSERT_HEAD(tphashhead, ntp, hashlist); 2483 UFS_UNLOCK(ump); 2484 } 2485 return (ntp); 2486} 2487 2488/* 2489 * Dispatch a trim request. 2490 */ 2491static void 2492ffs_blkfree_sendtrim(tp) 2493 struct ffs_blkfree_trim_params *tp; 2494{ 2495 struct ufsmount *ump; 2496 struct mount *mp; 2497 struct buf *bp; 2498 2499 /* 2500 * Postpone the set of the free bit in the cg bitmap until the 2501 * BIO_DELETE is completed. Otherwise, due to disk queue 2502 * reordering, TRIM might be issued after we reuse the block 2503 * and write some new data into it. 2504 */ 2505 ump = tp->ump; 2506 bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); 2507 bp->b_iocmd = BIO_DELETE; 2508 bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); 2509 bp->b_iodone = ffs_blkfree_trim_completed; 2510 bp->b_bcount = tp->size; 2511 bp->b_fsprivate1 = tp; 2512 UFS_LOCK(ump); 2513 ump->um_trim_total += 1; 2514 ump->um_trim_inflight += 1; 2515 ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); 2516 ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); 2517 UFS_UNLOCK(ump); 2518 2519 mp = UFSTOVFS(ump); 2520 vn_start_secondary_write(NULL, &mp, 0); 2521 g_vfs_strategy(ump->um_bo, bp); 2522} 2523 2524/* 2525 * Allocate a new key to use to identify a range of blocks. 2526 */ 2527u_long 2528ffs_blkrelease_start(ump, devvp, inum) 2529 struct ufsmount *ump; 2530 struct vnode *devvp; 2531 ino_t inum; 2532{ 2533 static u_long masterkey; 2534 u_long key; 2535 2536 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2537 return (SINGLETON_KEY); 2538 do { 2539 key = atomic_fetchadd_long(&masterkey, 1); 2540 } while (key < FIRST_VALID_KEY); 2541 (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); 2542 return (key); 2543} 2544 2545/* 2546 * Deallocate a key that has been used to identify a range of blocks. 2547 */ 2548void 2549ffs_blkrelease_finish(ump, key) 2550 struct ufsmount *ump; 2551 u_long key; 2552{ 2553 struct ffs_blkfree_trim_params *tp; 2554 2555 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2556 return; 2557 /* 2558 * We are done with sending blocks using this key. Look up the key 2559 * using the DONE alloctype (in tp) to request that it be unhashed 2560 * as we will not be adding to it. If the key has never been used, 2561 * tp->size will be zero, so we can just free tp. Otherwise the call 2562 * to ffs_blkfree_sendtrim(tp) causes the block range described by 2563 * tp to be issued (and then tp to be freed). 2564 */ 2565 tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); 2566 if (tp->size == 0) 2567 free(tp, M_TRIM); 2568 else 2569 ffs_blkfree_sendtrim(tp); 2570} 2571 2572/* 2573 * Setup to free a block or fragment. 2574 * 2575 * Check for snapshots that might want to claim the block. 2576 * If trims are requested, prepare a trim request. Attempt to 2577 * aggregate consecutive blocks into a single trim request. 2578 */ 2579void 2580ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key) 2581 struct ufsmount *ump; 2582 struct fs *fs; 2583 struct vnode *devvp; 2584 ufs2_daddr_t bno; 2585 long size; 2586 ino_t inum; 2587 enum vtype vtype; 2588 struct workhead *dephd; 2589 u_long key; 2590{ 2591 struct ffs_blkfree_trim_params *tp, *ntp; 2592 struct trim_blkreq *blkelm; 2593 2594 /* 2595 * Check to see if a snapshot wants to claim the block. 2596 * Check that devvp is a normal disk device, not a snapshot, 2597 * it has a snapshot(s) associated with it, and one of the 2598 * snapshots wants to claim the block. 2599 */ 2600 if (devvp->v_type == VCHR && 2601 (devvp->v_vflag & VV_COPYONWRITE) && 2602 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { 2603 return; 2604 } 2605 /* 2606 * Nothing to delay if TRIM is not required for this block or TRIM 2607 * is disabled or the operation is performed on a snapshot. 2608 */ 2609 if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || 2610 devvp->v_type == VREG) { 2611 ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); 2612 return; 2613 } 2614 blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); 2615 blkelm->bno = bno; 2616 blkelm->size = size; 2617 if (dephd == NULL) { 2618 blkelm->pdephd = NULL; 2619 } else { 2620 LIST_INIT(&blkelm->dephd); 2621 LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); 2622 blkelm->pdephd = &blkelm->dephd; 2623 } 2624 if (key == SINGLETON_KEY) { 2625 /* 2626 * Just a single non-contiguous piece. Use the SINGLE 2627 * alloctype to return a trim request that will not be 2628 * hashed for future lookup. 2629 */ 2630 tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); 2631 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2632 ffs_blkfree_sendtrim(tp); 2633 return; 2634 } 2635 /* 2636 * The callers of this function are not tracking whether or not 2637 * the blocks are contiguous. They are just saying that they 2638 * are freeing a set of blocks. It is this code that determines 2639 * the pieces of that range that are actually contiguous. 2640 * 2641 * Calling ffs_blkrelease_start() will have created an entry 2642 * that we will use. 2643 */ 2644 tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); 2645 if (tp->size == 0) { 2646 /* 2647 * First block of a potential range, set block and size 2648 * for the trim block. 2649 */ 2650 tp->bno = bno; 2651 tp->size = size; 2652 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2653 return; 2654 } 2655 /* 2656 * If this block is a continuation of the range (either 2657 * follows at the end or preceeds in the front) then we 2658 * add it to the front or back of the list and return. 2659 * 2660 * If it is not a continuation of the trim that we were 2661 * building, using the REPLACE alloctype, we request that 2662 * the old trim request (still in tp) be unhashed and a 2663 * new range started (in ntp). The ffs_blkfree_sendtrim(tp) 2664 * call causes the block range described by tp to be issued 2665 * (and then tp to be freed). 2666 */ 2667 if (bno + numfrags(fs, size) == tp->bno) { 2668 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2669 tp->bno = bno; 2670 tp->size += size; 2671 return; 2672 } else if (bno == tp->bno + numfrags(fs, tp->size)) { 2673 TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); 2674 tp->size += size; 2675 return; 2676 } 2677 ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); 2678 TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); 2679 ffs_blkfree_sendtrim(tp); 2680} 2681 2682#ifdef INVARIANTS 2683/* 2684 * Verify allocation of a block or fragment. Returns true if block or 2685 * fragment is allocated, false if it is free. 2686 */ 2687static int 2688ffs_checkblk(ip, bno, size) 2689 struct inode *ip; 2690 ufs2_daddr_t bno; 2691 long size; 2692{ 2693 struct fs *fs; 2694 struct cg *cgp; 2695 struct buf *bp; 2696 ufs1_daddr_t cgbno; 2697 int i, error, frags, free; 2698 u_int8_t *blksfree; 2699 2700 fs = ITOFS(ip); 2701 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 2702 printf("bsize = %ld, size = %ld, fs = %s\n", 2703 (long)fs->fs_bsize, size, fs->fs_fsmnt); 2704 panic("ffs_checkblk: bad size"); 2705 } 2706 if ((u_int)bno >= fs->fs_size) 2707 panic("ffs_checkblk: bad block %jd", (intmax_t)bno); 2708 error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), &bp, &cgp); 2709 if (error) 2710 panic("ffs_checkblk: cylinder group read failed"); 2711 blksfree = cg_blksfree(cgp); 2712 cgbno = dtogd(fs, bno); 2713 if (size == fs->fs_bsize) { 2714 free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 2715 } else { 2716 frags = numfrags(fs, size); 2717 for (free = 0, i = 0; i < frags; i++) 2718 if (isset(blksfree, cgbno + i)) 2719 free++; 2720 if (free != 0 && free != frags) 2721 panic("ffs_checkblk: partially free fragment"); 2722 } 2723 brelse(bp); 2724 return (!free); 2725} 2726#endif /* INVARIANTS */ 2727 2728/* 2729 * Free an inode. 2730 */ 2731int 2732ffs_vfree(pvp, ino, mode) 2733 struct vnode *pvp; 2734 ino_t ino; 2735 int mode; 2736{ 2737 struct ufsmount *ump; 2738 2739 if (DOINGSOFTDEP(pvp)) { 2740 softdep_freefile(pvp, ino, mode); 2741 return (0); 2742 } 2743 ump = VFSTOUFS(pvp->v_mount); 2744 return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); 2745} 2746 2747/* 2748 * Do the actual free operation. 2749 * The specified inode is placed back in the free map. 2750 */ 2751int 2752ffs_freefile(ump, fs, devvp, ino, mode, wkhd) 2753 struct ufsmount *ump; 2754 struct fs *fs; 2755 struct vnode *devvp; 2756 ino_t ino; 2757 int mode; 2758 struct workhead *wkhd; 2759{ 2760 struct cg *cgp; 2761 struct buf *bp; 2762 int error; 2763 u_int cg; 2764 u_int8_t *inosused; 2765 struct cdev *dev; 2766 2767 cg = ino_to_cg(fs, ino); 2768 if (devvp->v_type == VREG) { 2769 /* devvp is a snapshot */ 2770 MPASS(devvp->v_mount->mnt_data == ump); 2771 dev = ump->um_devvp->v_rdev; 2772 } else if (devvp->v_type == VCHR) { 2773 /* devvp is a normal disk device */ 2774 dev = devvp->v_rdev; 2775 } else { 2776 bp = NULL; 2777 return (0); 2778 } 2779 if (ino >= fs->fs_ipg * fs->fs_ncg) 2780 panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", 2781 devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); 2782 if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0) 2783 return (error); 2784 inosused = cg_inosused(cgp); 2785 ino %= fs->fs_ipg; 2786 if (isclr(inosused, ino)) { 2787 printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), 2788 (uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt); 2789 if (fs->fs_ronly == 0) 2790 panic("ffs_freefile: freeing free inode"); 2791 } 2792 clrbit(inosused, ino); 2793 if (ino < cgp->cg_irotor) 2794 cgp->cg_irotor = ino; 2795 cgp->cg_cs.cs_nifree++; 2796 UFS_LOCK(ump); 2797 fs->fs_cstotal.cs_nifree++; 2798 fs->fs_cs(fs, cg).cs_nifree++; 2799 if ((mode & IFMT) == IFDIR) { 2800 cgp->cg_cs.cs_ndir--; 2801 fs->fs_cstotal.cs_ndir--; 2802 fs->fs_cs(fs, cg).cs_ndir--; 2803 } 2804 fs->fs_fmod = 1; 2805 ACTIVECLEAR(fs, cg); 2806 UFS_UNLOCK(ump); 2807 if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) 2808 softdep_setup_inofree(UFSTOVFS(ump), bp, 2809 ino + cg * fs->fs_ipg, wkhd); 2810 bdwrite(bp); 2811 return (0); 2812} 2813 2814/* 2815 * Check to see if a file is free. 2816 * Used to check for allocated files in snapshots. 2817 */ 2818int 2819ffs_checkfreefile(fs, devvp, ino) 2820 struct fs *fs; 2821 struct vnode *devvp; 2822 ino_t ino; 2823{ 2824 struct cg *cgp; 2825 struct buf *bp; 2826 int ret, error; 2827 u_int cg; 2828 u_int8_t *inosused; 2829 2830 cg = ino_to_cg(fs, ino); 2831 if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) 2832 return (1); 2833 if (ino >= fs->fs_ipg * fs->fs_ncg) 2834 return (1); 2835 if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0) 2836 return (1); 2837 inosused = cg_inosused(cgp); 2838 ino %= fs->fs_ipg; 2839 ret = isclr(inosused, ino); 2840 brelse(bp); 2841 return (ret); 2842} 2843 2844/* 2845 * Find a block of the specified size in the specified cylinder group. 2846 * 2847 * It is a panic if a request is made to find a block if none are 2848 * available. 2849 */ 2850static ufs1_daddr_t 2851ffs_mapsearch(fs, cgp, bpref, allocsiz) 2852 struct fs *fs; 2853 struct cg *cgp; 2854 ufs2_daddr_t bpref; 2855 int allocsiz; 2856{ 2857 ufs1_daddr_t bno; 2858 int start, len, loc, i; 2859 int blk, field, subfield, pos; 2860 u_int8_t *blksfree; 2861 2862 /* 2863 * find the fragment by searching through the free block 2864 * map for an appropriate bit pattern 2865 */ 2866 if (bpref) 2867 start = dtogd(fs, bpref) / NBBY; 2868 else 2869 start = cgp->cg_frotor / NBBY; 2870 blksfree = cg_blksfree(cgp); 2871 len = howmany(fs->fs_fpg, NBBY) - start; 2872 loc = scanc((u_int)len, (u_char *)&blksfree[start], 2873 fragtbl[fs->fs_frag], 2874 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2875 if (loc == 0) { 2876 len = start + 1; 2877 start = 0; 2878 loc = scanc((u_int)len, (u_char *)&blksfree[0], 2879 fragtbl[fs->fs_frag], 2880 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2881 if (loc == 0) { 2882 printf("start = %d, len = %d, fs = %s\n", 2883 start, len, fs->fs_fsmnt); 2884 panic("ffs_alloccg: map corrupted"); 2885 /* NOTREACHED */ 2886 } 2887 } 2888 bno = (start + len - loc) * NBBY; 2889 cgp->cg_frotor = bno; 2890 /* 2891 * found the byte in the map 2892 * sift through the bits to find the selected frag 2893 */ 2894 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 2895 blk = blkmap(fs, blksfree, bno); 2896 blk <<= 1; 2897 field = around[allocsiz]; 2898 subfield = inside[allocsiz]; 2899 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 2900 if ((blk & field) == subfield) 2901 return (bno + pos); 2902 field <<= 1; 2903 subfield <<= 1; 2904 } 2905 } 2906 printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); 2907 panic("ffs_alloccg: block not in map"); 2908 return (-1); 2909} 2910 2911static const struct statfs * 2912ffs_getmntstat(struct vnode *devvp) 2913{ 2914 2915 if (devvp->v_type == VCHR) 2916 return (&devvp->v_rdev->si_mountpt->mnt_stat); 2917 return (ffs_getmntstat(VFSTOUFS(devvp->v_mount)->um_devvp)); 2918} 2919 2920/* 2921 * Fetch and verify a cylinder group. 2922 */ 2923int 2924ffs_getcg(fs, devvp, cg, bpp, cgpp) 2925 struct fs *fs; 2926 struct vnode *devvp; 2927 u_int cg; 2928 struct buf **bpp; 2929 struct cg **cgpp; 2930{ 2931 struct buf *bp; 2932 struct cg *cgp; 2933 const struct statfs *sfs; 2934 int flags, error; 2935 2936 *bpp = NULL; 2937 *cgpp = NULL; 2938 flags = 0; 2939 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 2940 flags |= GB_CKHASH; 2941 error = breadn_flags(devvp, devvp->v_type == VREG ? 2942 fragstoblks(fs, cgtod(fs, cg)) : fsbtodb(fs, cgtod(fs, cg)), 2943 (int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags, 2944 ffs_ckhash_cg, &bp); 2945 if (error != 0) 2946 return (error); 2947 cgp = (struct cg *)bp->b_data; 2948 if ((fs->fs_metackhash & CK_CYLGRP) != 0 && 2949 (bp->b_flags & B_CKHASH) != 0 && 2950 cgp->cg_ckhash != bp->b_ckhash) { 2951 sfs = ffs_getmntstat(devvp); 2952 printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: " 2953 "0x%x != bp: 0x%jx\n", 2954 devvp->v_type == VCHR ? "" : "snapshot of ", 2955 sfs->f_mntfromname, sfs->f_mntonname, 2956 cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); 2957 bp->b_flags &= ~B_CKHASH; 2958 bp->b_flags |= B_INVAL | B_NOCACHE; 2959 brelse(bp); 2960 return (EIO); 2961 } 2962 if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { 2963 sfs = ffs_getmntstat(devvp); 2964 printf("UFS %s%s (%s)", 2965 devvp->v_type == VCHR ? "" : "snapshot of ", 2966 sfs->f_mntfromname, sfs->f_mntonname); 2967 if (!cg_chkmagic(cgp)) 2968 printf(" cg %u: bad magic number 0x%x should be 0x%x\n", 2969 cg, cgp->cg_magic, CG_MAGIC); 2970 else 2971 printf(": wrong cylinder group cg %u != cgx %u\n", cg, 2972 cgp->cg_cgx); 2973 bp->b_flags &= ~B_CKHASH; 2974 bp->b_flags |= B_INVAL | B_NOCACHE; 2975 brelse(bp); 2976 return (EIO); 2977 } 2978 bp->b_flags &= ~B_CKHASH; 2979 bp->b_xflags |= BX_BKGRDWRITE; 2980 /* 2981 * If we are using check hashes on the cylinder group then we want 2982 * to limit changing the cylinder group time to when we are actually 2983 * going to write it to disk so that its check hash remains correct 2984 * in memory. If the CK_CYLGRP flag is set the time is updated in 2985 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we 2986 * update the time here as we have done historically. 2987 */ 2988 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 2989 bp->b_xflags |= BX_CYLGRP; 2990 else 2991 cgp->cg_old_time = cgp->cg_time = time_second; 2992 *bpp = bp; 2993 *cgpp = cgp; 2994 return (0); 2995} 2996 2997static void 2998ffs_ckhash_cg(bp) 2999 struct buf *bp; 3000{ 3001 uint32_t ckhash; 3002 struct cg *cgp; 3003 3004 cgp = (struct cg *)bp->b_data; 3005 ckhash = cgp->cg_ckhash; 3006 cgp->cg_ckhash = 0; 3007 bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); 3008 cgp->cg_ckhash = ckhash; 3009} 3010 3011/* 3012 * Fserr prints the name of a filesystem with an error diagnostic. 3013 * 3014 * The form of the error message is: 3015 * fs: error message 3016 */ 3017void 3018ffs_fserr(fs, inum, cp) 3019 struct fs *fs; 3020 ino_t inum; 3021 char *cp; 3022{ 3023 struct thread *td = curthread; /* XXX */ 3024 struct proc *p = td->td_proc; 3025 3026 log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", 3027 p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, 3028 fs->fs_fsmnt, cp); 3029} 3030 3031/* 3032 * This function provides the capability for the fsck program to 3033 * update an active filesystem. Fourteen operations are provided: 3034 * 3035 * adjrefcnt(inode, amt) - adjusts the reference count on the 3036 * specified inode by the specified amount. Under normal 3037 * operation the count should always go down. Decrementing 3038 * the count to zero will cause the inode to be freed. 3039 * adjblkcnt(inode, amt) - adjust the number of blocks used by the 3040 * inode by the specified amount. 3041 * adjsize(inode, size) - set the size of the inode to the 3042 * specified size. 3043 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - 3044 * adjust the superblock summary. 3045 * freedirs(inode, count) - directory inodes [inode..inode + count - 1] 3046 * are marked as free. Inodes should never have to be marked 3047 * as in use. 3048 * freefiles(inode, count) - file inodes [inode..inode + count - 1] 3049 * are marked as free. Inodes should never have to be marked 3050 * as in use. 3051 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 3052 * are marked as free. Blocks should never have to be marked 3053 * as in use. 3054 * setflags(flags, set/clear) - the fs_flags field has the specified 3055 * flags set (second parameter +1) or cleared (second parameter -1). 3056 * setcwd(dirinode) - set the current directory to dirinode in the 3057 * filesystem associated with the snapshot. 3058 * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." 3059 * in the current directory is oldvalue then change it to newvalue. 3060 * unlink(nameptr, oldvalue) - Verify that the inode number associated 3061 * with nameptr in the current directory is oldvalue then unlink it. 3062 * 3063 * The following functions may only be used on a quiescent filesystem 3064 * by the soft updates journal. They are not safe to be run on an active 3065 * filesystem. 3066 * 3067 * setinode(inode, dip) - the specified disk inode is replaced with the 3068 * contents pointed to by dip. 3069 * setbufoutput(fd, flags) - output associated with the specified file 3070 * descriptor (which must reference the character device supporting 3071 * the filesystem) switches from using physio to running through the 3072 * buffer cache when flags is set to 1. The descriptor reverts to 3073 * physio for output when flags is set to zero. 3074 */ 3075 3076static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); 3077 3078SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, 3079 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); 3080 3081static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, 3082 sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); 3083 3084static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, CTLFLAG_WR, 3085 sysctl_ffs_fsck, "Set the inode size"); 3086 3087static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR, 3088 sysctl_ffs_fsck, "Adjust number of directories"); 3089 3090static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR, 3091 sysctl_ffs_fsck, "Adjust number of free blocks"); 3092 3093static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR, 3094 sysctl_ffs_fsck, "Adjust number of free inodes"); 3095 3096static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR, 3097 sysctl_ffs_fsck, "Adjust number of free frags"); 3098 3099static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR, 3100 sysctl_ffs_fsck, "Adjust number of free clusters"); 3101 3102static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, 3103 sysctl_ffs_fsck, "Free Range of Directory Inodes"); 3104 3105static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, 3106 sysctl_ffs_fsck, "Free Range of File Inodes"); 3107 3108static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, 3109 sysctl_ffs_fsck, "Free Range of Blocks"); 3110 3111static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, 3112 sysctl_ffs_fsck, "Change Filesystem Flags"); 3113 3114static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR, 3115 sysctl_ffs_fsck, "Set Current Working Directory"); 3116 3117static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR, 3118 sysctl_ffs_fsck, "Change Value of .. Entry"); 3119 3120static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR, 3121 sysctl_ffs_fsck, "Unlink a Duplicate Name"); 3122 3123static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR, 3124 sysctl_ffs_fsck, "Update an On-Disk Inode"); 3125 3126static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR, 3127 sysctl_ffs_fsck, "Set Buffered Writing for Descriptor"); 3128 3129#define DEBUG 1 3130#ifdef DEBUG 3131static int fsckcmds = 0; 3132SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); 3133#endif /* DEBUG */ 3134 3135static int buffered_write(struct file *, struct uio *, struct ucred *, 3136 int, struct thread *); 3137 3138static int 3139sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) 3140{ 3141 struct thread *td = curthread; 3142 struct fsck_cmd cmd; 3143 struct ufsmount *ump; 3144 struct vnode *vp, *dvp, *fdvp; 3145 struct inode *ip, *dp; 3146 struct mount *mp; 3147 struct fs *fs; 3148 ufs2_daddr_t blkno; 3149 long blkcnt, blksize; 3150 u_long key; 3151 struct file *fp, *vfp; 3152 cap_rights_t rights; 3153 int filetype, error; 3154 static struct fileops *origops, bufferedops; 3155 3156 if (req->newptr == NULL || req->newlen > sizeof(cmd)) 3157 return (EBADRPC); 3158 if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0) 3159 return (error); 3160 if (cmd.version != FFS_CMD_VERSION) 3161 return (ERPCMISMATCH); 3162 if ((error = getvnode(td, cmd.handle, 3163 cap_rights_init(&rights, CAP_FSCK), &fp)) != 0) 3164 return (error); 3165 vp = fp->f_data; 3166 if (vp->v_type != VREG && vp->v_type != VDIR) { 3167 fdrop(fp, td); 3168 return (EINVAL); 3169 } 3170 vn_start_write(vp, &mp, V_WAIT); 3171 if (mp == NULL || 3172 strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { 3173 vn_finished_write(mp); 3174 fdrop(fp, td); 3175 return (EINVAL); 3176 } 3177 ump = VFSTOUFS(mp); 3178 if ((mp->mnt_flag & MNT_RDONLY) && 3179 ump->um_fsckpid != td->td_proc->p_pid) { 3180 vn_finished_write(mp); 3181 fdrop(fp, td); 3182 return (EROFS); 3183 } 3184 fs = ump->um_fs; 3185 filetype = IFREG; 3186 3187 switch (oidp->oid_number) { 3188 3189 case FFS_SET_FLAGS: 3190#ifdef DEBUG 3191 if (fsckcmds) 3192 printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, 3193 cmd.size > 0 ? "set" : "clear"); 3194#endif /* DEBUG */ 3195 if (cmd.size > 0) 3196 fs->fs_flags |= (long)cmd.value; 3197 else 3198 fs->fs_flags &= ~(long)cmd.value; 3199 break; 3200 3201 case FFS_ADJ_REFCNT: 3202#ifdef DEBUG 3203 if (fsckcmds) { 3204 printf("%s: adjust inode %jd link count by %jd\n", 3205 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3206 (intmax_t)cmd.size); 3207 } 3208#endif /* DEBUG */ 3209 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3210 break; 3211 ip = VTOI(vp); 3212 ip->i_nlink += cmd.size; 3213 DIP_SET(ip, i_nlink, ip->i_nlink); 3214 ip->i_effnlink += cmd.size; 3215 ip->i_flag |= IN_CHANGE | IN_MODIFIED; 3216 error = ffs_update(vp, 1); 3217 if (DOINGSOFTDEP(vp)) 3218 softdep_change_linkcnt(ip); 3219 vput(vp); 3220 break; 3221 3222 case FFS_ADJ_BLKCNT: 3223#ifdef DEBUG 3224 if (fsckcmds) { 3225 printf("%s: adjust inode %jd block count by %jd\n", 3226 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3227 (intmax_t)cmd.size); 3228 } 3229#endif /* DEBUG */ 3230 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3231 break; 3232 ip = VTOI(vp); 3233 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); 3234 ip->i_flag |= IN_CHANGE | IN_MODIFIED; 3235 error = ffs_update(vp, 1); 3236 vput(vp); 3237 break; 3238 3239 case FFS_SET_SIZE: 3240#ifdef DEBUG 3241 if (fsckcmds) { 3242 printf("%s: set inode %jd size to %jd\n", 3243 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3244 (intmax_t)cmd.size); 3245 } 3246#endif /* DEBUG */ 3247 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3248 break; 3249 ip = VTOI(vp); 3250 DIP_SET(ip, i_size, cmd.size); 3251 ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_MODIFIED; 3252 error = ffs_update(vp, 1); 3253 vput(vp); 3254 break; 3255 3256 case FFS_DIR_FREE: 3257 filetype = IFDIR; 3258 /* fall through */ 3259 3260 case FFS_FILE_FREE: 3261#ifdef DEBUG 3262 if (fsckcmds) { 3263 if (cmd.size == 1) 3264 printf("%s: free %s inode %ju\n", 3265 mp->mnt_stat.f_mntonname, 3266 filetype == IFDIR ? "directory" : "file", 3267 (uintmax_t)cmd.value); 3268 else 3269 printf("%s: free %s inodes %ju-%ju\n", 3270 mp->mnt_stat.f_mntonname, 3271 filetype == IFDIR ? "directory" : "file", 3272 (uintmax_t)cmd.value, 3273 (uintmax_t)(cmd.value + cmd.size - 1)); 3274 } 3275#endif /* DEBUG */ 3276 while (cmd.size > 0) { 3277 if ((error = ffs_freefile(ump, fs, ump->um_devvp, 3278 cmd.value, filetype, NULL))) 3279 break; 3280 cmd.size -= 1; 3281 cmd.value += 1; 3282 } 3283 break; 3284 3285 case FFS_BLK_FREE: 3286#ifdef DEBUG 3287 if (fsckcmds) { 3288 if (cmd.size == 1) 3289 printf("%s: free block %jd\n", 3290 mp->mnt_stat.f_mntonname, 3291 (intmax_t)cmd.value); 3292 else 3293 printf("%s: free blocks %jd-%jd\n", 3294 mp->mnt_stat.f_mntonname, 3295 (intmax_t)cmd.value, 3296 (intmax_t)cmd.value + cmd.size - 1); 3297 } 3298#endif /* DEBUG */ 3299 blkno = cmd.value; 3300 blkcnt = cmd.size; 3301 blksize = fs->fs_frag - (blkno % fs->fs_frag); 3302 key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); 3303 while (blkcnt > 0) { 3304 if (blkcnt < blksize) 3305 blksize = blkcnt; 3306 ffs_blkfree(ump, fs, ump->um_devvp, blkno, 3307 blksize * fs->fs_fsize, UFS_ROOTINO, 3308 VDIR, NULL, key); 3309 blkno += blksize; 3310 blkcnt -= blksize; 3311 blksize = fs->fs_frag; 3312 } 3313 ffs_blkrelease_finish(ump, key); 3314 break; 3315 3316 /* 3317 * Adjust superblock summaries. fsck(8) is expected to 3318 * submit deltas when necessary. 3319 */ 3320 case FFS_ADJ_NDIR: 3321#ifdef DEBUG 3322 if (fsckcmds) { 3323 printf("%s: adjust number of directories by %jd\n", 3324 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3325 } 3326#endif /* DEBUG */ 3327 fs->fs_cstotal.cs_ndir += cmd.value; 3328 break; 3329 3330 case FFS_ADJ_NBFREE: 3331#ifdef DEBUG 3332 if (fsckcmds) { 3333 printf("%s: adjust number of free blocks by %+jd\n", 3334 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3335 } 3336#endif /* DEBUG */ 3337 fs->fs_cstotal.cs_nbfree += cmd.value; 3338 break; 3339 3340 case FFS_ADJ_NIFREE: 3341#ifdef DEBUG 3342 if (fsckcmds) { 3343 printf("%s: adjust number of free inodes by %+jd\n", 3344 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3345 } 3346#endif /* DEBUG */ 3347 fs->fs_cstotal.cs_nifree += cmd.value; 3348 break; 3349 3350 case FFS_ADJ_NFFREE: 3351#ifdef DEBUG 3352 if (fsckcmds) { 3353 printf("%s: adjust number of free frags by %+jd\n", 3354 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3355 } 3356#endif /* DEBUG */ 3357 fs->fs_cstotal.cs_nffree += cmd.value; 3358 break; 3359 3360 case FFS_ADJ_NUMCLUSTERS: 3361#ifdef DEBUG 3362 if (fsckcmds) { 3363 printf("%s: adjust number of free clusters by %+jd\n", 3364 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3365 } 3366#endif /* DEBUG */ 3367 fs->fs_cstotal.cs_numclusters += cmd.value; 3368 break; 3369 3370 case FFS_SET_CWD: 3371#ifdef DEBUG 3372 if (fsckcmds) { 3373 printf("%s: set current directory to inode %jd\n", 3374 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3375 } 3376#endif /* DEBUG */ 3377 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) 3378 break; 3379 AUDIT_ARG_VNODE1(vp); 3380 if ((error = change_dir(vp, td)) != 0) { 3381 vput(vp); 3382 break; 3383 } 3384 VOP_UNLOCK(vp, 0); 3385 pwd_chdir(td, vp); 3386 break; 3387 3388 case FFS_SET_DOTDOT: 3389#ifdef DEBUG 3390 if (fsckcmds) { 3391 printf("%s: change .. in cwd from %jd to %jd\n", 3392 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3393 (intmax_t)cmd.size); 3394 } 3395#endif /* DEBUG */ 3396 /* 3397 * First we have to get and lock the parent directory 3398 * to which ".." points. 3399 */ 3400 error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); 3401 if (error) 3402 break; 3403 /* 3404 * Now we get and lock the child directory containing "..". 3405 */ 3406 FILEDESC_SLOCK(td->td_proc->p_fd); 3407 dvp = td->td_proc->p_fd->fd_cdir; 3408 FILEDESC_SUNLOCK(td->td_proc->p_fd); 3409 if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) { 3410 vput(fdvp); 3411 break; 3412 } 3413 dp = VTOI(dvp); 3414 dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ 3415 error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, 3416 DT_DIR, 0); 3417 cache_purge(fdvp); 3418 cache_purge(dvp); 3419 vput(dvp); 3420 vput(fdvp); 3421 break; 3422 3423 case FFS_UNLINK: 3424#ifdef DEBUG 3425 if (fsckcmds) { 3426 char buf[32]; 3427 3428 if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) 3429 strncpy(buf, "Name_too_long", 32); 3430 printf("%s: unlink %s (inode %jd)\n", 3431 mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); 3432 } 3433#endif /* DEBUG */ 3434 /* 3435 * kern_unlinkat will do its own start/finish writes and 3436 * they do not nest, so drop ours here. Setting mp == NULL 3437 * indicates that vn_finished_write is not needed down below. 3438 */ 3439 vn_finished_write(mp); 3440 mp = NULL; 3441 error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, 3442 UIO_USERSPACE, 0, (ino_t)cmd.size); 3443 break; 3444 3445 case FFS_SET_INODE: 3446 if (ump->um_fsckpid != td->td_proc->p_pid) { 3447 error = EPERM; 3448 break; 3449 } 3450#ifdef DEBUG 3451 if (fsckcmds) { 3452 printf("%s: update inode %jd\n", 3453 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3454 } 3455#endif /* DEBUG */ 3456 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3457 break; 3458 AUDIT_ARG_VNODE1(vp); 3459 ip = VTOI(vp); 3460 if (I_IS_UFS1(ip)) 3461 error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, 3462 sizeof(struct ufs1_dinode)); 3463 else 3464 error = copyin((void *)(intptr_t)cmd.size, ip->i_din2, 3465 sizeof(struct ufs2_dinode)); 3466 if (error) { 3467 vput(vp); 3468 break; 3469 } 3470 ip->i_flag |= IN_CHANGE | IN_MODIFIED; 3471 error = ffs_update(vp, 1); 3472 vput(vp); 3473 break; 3474 3475 case FFS_SET_BUFOUTPUT: 3476 if (ump->um_fsckpid != td->td_proc->p_pid) { 3477 error = EPERM; 3478 break; 3479 } 3480 if (ITOUMP(VTOI(vp)) != ump) { 3481 error = EINVAL; 3482 break; 3483 } 3484#ifdef DEBUG 3485 if (fsckcmds) { 3486 printf("%s: %s buffered output for descriptor %jd\n", 3487 mp->mnt_stat.f_mntonname, 3488 cmd.size == 1 ? "enable" : "disable", 3489 (intmax_t)cmd.value); 3490 } 3491#endif /* DEBUG */ 3492 if ((error = getvnode(td, cmd.value, 3493 cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0) 3494 break; 3495 if (vfp->f_vnode->v_type != VCHR) { 3496 fdrop(vfp, td); 3497 error = EINVAL; 3498 break; 3499 } 3500 if (origops == NULL) { 3501 origops = vfp->f_ops; 3502 bcopy((void *)origops, (void *)&bufferedops, 3503 sizeof(bufferedops)); 3504 bufferedops.fo_write = buffered_write; 3505 } 3506 if (cmd.size == 1) 3507 atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, 3508 (uintptr_t)&bufferedops); 3509 else 3510 atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, 3511 (uintptr_t)origops); 3512 fdrop(vfp, td); 3513 break; 3514 3515 default: 3516#ifdef DEBUG 3517 if (fsckcmds) { 3518 printf("Invalid request %d from fsck\n", 3519 oidp->oid_number); 3520 } 3521#endif /* DEBUG */ 3522 error = EINVAL; 3523 break; 3524 3525 } 3526 fdrop(fp, td); 3527 vn_finished_write(mp); 3528 return (error); 3529} 3530 3531/* 3532 * Function to switch a descriptor to use the buffer cache to stage 3533 * its I/O. This is needed so that writes to the filesystem device 3534 * will give snapshots a chance to copy modified blocks for which it 3535 * needs to retain copies. 3536 */ 3537static int 3538buffered_write(fp, uio, active_cred, flags, td) 3539 struct file *fp; 3540 struct uio *uio; 3541 struct ucred *active_cred; 3542 int flags; 3543 struct thread *td; 3544{ 3545 struct vnode *devvp, *vp; 3546 struct inode *ip; 3547 struct buf *bp; 3548 struct fs *fs; 3549 struct filedesc *fdp; 3550 int error; 3551 daddr_t lbn; 3552 3553 /* 3554 * The devvp is associated with the /dev filesystem. To discover 3555 * the filesystem with which the device is associated, we depend 3556 * on the application setting the current directory to a location 3557 * within the filesystem being written. Yes, this is an ugly hack. 3558 */ 3559 devvp = fp->f_vnode; 3560 if (!vn_isdisk(devvp, NULL)) 3561 return (EINVAL); 3562 fdp = td->td_proc->p_fd; 3563 FILEDESC_SLOCK(fdp); 3564 vp = fdp->fd_cdir; 3565 vref(vp); 3566 FILEDESC_SUNLOCK(fdp); 3567 vn_lock(vp, LK_SHARED | LK_RETRY); 3568 /* 3569 * Check that the current directory vnode indeed belongs to 3570 * UFS before trying to dereference UFS-specific v_data fields. 3571 */ 3572 if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) { 3573 vput(vp); 3574 return (EINVAL); 3575 } 3576 ip = VTOI(vp); 3577 if (ITODEVVP(ip) != devvp) { 3578 vput(vp); 3579 return (EINVAL); 3580 } 3581 fs = ITOFS(ip); 3582 vput(vp); 3583 foffset_lock_uio(fp, uio, flags); 3584 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 3585#ifdef DEBUG 3586 if (fsckcmds) { 3587 printf("%s: buffered write for block %jd\n", 3588 fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset)); 3589 } 3590#endif /* DEBUG */ 3591 /* 3592 * All I/O must be contained within a filesystem block, start on 3593 * a fragment boundary, and be a multiple of fragments in length. 3594 */ 3595 if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) || 3596 fragoff(fs, uio->uio_offset) != 0 || 3597 fragoff(fs, uio->uio_resid) != 0) { 3598 error = EINVAL; 3599 goto out; 3600 } 3601 lbn = numfrags(fs, uio->uio_offset); 3602 bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0); 3603 bp->b_flags |= B_RELBUF; 3604 if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) { 3605 brelse(bp); 3606 goto out; 3607 } 3608 error = bwrite(bp); 3609out: 3610 VOP_UNLOCK(devvp, 0); 3611 foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF); 3612 return (error); 3613} 3614