1/*- 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 3 * All rights reserved. 4 * 5 * This software was developed for the FreeBSD Project by Marshall 6 * Kirk McKusick and Network Associates Laboratories, the Security 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 9 * research program 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * Copyright (c) 1982, 1986, 1989, 1993 33 * The Regents of the University of California. All rights reserved. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. 
Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 4. Neither the name of the University nor the names of its contributors 44 * may be used to endorse or promote products derived from this software 45 * without specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 
 *
 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_balloc.c 362050 2020-06-11 11:45:30Z kib $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 *
 *	vp		vnode of the file being allocated to
 *	startoffset	byte offset at which the coming write begins
 *	size		number of bytes to be written at startoffset
 *	cred		credentials charged for any new allocation (quota)
 *	flags		IO_SYNC / IO_EXT and BA_CLRBUF / BA_METAONLY /
 *			BA_UNMAPPED / BA_SEQMASK control flags
 *	bpp		on success *bpp is the buffer covering the block at
 *			startoffset (or the last indirect block when
 *			BA_METAONLY is set); it is set to NULL up front so
 *			it is NULL on every error return
 *
 * Returns 0 on success, otherwise an error number: EOPNOTSUPP for IO_EXT
 * (UFS1 has no external-attribute blocks), EFBIG for a negative logical
 * block number, or an error propagated from the block allocator or
 * buffer I/O.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	/* allociblk[]/lbns[] record every block allocated in this call so
	 * the "fail:" path below can unwind them on error. */
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* "size" now measures from the start of the logical block to the
	 * end of the caller's write. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * NOTE(review): UFS_LOCK is taken here with no
			 * visible unlock; presumably ffs_realloccg()/
			 * ffs_alloc() consume and release it — confirm
			 * against ffs_alloc.c.
			 */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_UPDATE |
			    IN_IBLKDATA;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and fully covered by the file
		 * size: just read it and hand it back. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment (possibly moving it). */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: allocate a fragment if this is the
			 * last partial block, otherwise a full block. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	/* TDP_INBDFLUSH is held across the whole indirect path and is
	 * restored on every return/fail exit below. */
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		/* Prefer the slot right after this block for the next
		 * allocation. */
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		/* Record the block for the failure unwind. */
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/* On the first failure only, ask softdep to flush
			 * pending freed blocks and retry the whole walk. */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			/* Rate-limit the "filesystem full" console spam. */
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Link the new child into its parent indirect block. */
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			/* Same one-shot softdep reclaim/retry as above. */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	    blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): the brelse() below is unreachable
			 * assuming panic() does not return.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		/* Sanity check: no buffer may still exist for a block we
		 * are about to free. */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 *
 * Arguments and return values are as for ffs_balloc_ufs1() above, with
 * one difference: IO_EXT is supported here and allocates in the inode's
 * external-attribute block area (di_extb[], addressed with negative
 * logical block numbers -1 - lbn); lbn must then be below NXADDR or
 * EFBIG is returned.
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	/* allociblk[]/lbns[] record every block allocated in this call so
	 * the "fail:" path below can unwind them on error. */
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* "size" now measures from the start of the logical block to the
	 * end of the caller's write. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, flags, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_IBLKDATA;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				/* Grow the fragment (possibly moving it). */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: allocate a fragment if this is the
			 * last partial block, otherwise a full block. */
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_IBLKDATA;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
			    flags, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_UPDATE |
			    IN_IBLKDATA;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/* Block already allocated and fully covered by the file
		 * size: just read it and hand it back. */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
			    gbflags, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread_gb(vp, lbn, osize, NOCRED,
				    gbflags, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment (possibly moving it). */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/* No block yet: allocate a fragment if this is the
			 * last partial block, otherwise a full block. */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
			    &dp->di_db[0]), nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	/* TDP_INBDFLUSH is held across the whole indirect path and is
	 * restored on every return/fail exit below. */
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
		    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		/* Prefer the slot right after this block for the next
		 * allocation. */
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		/* Record the block for the failure unwind. */
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE | IN_IBLKDATA;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
			    (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/* On the first failure only, ask softdep to flush
			 * pending freed blocks and retry the whole walk. */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			/* Rate-limit the "filesystem full" console spam. */
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
		    GB_UNMAPPED);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Link the new child into its parent indirect block. */
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			/* Same one-shot softdep reclaim/retry as above. */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
			    NOCRED, gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	    blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch2 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): the brelse() below is unreachable
			 * assuming panic() does not return.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		if (blkp == allociblk)
			lbns_remfree = lbns;
		/* Sanity check: no buffer may still exist for a block we
		 * are about to free. */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie2 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}