1/* $NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $ */ 2 3/*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Konrad E. Schroder <perseant@hhhh.org>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32/*- 33 * Copyright (c) 1991, 1993, 1994 34 * The Regents of the University of California. All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. 
Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 
59 * 60 * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95 61 */ 62 63#include <sys/cdefs.h> 64__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $"); 65 66#ifndef LFS 67# define LFS /* for prototypes in syscallargs.h */ 68#endif 69 70#include <sys/param.h> 71#include <sys/systm.h> 72#include <sys/proc.h> 73#include <sys/buf.h> 74#include <sys/mount.h> 75#include <sys/vnode.h> 76#include <sys/kernel.h> 77#include <sys/kauth.h> 78#include <sys/syscallargs.h> 79 80#include <ufs/ufs/inode.h> 81#include <ufs/ufs/ufsmount.h> 82#include <ufs/ufs/ufs_extern.h> 83 84#include <ufs/lfs/lfs.h> 85#include <ufs/lfs/lfs_extern.h> 86 87struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *); 88int lfs_fasthashget(dev_t, ino_t, struct vnode **); 89 90pid_t lfs_cleaner_pid = 0; 91 92/* 93 * sys_lfs_markv: 94 * 95 * This will mark inodes and blocks dirty, so they are written into the log. 96 * It will block until all the blocks have been written. The segment create 97 * time passed in the block_info and inode_info structures is used to decide 98 * if the data is valid for each block (in case some process dirtied a block 99 * or inode that is being cleaned between the determination that a block is 100 * live and the lfs_markv call). 101 * 102 * 0 on success 103 * -1/errno is return on error. 
104 */ 105#ifdef USE_64BIT_SYSCALLS 106int 107sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) 108{ 109 /* { 110 syscallarg(fsid_t *) fsidp; 111 syscallarg(struct block_info *) blkiov; 112 syscallarg(int) blkcnt; 113 } */ 114 BLOCK_INFO *blkiov; 115 int blkcnt, error; 116 fsid_t fsid; 117 struct lfs *fs; 118 struct mount *mntp; 119 120 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 121 NULL)) != 0) 122 return (error); 123 124 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 125 return (error); 126 127 if ((mntp = vfs_getvfs(fsidp)) == NULL) 128 return (ENOENT); 129 fs = VFSTOUFS(mntp)->um_lfs; 130 131 blkcnt = SCARG(uap, blkcnt); 132 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) 133 return (EINVAL); 134 135 KERNEL_LOCK(1, NULL); 136 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 137 if ((error = copyin(SCARG(uap, blkiov), blkiov, 138 blkcnt * sizeof(BLOCK_INFO))) != 0) 139 goto out; 140 141 if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0) 142 copyout(blkiov, SCARG(uap, blkiov), 143 blkcnt * sizeof(BLOCK_INFO)); 144 out: 145 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 146 KERNEL_UNLOCK_ONE(NULL); 147 return error; 148} 149#else 150int 151sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) 152{ 153 /* { 154 syscallarg(fsid_t *) fsidp; 155 syscallarg(struct block_info *) blkiov; 156 syscallarg(int) blkcnt; 157 } */ 158 BLOCK_INFO *blkiov; 159 BLOCK_INFO_15 *blkiov15; 160 int i, blkcnt, error; 161 fsid_t fsid; 162 struct lfs *fs; 163 struct mount *mntp; 164 165 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 166 NULL)) != 0) 167 return (error); 168 169 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 170 return (error); 171 172 if ((mntp = vfs_getvfs(&fsid)) == NULL) 173 return (ENOENT); 174 fs = VFSTOUFS(mntp)->um_lfs; 175 176 blkcnt = SCARG(uap, blkcnt); 177 if ((u_int) blkcnt > 
LFS_MARKV_MAXBLKCNT) 178 return (EINVAL); 179 180 KERNEL_LOCK(1, NULL); 181 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 182 blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); 183 if ((error = copyin(SCARG(uap, blkiov), blkiov15, 184 blkcnt * sizeof(BLOCK_INFO_15))) != 0) 185 goto out; 186 187 for (i = 0; i < blkcnt; i++) { 188 blkiov[i].bi_inode = blkiov15[i].bi_inode; 189 blkiov[i].bi_lbn = blkiov15[i].bi_lbn; 190 blkiov[i].bi_daddr = blkiov15[i].bi_daddr; 191 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; 192 blkiov[i].bi_version = blkiov15[i].bi_version; 193 blkiov[i].bi_bp = blkiov15[i].bi_bp; 194 blkiov[i].bi_size = blkiov15[i].bi_size; 195 } 196 197 if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) { 198 for (i = 0; i < blkcnt; i++) { 199 blkiov15[i].bi_inode = blkiov[i].bi_inode; 200 blkiov15[i].bi_lbn = blkiov[i].bi_lbn; 201 blkiov15[i].bi_daddr = blkiov[i].bi_daddr; 202 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; 203 blkiov15[i].bi_version = blkiov[i].bi_version; 204 blkiov15[i].bi_bp = blkiov[i].bi_bp; 205 blkiov15[i].bi_size = blkiov[i].bi_size; 206 } 207 copyout(blkiov15, SCARG(uap, blkiov), 208 blkcnt * sizeof(BLOCK_INFO_15)); 209 } 210 out: 211 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 212 lfs_free(fs, blkiov15, LFS_NB_BLKIOV); 213 KERNEL_UNLOCK_ONE(NULL); 214 return error; 215} 216#endif 217 218#define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS) 219 220int 221lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, 222 int blkcnt) 223{ 224 BLOCK_INFO *blkp; 225 IFILE *ifp; 226 struct buf *bp; 227 struct inode *ip = NULL; 228 struct lfs *fs; 229 struct mount *mntp; 230 struct vnode *vp = NULL; 231 ino_t lastino; 232 daddr_t b_daddr, v_daddr; 233 int cnt, error; 234 int do_again = 0; 235 int numrefed = 0; 236 ino_t maxino; 237 size_t obsize; 238 239 /* number of blocks/inodes that we have already bwrite'ed */ 240 int nblkwritten, ninowritten; 241 242 if ((mntp = vfs_getvfs(fsidp)) == NULL) 
243 return (ENOENT); 244 245 fs = VFSTOUFS(mntp)->um_lfs; 246 247 if (fs->lfs_ronly) 248 return EROFS; 249 250 maxino = (fragstoblks(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks) - 251 fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; 252 253 cnt = blkcnt; 254 255 if ((error = vfs_busy(mntp, NULL)) != 0) 256 return (error); 257 258 /* 259 * This seglock is just to prevent the fact that we might have to sleep 260 * from allowing the possibility that our blocks might become 261 * invalid. 262 * 263 * It is also important to note here that unless we specify SEGM_CKP, 264 * any Ifile blocks that we might be asked to clean will never get 265 * to the disk. 266 */ 267 lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); 268 269 /* Mark blocks/inodes dirty. */ 270 error = 0; 271 272 /* these were inside the initialization for the for loop */ 273 v_daddr = LFS_UNUSED_DADDR; 274 lastino = LFS_UNUSED_INUM; 275 nblkwritten = ninowritten = 0; 276 for (blkp = blkiov; cnt--; ++blkp) 277 { 278 /* Bounds-check incoming data, avoid panic for failed VGET */ 279 if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) { 280 error = EINVAL; 281 goto err3; 282 } 283 /* 284 * Get the IFILE entry (only once) and see if the file still 285 * exists. 286 */ 287 if (lastino != blkp->bi_inode) { 288 /* 289 * Finish the old file, if there was one. The presence 290 * of a usable vnode in vp is signaled by a valid v_daddr. 291 */ 292 if (v_daddr != LFS_UNUSED_DADDR) { 293 lfs_vunref(vp); 294 numrefed--; 295 } 296 297 /* 298 * Start a new file 299 */ 300 lastino = blkp->bi_inode; 301 if (blkp->bi_inode == LFS_IFILE_INUM) 302 v_daddr = fs->lfs_idaddr; 303 else { 304 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); 305 /* XXX fix for force write */ 306 v_daddr = ifp->if_daddr; 307 brelse(bp, 0); 308 } 309 if (v_daddr == LFS_UNUSED_DADDR) 310 continue; 311 312 /* Get the vnode/inode. */ 313 error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr, 314 &vp, 315 (blkp->bi_lbn == LFS_UNUSED_LBN 316 ? 
blkp->bi_bp 317 : NULL)); 318 319 if (!error) { 320 numrefed++; 321 } 322 if (error) { 323 DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget" 324 " failed with %d (ino %d, segment %d)\n", 325 error, blkp->bi_inode, 326 dtosn(fs, blkp->bi_daddr))); 327 /* 328 * If we got EAGAIN, that means that the 329 * Inode was locked. This is 330 * recoverable: just clean the rest of 331 * this segment, and let the cleaner try 332 * again with another. (When the 333 * cleaner runs again, this segment will 334 * sort high on the list, since it is 335 * now almost entirely empty.) But, we 336 * still set v_daddr = LFS_UNUSED_ADDR 337 * so as not to test this over and over 338 * again. 339 */ 340 if (error == EAGAIN) { 341 error = 0; 342 do_again++; 343 } 344#ifdef DIAGNOSTIC 345 else if (error != ENOENT) 346 panic("lfs_markv VFS_VGET FAILED"); 347#endif 348 /* lastino = LFS_UNUSED_INUM; */ 349 v_daddr = LFS_UNUSED_DADDR; 350 vp = NULL; 351 ip = NULL; 352 continue; 353 } 354 ip = VTOI(vp); 355 ninowritten++; 356 } else if (v_daddr == LFS_UNUSED_DADDR) { 357 /* 358 * This can only happen if the vnode is dead (or 359 * in any case we can't get it...e.g., it is 360 * inlocked). Keep going. 361 */ 362 continue; 363 } 364 365 /* Past this point we are guaranteed that vp, ip are valid. */ 366 367 /* Can't clean VU_DIROP directories in case of truncation */ 368 /* XXX - maybe we should mark removed dirs specially? */ 369 if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) { 370 do_again++; 371 continue; 372 } 373 374 /* If this BLOCK_INFO didn't contain a block, keep going. 
*/ 375 if (blkp->bi_lbn == LFS_UNUSED_LBN) { 376 /* XXX need to make sure that the inode gets written in this case */ 377 /* XXX but only write the inode if it's the right one */ 378 if (blkp->bi_inode != LFS_IFILE_INUM) { 379 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); 380 if (ifp->if_daddr == blkp->bi_daddr) { 381 mutex_enter(&lfs_lock); 382 LFS_SET_UINO(ip, IN_CLEANING); 383 mutex_exit(&lfs_lock); 384 } 385 brelse(bp, 0); 386 } 387 continue; 388 } 389 390 b_daddr = 0; 391 if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) || 392 dbtofsb(fs, b_daddr) != blkp->bi_daddr) 393 { 394 if (dtosn(fs, dbtofsb(fs, b_daddr)) == 395 dtosn(fs, blkp->bi_daddr)) 396 { 397 DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n", 398 (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr))); 399 } 400 do_again++; 401 continue; 402 } 403 404 /* 405 * Check block sizes. The blocks being cleaned come from 406 * disk, so they should have the same size as their on-disk 407 * counterparts. 408 */ 409 if (blkp->bi_lbn >= 0) 410 obsize = blksize(fs, ip, blkp->bi_lbn); 411 else 412 obsize = fs->lfs_bsize; 413 /* Check for fragment size change */ 414 if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) { 415 obsize = ip->i_lfs_fragsize[blkp->bi_lbn]; 416 } 417 if (obsize != blkp->bi_size) { 418 DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong" 419 " size (%ld != %d), try again\n", 420 blkp->bi_inode, (long long)blkp->bi_lbn, 421 (long) obsize, blkp->bi_size)); 422 do_again++; 423 continue; 424 } 425 426 /* 427 * If we get to here, then we are keeping the block. If 428 * it is an indirect block, we want to actually put it 429 * in the buffer cache so that it can be updated in the 430 * finish_meta section. If it's not, we need to 431 * allocate a fake buffer so that writeseg can perform 432 * the copyin and write the buffer. 
433 */ 434 if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) { 435 /* Data Block */ 436 bp = lfs_fakebuf(fs, vp, blkp->bi_lbn, 437 blkp->bi_size, blkp->bi_bp); 438 /* Pretend we used bread() to get it */ 439 bp->b_blkno = fsbtodb(fs, blkp->bi_daddr); 440 } else { 441 /* Indirect block or ifile */ 442 if (blkp->bi_size != fs->lfs_bsize && 443 ip->i_number != LFS_IFILE_INUM) 444 panic("lfs_markv: partial indirect block?" 445 " size=%d\n", blkp->bi_size); 446 bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0); 447 if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) { 448 /* 449 * The block in question was not found 450 * in the cache; i.e., the block that 451 * getblk() returned is empty. So, we 452 * can (and should) copy in the 453 * contents, because we've already 454 * determined that this was the right 455 * version of this block on disk. 456 * 457 * And, it can't have changed underneath 458 * us, because we have the segment lock. 459 */ 460 error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size); 461 if (error) 462 goto err2; 463 } 464 } 465 if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0) 466 goto err2; 467 468 nblkwritten++; 469 /* 470 * XXX should account indirect blocks and ifile pages as well 471 */ 472 if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode)) 473 > LFS_MARKV_MAX_BLOCKS) { 474 DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n", 475 nblkwritten, ninowritten)); 476 lfs_segwrite(mntp, SEGM_CLEAN); 477 nblkwritten = ninowritten = 0; 478 } 479 } 480 481 /* 482 * Finish the old file, if there was one 483 */ 484 if (v_daddr != LFS_UNUSED_DADDR) { 485 lfs_vunref(vp); 486 numrefed--; 487 } 488 489#ifdef DIAGNOSTIC 490 if (numrefed != 0) 491 panic("lfs_markv: numrefed=%d", numrefed); 492#endif 493 DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n", 494 nblkwritten, ninowritten)); 495 496 /* 497 * The last write has to be SEGM_SYNC, because of calling semantics. 
498 * It also has to be SEGM_CKP, because otherwise we could write 499 * over the newly cleaned data contained in a checkpoint, and then 500 * we'd be unhappy at recovery time. 501 */ 502 lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); 503 504 lfs_segunlock(fs); 505 506 vfs_unbusy(mntp, false, NULL); 507 if (error) 508 return (error); 509 else if (do_again) 510 return EAGAIN; 511 512 return 0; 513 514err2: 515 DLOG((DLOG_CLEAN, "lfs_markv err2\n")); 516 517 /* 518 * XXX we're here because copyin() failed. 519 * XXX it means that we can't trust the cleanerd. too bad. 520 * XXX how can we recover from this? 521 */ 522 523err3: 524 KERNEL_UNLOCK_ONE(NULL); 525 /* 526 * XXX should do segwrite here anyway? 527 */ 528 529 if (v_daddr != LFS_UNUSED_DADDR) { 530 lfs_vunref(vp); 531 --numrefed; 532 } 533 534 lfs_segunlock(fs); 535 vfs_unbusy(mntp, false, NULL); 536#ifdef DIAGNOSTIC 537 if (numrefed != 0) 538 panic("lfs_markv: numrefed=%d", numrefed); 539#endif 540 541 return (error); 542} 543 544/* 545 * sys_lfs_bmapv: 546 * 547 * This will fill in the current disk address for arrays of blocks. 548 * 549 * 0 on success 550 * -1/errno is return on error. 
551 */ 552#ifdef USE_64BIT_SYSCALLS 553int 554sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) 555{ 556 /* { 557 syscallarg(fsid_t *) fsidp; 558 syscallarg(struct block_info *) blkiov; 559 syscallarg(int) blkcnt; 560 } */ 561 BLOCK_INFO *blkiov; 562 int blkcnt, error; 563 fsid_t fsid; 564 struct lfs *fs; 565 struct mount *mntp; 566 567 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 568 NULL)) != 0) 569 return (error); 570 571 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 572 return (error); 573 574 if ((mntp = vfs_getvfs(&fsid)) == NULL) 575 return (ENOENT); 576 fs = VFSTOUFS(mntp)->um_lfs; 577 578 blkcnt = SCARG(uap, blkcnt); 579 if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) 580 return (EINVAL); 581 KERNEL_LOCK(1, NULL); 582 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 583 if ((error = copyin(SCARG(uap, blkiov), blkiov, 584 blkcnt * sizeof(BLOCK_INFO))) != 0) 585 goto out; 586 587 if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) 588 copyout(blkiov, SCARG(uap, blkiov), 589 blkcnt * sizeof(BLOCK_INFO)); 590 out: 591 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 592 KERNEL_UNLOCK_ONE(NULL); 593 return error; 594} 595#else 596int 597sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) 598{ 599 /* { 600 syscallarg(fsid_t *) fsidp; 601 syscallarg(struct block_info *) blkiov; 602 syscallarg(int) blkcnt; 603 } */ 604 BLOCK_INFO *blkiov; 605 BLOCK_INFO_15 *blkiov15; 606 int i, blkcnt, error; 607 fsid_t fsid; 608 struct lfs *fs; 609 struct mount *mntp; 610 611 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 612 NULL)) != 0) 613 return (error); 614 615 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 616 return (error); 617 618 if ((mntp = vfs_getvfs(&fsid)) == NULL) 619 return (ENOENT); 620 fs = VFSTOUFS(mntp)->um_lfs; 621 622 blkcnt = SCARG(uap, blkcnt); 623 if ((size_t) blkcnt > 
SIZE_T_MAX / sizeof(BLOCK_INFO)) 624 return (EINVAL); 625 KERNEL_LOCK(1, NULL); 626 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 627 blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); 628 if ((error = copyin(SCARG(uap, blkiov), blkiov15, 629 blkcnt * sizeof(BLOCK_INFO_15))) != 0) 630 goto out; 631 632 for (i = 0; i < blkcnt; i++) { 633 blkiov[i].bi_inode = blkiov15[i].bi_inode; 634 blkiov[i].bi_lbn = blkiov15[i].bi_lbn; 635 blkiov[i].bi_daddr = blkiov15[i].bi_daddr; 636 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; 637 blkiov[i].bi_version = blkiov15[i].bi_version; 638 blkiov[i].bi_bp = blkiov15[i].bi_bp; 639 blkiov[i].bi_size = blkiov15[i].bi_size; 640 } 641 642 if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) { 643 for (i = 0; i < blkcnt; i++) { 644 blkiov15[i].bi_inode = blkiov[i].bi_inode; 645 blkiov15[i].bi_lbn = blkiov[i].bi_lbn; 646 blkiov15[i].bi_daddr = blkiov[i].bi_daddr; 647 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; 648 blkiov15[i].bi_version = blkiov[i].bi_version; 649 blkiov15[i].bi_bp = blkiov[i].bi_bp; 650 blkiov15[i].bi_size = blkiov[i].bi_size; 651 } 652 copyout(blkiov15, SCARG(uap, blkiov), 653 blkcnt * sizeof(BLOCK_INFO_15)); 654 } 655 out: 656 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 657 lfs_free(fs, blkiov15, LFS_NB_BLKIOV); 658 KERNEL_UNLOCK_ONE(NULL); 659 return error; 660} 661#endif 662 663int 664lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) 665{ 666 BLOCK_INFO *blkp; 667 IFILE *ifp; 668 struct buf *bp; 669 struct inode *ip = NULL; 670 struct lfs *fs; 671 struct mount *mntp; 672 struct ufsmount *ump; 673 struct vnode *vp; 674 ino_t lastino; 675 daddr_t v_daddr; 676 int cnt, error; 677 int numrefed = 0; 678 679 lfs_cleaner_pid = p->p_pid; 680 681 if ((mntp = vfs_getvfs(fsidp)) == NULL) 682 return (ENOENT); 683 684 ump = VFSTOUFS(mntp); 685 if ((error = vfs_busy(mntp, NULL)) != 0) 686 return (error); 687 688 cnt = blkcnt; 689 690 fs = 
VFSTOUFS(mntp)->um_lfs; 691 692 error = 0; 693 694 /* these were inside the initialization for the for loop */ 695 v_daddr = LFS_UNUSED_DADDR; 696 lastino = LFS_UNUSED_INUM; 697 for (blkp = blkiov; cnt--; ++blkp) 698 { 699 /* 700 * Get the IFILE entry (only once) and see if the file still 701 * exists. 702 */ 703 if (lastino != blkp->bi_inode) { 704 /* 705 * Finish the old file, if there was one. The presence 706 * of a usable vnode in vp is signaled by a valid 707 * v_daddr. 708 */ 709 if (v_daddr != LFS_UNUSED_DADDR) { 710 lfs_vunref(vp); 711 if (VTOI(vp)->i_lfs_iflags & LFSI_BMAP) 712 vrecycle(vp, NULL, NULL); 713 numrefed--; 714 } 715 716 /* 717 * Start a new file 718 */ 719 lastino = blkp->bi_inode; 720 if (blkp->bi_inode == LFS_IFILE_INUM) 721 v_daddr = fs->lfs_idaddr; 722 else { 723 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); 724 v_daddr = ifp->if_daddr; 725 brelse(bp, 0); 726 } 727 if (v_daddr == LFS_UNUSED_DADDR) { 728 blkp->bi_daddr = LFS_UNUSED_DADDR; 729 continue; 730 } 731 /* 732 * A regular call to VFS_VGET could deadlock 733 * here. Instead, we try an unlocked access. 734 */ 735 mutex_enter(&ufs_ihash_lock); 736 vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode); 737 if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) { 738 ip = VTOI(vp); 739 mutex_enter(vp->v_interlock); 740 mutex_exit(&ufs_ihash_lock); 741 if (lfs_vref(vp)) { 742 v_daddr = LFS_UNUSED_DADDR; 743 continue; 744 } 745 numrefed++; 746 } else { 747 mutex_exit(&ufs_ihash_lock); 748 /* 749 * Don't VFS_VGET if we're being unmounted, 750 * since we hold vfs_busy(). 
751 */ 752 if (mntp->mnt_iflag & IMNT_UNMOUNT) { 753 v_daddr = LFS_UNUSED_DADDR; 754 continue; 755 } 756 error = VFS_VGET(mntp, blkp->bi_inode, &vp); 757 if (error) { 758 DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino" 759 "%d failed with %d", 760 blkp->bi_inode,error)); 761 v_daddr = LFS_UNUSED_DADDR; 762 continue; 763 } else { 764 KASSERT(VOP_ISLOCKED(vp)); 765 VTOI(vp)->i_lfs_iflags |= LFSI_BMAP; 766 VOP_UNLOCK(vp); 767 numrefed++; 768 } 769 } 770 ip = VTOI(vp); 771 } else if (v_daddr == LFS_UNUSED_DADDR) { 772 /* 773 * This can only happen if the vnode is dead. 774 * Keep going. Note that we DO NOT set the 775 * bi_addr to anything -- if we failed to get 776 * the vnode, for example, we want to assume 777 * conservatively that all of its blocks *are* 778 * located in the segment in question. 779 * lfs_markv will throw them out if we are 780 * wrong. 781 */ 782 /* blkp->bi_daddr = LFS_UNUSED_DADDR; */ 783 continue; 784 } 785 786 /* Past this point we are guaranteed that vp, ip are valid. */ 787 788 if (blkp->bi_lbn == LFS_UNUSED_LBN) { 789 /* 790 * We just want the inode address, which is 791 * conveniently in v_daddr. 792 */ 793 blkp->bi_daddr = v_daddr; 794 } else { 795 daddr_t bi_daddr; 796 797 /* XXX ondisk32 */ 798 error = VOP_BMAP(vp, blkp->bi_lbn, NULL, 799 &bi_daddr, NULL); 800 if (error) 801 { 802 blkp->bi_daddr = LFS_UNUSED_DADDR; 803 continue; 804 } 805 blkp->bi_daddr = dbtofsb(fs, bi_daddr); 806 /* Fill in the block size, too */ 807 if (blkp->bi_lbn >= 0) 808 blkp->bi_size = blksize(fs, ip, blkp->bi_lbn); 809 else 810 blkp->bi_size = fs->lfs_bsize; 811 } 812 } 813 814 /* 815 * Finish the old file, if there was one. The presence 816 * of a usable vnode in vp is signaled by a valid v_daddr. 817 */ 818 if (v_daddr != LFS_UNUSED_DADDR) { 819 lfs_vunref(vp); 820 /* Recycle as above. 
*/ 821 if (ip->i_lfs_iflags & LFSI_BMAP) 822 vrecycle(vp, NULL, NULL); 823 numrefed--; 824 } 825 826#ifdef DIAGNOSTIC 827 if (numrefed != 0) 828 panic("lfs_bmapv: numrefed=%d", numrefed); 829#endif 830 831 vfs_unbusy(mntp, false, NULL); 832 833 return 0; 834} 835 836/* 837 * sys_lfs_segclean: 838 * 839 * Mark the segment clean. 840 * 841 * 0 on success 842 * -1/errno is return on error. 843 */ 844int 845sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval) 846{ 847 /* { 848 syscallarg(fsid_t *) fsidp; 849 syscallarg(u_long) segment; 850 } */ 851 struct lfs *fs; 852 struct mount *mntp; 853 fsid_t fsid; 854 int error; 855 unsigned long segnum; 856 857 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 858 NULL)) != 0) 859 return (error); 860 861 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 862 return (error); 863 if ((mntp = vfs_getvfs(&fsid)) == NULL) 864 return (ENOENT); 865 866 fs = VFSTOUFS(mntp)->um_lfs; 867 segnum = SCARG(uap, segment); 868 869 if ((error = vfs_busy(mntp, NULL)) != 0) 870 return (error); 871 872 KERNEL_LOCK(1, NULL); 873 lfs_seglock(fs, SEGM_PROT); 874 error = lfs_do_segclean(fs, segnum); 875 lfs_segunlock(fs); 876 KERNEL_UNLOCK_ONE(NULL); 877 vfs_unbusy(mntp, false, NULL); 878 return error; 879} 880 881/* 882 * Actually mark the segment clean. 883 * Must be called with the segment lock held. 
884 */ 885int 886lfs_do_segclean(struct lfs *fs, unsigned long segnum) 887{ 888 extern int lfs_dostats; 889 struct buf *bp; 890 CLEANERINFO *cip; 891 SEGUSE *sup; 892 893 if (dtosn(fs, fs->lfs_curseg) == segnum) { 894 return (EBUSY); 895 } 896 897 LFS_SEGENTRY(sup, fs, segnum, bp); 898 if (sup->su_nbytes) { 899 DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" 900 " %d live bytes\n", segnum, sup->su_nbytes)); 901 brelse(bp, 0); 902 return (EBUSY); 903 } 904 if (sup->su_flags & SEGUSE_ACTIVE) { 905 DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" 906 " segment is active\n", segnum)); 907 brelse(bp, 0); 908 return (EBUSY); 909 } 910 if (!(sup->su_flags & SEGUSE_DIRTY)) { 911 DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" 912 " segment is already clean\n", segnum)); 913 brelse(bp, 0); 914 return (EALREADY); 915 } 916 917 fs->lfs_avail += segtod(fs, 1); 918 if (sup->su_flags & SEGUSE_SUPERBLOCK) 919 fs->lfs_avail -= btofsb(fs, LFS_SBPAD); 920 if (fs->lfs_version > 1 && segnum == 0 && 921 fs->lfs_start < btofsb(fs, LFS_LABELPAD)) 922 fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start; 923 mutex_enter(&lfs_lock); 924 fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) + 925 btofsb(fs, sup->su_ninos * fs->lfs_ibsize); 926 fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) + 927 btofsb(fs, sup->su_ninos * fs->lfs_ibsize); 928 if (fs->lfs_dmeta < 0) 929 fs->lfs_dmeta = 0; 930 mutex_exit(&lfs_lock); 931 sup->su_flags &= ~SEGUSE_DIRTY; 932 LFS_WRITESEGENTRY(sup, fs, segnum, bp); 933 934 LFS_CLEANERINFO(cip, fs, bp); 935 ++cip->clean; 936 --cip->dirty; 937 fs->lfs_nclean = cip->clean; 938 cip->bfree = fs->lfs_bfree; 939 mutex_enter(&lfs_lock); 940 cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail; 941 wakeup(&fs->lfs_avail); 942 mutex_exit(&lfs_lock); 943 (void) LFS_BWRITE_LOG(bp); 944 945 if (lfs_dostats) 946 ++lfs_stats.segs_reclaimed; 947 948 return (0); 949} 950 951/* 952 * This will block until a 
segment in file system fsid is written. A timeout 953 * in milliseconds may be specified which will awake the cleaner automatically. 954 * An fsid of -1 means any file system, and a timeout of 0 means forever. 955 */ 956int 957lfs_segwait(fsid_t *fsidp, struct timeval *tv) 958{ 959 struct mount *mntp; 960 void *addr; 961 u_long timeout; 962 int error; 963 964 KERNEL_LOCK(1, NULL); 965 if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL) 966 addr = &lfs_allclean_wakeup; 967 else 968 addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg; 969 /* 970 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}! 971 * XXX IS THAT WHAT IS INTENDED? 972 */ 973 timeout = tvtohz(tv); 974 error = tsleep(addr, PCATCH | PVFS, "segment", timeout); 975 KERNEL_UNLOCK_ONE(NULL); 976 return (error == ERESTART ? EINTR : 0); 977} 978 979/* 980 * sys_lfs_segwait: 981 * 982 * System call wrapper around lfs_segwait(). 983 * 984 * 0 on success 985 * 1 on timeout 986 * -1/errno is return on error. 987 */ 988int 989sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap, 990 register_t *retval) 991{ 992 /* { 993 syscallarg(fsid_t *) fsidp; 994 syscallarg(struct timeval *) tv; 995 } */ 996 struct timeval atv; 997 fsid_t fsid; 998 int error; 999 1000 /* XXX need we be su to segwait? */ 1001 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 1002 NULL)) != 0) 1003 return (error); 1004 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 1005 return (error); 1006 1007 if (SCARG(uap, tv)) { 1008 error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval)); 1009 if (error) 1010 return (error); 1011 if (itimerfix(&atv)) 1012 return (EINVAL); 1013 } else /* NULL or invalid */ 1014 atv.tv_sec = atv.tv_usec = 0; 1015 return lfs_segwait(&fsid, &atv); 1016} 1017 1018/* 1019 * VFS_VGET call specialized for the cleaner. The cleaner already knows the 1020 * daddr from the ifile, so don't look it up again. 
 * If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * we lfs_vref, and it is the caller's responsibility to lfs_vunref
 * when finished.
 */

/*
 * lfs_fasthashget:
 *
 * Look up (dev, ino) in the UFS inode hash and, if a vnode is found, try
 * to take a reference on it with lfs_vref().
 *
 * Returns 0 and stores the vnode in *vpp (NULL when the inode is not
 * hashed).  Returns EAGAIN, leaving *vpp unwritten, when the vnode has
 * VI_XLOCK set or lfs_vref() fails.
 */
int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{
	struct vnode *vp;

	mutex_enter(&ufs_ihash_lock);
	if ((vp = ufs_ihashlookup(dev, ino)) != NULL) {
		/* Take the vnode interlock before letting go of the hash. */
		mutex_enter(vp->v_interlock);
		mutex_exit(&ufs_ihash_lock);
		if (vp->v_iflag & VI_XLOCK) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n",
			      ino));
			lfs_stats.clean_vnlocked++;
			mutex_exit(vp->v_interlock);
			return EAGAIN;
		}
		/*
		 * NOTE(review): the interlock is not released here on either
		 * outcome; presumably lfs_vref() consumes it -- confirm.
		 */
		if (lfs_vref(vp)) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
			      " for ino %d\n", ino));
			lfs_stats.clean_inlocked++;
			return EAGAIN;
		}
	} else {
		mutex_exit(&ufs_ihash_lock);
	}
	*vpp = vp;

	return (0);
}

/*
 * lfs_fastvget:
 *
 * Obtain a referenced vnode for inode ino on mount mp.  If the inode is
 * already hashed, the result of lfs_fasthashget() is returned as is.
 * Otherwise a new vnode is created and its dinode is filled either from
 * dinp (a userland address, read with copyin) or, when dinp is NULL, from
 * the inode block read from disk at daddr.
 *
 * On success returns 0 with the vnode in *vpp.  On failure returns the
 * error; failures after the first hash lookup set *vpp to NULL, except
 * when the second lfs_fasthashget() call fails.  Returns EDEADLK if a new
 * vnode would be needed while the file system is being unmounted.
 */
int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp,
	     struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.	 This prevents possible problems with roll-forward.
	 */
	mutex_enter(&lfs_lock);
	while (fs->lfs_flags & LFS_NOTYET) {
		mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
			&lfs_lock);
	}
	mutex_exit(&lfs_lock);

	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	/* Fast path: the inode is already in the hash (or is unavailable). */
	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	/*
	 * getnewvnode(9) will call vfs_busy, which will block if the
	 * filesystem is being unmounted; but umount(9) is waiting for
	 * us because we're already holding the fs busy.
	 * XXXMP
	 */
	if (mp->mnt_iflag & IMNT_UNMOUNT) {
		*vpp = NULL;
		return EDEADLK;
	}
	error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp);
	if (error) {
		*vpp = NULL;
		return (error);
	}

	/*
	 * Re-check the hash under ufs_hashlock: the inode may have been
	 * entered while the new vnode was being allocated.  If so, give the
	 * fresh vnode back and use the hashed one.
	 */
	mutex_enter(&ufs_hashlock);
	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL) {
		mutex_exit(&ufs_hashlock);
		ungetnewvnode(vp);
		return (error);
	}

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	mutex_exit(&ufs_hashlock);

#ifdef notyet
	/* Not found in the cache => this vnode was loaded only for cleaning. */
	ip->i_lfs_iflags |= LFSI_BMAP;
#endif

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		/* The caller supplied the dinode; dinp is a userland address. */
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
			      " for ino %d\n", ino));
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			VOP_UNLOCK(vp);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		/* No dinode supplied: read the inode block at daddr. */
		retries = 0;
	    again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
			      NOCRED, 0, &bp);
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
			      error));
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			VOP_UNLOCK(vp);
			lfs_vunref(vp);
			brelse(bp, 0);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			brelse(bp, BC_INVAL);
			++retries;
			/* Give up (panic) after LFS_IFIND_RETRIES re-reads. */
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
			      " retrying...\n"));
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp, 0);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	/* The vnode is handed back unlocked. */
	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
1210 */ 1211struct buf * 1212lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr) 1213{ 1214 struct buf *bp; 1215 int error; 1216 1217 KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM); 1218 1219 bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN); 1220 error = copyin(uaddr, bp->b_data, size); 1221 if (error) { 1222 lfs_freebuf(fs, bp); 1223 return NULL; 1224 } 1225 KDASSERT(bp->b_iodone == lfs_callback); 1226 1227#if 0 1228 mutex_enter(&lfs_lock); 1229 ++fs->lfs_iocount; 1230 mutex_exit(&lfs_lock); 1231#endif 1232 bp->b_bufsize = size; 1233 bp->b_bcount = size; 1234 return (bp); 1235} 1236