1/* $NetBSD: coalesce.c,v 1.18 2009/08/06 00:51:55 pooka Exp $ */ 2 3/*- 4 * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32#include <sys/param.h> 33#include <sys/mount.h> 34#include <sys/time.h> 35#include <sys/resource.h> 36#include <sys/types.h> 37#include <sys/wait.h> 38#include <sys/mman.h> 39 40#include <ufs/ufs/dinode.h> 41#include <ufs/lfs/lfs.h> 42 43#include <fcntl.h> 44#include <signal.h> 45#include <stdio.h> 46#include <stdlib.h> 47#include <string.h> 48#include <time.h> 49#include <unistd.h> 50#include <util.h> 51#include <errno.h> 52#include <err.h> 53 54#include <syslog.h> 55 56#include "bufcache.h" 57#include "vnode.h" 58#include "cleaner.h" 59#include "kernelops.h" 60 61extern int debug, do_mmap; 62 63int log2int(int n) 64{ 65 int log; 66 67 log = 0; 68 while (n > 0) { 69 ++log; 70 n >>= 1; 71 } 72 return log - 1; 73} 74 75enum coalesce_returncodes { 76 COALESCE_OK = 0, 77 COALESCE_NOINODE, 78 COALESCE_TOOSMALL, 79 COALESCE_BADSIZE, 80 COALESCE_BADBLOCKSIZE, 81 COALESCE_NOMEM, 82 COALESCE_BADBMAPV, 83 COALESCE_BADMARKV, 84 COALESCE_NOTWORTHIT, 85 COALESCE_NOTHINGLEFT, 86 COALESCE_EIO, 87 88 COALESCE_MAXERROR 89}; 90 91const char *coalesce_return[] = { 92 "Successfully coalesced", 93 "File not in use or inode not found", 94 "Not large enough to coalesce", 95 "Negative size", 96 "Not enough blocks to account for size", 97 "Malloc failed", 98 "LFCNBMAPV failed", 99 "Not broken enough to fix", 100 "Too many blocks not found", 101 "Too many blocks found in active segments", 102 "I/O error", 103 104 "No such error" 105}; 106 107static struct ufs1_dinode * 108get_dinode(struct clfs *fs, ino_t ino) 109{ 110 IFILE *ifp; 111 daddr_t daddr; 112 struct ubuf *bp; 113 struct ufs1_dinode *dip, *r; 114 115 lfs_ientry(&ifp, fs, ino, &bp); 116 daddr = ifp->if_daddr; 117 brelse(bp, 0); 118 119 if (daddr == 0x0) 120 return NULL; 121 122 bread(fs->clfs_devvp, daddr, fs->lfs_ibsize, NOCRED, 0, &bp); 123 for (dip = (struct ufs1_dinode *)bp->b_data; 124 dip < (struct ufs1_dinode *)(bp->b_data + fs->lfs_ibsize); dip++) 125 if (dip->di_inumber == ino) { 126 r = (struct ufs1_dinode *)malloc(sizeof(*r)); 127 if (r == NULL) 128 break; 129 memcpy(r, dip, sizeof(*r)); 130 brelse(bp, 0); 131 return r; 132 } 133 brelse(bp, 0); 134 return NULL; 135} 136 137/* 138 * Find out if this inode's data blocks are discontinuous; if they are, 139 * rewrite them using markv. Return the number of inodes rewritten. 140 */ 141static int 142clean_inode(struct clfs *fs, ino_t ino) 143{ 144 BLOCK_INFO *bip = NULL, *tbip; 145 CLEANERINFO cip; 146 struct ubuf *bp; 147 struct ufs1_dinode *dip; 148 struct clfs_seguse *sup; 149 struct lfs_fcntl_markv /* { 150 BLOCK_INFO *blkiov; 151 int blkcnt; 152 } */ lim; 153 daddr_t toff; 154 int i; 155 int nb, onb, noff; 156 int retval; 157 int bps; 158 159 dip = get_dinode(fs, ino); 160 if (dip == NULL) 161 return COALESCE_NOINODE; 162 163 /* Compute file block size, set up for bmapv */ 164 onb = nb = lblkno(fs, dip->di_size); 165 166 /* XXX for now, don't do any file small enough to have fragments */ 167 if (nb < NDADDR) { 168 free(dip); 169 return COALESCE_TOOSMALL; 170 } 171 172 /* Sanity checks */ 173#if 0 /* di_size is uint64_t -- this is a noop */ 174 if (dip->di_size < 0) { 175 dlog("ino %d, negative size (%" PRId64 ")", ino, dip->di_size); 176 free(dip); 177 return COALESCE_BADSIZE; 178 } 179#endif 180 if (nb > dip->di_blocks) { 181 dlog("ino %d, computed blocks %d > held blocks %d", ino, nb, 182 dip->di_blocks); 183 free(dip); 184 return COALESCE_BADBLOCKSIZE; 185 } 186 187 bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb); 188 if (bip == NULL) { 189 syslog(LOG_WARNING, "ino %llu, %d blocks: %m", 190 (unsigned long long)ino, nb); 191 free(dip); 192 return COALESCE_NOMEM; 193 } 194 for (i = 0; i < nb; i++) { 195 memset(bip + i, 0, sizeof(BLOCK_INFO)); 196 bip[i].bi_inode = ino; 197 bip[i].bi_lbn = i; 198 bip[i].bi_version = dip->di_gen; 199 /* Don't set the size, but let lfs_bmap fill it in */ 200 } 201 lim.blkiov = bip; 202 lim.blkcnt = nb; 203 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) { 204 syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: %m", 205 fs->lfs_fsmnt); 206 retval = COALESCE_BADBMAPV; 207 goto out; 208 } 209#if 0 210 for (i = 0; i < nb; i++) { 211 printf("bi_size = %d, bi_ino = %d, " 212 "bi_lbn = %d, bi_daddr = %d\n", 213 bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn, 214 bip[i].bi_daddr); 215 } 216#endif 217 noff = toff = 0; 218 for (i = 1; i < nb; i++) { 219 if (bip[i].bi_daddr != bip[i - 1].bi_daddr + fs->lfs_frag) 220 ++noff; 221 toff += abs(bip[i].bi_daddr - bip[i - 1].bi_daddr 222 - fs->lfs_frag) >> fs->lfs_fbshift; 223 } 224 225 /* 226 * If this file is not discontinuous, there's no point in rewriting it. 227 * 228 * Explicitly allow a certain amount of discontinuity, since large 229 * files will be broken among segments and medium-sized files 230 * can have a break or two and it's okay. 231 */ 232 if (nb <= 1 || noff == 0 || noff < log2int(nb) || 233 segtod(fs, noff) * 2 < nb) { 234 retval = COALESCE_NOTWORTHIT; 235 goto out; 236 } else if (debug) 237 syslog(LOG_DEBUG, "ino %llu total discontinuity " 238 "%d (%lld) for %d blocks", (unsigned long long)ino, 239 noff, (long long)toff, nb); 240 241 /* Search for blocks in active segments; don't move them. */ 242 for (i = 0; i < nb; i++) { 243 if (bip[i].bi_daddr <= 0) 244 continue; 245 sup = &fs->clfs_segtab[dtosn(fs, bip[i].bi_daddr)]; 246 if (sup->flags & SEGUSE_ACTIVE) 247 bip[i].bi_daddr = LFS_UNUSED_DADDR; /* 0 */ 248 } 249 250 /* 251 * Get rid of any blocks we've marked dead. If this is an older 252 * kernel that doesn't have bmapv fill in the block sizes, we'll 253 * toss everything here. 254 */ 255 onb = nb; 256 toss_old_blocks(fs, &bip, &nb, NULL); 257 nb = i; 258 259 /* 260 * We may have tossed enough blocks that it is no longer worthwhile 261 * to rewrite this inode. 262 */ 263 if (nb == 0 || onb - nb > log2int(onb)) { 264 if (debug) 265 syslog(LOG_DEBUG, "too many blocks tossed, not rewriting"); 266 retval = COALESCE_NOTHINGLEFT; 267 goto out; 268 } 269 270 /* 271 * We are going to rewrite this inode. 272 * For any remaining blocks, read in their contents. 273 */ 274 for (i = 0; i < nb; i++) { 275 bip[i].bi_bp = malloc(bip[i].bi_size); 276 if (bip[i].bi_bp == NULL) { 277 syslog(LOG_WARNING, "allocate block buffer size=%d: %m", 278 bip[i].bi_size); 279 retval = COALESCE_NOMEM; 280 goto out; 281 } 282 283 if (kops.ko_pread(fs->clfs_devfd, bip[i].bi_bp, bip[i].bi_size, 284 fsbtob(fs, bip[i].bi_daddr)) < 0) { 285 retval = COALESCE_EIO; 286 goto out; 287 } 288 } 289 if (debug) 290 syslog(LOG_DEBUG, "ino %llu markv %d blocks", 291 (unsigned long long)ino, nb); 292 293 /* 294 * Write in segment-sized chunks. If at any point we'd write more 295 * than half of the available segments, sleep until that's not 296 * true any more. 297 */ 298 bps = segtod(fs, 1); 299 for (tbip = bip; tbip < bip + nb; tbip += bps) { 300 do { 301 bread(fs->lfs_ivnode, 0, fs->lfs_bsize, NOCRED, 0, &bp); 302 cip = *(CLEANERINFO *)bp->b_data; 303 brelse(bp, B_INVAL); 304 305 if (cip.clean < 4) /* XXX magic number 4 */ 306 kops.ko_fcntl(fs->clfs_ifilefd, 307 LFCNSEGWAIT, NULL); 308 } while(cip.clean < 4); 309 310 lim.blkiov = tbip; 311 lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps); 312 if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) { 313 retval = COALESCE_BADMARKV; 314 goto out; 315 } 316 } 317 318 retval = COALESCE_OK; 319out: 320 free(dip); 321 if (bip) { 322 for (i = 0; i < onb; i++) 323 if (bip[i].bi_bp) 324 free(bip[i].bi_bp); 325 free(bip); 326 } 327 return retval; 328} 329 330/* 331 * Try coalescing every inode in the filesystem. 332 * Return the number of inodes actually altered. 333 */ 334int clean_all_inodes(struct clfs *fs) 335{ 336 int i, r, maxino; 337 int totals[COALESCE_MAXERROR]; 338 struct stat st; 339 340 memset(totals, 0, sizeof(totals)); 341 342 fstat(fs->clfs_ifilefd, &st); 343 maxino = fs->lfs_ifpb * (st.st_size >> fs->lfs_bshift) - 344 fs->lfs_segtabsz - fs->lfs_cleansz; 345 346 for (i = 0; i < maxino; i++) { 347 r = clean_inode(fs, i); 348 ++totals[r]; 349 } 350 351 for (i = 0; i < COALESCE_MAXERROR; i++) 352 if (totals[i]) 353 syslog(LOG_DEBUG, "%s: %d", coalesce_return[i], 354 totals[i]); 355 356 return totals[COALESCE_OK]; 357} 358 359/* 360 * Fork a child process to coalesce this fs. 361 */ 362int 363fork_coalesce(struct clfs *fs) 364{ 365 static pid_t childpid; 366 int num; 367 368 /* 369 * If already running a coalescing child, don't start a new one. 370 */ 371 if (childpid) { 372 if (waitpid(childpid, NULL, WNOHANG) == childpid) 373 childpid = 0; 374 } 375 if (childpid && kill(childpid, 0) >= 0) { 376 /* already running a coalesce process */ 377 if (debug) 378 syslog(LOG_DEBUG, "coalescing already in progress"); 379 return 0; 380 } 381 382 /* 383 * Fork a child and let the child coalease 384 */ 385 childpid = fork(); 386 if (childpid < 0) { 387 syslog(LOG_ERR, "%s: fork to coaleasce: %m", fs->lfs_fsmnt); 388 return 0; 389 } else if (childpid == 0) { 390 syslog(LOG_NOTICE, "%s: new coalescing process, pid %d", 391 fs->lfs_fsmnt, getpid()); 392 num = clean_all_inodes(fs); 393 syslog(LOG_NOTICE, "%s: coalesced %d discontiguous inodes", 394 fs->lfs_fsmnt, num); 395 exit(0); 396 } 397 398 return 0; 399} 400