1/* $NetBSD: rcache.c,v 1.22 2008/04/28 20:23:08 martin Exp $ */ 2 3/*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33#include <sys/cdefs.h> 34#ifndef lint 35__RCSID("$NetBSD: rcache.c,v 1.22 2008/04/28 20:23:08 martin Exp $"); 36#endif /* not lint */ 37 38#include <sys/types.h> 39#include <sys/uio.h> 40#include <sys/mman.h> 41#include <sys/param.h> 42#include <sys/sysctl.h> 43#include <ufs/ufs/dinode.h> 44 45#include <stdio.h> 46#include <stdlib.h> 47#include <unistd.h> 48#include <fcntl.h> 49#include <errno.h> 50#include <string.h> 51 52#include "dump.h" 53 54/*-----------------------------------------------------------------------*/ 55#define MAXCACHEBUFS 512 /* max 512 buffers */ 56#define MAXMEMPART 6 /* max 15% of the user mem */ 57 58/*-----------------------------------------------------------------------*/ 59union cdesc { 60 volatile size_t cd_count; 61 struct { 62 volatile daddr_t blkstart; 63 volatile daddr_t blkend; /* start + nblksread */ 64 volatile daddr_t blocksRead; 65 volatile size_t time; 66#ifdef DIAGNOSTICS 67 volatile pid_t owner; 68#endif 69 } desc; 70#define cd_blkstart desc.blkstart 71#define cd_blkend desc.blkend 72#define cd_blocksRead desc.blocksRead 73#define cd_time desc.time 74#define cd_owner desc.owner 75}; 76 77static int findlru(void); 78 79static void *shareBuffer = NULL; 80static union cdesc *cheader; 81static union cdesc *cdesc; 82static char *cdata; 83static int cachebufs; 84static int nblksread; 85 86#ifdef STATS 87static int nreads; 88static int nphysread; 89static int64_t readsize; 90static int64_t physreadsize; 91#endif 92 93#define CSIZE (nblksread << dev_bshift) /* cache buf size */ 94#define CDATA(desc) (cdata + ((desc) - cdesc) * CSIZE) 95 96void 97initcache(int cachesize, int readblksize) 98{ 99 size_t len; 100 size_t sharedSize; 101 102 /* Convert read block size in terms of filesystem block size */ 103 nblksread = howmany(readblksize, ufsib->ufs_bsize); 104 105 /* Then, convert it in terms of device block size */ 106 nblksread <<= ufsib->ufs_bshift - dev_bshift; 107 108 if (cachesize == -1) { /* Compute from memory available */ 109 uint64_t usermem, cachetmp; 110 int mib[2] = { CTL_HW, HW_USERMEM64 }; 111 112 len = sizeof(usermem); 113 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 114 msg("sysctl(hw.usermem) failed: %s\n", 115 strerror(errno)); 116 return; 117 } 118 cachetmp = (usermem / MAXMEMPART) / CSIZE; 119 /* for those with TB of RAM */ 120 cachebufs = (cachetmp > INT_MAX) ? INT_MAX : cachetmp; 121 } else { /* User specified */ 122 cachebufs = cachesize; 123 } 124 125 if (cachebufs) { /* Don't allocate if zero --> no caching */ 126 if (cachebufs > MAXCACHEBUFS) 127 cachebufs = MAXCACHEBUFS; 128 129 sharedSize = sizeof(union cdesc) + 130 sizeof(union cdesc) * cachebufs + 131 cachebufs * CSIZE; 132#ifdef STATS 133 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 134 sharedSize); 135#endif 136 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 137 MAP_ANON | MAP_SHARED, -1, 0); 138 if (shareBuffer == MAP_FAILED) { 139 msg("can't mmap shared memory for buffer: %s\n", 140 strerror(errno)); 141 return; 142 } 143 cheader = shareBuffer; 144 cdesc = (union cdesc *) (((char *) shareBuffer) + 145 sizeof(union cdesc)); 146 cdata = ((char *) shareBuffer) + sizeof(union cdesc) + 147 sizeof(union cdesc) * cachebufs; 148 149 memset(shareBuffer, '\0', sharedSize); 150 } 151} 152 153/* 154 * Find the cache buffer descriptor that shows the minimal access time 155 */ 156static int 157findlru(void) 158{ 159 int i; 160 size_t minTime = cdesc[0].cd_time; 161 int minIdx = 0; 162 163 for (i = 0; i < cachebufs; i++) { 164 if (cdesc[i].cd_time < minTime) { 165 minIdx = i; 166 minTime = cdesc[i].cd_time; 167 } 168 } 169 170 return minIdx; 171} 172 173/* 174 * Read data directly from disk, with smart error handling. 175 * Try to recover from hard errors by reading in sector sized pieces. 176 * Error recovery is attempted at most BREADEMAX times before seeking 177 * consent from the operator to continue. 178 */ 179 180static int breaderrors = 0; 181#define BREADEMAX 32 182 183void 184rawread(daddr_t blkno, char *buf, int size) 185{ 186 int cnt, i; 187 188#ifdef STATS 189 nphysread++; 190 physreadsize += size; 191#endif 192 193loop: 194 if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) { 195 msg("rawread: lseek fails\n"); 196 goto err; 197 } 198 if ((cnt = read(diskfd, buf, size)) == size) 199 return; 200 if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) { 201 /* 202 * Trying to read the final fragment. 203 * 204 * NB - dump only works in TP_BSIZE blocks, hence 205 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 206 * It should be smarter about not actually trying to 207 * read more than it can get, but for the time being 208 * we punt and scale back the read only when it gets 209 * us into trouble. (mkm 9/25/83) 210 */ 211 size -= dev_bsize; 212 goto loop; 213 } 214 if (cnt == -1) 215 msg("read error from %s: %s: [block %lld]: count=%d\n", 216 disk, strerror(errno), (long long)blkno, size); 217 else 218 msg("short read error from %s: [block %lld]: " 219 "count=%d, got=%d\n", 220 disk, (long long)blkno, size, cnt); 221err: 222 if (++breaderrors > BREADEMAX) { 223 msg("More than %d block read errors from %s\n", 224 BREADEMAX, disk); 225 broadcast("DUMP IS AILING!\n"); 226 msg("This is an unrecoverable error.\n"); 227 if (!query("Do you want to attempt to continue?")) { 228 dumpabort(0); 229 /*NOTREACHED*/ 230 } else 231 breaderrors = 0; 232 } 233 /* 234 * Zero buffer, then try to read each sector of buffer separately. 235 */ 236 memset(buf, 0, size); 237 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 238 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 239 SEEK_SET) == -1) { 240 msg("rawread: lseek2 fails: %s!\n", 241 strerror(errno)); 242 continue; 243 } 244 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 245 continue; 246 if (cnt == -1) { 247 msg("read error from %s: %s: [sector %lld]: " 248 "count=%ld\n", disk, strerror(errno), 249 (long long)blkno, dev_bsize); 250 continue; 251 } 252 msg("short read error from %s: [sector %lld]: " 253 "count=%ld, got=%d\n", 254 disk, (long long)blkno, dev_bsize, cnt); 255 } 256} 257 258void 259bread(daddr_t blkno, char *buf, int size) 260{ 261 int osize = size, idx; 262 daddr_t oblkno = blkno; 263 char *obuf = buf; 264 daddr_t numBlocks = howmany(size, dev_bsize); 265 266#ifdef STATS 267 nreads++; 268 readsize += size; 269#endif 270 271 if (!shareBuffer) { 272 rawread(blkno, buf, size); 273 return; 274 } 275 276 if (flock(diskfd, LOCK_EX)) { 277 msg("flock(LOCK_EX) failed: %s\n", 278 strerror(errno)); 279 rawread(blkno, buf, size); 280 return; 281 } 282 283retry: 284 idx = 0; 285 while (size > 0) { 286 int i; 287 288 for (i = 0; i < cachebufs; i++) { 289 union cdesc *curr = &cdesc[(i + idx) % cachebufs]; 290 291#ifdef DIAGNOSTICS 292 if (curr->cd_owner) { 293 fprintf(stderr, "Owner is set (%d, me=%d), can" 294 "not happen.\n", curr->cd_owner, getpid()); 295 } 296#endif 297 298 if (curr->cd_blkend == 0) 299 continue; 300 /* 301 * If we find a bit of the read in the buffers, 302 * now compute how many blocks we can copy, 303 * copy them out, adjust blkno, buf and size, 304 * and restart 305 */ 306 if (curr->cd_blkstart <= blkno && 307 blkno < curr->cd_blkend) { 308 /* Number of data blocks to be copied */ 309 int toCopy = MIN(size, 310 (curr->cd_blkend - blkno) << dev_bshift); 311#ifdef DIAGNOSTICS 312 if (toCopy <= 0 || toCopy > CSIZE) { 313 fprintf(stderr, "toCopy %d !\n", 314 toCopy); 315 dumpabort(0); 316 } 317 if (CDATA(curr) + 318 ((blkno - curr->cd_blkstart) << 319 dev_bshift) < CDATA(curr) || 320 CDATA(curr) + 321 ((blkno - curr->cd_blkstart) << 322 dev_bshift) > CDATA(curr) + CSIZE) { 323 fprintf(stderr, "%p < %p !!!\n", 324 CDATA(curr) + ((blkno - 325 curr->cd_blkstart) << dev_bshift), 326 CDATA(curr)); 327 fprintf(stderr, 328 "cdesc[i].cd_blkstart %lld " 329 "blkno %lld dev_bsize %ld\n", 330 (long long)curr->cd_blkstart, 331 (long long)blkno, 332 dev_bsize); 333 dumpabort(0); 334 } 335#endif 336 memcpy(buf, CDATA(curr) + 337 ((blkno - curr->cd_blkstart) << 338 dev_bshift), 339 toCopy); 340 341 buf += toCopy; 342 size -= toCopy; 343 blkno += howmany(toCopy, dev_bsize); 344 numBlocks -= howmany(toCopy, dev_bsize); 345 346 curr->cd_time = cheader->cd_count++; 347 348 /* 349 * If all data of a cache block have been 350 * read, chances are good no more reads 351 * will occur, so expire the cache immediately 352 */ 353 354 curr->cd_blocksRead += 355 howmany(toCopy, dev_bsize); 356 if (curr->cd_blocksRead >= nblksread) 357 curr->cd_time = 0; 358 359 goto retry; 360 } 361 } 362 363 /* No more to do? */ 364 if (size == 0) 365 break; 366 367 /* 368 * This does actually not happen if fs blocks are not greater 369 * than nblksread. 370 */ 371 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 372 rawread(oblkno, obuf, osize); 373 break; 374 } else { 375 ssize_t rsize; 376 daddr_t blockBlkNo; 377 378 blockBlkNo = (blkno / nblksread) * nblksread; 379 idx = findlru(); 380 rsize = MIN(nblksread, 381 ufsib->ufs_dsize - blockBlkNo) << dev_bshift; 382 383#ifdef DIAGNOSTICS 384 if (cdesc[idx].cd_owner) 385 fprintf(stderr, "Owner is set (%d, me=%d), can" 386 "not happen(2).\n", cdesc[idx].cd_owner, 387 getpid()); 388 cdesc[idx].cd_owner = getpid(); 389#endif 390 cdesc[idx].cd_time = cheader->cd_count++; 391 cdesc[idx].cd_blkstart = blockBlkNo; 392 cdesc[idx].cd_blkend = 0; 393 cdesc[idx].cd_blocksRead = 0; 394 395 if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift), 396 SEEK_SET) == -1) { 397 msg("readBlocks: lseek fails: %s\n", 398 strerror(errno)); 399 rsize = -1; 400 } else { 401 rsize = read(diskfd, 402 CDATA(&cdesc[idx]), rsize); 403 if (rsize < 0) { 404 msg("readBlocks: read fails: %s\n", 405 strerror(errno)); 406 } 407 } 408 409 /* On errors, panic, punt, try to read without 410 * cache and let raw read routine do the rest. 411 */ 412 413 if (rsize <= 0) { 414 rawread(oblkno, obuf, osize); 415#ifdef DIAGNOSTICS 416 if (cdesc[idx].cd_owner != getpid()) 417 fprintf(stderr, "Owner changed from " 418 "%d to %d, can't happen\n", 419 getpid(), cdesc[idx].cd_owner); 420 cdesc[idx].cd_owner = 0; 421#endif 422 break; 423 } 424 425 /* On short read, just note the fact and go on */ 426 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize; 427 428#ifdef STATS 429 nphysread++; 430 physreadsize += rsize; 431#endif 432#ifdef DIAGNOSTICS 433 if (cdesc[idx].cd_owner != getpid()) 434 fprintf(stderr, "Owner changed from " 435 "%d to %d, can't happen\n", 436 getpid(), cdesc[idx].cd_owner); 437 cdesc[idx].cd_owner = 0; 438#endif 439 /* 440 * We swapped some of data in, let the loop fetch 441 * them from cache 442 */ 443 } 444 } 445 446 if (flock(diskfd, LOCK_UN)) 447 msg("flock(LOCK_UN) failed: %s\n", 448 strerror(errno)); 449} 450 451void 452printcachestats(void) 453{ 454 455#ifdef STATS 456 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 457 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 458 getpid(), nreads, (u_int) readsize, nphysread, 459 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 460 (int) (((physreadsize - readsize) * 100) / readsize)); 461#endif 462} 463