1/* $NetBSD: rcache.c,v 1.24 2013/06/15 01:27:19 christos Exp $ */ 2 3/*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33#include <sys/cdefs.h> 34#ifndef lint 35__RCSID("$NetBSD: rcache.c,v 1.24 2013/06/15 01:27:19 christos Exp $"); 36#endif /* not lint */ 37 38#include <sys/types.h> 39#include <sys/uio.h> 40#include <sys/mman.h> 41#include <sys/param.h> 42#include <sys/sysctl.h> 43 44#include <stdio.h> 45#include <stdlib.h> 46#include <unistd.h> 47#include <fcntl.h> 48#include <errno.h> 49#include <string.h> 50 51#include "dump.h" 52 53/*-----------------------------------------------------------------------*/ 54#define MAXCACHEBUFS 512 /* max 512 buffers */ 55#define MAXMEMPART 6 /* max 15% of the user mem */ 56 57/*-----------------------------------------------------------------------*/ 58union cdesc { 59 volatile size_t cd_count; 60 struct { 61 volatile daddr_t blkstart; 62 volatile daddr_t blkend; /* start + nblksread */ 63 volatile daddr_t blocksRead; 64 volatile size_t time; 65#ifdef DIAGNOSTICS 66 volatile pid_t owner; 67#endif 68 } desc; 69#define cd_blkstart desc.blkstart 70#define cd_blkend desc.blkend 71#define cd_blocksRead desc.blocksRead 72#define cd_time desc.time 73#define cd_owner desc.owner 74}; 75 76static int findlru(void); 77 78static void *shareBuffer = NULL; 79static union cdesc *cheader; 80static union cdesc *cdesc; 81static char *cdata; 82static int cachebufs; 83static int nblksread; 84 85#ifdef STATS 86static int nreads; 87static int nphysread; 88static int64_t readsize; 89static int64_t physreadsize; 90#endif 91 92#define CSIZE (nblksread << dev_bshift) /* cache buf size */ 93#define CDATA(desc) (cdata + ((desc) - cdesc) * CSIZE) 94 95void 96initcache(int cachesize, int readblksize) 97{ 98 size_t len; 99 size_t sharedSize; 100 101 if (readblksize == -1) { /* use kern.maxphys */ 102 int kern_maxphys; 103 int mib[2] = { CTL_KERN, KERN_MAXPHYS }; 104 105 len = sizeof(kern_maxphys); 106 if (sysctl(mib, 2, &kern_maxphys, &len, NULL, 0) < 0) { 107 msg("sysctl(kern.maxphys) failed: %s\n", 108 strerror(errno)); 109 return; 110 } 111 readblksize = kern_maxphys; 112 } 113 114 /* Convert read block size in terms of filesystem block size */ 115 nblksread = howmany(readblksize, ufsib->ufs_bsize); 116 117 /* Then, convert it in terms of device block size */ 118 nblksread <<= ufsib->ufs_bshift - dev_bshift; 119 120 if (cachesize == -1) { /* Compute from memory available */ 121 uint64_t usermem, cachetmp; 122 int mib[2] = { CTL_HW, HW_USERMEM64 }; 123 124 len = sizeof(usermem); 125 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 126 msg("sysctl(hw.usermem) failed: %s\n", 127 strerror(errno)); 128 return; 129 } 130 cachetmp = (usermem / MAXMEMPART) / CSIZE; 131 /* for those with TB of RAM */ 132 cachebufs = (cachetmp > INT_MAX) ? INT_MAX : cachetmp; 133 } else { /* User specified */ 134 cachebufs = cachesize; 135 } 136 137 if (cachebufs) { /* Don't allocate if zero --> no caching */ 138 if (cachebufs > MAXCACHEBUFS) 139 cachebufs = MAXCACHEBUFS; 140 141 sharedSize = sizeof(union cdesc) + 142 sizeof(union cdesc) * cachebufs + 143 cachebufs * CSIZE; 144#ifdef STATS 145 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 146 sharedSize); 147#endif 148 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 149 MAP_ANON | MAP_SHARED, -1, 0); 150 if (shareBuffer == MAP_FAILED) { 151 msg("can't mmap shared memory for buffer: %s\n", 152 strerror(errno)); 153 return; 154 } 155 cheader = shareBuffer; 156 cdesc = (union cdesc *) (((char *) shareBuffer) + 157 sizeof(union cdesc)); 158 cdata = ((char *) shareBuffer) + sizeof(union cdesc) + 159 sizeof(union cdesc) * cachebufs; 160 161 memset(shareBuffer, '\0', sharedSize); 162 } 163} 164 165/* 166 * Find the cache buffer descriptor that shows the minimal access time 167 */ 168static int 169findlru(void) 170{ 171 int i; 172 size_t minTime = cdesc[0].cd_time; 173 int minIdx = 0; 174 175 for (i = 0; i < cachebufs; i++) { 176 if (cdesc[i].cd_time < minTime) { 177 minIdx = i; 178 minTime = cdesc[i].cd_time; 179 } 180 } 181 182 return minIdx; 183} 184 185/* 186 * Read data directly from disk, with smart error handling. 187 * Try to recover from hard errors by reading in sector sized pieces. 188 * Error recovery is attempted at most BREADEMAX times before seeking 189 * consent from the operator to continue. 190 */ 191 192static int breaderrors = 0; 193#define BREADEMAX 32 194 195void 196rawread(daddr_t blkno, char *buf, int size) 197{ 198 int cnt, i; 199 200#ifdef STATS 201 nphysread++; 202 physreadsize += size; 203#endif 204 205loop: 206 if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) == -1) { 207 msg("rawread: lseek fails\n"); 208 goto err; 209 } 210 if ((cnt = read(diskfd, buf, size)) == size) 211 return; 212 if (blkno + (size >> dev_bshift) > ufsib->ufs_dsize) { 213 /* 214 * Trying to read the final fragment. 215 * 216 * NB - dump only works in TP_BSIZE blocks, hence 217 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 218 * It should be smarter about not actually trying to 219 * read more than it can get, but for the time being 220 * we punt and scale back the read only when it gets 221 * us into trouble. (mkm 9/25/83) 222 */ 223 size -= dev_bsize; 224 goto loop; 225 } 226 if (cnt == -1) 227 msg("read error from %s: %s: [block %lld]: count=%d\n", 228 disk, strerror(errno), (long long)blkno, size); 229 else 230 msg("short read error from %s: [block %lld]: " 231 "count=%d, got=%d\n", 232 disk, (long long)blkno, size, cnt); 233err: 234 if (++breaderrors > BREADEMAX) { 235 msg("More than %d block read errors from %s\n", 236 BREADEMAX, disk); 237 broadcast("DUMP IS AILING!\n"); 238 msg("This is an unrecoverable error.\n"); 239 if (!query("Do you want to attempt to continue?")) { 240 dumpabort(0); 241 /*NOTREACHED*/ 242 } else 243 breaderrors = 0; 244 } 245 /* 246 * Zero buffer, then try to read each sector of buffer separately. 247 */ 248 memset(buf, 0, size); 249 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 250 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 251 SEEK_SET) == -1) { 252 msg("rawread: lseek2 fails: %s!\n", 253 strerror(errno)); 254 continue; 255 } 256 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 257 continue; 258 if (cnt == -1) { 259 msg("read error from %s: %s: [sector %lld]: " 260 "count=%ld\n", disk, strerror(errno), 261 (long long)blkno, dev_bsize); 262 continue; 263 } 264 msg("short read error from %s: [sector %lld]: " 265 "count=%ld, got=%d\n", 266 disk, (long long)blkno, dev_bsize, cnt); 267 } 268} 269 270void 271bread(daddr_t blkno, char *buf, int size) 272{ 273 int osize = size, idx; 274 daddr_t oblkno = blkno; 275 char *obuf = buf; 276 daddr_t numBlocks = howmany(size, dev_bsize); 277 278#ifdef STATS 279 nreads++; 280 readsize += size; 281#endif 282 283 if (!shareBuffer) { 284 rawread(blkno, buf, size); 285 return; 286 } 287 288 if (flock(diskfd, LOCK_EX)) { 289 msg("flock(LOCK_EX) failed: %s\n", 290 strerror(errno)); 291 rawread(blkno, buf, size); 292 return; 293 } 294 295retry: 296 idx = 0; 297 while (size > 0) { 298 int i; 299 300 for (i = 0; i < cachebufs; i++) { 301 union cdesc *curr = &cdesc[(i + idx) % cachebufs]; 302 303#ifdef DIAGNOSTICS 304 if (curr->cd_owner) { 305 fprintf(stderr, "Owner is set (%d, me=%d), can" 306 "not happen.\n", curr->cd_owner, getpid()); 307 } 308#endif 309 310 if (curr->cd_blkend == 0) 311 continue; 312 /* 313 * If we find a bit of the read in the buffers, 314 * now compute how many blocks we can copy, 315 * copy them out, adjust blkno, buf and size, 316 * and restart 317 */ 318 if (curr->cd_blkstart <= blkno && 319 blkno < curr->cd_blkend) { 320 /* Number of data blocks to be copied */ 321 int toCopy = MIN(size, 322 (curr->cd_blkend - blkno) << dev_bshift); 323#ifdef DIAGNOSTICS 324 if (toCopy <= 0 || toCopy > CSIZE) { 325 fprintf(stderr, "toCopy %d !\n", 326 toCopy); 327 dumpabort(0); 328 } 329 if (CDATA(curr) + 330 ((blkno - curr->cd_blkstart) << 331 dev_bshift) < CDATA(curr) || 332 CDATA(curr) + 333 ((blkno - curr->cd_blkstart) << 334 dev_bshift) > CDATA(curr) + CSIZE) { 335 fprintf(stderr, "%p < %p !!!\n", 336 CDATA(curr) + ((blkno - 337 curr->cd_blkstart) << dev_bshift), 338 CDATA(curr)); 339 fprintf(stderr, 340 "cdesc[i].cd_blkstart %lld " 341 "blkno %lld dev_bsize %ld\n", 342 (long long)curr->cd_blkstart, 343 (long long)blkno, 344 dev_bsize); 345 dumpabort(0); 346 } 347#endif 348 memcpy(buf, CDATA(curr) + 349 ((blkno - curr->cd_blkstart) << 350 dev_bshift), 351 toCopy); 352 353 buf += toCopy; 354 size -= toCopy; 355 blkno += howmany(toCopy, dev_bsize); 356 numBlocks -= howmany(toCopy, dev_bsize); 357 358 curr->cd_time = cheader->cd_count++; 359 360 /* 361 * If all data of a cache block have been 362 * read, chances are good no more reads 363 * will occur, so expire the cache immediately 364 */ 365 366 curr->cd_blocksRead += 367 howmany(toCopy, dev_bsize); 368 if (curr->cd_blocksRead >= nblksread) 369 curr->cd_time = 0; 370 371 goto retry; 372 } 373 } 374 375 /* No more to do? */ 376 if (size == 0) 377 break; 378 379 /* 380 * This does actually not happen if fs blocks are not greater 381 * than nblksread. 382 */ 383 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 384 rawread(oblkno, obuf, osize); 385 break; 386 } else { 387 ssize_t rsize; 388 daddr_t blockBlkNo; 389 390 blockBlkNo = (blkno / nblksread) * nblksread; 391 idx = findlru(); 392 rsize = MIN(nblksread, 393 ufsib->ufs_dsize - blockBlkNo) << dev_bshift; 394 395#ifdef DIAGNOSTICS 396 if (cdesc[idx].cd_owner) 397 fprintf(stderr, "Owner is set (%d, me=%d), can" 398 "not happen(2).\n", cdesc[idx].cd_owner, 399 getpid()); 400 cdesc[idx].cd_owner = getpid(); 401#endif 402 cdesc[idx].cd_time = cheader->cd_count++; 403 cdesc[idx].cd_blkstart = blockBlkNo; 404 cdesc[idx].cd_blkend = 0; 405 cdesc[idx].cd_blocksRead = 0; 406 407 if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift), 408 SEEK_SET) == -1) { 409 msg("readBlocks: lseek fails: %s\n", 410 strerror(errno)); 411 rsize = -1; 412 } else { 413 rsize = read(diskfd, 414 CDATA(&cdesc[idx]), rsize); 415 if (rsize < 0) { 416 msg("readBlocks: read fails: %s\n", 417 strerror(errno)); 418 } 419 } 420 421 /* On errors, panic, punt, try to read without 422 * cache and let raw read routine do the rest. 423 */ 424 425 if (rsize <= 0) { 426 rawread(oblkno, obuf, osize); 427#ifdef DIAGNOSTICS 428 if (cdesc[idx].cd_owner != getpid()) 429 fprintf(stderr, "Owner changed from " 430 "%d to %d, can't happen\n", 431 getpid(), cdesc[idx].cd_owner); 432 cdesc[idx].cd_owner = 0; 433#endif 434 break; 435 } 436 437 /* On short read, just note the fact and go on */ 438 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize; 439 440#ifdef STATS 441 nphysread++; 442 physreadsize += rsize; 443#endif 444#ifdef DIAGNOSTICS 445 if (cdesc[idx].cd_owner != getpid()) 446 fprintf(stderr, "Owner changed from " 447 "%d to %d, can't happen\n", 448 getpid(), cdesc[idx].cd_owner); 449 cdesc[idx].cd_owner = 0; 450#endif 451 /* 452 * We swapped some of data in, let the loop fetch 453 * them from cache 454 */ 455 } 456 } 457 458 if (flock(diskfd, LOCK_UN)) 459 msg("flock(LOCK_UN) failed: %s\n", 460 strerror(errno)); 461} 462 463void 464printcachestats(void) 465{ 466 467#ifdef STATS 468 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 469 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 470 getpid(), nreads, (u_int) readsize, nphysread, 471 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 472 (int) (((physreadsize - readsize) * 100) / readsize)); 473#endif 474} 475