1/*- 2 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org> 3 * Copyright 2015 Toomas Soome <tsoome@me.com> 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28#include <sys/cdefs.h> 29#include <sys/param.h> 30__FBSDID("$FreeBSD$"); 31 32/* 33 * Simple hashed block cache 34 */ 35 36#include <sys/stdint.h> 37 38#include <stand.h> 39#include <string.h> 40#include <strings.h> 41 42#include "bootstrap.h" 43 44/* #define BCACHE_DEBUG */ 45 46#ifdef BCACHE_DEBUG 47# define DPRINTF(fmt, args...) printf("%s: " fmt "\n" , __func__ , ## args) 48#else 49# define DPRINTF(fmt, args...) ((void)0) 50#endif 51 52struct bcachectl 53{ 54 daddr_t bc_blkno; 55 int bc_count; 56}; 57 58/* 59 * bcache per device node. cache is allocated on device first open and freed 60 * on last close, to save memory. The issue there is the size; biosdisk 61 * supports up to 31 (0x1f) devices. Classic setup would use single disk 62 * to boot from, but this has changed with zfs. 63 */ 64struct bcache { 65 struct bcachectl *bcache_ctl; 66 caddr_t bcache_data; 67 size_t bcache_nblks; 68 size_t ra; 69}; 70 71static u_int bcache_total_nblks; /* set by bcache_init */ 72static u_int bcache_blksize; /* set by bcache_init */ 73static u_int bcache_numdev; /* set by bcache_add_dev */ 74/* statistics */ 75static u_int bcache_units; /* number of devices with cache */ 76static u_int bcache_unit_nblks; /* nblocks per unit */ 77static u_int bcache_hits; 78static u_int bcache_misses; 79static u_int bcache_ops; 80static u_int bcache_bypasses; 81static u_int bcache_bcount; 82static u_int bcache_rablks; 83 84#define BHASH(bc, blkno) ((blkno) & ((bc)->bcache_nblks - 1)) 85#define BCACHE_LOOKUP(bc, blkno) \ 86 ((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno)) 87#define BCACHE_READAHEAD 256 88#define BCACHE_MINREADAHEAD 32 89#define BCACHE_MARKER 0xdeadbeef 90 91static void bcache_invalidate(struct bcache *bc, daddr_t blkno); 92static void bcache_insert(struct bcache *bc, daddr_t blkno); 93static void bcache_free_instance(struct bcache *bc); 94 95/* 96 * Initialise the cache for (nblks) of (bsize). 97 */ 98void 99bcache_init(size_t nblks, size_t bsize) 100{ 101 /* set up control data */ 102 bcache_total_nblks = nblks; 103 bcache_blksize = bsize; 104} 105 106/* 107 * add number of devices to bcache. we have to divide cache space 108 * between the devices, so bcache_add_dev() can be used to set up the 109 * number. The issue is, we need to get the number before actual allocations. 110 * bcache_add_dev() is supposed to be called from device init() call, so the 111 * assumption is, devsw dv_init is called for plain devices first, and 112 * for zfs, last. 113 */ 114void 115bcache_add_dev(int devices) 116{ 117 bcache_numdev += devices; 118} 119 120void * 121bcache_allocate(void) 122{ 123 u_int i; 124 struct bcache *bc = malloc(sizeof (struct bcache)); 125 int disks = bcache_numdev; 126 uint32_t *marker; 127 128 if (disks == 0) 129 disks = 1; /* safe guard */ 130 131 if (bc == NULL) { 132 errno = ENOMEM; 133 return (bc); 134 } 135 136 /* 137 * the bcache block count must be power of 2 for hash function 138 */ 139 i = fls(disks) - 1; /* highbit - 1 */ 140 if (disks > (1 << i)) /* next power of 2 */ 141 i++; 142 143 bc->bcache_nblks = bcache_total_nblks >> i; 144 bcache_unit_nblks = bc->bcache_nblks; 145 bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + 146 sizeof(uint32_t)); 147 if (bc->bcache_data == NULL) { 148 /* dont error out yet. fall back to 32 blocks and try again */ 149 bc->bcache_nblks = 32; 150 bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + 151 sizeof(uint32_t)); 152 } 153 154 bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl)); 155 156 if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) { 157 bcache_free_instance(bc); 158 errno = ENOMEM; 159 return (NULL); 160 } 161 /* Insert cache end marker. */ 162 marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize); 163 *marker = BCACHE_MARKER; 164 165 /* Flush the cache */ 166 for (i = 0; i < bc->bcache_nblks; i++) { 167 bc->bcache_ctl[i].bc_count = -1; 168 bc->bcache_ctl[i].bc_blkno = -1; 169 } 170 bcache_units++; 171 bc->ra = BCACHE_READAHEAD; /* optimistic read ahead */ 172 return (bc); 173} 174 175void 176bcache_free(void *cache) 177{ 178 struct bcache *bc = cache; 179 180 if (bc == NULL) 181 return; 182 183 bcache_free_instance(bc); 184 bcache_units--; 185} 186 187/* 188 * Handle a write request; write directly to the disk, and populate the 189 * cache with the new values. 190 */ 191static int 192write_strategy(void *devdata, int rw, daddr_t blk, size_t size, 193 char *buf, size_t *rsize) 194{ 195 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 196 struct bcache *bc = dd->dv_cache; 197 daddr_t i, nblk; 198 199 nblk = size / bcache_blksize; 200 201 /* Invalidate the blocks being written */ 202 for (i = 0; i < nblk; i++) { 203 bcache_invalidate(bc, blk + i); 204 } 205 206 /* Write the blocks */ 207 return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); 208} 209 210/* 211 * Handle a read request; fill in parts of the request that can 212 * be satisfied by the cache, use the supplied strategy routine to do 213 * device I/O and then use the I/O results to populate the cache. 214 */ 215static int 216read_strategy(void *devdata, int rw, daddr_t blk, size_t size, 217 char *buf, size_t *rsize) 218{ 219 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 220 struct bcache *bc = dd->dv_cache; 221 size_t i, nblk, p_size, r_size, complete, ra; 222 int result; 223 daddr_t p_blk; 224 caddr_t p_buf; 225 uint32_t *marker; 226 227 if (bc == NULL) { 228 errno = ENODEV; 229 return (-1); 230 } 231 232 marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize); 233 234 if (rsize != NULL) 235 *rsize = 0; 236 237 nblk = size / bcache_blksize; 238 if (nblk == 0 && size != 0) 239 nblk++; 240 result = 0; 241 complete = 1; 242 243 /* Satisfy any cache hits up front, break on first miss */ 244 for (i = 0; i < nblk; i++) { 245 if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) { 246 bcache_misses += (nblk - i); 247 complete = 0; 248 if (nblk - i > BCACHE_MINREADAHEAD && bc->ra > BCACHE_MINREADAHEAD) 249 bc->ra >>= 1; /* reduce read ahead */ 250 break; 251 } else { 252 bcache_hits++; 253 } 254 } 255 256 if (complete) { /* whole set was in cache, return it */ 257 if (bc->ra < BCACHE_READAHEAD) 258 bc->ra <<= 1; /* increase read ahead */ 259 bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); 260 goto done; 261 } 262 263 /* 264 * Fill in any misses. From check we have i pointing to first missing 265 * block, read in all remaining blocks + readahead. 266 * We have space at least for nblk - i before bcache wraps. 267 */ 268 p_blk = blk + i; 269 p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk)); 270 r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */ 271 272 p_size = MIN(r_size, nblk - i); /* read at least those blocks */ 273 274 /* 275 * The read ahead size setup. 276 * While the read ahead can save us IO, it also can complicate things: 277 * 1. We do not want to read ahead by wrapping around the 278 * bcache end - this would complicate the cache management. 279 * 2. We are using bc->ra as dynamic hint for read ahead size, 280 * detected cache hits will increase the read-ahead block count, and 281 * misses will decrease, see the code above. 282 * 3. The bcache is sized by 512B blocks, however, the underlying device 283 * may have a larger sector size, and we should perform the IO by 284 * taking into account these larger sector sizes. We could solve this by 285 * passing the sector size to bcache_allocate(), or by using ioctl(), but 286 * in this version we are using the constant, 16 blocks, and are rounding 287 * read ahead block count down to multiple of 16. 288 * Using the constant has two reasons, we are not entirely sure if the 289 * BIOS disk interface is providing the correct value for sector size. 290 * And secondly, this way we get the most conservative setup for the ra. 291 * 292 * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however, 293 * we want to cover CDs (2K) and 4K disks. 294 * bcache_allocate() will always fall back to a minimum of 32 blocks. 295 * Our choice of 16 read ahead blocks will always fit inside the bcache. 296 */ 297 298 if ((rw & F_NORA) == F_NORA) 299 ra = 0; 300 else 301 ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size); 302 303 if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */ 304 ra = MIN(bc->ra, ra - 1); 305 ra = rounddown(ra, 16); /* multiple of 16 blocks */ 306 p_size += ra; 307 } 308 309 /* invalidate bcache */ 310 for (i = 0; i < p_size; i++) { 311 bcache_invalidate(bc, p_blk + i); 312 } 313 314 r_size = 0; 315 /* 316 * with read-ahead, it may happen we are attempting to read past 317 * disk end, as bcache has no information about disk size. 318 * in such case we should get partial read if some blocks can be 319 * read or error, if no blocks can be read. 320 * in either case we should return the data in bcache and only 321 * return error if there is no data. 322 */ 323 rw &= F_MASK; 324 result = dd->dv_strategy(dd->dv_devdata, rw, p_blk, 325 p_size * bcache_blksize, p_buf, &r_size); 326 327 r_size /= bcache_blksize; 328 for (i = 0; i < r_size; i++) 329 bcache_insert(bc, p_blk + i); 330 331 /* update ra statistics */ 332 if (r_size != 0) { 333 if (r_size < p_size) 334 bcache_rablks += (p_size - r_size); 335 else 336 bcache_rablks += ra; 337 } 338 339 /* check how much data can we copy */ 340 for (i = 0; i < nblk; i++) { 341 if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) 342 break; 343 } 344 345 if (size > i * bcache_blksize) 346 size = i * bcache_blksize; 347 348 if (size != 0) { 349 bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); 350 result = 0; 351 } 352 353 if (*marker != BCACHE_MARKER) { 354 printf("BUG: bcache corruption detected: nblks: %zu p_blk: %lu, " 355 "p_size: %zu, ra: %zu\n", bc->bcache_nblks, 356 (long unsigned)BHASH(bc, p_blk), p_size, ra); 357 } 358 359 done: 360 if ((result == 0) && (rsize != NULL)) 361 *rsize = size; 362 return(result); 363} 364 365/* 366 * Requests larger than 1/2 cache size will be bypassed and go 367 * directly to the disk. XXX tune this. 368 */ 369int 370bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size, 371 char *buf, size_t *rsize) 372{ 373 struct bcache_devdata *dd = (struct bcache_devdata *)devdata; 374 struct bcache *bc = dd->dv_cache; 375 u_int bcache_nblks = 0; 376 int nblk, cblk, ret; 377 size_t csize, isize, total; 378 379 bcache_ops++; 380 381 if (bc != NULL) 382 bcache_nblks = bc->bcache_nblks; 383 384 /* bypass large requests, or when the cache is inactive */ 385 if (bc == NULL || 386 ((size * 2 / bcache_blksize) > bcache_nblks)) { 387 DPRINTF("bypass %zu from %qu", size / bcache_blksize, blk); 388 bcache_bypasses++; 389 rw &= F_MASK; 390 return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); 391 } 392 393 switch (rw & F_MASK) { 394 case F_READ: 395 nblk = size / bcache_blksize; 396 if (size != 0 && nblk == 0) 397 nblk++; /* read at least one block */ 398 399 ret = 0; 400 total = 0; 401 while(size) { 402 cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */ 403 cblk = MIN(cblk, nblk); 404 405 if (size <= bcache_blksize) 406 csize = size; 407 else 408 csize = cblk * bcache_blksize; 409 410 ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize); 411 412 /* 413 * we may have error from read ahead, if we have read some data 414 * return partial read. 415 */ 416 if (ret != 0 || isize == 0) { 417 if (total != 0) 418 ret = 0; 419 break; 420 } 421 blk += isize / bcache_blksize; 422 total += isize; 423 size -= isize; 424 nblk = size / bcache_blksize; 425 } 426 427 if (rsize) 428 *rsize = total; 429 430 return (ret); 431 case F_WRITE: 432 return write_strategy(devdata, F_WRITE, blk, size, buf, rsize); 433 } 434 return -1; 435} 436 437/* 438 * Free allocated bcache instance 439 */ 440static void 441bcache_free_instance(struct bcache *bc) 442{ 443 if (bc != NULL) { 444 free(bc->bcache_ctl); 445 free(bc->bcache_data); 446 free(bc); 447 } 448} 449 450/* 451 * Insert a block into the cache. 452 */ 453static void 454bcache_insert(struct bcache *bc, daddr_t blkno) 455{ 456 u_int cand; 457 458 cand = BHASH(bc, blkno); 459 460 DPRINTF("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount); 461 bc->bcache_ctl[cand].bc_blkno = blkno; 462 bc->bcache_ctl[cand].bc_count = bcache_bcount++; 463} 464 465/* 466 * Invalidate a block from the cache. 467 */ 468static void 469bcache_invalidate(struct bcache *bc, daddr_t blkno) 470{ 471 u_int i; 472 473 i = BHASH(bc, blkno); 474 if (bc->bcache_ctl[i].bc_blkno == blkno) { 475 bc->bcache_ctl[i].bc_count = -1; 476 bc->bcache_ctl[i].bc_blkno = -1; 477 DPRINTF("invalidate blk %llu", blkno); 478 } 479} 480 481#ifndef BOOT2 482COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache); 483 484static int 485command_bcache(int argc, char *argv[] __unused) 486{ 487 if (argc != 1) { 488 command_errmsg = "wrong number of arguments"; 489 return(CMD_ERROR); 490 } 491 492 printf("\ncache blocks: %u\n", bcache_total_nblks); 493 printf("cache blocksz: %u\n", bcache_blksize); 494 printf("cache readahead: %u\n", bcache_rablks); 495 printf("unit cache blocks: %u\n", bcache_unit_nblks); 496 printf("cached units: %u\n", bcache_units); 497 printf("%u ops %d bypasses %u hits %u misses\n", bcache_ops, 498 bcache_bypasses, bcache_hits, bcache_misses); 499 return(CMD_OK); 500} 501#endif 502