dmu_zfetch.c revision 177698
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>

/*
 * I'm against tune-ables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */

/* Global kill switch for all zfetch prefetching, exported via sysctl. */
int zfs_prefetch_disable = 0;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
    &zfs_prefetch_disable, 0, "Disable prefetch");

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/*
 * number of bytes in a array_read at which we stop prefetching (1Mb)
 * NOTE(review): not referenced in this file; presumably consumed by the
 * dmu.c array-read path -- confirm against callers before removing.
 */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;

/* forward decls for static routines */
static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);

/*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
 * In other words: if we find two sequential access streams that are
 * the same length and distance N appart, and this read is N from the
 * last stream, then we are probably in a strided access pattern.  So
 * combine the two sequential streams into a single strided stream.
 *
 * If no co-linear streams are found, return NULL.
 *
 * Returns 1 if a strided stream was created (and a prefetch issued),
 * 0 otherwise.  Takes zf_rwlock as WRITER opportunistically; if the
 * try-enter fails we simply give up rather than block the read path.
 */
static int
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
	zstream_t	*z_walk;
	zstream_t	*z_comp;

	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	if (zh == NULL) {
		rw_exit(&zf->zf_rwlock);
		return (0);
	}

	/*
	 * O(n^2) walk over all unordered pairs of streams, looking for two
	 * purely-sequential streams (len == stride) a constant distance
	 * apart, with this read one more stride beyond the second.
	 */
	for (z_walk = list_head(&zf->zf_stream); z_walk;
	    z_walk = list_next(&zf->zf_stream, z_walk)) {
		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
		    z_comp = list_next(&zf->zf_stream, z_comp)) {
			int64_t		diff;

			/* only coalesce streams that are still sequential */
			if (z_walk->zst_len != z_walk->zst_stride ||
			    z_comp->zst_len != z_comp->zst_stride) {
				continue;
			}

			/* case 1: z_walk -> z_comp -> zh (forward stride) */
			diff = z_comp->zst_offset - z_walk->zst_offset;
			if (z_comp->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}

			/* case 2: z_comp -> z_walk -> zh (mirror of above) */
			diff = z_walk->zst_offset - z_comp->zst_offset;
			if (z_walk->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}
		}
	}

	rw_exit(&zf->zf_rwlock);
	return (0);
}

/*
 * Given a zstream_t, determine the bounds of the prefetch.  Then call the
 * routine that actually prefetches the individual blocks.
 *
 * Caller holds zf->zf_rwlock and (on the dmu_zfetch_find() path) the
 * stream's zst_lock; this routine mutates the stream's stride/cap/
 * ph_offset bookkeeping in place and stamps zst_last for the reaper.
 */
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
	uint64_t	prefetch_tail;
	uint64_t	prefetch_limit;
	uint64_t	prefetch_ofst;
	uint64_t	prefetch_len;
	uint64_t	blocks_fetched;

	/* a stride shorter than the run length makes no sense; clamp up */
	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
	/* double the fetch window on each visit, capped at the tunable */
	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);

	/* resume where the previous prefetch left off, if it's ahead */
	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
	    (int64_t)(zs->zst_offset + zs->zst_stride));
	/*
	 * XXX: use a faster division method?
	 */
	prefetch_limit = zs->zst_offset + zs->zst_len +
	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;

	while (prefetch_tail < prefetch_limit) {
		/* for backward streams zst_direction is -1: mirror the ofst */
		prefetch_ofst = zs->zst_offset + zs->zst_direction *
		    (prefetch_tail - zs->zst_offset);

		prefetch_len = zs->zst_len;

		/*
		 * Don't prefetch beyond the end of the file, if working
		 * backwards.
		 */
		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
		    (prefetch_ofst > prefetch_tail)) {
			/* unsigned underflow detected; clip run to block 0 */
			prefetch_len += prefetch_ofst;
			prefetch_ofst = 0;
		}

		/* don't prefetch more than we're supposed to */
		if (prefetch_len > prefetch_len)
			break;

		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
		    prefetch_ofst, zs->zst_len);

		prefetch_tail += zs->zst_stride;
		/* stop if we've run out of stuff to prefetch */
		if (blocks_fetched < zs->zst_len)
			break;
	}
	zs->zst_ph_offset = prefetch_tail;
	zs->zst_last = lbolt;
}

/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL) {
		return;
	}

	zf->zf_dnode = dno;
	zf->zf_stream_cnt = 0;
	zf->zf_alloc_fail = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zst_node));

	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}

/*
 * This function computes the actual size, in blocks, that can be prefetched,
 * and fetches it.
 */
static uint64_t
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;
	uint64_t	i;

	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);

	for (i = 0; i < fetchsz; i++) {
		dbuf_prefetch(dn, blkid + i);
	}

	return (fetchsz);
}

/*
 * this function returns the number of blocks that would be prefetched, based
 * upon the supplied dnode, blockid, and nblks.  This is used so that we can
 * update streams in place, and then prefetch with their old value after the
 * fact.  This way, we can delay the prefetch, but subsequent accesses to the
 * stream won't result in the same data being prefetched multiple times.
 */
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;

	/* nothing to fetch past the last block of the object */
	if (blkid > dn->dn_maxblkid) {
		return (0);
	}

	/*
	 * compute fetch size
	 *
	 * NOTE(review): the "+ 1" here means the clamp can trigger when the
	 * request still fits, and the clamped value (maxblkid - blkid + 1)
	 * can then EXCEED nblks -- e.g. blkid=8, nblks=2, maxblkid=10 yields
	 * fetchsz=3.  All fetched blocks remain <= dn_maxblkid (the ASSERT
	 * guards that), so this over-fetches rather than overruns; confirm
	 * intent before "fixing".
	 */
	if (blkid + nblks + 1 > dn->dn_maxblkid) {
		fetchsz = (dn->dn_maxblkid - blkid) + 1;
		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
	} else {
		fetchsz = nblks;
	}


	return (fetchsz);
}

/*
 * given a zfetch and a zsearch structure, see if there is an associated zstream
 * for this block read.  If so, it starts a prefetch for the stream it
 * located and returns true, otherwise it returns false
 *
 * Locking: runs under zf_rwlock as READER.  Each match is confirmed by an
 * optimistic check / mutex_enter(zst_lock) / re-check sequence; a failed
 * re-check restarts the whole scan ("goto top") since the stream changed
 * underneath us.  When a stream must be reset (deleted), the rwlock is
 * upgraded by drop-and-reacquire as WRITER, and the stream is re-located
 * by pointer before removal in case another thread freed it in the gap.
 */
static int
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t	*zs;
	int64_t		diff;
	int		reset = !prefetched;
	int		rc = 0;

	if (zh == NULL)
		return (0);

	/*
	 * XXX: This locking strategy is a bit coarse; however, it's impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			continue;
		}

		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			/* already fetched */
			rc = 1;
			goto out;
		}

		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			/* an established (len > 1) stream is being re-read */
			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			/* re-check under the lock; racer may have moved us */
			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				/* slide the window forward to stay capped */
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			/* re-check under the lock */
			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			/* ternaries clamp the unsigned math at zero */
			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */

			mutex_enter(&zs->zst_lock);

			/* re-check under the lock */
			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			mutex_enter(&zs->zst_lock);

			/* re-check under the lock */
			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset = (zs->zst_ph_offset >
			    (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}

	/* zs != NULL only via one of the "break"s above: zst_lock is held */
	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			rc = 0;
			mutex_exit(&zs->zst_lock);
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);
	return (rc);
}

/*
 * Clean-up state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream tree, and generally makes things
 * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
 */
void
dmu_zfetch_rele(zfetch_t *zf)
{
	zstream_t	*zs;
	zstream_t	*zs_next;

	/* teardown must be single-threaded: nobody may hold the lock */
	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));

	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);

		list_remove(&zf->zf_stream, zs);
		mutex_destroy(&zs->zst_lock);
		kmem_free(zs, sizeof (zstream_t));
	}
	list_destroy(&zf->zf_stream);
	rw_destroy(&zf->zf_rwlock);

	zf->zf_dnode = NULL;
}

/*
 * Given a zfetch and zstream structure, insert the zstream structure into the
 * AVL tree contained within the zfetch structure.  Peform the appropriate
 * book-keeping.  It is possible that another thread has inserted a stream which
 * matches one that we are about to insert, so we must be sure to check for this
 * case.  If one is found, return failure, and let the caller cleanup the
 * duplicates.
 *
 * NOTE(review): the comment above says "AVL tree" but zf_stream is a plain
 * list (see dmu_zfetch_init) -- likely stale from an earlier design.
 */
static int
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
{
	zstream_t	*zs_walk;
	zstream_t	*zs_next;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	/* refuse to insert an exact duplicate of an existing stream */
	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs_walk);

		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
			return (0);
		}
	}

	list_insert_head(&zf->zf_stream, zs);
	zf->zf_stream_cnt++;

	return (1);
}


/*
 * Walk the list of zstreams in the given zfetch, find an old one (by time), and
 * reclaim it for use by the caller.
 *
 * Returns a zeroed-out stream (still linked to nothing) on success, or NULL
 * if no stream was stale enough or the write lock couldn't be acquired.
 */
static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t *zf)
{
	zstream_t	*zs;

	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/* reap streams untouched for zfetch_min_sec_reap seconds */
		if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
			break;
	}

	if (zs) {
		dmu_zfetch_stream_remove(zf, zs);
		mutex_destroy(&zs->zst_lock);
		bzero(zs, sizeof (zstream_t));
	} else {
		/* statistic only: how often we failed to find a victim */
		zf->zf_alloc_fail++;
	}
	rw_exit(&zf->zf_rwlock);

	return (zs);
}

/*
 * Given a zfetch and zstream structure, remove the zstream structure from its
 * container in the zfetch structure.  Perform the appropriate book-keeping.
 */
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	list_remove(&zf->zf_stream, zs);
	zf->zf_stream_cnt--;
}

/*
 * Field-by-field equality of two streams; used by stream_insert to detect
 * a duplicate created by a racing thread.
 */
static int
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
{
	if (zs1->zst_offset != zs2->zst_offset)
		return (0);

	if (zs1->zst_len != zs2->zst_len)
		return (0);

	if (zs1->zst_stride != zs2->zst_stride)
		return (0);

	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
		return (0);

	if (zs1->zst_cap != zs2->zst_cap)
		return (0);

	if (zs1->zst_direction != zs2->zst_direction)
		return (0);

	return (1);
}

/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 *
 * offset/size are in bytes; they are converted to a block-aligned
 * (blockid, nblocks) pair before stream matching.  "prefetched" indicates
 * the caller believes this data was already prefetched, which suppresses
 * the stream-reset heuristic in dmu_zfetch_find().
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t	zst;
	zstream_t	*newstream;
	int		fetched;
	int		inserted;
	unsigned int	blkshft;
	uint64_t	blksz;

	if (zfs_prefetch_disable)
		return;

	/* files that aren't ln2 blocksz are only one block -- nothing to do */
	if (!zf->zf_dnode->dn_datablkshift)
		return;

	/* convert offset and size, into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	/* round the byte range out to full blocks before counting them */
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;

	/* try an existing stream first, then the strided-pair heuristic */
	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (!fetched) {
		fetched = dmu_zfetch_colinear(zf, &zst);
	}

	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * we still couldn't find a stream, drop the lock, and allocate
		 * one if possible.  Otherwise, give up and go home.
		 */
		if (newstream == NULL) {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;

			/* small files get fewer streams, but at least one */
			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}

			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}

		/* seed the new (or reclaimed) stream as forward-sequential */
		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = lbolt;

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		/* a racing thread inserted an identical stream; drop ours */
		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}