1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22208130Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25168404Spjd 26168404Spjd#include <sys/zfs_context.h> 27168404Spjd#include <sys/dnode.h> 28168404Spjd#include <sys/dmu_objset.h> 29168404Spjd#include <sys/dmu_zfetch.h> 30168404Spjd#include <sys/dmu.h> 31168404Spjd#include <sys/dbuf.h> 32208130Smm#include <sys/kstat.h> 33168404Spjd 34168404Spjd/* 35168404Spjd * I'm against tune-ables, but these should probably exist as tweakable globals 36168404Spjd * until we can get this working the way we want it to. 37168404Spjd */ 38168404Spjd 39194043Skmacyint zfs_prefetch_disable = 0; 40168404Spjd 41168404Spjd/* max # of streams per zfetch */ 42168404Spjduint32_t zfetch_max_streams = 8; 43168404Spjd/* min time before stream reclaim */ 44168404Spjduint32_t zfetch_min_sec_reap = 2; 45168404Spjd/* max number of blocks to fetch at a time */ 46168404Spjduint32_t zfetch_block_cap = 256; 47168404Spjd/* number of bytes in a array_read at which we stop prefetching (1Mb) */ 48168404Spjduint64_t zfetch_array_rd_sz = 1024 * 1024; 49168404Spjd 50185029SpjdSYSCTL_DECL(_vfs_zfs); 51205132SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, 52194043Skmacy &zfs_prefetch_disable, 0, "Disable prefetch"); 53185029SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); 54185029SpjdTUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams); 55205132SkmacySYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW, 56185029Spjd &zfetch_max_streams, 0, "Max # of streams per zfetch"); 57185029SpjdTUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap); 58185029SpjdSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN, 59185029Spjd &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); 60185029SpjdTUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap); 61185029SpjdSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN, 62185029Spjd &zfetch_block_cap, 0, "Max number of blocks to fetch at a time"); 63185029SpjdTUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz); 64217367SmdfSYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN, 65185029Spjd &zfetch_array_rd_sz, 0, 66185029Spjd "Number of bytes in a array_read at which we stop prefetching"); 67185029Spjd 68168404Spjd/* forward decls for static routines */ 69251629Sdelphijstatic boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); 70168404Spjdstatic void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); 71168404Spjdstatic uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); 72168404Spjdstatic uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); 73251629Sdelphijstatic boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); 74168404Spjdstatic int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); 75168404Spjdstatic zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); 76168404Spjdstatic void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); 77168404Spjdstatic int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); 78168404Spjd 79208130Smmtypedef struct zfetch_stats { 80208130Smm kstat_named_t zfetchstat_hits; 81208130Smm kstat_named_t zfetchstat_misses; 82208130Smm kstat_named_t zfetchstat_colinear_hits; 83208130Smm kstat_named_t zfetchstat_colinear_misses; 84208130Smm kstat_named_t zfetchstat_stride_hits; 85208130Smm kstat_named_t zfetchstat_stride_misses; 86208130Smm kstat_named_t zfetchstat_reclaim_successes; 87208130Smm kstat_named_t zfetchstat_reclaim_failures; 88208130Smm kstat_named_t zfetchstat_stream_resets; 89208130Smm kstat_named_t zfetchstat_stream_noresets; 90208130Smm kstat_named_t zfetchstat_bogus_streams; 91208130Smm} zfetch_stats_t; 92208130Smm 93208130Smmstatic zfetch_stats_t zfetch_stats = { 94208130Smm { "hits", KSTAT_DATA_UINT64 }, 95208130Smm { "misses", KSTAT_DATA_UINT64 }, 96208130Smm { "colinear_hits", KSTAT_DATA_UINT64 }, 97208130Smm { "colinear_misses", KSTAT_DATA_UINT64 }, 98208130Smm { "stride_hits", KSTAT_DATA_UINT64 }, 99208130Smm { "stride_misses", KSTAT_DATA_UINT64 }, 100208130Smm { "reclaim_successes", KSTAT_DATA_UINT64 }, 101208130Smm { "reclaim_failures", KSTAT_DATA_UINT64 }, 102208130Smm { "streams_resets", KSTAT_DATA_UINT64 }, 103208130Smm { "streams_noresets", KSTAT_DATA_UINT64 }, 104208130Smm { "bogus_streams", KSTAT_DATA_UINT64 }, 105208130Smm}; 106208130Smm 107208130Smm#define ZFETCHSTAT_INCR(stat, val) \ 108208130Smm atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); 109208130Smm 110208130Smm#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1); 111208130Smm 112208130Smmkstat_t *zfetch_ksp; 113208130Smm 114168404Spjd/* 115168404Spjd * Given a zfetch structure and a zstream structure, determine whether the 116168404Spjd * blocks to be read are part of a co-linear pair of existing prefetch 117168404Spjd * streams. If a set is found, coalesce the streams, removing one, and 118168404Spjd * configure the prefetch so it looks for a strided access pattern. 119168404Spjd * 120168404Spjd * In other words: if we find two sequential access streams that are 121168404Spjd * the same length and distance N appart, and this read is N from the 122168404Spjd * last stream, then we are probably in a strided access pattern. So 123168404Spjd * combine the two sequential streams into a single strided stream. 124168404Spjd * 125251629Sdelphij * Returns whether co-linear streams were found. 126168404Spjd */ 127251629Sdelphijstatic boolean_t 128168404Spjddmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) 129168404Spjd{ 130168404Spjd zstream_t *z_walk; 131168404Spjd zstream_t *z_comp; 132168404Spjd 133168404Spjd if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) 134168404Spjd return (0); 135168404Spjd 136168404Spjd if (zh == NULL) { 137168404Spjd rw_exit(&zf->zf_rwlock); 138168404Spjd return (0); 139168404Spjd } 140168404Spjd 141168404Spjd for (z_walk = list_head(&zf->zf_stream); z_walk; 142168404Spjd z_walk = list_next(&zf->zf_stream, z_walk)) { 143168404Spjd for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; 144168404Spjd z_comp = list_next(&zf->zf_stream, z_comp)) { 145168404Spjd int64_t diff; 146168404Spjd 147168404Spjd if (z_walk->zst_len != z_walk->zst_stride || 148168404Spjd z_comp->zst_len != z_comp->zst_stride) { 149168404Spjd continue; 150168404Spjd } 151168404Spjd 152168404Spjd diff = z_comp->zst_offset - z_walk->zst_offset; 153168404Spjd if (z_comp->zst_offset + diff == zh->zst_offset) { 154168404Spjd z_walk->zst_offset = zh->zst_offset; 155168404Spjd z_walk->zst_direction = diff < 0 ? -1 : 1; 156168404Spjd z_walk->zst_stride = 157168404Spjd diff * z_walk->zst_direction; 158168404Spjd z_walk->zst_ph_offset = 159168404Spjd zh->zst_offset + z_walk->zst_stride; 160168404Spjd dmu_zfetch_stream_remove(zf, z_comp); 161168404Spjd mutex_destroy(&z_comp->zst_lock); 162168404Spjd kmem_free(z_comp, sizeof (zstream_t)); 163168404Spjd 164168404Spjd dmu_zfetch_dofetch(zf, z_walk); 165168404Spjd 166168404Spjd rw_exit(&zf->zf_rwlock); 167168404Spjd return (1); 168168404Spjd } 169168404Spjd 170168404Spjd diff = z_walk->zst_offset - z_comp->zst_offset; 171168404Spjd if (z_walk->zst_offset + diff == zh->zst_offset) { 172168404Spjd z_walk->zst_offset = zh->zst_offset; 173168404Spjd z_walk->zst_direction = diff < 0 ? -1 : 1; 174168404Spjd z_walk->zst_stride = 175168404Spjd diff * z_walk->zst_direction; 176168404Spjd z_walk->zst_ph_offset = 177168404Spjd zh->zst_offset + z_walk->zst_stride; 178168404Spjd dmu_zfetch_stream_remove(zf, z_comp); 179168404Spjd mutex_destroy(&z_comp->zst_lock); 180168404Spjd kmem_free(z_comp, sizeof (zstream_t)); 181168404Spjd 182168404Spjd dmu_zfetch_dofetch(zf, z_walk); 183168404Spjd 184168404Spjd rw_exit(&zf->zf_rwlock); 185168404Spjd return (1); 186168404Spjd } 187168404Spjd } 188168404Spjd } 189168404Spjd 190168404Spjd rw_exit(&zf->zf_rwlock); 191168404Spjd return (0); 192168404Spjd} 193168404Spjd 194168404Spjd/* 195168404Spjd * Given a zstream_t, determine the bounds of the prefetch. Then call the 196168404Spjd * routine that actually prefetches the individual blocks. 197168404Spjd */ 198168404Spjdstatic void 199168404Spjddmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) 200168404Spjd{ 201168404Spjd uint64_t prefetch_tail; 202168404Spjd uint64_t prefetch_limit; 203168404Spjd uint64_t prefetch_ofst; 204168404Spjd uint64_t prefetch_len; 205168404Spjd uint64_t blocks_fetched; 206168404Spjd 207168404Spjd zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); 208168404Spjd zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); 209168404Spjd 210168404Spjd prefetch_tail = MAX((int64_t)zs->zst_ph_offset, 211168404Spjd (int64_t)(zs->zst_offset + zs->zst_stride)); 212168404Spjd /* 213168404Spjd * XXX: use a faster division method? 214168404Spjd */ 215168404Spjd prefetch_limit = zs->zst_offset + zs->zst_len + 216168404Spjd (zs->zst_cap * zs->zst_stride) / zs->zst_len; 217168404Spjd 218168404Spjd while (prefetch_tail < prefetch_limit) { 219168404Spjd prefetch_ofst = zs->zst_offset + zs->zst_direction * 220168404Spjd (prefetch_tail - zs->zst_offset); 221168404Spjd 222168404Spjd prefetch_len = zs->zst_len; 223168404Spjd 224168404Spjd /* 225168404Spjd * Don't prefetch beyond the end of the file, if working 226168404Spjd * backwards. 227168404Spjd */ 228168404Spjd if ((zs->zst_direction == ZFETCH_BACKWARD) && 229168404Spjd (prefetch_ofst > prefetch_tail)) { 230168404Spjd prefetch_len += prefetch_ofst; 231168404Spjd prefetch_ofst = 0; 232168404Spjd } 233168404Spjd 234168404Spjd /* don't prefetch more than we're supposed to */ 235168404Spjd if (prefetch_len > zs->zst_len) 236168404Spjd break; 237168404Spjd 238168404Spjd blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, 239168404Spjd prefetch_ofst, zs->zst_len); 240168404Spjd 241168404Spjd prefetch_tail += zs->zst_stride; 242168404Spjd /* stop if we've run out of stuff to prefetch */ 243168404Spjd if (blocks_fetched < zs->zst_len) 244168404Spjd break; 245168404Spjd } 246168404Spjd zs->zst_ph_offset = prefetch_tail; 247219089Spjd zs->zst_last = ddi_get_lbolt(); 248168404Spjd} 249168404Spjd 250208130Smmvoid 251208130Smmzfetch_init(void) 252208130Smm{ 253208130Smm 254208130Smm zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 255208130Smm KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 256208130Smm KSTAT_FLAG_VIRTUAL); 257208130Smm 258208130Smm if (zfetch_ksp != NULL) { 259208130Smm zfetch_ksp->ks_data = &zfetch_stats; 260208130Smm kstat_install(zfetch_ksp); 261208130Smm } 262208130Smm} 263208130Smm 264208130Smmvoid 265208130Smmzfetch_fini(void) 266208130Smm{ 267208130Smm if (zfetch_ksp != NULL) { 268208130Smm kstat_delete(zfetch_ksp); 269208130Smm zfetch_ksp = NULL; 270208130Smm } 271208130Smm} 272208130Smm 273168404Spjd/* 274168404Spjd * This takes a pointer to a zfetch structure and a dnode. It performs the 275168404Spjd * necessary setup for the zfetch structure, grokking data from the 276168404Spjd * associated dnode. 277168404Spjd */ 278168404Spjdvoid 279168404Spjddmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 280168404Spjd{ 281168404Spjd if (zf == NULL) { 282168404Spjd return; 283168404Spjd } 284168404Spjd 285168404Spjd zf->zf_dnode = dno; 286168404Spjd zf->zf_stream_cnt = 0; 287168404Spjd zf->zf_alloc_fail = 0; 288168404Spjd 289168404Spjd list_create(&zf->zf_stream, sizeof (zstream_t), 290168404Spjd offsetof(zstream_t, zst_node)); 291168404Spjd 292168404Spjd rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); 293168404Spjd} 294168404Spjd 295168404Spjd/* 296168404Spjd * This function computes the actual size, in blocks, that can be prefetched, 297168404Spjd * and fetches it. 298168404Spjd */ 299168404Spjdstatic uint64_t 300168404Spjddmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) 301168404Spjd{ 302168404Spjd uint64_t fetchsz; 303168404Spjd uint64_t i; 304168404Spjd 305168404Spjd fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); 306168404Spjd 307168404Spjd for (i = 0; i < fetchsz; i++) { 308168404Spjd dbuf_prefetch(dn, blkid + i); 309168404Spjd } 310168404Spjd 311168404Spjd return (fetchsz); 312168404Spjd} 313168404Spjd 314168404Spjd/* 315168404Spjd * this function returns the number of blocks that would be prefetched, based 316168404Spjd * upon the supplied dnode, blockid, and nblks. This is used so that we can 317168404Spjd * update streams in place, and then prefetch with their old value after the 318168404Spjd * fact. This way, we can delay the prefetch, but subsequent accesses to the 319168404Spjd * stream won't result in the same data being prefetched multiple times. 320168404Spjd */ 321168404Spjdstatic uint64_t 322168404Spjddmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) 323168404Spjd{ 324168404Spjd uint64_t fetchsz; 325168404Spjd 326168404Spjd if (blkid > dn->dn_maxblkid) { 327168404Spjd return (0); 328168404Spjd } 329168404Spjd 330168404Spjd /* compute fetch size */ 331168404Spjd if (blkid + nblks + 1 > dn->dn_maxblkid) { 332168404Spjd fetchsz = (dn->dn_maxblkid - blkid) + 1; 333168404Spjd ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); 334168404Spjd } else { 335168404Spjd fetchsz = nblks; 336168404Spjd } 337168404Spjd 338168404Spjd 339168404Spjd return (fetchsz); 340168404Spjd} 341168404Spjd 342168404Spjd/* 343208130Smm * given a zfetch and a zstream structure, see if there is an associated zstream 344168404Spjd * for this block read. If so, it starts a prefetch for the stream it 345168404Spjd * located and returns true, otherwise it returns false 346168404Spjd */ 347251629Sdelphijstatic boolean_t 348168404Spjddmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) 349168404Spjd{ 350168404Spjd zstream_t *zs; 351168404Spjd int64_t diff; 352168404Spjd int reset = !prefetched; 353168404Spjd int rc = 0; 354168404Spjd 355168404Spjd if (zh == NULL) 356168404Spjd return (0); 357168404Spjd 358168404Spjd /* 359168404Spjd * XXX: This locking strategy is a bit coarse; however, it's impact has 360168404Spjd * yet to be tested. If this turns out to be an issue, it can be 361168404Spjd * modified in a number of different ways. 362168404Spjd */ 363168404Spjd 364168404Spjd rw_enter(&zf->zf_rwlock, RW_READER); 365168404Spjdtop: 366168404Spjd 367168404Spjd for (zs = list_head(&zf->zf_stream); zs; 368168404Spjd zs = list_next(&zf->zf_stream, zs)) { 369168404Spjd 370168404Spjd /* 371168404Spjd * XXX - should this be an assert? 372168404Spjd */ 373168404Spjd if (zs->zst_len == 0) { 374168404Spjd /* bogus stream */ 375208130Smm ZFETCHSTAT_BUMP(zfetchstat_bogus_streams); 376168404Spjd continue; 377168404Spjd } 378168404Spjd 379168404Spjd /* 380168404Spjd * We hit this case when we are in a strided prefetch stream: 381168404Spjd * we will read "len" blocks before "striding". 382168404Spjd */ 383168404Spjd if (zh->zst_offset >= zs->zst_offset && 384168404Spjd zh->zst_offset < zs->zst_offset + zs->zst_len) { 385208130Smm if (prefetched) { 386208130Smm /* already fetched */ 387208130Smm ZFETCHSTAT_BUMP(zfetchstat_stride_hits); 388208130Smm rc = 1; 389208130Smm goto out; 390208130Smm } else { 391208130Smm ZFETCHSTAT_BUMP(zfetchstat_stride_misses); 392208130Smm } 393168404Spjd } 394168404Spjd 395168404Spjd /* 396168404Spjd * This is the forward sequential read case: we increment 397168404Spjd * len by one each time we hit here, so we will enter this 398168404Spjd * case on every read. 399168404Spjd */ 400168404Spjd if (zh->zst_offset == zs->zst_offset + zs->zst_len) { 401168404Spjd 402168404Spjd reset = !prefetched && zs->zst_len > 1; 403168404Spjd 404205132Skmacy if (mutex_tryenter(&zs->zst_lock) == 0) { 405205132Skmacy rc = 1; 406205132Skmacy goto out; 407205132Skmacy } 408219089Spjd 409168404Spjd if (zh->zst_offset != zs->zst_offset + zs->zst_len) { 410168404Spjd mutex_exit(&zs->zst_lock); 411168404Spjd goto top; 412168404Spjd } 413168404Spjd zs->zst_len += zh->zst_len; 414168404Spjd diff = zs->zst_len - zfetch_block_cap; 415168404Spjd if (diff > 0) { 416168404Spjd zs->zst_offset += diff; 417168404Spjd zs->zst_len = zs->zst_len > diff ? 418168404Spjd zs->zst_len - diff : 0; 419168404Spjd } 420168404Spjd zs->zst_direction = ZFETCH_FORWARD; 421168404Spjd 422168404Spjd break; 423168404Spjd 424168404Spjd /* 425168404Spjd * Same as above, but reading backwards through the file. 426168404Spjd */ 427168404Spjd } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { 428168404Spjd /* backwards sequential access */ 429168404Spjd 430168404Spjd reset = !prefetched && zs->zst_len > 1; 431168404Spjd 432205132Skmacy if (mutex_tryenter(&zs->zst_lock) == 0) { 433205132Skmacy rc = 1; 434205132Skmacy goto out; 435205132Skmacy } 436219089Spjd 437168404Spjd if (zh->zst_offset != zs->zst_offset - zh->zst_len) { 438168404Spjd mutex_exit(&zs->zst_lock); 439168404Spjd goto top; 440168404Spjd } 441168404Spjd 442168404Spjd zs->zst_offset = zs->zst_offset > zh->zst_len ? 443168404Spjd zs->zst_offset - zh->zst_len : 0; 444168404Spjd zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? 445168404Spjd zs->zst_ph_offset - zh->zst_len : 0; 446168404Spjd zs->zst_len += zh->zst_len; 447168404Spjd 448168404Spjd diff = zs->zst_len - zfetch_block_cap; 449168404Spjd if (diff > 0) { 450168404Spjd zs->zst_ph_offset = zs->zst_ph_offset > diff ? 451168404Spjd zs->zst_ph_offset - diff : 0; 452168404Spjd zs->zst_len = zs->zst_len > diff ? 453168404Spjd zs->zst_len - diff : zs->zst_len; 454168404Spjd } 455168404Spjd zs->zst_direction = ZFETCH_BACKWARD; 456168404Spjd 457168404Spjd break; 458168404Spjd 459168404Spjd } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < 460168404Spjd zs->zst_len) && (zs->zst_len != zs->zst_stride)) { 461168404Spjd /* strided forward access */ 462168404Spjd 463205132Skmacy if (mutex_tryenter(&zs->zst_lock) == 0) { 464205132Skmacy rc = 1; 465205132Skmacy goto out; 466205132Skmacy } 467219089Spjd 468168404Spjd if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= 469168404Spjd zs->zst_len) || (zs->zst_len == zs->zst_stride)) { 470168404Spjd mutex_exit(&zs->zst_lock); 471168404Spjd goto top; 472168404Spjd } 473168404Spjd 474168404Spjd zs->zst_offset += zs->zst_stride; 475168404Spjd zs->zst_direction = ZFETCH_FORWARD; 476168404Spjd 477168404Spjd break; 478168404Spjd 479168404Spjd } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < 480168404Spjd zs->zst_len) && (zs->zst_len != zs->zst_stride)) { 481168404Spjd /* strided reverse access */ 482168404Spjd 483205132Skmacy if (mutex_tryenter(&zs->zst_lock) == 0) { 484205132Skmacy rc = 1; 485205132Skmacy goto out; 486205132Skmacy } 487219089Spjd 488168404Spjd if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= 489168404Spjd zs->zst_len) || (zs->zst_len == zs->zst_stride)) { 490168404Spjd mutex_exit(&zs->zst_lock); 491168404Spjd goto top; 492168404Spjd } 493168404Spjd 494168404Spjd zs->zst_offset = zs->zst_offset > zs->zst_stride ? 495168404Spjd zs->zst_offset - zs->zst_stride : 0; 496168404Spjd zs->zst_ph_offset = (zs->zst_ph_offset > 497168404Spjd (2 * zs->zst_stride)) ? 498168404Spjd (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; 499168404Spjd zs->zst_direction = ZFETCH_BACKWARD; 500168404Spjd 501168404Spjd break; 502168404Spjd } 503168404Spjd } 504168404Spjd 505168404Spjd if (zs) { 506168404Spjd if (reset) { 507168404Spjd zstream_t *remove = zs; 508168404Spjd 509208130Smm ZFETCHSTAT_BUMP(zfetchstat_stream_resets); 510168404Spjd rc = 0; 511168404Spjd mutex_exit(&zs->zst_lock); 512168404Spjd rw_exit(&zf->zf_rwlock); 513168404Spjd rw_enter(&zf->zf_rwlock, RW_WRITER); 514168404Spjd /* 515168404Spjd * Relocate the stream, in case someone removes 516168404Spjd * it while we were acquiring the WRITER lock. 517168404Spjd */ 518168404Spjd for (zs = list_head(&zf->zf_stream); zs; 519168404Spjd zs = list_next(&zf->zf_stream, zs)) { 520168404Spjd if (zs == remove) { 521168404Spjd dmu_zfetch_stream_remove(zf, zs); 522168404Spjd mutex_destroy(&zs->zst_lock); 523168404Spjd kmem_free(zs, sizeof (zstream_t)); 524168404Spjd break; 525168404Spjd } 526168404Spjd } 527168404Spjd } else { 528208130Smm ZFETCHSTAT_BUMP(zfetchstat_stream_noresets); 529168404Spjd rc = 1; 530168404Spjd dmu_zfetch_dofetch(zf, zs); 531168404Spjd mutex_exit(&zs->zst_lock); 532168404Spjd } 533168404Spjd } 534168404Spjdout: 535168404Spjd rw_exit(&zf->zf_rwlock); 536168404Spjd return (rc); 537168404Spjd} 538168404Spjd 539168404Spjd/* 540168404Spjd * Clean-up state associated with a zfetch structure. This frees allocated 541168404Spjd * structure members, empties the zf_stream tree, and generally makes things 542168404Spjd * nice. This doesn't free the zfetch_t itself, that's left to the caller. 543168404Spjd */ 544168404Spjdvoid 545168404Spjddmu_zfetch_rele(zfetch_t *zf) 546168404Spjd{ 547168404Spjd zstream_t *zs; 548168404Spjd zstream_t *zs_next; 549168404Spjd 550168404Spjd ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); 551168404Spjd 552168404Spjd for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { 553168404Spjd zs_next = list_next(&zf->zf_stream, zs); 554168404Spjd 555168404Spjd list_remove(&zf->zf_stream, zs); 556168404Spjd mutex_destroy(&zs->zst_lock); 557168404Spjd kmem_free(zs, sizeof (zstream_t)); 558168404Spjd } 559168404Spjd list_destroy(&zf->zf_stream); 560168404Spjd rw_destroy(&zf->zf_rwlock); 561168404Spjd 562168404Spjd zf->zf_dnode = NULL; 563168404Spjd} 564168404Spjd 565168404Spjd/* 566168404Spjd * Given a zfetch and zstream structure, insert the zstream structure into the 567168404Spjd * AVL tree contained within the zfetch structure. Peform the appropriate 568168404Spjd * book-keeping. It is possible that another thread has inserted a stream which 569168404Spjd * matches one that we are about to insert, so we must be sure to check for this 570168404Spjd * case. If one is found, return failure, and let the caller cleanup the 571168404Spjd * duplicates. 572168404Spjd */ 573168404Spjdstatic int 574168404Spjddmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) 575168404Spjd{ 576168404Spjd zstream_t *zs_walk; 577168404Spjd zstream_t *zs_next; 578168404Spjd 579168404Spjd ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); 580168404Spjd 581168404Spjd for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { 582168404Spjd zs_next = list_next(&zf->zf_stream, zs_walk); 583168404Spjd 584168404Spjd if (dmu_zfetch_streams_equal(zs_walk, zs)) { 585208130Smm return (0); 586168404Spjd } 587168404Spjd } 588168404Spjd 589168404Spjd list_insert_head(&zf->zf_stream, zs); 590168404Spjd zf->zf_stream_cnt++; 591168404Spjd return (1); 592168404Spjd} 593168404Spjd 594168404Spjd 595168404Spjd/* 596168404Spjd * Walk the list of zstreams in the given zfetch, find an old one (by time), and 597168404Spjd * reclaim it for use by the caller. 598168404Spjd */ 599168404Spjdstatic zstream_t * 600168404Spjddmu_zfetch_stream_reclaim(zfetch_t *zf) 601168404Spjd{ 602168404Spjd zstream_t *zs; 603168404Spjd 604168404Spjd if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) 605168404Spjd return (0); 606168404Spjd 607168404Spjd for (zs = list_head(&zf->zf_stream); zs; 608168404Spjd zs = list_next(&zf->zf_stream, zs)) { 609168404Spjd 610219089Spjd if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap) 611168404Spjd break; 612168404Spjd } 613168404Spjd 614168404Spjd if (zs) { 615168404Spjd dmu_zfetch_stream_remove(zf, zs); 616168404Spjd mutex_destroy(&zs->zst_lock); 617168404Spjd bzero(zs, sizeof (zstream_t)); 618168404Spjd } else { 619168404Spjd zf->zf_alloc_fail++; 620168404Spjd } 621168404Spjd rw_exit(&zf->zf_rwlock); 622168404Spjd 623168404Spjd return (zs); 624168404Spjd} 625168404Spjd 626168404Spjd/* 627168404Spjd * Given a zfetch and zstream structure, remove the zstream structure from its 628168404Spjd * container in the zfetch structure. Perform the appropriate book-keeping. 629168404Spjd */ 630168404Spjdstatic void 631168404Spjddmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 632168404Spjd{ 633168404Spjd ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); 634168404Spjd 635168404Spjd list_remove(&zf->zf_stream, zs); 636168404Spjd zf->zf_stream_cnt--; 637168404Spjd} 638168404Spjd 639168404Spjdstatic int 640168404Spjddmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) 641168404Spjd{ 642168404Spjd if (zs1->zst_offset != zs2->zst_offset) 643168404Spjd return (0); 644168404Spjd 645168404Spjd if (zs1->zst_len != zs2->zst_len) 646168404Spjd return (0); 647168404Spjd 648168404Spjd if (zs1->zst_stride != zs2->zst_stride) 649168404Spjd return (0); 650168404Spjd 651168404Spjd if (zs1->zst_ph_offset != zs2->zst_ph_offset) 652168404Spjd return (0); 653168404Spjd 654168404Spjd if (zs1->zst_cap != zs2->zst_cap) 655168404Spjd return (0); 656168404Spjd 657168404Spjd if (zs1->zst_direction != zs2->zst_direction) 658168404Spjd return (0); 659168404Spjd 660168404Spjd return (1); 661168404Spjd} 662168404Spjd 663168404Spjd/* 664168404Spjd * This is the prefetch entry point. It calls all of the other dmu_zfetch 665168404Spjd * routines to create, delete, find, or operate upon prefetch streams. 666168404Spjd */ 667168404Spjdvoid 668168404Spjddmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) 669168404Spjd{ 670168404Spjd zstream_t zst; 671168404Spjd zstream_t *newstream; 672251629Sdelphij boolean_t fetched; 673168404Spjd int inserted; 674168404Spjd unsigned int blkshft; 675168404Spjd uint64_t blksz; 676168404Spjd 677194043Skmacy if (zfs_prefetch_disable) 678168404Spjd return; 679168404Spjd 680168404Spjd /* files that aren't ln2 blocksz are only one block -- nothing to do */ 681168404Spjd if (!zf->zf_dnode->dn_datablkshift) 682168404Spjd return; 683168404Spjd 684168404Spjd /* convert offset and size, into blockid and nblocks */ 685168404Spjd blkshft = zf->zf_dnode->dn_datablkshift; 686168404Spjd blksz = (1 << blkshft); 687168404Spjd 688168404Spjd bzero(&zst, sizeof (zstream_t)); 689168404Spjd zst.zst_offset = offset >> blkshft; 690168404Spjd zst.zst_len = (P2ROUNDUP(offset + size, blksz) - 691168404Spjd P2ALIGN(offset, blksz)) >> blkshft; 692168404Spjd 693168404Spjd fetched = dmu_zfetch_find(zf, &zst, prefetched); 694208130Smm if (fetched) { 695208130Smm ZFETCHSTAT_BUMP(zfetchstat_hits); 696208130Smm } else { 697208130Smm ZFETCHSTAT_BUMP(zfetchstat_misses); 698251629Sdelphij fetched = dmu_zfetch_colinear(zf, &zst); 699251629Sdelphij if (fetched) { 700208130Smm ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); 701208130Smm } else { 702208130Smm ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); 703208130Smm } 704168404Spjd } 705168404Spjd 706168404Spjd if (!fetched) { 707168404Spjd newstream = dmu_zfetch_stream_reclaim(zf); 708168404Spjd 709168404Spjd /* 710168404Spjd * we still couldn't find a stream, drop the lock, and allocate 711168404Spjd * one if possible. Otherwise, give up and go home. 712168404Spjd */ 713208130Smm if (newstream) { 714208130Smm ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); 715208130Smm } else { 716168404Spjd uint64_t maxblocks; 717168404Spjd uint32_t max_streams; 718168404Spjd uint32_t cur_streams; 719168404Spjd 720208130Smm ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); 721168404Spjd cur_streams = zf->zf_stream_cnt; 722168404Spjd maxblocks = zf->zf_dnode->dn_maxblkid; 723168404Spjd 724168404Spjd max_streams = MIN(zfetch_max_streams, 725168404Spjd (maxblocks / zfetch_block_cap)); 726168404Spjd if (max_streams == 0) { 727168404Spjd max_streams++; 728168404Spjd } 729168404Spjd 730168404Spjd if (cur_streams >= max_streams) { 731168404Spjd return; 732168404Spjd } 733168404Spjd newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); 734168404Spjd } 735168404Spjd 736168404Spjd newstream->zst_offset = zst.zst_offset; 737168404Spjd newstream->zst_len = zst.zst_len; 738168404Spjd newstream->zst_stride = zst.zst_len; 739168404Spjd newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; 740168404Spjd newstream->zst_cap = zst.zst_len; 741168404Spjd newstream->zst_direction = ZFETCH_FORWARD; 742219089Spjd newstream->zst_last = ddi_get_lbolt(); 743168404Spjd 744168404Spjd mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); 745168404Spjd 746168404Spjd rw_enter(&zf->zf_rwlock, RW_WRITER); 747168404Spjd inserted = dmu_zfetch_stream_insert(zf, newstream); 748168404Spjd rw_exit(&zf->zf_rwlock); 749168404Spjd 750168404Spjd if (!inserted) { 751168404Spjd mutex_destroy(&newstream->zst_lock); 752168404Spjd kmem_free(newstream, sizeof (zstream_t)); 753168404Spjd } 754168404Spjd } 755168404Spjd} 756