1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22209962Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25249195Smm/* 26321610Smav * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 27249195Smm */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/spa.h> 31168404Spjd#include <sys/vdev_impl.h> 32168404Spjd#include <sys/zio.h> 33185029Spjd#include <sys/kstat.h> 34321610Smav#include <sys/abd.h> 35168404Spjd 36168404Spjd/* 37168404Spjd * Virtual device read-ahead caching. 38168404Spjd * 39168404Spjd * This file implements a simple LRU read-ahead cache. When the DMU reads 40168404Spjd * a given block, it will often want other, nearby blocks soon thereafter. 41168404Spjd * We take advantage of this by reading a larger disk region and caching 42185029Spjd * the result. In the best case, this can turn 128 back-to-back 512-byte 43185029Spjd * reads into a single 64k read followed by 127 cache hits; this reduces 44168404Spjd * latency dramatically. In the worst case, it can turn an isolated 512-byte 45185029Spjd * read into a 64k read, which doesn't affect latency all that much but is 46168404Spjd * terribly wasteful of bandwidth. A more intelligent version of the cache 47168404Spjd * could keep track of access patterns and not do read-ahead unless it sees 48185029Spjd * at least two temporally close I/Os to the same region. Currently, only 49185029Spjd * metadata I/O is inflated. A futher enhancement could take advantage of 50185029Spjd * more semantic information about the I/O. And it could use something 51185029Spjd * faster than an AVL tree; that was chosen solely for convenience. 52168404Spjd * 53168404Spjd * There are five cache operations: allocate, fill, read, write, evict. 54168404Spjd * 55168404Spjd * (1) Allocate. This reserves a cache entry for the specified region. 56168404Spjd * We separate the allocate and fill operations so that multiple threads 57168404Spjd * don't generate I/O for the same cache miss. 58168404Spjd * 59168404Spjd * (2) Fill. When the I/O for a cache miss completes, the fill routine 60168404Spjd * places the data in the previously allocated cache entry. 61168404Spjd * 62168404Spjd * (3) Read. Read data from the cache. 63168404Spjd * 64168404Spjd * (4) Write. Update cache contents after write completion. 65168404Spjd * 66168404Spjd * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry 67168404Spjd * if the total cache size exceeds zfs_vdev_cache_size. 68168404Spjd */ 69168404Spjd 70168404Spjd/* 71168404Spjd * These tunables are for performance analysis. 72168404Spjd */ 73168404Spjd/* 74168404Spjd * All i/os smaller than zfs_vdev_cache_max will be turned into 75168404Spjd * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software 76185029Spjd * track buffer). At most zfs_vdev_cache_size bytes will be kept in each 77168404Spjd * vdev's vdev_cache. 78223622Smm * 79223622Smm * TODO: Note that with the current ZFS code, it turns out that the 80223622Smm * vdev cache is not helpful, and in some cases actually harmful. It 81223622Smm * is better if we disable this. Once some time has passed, we should 82223622Smm * actually remove this to simplify the code. For now we just disable 83223622Smm * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 84223622Smm * has made these same changes. 85168404Spjd */ 86185029Spjdint zfs_vdev_cache_max = 1<<14; /* 16KB */ 87223622Smmint zfs_vdev_cache_size = 0; 88168404Spjdint zfs_vdev_cache_bshift = 16; 89168404Spjd 90185029Spjd#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ 91185029Spjd 92168404SpjdSYSCTL_DECL(_vfs_zfs_vdev); 93168404SpjdSYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); 94168404SpjdSYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN, 95168404Spjd &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size"); 96168404SpjdSYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN, 97168404Spjd &zfs_vdev_cache_size, 0, "Size of VDEV cache"); 98185029SpjdSYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN, 99185029Spjd &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value"); 100168404Spjd 101185029Spjdkstat_t *vdc_ksp = NULL; 102168404Spjd 103185029Spjdtypedef struct vdc_stats { 104185029Spjd kstat_named_t vdc_stat_delegations; 105185029Spjd kstat_named_t vdc_stat_hits; 106185029Spjd kstat_named_t vdc_stat_misses; 107185029Spjd} vdc_stats_t; 108185029Spjd 109185029Spjdstatic vdc_stats_t vdc_stats = { 110185029Spjd { "delegations", KSTAT_DATA_UINT64 }, 111185029Spjd { "hits", KSTAT_DATA_UINT64 }, 112185029Spjd { "misses", KSTAT_DATA_UINT64 } 113185029Spjd}; 114185029Spjd 115270247Sdelphij#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64); 116185029Spjd 117168404Spjdstatic int 118168404Spjdvdev_cache_offset_compare(const void *a1, const void *a2) 119168404Spjd{ 120168404Spjd const vdev_cache_entry_t *ve1 = a1; 121168404Spjd const vdev_cache_entry_t *ve2 = a2; 122168404Spjd 123168404Spjd if (ve1->ve_offset < ve2->ve_offset) 124168404Spjd return (-1); 125168404Spjd if (ve1->ve_offset > ve2->ve_offset) 126168404Spjd return (1); 127168404Spjd return (0); 128168404Spjd} 129168404Spjd 130168404Spjdstatic int 131168404Spjdvdev_cache_lastused_compare(const void *a1, const void *a2) 132168404Spjd{ 133168404Spjd const vdev_cache_entry_t *ve1 = a1; 134168404Spjd const vdev_cache_entry_t *ve2 = a2; 135168404Spjd 136168404Spjd if (ve1->ve_lastused < ve2->ve_lastused) 137168404Spjd return (-1); 138168404Spjd if (ve1->ve_lastused > ve2->ve_lastused) 139168404Spjd return (1); 140168404Spjd 141168404Spjd /* 142168404Spjd * Among equally old entries, sort by offset to ensure uniqueness. 143168404Spjd */ 144168404Spjd return (vdev_cache_offset_compare(a1, a2)); 145168404Spjd} 146168404Spjd 147168404Spjd/* 148168404Spjd * Evict the specified entry from the cache. 149168404Spjd */ 150168404Spjdstatic void 151168404Spjdvdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) 152168404Spjd{ 153168404Spjd ASSERT(MUTEX_HELD(&vc->vc_lock)); 154321610Smav ASSERT3P(ve->ve_fill_io, ==, NULL); 155321610Smav ASSERT3P(ve->ve_abd, !=, NULL); 156168404Spjd 157168404Spjd avl_remove(&vc->vc_lastused_tree, ve); 158168404Spjd avl_remove(&vc->vc_offset_tree, ve); 159321610Smav abd_free(ve->ve_abd); 160168404Spjd kmem_free(ve, sizeof (vdev_cache_entry_t)); 161168404Spjd} 162168404Spjd 163168404Spjd/* 164168404Spjd * Allocate an entry in the cache. At the point we don't have the data, 165168404Spjd * we're just creating a placeholder so that multiple threads don't all 166168404Spjd * go off and read the same blocks. 167168404Spjd */ 168168404Spjdstatic vdev_cache_entry_t * 169168404Spjdvdev_cache_allocate(zio_t *zio) 170168404Spjd{ 171168404Spjd vdev_cache_t *vc = &zio->io_vd->vdev_cache; 172168404Spjd uint64_t offset = P2ALIGN(zio->io_offset, VCBS); 173168404Spjd vdev_cache_entry_t *ve; 174168404Spjd 175168404Spjd ASSERT(MUTEX_HELD(&vc->vc_lock)); 176168404Spjd 177168404Spjd if (zfs_vdev_cache_size == 0) 178168404Spjd return (NULL); 179168404Spjd 180168404Spjd /* 181168404Spjd * If adding a new entry would exceed the cache size, 182168404Spjd * evict the oldest entry (LRU). 183168404Spjd */ 184168404Spjd if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > 185168404Spjd zfs_vdev_cache_size) { 186168404Spjd ve = avl_first(&vc->vc_lastused_tree); 187185029Spjd if (ve->ve_fill_io != NULL) 188168404Spjd return (NULL); 189321610Smav ASSERT3U(ve->ve_hits, !=, 0); 190168404Spjd vdev_cache_evict(vc, ve); 191168404Spjd } 192168404Spjd 193168404Spjd ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); 194168404Spjd ve->ve_offset = offset; 195219089Spjd ve->ve_lastused = ddi_get_lbolt(); 196321610Smav ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); 197168404Spjd 198168404Spjd avl_add(&vc->vc_offset_tree, ve); 199168404Spjd avl_add(&vc->vc_lastused_tree, ve); 200168404Spjd 201168404Spjd return (ve); 202168404Spjd} 203168404Spjd 204168404Spjdstatic void 205168404Spjdvdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) 206168404Spjd{ 207168404Spjd uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 208168404Spjd 209168404Spjd ASSERT(MUTEX_HELD(&vc->vc_lock)); 210321610Smav ASSERT3P(ve->ve_fill_io, ==, NULL); 211168404Spjd 212219089Spjd if (ve->ve_lastused != ddi_get_lbolt()) { 213168404Spjd avl_remove(&vc->vc_lastused_tree, ve); 214219089Spjd ve->ve_lastused = ddi_get_lbolt(); 215168404Spjd avl_add(&vc->vc_lastused_tree, ve); 216168404Spjd } 217168404Spjd 218168404Spjd ve->ve_hits++; 219321610Smav abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); 220168404Spjd} 221168404Spjd 222168404Spjd/* 223168404Spjd * Fill a previously allocated cache entry with data. 224168404Spjd */ 225168404Spjdstatic void 226209962Smmvdev_cache_fill(zio_t *fio) 227168404Spjd{ 228209962Smm vdev_t *vd = fio->io_vd; 229168404Spjd vdev_cache_t *vc = &vd->vdev_cache; 230209962Smm vdev_cache_entry_t *ve = fio->io_private; 231209962Smm zio_t *pio; 232168404Spjd 233321610Smav ASSERT3U(fio->io_size, ==, VCBS); 234168404Spjd 235168404Spjd /* 236168404Spjd * Add data to the cache. 237168404Spjd */ 238168404Spjd mutex_enter(&vc->vc_lock); 239168404Spjd 240321610Smav ASSERT3P(ve->ve_fill_io, ==, fio); 241321610Smav ASSERT3U(ve->ve_offset, ==, fio->io_offset); 242321610Smav ASSERT3P(ve->ve_abd, ==, fio->io_abd); 243168404Spjd 244168404Spjd ve->ve_fill_io = NULL; 245168404Spjd 246168404Spjd /* 247168404Spjd * Even if this cache line was invalidated by a missed write update, 248168404Spjd * any reads that were queued up before the missed update are still 249168404Spjd * valid, so we can satisfy them from this line before we evict it. 250168404Spjd */ 251307277Smav zio_link_t *zl = NULL; 252307277Smav while ((pio = zio_walk_parents(fio, &zl)) != NULL) 253209962Smm vdev_cache_hit(vc, ve, pio); 254168404Spjd 255209962Smm if (fio->io_error || ve->ve_missed_update) 256168404Spjd vdev_cache_evict(vc, ve); 257168404Spjd 258168404Spjd mutex_exit(&vc->vc_lock); 259168404Spjd} 260168404Spjd 261168404Spjd/* 262260150Sdelphij * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. 263168404Spjd */ 264260150Sdelphijboolean_t 265168404Spjdvdev_cache_read(zio_t *zio) 266168404Spjd{ 267168404Spjd vdev_cache_t *vc = &zio->io_vd->vdev_cache; 268168404Spjd vdev_cache_entry_t *ve, ve_search; 269168404Spjd uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); 270168404Spjd uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 271168404Spjd zio_t *fio; 272168404Spjd 273321610Smav ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); 274168404Spjd 275168404Spjd if (zio->io_flags & ZIO_FLAG_DONT_CACHE) 276260150Sdelphij return (B_FALSE); 277168404Spjd 278168404Spjd if (zio->io_size > zfs_vdev_cache_max) 279260150Sdelphij return (B_FALSE); 280168404Spjd 281168404Spjd /* 282168404Spjd * If the I/O straddles two or more cache blocks, don't cache it. 283168404Spjd */ 284208047Smm if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) 285260150Sdelphij return (B_FALSE); 286168404Spjd 287321610Smav ASSERT3U(cache_phase + zio->io_size, <=, VCBS); 288168404Spjd 289168404Spjd mutex_enter(&vc->vc_lock); 290168404Spjd 291168404Spjd ve_search.ve_offset = cache_offset; 292168404Spjd ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); 293168404Spjd 294168404Spjd if (ve != NULL) { 295168404Spjd if (ve->ve_missed_update) { 296168404Spjd mutex_exit(&vc->vc_lock); 297260150Sdelphij return (B_FALSE); 298168404Spjd } 299168404Spjd 300168404Spjd if ((fio = ve->ve_fill_io) != NULL) { 301168404Spjd zio_vdev_io_bypass(zio); 302209962Smm zio_add_child(zio, fio); 303168404Spjd mutex_exit(&vc->vc_lock); 304185029Spjd VDCSTAT_BUMP(vdc_stat_delegations); 305260150Sdelphij return (B_TRUE); 306168404Spjd } 307168404Spjd 308168404Spjd vdev_cache_hit(vc, ve, zio); 309168404Spjd zio_vdev_io_bypass(zio); 310168404Spjd 311168404Spjd mutex_exit(&vc->vc_lock); 312185029Spjd VDCSTAT_BUMP(vdc_stat_hits); 313260150Sdelphij return (B_TRUE); 314168404Spjd } 315168404Spjd 316168404Spjd ve = vdev_cache_allocate(zio); 317168404Spjd 318168404Spjd if (ve == NULL) { 319168404Spjd mutex_exit(&vc->vc_lock); 320260150Sdelphij return (B_FALSE); 321168404Spjd } 322168404Spjd 323185029Spjd fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, 324321610Smav ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, 325185029Spjd ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); 326168404Spjd 327168404Spjd ve->ve_fill_io = fio; 328168404Spjd zio_vdev_io_bypass(zio); 329209962Smm zio_add_child(zio, fio); 330168404Spjd 331168404Spjd mutex_exit(&vc->vc_lock); 332168404Spjd zio_nowait(fio); 333185029Spjd VDCSTAT_BUMP(vdc_stat_misses); 334168404Spjd 335260150Sdelphij return (B_TRUE); 336168404Spjd} 337168404Spjd 338168404Spjd/* 339168404Spjd * Update cache contents upon write completion. 340168404Spjd */ 341168404Spjdvoid 342168404Spjdvdev_cache_write(zio_t *zio) 343168404Spjd{ 344168404Spjd vdev_cache_t *vc = &zio->io_vd->vdev_cache; 345168404Spjd vdev_cache_entry_t *ve, ve_search; 346168404Spjd uint64_t io_start = zio->io_offset; 347168404Spjd uint64_t io_end = io_start + zio->io_size; 348168404Spjd uint64_t min_offset = P2ALIGN(io_start, VCBS); 349168404Spjd uint64_t max_offset = P2ROUNDUP(io_end, VCBS); 350168404Spjd avl_index_t where; 351168404Spjd 352321610Smav ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 353168404Spjd 354168404Spjd mutex_enter(&vc->vc_lock); 355168404Spjd 356168404Spjd ve_search.ve_offset = min_offset; 357168404Spjd ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); 358168404Spjd 359168404Spjd if (ve == NULL) 360168404Spjd ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); 361168404Spjd 362168404Spjd while (ve != NULL && ve->ve_offset < max_offset) { 363168404Spjd uint64_t start = MAX(ve->ve_offset, io_start); 364168404Spjd uint64_t end = MIN(ve->ve_offset + VCBS, io_end); 365168404Spjd 366168404Spjd if (ve->ve_fill_io != NULL) { 367168404Spjd ve->ve_missed_update = 1; 368168404Spjd } else { 369321610Smav abd_copy_off(ve->ve_abd, zio->io_abd, 370321610Smav start - ve->ve_offset, start - io_start, 371321610Smav end - start); 372168404Spjd } 373168404Spjd ve = AVL_NEXT(&vc->vc_offset_tree, ve); 374168404Spjd } 375168404Spjd mutex_exit(&vc->vc_lock); 376168404Spjd} 377168404Spjd 378168404Spjdvoid 379185029Spjdvdev_cache_purge(vdev_t *vd) 380185029Spjd{ 381185029Spjd vdev_cache_t *vc = &vd->vdev_cache; 382185029Spjd vdev_cache_entry_t *ve; 383185029Spjd 384185029Spjd mutex_enter(&vc->vc_lock); 385185029Spjd while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) 386185029Spjd vdev_cache_evict(vc, ve); 387185029Spjd mutex_exit(&vc->vc_lock); 388185029Spjd} 389185029Spjd 390185029Spjdvoid 391168404Spjdvdev_cache_init(vdev_t *vd) 392168404Spjd{ 393168404Spjd vdev_cache_t *vc = &vd->vdev_cache; 394168404Spjd 395168404Spjd mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); 396168404Spjd 397168404Spjd avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, 398168404Spjd sizeof (vdev_cache_entry_t), 399168404Spjd offsetof(struct vdev_cache_entry, ve_offset_node)); 400168404Spjd 401168404Spjd avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, 402168404Spjd sizeof (vdev_cache_entry_t), 403168404Spjd offsetof(struct vdev_cache_entry, ve_lastused_node)); 404168404Spjd} 405168404Spjd 406168404Spjdvoid 407168404Spjdvdev_cache_fini(vdev_t *vd) 408168404Spjd{ 409168404Spjd vdev_cache_t *vc = &vd->vdev_cache; 410168404Spjd 411185029Spjd vdev_cache_purge(vd); 412168404Spjd 413168404Spjd avl_destroy(&vc->vc_offset_tree); 414168404Spjd avl_destroy(&vc->vc_lastused_tree); 415168404Spjd 416168404Spjd mutex_destroy(&vc->vc_lock); 417168404Spjd} 418185029Spjd 419185029Spjdvoid 420185029Spjdvdev_cache_stat_init(void) 421185029Spjd{ 422185029Spjd vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", 423185029Spjd KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), 424185029Spjd KSTAT_FLAG_VIRTUAL); 425185029Spjd if (vdc_ksp != NULL) { 426185029Spjd vdc_ksp->ks_data = &vdc_stats; 427185029Spjd kstat_install(vdc_ksp); 428185029Spjd } 429185029Spjd} 430185029Spjd 431185029Spjdvoid 432185029Spjdvdev_cache_stat_fini(void) 433185029Spjd{ 434185029Spjd if (vdc_ksp != NULL) { 435185029Spjd kstat_delete(vdc_ksp); 436185029Spjd vdc_ksp = NULL; 437185029Spjd } 438185029Spjd} 439