trim_map.c revision 248575
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. 23 * All rights reserved. 24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/spa_impl.h> 28#include <sys/vdev_impl.h> 29#include <sys/trim_map.h> 30#include <sys/time.h> 31 32/* 33 * Calculate the zio end, upgrading based on ashift which would be 34 * done by zio_vdev_io_start. 35 * 36 * This makes free range consolidation much more effective 37 * than it would otherwise be as well as ensuring that entire 38 * blocks are invalidated by writes. 39 */ 40#define TRIM_ZIO_END(vd, offset, size) (offset + \ 41 P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) 42 43typedef struct trim_map { 44 list_t tm_head; /* List of segments sorted by txg. */ 45 avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ 46 avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ 47 avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ 48 list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ 49 kmutex_t tm_lock; 50} trim_map_t; 51 52typedef struct trim_seg { 53 avl_node_t ts_node; /* AVL node. */ 54 list_node_t ts_next; /* List element. */ 55 uint64_t ts_start; /* Starting offset of this segment. */ 56 uint64_t ts_end; /* Ending offset (non-inclusive). */ 57 uint64_t ts_txg; /* Segment creation txg. */ 58 hrtime_t ts_time; /* Segment creation time. */ 59} trim_seg_t; 60 61extern boolean_t zfs_notrim; 62 63SYSCTL_DECL(_vfs_zfs); 64/* Delay TRIMs by that many TXGs. */ 65static int trim_txg_limit = 64; 66TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit); 67SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0, 68 "Delay TRIMs by that many TXGs."); 69 70static int trim_l2arc_limit = 30; 71TUNABLE_INT("vfs.zfs.trim_l2arc_limit", &trim_l2arc_limit); 72SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_l2arc_limit, CTLFLAG_RWTUN, &trim_l2arc_limit, 0, 73 "Delay TRIMs by this many seconds for cache devices."); 74 75static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); 76 77static int 78trim_map_seg_compare(const void *x1, const void *x2) 79{ 80 const trim_seg_t *s1 = x1; 81 const trim_seg_t *s2 = x2; 82 83 if (s1->ts_start < s2->ts_start) { 84 if (s1->ts_end > s2->ts_start) 85 return (0); 86 return (-1); 87 } 88 if (s1->ts_start > s2->ts_start) { 89 if (s1->ts_start < s2->ts_end) 90 return (0); 91 return (1); 92 } 93 return (0); 94} 95 96static int 97trim_map_zio_compare(const void *x1, const void *x2) 98{ 99 const zio_t *z1 = x1; 100 const zio_t *z2 = x2; 101 102 if (z1->io_offset < z2->io_offset) { 103 if (z1->io_offset + z1->io_size > z2->io_offset) 104 return (0); 105 return (-1); 106 } 107 if (z1->io_offset > z2->io_offset) { 108 if (z1->io_offset < z2->io_offset + z2->io_size) 109 return (0); 110 return (1); 111 } 112 return (0); 113} 114 115void 116trim_map_create(vdev_t *vd) 117{ 118 trim_map_t *tm; 119 120 ASSERT(vd->vdev_ops->vdev_op_leaf); 121 122 if (zfs_notrim) 123 return; 124 125 tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); 126 mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); 127 list_create(&tm->tm_head, sizeof (trim_seg_t), 128 offsetof(trim_seg_t, ts_next)); 129 list_create(&tm->tm_pending_writes, sizeof (zio_t), 130 offsetof(zio_t, io_trim_link)); 131 avl_create(&tm->tm_queued_frees, trim_map_seg_compare, 132 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 133 avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, 134 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 135 avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, 136 sizeof (zio_t), offsetof(zio_t, io_trim_node)); 137 vd->vdev_trimmap = tm; 138} 139 140void 141trim_map_destroy(vdev_t *vd) 142{ 143 trim_map_t *tm; 144 trim_seg_t *ts; 145 146 ASSERT(vd->vdev_ops->vdev_op_leaf); 147 148 if (zfs_notrim) 149 return; 150 151 tm = vd->vdev_trimmap; 152 if (tm == NULL) 153 return; 154 155 /* 156 * We may have been called before trim_map_vdev_commit_done() 157 * had a chance to run, so do it now to prune the remaining 158 * inflight frees. 159 */ 160 trim_map_vdev_commit_done(vd->vdev_spa, vd); 161 162 mutex_enter(&tm->tm_lock); 163 while ((ts = list_head(&tm->tm_head)) != NULL) { 164 avl_remove(&tm->tm_queued_frees, ts); 165 list_remove(&tm->tm_head, ts); 166 kmem_free(ts, sizeof (*ts)); 167 } 168 mutex_exit(&tm->tm_lock); 169 170 avl_destroy(&tm->tm_queued_frees); 171 avl_destroy(&tm->tm_inflight_frees); 172 avl_destroy(&tm->tm_inflight_writes); 173 list_destroy(&tm->tm_pending_writes); 174 list_destroy(&tm->tm_head); 175 mutex_destroy(&tm->tm_lock); 176 kmem_free(tm, sizeof (*tm)); 177 vd->vdev_trimmap = NULL; 178} 179 180static void 181trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 182{ 183 avl_index_t where; 184 trim_seg_t tsearch, *ts_before, *ts_after, *ts; 185 boolean_t merge_before, merge_after; 186 hrtime_t time; 187 188 ASSERT(MUTEX_HELD(&tm->tm_lock)); 189 VERIFY(start < end); 190 191 time = gethrtime(); 192 tsearch.ts_start = start; 193 tsearch.ts_end = end; 194 195 ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); 196 if (ts != NULL) { 197 if (start < ts->ts_start) 198 trim_map_segment_add(tm, start, ts->ts_start, txg); 199 if (end > ts->ts_end) 200 trim_map_segment_add(tm, ts->ts_end, end, txg); 201 return; 202 } 203 204 ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); 205 ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); 206 207 merge_before = (ts_before != NULL && ts_before->ts_end == start && 208 ts_before->ts_txg == txg); 209 merge_after = (ts_after != NULL && ts_after->ts_start == end && 210 ts_after->ts_txg == txg); 211 212 if (merge_before && merge_after) { 213 avl_remove(&tm->tm_queued_frees, ts_before); 214 list_remove(&tm->tm_head, ts_before); 215 ts_after->ts_start = ts_before->ts_start; 216 kmem_free(ts_before, sizeof (*ts_before)); 217 } else if (merge_before) { 218 ts_before->ts_end = end; 219 } else if (merge_after) { 220 ts_after->ts_start = start; 221 } else { 222 ts = kmem_alloc(sizeof (*ts), KM_SLEEP); 223 ts->ts_start = start; 224 ts->ts_end = end; 225 ts->ts_txg = txg; 226 ts->ts_time = time; 227 avl_insert(&tm->tm_queued_frees, ts, where); 228 list_insert_tail(&tm->tm_head, ts); 229 } 230} 231 232static void 233trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, 234 uint64_t end) 235{ 236 trim_seg_t *nts; 237 boolean_t left_over, right_over; 238 239 ASSERT(MUTEX_HELD(&tm->tm_lock)); 240 241 left_over = (ts->ts_start < start); 242 right_over = (ts->ts_end > end); 243 244 if (left_over && right_over) { 245 nts = kmem_alloc(sizeof (*nts), KM_SLEEP); 246 nts->ts_start = end; 247 nts->ts_end = ts->ts_end; 248 nts->ts_txg = ts->ts_txg; 249 nts->ts_time = ts->ts_time; 250 ts->ts_end = start; 251 avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); 252 list_insert_after(&tm->tm_head, ts, nts); 253 } else if (left_over) { 254 ts->ts_end = start; 255 } else if (right_over) { 256 ts->ts_start = end; 257 } else { 258 avl_remove(&tm->tm_queued_frees, ts); 259 list_remove(&tm->tm_head, ts); 260 kmem_free(ts, sizeof (*ts)); 261 } 262} 263 264static void 265trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 266{ 267 zio_t zsearch, *zs; 268 269 ASSERT(MUTEX_HELD(&tm->tm_lock)); 270 271 zsearch.io_offset = start; 272 zsearch.io_size = end - start; 273 274 zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); 275 if (zs == NULL) { 276 trim_map_segment_add(tm, start, end, txg); 277 return; 278 } 279 if (start < zs->io_offset) 280 trim_map_free_locked(tm, start, zs->io_offset, txg); 281 if (zs->io_offset + zs->io_size < end) 282 trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); 283} 284 285void 286trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 287{ 288 trim_map_t *tm = vd->vdev_trimmap; 289 290 if (zfs_notrim || vd->vdev_notrim || tm == NULL) 291 return; 292 293 mutex_enter(&tm->tm_lock); 294 trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); 295 mutex_exit(&tm->tm_lock); 296} 297 298boolean_t 299trim_map_write_start(zio_t *zio) 300{ 301 vdev_t *vd = zio->io_vd; 302 trim_map_t *tm = vd->vdev_trimmap; 303 trim_seg_t tsearch, *ts; 304 boolean_t left_over, right_over; 305 uint64_t start, end; 306 307 if (zfs_notrim || vd->vdev_notrim || tm == NULL) 308 return (B_TRUE); 309 310 start = zio->io_offset; 311 end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); 312 tsearch.ts_start = start; 313 tsearch.ts_end = end; 314 315 mutex_enter(&tm->tm_lock); 316 317 /* 318 * Checking for colliding in-flight frees. 319 */ 320 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); 321 if (ts != NULL) { 322 list_insert_tail(&tm->tm_pending_writes, zio); 323 mutex_exit(&tm->tm_lock); 324 return (B_FALSE); 325 } 326 327 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 328 if (ts != NULL) { 329 /* 330 * Loop until all overlapping segments are removed. 331 */ 332 do { 333 trim_map_segment_remove(tm, ts, start, end); 334 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 335 } while (ts != NULL); 336 } 337 avl_add(&tm->tm_inflight_writes, zio); 338 339 mutex_exit(&tm->tm_lock); 340 341 return (B_TRUE); 342} 343 344void 345trim_map_write_done(zio_t *zio) 346{ 347 vdev_t *vd = zio->io_vd; 348 trim_map_t *tm = vd->vdev_trimmap; 349 350 /* 351 * Don't check for vdev_notrim, since the write could have 352 * started before vdev_notrim was set. 353 */ 354 if (zfs_notrim || tm == NULL) 355 return; 356 357 mutex_enter(&tm->tm_lock); 358 /* 359 * Don't fail if the write isn't in the tree, since the write 360 * could have started after vdev_notrim was set. 361 */ 362 if (zio->io_trim_node.avl_child[0] || 363 zio->io_trim_node.avl_child[1] || 364 AVL_XPARENT(&zio->io_trim_node) || 365 tm->tm_inflight_writes.avl_root == &zio->io_trim_node) 366 avl_remove(&tm->tm_inflight_writes, zio); 367 mutex_exit(&tm->tm_lock); 368} 369 370/* 371 * Return the oldest segment (the one with the lowest txg) or false if 372 * the list is empty or the first element's txg is greater than txg given 373 * as function argument, or the first element's time is greater than time 374 * given as function argument 375 */ 376static trim_seg_t * 377trim_map_first(trim_map_t *tm, uint64_t txg, hrtime_t time) 378{ 379 trim_seg_t *ts; 380 381 ASSERT(MUTEX_HELD(&tm->tm_lock)); 382 383 ts = list_head(&tm->tm_head); 384 if (ts != NULL && ts->ts_txg <= txg && ts->ts_time <= time) 385 return (ts); 386 return (NULL); 387} 388 389static void 390trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 391{ 392 trim_map_t *tm = vd->vdev_trimmap; 393 trim_seg_t *ts; 394 uint64_t start, size, txglimit; 395 hrtime_t timelimit; 396 397 ASSERT(vd->vdev_ops->vdev_op_leaf); 398 399 if (tm == NULL) 400 return; 401 402 if (vd->vdev_isl2cache) { 403 timelimit = gethrtime() - trim_l2arc_limit * NANOSEC; 404 txglimit = UINT64_MAX; 405 } else { 406 timelimit = TIME_MAX; 407 txglimit = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)) - 408 trim_txg_limit; 409 } 410 411 mutex_enter(&tm->tm_lock); 412 /* 413 * Loop until we send all frees up to the txglimit 414 * or time limit if this is a cache device. 415 */ 416 while ((ts = trim_map_first(tm, txglimit, timelimit)) != NULL) { 417 list_remove(&tm->tm_head, ts); 418 avl_remove(&tm->tm_queued_frees, ts); 419 avl_add(&tm->tm_inflight_frees, ts); 420 zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, 421 ts->ts_end - ts->ts_start)); 422 } 423 mutex_exit(&tm->tm_lock); 424} 425 426static void 427trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) 428{ 429 trim_map_t *tm = vd->vdev_trimmap; 430 trim_seg_t *ts; 431 list_t pending_writes; 432 zio_t *zio; 433 uint64_t start, size; 434 void *cookie; 435 436 ASSERT(vd->vdev_ops->vdev_op_leaf); 437 438 if (tm == NULL) 439 return; 440 441 mutex_enter(&tm->tm_lock); 442 if (!avl_is_empty(&tm->tm_inflight_frees)) { 443 cookie = NULL; 444 while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, 445 &cookie)) != NULL) { 446 kmem_free(ts, sizeof (*ts)); 447 } 448 } 449 list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, 450 io_trim_link)); 451 list_move_tail(&pending_writes, &tm->tm_pending_writes); 452 mutex_exit(&tm->tm_lock); 453 454 while ((zio = list_remove_head(&pending_writes)) != NULL) { 455 zio_vdev_io_reissue(zio); 456 zio_execute(zio); 457 } 458 list_destroy(&pending_writes); 459} 460 461static void 462trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 463{ 464 int c; 465 466 if (vd == NULL || spa_last_synced_txg(spa) <= trim_txg_limit) 467 return; 468 469 if (vd->vdev_ops->vdev_op_leaf) { 470 trim_map_vdev_commit(spa, zio, vd); 471 } else { 472 for (c = 0; c < vd->vdev_children; c++) 473 trim_map_commit(spa, zio, vd->vdev_child[c]); 474 } 475} 476 477static void 478trim_map_commit_done(spa_t *spa, vdev_t *vd) 479{ 480 int c; 481 482 if (vd == NULL) 483 return; 484 485 if (vd->vdev_ops->vdev_op_leaf) { 486 trim_map_vdev_commit_done(spa, vd); 487 } else { 488 for (c = 0; c < vd->vdev_children; c++) 489 trim_map_commit_done(spa, vd->vdev_child[c]); 490 } 491} 492 493static void 494trim_thread(void *arg) 495{ 496 spa_t *spa = arg; 497 zio_t *zio; 498 499 for (;;) { 500 mutex_enter(&spa->spa_trim_lock); 501 if (spa->spa_trim_thread == NULL) { 502 spa->spa_trim_thread = curthread; 503 cv_signal(&spa->spa_trim_cv); 504 mutex_exit(&spa->spa_trim_lock); 505 thread_exit(); 506 } 507 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 508 mutex_exit(&spa->spa_trim_lock); 509 510 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 511 512 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 513 trim_map_commit(spa, zio, spa->spa_root_vdev); 514 (void) zio_wait(zio); 515 trim_map_commit_done(spa, spa->spa_root_vdev); 516 spa_config_exit(spa, SCL_STATE, FTAG); 517 } 518} 519 520void 521trim_thread_create(spa_t *spa) 522{ 523 524 if (zfs_notrim) 525 return; 526 527 mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); 528 cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); 529 mutex_enter(&spa->spa_trim_lock); 530 spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, 531 TS_RUN, minclsyspri); 532 mutex_exit(&spa->spa_trim_lock); 533} 534 535void 536trim_thread_destroy(spa_t *spa) 537{ 538 539 if (zfs_notrim) 540 return; 541 if (spa->spa_trim_thread == NULL) 542 return; 543 544 mutex_enter(&spa->spa_trim_lock); 545 /* Setting spa_trim_thread to NULL tells the thread to stop. */ 546 spa->spa_trim_thread = NULL; 547 cv_signal(&spa->spa_trim_cv); 548 /* The thread will set it back to != NULL on exit. */ 549 while (spa->spa_trim_thread == NULL) 550 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 551 spa->spa_trim_thread = NULL; 552 mutex_exit(&spa->spa_trim_lock); 553 554 cv_destroy(&spa->spa_trim_cv); 555 mutex_destroy(&spa->spa_trim_lock); 556} 557 558void 559trim_thread_wakeup(spa_t *spa) 560{ 561 562 if (zfs_notrim) 563 return; 564 if (spa->spa_trim_thread == NULL) 565 return; 566 567 mutex_enter(&spa->spa_trim_lock); 568 cv_signal(&spa->spa_trim_cv); 569 mutex_exit(&spa->spa_trim_lock); 570} 571