/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
 * All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/trim_map.h>
#include <sys/time.h>

/*
 * Calculate the zio end, upgrading based on ashift which would be
 * done by zio_vdev_io_start.
 *
 * This makes free range consolidation much more effective
 * than it would otherwise be as well as ensuring that entire
 * blocks are invalidated by writes.
 */
#define	TRIM_ZIO_END(vd, offset, size)	(offset + \
	P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))

/*
 * Maximal segment size for ATA TRIM.
 * (512 << 16 bytes; presumably derived from the per-range sector-count
 * limit of the ATA DATA SET MANAGEMENT command — TODO confirm.)
 */
#define	TRIM_MAP_SIZE_FACTOR	(512 << 16)

/* Number of TRIM segments needed to cover a range of `size' bytes. */
#define	TRIM_MAP_SEGS(size)	(1 + (size) / TRIM_MAP_SIZE_FACTOR)

/*
 * Link segment `ts' onto the txg-ordered list and add its segment
 * count to the map's pending total.
 */
#define	TRIM_MAP_ADD(tm, ts)	do {				\
	list_insert_tail(&(tm)->tm_head, (ts));			\
	(tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
} while (0)

/*
 * Unlink segment `ts' from the txg-ordered list and subtract its
 * segment count from the map's pending total.
 */
#define	TRIM_MAP_REM(tm, ts)	do {				\
	list_remove(&(tm)->tm_head, (ts));			\
	(tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
} while (0)

/*
 * Per-vdev TRIM state.  All fields are protected by tm_lock.
 */
typedef struct trim_map {
	list_t		tm_head;	/* List of segments sorted by txg. */
	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
	kmutex_t	tm_lock;	/* Protects every field above and below. */
	uint64_t	tm_pending;	/* Count of pending TRIMs (in TRIM_MAP_SEGS units). */
} trim_map_t;

/*
 * One freed range [ts_start, ts_end) awaiting (or undergoing) TRIM.
 * A segment lives on one AVL tree (queued or inflight) and, while
 * queued, also on the txg-ordered tm_head list.
 */
typedef struct trim_seg {
	avl_node_t	ts_node;	/* AVL node. */
	list_node_t	ts_next;	/* List element. */
	uint64_t	ts_start;	/* Starting offset of this segment. */
	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
	uint64_t	ts_txg;		/* Segment creation txg. */
	hrtime_t	ts_time;	/* Segment creation time.
 */
} trim_seg_t;

extern boolean_t zfs_trim_enabled;

/* Tunables; exposed read-write via the sysctls declared below. */
static u_int trim_txg_delay = 32;	/* Keep deleted data up to 32 TXG */
static u_int trim_timeout = 30;		/* Keep deleted data up to 30s */
static u_int trim_max_interval = 1;	/* 1s delays between TRIMs */
static u_int trim_vdev_max_pending = 10000;	/* Keep up to 10K segments */

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");

SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
    0, "Delay TRIMs by up to this many TXGs");
SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
    "Delay TRIMs by up to this many seconds");
SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
    &trim_max_interval, 0,
    "Maximum interval between TRIM queue processing (seconds)");

SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
    &trim_vdev_max_pending, 0,
    "Maximum pending TRIM segments for a vdev");

static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);

/*
 * AVL comparator for trim segments.  Overlapping segments compare
 * equal (0), so avl_find() returns any segment that intersects the
 * search range; strict ordering applies only to disjoint segments.
 */
static int
trim_map_seg_compare(const void *x1, const void *x2)
{
	const trim_seg_t *s1 = x1;
	const trim_seg_t *s2 = x2;

	if (s1->ts_start < s2->ts_start) {
		if (s1->ts_end > s2->ts_start)
			return (0);	/* s1 overlaps the start of s2. */
		return (-1);
	}
	if (s1->ts_start > s2->ts_start) {
		if (s1->ts_start < s2->ts_end)
			return (0);	/* s1 begins inside s2. */
		return (1);
	}
	return (0);	/* Identical start offsets always overlap. */
}

/*
 * AVL comparator for in-flight write zios, using the same
 * overlap-compares-equal convention as trim_map_seg_compare()
 * on the [io_offset, io_offset + io_size) range.
 */
static int
trim_map_zio_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset) {
		if (z1->io_offset + z1->io_size > z2->io_offset)
			return (0);	/* z1 overlaps the start of z2. */
		return (-1);
	}
	if (z1->io_offset > z2->io_offset) {
		if (z1->io_offset < z2->io_offset + z2->io_size)
			return (0);	/* z1 begins inside z2. */
		return (1);
	}
	return (0);	/* Identical offsets always overlap. */
}

/*
 * Allocate and initialize the TRIM map for a leaf vdev that supports
 * TRIM.  Installs the map in vd->vdev_trimmap.
 */
void
trim_map_create(vdev_t *vd)
{
	trim_map_t *tm;

	ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
	    vd->vdev_ops->vdev_op_leaf);

	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&tm->tm_head, sizeof (trim_seg_t),
	    offsetof(trim_seg_t, ts_next));
	list_create(&tm->tm_pending_writes, sizeof (zio_t),
	    offsetof(zio_t, io_trim_link));
	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
	vd->vdev_trimmap = tm;
}

/*
 * Tear down and free a leaf vdev's TRIM map (no-op if TRIM is
 * disabled or the vdev has no map).  Frees all queued segments and
 * reissues any writes that were blocked on in-flight frees.
 */
void
trim_map_destroy(vdev_t *vd)
{
	trim_map_t *tm;
	trim_seg_t *ts;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (!zfs_trim_enabled)
		return;

	tm = vd->vdev_trimmap;
	if (tm == NULL)
		return;

	/*
	 * We may have been called before trim_map_vdev_commit_done()
	 * had a chance to run, so do it now to prune the remaining
	 * inflight frees.
	 */
	trim_map_vdev_commit_done(vd->vdev_spa, vd);

	mutex_enter(&tm->tm_lock);
	/*
	 * Every queued segment is on both tm_head and tm_queued_frees;
	 * drain the list and remove each one from the tree as we go.
	 */
	while ((ts = list_head(&tm->tm_head)) != NULL) {
		avl_remove(&tm->tm_queued_frees, ts);
		TRIM_MAP_REM(tm, ts);
		kmem_free(ts, sizeof (*ts));
	}
	mutex_exit(&tm->tm_lock);

	avl_destroy(&tm->tm_queued_frees);
	avl_destroy(&tm->tm_inflight_frees);
	avl_destroy(&tm->tm_inflight_writes);
	list_destroy(&tm->tm_pending_writes);
	list_destroy(&tm->tm_head);
	mutex_destroy(&tm->tm_lock);
	kmem_free(tm, sizeof (*tm));
	vd->vdev_trimmap = NULL;
}

/*
 * Add the range [start, end) to the queued-frees tree, coalescing it
 * with adjacent queued segments where possible.  If the range overlaps
 * an already-queued segment, only the non-overlapping leading/trailing
 * pieces are added (recursively).  Merged or new segments are stamped
 * with the given txg and the current time.  Caller holds tm_lock.
 */
static void
trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
	avl_index_t where;
	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
	boolean_t merge_before, merge_after;
	hrtime_t time;

	ASSERT(MUTEX_HELD(&tm->tm_lock));
	VERIFY(start < end);

	time = gethrtime();
	tsearch.ts_start = start;
	tsearch.ts_end = end;

	/* The comparator treats overlap as equality, so this finds
	 * any queued segment intersecting [start, end). */
	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
	if (ts != NULL) {
		/* Overlap: queue only the pieces outside `ts'. */
		if (start < ts->ts_start)
			trim_map_segment_add(tm, start, ts->ts_start, txg);
		if (end > ts->ts_end)
			trim_map_segment_add(tm, ts->ts_end, end, txg);
		return;
	}

	/* No overlap; look for exactly-adjacent neighbors to merge. */
	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);

	merge_before = (ts_before != NULL && ts_before->ts_end == start);
	merge_after = (ts_after != NULL && ts_after->ts_start == end);

	if (merge_before && merge_after) {
		/* Bridge the gap: fold ts_before into ts_after. */
		avl_remove(&tm->tm_queued_frees, ts_before);
		TRIM_MAP_REM(tm, ts_before);
		TRIM_MAP_REM(tm, ts_after);
		ts_after->ts_start = ts_before->ts_start;
		ts_after->ts_txg = txg;
		ts_after->ts_time = time;
		TRIM_MAP_ADD(tm, ts_after);
		kmem_free(ts_before, sizeof (*ts_before));
	} else if (merge_before) {
		/* Extend the preceding segment forward to `end'. */
		TRIM_MAP_REM(tm, ts_before);
		ts_before->ts_end = end;
		ts_before->ts_txg = txg;
		ts_before->ts_time = time;
		TRIM_MAP_ADD(tm, ts_before);
	} else if (merge_after) {
		/* Extend the following segment backward to `start'. */
		TRIM_MAP_REM(tm, ts_after);
		ts_after->ts_start = start;
		ts_after->ts_txg = txg;
		ts_after->ts_time = time;
		TRIM_MAP_ADD(tm, ts_after);
	} else {
		/* Isolated range: create a fresh segment. */
		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
		ts->ts_start = start;
		ts->ts_end = end;
		ts->ts_txg = txg;
		ts->ts_time = time;
		avl_insert(&tm->tm_queued_frees, ts, where);
		TRIM_MAP_ADD(tm, ts);
	}
}

/*
 * Remove the range [start, end) from queued segment `ts', which is
 * known to overlap it.  Depending on how the range cuts `ts', this
 * trims its head, its tail, splits it in two, or deletes it entirely.
 * Caller holds tm_lock.
 */
static void
trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
    uint64_t end)
{
	trim_seg_t *nts;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	left_over = (ts->ts_start < start);
	right_over = (ts->ts_end > end);

	TRIM_MAP_REM(tm, ts);
	if (left_over && right_over) {
		/* Range is interior: split `ts' around it. */
		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
		nts->ts_start = end;
		nts->ts_end = ts->ts_end;
		nts->ts_txg = ts->ts_txg;
		nts->ts_time = ts->ts_time;
		ts->ts_end = start;
		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
		TRIM_MAP_ADD(tm, ts);
		TRIM_MAP_ADD(tm, nts);
	} else if (left_over) {
		/* Range covers the tail of `ts'. */
		ts->ts_end = start;
		TRIM_MAP_ADD(tm, ts);
	} else if (right_over) {
		/* Range covers the head of `ts'. */
		ts->ts_start = end;
		TRIM_MAP_ADD(tm, ts);
	} else {
		/* Range swallows `ts' completely. */
		avl_remove(&tm->tm_queued_frees, ts);
		kmem_free(ts, sizeof (*ts));
	}
}

/*
 * Queue a free of [start, end), excluding any portions that overlap
 * in-flight writes (recursing on the pieces on either side of an
 * overlapping write).  Caller holds tm_lock.
 */
static void
trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
	zio_t zsearch, *zs;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	zsearch.io_offset = start;
	zsearch.io_size = end - start;

	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
	if (zs == NULL) {
		/* No colliding write; queue the whole range. */
		trim_map_segment_add(tm, start, end, txg);
		return;
	}
	if (start < zs->io_offset)
		trim_map_free_locked(tm, start, zs->io_offset, txg);
	if (zs->io_offset + zs->io_size < end)
		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
319} 320 321void 322trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 323{ 324 trim_map_t *tm = vd->vdev_trimmap; 325 326 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) 327 return; 328 329 mutex_enter(&tm->tm_lock); 330 trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); 331 mutex_exit(&tm->tm_lock); 332} 333 334boolean_t 335trim_map_write_start(zio_t *zio) 336{ 337 vdev_t *vd = zio->io_vd; 338 trim_map_t *tm = vd->vdev_trimmap; 339 trim_seg_t tsearch, *ts; 340 boolean_t left_over, right_over; 341 uint64_t start, end; 342 343 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) 344 return (B_TRUE); 345 346 start = zio->io_offset; 347 end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); 348 tsearch.ts_start = start; 349 tsearch.ts_end = end; 350 351 mutex_enter(&tm->tm_lock); 352 353 /* 354 * Checking for colliding in-flight frees. 355 */ 356 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); 357 if (ts != NULL) { 358 list_insert_tail(&tm->tm_pending_writes, zio); 359 mutex_exit(&tm->tm_lock); 360 return (B_FALSE); 361 } 362 363 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 364 if (ts != NULL) { 365 /* 366 * Loop until all overlapping segments are removed. 367 */ 368 do { 369 trim_map_segment_remove(tm, ts, start, end); 370 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 371 } while (ts != NULL); 372 } 373 avl_add(&tm->tm_inflight_writes, zio); 374 375 mutex_exit(&tm->tm_lock); 376 377 return (B_TRUE); 378} 379 380void 381trim_map_write_done(zio_t *zio) 382{ 383 vdev_t *vd = zio->io_vd; 384 trim_map_t *tm = vd->vdev_trimmap; 385 386 /* 387 * Don't check for vdev_notrim, since the write could have 388 * started before vdev_notrim was set. 389 */ 390 if (!zfs_trim_enabled || tm == NULL) 391 return; 392 393 mutex_enter(&tm->tm_lock); 394 /* 395 * Don't fail if the write isn't in the tree, since the write 396 * could have started after vdev_notrim was set. 
	 */
	/*
	 * The zio is linked in the tree iff its AVL node has a child,
	 * a parent, or is the tree's root; only then is avl_remove()
	 * safe to call.
	 */
	if (zio->io_trim_node.avl_child[0] ||
	    zio->io_trim_node.avl_child[1] ||
	    AVL_XPARENT(&zio->io_trim_node) ||
	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
		avl_remove(&tm->tm_inflight_writes, zio);
	mutex_exit(&tm->tm_lock);
}

/*
 * Return the oldest segment (the head of the txg-ordered list, i.e.
 * the one with the lowest txg / time) if it is ready to be TRIMmed,
 * or NULL otherwise.  A segment is ready when its txg does not exceed
 * txgsafe AND at least one of the following holds:
 * 1. Its txg does not exceed the txg argument
 * 2. Its creation time does not exceed the time argument
 * 3. force is set
 */
static trim_seg_t *
trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
    boolean_t force)
{
	trim_seg_t *ts;

	ASSERT(MUTEX_HELD(&tm->tm_lock));
	VERIFY(txgsafe >= txg);

	ts = list_head(&tm->tm_head);
	if (ts != NULL && ts->ts_txg <= txgsafe &&
	    (ts->ts_txg <= txg || ts->ts_time <= time || force))
		return (ts);
	return (NULL);
}

/*
 * Issue TRIM zios for this leaf vdev's eligible queued segments,
 * moving each from tm_queued_frees to tm_inflight_frees.  The batch
 * size ("soft" limit) scales with the backlog, and a "hard" quota
 * forces old segments out regardless of age once tm_pending exceeds
 * trim_vdev_max_pending.
 */
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	uint64_t size, offset, txgtarget, txgsafe;
	int64_t hard, soft;
	hrtime_t timelimit;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	/* Segments created before this time are old enough to TRIM. */
	timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
	if (vd->vdev_isl2cache) {
		/* L2ARC data is not txg-bound; every segment is safe. */
		txgsafe = UINT64_MAX;
		txgtarget = UINT64_MAX;
	} else {
		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
		if (txgsafe > trim_txg_delay)
			txgtarget = txgsafe - trim_txg_delay;
		else
			txgtarget = 0;
	}

	mutex_enter(&tm->tm_lock);
	hard = 0;
	/* Over the cap: force out a quarter of the excess this pass. */
	if (tm->tm_pending > trim_vdev_max_pending)
		hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
	soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
	/* Loop until we have sent all outstanding free's */
	while (soft > 0 &&
	    (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
	    != NULL) {
		TRIM_MAP_REM(tm, ts);
		avl_remove(&tm->tm_queued_frees, ts);
		avl_add(&tm->tm_inflight_frees, ts);
		size = ts->ts_end - ts->ts_start;
		offset = ts->ts_start;
		/*
		 * We drop the lock while we call zio_nowait as the IO
		 * scheduler can result in a different IO being run e.g.
		 * a write which would result in a recursive lock.
		 */
		mutex_exit(&tm->tm_lock);

		zio_nowait(zio_trim(zio, spa, vd, offset, size));

		soft -= TRIM_MAP_SEGS(size);
		hard -= TRIM_MAP_SEGS(size);
		mutex_enter(&tm->tm_lock);
	}
	mutex_exit(&tm->tm_lock);
}

/*
 * Called after the TRIM zios issued by trim_map_vdev_commit() have
 * completed: discard all in-flight free segments and reissue any
 * writes that were blocked on them.
 */
static void
trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	list_t pending_writes;
	zio_t *zio;
	/* NOTE(review): start/size are declared but unused here. */
	uint64_t start, size;
	void *cookie;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	if (!avl_is_empty(&tm->tm_inflight_frees)) {
		cookie = NULL;
		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
		    &cookie)) != NULL) {
			kmem_free(ts, sizeof (*ts));
		}
	}
	/*
	 * Move the blocked writes onto a private list so they can be
	 * reissued after dropping tm_lock.
	 */
	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
	    io_trim_link));
	list_move_tail(&pending_writes, &tm->tm_pending_writes);
	mutex_exit(&tm->tm_lock);

	while ((zio = list_remove_head(&pending_writes)) != NULL) {
		zio_vdev_io_reissue(zio);
		zio_execute(zio);
	}
	list_destroy(&pending_writes);
}

/*
 * Recursively walk the vdev tree and commit queued TRIMs on every
 * leaf vdev, attaching them to the given parent zio.
 */
static void
trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		trim_map_vdev_commit(spa, zio, vd);
	} else {
		for (c = 0; c < vd->vdev_children; c++)
			trim_map_commit(spa, zio, vd->vdev_child[c]);
	}
}

/*
 * Recursively walk the vdev tree and run the post-commit cleanup
 * (trim_map_vdev_commit_done) on every leaf vdev.
 */
static void
trim_map_commit_done(spa_t *spa, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		trim_map_vdev_commit_done(spa, vd);
	} else {
		for (c = 0; c < vd->vdev_children; c++)
			trim_map_commit_done(spa, vd->vdev_child[c]);
	}
}

/*
 * Per-pool TRIM worker thread.  Wakes up at most every
 * trim_max_interval seconds (or when signalled), commits all queued
 * TRIMs under the SCL_STATE config lock, waits for them to finish,
 * then reissues any blocked writes.  Exits when spa_trim_thread is
 * cleared by trim_thread_destroy().
 */
static void
trim_thread(void *arg)
{
	spa_t *spa = arg;
	zio_t *zio;

#ifdef _KERNEL
	(void) snprintf(curthread->td_name, sizeof(curthread->td_name),
	    "trim %s", spa_name(spa));
#endif

	for (;;) {
		mutex_enter(&spa->spa_trim_lock);
		if (spa->spa_trim_thread == NULL) {
			/*
			 * trim_thread_destroy() asked us to stop; set
			 * the pointer back to non-NULL to acknowledge
			 * (it is waiting on spa_trim_cv for this).
			 */
			spa->spa_trim_thread = curthread;
			cv_signal(&spa->spa_trim_cv);
			mutex_exit(&spa->spa_trim_lock);
			thread_exit();
		}

		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
		    hz * trim_max_interval);
		mutex_exit(&spa->spa_trim_lock);

		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		trim_map_commit(spa, zio, spa->spa_root_vdev);
		(void) zio_wait(zio);
		trim_map_commit_done(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);
	}
}

/*
 * Start the pool's TRIM thread (no-op when TRIM is disabled).
 * Initializes the spa_trim_lock/spa_trim_cv pair used to control it.
 */
void
trim_thread_create(spa_t *spa)
{

	if (!zfs_trim_enabled)
		return;

	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
	mutex_enter(&spa->spa_trim_lock);
	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
	    TS_RUN, minclsyspri);
	mutex_exit(&spa->spa_trim_lock);
}

/*
 * Stop the pool's TRIM thread and tear down its synchronization
 * primitives.  Uses the spa_trim_thread pointer as a two-way
 * handshake with trim_thread() (see comments below).
 */
void
trim_thread_destroy(spa_t *spa)
{

	if (!zfs_trim_enabled)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	/* Setting spa_trim_thread to NULL tells the thread to stop. */
	spa->spa_trim_thread = NULL;
	cv_signal(&spa->spa_trim_cv);
	/* The thread will set it back to != NULL on exit. */
	while (spa->spa_trim_thread == NULL)
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
	spa->spa_trim_thread = NULL;
	mutex_exit(&spa->spa_trim_lock);

	cv_destroy(&spa->spa_trim_cv);
	mutex_destroy(&spa->spa_trim_lock);
}

/*
 * Prod the TRIM thread to run a commit pass immediately instead of
 * waiting out its timed sleep.  No-op when TRIM is disabled or the
 * thread is not running.
 */
void
trim_thread_wakeup(spa_t *spa)
{

	if (!zfs_trim_enabled)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	cv_signal(&spa->spa_trim_cv);
	mutex_exit(&spa->spa_trim_lock);
}