1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. 23 * All rights reserved. 24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/spa_impl.h> 28#include <sys/vdev_impl.h> 29#include <sys/trim_map.h> 30#include <sys/time.h> 31 32/* 33 * Calculate the zio end, upgrading based on ashift which would be 34 * done by zio_vdev_io_start. 35 * 36 * This makes free range consolidation much more effective 37 * than it would otherwise be as well as ensuring that entire 38 * blocks are invalidated by writes. 39 */ 40#define TRIM_ZIO_END(vd, offset, size) (offset + \ 41 P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) 42 43/* Maximal segment size for ATA TRIM. */ 44#define TRIM_MAP_SIZE_FACTOR (512 << 16) 45 46#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR) 47 48#define TRIM_MAP_ADD(tm, ts) do { \ 49 list_insert_tail(&(tm)->tm_head, (ts)); \ 50 (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ 51} while (0) 52 53#define TRIM_MAP_REM(tm, ts) do { \ 54 list_remove(&(tm)->tm_head, (ts)); \ 55 (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ 56} while (0) 57 58typedef struct trim_map { 59 list_t tm_head; /* List of segments sorted by txg. */ 60 avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ 61 avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ 62 avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ 63 list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ 64 kmutex_t tm_lock; 65 uint64_t tm_pending; /* Count of pending TRIMs. */ 66} trim_map_t; 67 68typedef struct trim_seg { 69 avl_node_t ts_node; /* AVL node. */ 70 list_node_t ts_next; /* List element. */ 71 uint64_t ts_start; /* Starting offset of this segment. */ 72 uint64_t ts_end; /* Ending offset (non-inclusive). */ 73 uint64_t ts_txg; /* Segment creation txg. */ 74 hrtime_t ts_time; /* Segment creation time. */ 75} trim_seg_t; 76 77extern boolean_t zfs_trim_enabled; 78 79static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */ 80static u_int trim_timeout = 30; /* Keep deleted data up to 30s */ 81static u_int trim_max_interval = 1; /* 1s delays between TRIMs */ 82static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */ 83 84SYSCTL_DECL(_vfs_zfs); 85SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM"); 86 87SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, 88 0, "Delay TRIMs by up to this many TXGs"); 89SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, 90 "Delay TRIMs by up to this many seconds"); 91SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, 92 &trim_max_interval, 0, 93 "Maximum interval between TRIM queue processing (seconds)"); 94 95SYSCTL_DECL(_vfs_zfs_vdev); 96SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, 97 &trim_vdev_max_pending, 0, 98 "Maximum pending TRIM segments for a vdev"); 99 100static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); 101 102static int 103trim_map_seg_compare(const void *x1, const void *x2) 104{ 105 const trim_seg_t *s1 = x1; 106 const trim_seg_t *s2 = x2; 107 108 if (s1->ts_start < s2->ts_start) { 109 if (s1->ts_end > s2->ts_start) 110 return (0); 111 return (-1); 112 } 113 if (s1->ts_start > s2->ts_start) { 114 if (s1->ts_start < s2->ts_end) 115 return (0); 116 return (1); 117 } 118 return (0); 119} 120 121static int 122trim_map_zio_compare(const void *x1, const void *x2) 123{ 124 const zio_t *z1 = x1; 125 const zio_t *z2 = x2; 126 127 if (z1->io_offset < z2->io_offset) { 128 if (z1->io_offset + z1->io_size > z2->io_offset) 129 return (0); 130 return (-1); 131 } 132 if (z1->io_offset > z2->io_offset) { 133 if (z1->io_offset < z2->io_offset + z2->io_size) 134 return (0); 135 return (1); 136 } 137 return (0); 138} 139 140void 141trim_map_create(vdev_t *vd) 142{ 143 trim_map_t *tm; 144 145 ASSERT(zfs_trim_enabled && !vd->vdev_notrim && 146 vd->vdev_ops->vdev_op_leaf); 147 148 tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); 149 mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); 150 list_create(&tm->tm_head, sizeof (trim_seg_t), 151 offsetof(trim_seg_t, ts_next)); 152 list_create(&tm->tm_pending_writes, sizeof (zio_t), 153 offsetof(zio_t, io_trim_link)); 154 avl_create(&tm->tm_queued_frees, trim_map_seg_compare, 155 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 156 avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, 157 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 158 avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, 159 sizeof (zio_t), offsetof(zio_t, io_trim_node)); 160 vd->vdev_trimmap = tm; 161} 162 163void 164trim_map_destroy(vdev_t *vd) 165{ 166 trim_map_t *tm; 167 trim_seg_t *ts; 168 169 ASSERT(vd->vdev_ops->vdev_op_leaf); 170 171 if (!zfs_trim_enabled) 172 return; 173 174 tm = vd->vdev_trimmap; 175 if (tm == NULL) 176 return; 177 178 /* 179 * We may have been called before trim_map_vdev_commit_done() 180 * had a chance to run, so do it now to prune the remaining 181 * inflight frees. 182 */ 183 trim_map_vdev_commit_done(vd->vdev_spa, vd); 184 185 mutex_enter(&tm->tm_lock); 186 while ((ts = list_head(&tm->tm_head)) != NULL) { 187 avl_remove(&tm->tm_queued_frees, ts); 188 TRIM_MAP_REM(tm, ts); 189 kmem_free(ts, sizeof (*ts)); 190 } 191 mutex_exit(&tm->tm_lock); 192 193 avl_destroy(&tm->tm_queued_frees); 194 avl_destroy(&tm->tm_inflight_frees); 195 avl_destroy(&tm->tm_inflight_writes); 196 list_destroy(&tm->tm_pending_writes); 197 list_destroy(&tm->tm_head); 198 mutex_destroy(&tm->tm_lock); 199 kmem_free(tm, sizeof (*tm)); 200 vd->vdev_trimmap = NULL; 201} 202 203static void 204trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 205{ 206 avl_index_t where; 207 trim_seg_t tsearch, *ts_before, *ts_after, *ts; 208 boolean_t merge_before, merge_after; 209 hrtime_t time; 210 211 ASSERT(MUTEX_HELD(&tm->tm_lock)); 212 VERIFY(start < end); 213 214 time = gethrtime(); 215 tsearch.ts_start = start; 216 tsearch.ts_end = end; 217 218 ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); 219 if (ts != NULL) { 220 if (start < ts->ts_start) 221 trim_map_segment_add(tm, start, ts->ts_start, txg); 222 if (end > ts->ts_end) 223 trim_map_segment_add(tm, ts->ts_end, end, txg); 224 return; 225 } 226 227 ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); 228 ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); 229 230 merge_before = (ts_before != NULL && ts_before->ts_end == start); 231 merge_after = (ts_after != NULL && ts_after->ts_start == end); 232 233 if (merge_before && merge_after) { 234 avl_remove(&tm->tm_queued_frees, ts_before); 235 TRIM_MAP_REM(tm, ts_before); 236 TRIM_MAP_REM(tm, ts_after); 237 ts_after->ts_start = ts_before->ts_start; 238 ts_after->ts_txg = txg; 239 ts_after->ts_time = time; 240 TRIM_MAP_ADD(tm, ts_after); 241 kmem_free(ts_before, sizeof (*ts_before)); 242 } else if (merge_before) { 243 TRIM_MAP_REM(tm, ts_before); 244 ts_before->ts_end = end; 245 ts_before->ts_txg = txg; 246 ts_before->ts_time = time; 247 TRIM_MAP_ADD(tm, ts_before); 248 } else if (merge_after) { 249 TRIM_MAP_REM(tm, ts_after); 250 ts_after->ts_start = start; 251 ts_after->ts_txg = txg; 252 ts_after->ts_time = time; 253 TRIM_MAP_ADD(tm, ts_after); 254 } else { 255 ts = kmem_alloc(sizeof (*ts), KM_SLEEP); 256 ts->ts_start = start; 257 ts->ts_end = end; 258 ts->ts_txg = txg; 259 ts->ts_time = time; 260 avl_insert(&tm->tm_queued_frees, ts, where); 261 TRIM_MAP_ADD(tm, ts); 262 } 263} 264 265static void 266trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, 267 uint64_t end) 268{ 269 trim_seg_t *nts; 270 boolean_t left_over, right_over; 271 272 ASSERT(MUTEX_HELD(&tm->tm_lock)); 273 274 left_over = (ts->ts_start < start); 275 right_over = (ts->ts_end > end); 276 277 TRIM_MAP_REM(tm, ts); 278 if (left_over && right_over) { 279 nts = kmem_alloc(sizeof (*nts), KM_SLEEP); 280 nts->ts_start = end; 281 nts->ts_end = ts->ts_end; 282 nts->ts_txg = ts->ts_txg; 283 nts->ts_time = ts->ts_time; 284 ts->ts_end = start; 285 avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); 286 TRIM_MAP_ADD(tm, ts); 287 TRIM_MAP_ADD(tm, nts); 288 } else if (left_over) { 289 ts->ts_end = start; 290 TRIM_MAP_ADD(tm, ts); 291 } else if (right_over) { 292 ts->ts_start = end; 293 TRIM_MAP_ADD(tm, ts); 294 } else { 295 avl_remove(&tm->tm_queued_frees, ts); 296 kmem_free(ts, sizeof (*ts)); 297 } 298} 299 300static void 301trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 302{ 303 zio_t zsearch, *zs; 304 305 ASSERT(MUTEX_HELD(&tm->tm_lock)); 306 307 zsearch.io_offset = start; 308 zsearch.io_size = end - start; 309 310 zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); 311 if (zs == NULL) { 312 trim_map_segment_add(tm, start, end, txg); 313 return; 314 } 315 if (start < zs->io_offset) 316 trim_map_free_locked(tm, start, zs->io_offset, txg); 317 if (zs->io_offset + zs->io_size < end) 318 trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); 319} 320 321void 322trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 323{ 324 trim_map_t *tm = vd->vdev_trimmap; 325 326 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) 327 return; 328 329 mutex_enter(&tm->tm_lock); 330 trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); 331 mutex_exit(&tm->tm_lock); 332} 333 334boolean_t 335trim_map_write_start(zio_t *zio) 336{ 337 vdev_t *vd = zio->io_vd; 338 trim_map_t *tm = vd->vdev_trimmap; 339 trim_seg_t tsearch, *ts; 340 boolean_t left_over, right_over; 341 uint64_t start, end; 342 343 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) 344 return (B_TRUE); 345 346 start = zio->io_offset; 347 end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); 348 tsearch.ts_start = start; 349 tsearch.ts_end = end; 350 351 mutex_enter(&tm->tm_lock); 352 353 /* 354 * Checking for colliding in-flight frees. 355 */ 356 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); 357 if (ts != NULL) { 358 list_insert_tail(&tm->tm_pending_writes, zio); 359 mutex_exit(&tm->tm_lock); 360 return (B_FALSE); 361 } 362 363 /* 364 * Loop until all overlapping segments are removed. 365 */ 366 while ((ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL)) != NULL) { 367 trim_map_segment_remove(tm, ts, start, end); 368 } 369 370 avl_add(&tm->tm_inflight_writes, zio); 371 372 mutex_exit(&tm->tm_lock); 373 374 return (B_TRUE); 375} 376 377void 378trim_map_write_done(zio_t *zio) 379{ 380 vdev_t *vd = zio->io_vd; 381 trim_map_t *tm = vd->vdev_trimmap; 382 383 /* 384 * Don't check for vdev_notrim, since the write could have 385 * started before vdev_notrim was set. 386 */ 387 if (!zfs_trim_enabled || tm == NULL) 388 return; 389 390 mutex_enter(&tm->tm_lock); 391 /* 392 * Don't fail if the write isn't in the tree, since the write 393 * could have started after vdev_notrim was set. 394 */ 395 if (zio->io_trim_node.avl_child[0] || 396 zio->io_trim_node.avl_child[1] || 397 AVL_XPARENT(&zio->io_trim_node) || 398 tm->tm_inflight_writes.avl_root == &zio->io_trim_node) 399 avl_remove(&tm->tm_inflight_writes, zio); 400 mutex_exit(&tm->tm_lock); 401} 402 403/* 404 * Return the oldest segment (the one with the lowest txg / time) or NULL if: 405 * 1. The list is empty 406 * 2. The first element's txg is greater than txgsafe 407 * 3. The first element's txg is not greater than the txg argument and the 408 * the first element's time is not greater than time argument 409 */ 410static trim_seg_t * 411trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time, 412 boolean_t force) 413{ 414 trim_seg_t *ts; 415 416 ASSERT(MUTEX_HELD(&tm->tm_lock)); 417 VERIFY(txgsafe >= txg); 418 419 ts = list_head(&tm->tm_head); 420 if (ts != NULL && ts->ts_txg <= txgsafe && 421 (ts->ts_txg <= txg || ts->ts_time <= time || force)) 422 return (ts); 423 return (NULL); 424} 425 426static void 427trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 428{ 429 trim_map_t *tm = vd->vdev_trimmap; 430 trim_seg_t *ts; 431 uint64_t size, offset, txgtarget, txgsafe; 432 int64_t hard, soft; 433 hrtime_t timelimit; 434 435 ASSERT(vd->vdev_ops->vdev_op_leaf); 436 437 if (tm == NULL) 438 return; 439 440 timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC; 441 if (vd->vdev_isl2cache) { 442 txgsafe = UINT64_MAX; 443 txgtarget = UINT64_MAX; 444 } else { 445 txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); 446 if (txgsafe > trim_txg_delay) 447 txgtarget = txgsafe - trim_txg_delay; 448 else 449 txgtarget = 0; 450 } 451 452 mutex_enter(&tm->tm_lock); 453 hard = 0; 454 if (tm->tm_pending > trim_vdev_max_pending) 455 hard = (tm->tm_pending - trim_vdev_max_pending) / 4; 456 soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64); 457 /* Loop until we have sent all outstanding free's */ 458 while (soft > 0 && 459 (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0)) 460 != NULL) { 461 TRIM_MAP_REM(tm, ts); 462 avl_remove(&tm->tm_queued_frees, ts); 463 avl_add(&tm->tm_inflight_frees, ts); 464 size = ts->ts_end - ts->ts_start; 465 offset = ts->ts_start; 466 /* 467 * We drop the lock while we call zio_nowait as the IO 468 * scheduler can result in a different IO being run e.g. 469 * a write which would result in a recursive lock. 470 */ 471 mutex_exit(&tm->tm_lock); 472 473 zio_nowait(zio_trim(zio, spa, vd, offset, size)); 474 475 soft -= TRIM_MAP_SEGS(size); 476 hard -= TRIM_MAP_SEGS(size); 477 mutex_enter(&tm->tm_lock); 478 } 479 mutex_exit(&tm->tm_lock); 480} 481 482static void 483trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) 484{ 485 trim_map_t *tm = vd->vdev_trimmap; 486 trim_seg_t *ts; 487 list_t pending_writes; 488 zio_t *zio; 489 uint64_t start, size; 490 void *cookie; 491 492 ASSERT(vd->vdev_ops->vdev_op_leaf); 493 494 if (tm == NULL) 495 return; 496 497 mutex_enter(&tm->tm_lock); 498 if (!avl_is_empty(&tm->tm_inflight_frees)) { 499 cookie = NULL; 500 while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, 501 &cookie)) != NULL) { 502 kmem_free(ts, sizeof (*ts)); 503 } 504 } 505 list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, 506 io_trim_link)); 507 list_move_tail(&pending_writes, &tm->tm_pending_writes); 508 mutex_exit(&tm->tm_lock); 509 510 while ((zio = list_remove_head(&pending_writes)) != NULL) { 511 zio_vdev_io_reissue(zio); 512 zio_execute(zio); 513 } 514 list_destroy(&pending_writes); 515} 516 517static void 518trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 519{ 520 int c; 521 522 if (vd == NULL) 523 return; 524 525 if (vd->vdev_ops->vdev_op_leaf) { 526 trim_map_vdev_commit(spa, zio, vd); 527 } else { 528 for (c = 0; c < vd->vdev_children; c++) 529 trim_map_commit(spa, zio, vd->vdev_child[c]); 530 } 531} 532 533static void 534trim_map_commit_done(spa_t *spa, vdev_t *vd) 535{ 536 int c; 537 538 if (vd == NULL) 539 return; 540 541 if (vd->vdev_ops->vdev_op_leaf) { 542 trim_map_vdev_commit_done(spa, vd); 543 } else { 544 for (c = 0; c < vd->vdev_children; c++) 545 trim_map_commit_done(spa, vd->vdev_child[c]); 546 } 547} 548 549static void 550trim_thread(void *arg) 551{ 552 spa_t *spa = arg; 553 zio_t *zio; 554 555#ifdef _KERNEL 556 (void) snprintf(curthread->td_name, sizeof(curthread->td_name), 557 "trim %s", spa_name(spa)); 558#endif 559 560 for (;;) { 561 mutex_enter(&spa->spa_trim_lock); 562 if (spa->spa_trim_thread == NULL) { 563 spa->spa_trim_thread = curthread; 564 cv_signal(&spa->spa_trim_cv); 565 mutex_exit(&spa->spa_trim_lock); 566 thread_exit(); 567 } 568 569 (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, 570 hz * trim_max_interval); 571 mutex_exit(&spa->spa_trim_lock); 572 573 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 574 575 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 576 trim_map_commit(spa, zio, spa->spa_root_vdev); 577 (void) zio_wait(zio); 578 trim_map_commit_done(spa, spa->spa_root_vdev); 579 spa_config_exit(spa, SCL_STATE, FTAG); 580 } 581} 582 583void 584trim_thread_create(spa_t *spa) 585{ 586 587 if (!zfs_trim_enabled) 588 return; 589 590 mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); 591 cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); 592 mutex_enter(&spa->spa_trim_lock); 593 spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, 594 TS_RUN, minclsyspri); 595 mutex_exit(&spa->spa_trim_lock); 596} 597 598void 599trim_thread_destroy(spa_t *spa) 600{ 601 602 if (!zfs_trim_enabled) 603 return; 604 if (spa->spa_trim_thread == NULL) 605 return; 606 607 mutex_enter(&spa->spa_trim_lock); 608 /* Setting spa_trim_thread to NULL tells the thread to stop. */ 609 spa->spa_trim_thread = NULL; 610 cv_signal(&spa->spa_trim_cv); 611 /* The thread will set it back to != NULL on exit. */ 612 while (spa->spa_trim_thread == NULL) 613 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 614 spa->spa_trim_thread = NULL; 615 mutex_exit(&spa->spa_trim_lock); 616 617 cv_destroy(&spa->spa_trim_cv); 618 mutex_destroy(&spa->spa_trim_lock); 619} 620 621void 622trim_thread_wakeup(spa_t *spa) 623{ 624 625 if (!zfs_trim_enabled) 626 return; 627 if (spa->spa_trim_thread == NULL) 628 return; 629 630 mutex_enter(&spa->spa_trim_lock); 631 cv_signal(&spa->spa_trim_cv); 632 mutex_exit(&spa->spa_trim_lock); 633} 634