trim_map.c revision 248572
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. 23 * All rights reserved. 24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/spa_impl.h> 28#include <sys/vdev_impl.h> 29#include <sys/trim_map.h> 30 31/* 32 * Calculate the zio end, upgrading based on ashift which would be 33 * done by zio_vdev_io_start. 34 * 35 * This makes free range consolidation much more effective 36 * than it would otherwise be as well as ensuring that entire 37 * blocks are invalidated by writes. 38 */ 39#define TRIM_ZIO_END(vd, offset, size) (offset + \ 40 P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) 41 42typedef struct trim_map { 43 list_t tm_head; /* List of segments sorted by txg. */ 44 avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ 45 avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ 46 avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ 47 list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ 48 kmutex_t tm_lock; 49} trim_map_t; 50 51typedef struct trim_seg { 52 avl_node_t ts_node; /* AVL node. */ 53 list_node_t ts_next; /* List element. */ 54 uint64_t ts_start; /* Starting offset of this segment. */ 55 uint64_t ts_end; /* Ending offset (non-inclusive). */ 56 uint64_t ts_txg; /* Segment creation txg. */ 57} trim_seg_t; 58 59extern boolean_t zfs_notrim; 60 61SYSCTL_DECL(_vfs_zfs); 62/* Delay TRIMs by that many TXGs. */ 63static int trim_txg_limit = 64; 64TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit); 65SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0, 66 "Delay TRIMs by that many TXGs."); 67 68static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); 69 70static int 71trim_map_seg_compare(const void *x1, const void *x2) 72{ 73 const trim_seg_t *s1 = x1; 74 const trim_seg_t *s2 = x2; 75 76 if (s1->ts_start < s2->ts_start) { 77 if (s1->ts_end > s2->ts_start) 78 return (0); 79 return (-1); 80 } 81 if (s1->ts_start > s2->ts_start) { 82 if (s1->ts_start < s2->ts_end) 83 return (0); 84 return (1); 85 } 86 return (0); 87} 88 89static int 90trim_map_zio_compare(const void *x1, const void *x2) 91{ 92 const zio_t *z1 = x1; 93 const zio_t *z2 = x2; 94 95 if (z1->io_offset < z2->io_offset) { 96 if (z1->io_offset + z1->io_size > z2->io_offset) 97 return (0); 98 return (-1); 99 } 100 if (z1->io_offset > z2->io_offset) { 101 if (z1->io_offset < z2->io_offset + z2->io_size) 102 return (0); 103 return (1); 104 } 105 return (0); 106} 107 108void 109trim_map_create(vdev_t *vd) 110{ 111 trim_map_t *tm; 112 113 ASSERT(vd->vdev_ops->vdev_op_leaf); 114 115 if (zfs_notrim) 116 return; 117 118 tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); 119 mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); 120 list_create(&tm->tm_head, sizeof (trim_seg_t), 121 offsetof(trim_seg_t, ts_next)); 122 list_create(&tm->tm_pending_writes, sizeof (zio_t), 123 offsetof(zio_t, io_trim_link)); 124 avl_create(&tm->tm_queued_frees, trim_map_seg_compare, 125 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 126 avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, 127 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 128 avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, 129 sizeof (zio_t), offsetof(zio_t, io_trim_node)); 130 vd->vdev_trimmap = tm; 131} 132 133void 134trim_map_destroy(vdev_t *vd) 135{ 136 trim_map_t *tm; 137 trim_seg_t *ts; 138 139 ASSERT(vd->vdev_ops->vdev_op_leaf); 140 141 if (zfs_notrim) 142 return; 143 144 tm = vd->vdev_trimmap; 145 if (tm == NULL) 146 return; 147 148 /* 149 * We may have been called before trim_map_vdev_commit_done() 150 * had a chance to run, so do it now to prune the remaining 151 * inflight frees. 152 */ 153 trim_map_vdev_commit_done(vd->vdev_spa, vd); 154 155 mutex_enter(&tm->tm_lock); 156 while ((ts = list_head(&tm->tm_head)) != NULL) { 157 avl_remove(&tm->tm_queued_frees, ts); 158 list_remove(&tm->tm_head, ts); 159 kmem_free(ts, sizeof (*ts)); 160 } 161 mutex_exit(&tm->tm_lock); 162 163 avl_destroy(&tm->tm_queued_frees); 164 avl_destroy(&tm->tm_inflight_frees); 165 avl_destroy(&tm->tm_inflight_writes); 166 list_destroy(&tm->tm_pending_writes); 167 list_destroy(&tm->tm_head); 168 mutex_destroy(&tm->tm_lock); 169 kmem_free(tm, sizeof (*tm)); 170 vd->vdev_trimmap = NULL; 171} 172 173static void 174trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 175{ 176 avl_index_t where; 177 trim_seg_t tsearch, *ts_before, *ts_after, *ts; 178 boolean_t merge_before, merge_after; 179 180 ASSERT(MUTEX_HELD(&tm->tm_lock)); 181 VERIFY(start < end); 182 183 tsearch.ts_start = start; 184 tsearch.ts_end = end; 185 186 ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); 187 if (ts != NULL) { 188 if (start < ts->ts_start) 189 trim_map_segment_add(tm, start, ts->ts_start, txg); 190 if (end > ts->ts_end) 191 trim_map_segment_add(tm, ts->ts_end, end, txg); 192 return; 193 } 194 195 ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); 196 ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); 197 198 merge_before = (ts_before != NULL && ts_before->ts_end == start && 199 ts_before->ts_txg == txg); 200 merge_after = (ts_after != NULL && ts_after->ts_start == end && 201 ts_after->ts_txg == txg); 202 203 if (merge_before && merge_after) { 204 avl_remove(&tm->tm_queued_frees, ts_before); 205 list_remove(&tm->tm_head, ts_before); 206 ts_after->ts_start = ts_before->ts_start; 207 kmem_free(ts_before, sizeof (*ts_before)); 208 } else if (merge_before) { 209 ts_before->ts_end = end; 210 } else if (merge_after) { 211 ts_after->ts_start = start; 212 } else { 213 ts = kmem_alloc(sizeof (*ts), KM_SLEEP); 214 ts->ts_start = start; 215 ts->ts_end = end; 216 ts->ts_txg = txg; 217 avl_insert(&tm->tm_queued_frees, ts, where); 218 list_insert_tail(&tm->tm_head, ts); 219 } 220} 221 222static void 223trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, 224 uint64_t end) 225{ 226 trim_seg_t *nts; 227 boolean_t left_over, right_over; 228 229 ASSERT(MUTEX_HELD(&tm->tm_lock)); 230 231 left_over = (ts->ts_start < start); 232 right_over = (ts->ts_end > end); 233 234 if (left_over && right_over) { 235 nts = kmem_alloc(sizeof (*nts), KM_SLEEP); 236 nts->ts_start = end; 237 nts->ts_end = ts->ts_end; 238 nts->ts_txg = ts->ts_txg; 239 ts->ts_end = start; 240 avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); 241 list_insert_after(&tm->tm_head, ts, nts); 242 } else if (left_over) { 243 ts->ts_end = start; 244 } else if (right_over) { 245 ts->ts_start = end; 246 } else { 247 avl_remove(&tm->tm_queued_frees, ts); 248 list_remove(&tm->tm_head, ts); 249 kmem_free(ts, sizeof (*ts)); 250 } 251} 252 253static void 254trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 255{ 256 zio_t zsearch, *zs; 257 258 ASSERT(MUTEX_HELD(&tm->tm_lock)); 259 260 zsearch.io_offset = start; 261 zsearch.io_size = end - start; 262 263 zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); 264 if (zs == NULL) { 265 trim_map_segment_add(tm, start, end, txg); 266 return; 267 } 268 if (start < zs->io_offset) 269 trim_map_free_locked(tm, start, zs->io_offset, txg); 270 if (zs->io_offset + zs->io_size < end) 271 trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); 272} 273 274void 275trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size) 276{ 277 trim_map_t *tm = vd->vdev_trimmap; 278 279 if (zfs_notrim || vd->vdev_notrim || tm == NULL) 280 return; 281 282 mutex_enter(&tm->tm_lock); 283 trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), 284 vd->vdev_spa->spa_syncing_txg); 285 mutex_exit(&tm->tm_lock); 286} 287 288boolean_t 289trim_map_write_start(zio_t *zio) 290{ 291 vdev_t *vd = zio->io_vd; 292 trim_map_t *tm = vd->vdev_trimmap; 293 trim_seg_t tsearch, *ts; 294 boolean_t left_over, right_over; 295 uint64_t start, end; 296 297 if (zfs_notrim || vd->vdev_notrim || tm == NULL) 298 return (B_TRUE); 299 300 start = zio->io_offset; 301 end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); 302 tsearch.ts_start = start; 303 tsearch.ts_end = end; 304 305 mutex_enter(&tm->tm_lock); 306 307 /* 308 * Checking for colliding in-flight frees. 309 */ 310 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); 311 if (ts != NULL) { 312 list_insert_tail(&tm->tm_pending_writes, zio); 313 mutex_exit(&tm->tm_lock); 314 return (B_FALSE); 315 } 316 317 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 318 if (ts != NULL) { 319 /* 320 * Loop until all overlapping segments are removed. 321 */ 322 do { 323 trim_map_segment_remove(tm, ts, start, end); 324 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 325 } while (ts != NULL); 326 } 327 avl_add(&tm->tm_inflight_writes, zio); 328 329 mutex_exit(&tm->tm_lock); 330 331 return (B_TRUE); 332} 333 334void 335trim_map_write_done(zio_t *zio) 336{ 337 vdev_t *vd = zio->io_vd; 338 trim_map_t *tm = vd->vdev_trimmap; 339 340 /* 341 * Don't check for vdev_notrim, since the write could have 342 * started before vdev_notrim was set. 343 */ 344 if (zfs_notrim || tm == NULL) 345 return; 346 347 mutex_enter(&tm->tm_lock); 348 /* 349 * Don't fail if the write isn't in the tree, since the write 350 * could have started after vdev_notrim was set. 351 */ 352 if (zio->io_trim_node.avl_child[0] || 353 zio->io_trim_node.avl_child[1] || 354 AVL_XPARENT(&zio->io_trim_node) || 355 tm->tm_inflight_writes.avl_root == &zio->io_trim_node) 356 avl_remove(&tm->tm_inflight_writes, zio); 357 mutex_exit(&tm->tm_lock); 358} 359 360/* 361 * Return the oldest segment (the one with the lowest txg) or false if 362 * the list is empty or the first element's txg is greater than txg given 363 * as function argument. 364 */ 365static trim_seg_t * 366trim_map_first(trim_map_t *tm, uint64_t txg) 367{ 368 trim_seg_t *ts; 369 370 ASSERT(MUTEX_HELD(&tm->tm_lock)); 371 372 ts = list_head(&tm->tm_head); 373 if (ts != NULL && ts->ts_txg <= txg) 374 return (ts); 375 return (NULL); 376} 377 378static void 379trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 380{ 381 trim_map_t *tm = vd->vdev_trimmap; 382 trim_seg_t *ts; 383 uint64_t start, size, txglimit; 384 385 ASSERT(vd->vdev_ops->vdev_op_leaf); 386 387 if (tm == NULL) 388 return; 389 390 txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) - 391 trim_txg_limit; 392 393 mutex_enter(&tm->tm_lock); 394 /* 395 * Loop until we send all frees up to the txglimit. 396 */ 397 while ((ts = trim_map_first(tm, txglimit)) != NULL) { 398 list_remove(&tm->tm_head, ts); 399 avl_remove(&tm->tm_queued_frees, ts); 400 avl_add(&tm->tm_inflight_frees, ts); 401 zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, 402 ts->ts_end - ts->ts_start)); 403 } 404 mutex_exit(&tm->tm_lock); 405} 406 407static void 408trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) 409{ 410 trim_map_t *tm = vd->vdev_trimmap; 411 trim_seg_t *ts; 412 list_t pending_writes; 413 zio_t *zio; 414 uint64_t start, size; 415 void *cookie; 416 417 ASSERT(vd->vdev_ops->vdev_op_leaf); 418 419 if (tm == NULL) 420 return; 421 422 mutex_enter(&tm->tm_lock); 423 if (!avl_is_empty(&tm->tm_inflight_frees)) { 424 cookie = NULL; 425 while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, 426 &cookie)) != NULL) { 427 kmem_free(ts, sizeof (*ts)); 428 } 429 } 430 list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, 431 io_trim_link)); 432 list_move_tail(&pending_writes, &tm->tm_pending_writes); 433 mutex_exit(&tm->tm_lock); 434 435 while ((zio = list_remove_head(&pending_writes)) != NULL) { 436 zio_vdev_io_reissue(zio); 437 zio_execute(zio); 438 } 439 list_destroy(&pending_writes); 440} 441 442static void 443trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 444{ 445 int c; 446 447 if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit) 448 return; 449 450 if (vd->vdev_ops->vdev_op_leaf) { 451 trim_map_vdev_commit(spa, zio, vd); 452 } else { 453 for (c = 0; c < vd->vdev_children; c++) 454 trim_map_commit(spa, zio, vd->vdev_child[c]); 455 } 456} 457 458static void 459trim_map_commit_done(spa_t *spa, vdev_t *vd) 460{ 461 int c; 462 463 if (vd == NULL) 464 return; 465 466 if (vd->vdev_ops->vdev_op_leaf) { 467 trim_map_vdev_commit_done(spa, vd); 468 } else { 469 for (c = 0; c < vd->vdev_children; c++) 470 trim_map_commit_done(spa, vd->vdev_child[c]); 471 } 472} 473 474static void 475trim_thread(void *arg) 476{ 477 spa_t *spa = arg; 478 zio_t *zio; 479 480 for (;;) { 481 mutex_enter(&spa->spa_trim_lock); 482 if (spa->spa_trim_thread == NULL) { 483 spa->spa_trim_thread = curthread; 484 cv_signal(&spa->spa_trim_cv); 485 mutex_exit(&spa->spa_trim_lock); 486 thread_exit(); 487 } 488 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 489 mutex_exit(&spa->spa_trim_lock); 490 491 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 492 493 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 494 trim_map_commit(spa, zio, spa->spa_root_vdev); 495 (void) zio_wait(zio); 496 trim_map_commit_done(spa, spa->spa_root_vdev); 497 spa_config_exit(spa, SCL_STATE, FTAG); 498 } 499} 500 501void 502trim_thread_create(spa_t *spa) 503{ 504 505 if (zfs_notrim) 506 return; 507 508 mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); 509 cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); 510 mutex_enter(&spa->spa_trim_lock); 511 spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, 512 TS_RUN, minclsyspri); 513 mutex_exit(&spa->spa_trim_lock); 514} 515 516void 517trim_thread_destroy(spa_t *spa) 518{ 519 520 if (zfs_notrim) 521 return; 522 if (spa->spa_trim_thread == NULL) 523 return; 524 525 mutex_enter(&spa->spa_trim_lock); 526 /* Setting spa_trim_thread to NULL tells the thread to stop. */ 527 spa->spa_trim_thread = NULL; 528 cv_signal(&spa->spa_trim_cv); 529 /* The thread will set it back to != NULL on exit. */ 530 while (spa->spa_trim_thread == NULL) 531 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 532 spa->spa_trim_thread = NULL; 533 mutex_exit(&spa->spa_trim_lock); 534 535 cv_destroy(&spa->spa_trim_cv); 536 mutex_destroy(&spa->spa_trim_lock); 537} 538 539void 540trim_thread_wakeup(spa_t *spa) 541{ 542 543 if (zfs_notrim) 544 return; 545 if (spa->spa_trim_thread == NULL) 546 return; 547 548 mutex_enter(&spa->spa_trim_lock); 549 cv_signal(&spa->spa_trim_cv); 550 mutex_exit(&spa->spa_trim_lock); 551} 552