1240868Spjd/* 2240868Spjd * CDDL HEADER START 3240868Spjd * 4240868Spjd * The contents of this file are subject to the terms of the 5240868Spjd * Common Development and Distribution License (the "License"). 6240868Spjd * You may not use this file except in compliance with the License. 7240868Spjd * 8240868Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9240868Spjd * or http://www.opensolaris.org/os/licensing. 10240868Spjd * See the License for the specific language governing permissions 11240868Spjd * and limitations under the License. 12240868Spjd * 13240868Spjd * When distributing Covered Code, include this CDDL HEADER in each 14240868Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15240868Spjd * If applicable, add the following below this CDDL HEADER, with the 16240868Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17240868Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18240868Spjd * 19240868Spjd * CDDL HEADER END 20240868Spjd */ 21240868Spjd/* 22240868Spjd * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. 23240868Spjd * All rights reserved. 24240868Spjd */ 25240868Spjd 26240868Spjd#include <sys/zfs_context.h> 27240868Spjd#include <sys/spa_impl.h> 28240868Spjd#include <sys/vdev_impl.h> 29240868Spjd#include <sys/trim_map.h> 30248575Ssmh#include <sys/time.h> 31240868Spjd 32244187Ssmh/* 33244187Ssmh * Calculate the zio end, upgrading based on ashift which would be 34244187Ssmh * done by zio_vdev_io_start. 35244187Ssmh * 36244187Ssmh * This makes free range consolidation much more effective 37244187Ssmh * than it would otherwise be as well as ensuring that entire 38244187Ssmh * blocks are invalidated by writes. 39244187Ssmh */ 40248572Ssmh#define TRIM_ZIO_END(vd, offset, size) (offset + \ 41248572Ssmh P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) 42244187Ssmh 43277818Smav/* Maximal segment size for ATA TRIM. */ 44277818Smav#define TRIM_MAP_SIZE_FACTOR (512 << 16) 45248577Ssmh 46277818Smav#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR) 47248577Ssmh 48277818Smav#define TRIM_MAP_ADD(tm, ts) do { \ 49277818Smav list_insert_tail(&(tm)->tm_head, (ts)); \ 50277818Smav (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ 51277818Smav} while (0) 52248577Ssmh 53277818Smav#define TRIM_MAP_REM(tm, ts) do { \ 54277818Smav list_remove(&(tm)->tm_head, (ts)); \ 55277818Smav (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ 56277818Smav} while (0) 57248577Ssmh 58240868Spjdtypedef struct trim_map { 59240868Spjd list_t tm_head; /* List of segments sorted by txg. */ 60240868Spjd avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ 61240868Spjd avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ 62240868Spjd avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ 63240868Spjd list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ 64240868Spjd kmutex_t tm_lock; 65248577Ssmh uint64_t tm_pending; /* Count of pending TRIMs. */ 66240868Spjd} trim_map_t; 67240868Spjd 68240868Spjdtypedef struct trim_seg { 69240868Spjd avl_node_t ts_node; /* AVL node. */ 70240868Spjd list_node_t ts_next; /* List element. */ 71240868Spjd uint64_t ts_start; /* Starting offset of this segment. */ 72240868Spjd uint64_t ts_end; /* Ending offset (non-inclusive). */ 73240868Spjd uint64_t ts_txg; /* Segment creation txg. */ 74248575Ssmh hrtime_t ts_time; /* Segment creation time. */ 75240868Spjd} trim_seg_t; 76240868Spjd 77249921Ssmhextern boolean_t zfs_trim_enabled; 78240868Spjd 79277818Smavstatic u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */ 80277818Smavstatic u_int trim_timeout = 30; /* Keep deleted data up to 30s */ 81277818Smavstatic u_int trim_max_interval = 1; /* 1s delays between TRIMs */ 82277818Smavstatic u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */ 83248577Ssmh 84240868SpjdSYSCTL_DECL(_vfs_zfs); 85248577SsmhSYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM"); 86240868Spjd 87248577SsmhTUNABLE_INT("vfs.zfs.trim.txg_delay", &trim_txg_delay); 88248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, 89248577Ssmh 0, "Delay TRIMs by up to this many TXGs"); 90248575Ssmh 91248577SsmhTUNABLE_INT("vfs.zfs.trim.timeout", &trim_timeout); 92248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, 93248577Ssmh "Delay TRIMs by up to this many seconds"); 94248577Ssmh 95248577SsmhTUNABLE_INT("vfs.zfs.trim.max_interval", &trim_max_interval); 96248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, 97248577Ssmh &trim_max_interval, 0, 98248577Ssmh "Maximum interval between TRIM queue processing (seconds)"); 99248577Ssmh 100248577SsmhSYSCTL_DECL(_vfs_zfs_vdev); 101248577SsmhTUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending); 102248577SsmhSYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, 103248577Ssmh &trim_vdev_max_pending, 0, 104248577Ssmh "Maximum pending TRIM segments for a vdev"); 105248577Ssmh 106248577Ssmh 107240868Spjdstatic void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); 108240868Spjd 109240868Spjdstatic int 110240868Spjdtrim_map_seg_compare(const void *x1, const void *x2) 111240868Spjd{ 112240868Spjd const trim_seg_t *s1 = x1; 113240868Spjd const trim_seg_t *s2 = x2; 114240868Spjd 115240868Spjd if (s1->ts_start < s2->ts_start) { 116240868Spjd if (s1->ts_end > s2->ts_start) 117240868Spjd return (0); 118240868Spjd return (-1); 119240868Spjd } 120240868Spjd if (s1->ts_start > s2->ts_start) { 121240868Spjd if (s1->ts_start < s2->ts_end) 122240868Spjd return (0); 123240868Spjd return (1); 124240868Spjd } 125240868Spjd return (0); 126240868Spjd} 127240868Spjd 128240868Spjdstatic int 129240868Spjdtrim_map_zio_compare(const void *x1, const void *x2) 130240868Spjd{ 131240868Spjd const zio_t *z1 = x1; 132240868Spjd const zio_t *z2 = x2; 133240868Spjd 134240868Spjd if (z1->io_offset < z2->io_offset) { 135240868Spjd if (z1->io_offset + z1->io_size > z2->io_offset) 136240868Spjd return (0); 137240868Spjd return (-1); 138240868Spjd } 139240868Spjd if (z1->io_offset > z2->io_offset) { 140240868Spjd if (z1->io_offset < z2->io_offset + z2->io_size) 141240868Spjd return (0); 142240868Spjd return (1); 143240868Spjd } 144240868Spjd return (0); 145240868Spjd} 146240868Spjd 147240868Spjdvoid 148240868Spjdtrim_map_create(vdev_t *vd) 149240868Spjd{ 150240868Spjd trim_map_t *tm; 151240868Spjd 152274800Ssmh ASSERT(zfs_trim_enabled && !vd->vdev_notrim && 153274800Ssmh vd->vdev_ops->vdev_op_leaf); 154240868Spjd 155240868Spjd tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); 156240868Spjd mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); 157240868Spjd list_create(&tm->tm_head, sizeof (trim_seg_t), 158240868Spjd offsetof(trim_seg_t, ts_next)); 159240868Spjd list_create(&tm->tm_pending_writes, sizeof (zio_t), 160240868Spjd offsetof(zio_t, io_trim_link)); 161240868Spjd avl_create(&tm->tm_queued_frees, trim_map_seg_compare, 162240868Spjd sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 163240868Spjd avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, 164240868Spjd sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); 165240868Spjd avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, 166240868Spjd sizeof (zio_t), offsetof(zio_t, io_trim_node)); 167240868Spjd vd->vdev_trimmap = tm; 168240868Spjd} 169240868Spjd 170240868Spjdvoid 171240868Spjdtrim_map_destroy(vdev_t *vd) 172240868Spjd{ 173240868Spjd trim_map_t *tm; 174240868Spjd trim_seg_t *ts; 175240868Spjd 176240868Spjd ASSERT(vd->vdev_ops->vdev_op_leaf); 177240868Spjd 178249921Ssmh if (!zfs_trim_enabled) 179240868Spjd return; 180240868Spjd 181240868Spjd tm = vd->vdev_trimmap; 182240868Spjd if (tm == NULL) 183240868Spjd return; 184240868Spjd 185240868Spjd /* 186240868Spjd * We may have been called before trim_map_vdev_commit_done() 187240868Spjd * had a chance to run, so do it now to prune the remaining 188240868Spjd * inflight frees. 189240868Spjd */ 190240868Spjd trim_map_vdev_commit_done(vd->vdev_spa, vd); 191240868Spjd 192240868Spjd mutex_enter(&tm->tm_lock); 193240868Spjd while ((ts = list_head(&tm->tm_head)) != NULL) { 194240868Spjd avl_remove(&tm->tm_queued_frees, ts); 195277818Smav TRIM_MAP_REM(tm, ts); 196240868Spjd kmem_free(ts, sizeof (*ts)); 197240868Spjd } 198240868Spjd mutex_exit(&tm->tm_lock); 199240868Spjd 200240868Spjd avl_destroy(&tm->tm_queued_frees); 201240868Spjd avl_destroy(&tm->tm_inflight_frees); 202240868Spjd avl_destroy(&tm->tm_inflight_writes); 203240868Spjd list_destroy(&tm->tm_pending_writes); 204240868Spjd list_destroy(&tm->tm_head); 205240868Spjd mutex_destroy(&tm->tm_lock); 206240868Spjd kmem_free(tm, sizeof (*tm)); 207240868Spjd vd->vdev_trimmap = NULL; 208240868Spjd} 209240868Spjd 210240868Spjdstatic void 211240868Spjdtrim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 212240868Spjd{ 213240868Spjd avl_index_t where; 214240868Spjd trim_seg_t tsearch, *ts_before, *ts_after, *ts; 215240868Spjd boolean_t merge_before, merge_after; 216248575Ssmh hrtime_t time; 217240868Spjd 218240868Spjd ASSERT(MUTEX_HELD(&tm->tm_lock)); 219240868Spjd VERIFY(start < end); 220240868Spjd 221248575Ssmh time = gethrtime(); 222240868Spjd tsearch.ts_start = start; 223240868Spjd tsearch.ts_end = end; 224240868Spjd 225240868Spjd ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); 226240868Spjd if (ts != NULL) { 227240868Spjd if (start < ts->ts_start) 228240868Spjd trim_map_segment_add(tm, start, ts->ts_start, txg); 229240868Spjd if (end > ts->ts_end) 230240868Spjd trim_map_segment_add(tm, ts->ts_end, end, txg); 231240868Spjd return; 232240868Spjd } 233240868Spjd 234240868Spjd ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); 235240868Spjd ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); 236240868Spjd 237248577Ssmh merge_before = (ts_before != NULL && ts_before->ts_end == start); 238248577Ssmh merge_after = (ts_after != NULL && ts_after->ts_start == end); 239240868Spjd 240240868Spjd if (merge_before && merge_after) { 241240868Spjd avl_remove(&tm->tm_queued_frees, ts_before); 242277818Smav TRIM_MAP_REM(tm, ts_before); 243277818Smav TRIM_MAP_REM(tm, ts_after); 244240868Spjd ts_after->ts_start = ts_before->ts_start; 245248577Ssmh ts_after->ts_txg = txg; 246248577Ssmh ts_after->ts_time = time; 247277818Smav TRIM_MAP_ADD(tm, ts_after); 248240868Spjd kmem_free(ts_before, sizeof (*ts_before)); 249240868Spjd } else if (merge_before) { 250277818Smav TRIM_MAP_REM(tm, ts_before); 251240868Spjd ts_before->ts_end = end; 252248577Ssmh ts_before->ts_txg = txg; 253248577Ssmh ts_before->ts_time = time; 254277818Smav TRIM_MAP_ADD(tm, ts_before); 255240868Spjd } else if (merge_after) { 256277818Smav TRIM_MAP_REM(tm, ts_after); 257240868Spjd ts_after->ts_start = start; 258248577Ssmh ts_after->ts_txg = txg; 259248577Ssmh ts_after->ts_time = time; 260277818Smav TRIM_MAP_ADD(tm, ts_after); 261240868Spjd } else { 262240868Spjd ts = kmem_alloc(sizeof (*ts), KM_SLEEP); 263240868Spjd ts->ts_start = start; 264240868Spjd ts->ts_end = end; 265240868Spjd ts->ts_txg = txg; 266248575Ssmh ts->ts_time = time; 267240868Spjd avl_insert(&tm->tm_queued_frees, ts, where); 268277818Smav TRIM_MAP_ADD(tm, ts); 269240868Spjd } 270240868Spjd} 271240868Spjd 272240868Spjdstatic void 273240868Spjdtrim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, 274240868Spjd uint64_t end) 275240868Spjd{ 276240868Spjd trim_seg_t *nts; 277240868Spjd boolean_t left_over, right_over; 278240868Spjd 279240868Spjd ASSERT(MUTEX_HELD(&tm->tm_lock)); 280240868Spjd 281240868Spjd left_over = (ts->ts_start < start); 282240868Spjd right_over = (ts->ts_end > end); 283240868Spjd 284277818Smav TRIM_MAP_REM(tm, ts); 285240868Spjd if (left_over && right_over) { 286240868Spjd nts = kmem_alloc(sizeof (*nts), KM_SLEEP); 287240868Spjd nts->ts_start = end; 288240868Spjd nts->ts_end = ts->ts_end; 289240868Spjd nts->ts_txg = ts->ts_txg; 290248575Ssmh nts->ts_time = ts->ts_time; 291240868Spjd ts->ts_end = start; 292240868Spjd avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); 293277818Smav TRIM_MAP_ADD(tm, ts); 294277818Smav TRIM_MAP_ADD(tm, nts); 295240868Spjd } else if (left_over) { 296240868Spjd ts->ts_end = start; 297277818Smav TRIM_MAP_ADD(tm, ts); 298240868Spjd } else if (right_over) { 299240868Spjd ts->ts_start = end; 300277818Smav TRIM_MAP_ADD(tm, ts); 301240868Spjd } else { 302240868Spjd avl_remove(&tm->tm_queued_frees, ts); 303240868Spjd kmem_free(ts, sizeof (*ts)); 304240868Spjd } 305240868Spjd} 306240868Spjd 307240868Spjdstatic void 308240868Spjdtrim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) 309240868Spjd{ 310240868Spjd zio_t zsearch, *zs; 311240868Spjd 312240868Spjd ASSERT(MUTEX_HELD(&tm->tm_lock)); 313240868Spjd 314240868Spjd zsearch.io_offset = start; 315240868Spjd zsearch.io_size = end - start; 316240868Spjd 317240868Spjd zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); 318240868Spjd if (zs == NULL) { 319240868Spjd trim_map_segment_add(tm, start, end, txg); 320240868Spjd return; 321240868Spjd } 322240868Spjd if (start < zs->io_offset) 323240868Spjd trim_map_free_locked(tm, start, zs->io_offset, txg); 324240868Spjd if (zs->io_offset + zs->io_size < end) 325240868Spjd trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); 326240868Spjd} 327240868Spjd 328240868Spjdvoid 329248574Ssmhtrim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 330240868Spjd{ 331240868Spjd trim_map_t *tm = vd->vdev_trimmap; 332240868Spjd 333249921Ssmh if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) 334240868Spjd return; 335240868Spjd 336240868Spjd mutex_enter(&tm->tm_lock); 337248574Ssmh trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); 338240868Spjd mutex_exit(&tm->tm_lock); 339240868Spjd} 340240868Spjd 341240868Spjdboolean_t 342240868Spjdtrim_map_write_start(zio_t *zio) 343240868Spjd{ 344240868Spjd vdev_t *vd = zio->io_vd; 345240868Spjd trim_map_t *tm = vd->vdev_trimmap; 346240868Spjd trim_seg_t tsearch, *ts; 347240868Spjd boolean_t left_over, right_over; 348240868Spjd uint64_t start, end; 349240868Spjd 350249921Ssmh if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) 351240868Spjd return (B_TRUE); 352240868Spjd 353240868Spjd start = zio->io_offset; 354248572Ssmh end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); 355240868Spjd tsearch.ts_start = start; 356240868Spjd tsearch.ts_end = end; 357240868Spjd 358240868Spjd mutex_enter(&tm->tm_lock); 359240868Spjd 360240868Spjd /* 361240868Spjd * Checking for colliding in-flight frees. 362240868Spjd */ 363240868Spjd ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); 364240868Spjd if (ts != NULL) { 365240868Spjd list_insert_tail(&tm->tm_pending_writes, zio); 366240868Spjd mutex_exit(&tm->tm_lock); 367240868Spjd return (B_FALSE); 368240868Spjd } 369240868Spjd 370240868Spjd ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 371240868Spjd if (ts != NULL) { 372240868Spjd /* 373240868Spjd * Loop until all overlapping segments are removed. 374240868Spjd */ 375240868Spjd do { 376240868Spjd trim_map_segment_remove(tm, ts, start, end); 377240868Spjd ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); 378240868Spjd } while (ts != NULL); 379240868Spjd } 380240868Spjd avl_add(&tm->tm_inflight_writes, zio); 381240868Spjd 382240868Spjd mutex_exit(&tm->tm_lock); 383240868Spjd 384240868Spjd return (B_TRUE); 385240868Spjd} 386240868Spjd 387240868Spjdvoid 388240868Spjdtrim_map_write_done(zio_t *zio) 389240868Spjd{ 390240868Spjd vdev_t *vd = zio->io_vd; 391240868Spjd trim_map_t *tm = vd->vdev_trimmap; 392240868Spjd 393240868Spjd /* 394240868Spjd * Don't check for vdev_notrim, since the write could have 395240868Spjd * started before vdev_notrim was set. 396240868Spjd */ 397249921Ssmh if (!zfs_trim_enabled || tm == NULL) 398240868Spjd return; 399240868Spjd 400240868Spjd mutex_enter(&tm->tm_lock); 401240868Spjd /* 402240868Spjd * Don't fail if the write isn't in the tree, since the write 403240868Spjd * could have started after vdev_notrim was set. 404240868Spjd */ 405240868Spjd if (zio->io_trim_node.avl_child[0] || 406240868Spjd zio->io_trim_node.avl_child[1] || 407240868Spjd AVL_XPARENT(&zio->io_trim_node) || 408240868Spjd tm->tm_inflight_writes.avl_root == &zio->io_trim_node) 409240868Spjd avl_remove(&tm->tm_inflight_writes, zio); 410240868Spjd mutex_exit(&tm->tm_lock); 411240868Spjd} 412240868Spjd 413240868Spjd/* 414248577Ssmh * Return the oldest segment (the one with the lowest txg / time) or NULL if: 415248577Ssmh * 1. The list is empty 416248577Ssmh * 2. The first element's txg is greater than txgsafe 417248577Ssmh * 3. The first element's txg is not greater than the txg argument and the 418248577Ssmh * the first element's time is not greater than time argument 419240868Spjd */ 420240868Spjdstatic trim_seg_t * 421277818Smavtrim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time, 422277818Smav boolean_t force) 423240868Spjd{ 424240868Spjd trim_seg_t *ts; 425240868Spjd 426240868Spjd ASSERT(MUTEX_HELD(&tm->tm_lock)); 427248577Ssmh VERIFY(txgsafe >= txg); 428240868Spjd 429240868Spjd ts = list_head(&tm->tm_head); 430248577Ssmh if (ts != NULL && ts->ts_txg <= txgsafe && 431277818Smav (ts->ts_txg <= txg || ts->ts_time <= time || force)) 432240868Spjd return (ts); 433240868Spjd return (NULL); 434240868Spjd} 435240868Spjd 436240868Spjdstatic void 437240868Spjdtrim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 438240868Spjd{ 439240868Spjd trim_map_t *tm = vd->vdev_trimmap; 440240868Spjd trim_seg_t *ts; 441270312Ssmh uint64_t size, offset, txgtarget, txgsafe; 442277818Smav int64_t hard, soft; 443248575Ssmh hrtime_t timelimit; 444240868Spjd 445240868Spjd ASSERT(vd->vdev_ops->vdev_op_leaf); 446240868Spjd 447240868Spjd if (tm == NULL) 448240868Spjd return; 449240868Spjd 450277819Smav timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC; 451248575Ssmh if (vd->vdev_isl2cache) { 452248577Ssmh txgsafe = UINT64_MAX; 453248577Ssmh txgtarget = UINT64_MAX; 454248575Ssmh } else { 455248577Ssmh txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); 456248577Ssmh if (txgsafe > trim_txg_delay) 457248577Ssmh txgtarget = txgsafe - trim_txg_delay; 458248577Ssmh else 459248577Ssmh txgtarget = 0; 460248575Ssmh } 461240868Spjd 462240868Spjd mutex_enter(&tm->tm_lock); 463277818Smav hard = 0; 464277818Smav if (tm->tm_pending > trim_vdev_max_pending) 465277818Smav hard = (tm->tm_pending - trim_vdev_max_pending) / 4; 466277818Smav soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64); 467248577Ssmh /* Loop until we have sent all outstanding free's */ 468277818Smav while (soft > 0 && 469277818Smav (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0)) 470248577Ssmh != NULL) { 471277818Smav TRIM_MAP_REM(tm, ts); 472240868Spjd avl_remove(&tm->tm_queued_frees, ts); 473240868Spjd avl_add(&tm->tm_inflight_frees, ts); 474248577Ssmh size = ts->ts_end - ts->ts_start; 475270312Ssmh offset = ts->ts_start; 476270312Ssmh /* 477270312Ssmh * We drop the lock while we call zio_nowait as the IO 478270312Ssmh * scheduler can result in a different IO being run e.g. 479270312Ssmh * a write which would result in a recursive lock. 480270312Ssmh */ 481270312Ssmh mutex_exit(&tm->tm_lock); 482270312Ssmh 483270312Ssmh zio_nowait(zio_trim(zio, spa, vd, offset, size)); 484270312Ssmh 485277818Smav soft -= TRIM_MAP_SEGS(size); 486277818Smav hard -= TRIM_MAP_SEGS(size); 487270312Ssmh mutex_enter(&tm->tm_lock); 488240868Spjd } 489240868Spjd mutex_exit(&tm->tm_lock); 490240868Spjd} 491240868Spjd 492240868Spjdstatic void 493240868Spjdtrim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) 494240868Spjd{ 495240868Spjd trim_map_t *tm = vd->vdev_trimmap; 496240868Spjd trim_seg_t *ts; 497240868Spjd list_t pending_writes; 498240868Spjd zio_t *zio; 499240868Spjd uint64_t start, size; 500240868Spjd void *cookie; 501240868Spjd 502240868Spjd ASSERT(vd->vdev_ops->vdev_op_leaf); 503240868Spjd 504240868Spjd if (tm == NULL) 505240868Spjd return; 506240868Spjd 507240868Spjd mutex_enter(&tm->tm_lock); 508240868Spjd if (!avl_is_empty(&tm->tm_inflight_frees)) { 509240868Spjd cookie = NULL; 510240868Spjd while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, 511240868Spjd &cookie)) != NULL) { 512240868Spjd kmem_free(ts, sizeof (*ts)); 513240868Spjd } 514240868Spjd } 515240868Spjd list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, 516240868Spjd io_trim_link)); 517240868Spjd list_move_tail(&pending_writes, &tm->tm_pending_writes); 518240868Spjd mutex_exit(&tm->tm_lock); 519240868Spjd 520240868Spjd while ((zio = list_remove_head(&pending_writes)) != NULL) { 521240868Spjd zio_vdev_io_reissue(zio); 522240868Spjd zio_execute(zio); 523240868Spjd } 524240868Spjd list_destroy(&pending_writes); 525240868Spjd} 526240868Spjd 527240868Spjdstatic void 528240868Spjdtrim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) 529240868Spjd{ 530240868Spjd int c; 531240868Spjd 532248577Ssmh if (vd == NULL) 533240868Spjd return; 534240868Spjd 535240868Spjd if (vd->vdev_ops->vdev_op_leaf) { 536240868Spjd trim_map_vdev_commit(spa, zio, vd); 537240868Spjd } else { 538240868Spjd for (c = 0; c < vd->vdev_children; c++) 539240868Spjd trim_map_commit(spa, zio, vd->vdev_child[c]); 540240868Spjd } 541240868Spjd} 542240868Spjd 543240868Spjdstatic void 544240868Spjdtrim_map_commit_done(spa_t *spa, vdev_t *vd) 545240868Spjd{ 546240868Spjd int c; 547240868Spjd 548240868Spjd if (vd == NULL) 549240868Spjd return; 550240868Spjd 551240868Spjd if (vd->vdev_ops->vdev_op_leaf) { 552240868Spjd trim_map_vdev_commit_done(spa, vd); 553240868Spjd } else { 554240868Spjd for (c = 0; c < vd->vdev_children; c++) 555240868Spjd trim_map_commit_done(spa, vd->vdev_child[c]); 556240868Spjd } 557240868Spjd} 558240868Spjd 559240868Spjdstatic void 560240868Spjdtrim_thread(void *arg) 561240868Spjd{ 562240868Spjd spa_t *spa = arg; 563240868Spjd zio_t *zio; 564240868Spjd 565248576Ssmh#ifdef _KERNEL 566248576Ssmh (void) snprintf(curthread->td_name, sizeof(curthread->td_name), 567248576Ssmh "trim %s", spa_name(spa)); 568248576Ssmh#endif 569248576Ssmh 570240868Spjd for (;;) { 571240868Spjd mutex_enter(&spa->spa_trim_lock); 572240868Spjd if (spa->spa_trim_thread == NULL) { 573240868Spjd spa->spa_trim_thread = curthread; 574240868Spjd cv_signal(&spa->spa_trim_cv); 575240868Spjd mutex_exit(&spa->spa_trim_lock); 576240868Spjd thread_exit(); 577240868Spjd } 578248577Ssmh 579248577Ssmh (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, 580248577Ssmh hz * trim_max_interval); 581240868Spjd mutex_exit(&spa->spa_trim_lock); 582240868Spjd 583240868Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 584240868Spjd 585240868Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 586240868Spjd trim_map_commit(spa, zio, spa->spa_root_vdev); 587240868Spjd (void) zio_wait(zio); 588240868Spjd trim_map_commit_done(spa, spa->spa_root_vdev); 589240868Spjd spa_config_exit(spa, SCL_STATE, FTAG); 590240868Spjd } 591240868Spjd} 592240868Spjd 593240868Spjdvoid 594240868Spjdtrim_thread_create(spa_t *spa) 595240868Spjd{ 596240868Spjd 597249921Ssmh if (!zfs_trim_enabled) 598240868Spjd return; 599240868Spjd 600240868Spjd mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); 601240868Spjd cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); 602240868Spjd mutex_enter(&spa->spa_trim_lock); 603240868Spjd spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, 604240868Spjd TS_RUN, minclsyspri); 605240868Spjd mutex_exit(&spa->spa_trim_lock); 606240868Spjd} 607240868Spjd 608240868Spjdvoid 609240868Spjdtrim_thread_destroy(spa_t *spa) 610240868Spjd{ 611240868Spjd 612249921Ssmh if (!zfs_trim_enabled) 613240868Spjd return; 614240868Spjd if (spa->spa_trim_thread == NULL) 615240868Spjd return; 616240868Spjd 617240868Spjd mutex_enter(&spa->spa_trim_lock); 618240868Spjd /* Setting spa_trim_thread to NULL tells the thread to stop. */ 619240868Spjd spa->spa_trim_thread = NULL; 620240868Spjd cv_signal(&spa->spa_trim_cv); 621240868Spjd /* The thread will set it back to != NULL on exit. */ 622240868Spjd while (spa->spa_trim_thread == NULL) 623240868Spjd cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); 624240868Spjd spa->spa_trim_thread = NULL; 625240868Spjd mutex_exit(&spa->spa_trim_lock); 626240868Spjd 627240868Spjd cv_destroy(&spa->spa_trim_cv); 628240868Spjd mutex_destroy(&spa->spa_trim_lock); 629240868Spjd} 630240868Spjd 631240868Spjdvoid 632240868Spjdtrim_thread_wakeup(spa_t *spa) 633240868Spjd{ 634240868Spjd 635249921Ssmh if (!zfs_trim_enabled) 636240868Spjd return; 637240868Spjd if (spa->spa_trim_thread == NULL) 638240868Spjd return; 639240868Spjd 640240868Spjd mutex_enter(&spa->spa_trim_lock); 641240868Spjd cv_signal(&spa->spa_trim_cv); 642240868Spjd mutex_exit(&spa->spa_trim_lock); 643240868Spjd} 644