1240868Spjd/*
2240868Spjd * CDDL HEADER START
3240868Spjd *
4240868Spjd * The contents of this file are subject to the terms of the
5240868Spjd * Common Development and Distribution License (the "License").
6240868Spjd * You may not use this file except in compliance with the License.
7240868Spjd *
8240868Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9240868Spjd * or http://www.opensolaris.org/os/licensing.
10240868Spjd * See the License for the specific language governing permissions
11240868Spjd * and limitations under the License.
12240868Spjd *
13240868Spjd * When distributing Covered Code, include this CDDL HEADER in each
14240868Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15240868Spjd * If applicable, add the following below this CDDL HEADER, with the
16240868Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17240868Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18240868Spjd *
19240868Spjd * CDDL HEADER END
20240868Spjd */
21240868Spjd/*
22240868Spjd * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
23240868Spjd * All rights reserved.
24240868Spjd */
25240868Spjd
26240868Spjd#include <sys/zfs_context.h>
27240868Spjd#include <sys/spa_impl.h>
28240868Spjd#include <sys/vdev_impl.h>
29240868Spjd#include <sys/trim_map.h>
30248575Ssmh#include <sys/time.h>
31240868Spjd
/*
 * Compute the (non-inclusive) end offset of an I/O that starts at 'offset'
 * and spans 'size' bytes on vdev 'vd'.  The size is rounded up to a multiple
 * of 1 << ashift of the vdev's top-level vdev, mirroring the upgrade that
 * would be done by zio_vdev_io_start.
 *
 * This makes free range consolidation much more effective than it would
 * otherwise be, and ensures that entire blocks are invalidated by writes.
 *
 * Every argument is parenthesized in the expansion so that callers may pass
 * arbitrary expressions (e.g. '&vdev' or a conditional offset) without the
 * surrounding operators changing how they are parsed.
 */
#define	TRIM_ZIO_END(vd, offset, size)	((offset) +			\
	P2ROUNDUP((size), 1ULL << (vd)->vdev_top->vdev_ashift))
42244187Ssmh
/*
 * Maximal segment size for ATA TRIM: 512 << 16 bytes (32 MiB).
 */
#define	TRIM_MAP_SIZE_FACTOR	(512 << 16)

/*
 * Number of TRIM_MAP_SIZE_FACTOR-sized pieces a range of 'size' bytes is
 * accounted as.  Always at least one, even for an empty range.
 */
#define	TRIM_MAP_SEGS(size)	((size) / TRIM_MAP_SIZE_FACTOR + 1)

/*
 * Append segment 'ts' to the tail of the map's segment list and charge its
 * current length to the pending counter.  Note that 'tm' and 'ts' are
 * evaluated more than once.
 */
#define	TRIM_MAP_ADD(tm, ts)	do {					\
	list_insert_tail(&(tm)->tm_head, (ts));				\
	(tm)->tm_pending +=						\
	    TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start);		\
} while (0)

/*
 * Unlink segment 'ts' from the map's segment list and refund its current
 * length from the pending counter.  Must be invoked before the segment's
 * bounds are modified so that the refund matches the earlier charge.
 */
#define	TRIM_MAP_REM(tm, ts)	do {					\
	list_remove(&(tm)->tm_head, (ts));				\
	(tm)->tm_pending -=						\
	    TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start);		\
} while (0)
57248577Ssmh
58240868Spjdtypedef struct trim_map {
59240868Spjd	list_t		tm_head;		/* List of segments sorted by txg. */
60240868Spjd	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
61240868Spjd	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
62240868Spjd	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
63240868Spjd	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
64240868Spjd	kmutex_t	tm_lock;
65248577Ssmh	uint64_t	tm_pending;		/* Count of pending TRIMs. */
66240868Spjd} trim_map_t;
67240868Spjd
68240868Spjdtypedef struct trim_seg {
69240868Spjd	avl_node_t	ts_node;	/* AVL node. */
70240868Spjd	list_node_t	ts_next;	/* List element. */
71240868Spjd	uint64_t	ts_start;	/* Starting offset of this segment. */
72240868Spjd	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
73240868Spjd	uint64_t	ts_txg;		/* Segment creation txg. */
74248575Ssmh	hrtime_t	ts_time;	/* Segment creation time. */
75240868Spjd} trim_seg_t;
76240868Spjd
77249921Ssmhextern boolean_t zfs_trim_enabled;
78240868Spjd
79277818Smavstatic u_int trim_txg_delay = 32;	/* Keep deleted data up to 32 TXG */
80277818Smavstatic u_int trim_timeout = 30;		/* Keep deleted data up to 30s */
81277818Smavstatic u_int trim_max_interval = 1;	/* 1s delays between TRIMs */
82277818Smavstatic u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */
83248577Ssmh
84240868SpjdSYSCTL_DECL(_vfs_zfs);
85248577SsmhSYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");
86240868Spjd
87248577SsmhTUNABLE_INT("vfs.zfs.trim.txg_delay", &trim_txg_delay);
88248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
89248577Ssmh    0, "Delay TRIMs by up to this many TXGs");
90248575Ssmh
91248577SsmhTUNABLE_INT("vfs.zfs.trim.timeout", &trim_timeout);
92248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
93248577Ssmh    "Delay TRIMs by up to this many seconds");
94248577Ssmh
95248577SsmhTUNABLE_INT("vfs.zfs.trim.max_interval", &trim_max_interval);
96248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
97248577Ssmh    &trim_max_interval, 0,
98248577Ssmh    "Maximum interval between TRIM queue processing (seconds)");
99248577Ssmh
100248577SsmhSYSCTL_DECL(_vfs_zfs_vdev);
101248577SsmhTUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending);
102248577SsmhSYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
103248577Ssmh    &trim_vdev_max_pending, 0,
104248577Ssmh    "Maximum pending TRIM segments for a vdev");
105248577Ssmh
106248577Ssmh
107240868Spjdstatic void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
108240868Spjd
109240868Spjdstatic int
110240868Spjdtrim_map_seg_compare(const void *x1, const void *x2)
111240868Spjd{
112240868Spjd	const trim_seg_t *s1 = x1;
113240868Spjd	const trim_seg_t *s2 = x2;
114240868Spjd
115240868Spjd	if (s1->ts_start < s2->ts_start) {
116240868Spjd		if (s1->ts_end > s2->ts_start)
117240868Spjd			return (0);
118240868Spjd		return (-1);
119240868Spjd	}
120240868Spjd	if (s1->ts_start > s2->ts_start) {
121240868Spjd		if (s1->ts_start < s2->ts_end)
122240868Spjd			return (0);
123240868Spjd		return (1);
124240868Spjd	}
125240868Spjd	return (0);
126240868Spjd}
127240868Spjd
128240868Spjdstatic int
129240868Spjdtrim_map_zio_compare(const void *x1, const void *x2)
130240868Spjd{
131240868Spjd	const zio_t *z1 = x1;
132240868Spjd	const zio_t *z2 = x2;
133240868Spjd
134240868Spjd	if (z1->io_offset < z2->io_offset) {
135240868Spjd		if (z1->io_offset + z1->io_size > z2->io_offset)
136240868Spjd			return (0);
137240868Spjd		return (-1);
138240868Spjd	}
139240868Spjd	if (z1->io_offset > z2->io_offset) {
140240868Spjd		if (z1->io_offset < z2->io_offset + z2->io_size)
141240868Spjd			return (0);
142240868Spjd		return (1);
143240868Spjd	}
144240868Spjd	return (0);
145240868Spjd}
146240868Spjd
147240868Spjdvoid
148240868Spjdtrim_map_create(vdev_t *vd)
149240868Spjd{
150240868Spjd	trim_map_t *tm;
151240868Spjd
152274800Ssmh	ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
153274800Ssmh		vd->vdev_ops->vdev_op_leaf);
154240868Spjd
155240868Spjd	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
156240868Spjd	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
157240868Spjd	list_create(&tm->tm_head, sizeof (trim_seg_t),
158240868Spjd	    offsetof(trim_seg_t, ts_next));
159240868Spjd	list_create(&tm->tm_pending_writes, sizeof (zio_t),
160240868Spjd	    offsetof(zio_t, io_trim_link));
161240868Spjd	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
162240868Spjd	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
163240868Spjd	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
164240868Spjd	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
165240868Spjd	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
166240868Spjd	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
167240868Spjd	vd->vdev_trimmap = tm;
168240868Spjd}
169240868Spjd
170240868Spjdvoid
171240868Spjdtrim_map_destroy(vdev_t *vd)
172240868Spjd{
173240868Spjd	trim_map_t *tm;
174240868Spjd	trim_seg_t *ts;
175240868Spjd
176240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
177240868Spjd
178249921Ssmh	if (!zfs_trim_enabled)
179240868Spjd		return;
180240868Spjd
181240868Spjd	tm = vd->vdev_trimmap;
182240868Spjd	if (tm == NULL)
183240868Spjd		return;
184240868Spjd
185240868Spjd	/*
186240868Spjd	 * We may have been called before trim_map_vdev_commit_done()
187240868Spjd	 * had a chance to run, so do it now to prune the remaining
188240868Spjd	 * inflight frees.
189240868Spjd	 */
190240868Spjd	trim_map_vdev_commit_done(vd->vdev_spa, vd);
191240868Spjd
192240868Spjd	mutex_enter(&tm->tm_lock);
193240868Spjd	while ((ts = list_head(&tm->tm_head)) != NULL) {
194240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
195277818Smav		TRIM_MAP_REM(tm, ts);
196240868Spjd		kmem_free(ts, sizeof (*ts));
197240868Spjd	}
198240868Spjd	mutex_exit(&tm->tm_lock);
199240868Spjd
200240868Spjd	avl_destroy(&tm->tm_queued_frees);
201240868Spjd	avl_destroy(&tm->tm_inflight_frees);
202240868Spjd	avl_destroy(&tm->tm_inflight_writes);
203240868Spjd	list_destroy(&tm->tm_pending_writes);
204240868Spjd	list_destroy(&tm->tm_head);
205240868Spjd	mutex_destroy(&tm->tm_lock);
206240868Spjd	kmem_free(tm, sizeof (*tm));
207240868Spjd	vd->vdev_trimmap = NULL;
208240868Spjd}
209240868Spjd
210240868Spjdstatic void
211240868Spjdtrim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
212240868Spjd{
213240868Spjd	avl_index_t where;
214240868Spjd	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
215240868Spjd	boolean_t merge_before, merge_after;
216248575Ssmh	hrtime_t time;
217240868Spjd
218240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
219240868Spjd	VERIFY(start < end);
220240868Spjd
221248575Ssmh	time = gethrtime();
222240868Spjd	tsearch.ts_start = start;
223240868Spjd	tsearch.ts_end = end;
224240868Spjd
225240868Spjd	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
226240868Spjd	if (ts != NULL) {
227240868Spjd		if (start < ts->ts_start)
228240868Spjd			trim_map_segment_add(tm, start, ts->ts_start, txg);
229240868Spjd		if (end > ts->ts_end)
230240868Spjd			trim_map_segment_add(tm, ts->ts_end, end, txg);
231240868Spjd		return;
232240868Spjd	}
233240868Spjd
234240868Spjd	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
235240868Spjd	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
236240868Spjd
237248577Ssmh	merge_before = (ts_before != NULL && ts_before->ts_end == start);
238248577Ssmh	merge_after = (ts_after != NULL && ts_after->ts_start == end);
239240868Spjd
240240868Spjd	if (merge_before && merge_after) {
241240868Spjd		avl_remove(&tm->tm_queued_frees, ts_before);
242277818Smav		TRIM_MAP_REM(tm, ts_before);
243277818Smav		TRIM_MAP_REM(tm, ts_after);
244240868Spjd		ts_after->ts_start = ts_before->ts_start;
245248577Ssmh		ts_after->ts_txg = txg;
246248577Ssmh		ts_after->ts_time = time;
247277818Smav		TRIM_MAP_ADD(tm, ts_after);
248240868Spjd		kmem_free(ts_before, sizeof (*ts_before));
249240868Spjd	} else if (merge_before) {
250277818Smav		TRIM_MAP_REM(tm, ts_before);
251240868Spjd		ts_before->ts_end = end;
252248577Ssmh		ts_before->ts_txg = txg;
253248577Ssmh		ts_before->ts_time = time;
254277818Smav		TRIM_MAP_ADD(tm, ts_before);
255240868Spjd	} else if (merge_after) {
256277818Smav		TRIM_MAP_REM(tm, ts_after);
257240868Spjd		ts_after->ts_start = start;
258248577Ssmh		ts_after->ts_txg = txg;
259248577Ssmh		ts_after->ts_time = time;
260277818Smav		TRIM_MAP_ADD(tm, ts_after);
261240868Spjd	} else {
262240868Spjd		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
263240868Spjd		ts->ts_start = start;
264240868Spjd		ts->ts_end = end;
265240868Spjd		ts->ts_txg = txg;
266248575Ssmh		ts->ts_time = time;
267240868Spjd		avl_insert(&tm->tm_queued_frees, ts, where);
268277818Smav		TRIM_MAP_ADD(tm, ts);
269240868Spjd	}
270240868Spjd}
271240868Spjd
272240868Spjdstatic void
273240868Spjdtrim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
274240868Spjd    uint64_t end)
275240868Spjd{
276240868Spjd	trim_seg_t *nts;
277240868Spjd	boolean_t left_over, right_over;
278240868Spjd
279240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
280240868Spjd
281240868Spjd	left_over = (ts->ts_start < start);
282240868Spjd	right_over = (ts->ts_end > end);
283240868Spjd
284277818Smav	TRIM_MAP_REM(tm, ts);
285240868Spjd	if (left_over && right_over) {
286240868Spjd		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
287240868Spjd		nts->ts_start = end;
288240868Spjd		nts->ts_end = ts->ts_end;
289240868Spjd		nts->ts_txg = ts->ts_txg;
290248575Ssmh		nts->ts_time = ts->ts_time;
291240868Spjd		ts->ts_end = start;
292240868Spjd		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
293277818Smav		TRIM_MAP_ADD(tm, ts);
294277818Smav		TRIM_MAP_ADD(tm, nts);
295240868Spjd	} else if (left_over) {
296240868Spjd		ts->ts_end = start;
297277818Smav		TRIM_MAP_ADD(tm, ts);
298240868Spjd	} else if (right_over) {
299240868Spjd		ts->ts_start = end;
300277818Smav		TRIM_MAP_ADD(tm, ts);
301240868Spjd	} else {
302240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
303240868Spjd		kmem_free(ts, sizeof (*ts));
304240868Spjd	}
305240868Spjd}
306240868Spjd
307240868Spjdstatic void
308240868Spjdtrim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
309240868Spjd{
310240868Spjd	zio_t zsearch, *zs;
311240868Spjd
312240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
313240868Spjd
314240868Spjd	zsearch.io_offset = start;
315240868Spjd	zsearch.io_size = end - start;
316240868Spjd
317240868Spjd	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
318240868Spjd	if (zs == NULL) {
319240868Spjd		trim_map_segment_add(tm, start, end, txg);
320240868Spjd		return;
321240868Spjd	}
322240868Spjd	if (start < zs->io_offset)
323240868Spjd		trim_map_free_locked(tm, start, zs->io_offset, txg);
324240868Spjd	if (zs->io_offset + zs->io_size < end)
325240868Spjd		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
326240868Spjd}
327240868Spjd
328240868Spjdvoid
329248574Ssmhtrim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
330240868Spjd{
331240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
332240868Spjd
333249921Ssmh	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
334240868Spjd		return;
335240868Spjd
336240868Spjd	mutex_enter(&tm->tm_lock);
337248574Ssmh	trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
338240868Spjd	mutex_exit(&tm->tm_lock);
339240868Spjd}
340240868Spjd
341240868Spjdboolean_t
342240868Spjdtrim_map_write_start(zio_t *zio)
343240868Spjd{
344240868Spjd	vdev_t *vd = zio->io_vd;
345240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
346240868Spjd	trim_seg_t tsearch, *ts;
347240868Spjd	boolean_t left_over, right_over;
348240868Spjd	uint64_t start, end;
349240868Spjd
350249921Ssmh	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
351240868Spjd		return (B_TRUE);
352240868Spjd
353240868Spjd	start = zio->io_offset;
354248572Ssmh	end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
355240868Spjd	tsearch.ts_start = start;
356240868Spjd	tsearch.ts_end = end;
357240868Spjd
358240868Spjd	mutex_enter(&tm->tm_lock);
359240868Spjd
360240868Spjd	/*
361240868Spjd	 * Checking for colliding in-flight frees.
362240868Spjd	 */
363240868Spjd	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
364240868Spjd	if (ts != NULL) {
365240868Spjd		list_insert_tail(&tm->tm_pending_writes, zio);
366240868Spjd		mutex_exit(&tm->tm_lock);
367240868Spjd		return (B_FALSE);
368240868Spjd	}
369240868Spjd
370240868Spjd	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
371240868Spjd	if (ts != NULL) {
372240868Spjd		/*
373240868Spjd		 * Loop until all overlapping segments are removed.
374240868Spjd		 */
375240868Spjd		do {
376240868Spjd			trim_map_segment_remove(tm, ts, start, end);
377240868Spjd			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
378240868Spjd		} while (ts != NULL);
379240868Spjd	}
380240868Spjd	avl_add(&tm->tm_inflight_writes, zio);
381240868Spjd
382240868Spjd	mutex_exit(&tm->tm_lock);
383240868Spjd
384240868Spjd	return (B_TRUE);
385240868Spjd}
386240868Spjd
387240868Spjdvoid
388240868Spjdtrim_map_write_done(zio_t *zio)
389240868Spjd{
390240868Spjd	vdev_t *vd = zio->io_vd;
391240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
392240868Spjd
393240868Spjd	/*
394240868Spjd	 * Don't check for vdev_notrim, since the write could have
395240868Spjd	 * started before vdev_notrim was set.
396240868Spjd	 */
397249921Ssmh	if (!zfs_trim_enabled || tm == NULL)
398240868Spjd		return;
399240868Spjd
400240868Spjd	mutex_enter(&tm->tm_lock);
401240868Spjd	/*
402240868Spjd	 * Don't fail if the write isn't in the tree, since the write
403240868Spjd	 * could have started after vdev_notrim was set.
404240868Spjd	 */
405240868Spjd	if (zio->io_trim_node.avl_child[0] ||
406240868Spjd	    zio->io_trim_node.avl_child[1] ||
407240868Spjd	    AVL_XPARENT(&zio->io_trim_node) ||
408240868Spjd	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
409240868Spjd		avl_remove(&tm->tm_inflight_writes, zio);
410240868Spjd	mutex_exit(&tm->tm_lock);
411240868Spjd}
412240868Spjd
413240868Spjd/*
414248577Ssmh * Return the oldest segment (the one with the lowest txg / time) or NULL if:
415248577Ssmh * 1. The list is empty
416248577Ssmh * 2. The first element's txg is greater than txgsafe
417248577Ssmh * 3. The first element's txg is not greater than the txg argument and the
418248577Ssmh *    the first element's time is not greater than time argument
419240868Spjd */
420240868Spjdstatic trim_seg_t *
421277818Smavtrim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
422277818Smav    boolean_t force)
423240868Spjd{
424240868Spjd	trim_seg_t *ts;
425240868Spjd
426240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
427248577Ssmh	VERIFY(txgsafe >= txg);
428240868Spjd
429240868Spjd	ts = list_head(&tm->tm_head);
430248577Ssmh	if (ts != NULL && ts->ts_txg <= txgsafe &&
431277818Smav	    (ts->ts_txg <= txg || ts->ts_time <= time || force))
432240868Spjd		return (ts);
433240868Spjd	return (NULL);
434240868Spjd}
435240868Spjd
436240868Spjdstatic void
437240868Spjdtrim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
438240868Spjd{
439240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
440240868Spjd	trim_seg_t *ts;
441270312Ssmh	uint64_t size, offset, txgtarget, txgsafe;
442277818Smav	int64_t hard, soft;
443248575Ssmh	hrtime_t timelimit;
444240868Spjd
445240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
446240868Spjd
447240868Spjd	if (tm == NULL)
448240868Spjd		return;
449240868Spjd
450277819Smav	timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
451248575Ssmh	if (vd->vdev_isl2cache) {
452248577Ssmh		txgsafe = UINT64_MAX;
453248577Ssmh		txgtarget = UINT64_MAX;
454248575Ssmh	} else {
455248577Ssmh		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
456248577Ssmh		if (txgsafe > trim_txg_delay)
457248577Ssmh			txgtarget = txgsafe - trim_txg_delay;
458248577Ssmh		else
459248577Ssmh			txgtarget = 0;
460248575Ssmh	}
461240868Spjd
462240868Spjd	mutex_enter(&tm->tm_lock);
463277818Smav	hard = 0;
464277818Smav	if (tm->tm_pending > trim_vdev_max_pending)
465277818Smav		hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
466277818Smav	soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
467248577Ssmh	/* Loop until we have sent all outstanding free's */
468277818Smav	while (soft > 0 &&
469277818Smav	    (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
470248577Ssmh	    != NULL) {
471277818Smav		TRIM_MAP_REM(tm, ts);
472240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
473240868Spjd		avl_add(&tm->tm_inflight_frees, ts);
474248577Ssmh		size = ts->ts_end - ts->ts_start;
475270312Ssmh		offset = ts->ts_start;
476270312Ssmh		/*
477270312Ssmh		 * We drop the lock while we call zio_nowait as the IO
478270312Ssmh		 * scheduler can result in a different IO being run e.g.
479270312Ssmh		 * a write which would result in a recursive lock.
480270312Ssmh		 */
481270312Ssmh		mutex_exit(&tm->tm_lock);
482270312Ssmh
483270312Ssmh		zio_nowait(zio_trim(zio, spa, vd, offset, size));
484270312Ssmh
485277818Smav		soft -= TRIM_MAP_SEGS(size);
486277818Smav		hard -= TRIM_MAP_SEGS(size);
487270312Ssmh		mutex_enter(&tm->tm_lock);
488240868Spjd	}
489240868Spjd	mutex_exit(&tm->tm_lock);
490240868Spjd}
491240868Spjd
492240868Spjdstatic void
493240868Spjdtrim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
494240868Spjd{
495240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
496240868Spjd	trim_seg_t *ts;
497240868Spjd	list_t pending_writes;
498240868Spjd	zio_t *zio;
499240868Spjd	uint64_t start, size;
500240868Spjd	void *cookie;
501240868Spjd
502240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
503240868Spjd
504240868Spjd	if (tm == NULL)
505240868Spjd		return;
506240868Spjd
507240868Spjd	mutex_enter(&tm->tm_lock);
508240868Spjd	if (!avl_is_empty(&tm->tm_inflight_frees)) {
509240868Spjd		cookie = NULL;
510240868Spjd		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
511240868Spjd		    &cookie)) != NULL) {
512240868Spjd			kmem_free(ts, sizeof (*ts));
513240868Spjd		}
514240868Spjd	}
515240868Spjd	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
516240868Spjd	    io_trim_link));
517240868Spjd	list_move_tail(&pending_writes, &tm->tm_pending_writes);
518240868Spjd	mutex_exit(&tm->tm_lock);
519240868Spjd
520240868Spjd	while ((zio = list_remove_head(&pending_writes)) != NULL) {
521240868Spjd		zio_vdev_io_reissue(zio);
522240868Spjd		zio_execute(zio);
523240868Spjd	}
524240868Spjd	list_destroy(&pending_writes);
525240868Spjd}
526240868Spjd
527240868Spjdstatic void
528240868Spjdtrim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
529240868Spjd{
530240868Spjd	int c;
531240868Spjd
532248577Ssmh	if (vd == NULL)
533240868Spjd		return;
534240868Spjd
535240868Spjd	if (vd->vdev_ops->vdev_op_leaf) {
536240868Spjd		trim_map_vdev_commit(spa, zio, vd);
537240868Spjd	} else {
538240868Spjd		for (c = 0; c < vd->vdev_children; c++)
539240868Spjd			trim_map_commit(spa, zio, vd->vdev_child[c]);
540240868Spjd	}
541240868Spjd}
542240868Spjd
543240868Spjdstatic void
544240868Spjdtrim_map_commit_done(spa_t *spa, vdev_t *vd)
545240868Spjd{
546240868Spjd	int c;
547240868Spjd
548240868Spjd	if (vd == NULL)
549240868Spjd		return;
550240868Spjd
551240868Spjd	if (vd->vdev_ops->vdev_op_leaf) {
552240868Spjd		trim_map_vdev_commit_done(spa, vd);
553240868Spjd	} else {
554240868Spjd		for (c = 0; c < vd->vdev_children; c++)
555240868Spjd			trim_map_commit_done(spa, vd->vdev_child[c]);
556240868Spjd	}
557240868Spjd}
558240868Spjd
559240868Spjdstatic void
560240868Spjdtrim_thread(void *arg)
561240868Spjd{
562240868Spjd	spa_t *spa = arg;
563240868Spjd	zio_t *zio;
564240868Spjd
565248576Ssmh#ifdef _KERNEL
566248576Ssmh	(void) snprintf(curthread->td_name, sizeof(curthread->td_name),
567248576Ssmh	    "trim %s", spa_name(spa));
568248576Ssmh#endif
569248576Ssmh
570240868Spjd	for (;;) {
571240868Spjd		mutex_enter(&spa->spa_trim_lock);
572240868Spjd		if (spa->spa_trim_thread == NULL) {
573240868Spjd			spa->spa_trim_thread = curthread;
574240868Spjd			cv_signal(&spa->spa_trim_cv);
575240868Spjd			mutex_exit(&spa->spa_trim_lock);
576240868Spjd			thread_exit();
577240868Spjd		}
578248577Ssmh
579248577Ssmh		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
580248577Ssmh		    hz * trim_max_interval);
581240868Spjd		mutex_exit(&spa->spa_trim_lock);
582240868Spjd
583240868Spjd		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
584240868Spjd
585240868Spjd		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
586240868Spjd		trim_map_commit(spa, zio, spa->spa_root_vdev);
587240868Spjd		(void) zio_wait(zio);
588240868Spjd		trim_map_commit_done(spa, spa->spa_root_vdev);
589240868Spjd		spa_config_exit(spa, SCL_STATE, FTAG);
590240868Spjd	}
591240868Spjd}
592240868Spjd
593240868Spjdvoid
594240868Spjdtrim_thread_create(spa_t *spa)
595240868Spjd{
596240868Spjd
597249921Ssmh	if (!zfs_trim_enabled)
598240868Spjd		return;
599240868Spjd
600240868Spjd	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
601240868Spjd	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
602240868Spjd	mutex_enter(&spa->spa_trim_lock);
603240868Spjd	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
604240868Spjd	    TS_RUN, minclsyspri);
605240868Spjd	mutex_exit(&spa->spa_trim_lock);
606240868Spjd}
607240868Spjd
608240868Spjdvoid
609240868Spjdtrim_thread_destroy(spa_t *spa)
610240868Spjd{
611240868Spjd
612249921Ssmh	if (!zfs_trim_enabled)
613240868Spjd		return;
614240868Spjd	if (spa->spa_trim_thread == NULL)
615240868Spjd		return;
616240868Spjd
617240868Spjd	mutex_enter(&spa->spa_trim_lock);
618240868Spjd	/* Setting spa_trim_thread to NULL tells the thread to stop. */
619240868Spjd	spa->spa_trim_thread = NULL;
620240868Spjd	cv_signal(&spa->spa_trim_cv);
621240868Spjd	/* The thread will set it back to != NULL on exit. */
622240868Spjd	while (spa->spa_trim_thread == NULL)
623240868Spjd		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
624240868Spjd	spa->spa_trim_thread = NULL;
625240868Spjd	mutex_exit(&spa->spa_trim_lock);
626240868Spjd
627240868Spjd	cv_destroy(&spa->spa_trim_cv);
628240868Spjd	mutex_destroy(&spa->spa_trim_lock);
629240868Spjd}
630240868Spjd
631240868Spjdvoid
632240868Spjdtrim_thread_wakeup(spa_t *spa)
633240868Spjd{
634240868Spjd
635249921Ssmh	if (!zfs_trim_enabled)
636240868Spjd		return;
637240868Spjd	if (spa->spa_trim_thread == NULL)
638240868Spjd		return;
639240868Spjd
640240868Spjd	mutex_enter(&spa->spa_trim_lock);
641240868Spjd	cv_signal(&spa->spa_trim_cv);
642240868Spjd	mutex_exit(&spa->spa_trim_lock);
643240868Spjd}
644