1240868Spjd/*
2240868Spjd * CDDL HEADER START
3240868Spjd *
4240868Spjd * The contents of this file are subject to the terms of the
5240868Spjd * Common Development and Distribution License (the "License").
6240868Spjd * You may not use this file except in compliance with the License.
7240868Spjd *
8240868Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9240868Spjd * or http://www.opensolaris.org/os/licensing.
10240868Spjd * See the License for the specific language governing permissions
11240868Spjd * and limitations under the License.
12240868Spjd *
13240868Spjd * When distributing Covered Code, include this CDDL HEADER in each
14240868Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15240868Spjd * If applicable, add the following below this CDDL HEADER, with the
16240868Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17240868Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18240868Spjd *
19240868Spjd * CDDL HEADER END
20240868Spjd */
21240868Spjd/*
22240868Spjd * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
23240868Spjd * All rights reserved.
24240868Spjd */
25240868Spjd
26240868Spjd#include <sys/zfs_context.h>
27240868Spjd#include <sys/spa_impl.h>
28240868Spjd#include <sys/vdev_impl.h>
29240868Spjd#include <sys/trim_map.h>
30251419Ssmh#include <sys/time.h>
31240868Spjd
/*
 * Calculate the zio end, upgrading based on ashift which would be
 * done by zio_vdev_io_start.
 *
 * This makes free range consolidation much more effective
 * than it would otherwise be as well as ensuring that entire
 * blocks are invalidated by writes.
 *
 * All arguments are parenthesized so that arbitrary expressions (for
 * example "&leaf" or "a ? b : c") can be passed safely.
 */
#define	TRIM_ZIO_END(vd, offset, size)	((offset) +		\
	P2ROUNDUP((size), 1ULL << (vd)->vdev_top->vdev_ashift))

/*
 * Accounting helpers for the per-map counters.  Each one expands to a
 * single expression WITHOUT a trailing semicolon, so it can be used like
 * a function call, including as the body of an unbraced if/else.
 */

/* Add "size" bytes to tm_bytes. */
#define	TRIM_MAP_SINC(tm, size)					\
	atomic_add_64(&(tm)->tm_bytes, (size))

/* Subtract "size" bytes from tm_bytes (adds the negated value). */
#define	TRIM_MAP_SDEC(tm, size)					\
	atomic_add_64(&(tm)->tm_bytes, -(size))

/* Increment tm_pending. */
#define	TRIM_MAP_QINC(tm)					\
	atomic_inc_64(&(tm)->tm_pending)

/* Decrement tm_pending. */
#define	TRIM_MAP_QDEC(tm)					\
	atomic_dec_64(&(tm)->tm_pending)
54251419Ssmh
/*
 * TRIM bookkeeping attached to a leaf vdev (see trim_map_create(), which
 * stores it in vd->vdev_trimmap).  The functions in this file that
 * manipulate the lists and trees below do so with tm_lock held; the two
 * counters are updated through the atomic TRIM_MAP_* macros.
 */
typedef struct trim_map {
	list_t		tm_head;		/* List of segments sorted by txg. */
	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
	kmutex_t	tm_lock;		/* Held while the lists/trees above are changed. */
	uint64_t	tm_pending;		/* Count of pending TRIMs. */
	uint64_t	tm_bytes;		/* Total size in bytes of queued TRIMs. */
} trim_map_t;
65240868Spjd
/*
 * One contiguous freed range, [ts_start, ts_end).  A segment is linked
 * into an AVL tree through ts_node (tm_queued_frees or tm_inflight_frees)
 * and, while queued, into the tm_head list through ts_next.
 */
typedef struct trim_seg {
	avl_node_t	ts_node;	/* AVL node. */
	list_node_t	ts_next;	/* List element. */
	uint64_t	ts_start;	/* Starting offset of this segment. */
	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
	uint64_t	ts_txg;		/* Segment creation txg. */
	hrtime_t	ts_time;	/* Segment creation time. */
} trim_seg_t;
74240868Spjd
/*
 * Global TRIM switch, defined outside this file.  Every non-static entry
 * point below returns early when it is false.
 */
extern boolean_t zfs_trim_enabled;

/* Number of txgs subtracted from the safe txg to get the TRIM target txg. */
static u_int trim_txg_delay = 32;
/* Age in seconds after which a queued segment is eligible for TRIM. */
static u_int trim_timeout = 30;
/* Upper bound, in seconds, on how long the trim thread sleeps per pass. */
static u_int trim_max_interval = 1;
/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */
static uint64_t trim_vdev_max_bytes = 2147483648;
/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */
static u_int trim_vdev_max_pending = 64;

/* Expose the knobs above as loader tunables and vfs.zfs sysctls. */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");

TUNABLE_INT("vfs.zfs.trim.txg_delay", &trim_txg_delay);
SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
    0, "Delay TRIMs by up to this many TXGs");

TUNABLE_INT("vfs.zfs.trim.timeout", &trim_timeout);
SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
    "Delay TRIMs by up to this many seconds");

TUNABLE_INT("vfs.zfs.trim.max_interval", &trim_max_interval);
SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
    &trim_max_interval, 0,
    "Maximum interval between TRIM queue processing (seconds)");

SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes);
SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN,
    &trim_vdev_max_bytes, 0,
    "Maximum pending TRIM bytes for a vdev");

TUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending);
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
    &trim_vdev_max_pending, 0,
    "Maximum pending TRIM segments for a vdev");


/* Forward declaration: trim_map_destroy() calls this before it is defined. */
static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
114240868Spjd
115240868Spjdstatic int
116240868Spjdtrim_map_seg_compare(const void *x1, const void *x2)
117240868Spjd{
118240868Spjd	const trim_seg_t *s1 = x1;
119240868Spjd	const trim_seg_t *s2 = x2;
120240868Spjd
121240868Spjd	if (s1->ts_start < s2->ts_start) {
122240868Spjd		if (s1->ts_end > s2->ts_start)
123240868Spjd			return (0);
124240868Spjd		return (-1);
125240868Spjd	}
126240868Spjd	if (s1->ts_start > s2->ts_start) {
127240868Spjd		if (s1->ts_start < s2->ts_end)
128240868Spjd			return (0);
129240868Spjd		return (1);
130240868Spjd	}
131240868Spjd	return (0);
132240868Spjd}
133240868Spjd
134240868Spjdstatic int
135240868Spjdtrim_map_zio_compare(const void *x1, const void *x2)
136240868Spjd{
137240868Spjd	const zio_t *z1 = x1;
138240868Spjd	const zio_t *z2 = x2;
139240868Spjd
140240868Spjd	if (z1->io_offset < z2->io_offset) {
141240868Spjd		if (z1->io_offset + z1->io_size > z2->io_offset)
142240868Spjd			return (0);
143240868Spjd		return (-1);
144240868Spjd	}
145240868Spjd	if (z1->io_offset > z2->io_offset) {
146240868Spjd		if (z1->io_offset < z2->io_offset + z2->io_size)
147240868Spjd			return (0);
148240868Spjd		return (1);
149240868Spjd	}
150240868Spjd	return (0);
151240868Spjd}
152240868Spjd
153240868Spjdvoid
154240868Spjdtrim_map_create(vdev_t *vd)
155240868Spjd{
156240868Spjd	trim_map_t *tm;
157240868Spjd
158240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
159240868Spjd
160251419Ssmh	if (!zfs_trim_enabled)
161240868Spjd		return;
162240868Spjd
163240868Spjd	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
164240868Spjd	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
165240868Spjd	list_create(&tm->tm_head, sizeof (trim_seg_t),
166240868Spjd	    offsetof(trim_seg_t, ts_next));
167240868Spjd	list_create(&tm->tm_pending_writes, sizeof (zio_t),
168240868Spjd	    offsetof(zio_t, io_trim_link));
169240868Spjd	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
170240868Spjd	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
171240868Spjd	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
172240868Spjd	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
173240868Spjd	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
174240868Spjd	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
175240868Spjd	vd->vdev_trimmap = tm;
176240868Spjd}
177240868Spjd
178240868Spjdvoid
179240868Spjdtrim_map_destroy(vdev_t *vd)
180240868Spjd{
181240868Spjd	trim_map_t *tm;
182240868Spjd	trim_seg_t *ts;
183240868Spjd
184240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
185240868Spjd
186251419Ssmh	if (!zfs_trim_enabled)
187240868Spjd		return;
188240868Spjd
189240868Spjd	tm = vd->vdev_trimmap;
190240868Spjd	if (tm == NULL)
191240868Spjd		return;
192240868Spjd
193240868Spjd	/*
194240868Spjd	 * We may have been called before trim_map_vdev_commit_done()
195240868Spjd	 * had a chance to run, so do it now to prune the remaining
196240868Spjd	 * inflight frees.
197240868Spjd	 */
198240868Spjd	trim_map_vdev_commit_done(vd->vdev_spa, vd);
199240868Spjd
200240868Spjd	mutex_enter(&tm->tm_lock);
201240868Spjd	while ((ts = list_head(&tm->tm_head)) != NULL) {
202240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
203240868Spjd		list_remove(&tm->tm_head, ts);
204240868Spjd		kmem_free(ts, sizeof (*ts));
205251419Ssmh		TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start);
206251419Ssmh		TRIM_MAP_QDEC(tm);
207240868Spjd	}
208240868Spjd	mutex_exit(&tm->tm_lock);
209240868Spjd
210240868Spjd	avl_destroy(&tm->tm_queued_frees);
211240868Spjd	avl_destroy(&tm->tm_inflight_frees);
212240868Spjd	avl_destroy(&tm->tm_inflight_writes);
213240868Spjd	list_destroy(&tm->tm_pending_writes);
214240868Spjd	list_destroy(&tm->tm_head);
215240868Spjd	mutex_destroy(&tm->tm_lock);
216240868Spjd	kmem_free(tm, sizeof (*tm));
217240868Spjd	vd->vdev_trimmap = NULL;
218240868Spjd}
219240868Spjd
220240868Spjdstatic void
221240868Spjdtrim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
222240868Spjd{
223240868Spjd	avl_index_t where;
224240868Spjd	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
225240868Spjd	boolean_t merge_before, merge_after;
226251419Ssmh	hrtime_t time;
227240868Spjd
228240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
229240868Spjd	VERIFY(start < end);
230240868Spjd
231251419Ssmh	time = gethrtime();
232240868Spjd	tsearch.ts_start = start;
233240868Spjd	tsearch.ts_end = end;
234240868Spjd
235240868Spjd	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
236240868Spjd	if (ts != NULL) {
237240868Spjd		if (start < ts->ts_start)
238240868Spjd			trim_map_segment_add(tm, start, ts->ts_start, txg);
239240868Spjd		if (end > ts->ts_end)
240240868Spjd			trim_map_segment_add(tm, ts->ts_end, end, txg);
241240868Spjd		return;
242240868Spjd	}
243240868Spjd
244240868Spjd	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
245240868Spjd	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
246240868Spjd
247251419Ssmh	merge_before = (ts_before != NULL && ts_before->ts_end == start);
248251419Ssmh	merge_after = (ts_after != NULL && ts_after->ts_start == end);
249240868Spjd
250240868Spjd	if (merge_before && merge_after) {
251251419Ssmh		TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end);
252251419Ssmh		TRIM_MAP_QDEC(tm);
253240868Spjd		avl_remove(&tm->tm_queued_frees, ts_before);
254240868Spjd		list_remove(&tm->tm_head, ts_before);
255240868Spjd		ts_after->ts_start = ts_before->ts_start;
256251419Ssmh		ts_after->ts_txg = txg;
257251419Ssmh		ts_after->ts_time = time;
258240868Spjd		kmem_free(ts_before, sizeof (*ts_before));
259240868Spjd	} else if (merge_before) {
260251419Ssmh		TRIM_MAP_SINC(tm, end - ts_before->ts_end);
261240868Spjd		ts_before->ts_end = end;
262251419Ssmh		ts_before->ts_txg = txg;
263251419Ssmh		ts_before->ts_time = time;
264240868Spjd	} else if (merge_after) {
265251419Ssmh		TRIM_MAP_SINC(tm, ts_after->ts_start - start);
266240868Spjd		ts_after->ts_start = start;
267251419Ssmh		ts_after->ts_txg = txg;
268251419Ssmh		ts_after->ts_time = time;
269240868Spjd	} else {
270251419Ssmh		TRIM_MAP_SINC(tm, end - start);
271251419Ssmh		TRIM_MAP_QINC(tm);
272240868Spjd		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
273240868Spjd		ts->ts_start = start;
274240868Spjd		ts->ts_end = end;
275240868Spjd		ts->ts_txg = txg;
276251419Ssmh		ts->ts_time = time;
277240868Spjd		avl_insert(&tm->tm_queued_frees, ts, where);
278240868Spjd		list_insert_tail(&tm->tm_head, ts);
279240868Spjd	}
280240868Spjd}
281240868Spjd
282240868Spjdstatic void
283240868Spjdtrim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
284240868Spjd    uint64_t end)
285240868Spjd{
286240868Spjd	trim_seg_t *nts;
287240868Spjd	boolean_t left_over, right_over;
288240868Spjd
289240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
290240868Spjd
291240868Spjd	left_over = (ts->ts_start < start);
292240868Spjd	right_over = (ts->ts_end > end);
293240868Spjd
294251419Ssmh	TRIM_MAP_SDEC(tm, end - start);
295240868Spjd	if (left_over && right_over) {
296240868Spjd		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
297240868Spjd		nts->ts_start = end;
298240868Spjd		nts->ts_end = ts->ts_end;
299240868Spjd		nts->ts_txg = ts->ts_txg;
300251419Ssmh		nts->ts_time = ts->ts_time;
301240868Spjd		ts->ts_end = start;
302240868Spjd		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
303240868Spjd		list_insert_after(&tm->tm_head, ts, nts);
304251419Ssmh		TRIM_MAP_QINC(tm);
305240868Spjd	} else if (left_over) {
306240868Spjd		ts->ts_end = start;
307240868Spjd	} else if (right_over) {
308240868Spjd		ts->ts_start = end;
309240868Spjd	} else {
310240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
311240868Spjd		list_remove(&tm->tm_head, ts);
312251419Ssmh		TRIM_MAP_QDEC(tm);
313240868Spjd		kmem_free(ts, sizeof (*ts));
314240868Spjd	}
315240868Spjd}
316240868Spjd
317240868Spjdstatic void
318240868Spjdtrim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
319240868Spjd{
320240868Spjd	zio_t zsearch, *zs;
321240868Spjd
322240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
323240868Spjd
324240868Spjd	zsearch.io_offset = start;
325240868Spjd	zsearch.io_size = end - start;
326240868Spjd
327240868Spjd	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
328240868Spjd	if (zs == NULL) {
329240868Spjd		trim_map_segment_add(tm, start, end, txg);
330240868Spjd		return;
331240868Spjd	}
332240868Spjd	if (start < zs->io_offset)
333240868Spjd		trim_map_free_locked(tm, start, zs->io_offset, txg);
334240868Spjd	if (zs->io_offset + zs->io_size < end)
335240868Spjd		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
336240868Spjd}
337240868Spjd
338240868Spjdvoid
339251419Ssmhtrim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
340240868Spjd{
341240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
342240868Spjd
343251419Ssmh	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
344240868Spjd		return;
345240868Spjd
346240868Spjd	mutex_enter(&tm->tm_lock);
347251419Ssmh	trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
348240868Spjd	mutex_exit(&tm->tm_lock);
349240868Spjd}
350240868Spjd
351240868Spjdboolean_t
352240868Spjdtrim_map_write_start(zio_t *zio)
353240868Spjd{
354240868Spjd	vdev_t *vd = zio->io_vd;
355240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
356240868Spjd	trim_seg_t tsearch, *ts;
357240868Spjd	boolean_t left_over, right_over;
358240868Spjd	uint64_t start, end;
359240868Spjd
360251419Ssmh	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
361240868Spjd		return (B_TRUE);
362240868Spjd
363240868Spjd	start = zio->io_offset;
364251419Ssmh	end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
365240868Spjd	tsearch.ts_start = start;
366240868Spjd	tsearch.ts_end = end;
367240868Spjd
368240868Spjd	mutex_enter(&tm->tm_lock);
369240868Spjd
370240868Spjd	/*
371240868Spjd	 * Checking for colliding in-flight frees.
372240868Spjd	 */
373240868Spjd	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
374240868Spjd	if (ts != NULL) {
375240868Spjd		list_insert_tail(&tm->tm_pending_writes, zio);
376240868Spjd		mutex_exit(&tm->tm_lock);
377240868Spjd		return (B_FALSE);
378240868Spjd	}
379240868Spjd
380240868Spjd	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
381240868Spjd	if (ts != NULL) {
382240868Spjd		/*
383240868Spjd		 * Loop until all overlapping segments are removed.
384240868Spjd		 */
385240868Spjd		do {
386240868Spjd			trim_map_segment_remove(tm, ts, start, end);
387240868Spjd			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
388240868Spjd		} while (ts != NULL);
389240868Spjd	}
390240868Spjd	avl_add(&tm->tm_inflight_writes, zio);
391240868Spjd
392240868Spjd	mutex_exit(&tm->tm_lock);
393240868Spjd
394240868Spjd	return (B_TRUE);
395240868Spjd}
396240868Spjd
/*
 * Called when a write zio has completed: take it out of the
 * tm_inflight_writes tree if it appears to be linked into it.
 */
void
trim_map_write_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	trim_map_t *tm = vd->vdev_trimmap;

	/*
	 * Don't check for vdev_notrim, since the write could have
	 * started before vdev_notrim was set.
	 */
	if (!zfs_trim_enabled || tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	/*
	 * Don't fail if the write isn't in the tree, since the write
	 * could have started after vdev_notrim was set.
	 *
	 * Membership is inferred from the AVL node itself: the zio is
	 * treated as linked when its node has a child, has a parent, or
	 * is the root of the tree.
	 */
	if (zio->io_trim_node.avl_child[0] ||
	    zio->io_trim_node.avl_child[1] ||
	    AVL_XPARENT(&zio->io_trim_node) ||
	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
		avl_remove(&tm->tm_inflight_writes, zio);
	mutex_exit(&tm->tm_lock);
}
422240868Spjd
423240868Spjd/*
424251419Ssmh * Return the oldest segment (the one with the lowest txg / time) or NULL if:
425251419Ssmh * 1. The list is empty
426251419Ssmh * 2. The first element's txg is greater than txgsafe
427251419Ssmh * 3. The first element's txg is not greater than the txg argument and the
428251419Ssmh *    the first element's time is not greater than time argument
429240868Spjd */
430240868Spjdstatic trim_seg_t *
431251419Ssmhtrim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time)
432240868Spjd{
433240868Spjd	trim_seg_t *ts;
434240868Spjd
435240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
436251419Ssmh	VERIFY(txgsafe >= txg);
437240868Spjd
438240868Spjd	ts = list_head(&tm->tm_head);
439251419Ssmh	if (ts != NULL && ts->ts_txg <= txgsafe &&
440251419Ssmh	    (ts->ts_txg <= txg || ts->ts_time <= time ||
441251419Ssmh	    tm->tm_bytes > trim_vdev_max_bytes ||
442251419Ssmh	    tm->tm_pending > trim_vdev_max_pending))
443240868Spjd		return (ts);
444240868Spjd	return (NULL);
445240868Spjd}
446240868Spjd
447240868Spjdstatic void
448240868Spjdtrim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
449240868Spjd{
450240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
451240868Spjd	trim_seg_t *ts;
452251419Ssmh	uint64_t size, txgtarget, txgsafe;
453251419Ssmh	hrtime_t timelimit;
454240868Spjd
455240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
456240868Spjd
457240868Spjd	if (tm == NULL)
458240868Spjd		return;
459240868Spjd
460251419Ssmh	timelimit = gethrtime() - trim_timeout * NANOSEC;
461251419Ssmh	if (vd->vdev_isl2cache) {
462251419Ssmh		txgsafe = UINT64_MAX;
463251419Ssmh		txgtarget = UINT64_MAX;
464251419Ssmh	} else {
465251419Ssmh		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
466251419Ssmh		if (txgsafe > trim_txg_delay)
467251419Ssmh			txgtarget = txgsafe - trim_txg_delay;
468251419Ssmh		else
469251419Ssmh			txgtarget = 0;
470251419Ssmh	}
471240868Spjd
472240868Spjd	mutex_enter(&tm->tm_lock);
473251419Ssmh	/* Loop until we have sent all outstanding free's */
474251419Ssmh	while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit))
475251419Ssmh	    != NULL) {
476240868Spjd		list_remove(&tm->tm_head, ts);
477240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
478240868Spjd		avl_add(&tm->tm_inflight_frees, ts);
479251419Ssmh		size = ts->ts_end - ts->ts_start;
480251419Ssmh		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
481251419Ssmh		TRIM_MAP_SDEC(tm, size);
482251419Ssmh		TRIM_MAP_QDEC(tm);
483240868Spjd	}
484240868Spjd	mutex_exit(&tm->tm_lock);
485240868Spjd}
486240868Spjd
487240868Spjdstatic void
488240868Spjdtrim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
489240868Spjd{
490240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
491240868Spjd	trim_seg_t *ts;
492240868Spjd	list_t pending_writes;
493240868Spjd	zio_t *zio;
494240868Spjd	uint64_t start, size;
495240868Spjd	void *cookie;
496240868Spjd
497240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
498240868Spjd
499240868Spjd	if (tm == NULL)
500240868Spjd		return;
501240868Spjd
502240868Spjd	mutex_enter(&tm->tm_lock);
503240868Spjd	if (!avl_is_empty(&tm->tm_inflight_frees)) {
504240868Spjd		cookie = NULL;
505240868Spjd		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
506240868Spjd		    &cookie)) != NULL) {
507240868Spjd			kmem_free(ts, sizeof (*ts));
508240868Spjd		}
509240868Spjd	}
510240868Spjd	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
511240868Spjd	    io_trim_link));
512240868Spjd	list_move_tail(&pending_writes, &tm->tm_pending_writes);
513240868Spjd	mutex_exit(&tm->tm_lock);
514240868Spjd
515240868Spjd	while ((zio = list_remove_head(&pending_writes)) != NULL) {
516240868Spjd		zio_vdev_io_reissue(zio);
517240868Spjd		zio_execute(zio);
518240868Spjd	}
519240868Spjd	list_destroy(&pending_writes);
520240868Spjd}
521240868Spjd
522240868Spjdstatic void
523240868Spjdtrim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
524240868Spjd{
525240868Spjd	int c;
526240868Spjd
527251419Ssmh	if (vd == NULL)
528240868Spjd		return;
529240868Spjd
530240868Spjd	if (vd->vdev_ops->vdev_op_leaf) {
531240868Spjd		trim_map_vdev_commit(spa, zio, vd);
532240868Spjd	} else {
533240868Spjd		for (c = 0; c < vd->vdev_children; c++)
534240868Spjd			trim_map_commit(spa, zio, vd->vdev_child[c]);
535240868Spjd	}
536240868Spjd}
537240868Spjd
538240868Spjdstatic void
539240868Spjdtrim_map_commit_done(spa_t *spa, vdev_t *vd)
540240868Spjd{
541240868Spjd	int c;
542240868Spjd
543240868Spjd	if (vd == NULL)
544240868Spjd		return;
545240868Spjd
546240868Spjd	if (vd->vdev_ops->vdev_op_leaf) {
547240868Spjd		trim_map_vdev_commit_done(spa, vd);
548240868Spjd	} else {
549240868Spjd		for (c = 0; c < vd->vdev_children; c++)
550240868Spjd			trim_map_commit_done(spa, vd->vdev_child[c]);
551240868Spjd	}
552240868Spjd}
553240868Spjd
/*
 * Body of the per-pool TRIM thread (arg is the spa_t).
 *
 * Each pass sleeps on spa_trim_cv for at most trim_max_interval seconds
 * (or until signalled), then issues the eligible TRIMs for the whole
 * vdev tree under a root zio, waits for them and runs the completion
 * step.
 *
 * Shutdown handshake with trim_thread_destroy(): the destroyer sets
 * spa_trim_thread to NULL; this thread notices that at the top of the
 * loop, sets the field back to a non-NULL value, signals the cv and
 * exits.
 */
static void
trim_thread(void *arg)
{
	spa_t *spa = arg;
	zio_t *zio;

#ifdef _KERNEL
	/* Name the thread after the pool. */
	(void) snprintf(curthread->td_name, sizeof(curthread->td_name),
	    "trim %s", spa_name(spa));
#endif

	for (;;) {
		mutex_enter(&spa->spa_trim_lock);
		if (spa->spa_trim_thread == NULL) {
			/* Asked to stop: acknowledge and exit. */
			spa->spa_trim_thread = curthread;
			cv_signal(&spa->spa_trim_cv);
			mutex_exit(&spa->spa_trim_lock);
			thread_exit();
		}

		/* Return value ignored: timeout and wakeup are handled alike. */
		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
		    hz * trim_max_interval);
		mutex_exit(&spa->spa_trim_lock);

		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		trim_map_commit(spa, zio, spa->spa_root_vdev);
		/* All TRIM zios are children of zio; wait for them here. */
		(void) zio_wait(zio);
		trim_map_commit_done(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);
	}
}
587240868Spjd
/*
 * Initialise the pool's TRIM lock and condition variable and start its
 * TRIM thread.  Does nothing when TRIM support is disabled.
 */
void
trim_thread_create(spa_t *spa)
{

	if (!zfs_trim_enabled)
		return;

	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * Create the thread with spa_trim_lock held: the first thing
	 * trim_thread() does is take that lock and test spa_trim_thread,
	 * so it cannot see the field before it has been assigned.
	 */
	mutex_enter(&spa->spa_trim_lock);
	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
	    TS_RUN, minclsyspri);
	mutex_exit(&spa->spa_trim_lock);
}
602240868Spjd
/*
 * Stop the pool's TRIM thread, wait until it has acknowledged the
 * request, then destroy the TRIM lock and condition variable.  Does
 * nothing when TRIM support is disabled or no thread is recorded.
 */
void
trim_thread_destroy(spa_t *spa)
{

	if (!zfs_trim_enabled)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	/* Setting spa_trim_thread to NULL tells the thread to stop. */
	spa->spa_trim_thread = NULL;
	cv_signal(&spa->spa_trim_cv);
	/* The thread will set it back to != NULL on exit. */
	while (spa->spa_trim_thread == NULL)
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
	/* Handshake complete: clear the field for good. */
	spa->spa_trim_thread = NULL;
	mutex_exit(&spa->spa_trim_lock);

	cv_destroy(&spa->spa_trim_cv);
	mutex_destroy(&spa->spa_trim_lock);
}
625240868Spjd
626240868Spjdvoid
627240868Spjdtrim_thread_wakeup(spa_t *spa)
628240868Spjd{
629240868Spjd
630251419Ssmh	if (!zfs_trim_enabled)
631240868Spjd		return;
632240868Spjd	if (spa->spa_trim_thread == NULL)
633240868Spjd		return;
634240868Spjd
635240868Spjd	mutex_enter(&spa->spa_trim_lock);
636240868Spjd	cv_signal(&spa->spa_trim_cv);
637240868Spjd	mutex_exit(&spa->spa_trim_lock);
638240868Spjd}
639