trim_map.c revision 248572
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
 * All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/trim_map.h>

/*
 * Calculate the zio end, rounding the size up based on ashift, as
 * zio_vdev_io_start would do.
 *
 * This makes free range consolidation much more effective than it
 * would otherwise be, and it ensures that entire blocks are
 * invalidated by writes.
 */
#define	TRIM_ZIO_END(vd, offset, size)	(offset +		\
 	P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
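
/*
 * For example, assuming an ashift of 12 (4 KB allocation units; the
 * value here is purely illustrative), TRIM_ZIO_END(vd, 8192, 1000)
 * rounds the 1000-byte size up to 4096 and evaluates to 12288.
 */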

typedef struct trim_map {
	list_t		tm_head;		/* List of segments sorted by txg. */
	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
	kmutex_t	tm_lock;
} trim_map_t;

typedef struct trim_seg {
	avl_node_t	ts_node;	/* AVL node. */
	list_node_t	ts_next;	/* List element. */
	uint64_t	ts_start;	/* Starting offset of this segment. */
	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
	uint64_t	ts_txg;		/* Segment creation txg. */
} trim_seg_t;

extern boolean_t zfs_notrim;

SYSCTL_DECL(_vfs_zfs);
/* Delay TRIMs by this many TXGs. */
static int trim_txg_limit = 64;
TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit);
SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0,
    "Delay TRIMs by this many TXGs.");

static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);

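/*
 * Note on the comparator below: segments that overlap compare as
 * equal, so an avl_find() on tm_queued_frees or tm_inflight_frees
 * returns a segment that overlaps the search key, not necessarily
 * an exact match.
 */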
static int
trim_map_seg_compare(const void *x1, const void *x2)
{
	const trim_seg_t *s1 = x1;
	const trim_seg_t *s2 = x2;

	if (s1->ts_start < s2->ts_start) {
		if (s1->ts_end > s2->ts_start)
			return (0);
		return (-1);
	}
	if (s1->ts_start > s2->ts_start) {
		if (s1->ts_start < s2->ts_end)
			return (0);
		return (1);
	}
	return (0);
}

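/*
 * Same convention for in-flight writes: a zio whose range
 * [io_offset, io_offset + io_size) overlaps the search key compares
 * as equal.
 */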
static int
trim_map_zio_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset) {
		if (z1->io_offset + z1->io_size > z2->io_offset)
			return (0);
		return (-1);
	}
	if (z1->io_offset > z2->io_offset) {
		if (z1->io_offset < z2->io_offset + z2->io_size)
			return (0);
		return (1);
	}
	return (0);
}

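/*
 * Allocate and initialize the trim map for a leaf vdev.  This is a
 * no-op when TRIM support is disabled globally (zfs_notrim).
 */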
void
trim_map_create(vdev_t *vd)
{
	trim_map_t *tm;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (zfs_notrim)
		return;

	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&tm->tm_head, sizeof (trim_seg_t),
	    offsetof(trim_seg_t, ts_next));
	list_create(&tm->tm_pending_writes, sizeof (zio_t),
	    offsetof(zio_t, io_trim_link));
	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
	vd->vdev_trimmap = tm;
}

void
trim_map_destroy(vdev_t *vd)
{
	trim_map_t *tm;
	trim_seg_t *ts;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (zfs_notrim)
		return;

	tm = vd->vdev_trimmap;
	if (tm == NULL)
		return;

	/*
	 * We may have been called before trim_map_vdev_commit_done()
	 * had a chance to run, so do it now to prune the remaining
	 * inflight frees.
	 */
	trim_map_vdev_commit_done(vd->vdev_spa, vd);

	mutex_enter(&tm->tm_lock);
	while ((ts = list_head(&tm->tm_head)) != NULL) {
		avl_remove(&tm->tm_queued_frees, ts);
		list_remove(&tm->tm_head, ts);
		kmem_free(ts, sizeof (*ts));
	}
	mutex_exit(&tm->tm_lock);

	avl_destroy(&tm->tm_queued_frees);
	avl_destroy(&tm->tm_inflight_frees);
	avl_destroy(&tm->tm_inflight_writes);
	list_destroy(&tm->tm_pending_writes);
	list_destroy(&tm->tm_head);
	mutex_destroy(&tm->tm_lock);
	kmem_free(tm, sizeof (*tm));
	vd->vdev_trimmap = NULL;
}

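/*
 * Add the range [start, end) to the queued-frees tree.  Overlaps with
 * existing segments are handled by recursing on the uncovered pieces,
 * and segments that become physically adjacent within the same txg
 * are merged, so that frees coalesce into larger TRIM requests.
 */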
static void
trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
	avl_index_t where;
	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
	boolean_t merge_before, merge_after;

	ASSERT(MUTEX_HELD(&tm->tm_lock));
	VERIFY(start < end);

	tsearch.ts_start = start;
	tsearch.ts_end = end;

	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
	if (ts != NULL) {
		if (start < ts->ts_start)
			trim_map_segment_add(tm, start, ts->ts_start, txg);
		if (end > ts->ts_end)
			trim_map_segment_add(tm, ts->ts_end, end, txg);
		return;
	}

	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);

	merge_before = (ts_before != NULL && ts_before->ts_end == start &&
	    ts_before->ts_txg == txg);
	merge_after = (ts_after != NULL && ts_after->ts_start == end &&
	    ts_after->ts_txg == txg);

	if (merge_before && merge_after) {
		avl_remove(&tm->tm_queued_frees, ts_before);
		list_remove(&tm->tm_head, ts_before);
		ts_after->ts_start = ts_before->ts_start;
		kmem_free(ts_before, sizeof (*ts_before));
	} else if (merge_before) {
		ts_before->ts_end = end;
	} else if (merge_after) {
		ts_after->ts_start = start;
	} else {
		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
		ts->ts_start = start;
		ts->ts_end = end;
		ts->ts_txg = txg;
		avl_insert(&tm->tm_queued_frees, ts, where);
		list_insert_tail(&tm->tm_head, ts);
	}
}

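/*
 * Remove the range [start, end) from a queued segment: trim one side,
 * split the segment in two, or delete it outright, depending on how
 * the range overlaps the segment.
 */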
static void
trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
    uint64_t end)
{
	trim_seg_t *nts;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	left_over = (ts->ts_start < start);
	right_over = (ts->ts_end > end);

	if (left_over && right_over) {
		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
		nts->ts_start = end;
		nts->ts_end = ts->ts_end;
		nts->ts_txg = ts->ts_txg;
		ts->ts_end = start;
		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
		list_insert_after(&tm->tm_head, ts, nts);
	} else if (left_over) {
		ts->ts_end = start;
	} else if (right_over) {
		ts->ts_start = end;
	} else {
		avl_remove(&tm->tm_queued_frees, ts);
		list_remove(&tm->tm_head, ts);
		kmem_free(ts, sizeof (*ts));
	}
}

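/*
 * Record a freed range, recursing around any overlapping in-flight
 * write so that only the pieces not covered by the write are queued
 * for TRIM.
 */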
static void
trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
	zio_t zsearch, *zs;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	zsearch.io_offset = start;
	zsearch.io_size = end - start;

	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
	if (zs == NULL) {
		trim_map_segment_add(tm, start, end, txg);
		return;
	}
	if (start < zs->io_offset)
		trim_map_free_locked(tm, start, zs->io_offset, txg);
	if (zs->io_offset + zs->io_size < end)
		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
}

void
trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size)
{
	trim_map_t *tm = vd->vdev_trimmap;

	if (zfs_notrim || vd->vdev_notrim || tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size),
	    vd->vdev_spa->spa_syncing_txg);
	mutex_exit(&tm->tm_lock);
}

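/*
 * Called before a write is issued to a leaf vdev.  Returns B_TRUE if
 * the write may proceed (any queued frees it overlaps have been
 * removed), or B_FALSE if it collides with an in-flight free, in
 * which case the zio has been parked on tm_pending_writes and will be
 * reissued by trim_map_vdev_commit_done().
 */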
boolean_t
trim_map_write_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t tsearch, *ts;
	uint64_t start, end;

	if (zfs_notrim || vd->vdev_notrim || tm == NULL)
		return (B_TRUE);

	start = zio->io_offset;
	end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
	tsearch.ts_start = start;
	tsearch.ts_end = end;

	mutex_enter(&tm->tm_lock);

	/*
	 * Check for colliding in-flight frees.
	 */
	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
	if (ts != NULL) {
		list_insert_tail(&tm->tm_pending_writes, zio);
		mutex_exit(&tm->tm_lock);
		return (B_FALSE);
	}

	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
	if (ts != NULL) {
		/*
		 * Loop until all overlapping segments are removed.
		 */
		do {
			trim_map_segment_remove(tm, ts, start, end);
			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
		} while (ts != NULL);
	}
	avl_add(&tm->tm_inflight_writes, zio);

	mutex_exit(&tm->tm_lock);

	return (B_TRUE);
}

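/*
 * Called when a write to a leaf vdev completes; removes the zio from
 * the in-flight writes tree if trim_map_write_start() added it there.
 */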
void
trim_map_write_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	trim_map_t *tm = vd->vdev_trimmap;

	/*
	 * Don't check for vdev_notrim, since the write could have
	 * started before vdev_notrim was set.
	 */
	if (zfs_notrim || tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	/*
	 * Don't fail if the write isn't in the tree, since the write
	 * could have started after vdev_notrim was set.  A zio that is
	 * linked into the tree has a child, a parent, or is the root.
	 */
	if (zio->io_trim_node.avl_child[0] ||
	    zio->io_trim_node.avl_child[1] ||
	    AVL_XPARENT(&zio->io_trim_node) ||
	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
		avl_remove(&tm->tm_inflight_writes, zio);
	mutex_exit(&tm->tm_lock);
}

/*
 * Return the oldest segment (the one with the lowest txg), or NULL if
 * the list is empty or the first element's txg is greater than the
 * txg given as the function argument.
 */
static trim_seg_t *
trim_map_first(trim_map_t *tm, uint64_t txg)
{
	trim_seg_t *ts;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	ts = list_head(&tm->tm_head);
	if (ts != NULL && ts->ts_txg <= txg)
		return (ts);
	return (NULL);
}

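/*
 * Issue TRIM zios for all queued segments that are at least
 * trim_txg_limit TXGs old, moving each one to the in-flight frees
 * tree until trim_map_vdev_commit_done() reaps it.
 */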
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	uint64_t txglimit;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) -
	    trim_txg_limit;

	mutex_enter(&tm->tm_lock);
	/*
	 * Loop until we send all frees up to the txglimit.
	 */
	while ((ts = trim_map_first(tm, txglimit)) != NULL) {
		list_remove(&tm->tm_head, ts);
		avl_remove(&tm->tm_queued_frees, ts);
		avl_add(&tm->tm_inflight_frees, ts);
		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start,
		    ts->ts_end - ts->ts_start));
	}
	mutex_exit(&tm->tm_lock);
}

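/*
 * Reap the in-flight frees issued by trim_map_vdev_commit() and
 * reissue any writes that were blocked on them.
 */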
static void
trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	list_t pending_writes;
	zio_t *zio;
	void *cookie;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	if (!avl_is_empty(&tm->tm_inflight_frees)) {
		cookie = NULL;
		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
		    &cookie)) != NULL) {
			kmem_free(ts, sizeof (*ts));
		}
	}
	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
	    io_trim_link));
	list_move_tail(&pending_writes, &tm->tm_pending_writes);
	mutex_exit(&tm->tm_lock);

	while ((zio = list_remove_head(&pending_writes)) != NULL) {
		zio_vdev_io_reissue(zio);
		zio_execute(zio);
	}
	list_destroy(&pending_writes);
}

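/*
 * Recurse over the vdev tree, committing queued frees on every leaf.
 * This is skipped entirely until the pool is old enough for a free to
 * have aged past trim_txg_limit.
 */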
static void
trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	int c;

	if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		trim_map_vdev_commit(spa, zio, vd);
	} else {
		for (c = 0; c < vd->vdev_children; c++)
			trim_map_commit(spa, zio, vd->vdev_child[c]);
	}
}

static void
trim_map_commit_done(spa_t *spa, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		trim_map_vdev_commit_done(spa, vd);
	} else {
		for (c = 0; c < vd->vdev_children; c++)
			trim_map_commit_done(spa, vd->vdev_child[c]);
	}
}

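/*
 * Per-pool TRIM thread: sleep until trim_thread_wakeup() is called,
 * then commit aged frees under a root zio and reissue any blocked
 * writes.  trim_thread_destroy() requests exit by setting
 * spa_trim_thread to NULL; the thread acknowledges by storing
 * curthread back and signalling spa_trim_cv before exiting.
 */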
static void
trim_thread(void *arg)
{
	spa_t *spa = arg;
	zio_t *zio;

	for (;;) {
		mutex_enter(&spa->spa_trim_lock);
		if (spa->spa_trim_thread == NULL) {
			spa->spa_trim_thread = curthread;
			cv_signal(&spa->spa_trim_cv);
			mutex_exit(&spa->spa_trim_lock);
			thread_exit();
		}
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
		mutex_exit(&spa->spa_trim_lock);

		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		trim_map_commit(spa, zio, spa->spa_root_vdev);
		(void) zio_wait(zio);
		trim_map_commit_done(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);
	}
}

void
trim_thread_create(spa_t *spa)
{

	if (zfs_notrim)
		return;

	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
	mutex_enter(&spa->spa_trim_lock);
	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
	    TS_RUN, minclsyspri);
	mutex_exit(&spa->spa_trim_lock);
}

void
trim_thread_destroy(spa_t *spa)
{

	if (zfs_notrim)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	/* Setting spa_trim_thread to NULL tells the thread to stop. */
	spa->spa_trim_thread = NULL;
	cv_signal(&spa->spa_trim_cv);
	/* The thread will set it back to != NULL on exit. */
	while (spa->spa_trim_thread == NULL)
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
	spa->spa_trim_thread = NULL;
	mutex_exit(&spa->spa_trim_lock);

	cv_destroy(&spa->spa_trim_cv);
	mutex_destroy(&spa->spa_trim_lock);
}

void
trim_thread_wakeup(spa_t *spa)
{

	if (zfs_notrim)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	cv_signal(&spa->spa_trim_cv);
	mutex_exit(&spa->spa_trim_lock);
}
