trim_map.c revision 248575
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
 * All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/trim_map.h>
#include <sys/time.h>

/*
 * Calculate the zio end, rounding the size up to the alignment implied
 * by the top-level vdev's ashift, as zio_vdev_io_start would do.
 *
 * This makes free-range consolidation much more effective than it
 * would otherwise be, and ensures that entire blocks are invalidated
 * by writes.
 */
#define	TRIM_ZIO_END(vd, offset, size)	((offset) +		\
	P2ROUNDUP(size, 1ULL << (vd)->vdev_top->vdev_ashift))
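/*
 * For example, with a top-level vdev_ashift of 12 (4 KB sectors), a
 * 1 KB write at offset 8192 gives TRIM_ZIO_END(vd, 8192, 1024) =
 * 8192 + P2ROUNDUP(1024, 4096) = 12288, i.e. the range is extended
 * to cover the whole physical sector.
 */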

typedef struct trim_map {
	list_t		tm_head;		/* List of segments sorted by txg. */
	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
	kmutex_t	tm_lock;
} trim_map_t;

typedef struct trim_seg {
	avl_node_t	ts_node;	/* AVL node. */
	list_node_t	ts_next;	/* List element. */
	uint64_t	ts_start;	/* Starting offset of this segment. */
	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
	uint64_t	ts_txg;		/* Segment creation txg. */
	hrtime_t	ts_time;	/* Segment creation time. */
} trim_seg_t;

extern boolean_t zfs_notrim;

SYSCTL_DECL(_vfs_zfs);
/* Delay TRIMs by this many TXGs. */
static int trim_txg_limit = 64;
TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit);
SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0,
    "Delay TRIMs by this many TXGs.");

/* Delay TRIMs by this many seconds for cache devices. */
static int trim_l2arc_limit = 30;
TUNABLE_INT("vfs.zfs.trim_l2arc_limit", &trim_l2arc_limit);
SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_l2arc_limit, CTLFLAG_RW, &trim_l2arc_limit, 0,
    "Delay TRIMs by this many seconds for cache devices.");

static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);

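/*
 * Comparator for trim segments: two segments compare equal if they
 * overlap, so avl_find() with a range locates any colliding segment.
 * This is safe because segments stored in the trees never overlap
 * one another; overlaps are resolved on insert and remove.
 */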
static int
trim_map_seg_compare(const void *x1, const void *x2)
{
	const trim_seg_t *s1 = x1;
	const trim_seg_t *s2 = x2;

	if (s1->ts_start < s2->ts_start) {
		if (s1->ts_end > s2->ts_start)
			return (0);
		return (-1);
	}
	if (s1->ts_start > s2->ts_start) {
		if (s1->ts_start < s2->ts_end)
			return (0);
		return (1);
	}
	return (0);
}

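/*
 * Comparator for in-flight write zios, using the same overlap-equals
 * convention as trim_map_seg_compare: avl_find() with a range returns
 * a write that intersects that range.
 */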
static int
trim_map_zio_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset) {
		if (z1->io_offset + z1->io_size > z2->io_offset)
			return (0);
		return (-1);
	}
	if (z1->io_offset > z2->io_offset) {
		if (z1->io_offset < z2->io_offset + z2->io_size)
			return (0);
		return (1);
	}
	return (0);
}

void
trim_map_create(vdev_t *vd)
{
	trim_map_t *tm;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (zfs_notrim)
		return;

	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&tm->tm_head, sizeof (trim_seg_t),
	    offsetof(trim_seg_t, ts_next));
	list_create(&tm->tm_pending_writes, sizeof (zio_t),
	    offsetof(zio_t, io_trim_link));
	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
	vd->vdev_trimmap = tm;
}

void
trim_map_destroy(vdev_t *vd)
{
	trim_map_t *tm;
	trim_seg_t *ts;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (zfs_notrim)
		return;

	tm = vd->vdev_trimmap;
	if (tm == NULL)
		return;

	/*
	 * We may have been called before trim_map_vdev_commit_done()
	 * had a chance to run, so do it now to prune the remaining
	 * inflight frees.
	 */
	trim_map_vdev_commit_done(vd->vdev_spa, vd);

	mutex_enter(&tm->tm_lock);
	while ((ts = list_head(&tm->tm_head)) != NULL) {
		avl_remove(&tm->tm_queued_frees, ts);
		list_remove(&tm->tm_head, ts);
		kmem_free(ts, sizeof (*ts));
	}
	mutex_exit(&tm->tm_lock);

	avl_destroy(&tm->tm_queued_frees);
	avl_destroy(&tm->tm_inflight_frees);
	avl_destroy(&tm->tm_inflight_writes);
	list_destroy(&tm->tm_pending_writes);
	list_destroy(&tm->tm_head);
	mutex_destroy(&tm->tm_lock);
	kmem_free(tm, sizeof (*tm));
	vd->vdev_trimmap = NULL;
}

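/*
 * Add a free segment to the queued-frees tree, extending a neighboring
 * segment from the same txg when the two abut, and recursing to cover
 * any part of the range not already overlapped by an existing segment.
 */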
static void
trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
	avl_index_t where;
	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
	boolean_t merge_before, merge_after;
	hrtime_t time;

	ASSERT(MUTEX_HELD(&tm->tm_lock));
	VERIFY(start < end);

	time = gethrtime();
	tsearch.ts_start = start;
	tsearch.ts_end = end;

	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
	if (ts != NULL) {
		if (start < ts->ts_start)
			trim_map_segment_add(tm, start, ts->ts_start, txg);
		if (end > ts->ts_end)
			trim_map_segment_add(tm, ts->ts_end, end, txg);
		return;
	}

	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);

	merge_before = (ts_before != NULL && ts_before->ts_end == start &&
	    ts_before->ts_txg == txg);
	merge_after = (ts_after != NULL && ts_after->ts_start == end &&
	    ts_after->ts_txg == txg);

	if (merge_before && merge_after) {
		avl_remove(&tm->tm_queued_frees, ts_before);
		list_remove(&tm->tm_head, ts_before);
		ts_after->ts_start = ts_before->ts_start;
		kmem_free(ts_before, sizeof (*ts_before));
	} else if (merge_before) {
		ts_before->ts_end = end;
	} else if (merge_after) {
		ts_after->ts_start = start;
	} else {
		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
		ts->ts_start = start;
		ts->ts_end = end;
		ts->ts_txg = txg;
		ts->ts_time = time;
		avl_insert(&tm->tm_queued_frees, ts, where);
		list_insert_tail(&tm->tm_head, ts);
	}
}

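/*
 * Remove the range [start, end) from an existing segment, trimming it
 * on the left or right, splitting it in two when the range punches a
 * hole in the middle, or dropping it entirely when fully covered.
 */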
static void
trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
    uint64_t end)
{
	trim_seg_t *nts;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	left_over = (ts->ts_start < start);
	right_over = (ts->ts_end > end);

	if (left_over && right_over) {
		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
		nts->ts_start = end;
		nts->ts_end = ts->ts_end;
		nts->ts_txg = ts->ts_txg;
		nts->ts_time = ts->ts_time;
		ts->ts_end = start;
		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
		list_insert_after(&tm->tm_head, ts, nts);
	} else if (left_over) {
		ts->ts_end = start;
	} else if (right_over) {
		ts->ts_start = end;
	} else {
		avl_remove(&tm->tm_queued_frees, ts);
		list_remove(&tm->tm_head, ts);
		kmem_free(ts, sizeof (*ts));
	}
}

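/*
 * Queue the free range [start, end), splitting it around any in-flight
 * writes it overlaps so that a later TRIM cannot invalidate data that
 * is being written concurrently.
 */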
static void
trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
	zio_t zsearch, *zs;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	zsearch.io_offset = start;
	zsearch.io_size = end - start;

	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
	if (zs == NULL) {
		trim_map_segment_add(tm, start, end, txg);
		return;
	}
	if (start < zs->io_offset)
		trim_map_free_locked(tm, start, zs->io_offset, txg);
	if (zs->io_offset + zs->io_size < end)
		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
}

void
trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
{
	trim_map_t *tm = vd->vdev_trimmap;

	if (zfs_notrim || vd->vdev_notrim || tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
	mutex_exit(&tm->tm_lock);
}

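/*
 * Account for a write that is about to start.  Returns B_TRUE if the
 * write may proceed (it is then tracked in tm_inflight_writes and the
 * caller must pair it with trim_map_write_done()), or B_FALSE if it
 * collides with an in-flight free, in which case the zio is queued on
 * tm_pending_writes and reissued by trim_map_vdev_commit_done().
 */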
boolean_t
trim_map_write_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t tsearch, *ts;
	uint64_t start, end;

	if (zfs_notrim || vd->vdev_notrim || tm == NULL)
		return (B_TRUE);

	start = zio->io_offset;
	end = TRIM_ZIO_END(vd, start, zio->io_size);
	tsearch.ts_start = start;
	tsearch.ts_end = end;

	mutex_enter(&tm->tm_lock);

	/*
	 * Check for colliding in-flight frees.
	 */
	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
	if (ts != NULL) {
		list_insert_tail(&tm->tm_pending_writes, zio);
		mutex_exit(&tm->tm_lock);
		return (B_FALSE);
	}

	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
	if (ts != NULL) {
		/*
		 * Loop until all overlapping segments are removed.
		 */
		do {
			trim_map_segment_remove(tm, ts, start, end);
			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
		} while (ts != NULL);
	}
	avl_add(&tm->tm_inflight_writes, zio);

	mutex_exit(&tm->tm_lock);

	return (B_TRUE);
}

void
trim_map_write_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	trim_map_t *tm = vd->vdev_trimmap;

	/*
	 * Don't check for vdev_notrim, since the write could have
	 * started before vdev_notrim was set.
	 */
	if (zfs_notrim || tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	/*
	 * Don't fail if the write isn't in the tree, since the write
	 * could have started after vdev_notrim was set.
	 */
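	/*
	 * There is no avl_contains() helper, so inspect the node's AVL
	 * linkage directly to tell whether this zio was ever inserted
	 * into the tree.
	 */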
	if (zio->io_trim_node.avl_child[0] ||
	    zio->io_trim_node.avl_child[1] ||
	    AVL_XPARENT(&zio->io_trim_node) ||
	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
		avl_remove(&tm->tm_inflight_writes, zio);
	mutex_exit(&tm->tm_lock);
}

/*
 * Return the oldest segment (the head of the txg-sorted list) if both
 * its txg and its creation time are no greater than the given limits;
 * return NULL if the list is empty or the head exceeds either limit.
 */
static trim_seg_t *
trim_map_first(trim_map_t *tm, uint64_t txg, hrtime_t time)
{
	trim_seg_t *ts;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	ts = list_head(&tm->tm_head);
	if (ts != NULL && ts->ts_txg <= txg && ts->ts_time <= time)
		return (ts);
	return (NULL);
}

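/*
 * Issue TRIM zios for all queued segments on this leaf vdev that are
 * old enough, moving them to tm_inflight_frees so that colliding
 * writes are held off until the TRIMs complete.
 */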
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	uint64_t txglimit;
	hrtime_t timelimit;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

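	/*
	 * Cache devices do not take part in the txg commit cycle, so
	 * their frees are aged out by wall-clock time rather than by
	 * txg; for pool vdevs it is the other way around.
	 */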
	if (vd->vdev_isl2cache) {
		timelimit = gethrtime() - trim_l2arc_limit * NANOSEC;
		txglimit = UINT64_MAX;
	} else {
		timelimit = TIME_MAX;
		txglimit = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)) -
		    trim_txg_limit;
	}

	mutex_enter(&tm->tm_lock);
	/*
	 * Loop until we've issued all frees up to the txg limit, or up
	 * to the time limit if this is a cache device.
	 */
	while ((ts = trim_map_first(tm, txglimit, timelimit)) != NULL) {
		list_remove(&tm->tm_head, ts);
		avl_remove(&tm->tm_queued_frees, ts);
		avl_add(&tm->tm_inflight_frees, ts);
		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start,
		    ts->ts_end - ts->ts_start));
	}
	mutex_exit(&tm->tm_lock);
}

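/*
 * Called once the TRIM zios issued by trim_map_vdev_commit() have
 * completed: discard the in-flight free segments and reissue any
 * writes that were blocked on them.
 */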
static void
trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	list_t pending_writes;
	zio_t *zio;
	void *cookie;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	if (!avl_is_empty(&tm->tm_inflight_frees)) {
		cookie = NULL;
		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
		    &cookie)) != NULL) {
			kmem_free(ts, sizeof (*ts));
		}
	}
	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
	    io_trim_link));
	list_move_tail(&pending_writes, &tm->tm_pending_writes);
	mutex_exit(&tm->tm_lock);

	while ((zio = list_remove_head(&pending_writes)) != NULL) {
		zio_vdev_io_reissue(zio);
		zio_execute(zio);
	}
	list_destroy(&pending_writes);
}

static void
trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	int c;

	if (vd == NULL || spa_last_synced_txg(spa) <= trim_txg_limit)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		trim_map_vdev_commit(spa, zio, vd);
	} else {
		for (c = 0; c < vd->vdev_children; c++)
			trim_map_commit(spa, zio, vd->vdev_child[c]);
	}
}

static void
trim_map_commit_done(spa_t *spa, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		trim_map_vdev_commit_done(spa, vd);
	} else {
		for (c = 0; c < vd->vdev_children; c++)
			trim_map_commit_done(spa, vd->vdev_child[c]);
	}
}

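/*
 * Per-pool TRIM thread.  It sleeps on spa_trim_cv until woken by
 * trim_thread_wakeup(), then commits eligible frees across the vdev
 * tree.  Clearing spa_trim_thread asks it to exit; it acknowledges by
 * setting the field to curthread before calling thread_exit().
 */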
static void
trim_thread(void *arg)
{
	spa_t *spa = arg;
	zio_t *zio;

	for (;;) {
		mutex_enter(&spa->spa_trim_lock);
		if (spa->spa_trim_thread == NULL) {
			spa->spa_trim_thread = curthread;
			cv_signal(&spa->spa_trim_cv);
			mutex_exit(&spa->spa_trim_lock);
			thread_exit();
		}
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
		mutex_exit(&spa->spa_trim_lock);

		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		trim_map_commit(spa, zio, spa->spa_root_vdev);
		(void) zio_wait(zio);
		trim_map_commit_done(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);
	}
}

void
trim_thread_create(spa_t *spa)
{

	if (zfs_notrim)
		return;

	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
	mutex_enter(&spa->spa_trim_lock);
	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
	    TS_RUN, minclsyspri);
	mutex_exit(&spa->spa_trim_lock);
}

void
trim_thread_destroy(spa_t *spa)
{

	if (zfs_notrim)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	/* Setting spa_trim_thread to NULL tells the thread to stop. */
	spa->spa_trim_thread = NULL;
	cv_signal(&spa->spa_trim_cv);
	/* The thread sets it to non-NULL again just before exiting. */
	while (spa->spa_trim_thread == NULL)
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
	spa->spa_trim_thread = NULL;
	mutex_exit(&spa->spa_trim_lock);

	cv_destroy(&spa->spa_trim_cv);
	mutex_destroy(&spa->spa_trim_lock);
}

void
trim_thread_wakeup(spa_t *spa)
{

	if (zfs_notrim)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	cv_signal(&spa->spa_trim_cv);
	mutex_exit(&spa->spa_trim_lock);
}