1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
23 * All rights reserved.
24 */
25
26#include <sys/zfs_context.h>
27#include <sys/spa_impl.h>
28#include <sys/vdev_impl.h>
29#include <sys/trim_map.h>
30#include <sys/time.h>
31
32/*
33 * Calculate the zio end, upgrading based on ashift which would be
34 * done by zio_vdev_io_start.
35 *
36 * This makes free range consolidation much more effective
37 * than it would otherwise be as well as ensuring that entire
38 * blocks are invalidated by writes.
39 */
40#define	TRIM_ZIO_END(vd, offset, size)	(offset +		\
41 	P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
42
43/* Maximal segment size for ATA TRIM. */
44#define TRIM_MAP_SIZE_FACTOR	(512 << 16)
45
46#define TRIM_MAP_SEGS(size)	(1 + (size) / TRIM_MAP_SIZE_FACTOR)
47
48#define TRIM_MAP_ADD(tm, ts)	do {				\
49	list_insert_tail(&(tm)->tm_head, (ts));			\
50	(tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
51} while (0)
52
53#define TRIM_MAP_REM(tm, ts)	do {				\
54	list_remove(&(tm)->tm_head, (ts));			\
55	(tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
56} while (0)
57
/*
 * Per-vdev TRIM state.  Freed ranges accumulate in tm_queued_frees (and
 * on tm_head in txg order) until old enough to TRIM; they then move to
 * tm_inflight_frees while their TRIM zios run.  Writes overlapping an
 * in-flight TRIM are parked on tm_pending_writes.  All fields are
 * protected by tm_lock.
 */
typedef struct trim_map {
	list_t		tm_head;		/* List of segments sorted by txg. */
	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
	kmutex_t	tm_lock;
	uint64_t	tm_pending;		/* Count of pending TRIM segments. */
} trim_map_t;

/* A contiguous freed range [ts_start, ts_end) queued for TRIM. */
typedef struct trim_seg {
	avl_node_t	ts_node;	/* AVL node. */
	list_node_t	ts_next;	/* List element. */
	uint64_t	ts_start;	/* Starting offset of this segment. */
	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
	uint64_t	ts_txg;		/* Segment creation txg. */
	hrtime_t	ts_time;	/* Segment creation time. */
} trim_seg_t;
76
extern boolean_t zfs_trim_enabled;

/* Tunables controlling how long frees age before TRIM and how often we run. */
static u_int trim_txg_delay = 32;	/* Keep deleted data up to 32 TXG */
static u_int trim_timeout = 30;		/* Keep deleted data up to 30s */
static u_int trim_max_interval = 1;	/* 1s delays between TRIMs */
static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");

SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
    0, "Delay TRIMs by up to this many TXGs");
SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
    "Delay TRIMs by up to this many seconds");
SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
    &trim_max_interval, 0,
    "Maximum interval between TRIM queue processing (seconds)");

SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
    &trim_vdev_max_pending, 0,
    "Maximum pending TRIM segments for a vdev");

static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
101
102static int
103trim_map_seg_compare(const void *x1, const void *x2)
104{
105	const trim_seg_t *s1 = x1;
106	const trim_seg_t *s2 = x2;
107
108	if (s1->ts_start < s2->ts_start) {
109		if (s1->ts_end > s2->ts_start)
110			return (0);
111		return (-1);
112	}
113	if (s1->ts_start > s2->ts_start) {
114		if (s1->ts_start < s2->ts_end)
115			return (0);
116		return (1);
117	}
118	return (0);
119}
120
121static int
122trim_map_zio_compare(const void *x1, const void *x2)
123{
124	const zio_t *z1 = x1;
125	const zio_t *z2 = x2;
126
127	if (z1->io_offset < z2->io_offset) {
128		if (z1->io_offset + z1->io_size > z2->io_offset)
129			return (0);
130		return (-1);
131	}
132	if (z1->io_offset > z2->io_offset) {
133		if (z1->io_offset < z2->io_offset + z2->io_size)
134			return (0);
135		return (1);
136	}
137	return (0);
138}
139
140void
141trim_map_create(vdev_t *vd)
142{
143	trim_map_t *tm;
144
145	ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
146		vd->vdev_ops->vdev_op_leaf);
147
148	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
149	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
150	list_create(&tm->tm_head, sizeof (trim_seg_t),
151	    offsetof(trim_seg_t, ts_next));
152	list_create(&tm->tm_pending_writes, sizeof (zio_t),
153	    offsetof(zio_t, io_trim_link));
154	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
155	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
156	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
157	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
158	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
159	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
160	vd->vdev_trimmap = tm;
161}
162
163void
164trim_map_destroy(vdev_t *vd)
165{
166	trim_map_t *tm;
167	trim_seg_t *ts;
168
169	ASSERT(vd->vdev_ops->vdev_op_leaf);
170
171	if (!zfs_trim_enabled)
172		return;
173
174	tm = vd->vdev_trimmap;
175	if (tm == NULL)
176		return;
177
178	/*
179	 * We may have been called before trim_map_vdev_commit_done()
180	 * had a chance to run, so do it now to prune the remaining
181	 * inflight frees.
182	 */
183	trim_map_vdev_commit_done(vd->vdev_spa, vd);
184
185	mutex_enter(&tm->tm_lock);
186	while ((ts = list_head(&tm->tm_head)) != NULL) {
187		avl_remove(&tm->tm_queued_frees, ts);
188		TRIM_MAP_REM(tm, ts);
189		kmem_free(ts, sizeof (*ts));
190	}
191	mutex_exit(&tm->tm_lock);
192
193	avl_destroy(&tm->tm_queued_frees);
194	avl_destroy(&tm->tm_inflight_frees);
195	avl_destroy(&tm->tm_inflight_writes);
196	list_destroy(&tm->tm_pending_writes);
197	list_destroy(&tm->tm_head);
198	mutex_destroy(&tm->tm_lock);
199	kmem_free(tm, sizeof (*tm));
200	vd->vdev_trimmap = NULL;
201}
202
/*
 * Queue the freed range [start, end) for TRIM, coalescing it with any
 * adjacent queued segments.  If the range overlaps an already-queued
 * segment, only the uncovered left/right remainders are (recursively)
 * added.  A merged or new segment gets the current txg and time, which
 * restarts its aging.  Caller must hold tm->tm_lock.
 */
static void
trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
{
	avl_index_t where;
	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
	boolean_t merge_before, merge_after;
	hrtime_t time;

	ASSERT(MUTEX_HELD(&tm->tm_lock));
	VERIFY(start < end);

	time = gethrtime();
	tsearch.ts_start = start;
	tsearch.ts_end = end;

	/*
	 * The comparator treats overlapping segments as equal, so a hit
	 * means the middle of the range is already queued; recurse on
	 * whatever sticks out on either side.
	 */
	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
	if (ts != NULL) {
		if (start < ts->ts_start)
			trim_map_segment_add(tm, start, ts->ts_start, txg);
		if (end > ts->ts_end)
			trim_map_segment_add(tm, ts->ts_end, end, txg);
		return;
	}

	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);

	merge_before = (ts_before != NULL && ts_before->ts_end == start);
	merge_after = (ts_after != NULL && ts_after->ts_start == end);

	/*
	 * TRIM_MAP_REM/TRIM_MAP_ADD must bracket every resize below so
	 * that tm_pending stays in sync with the segment sizes.
	 */
	if (merge_before && merge_after) {
		/* The range bridges two segments: fold both into ts_after. */
		avl_remove(&tm->tm_queued_frees, ts_before);
		TRIM_MAP_REM(tm, ts_before);
		TRIM_MAP_REM(tm, ts_after);
		ts_after->ts_start = ts_before->ts_start;
		ts_after->ts_txg = txg;
		ts_after->ts_time = time;
		TRIM_MAP_ADD(tm, ts_after);
		kmem_free(ts_before, sizeof (*ts_before));
	} else if (merge_before) {
		TRIM_MAP_REM(tm, ts_before);
		ts_before->ts_end = end;
		ts_before->ts_txg = txg;
		ts_before->ts_time = time;
		TRIM_MAP_ADD(tm, ts_before);
	} else if (merge_after) {
		TRIM_MAP_REM(tm, ts_after);
		ts_after->ts_start = start;
		ts_after->ts_txg = txg;
		ts_after->ts_time = time;
		TRIM_MAP_ADD(tm, ts_after);
	} else {
		/* No adjacent neighbors; insert a fresh segment. */
		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
		ts->ts_start = start;
		ts->ts_end = end;
		ts->ts_txg = txg;
		ts->ts_time = time;
		avl_insert(&tm->tm_queued_frees, ts, where);
		TRIM_MAP_ADD(tm, ts);
	}
}
264
/*
 * Remove the range [start, end) from queued segment ts (which overlaps
 * it).  Depending on how the range lies inside the segment this shrinks
 * ts, splits it in two, or removes it entirely.
 * Caller must hold tm->tm_lock.
 */
static void
trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
    uint64_t end)
{
	trim_seg_t *nts;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(&tm->tm_lock));

	left_over = (ts->ts_start < start);
	right_over = (ts->ts_end > end);

	/* Unlink before resizing: tm_pending depends on the segment size. */
	TRIM_MAP_REM(tm, ts);
	if (left_over && right_over) {
		/* Split: ts keeps the left piece, nts takes the right piece. */
		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
		nts->ts_start = end;
		nts->ts_end = ts->ts_end;
		nts->ts_txg = ts->ts_txg;
		nts->ts_time = ts->ts_time;
		ts->ts_end = start;
		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
		TRIM_MAP_ADD(tm, ts);
		TRIM_MAP_ADD(tm, nts);
	} else if (left_over) {
		ts->ts_end = start;
		TRIM_MAP_ADD(tm, ts);
	} else if (right_over) {
		ts->ts_start = end;
		TRIM_MAP_ADD(tm, ts);
	} else {
		/* Fully covered: drop the segment altogether. */
		avl_remove(&tm->tm_queued_frees, ts);
		kmem_free(ts, sizeof (*ts));
	}
}
299
300static void
301trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
302{
303	zio_t zsearch, *zs;
304
305	ASSERT(MUTEX_HELD(&tm->tm_lock));
306
307	zsearch.io_offset = start;
308	zsearch.io_size = end - start;
309
310	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
311	if (zs == NULL) {
312		trim_map_segment_add(tm, start, end, txg);
313		return;
314	}
315	if (start < zs->io_offset)
316		trim_map_free_locked(tm, start, zs->io_offset, txg);
317	if (zs->io_offset + zs->io_size < end)
318		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
319}
320
321void
322trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
323{
324	trim_map_t *tm = vd->vdev_trimmap;
325
326	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
327		return;
328
329	mutex_enter(&tm->tm_lock);
330	trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
331	mutex_exit(&tm->tm_lock);
332}
333
334boolean_t
335trim_map_write_start(zio_t *zio)
336{
337	vdev_t *vd = zio->io_vd;
338	trim_map_t *tm = vd->vdev_trimmap;
339	trim_seg_t tsearch, *ts;
340	boolean_t left_over, right_over;
341	uint64_t start, end;
342
343	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
344		return (B_TRUE);
345
346	start = zio->io_offset;
347	end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
348	tsearch.ts_start = start;
349	tsearch.ts_end = end;
350
351	mutex_enter(&tm->tm_lock);
352
353	/*
354	 * Checking for colliding in-flight frees.
355	 */
356	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
357	if (ts != NULL) {
358		list_insert_tail(&tm->tm_pending_writes, zio);
359		mutex_exit(&tm->tm_lock);
360		return (B_FALSE);
361	}
362
363	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
364	if (ts != NULL) {
365		/*
366		 * Loop until all overlapping segments are removed.
367		 */
368		do {
369			trim_map_segment_remove(tm, ts, start, end);
370			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
371		} while (ts != NULL);
372	}
373	avl_add(&tm->tm_inflight_writes, zio);
374
375	mutex_exit(&tm->tm_lock);
376
377	return (B_TRUE);
378}
379
/*
 * Note the completion of a write: remove it from the in-flight writes
 * tree if it was ever added there.
 */
void
trim_map_write_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	trim_map_t *tm = vd->vdev_trimmap;

	/*
	 * Don't check for vdev_notrim, since the write could have
	 * started before vdev_notrim was set.
	 */
	if (!zfs_trim_enabled || tm == NULL)
		return;

	mutex_enter(&tm->tm_lock);
	/*
	 * Don't fail if the write isn't in the tree, since the write
	 * could have started after vdev_notrim was set.
	 * Membership is detected by inspecting the AVL node linkage:
	 * a linked node has a child, a parent, or is the tree's root.
	 */
	if (zio->io_trim_node.avl_child[0] ||
	    zio->io_trim_node.avl_child[1] ||
	    AVL_XPARENT(&zio->io_trim_node) ||
	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
		avl_remove(&tm->tm_inflight_writes, zio);
	mutex_exit(&tm->tm_lock);
}
405
406/*
407 * Return the oldest segment (the one with the lowest txg / time) or NULL if:
408 * 1. The list is empty
409 * 2. The first element's txg is greater than txgsafe
410 * 3. The first element's txg is not greater than the txg argument and the
411 *    the first element's time is not greater than time argument
412 */
413static trim_seg_t *
414trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
415    boolean_t force)
416{
417	trim_seg_t *ts;
418
419	ASSERT(MUTEX_HELD(&tm->tm_lock));
420	VERIFY(txgsafe >= txg);
421
422	ts = list_head(&tm->tm_head);
423	if (ts != NULL && ts->ts_txg <= txgsafe &&
424	    (ts->ts_txg <= txg || ts->ts_time <= time || force))
425		return (ts);
426	return (NULL);
427}
428
/*
 * Issue TRIM zios for a leaf vdev's queued frees that have aged enough
 * (by txg and by wall-clock time), throttled so only a bounded batch of
 * segments is sent per pass.  Issued segments move from the queued tree
 * to the in-flight tree until trim_map_vdev_commit_done() runs.
 */
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	uint64_t size, offset, txgtarget, txgsafe;
	int64_t hard, soft;
	hrtime_t timelimit;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	/* Segments created before this time are old enough to TRIM. */
	timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
	if (vd->vdev_isl2cache) {
		/* L2ARC devices ignore txg aging entirely. */
		txgsafe = UINT64_MAX;
		txgtarget = UINT64_MAX;
	} else {
		/* Never TRIM beyond what has been synced (or frozen). */
		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
		if (txgsafe > trim_txg_delay)
			txgtarget = txgsafe - trim_txg_delay;
		else
			txgtarget = 0;
	}

	mutex_enter(&tm->tm_lock);
	/*
	 * "hard" forces extra trimming (bypassing aging) when the
	 * backlog exceeds the per-vdev cap; "soft" paces the normal
	 * drain rate.
	 */
	hard = 0;
	if (tm->tm_pending > trim_vdev_max_pending)
		hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
	soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
	/* Loop until we have sent all outstanding free's */
	while (soft > 0 &&
	    (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
	    != NULL) {
		TRIM_MAP_REM(tm, ts);
		avl_remove(&tm->tm_queued_frees, ts);
		avl_add(&tm->tm_inflight_frees, ts);
		size = ts->ts_end - ts->ts_start;
		offset = ts->ts_start;
		/*
		 * We drop the lock while we call zio_nowait as the IO
		 * scheduler can result in a different IO being run e.g.
		 * a write which would result in a recursive lock.
		 */
		mutex_exit(&tm->tm_lock);

		zio_nowait(zio_trim(zio, spa, vd, offset, size));

		soft -= TRIM_MAP_SEGS(size);
		hard -= TRIM_MAP_SEGS(size);
		mutex_enter(&tm->tm_lock);
	}
	mutex_exit(&tm->tm_lock);
}
484
485static void
486trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
487{
488	trim_map_t *tm = vd->vdev_trimmap;
489	trim_seg_t *ts;
490	list_t pending_writes;
491	zio_t *zio;
492	uint64_t start, size;
493	void *cookie;
494
495	ASSERT(vd->vdev_ops->vdev_op_leaf);
496
497	if (tm == NULL)
498		return;
499
500	mutex_enter(&tm->tm_lock);
501	if (!avl_is_empty(&tm->tm_inflight_frees)) {
502		cookie = NULL;
503		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
504		    &cookie)) != NULL) {
505			kmem_free(ts, sizeof (*ts));
506		}
507	}
508	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
509	    io_trim_link));
510	list_move_tail(&pending_writes, &tm->tm_pending_writes);
511	mutex_exit(&tm->tm_lock);
512
513	while ((zio = list_remove_head(&pending_writes)) != NULL) {
514		zio_vdev_io_reissue(zio);
515		zio_execute(zio);
516	}
517	list_destroy(&pending_writes);
518}
519
520static void
521trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
522{
523	int c;
524
525	if (vd == NULL)
526		return;
527
528	if (vd->vdev_ops->vdev_op_leaf) {
529		trim_map_vdev_commit(spa, zio, vd);
530	} else {
531		for (c = 0; c < vd->vdev_children; c++)
532			trim_map_commit(spa, zio, vd->vdev_child[c]);
533	}
534}
535
536static void
537trim_map_commit_done(spa_t *spa, vdev_t *vd)
538{
539	int c;
540
541	if (vd == NULL)
542		return;
543
544	if (vd->vdev_ops->vdev_op_leaf) {
545		trim_map_vdev_commit_done(spa, vd);
546	} else {
547		for (c = 0; c < vd->vdev_children; c++)
548			trim_map_commit_done(spa, vd->vdev_child[c]);
549	}
550}
551
/*
 * Per-pool TRIM worker.  Every trim_max_interval seconds (or when
 * explicitly woken) it commits queued TRIMs across the whole vdev tree,
 * waits for them, and then runs completion processing (which also
 * reissues blocked writes).
 * Exit handshake: trim_thread_destroy() sets spa_trim_thread to NULL;
 * this thread acknowledges by setting it back to curthread and exiting.
 */
static void
trim_thread(void *arg)
{
	spa_t *spa = arg;
	zio_t *zio;

#ifdef _KERNEL
	(void) snprintf(curthread->td_name, sizeof(curthread->td_name),
	    "trim %s", spa_name(spa));
#endif

	for (;;) {
		mutex_enter(&spa->spa_trim_lock);
		if (spa->spa_trim_thread == NULL) {
			/* Destroy requested: acknowledge and exit. */
			spa->spa_trim_thread = curthread;
			cv_signal(&spa->spa_trim_cv);
			mutex_exit(&spa->spa_trim_lock);
			thread_exit();
		}

		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
		    hz * trim_max_interval);
		mutex_exit(&spa->spa_trim_lock);

		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		trim_map_commit(spa, zio, spa->spa_root_vdev);
		(void) zio_wait(zio);
		trim_map_commit_done(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);
	}
}
585
/*
 * Start the pool's TRIM thread.  Also initializes the lock and condvar
 * used for the thread's wakeup and lifecycle handshake.
 */
void
trim_thread_create(spa_t *spa)
{

	if (!zfs_trim_enabled)
		return;

	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
	mutex_enter(&spa->spa_trim_lock);
	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
	    TS_RUN, minclsyspri);
	mutex_exit(&spa->spa_trim_lock);
}
600
/*
 * Stop the pool's TRIM thread and tear down its lock and condvar.
 * spa_trim_thread doubles as a handshake flag: clearing it asks the
 * thread to exit, and the thread sets it non-NULL again just before
 * exiting.
 */
void
trim_thread_destroy(spa_t *spa)
{

	if (!zfs_trim_enabled)
		return;
	if (spa->spa_trim_thread == NULL)
		return;

	mutex_enter(&spa->spa_trim_lock);
	/* Setting spa_trim_thread to NULL tells the thread to stop. */
	spa->spa_trim_thread = NULL;
	cv_signal(&spa->spa_trim_cv);
	/* The thread will set it back to != NULL on exit. */
	while (spa->spa_trim_thread == NULL)
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
	spa->spa_trim_thread = NULL;
	mutex_exit(&spa->spa_trim_lock);

	cv_destroy(&spa->spa_trim_cv);
	mutex_destroy(&spa->spa_trim_lock);
}
623
624void
625trim_thread_wakeup(spa_t *spa)
626{
627
628	if (!zfs_trim_enabled)
629		return;
630	if (spa->spa_trim_thread == NULL)
631		return;
632
633	mutex_enter(&spa->spa_trim_lock);
634	cv_signal(&spa->spa_trim_cv);
635	mutex_exit(&spa->spa_trim_lock);
636}
637