1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22208130Smm * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23168404Spjd * Use is subject to license terms.
24168404Spjd */
25168404Spjd
26258632Savg/*
27289192Smav * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
28258632Savg */
29258632Savg
30168404Spjd#include <sys/zfs_context.h>
31168404Spjd#include <sys/dnode.h>
32168404Spjd#include <sys/dmu_objset.h>
33168404Spjd#include <sys/dmu_zfetch.h>
34168404Spjd#include <sys/dmu.h>
35168404Spjd#include <sys/dbuf.h>
36208130Smm#include <sys/kstat.h>
37168404Spjd
38168404Spjd/*
39287702Sdelphij * This tunable disables predictive prefetch.  Note that it leaves "prescient"
40287702Sdelphij * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
41287702Sdelphij * prescient prefetch never issues i/os that end up not being needed,
42287702Sdelphij * so it can't hurt performance.
43168404Spjd */
44287702Sdelphijboolean_t zfs_prefetch_disable = B_FALSE;
45168404Spjd
46168404Spjd/* max # of streams per zfetch */
47168404Spjduint32_t	zfetch_max_streams = 8;
48168404Spjd/* min time before stream reclaim */
49168404Spjduint32_t	zfetch_min_sec_reap = 2;
50287702Sdelphij/* max bytes to prefetch per stream (default 8MB) */
51287702Sdelphijuint32_t	zfetch_max_distance = 8 * 1024 * 1024;
52297832Smav/* max bytes to prefetch indirects for per stream (default 64MB) */
53297832Smavuint32_t	zfetch_max_idistance = 64 * 1024 * 1024;
54289192Smav/* max number of bytes in an array_read in which we allow prefetching (1MB) */
55168404Spjduint64_t	zfetch_array_rd_sz = 1024 * 1024;
56168404Spjd
57185029SpjdSYSCTL_DECL(_vfs_zfs);
58205132SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
59194043Skmacy    &zfs_prefetch_disable, 0, "Disable prefetch");
60185029SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
61267992ShselaskySYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
62185029Spjd    &zfetch_max_streams, 0, "Max # of streams per zfetch");
63269117SmavSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
64185029Spjd    &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
65287702SdelphijSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
66287702Sdelphij    &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
67315445SmavSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
68315445Smav    &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
69269117SmavSYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
70185029Spjd    &zfetch_array_rd_sz, 0,
71185029Spjd    "Number of bytes in a array_read at which we stop prefetching");
72185029Spjd
73208130Smmtypedef struct zfetch_stats {
74208130Smm	kstat_named_t zfetchstat_hits;
75208130Smm	kstat_named_t zfetchstat_misses;
76287702Sdelphij	kstat_named_t zfetchstat_max_streams;
77208130Smm} zfetch_stats_t;
78208130Smm
79208130Smmstatic zfetch_stats_t zfetch_stats = {
80208130Smm	{ "hits",			KSTAT_DATA_UINT64 },
81208130Smm	{ "misses",			KSTAT_DATA_UINT64 },
82287702Sdelphij	{ "max_streams",		KSTAT_DATA_UINT64 },
83208130Smm};
84208130Smm
/*
 * Atomically increment one of the 64-bit counters in zfetch_stats; "stat"
 * is the member name (e.g. zfetchstat_hits).
 *
 * The expansion intentionally carries no trailing semicolon, so that
 * "ZFETCHSTAT_BUMP(x);" is exactly one statement and stays correct when
 * used as the unbraced body of an if/else.
 */
#define	ZFETCHSTAT_BUMP(stat) \
	atomic_inc_64(&zfetch_stats.stat.value.ui64)
87208130Smm
88208130Smmkstat_t		*zfetch_ksp;
89208130Smm
90208130Smmvoid
91208130Smmzfetch_init(void)
92208130Smm{
93208130Smm	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
94208130Smm	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
95208130Smm	    KSTAT_FLAG_VIRTUAL);
96208130Smm
97208130Smm	if (zfetch_ksp != NULL) {
98208130Smm		zfetch_ksp->ks_data = &zfetch_stats;
99208130Smm		kstat_install(zfetch_ksp);
100208130Smm	}
101208130Smm}
102208130Smm
103208130Smmvoid
104208130Smmzfetch_fini(void)
105208130Smm{
106208130Smm	if (zfetch_ksp != NULL) {
107208130Smm		kstat_delete(zfetch_ksp);
108208130Smm		zfetch_ksp = NULL;
109208130Smm	}
110208130Smm}
111208130Smm
112168404Spjd/*
113168404Spjd * This takes a pointer to a zfetch structure and a dnode.  It performs the
114168404Spjd * necessary setup for the zfetch structure, grokking data from the
115168404Spjd * associated dnode.
116168404Spjd */
117168404Spjdvoid
118168404Spjddmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
119168404Spjd{
120287702Sdelphij	if (zf == NULL)
121168404Spjd		return;
122168404Spjd
123168404Spjd	zf->zf_dnode = dno;
124168404Spjd
125168404Spjd	list_create(&zf->zf_stream, sizeof (zstream_t),
126287702Sdelphij	    offsetof(zstream_t, zs_node));
127168404Spjd
128168404Spjd	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
129168404Spjd}
130168404Spjd
131287702Sdelphijstatic void
132287702Sdelphijdmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
133168404Spjd{
134287702Sdelphij	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
135287702Sdelphij	list_remove(&zf->zf_stream, zs);
136287702Sdelphij	mutex_destroy(&zs->zs_lock);
137287702Sdelphij	kmem_free(zs, sizeof (*zs));
138168404Spjd}
139168404Spjd
140168404Spjd/*
141287702Sdelphij * Clean-up state associated with a zfetch structure (e.g. destroy the
142287702Sdelphij * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
143168404Spjd */
144168404Spjdvoid
145287702Sdelphijdmu_zfetch_fini(zfetch_t *zf)
146168404Spjd{
147287702Sdelphij	zstream_t *zs;
148168404Spjd
149168404Spjd	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
150168404Spjd
151287702Sdelphij	rw_enter(&zf->zf_rwlock, RW_WRITER);
152287702Sdelphij	while ((zs = list_head(&zf->zf_stream)) != NULL)
153287702Sdelphij		dmu_zfetch_stream_remove(zf, zs);
154287702Sdelphij	rw_exit(&zf->zf_rwlock);
155168404Spjd	list_destroy(&zf->zf_stream);
156168404Spjd	rw_destroy(&zf->zf_rwlock);
157168404Spjd
158168404Spjd	zf->zf_dnode = NULL;
159168404Spjd}
160168404Spjd
161168404Spjd/*
162287702Sdelphij * If there aren't too many streams already, create a new stream.
163287702Sdelphij * The "blkid" argument is the next block that we expect this stream to access.
164287702Sdelphij * While we're here, clean up old streams (which haven't been
165287702Sdelphij * accessed for at least zfetch_min_sec_reap seconds).
166168404Spjd */
167287702Sdelphijstatic void
168287702Sdelphijdmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
169168404Spjd{
170287702Sdelphij	zstream_t *zs_next;
171287702Sdelphij	int numstreams = 0;
172168404Spjd
173168404Spjd	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
174168404Spjd
175287702Sdelphij	/*
176287702Sdelphij	 * Clean up old streams.
177287702Sdelphij	 */
178287702Sdelphij	for (zstream_t *zs = list_head(&zf->zf_stream);
179287702Sdelphij	    zs != NULL; zs = zs_next) {
180287702Sdelphij		zs_next = list_next(&zf->zf_stream, zs);
181287702Sdelphij		if (((gethrtime() - zs->zs_atime) / NANOSEC) >
182287702Sdelphij		    zfetch_min_sec_reap)
183287702Sdelphij			dmu_zfetch_stream_remove(zf, zs);
184287702Sdelphij		else
185287702Sdelphij			numstreams++;
186168404Spjd	}
187168404Spjd
188287702Sdelphij	/*
189287702Sdelphij	 * The maximum number of streams is normally zfetch_max_streams,
190287702Sdelphij	 * but for small files we lower it such that it's at least possible
191287702Sdelphij	 * for all the streams to be non-overlapping.
192287702Sdelphij	 *
193287702Sdelphij	 * If we are already at the maximum number of streams for this file,
194287702Sdelphij	 * even after removing old streams, then don't create this stream.
195287702Sdelphij	 */
196287702Sdelphij	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
197287702Sdelphij	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
198287702Sdelphij	    zfetch_max_distance));
199287702Sdelphij	if (numstreams >= max_streams) {
200287702Sdelphij		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
201287702Sdelphij		return;
202168404Spjd	}
203168404Spjd
204287702Sdelphij	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
205287702Sdelphij	zs->zs_blkid = blkid;
206287702Sdelphij	zs->zs_pf_blkid = blkid;
207297832Smav	zs->zs_ipf_blkid = blkid;
208287702Sdelphij	zs->zs_atime = gethrtime();
209287702Sdelphij	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
210168404Spjd
211287702Sdelphij	list_insert_head(&zf->zf_stream, zs);
212168404Spjd}
213168404Spjd
214168404Spjd/*
215297832Smav * This is the predictive prefetch entry point.  It associates dnode access
216297832Smav * specified with blkid and nblks arguments with prefetch stream, predicts
217297832Smav * further accesses based on that stats and initiates speculative prefetch.
218297832Smav * fetch_data argument specifies whether actual data blocks should be fetched:
219297832Smav *   FALSE -- prefetch only indirect blocks for predicted data blocks;
220297832Smav *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
221168404Spjd */
222168404Spjdvoid
223297832Smavdmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
224168404Spjd{
225287702Sdelphij	zstream_t *zs;
226297832Smav	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
227297832Smav	int64_t pf_ahead_blks, max_blks;
228297832Smav	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
229297832Smav	uint64_t end_of_access_blkid = blkid + nblks;
230332525Smav	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
231168404Spjd
232194043Skmacy	if (zfs_prefetch_disable)
233168404Spjd		return;
234168404Spjd
235287702Sdelphij	/*
236332525Smav	 * If we haven't yet loaded the indirect vdevs' mappings, we
237332525Smav	 * can only read from blocks that we carefully ensure are on
238332525Smav	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
239332525Smav	 * can't allow the predictive prefetcher to attempt reads of other
240332525Smav	 * blocks (e.g. of the MOS's dnode obejct).
241332525Smav	 */
242332525Smav	if (!spa_indirect_vdevs_loaded(spa))
243332525Smav		return;
244332525Smav
245332525Smav	/*
246287702Sdelphij	 * As a fast path for small (single-block) files, ignore access
247287702Sdelphij	 * to the first block.
248287702Sdelphij	 */
249287702Sdelphij	if (blkid == 0)
250168404Spjd		return;
251168404Spjd
252287702Sdelphij	rw_enter(&zf->zf_rwlock, RW_READER);
253168404Spjd
254329494Smav	/*
255329494Smav	 * Find matching prefetch stream.  Depending on whether the accesses
256329494Smav	 * are block-aligned, first block of the new access may either follow
257329494Smav	 * the last block of the previous access, or be equal to it.
258329494Smav	 */
259287702Sdelphij	for (zs = list_head(&zf->zf_stream); zs != NULL;
260287702Sdelphij	    zs = list_next(&zf->zf_stream, zs)) {
261329494Smav		if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
262287702Sdelphij			mutex_enter(&zs->zs_lock);
263287702Sdelphij			/*
264287702Sdelphij			 * zs_blkid could have changed before we
265287702Sdelphij			 * acquired zs_lock; re-check them here.
266287702Sdelphij			 */
267329494Smav			if (blkid == zs->zs_blkid) {
268329494Smav				break;
269329494Smav			} else if (blkid + 1 == zs->zs_blkid) {
270329494Smav				blkid++;
271329494Smav				nblks--;
272329494Smav				if (nblks == 0) {
273329494Smav					/* Already prefetched this before. */
274329494Smav					mutex_exit(&zs->zs_lock);
275329494Smav					rw_exit(&zf->zf_rwlock);
276329494Smav					return;
277329494Smav				}
278329494Smav				break;
279287702Sdelphij			}
280329494Smav			mutex_exit(&zs->zs_lock);
281208130Smm		}
282168404Spjd	}
283168404Spjd
284287702Sdelphij	if (zs == NULL) {
285168404Spjd		/*
286287702Sdelphij		 * This access is not part of any existing stream.  Create
287287702Sdelphij		 * a new stream for it.
288168404Spjd		 */
289287702Sdelphij		ZFETCHSTAT_BUMP(zfetchstat_misses);
290287702Sdelphij		if (rw_tryupgrade(&zf->zf_rwlock))
291297832Smav			dmu_zfetch_stream_create(zf, end_of_access_blkid);
292287702Sdelphij		rw_exit(&zf->zf_rwlock);
293287702Sdelphij		return;
294287702Sdelphij	}
295168404Spjd
296287702Sdelphij	/*
297287702Sdelphij	 * This access was to a block that we issued a prefetch for on
298287702Sdelphij	 * behalf of this stream. Issue further prefetches for this stream.
299287702Sdelphij	 *
300287702Sdelphij	 * Normally, we start prefetching where we stopped
301287702Sdelphij	 * prefetching last (zs_pf_blkid).  But when we get our first
302287702Sdelphij	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
303297832Smav	 * want to prefetch the block we just accessed.  In this case,
304287702Sdelphij	 * start just after the block we just accessed.
305287702Sdelphij	 */
306297832Smav	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
307168404Spjd
308287702Sdelphij	/*
309287702Sdelphij	 * Double our amount of prefetched data, but don't let the
310287702Sdelphij	 * prefetch get further ahead than zfetch_max_distance.
311287702Sdelphij	 */
312297832Smav	if (fetch_data) {
313297832Smav		max_dist_blks =
314297832Smav		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
315297832Smav		/*
316297832Smav		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
317297832Smav		 * want to now be double that, so read that amount again,
318297832Smav		 * plus the amount we are catching up by (i.e. the amount
319297832Smav		 * read just now).
320297832Smav		 */
321297832Smav		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
322297832Smav		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
323297832Smav		pf_nblks = MIN(pf_ahead_blks, max_blks);
324297832Smav	} else {
325297832Smav		pf_nblks = 0;
326297832Smav	}
327168404Spjd
328287702Sdelphij	zs->zs_pf_blkid = pf_start + pf_nblks;
329168404Spjd
330287702Sdelphij	/*
331297832Smav	 * Do the same for indirects, starting from where we stopped last,
332297832Smav	 * or where we will stop reading data blocks (and the indirects
333297832Smav	 * that point to them).
334287702Sdelphij	 */
335297832Smav	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
336297832Smav	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
337297832Smav	/*
338297832Smav	 * We want to double our distance ahead of the data prefetch
339297832Smav	 * (or reader, if we are not prefetching data).  Previously, we
340297832Smav	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
341297832Smav	 * that amount again, plus the amount we are catching up by
342297832Smav	 * (i.e. the amount read now + the amount of data prefetched now).
343297832Smav	 */
344297832Smav	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
345297832Smav	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
346297832Smav	ipf_nblks = MIN(pf_ahead_blks, max_blks);
347297832Smav	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
348297832Smav
349297832Smav	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
350297832Smav	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
351297832Smav	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
352297832Smav
353297832Smav	zs->zs_atime = gethrtime();
354297832Smav	zs->zs_blkid = end_of_access_blkid;
355287702Sdelphij	mutex_exit(&zs->zs_lock);
356287702Sdelphij	rw_exit(&zf->zf_rwlock);
357297832Smav
358297832Smav	/*
359297832Smav	 * dbuf_prefetch() is asynchronous (even when it needs to read
360297832Smav	 * indirect blocks), but we still prefer to drop our locks before
361297832Smav	 * calling it to reduce the time we hold them.
362297832Smav	 */
363297832Smav
364287702Sdelphij	for (int i = 0; i < pf_nblks; i++) {
365287702Sdelphij		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
366287702Sdelphij		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
367168404Spjd	}
368297832Smav	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
369297832Smav		dbuf_prefetch(zf->zf_dnode, 1, iblk,
370297832Smav		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
371297832Smav	}
372287702Sdelphij	ZFETCHSTAT_BUMP(zfetchstat_hits);
373168404Spjd}
374