dmu_zfetch.c revision 177698
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>

/*
 * I'm against tunables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */

int zfs_prefetch_disable = 0;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
    &zfs_prefetch_disable, 0, "Disable prefetch");
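
/*
 * Note: CTLFLAG_RDTUN makes prefetch_disable read-only at runtime; it is
 * set as a boot-time tunable instead, e.g. with this line in
 * /boot/loader.conf:
 *
 *	vfs.zfs.prefetch_disable=1
 */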

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/* number of bytes in an array_read at which we stop prefetching (1MB) */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;
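
/*
 * For a rough sense of scale (assuming a hypothetical 128KB dataset
 * recordsize): a fully ramped-up stream may prefetch up to
 * zfetch_block_cap blocks at once, i.e. 256 * 128KB = 32MB of file data.
 */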

/* forward decls for static routines */
static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);

/*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
 * In other words: if we find two sequential access streams that are
 * the same length and distance N apart, and this read is N from the
 * last stream, then we are probably in a strided access pattern.  So
 * combine the two sequential streams into a single strided stream.
 *
 * If no co-linear streams are found, return 0; otherwise coalesce the
 * pair, kick off a prefetch on the combined stream, and return 1.
 */
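/*
 * An illustration with hypothetical numbers: given one sequential stream
 * covering blocks 0-1 (zst_len == zst_stride == 2), a second covering
 * blocks 10-11, and a new read at block 20, the inter-stream distance N
 * is 10 and the second stream's offset plus N equals 20.  The pair is
 * coalesced into a single strided stream with zst_offset = 20,
 * zst_stride = 10, and zst_ph_offset = 30 (where the next stride is
 * expected to land).
 */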
static int
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
	zstream_t	*z_walk;
	zstream_t	*z_comp;

	if (!rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	if (zh == NULL) {
		rw_exit(&zf->zf_rwlock);
		return (0);
	}

	for (z_walk = list_head(&zf->zf_stream); z_walk;
	    z_walk = list_next(&zf->zf_stream, z_walk)) {
		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
		    z_comp = list_next(&zf->zf_stream, z_comp)) {
			int64_t		diff;

			if (z_walk->zst_len != z_walk->zst_stride ||
			    z_comp->zst_len != z_comp->zst_stride) {
				continue;
			}

			diff = z_comp->zst_offset - z_walk->zst_offset;
			if (z_comp->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}

			diff = z_walk->zst_offset - z_comp->zst_offset;
			if (z_walk->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}
		}
	}

	rw_exit(&zf->zf_rwlock);
	return (0);
}

/*
 * Given a zstream_t, determine the bounds of the prefetch.  Then call the
 * routine that actually prefetches the individual blocks.
 */
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
	uint64_t	prefetch_tail;
	uint64_t	prefetch_limit;
	uint64_t	prefetch_ofst;
	uint64_t	prefetch_len;
	uint64_t	blocks_fetched;

	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);

	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
	    (int64_t)(zs->zst_offset + zs->zst_stride));
	/*
	 * XXX: use a faster division method?
	 */
	prefetch_limit = zs->zst_offset + zs->zst_len +
	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;

	while (prefetch_tail < prefetch_limit) {
		prefetch_ofst = zs->zst_offset + zs->zst_direction *
		    (prefetch_tail - zs->zst_offset);

		prefetch_len = zs->zst_len;

		/*
		 * Don't prefetch past the beginning of the file, if
		 * working backwards.
		 */
		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
		    (prefetch_ofst > prefetch_tail)) {
			prefetch_len += prefetch_ofst;
			prefetch_ofst = 0;
		}

		/* don't prefetch more than we're supposed to */
		if (prefetch_len > zs->zst_len)
			break;

		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
		    prefetch_ofst, zs->zst_len);

		prefetch_tail += zs->zst_stride;
		/* stop if we've run out of stuff to prefetch */
		if (blocks_fetched < zs->zst_len)
			break;
	}
	zs->zst_ph_offset = prefetch_tail;
	zs->zst_last = lbolt;
}
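
/*
 * A worked example of the bounds math above, for a hypothetical forward
 * sequential stream with zst_offset = 0, zst_len = 8, zst_stride = 8,
 * and zst_cap = 8: the cap doubles to 16, prefetch_tail starts at block
 * 8, and prefetch_limit = 0 + 8 + (16 * 8) / 8 = 24.  The loop thus
 * issues prefetches for blocks 8-15 and 16-23 (one cap's worth) and
 * records zst_ph_offset = 24 for the next call.
 */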

/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL) {
		return;
	}

	zf->zf_dnode = dno;
	zf->zf_stream_cnt = 0;
	zf->zf_alloc_fail = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zst_node));

	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}

/*
 * This function computes the actual size, in blocks, that can be prefetched,
 * and fetches it.
 */
static uint64_t
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;
	uint64_t	i;

	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);

	for (i = 0; i < fetchsz; i++) {
		dbuf_prefetch(dn, blkid + i);
	}

	return (fetchsz);
}

/*
 * This function returns the number of blocks that would be prefetched, based
 * upon the supplied dnode, blockid, and nblks.  This is used so that we can
 * update streams in place, and then prefetch with their old value after the
 * fact.  This way, we can delay the prefetch, but subsequent accesses to the
 * stream won't result in the same data being prefetched multiple times.
 */
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;

	if (blkid > dn->dn_maxblkid) {
		return (0);
	}

	/* compute fetch size */
	if (blkid + nblks + 1 > dn->dn_maxblkid) {
		fetchsz = (dn->dn_maxblkid - blkid) + 1;
		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
	} else {
		fetchsz = nblks;
	}

	return (fetchsz);
}
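
/*
 * Example of the clamping above, with hypothetical numbers: for a dnode
 * with dn_maxblkid = 100, a request at blkid = 98 for nblks = 8 takes
 * the first branch (98 + 8 + 1 > 100), so only the remaining
 * (100 - 98) + 1 = 3 blocks -- 98, 99, and 100 -- would be prefetched.
 */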

/*
 * Given a zfetch and a zstream structure (the search criteria), see if there
 * is an associated zstream for this block read.  If so, start a prefetch for
 * the stream it located and return true; otherwise return false.
 */
static int
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t	*zs;
	int64_t		diff;
	int		reset = !prefetched;
	int		rc = 0;

	if (zh == NULL)
		return (0);

	/*
	 * XXX: This locking strategy is a bit coarse; however, its impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			continue;
		}

		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			/* already fetched */
			rc = 1;
			goto out;
		}

		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset = (zs->zst_ph_offset >
			    (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}

	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			rc = 0;
			mutex_exit(&zs->zst_lock);
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);
	return (rc);
}
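
/*
 * To make the forward sequential case above concrete (hypothetical
 * numbers): an existing stream with zst_offset = 0 and zst_len = 4
 * matches a new read zh with zst_offset = 4, since 4 == 0 + 4.  The
 * stream grows by zh->zst_len to zst_len = 6 and, in the common
 * (non-reset) path, dmu_zfetch_dofetch() is called to push the
 * prefetch further ahead; a subsequent read at block 6 then matches
 * the same stream again.
 */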

/*
 * Clean up state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream list, and generally makes things
 * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
 */
void
dmu_zfetch_rele(zfetch_t *zf)
{
	zstream_t	*zs;
	zstream_t	*zs_next;

	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));

	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);

		list_remove(&zf->zf_stream, zs);
		mutex_destroy(&zs->zst_lock);
		kmem_free(zs, sizeof (zstream_t));
	}
	list_destroy(&zf->zf_stream);
	rw_destroy(&zf->zf_rwlock);

	zf->zf_dnode = NULL;
}

/*
 * Given a zfetch and zstream structure, insert the zstream structure into the
 * stream list contained within the zfetch structure.  Perform the appropriate
 * book-keeping.  It is possible that another thread has inserted a stream which
 * matches one that we are about to insert, so we must be sure to check for this
 * case.  If one is found, return failure, and let the caller clean up the
 * duplicates.
 */
static int
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
{
	zstream_t	*zs_walk;
	zstream_t	*zs_next;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs_walk);

		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
			return (0);
		}
	}

	list_insert_head(&zf->zf_stream, zs);
	zf->zf_stream_cnt++;

	return (1);
}

/*
 * Walk the list of zstreams in the given zfetch, find an old one (by time), and
 * reclaim it for use by the caller.
 */
static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t *zf)
{
	zstream_t	*zs;

	if (!rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (NULL);

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
			break;
	}

	if (zs) {
		dmu_zfetch_stream_remove(zf, zs);
		mutex_destroy(&zs->zst_lock);
		bzero(zs, sizeof (zstream_t));
	} else {
		zf->zf_alloc_fail++;
	}
	rw_exit(&zf->zf_rwlock);

	return (zs);
}
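
/*
 * The reap test above is in whole seconds: with the default
 * zfetch_min_sec_reap of 2, a stream is eligible for reclaim once
 * (lbolt - zst_last) / hz exceeds 2, i.e. once it has sat idle for
 * more than two seconds' worth of clock ticks.
 */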
535
536/*
537 * Given a zfetch and zstream structure, remove the zstream structure from its
538 * container in the zfetch structure.  Perform the appropriate book-keeping.
539 */
540static void
541dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
542{
543	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
544
545	list_remove(&zf->zf_stream, zs);
546	zf->zf_stream_cnt--;
547}

static int
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
{
	if (zs1->zst_offset != zs2->zst_offset)
		return (0);

	if (zs1->zst_len != zs2->zst_len)
		return (0);

	if (zs1->zst_stride != zs2->zst_stride)
		return (0);

	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
		return (0);

	if (zs1->zst_cap != zs2->zst_cap)
		return (0);

	if (zs1->zst_direction != zs2->zst_direction)
		return (0);

	return (1);
}

/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
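/*
 * Example of the offset/size-to-block conversion done below, with
 * hypothetical numbers: for a 128KB block size (dn_datablkshift = 17),
 * a read of size 100000 at offset 200000 yields
 * zst_offset = 200000 >> 17 = 1 and zst_len = (P2ROUNDUP(300000,
 * 131072) - P2ALIGN(200000, 131072)) >> 17 = (393216 - 131072) >> 17
 * = 2; the read touches blocks 1 and 2.
 */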
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t	zst;
	zstream_t	*newstream;
	int		fetched;
	int		inserted;
	unsigned int	blkshft;
	uint64_t	blksz;

	if (zfs_prefetch_disable)
		return;

	/* non-power-of-2 blocksize files are only one block -- nothing to do */
	if (!zf->zf_dnode->dn_datablkshift)
		return;

	/* convert offset and size into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;

	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (!fetched) {
		fetched = dmu_zfetch_colinear(zf, &zst);
	}

	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * We still couldn't find a stream, so allocate one if
		 * possible.  Otherwise, give up and go home.
		 */
		if (newstream == NULL) {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;

			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}

			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}

		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = lbolt;

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}