xref: /freebsd-src/sys/contrib/openzfs/module/zfs/dmu_zfetch.c (revision f9693bef8dc83284e7ac905adc346f7d866b5245)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9eda14cbcSMatt Macy  * or http://www.opensolaris.org/os/licensing.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy /*
22eda14cbcSMatt Macy  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23eda14cbcSMatt Macy  * Use is subject to license terms.
24eda14cbcSMatt Macy  */
25eda14cbcSMatt Macy 
26eda14cbcSMatt Macy /*
27eda14cbcSMatt Macy  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
28eda14cbcSMatt Macy  */
29eda14cbcSMatt Macy 
30eda14cbcSMatt Macy #include <sys/zfs_context.h>
31eda14cbcSMatt Macy #include <sys/dnode.h>
32eda14cbcSMatt Macy #include <sys/dmu_objset.h>
33eda14cbcSMatt Macy #include <sys/dmu_zfetch.h>
34eda14cbcSMatt Macy #include <sys/dmu.h>
35eda14cbcSMatt Macy #include <sys/dbuf.h>
36eda14cbcSMatt Macy #include <sys/kstat.h>
37eda14cbcSMatt Macy 
38eda14cbcSMatt Macy /*
39eda14cbcSMatt Macy  * This tunable disables predictive prefetch.  Note that it leaves "prescient"
40eda14cbcSMatt Macy  * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
41eda14cbcSMatt Macy  * prescient prefetch never issues i/os that end up not being needed,
42eda14cbcSMatt Macy  * so it can't hurt performance.
43eda14cbcSMatt Macy  */
44eda14cbcSMatt Macy 
45eda14cbcSMatt Macy int zfs_prefetch_disable = B_FALSE;
46eda14cbcSMatt Macy 
47eda14cbcSMatt Macy /* max # of streams per zfetch */
48eda14cbcSMatt Macy unsigned int	zfetch_max_streams = 8;
49eda14cbcSMatt Macy /* min time before stream reclaim */
50eda14cbcSMatt Macy unsigned int	zfetch_min_sec_reap = 2;
51eda14cbcSMatt Macy /* max bytes to prefetch per stream (default 8MB) */
52eda14cbcSMatt Macy unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
53eda14cbcSMatt Macy /* max bytes to prefetch indirects for per stream (default 64MB) */
54eda14cbcSMatt Macy unsigned int	zfetch_max_idistance = 64 * 1024 * 1024;
55eda14cbcSMatt Macy /* max number of bytes in an array_read in which we allow prefetching (1MB) */
56eda14cbcSMatt Macy unsigned long	zfetch_array_rd_sz = 1024 * 1024;
57eda14cbcSMatt Macy 
58eda14cbcSMatt Macy typedef struct zfetch_stats {
59eda14cbcSMatt Macy 	kstat_named_t zfetchstat_hits;
60eda14cbcSMatt Macy 	kstat_named_t zfetchstat_misses;
61eda14cbcSMatt Macy 	kstat_named_t zfetchstat_max_streams;
627877fdebSMatt Macy 	kstat_named_t zfetchstat_io_issued;
63eda14cbcSMatt Macy } zfetch_stats_t;
64eda14cbcSMatt Macy 
65eda14cbcSMatt Macy static zfetch_stats_t zfetch_stats = {
66eda14cbcSMatt Macy 	{ "hits",			KSTAT_DATA_UINT64 },
67eda14cbcSMatt Macy 	{ "misses",			KSTAT_DATA_UINT64 },
68eda14cbcSMatt Macy 	{ "max_streams",		KSTAT_DATA_UINT64 },
697877fdebSMatt Macy 	{ "io_issued",		KSTAT_DATA_UINT64 },
70eda14cbcSMatt Macy };
71eda14cbcSMatt Macy 
/*
 * Accessors for the counters in zfetch_stats.  "stat" is the name of a
 * zfetch_stats_t member.
 *
 * ZFETCHSTAT_BUMP/ZFETCHSTAT_ADD update the counter atomically.
 * ZFETCHSTAT_SET/ZFETCHSTAT_GET are plain (non-atomic) accesses; they are
 * fully parenthesized so that they behave as a single operand when used
 * inside a larger expression.
 */
#define	ZFETCHSTAT_BUMP(stat)					\
	atomic_inc_64(&zfetch_stats.stat.value.ui64)
#define	ZFETCHSTAT_ADD(stat, val)				\
	atomic_add_64(&zfetch_stats.stat.value.ui64, (val))
#define	ZFETCHSTAT_SET(stat, val)				\
	(zfetch_stats.stat.value.ui64 = (val))
#define	ZFETCHSTAT_GET(stat)					\
	(zfetch_stats.stat.value.ui64)
807877fdebSMatt Macy 
81eda14cbcSMatt Macy 
82eda14cbcSMatt Macy kstat_t		*zfetch_ksp;
83eda14cbcSMatt Macy 
84eda14cbcSMatt Macy void
85eda14cbcSMatt Macy zfetch_init(void)
86eda14cbcSMatt Macy {
87eda14cbcSMatt Macy 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
88eda14cbcSMatt Macy 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
89eda14cbcSMatt Macy 	    KSTAT_FLAG_VIRTUAL);
90eda14cbcSMatt Macy 
91eda14cbcSMatt Macy 	if (zfetch_ksp != NULL) {
92eda14cbcSMatt Macy 		zfetch_ksp->ks_data = &zfetch_stats;
93eda14cbcSMatt Macy 		kstat_install(zfetch_ksp);
94eda14cbcSMatt Macy 	}
95eda14cbcSMatt Macy }
96eda14cbcSMatt Macy 
97eda14cbcSMatt Macy void
98eda14cbcSMatt Macy zfetch_fini(void)
99eda14cbcSMatt Macy {
100eda14cbcSMatt Macy 	if (zfetch_ksp != NULL) {
101eda14cbcSMatt Macy 		kstat_delete(zfetch_ksp);
102eda14cbcSMatt Macy 		zfetch_ksp = NULL;
103eda14cbcSMatt Macy 	}
104eda14cbcSMatt Macy }
105eda14cbcSMatt Macy 
106eda14cbcSMatt Macy /*
107eda14cbcSMatt Macy  * This takes a pointer to a zfetch structure and a dnode.  It performs the
108eda14cbcSMatt Macy  * necessary setup for the zfetch structure, grokking data from the
109eda14cbcSMatt Macy  * associated dnode.
110eda14cbcSMatt Macy  */
111eda14cbcSMatt Macy void
112eda14cbcSMatt Macy dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
113eda14cbcSMatt Macy {
114eda14cbcSMatt Macy 	if (zf == NULL)
115eda14cbcSMatt Macy 		return;
116eda14cbcSMatt Macy 	zf->zf_dnode = dno;
1177877fdebSMatt Macy 	zf->zf_numstreams = 0;
118eda14cbcSMatt Macy 
119eda14cbcSMatt Macy 	list_create(&zf->zf_stream, sizeof (zstream_t),
120eda14cbcSMatt Macy 	    offsetof(zstream_t, zs_node));
121eda14cbcSMatt Macy 
122eda14cbcSMatt Macy 	mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
123eda14cbcSMatt Macy }
124eda14cbcSMatt Macy 
125eda14cbcSMatt Macy static void
1267877fdebSMatt Macy dmu_zfetch_stream_fini(zstream_t *zs)
1277877fdebSMatt Macy {
128*f9693befSMartin Matuska 	ASSERT(!list_link_active(&zs->zs_node));
1297877fdebSMatt Macy 	kmem_free(zs, sizeof (*zs));
1307877fdebSMatt Macy }
1317877fdebSMatt Macy 
1327877fdebSMatt Macy static void
133eda14cbcSMatt Macy dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
134eda14cbcSMatt Macy {
135eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&zf->zf_lock));
136eda14cbcSMatt Macy 	list_remove(&zf->zf_stream, zs);
137*f9693befSMartin Matuska 	zf->zf_numstreams--;
138*f9693befSMartin Matuska 	membar_producer();
139*f9693befSMartin Matuska 	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
1407877fdebSMatt Macy 		dmu_zfetch_stream_fini(zs);
141eda14cbcSMatt Macy }
142eda14cbcSMatt Macy 
143eda14cbcSMatt Macy /*
144eda14cbcSMatt Macy  * Clean-up state associated with a zfetch structure (e.g. destroy the
145eda14cbcSMatt Macy  * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
146eda14cbcSMatt Macy  */
147eda14cbcSMatt Macy void
148eda14cbcSMatt Macy dmu_zfetch_fini(zfetch_t *zf)
149eda14cbcSMatt Macy {
150eda14cbcSMatt Macy 	zstream_t *zs;
151eda14cbcSMatt Macy 
152eda14cbcSMatt Macy 	mutex_enter(&zf->zf_lock);
153*f9693befSMartin Matuska 	while ((zs = list_head(&zf->zf_stream)) != NULL)
154eda14cbcSMatt Macy 		dmu_zfetch_stream_remove(zf, zs);
155eda14cbcSMatt Macy 	mutex_exit(&zf->zf_lock);
156eda14cbcSMatt Macy 	list_destroy(&zf->zf_stream);
157eda14cbcSMatt Macy 	mutex_destroy(&zf->zf_lock);
158eda14cbcSMatt Macy 
159eda14cbcSMatt Macy 	zf->zf_dnode = NULL;
160eda14cbcSMatt Macy }
161eda14cbcSMatt Macy 
162eda14cbcSMatt Macy /*
163eda14cbcSMatt Macy  * If there aren't too many streams already, create a new stream.
164eda14cbcSMatt Macy  * The "blkid" argument is the next block that we expect this stream to access.
165eda14cbcSMatt Macy  * While we're here, clean up old streams (which haven't been
166eda14cbcSMatt Macy  * accessed for at least zfetch_min_sec_reap seconds).
167eda14cbcSMatt Macy  */
168eda14cbcSMatt Macy static void
169eda14cbcSMatt Macy dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
170eda14cbcSMatt Macy {
171eda14cbcSMatt Macy 	zstream_t *zs_next;
1727877fdebSMatt Macy 	hrtime_t now = gethrtime();
173eda14cbcSMatt Macy 
174eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&zf->zf_lock));
175eda14cbcSMatt Macy 
176eda14cbcSMatt Macy 	/*
177eda14cbcSMatt Macy 	 * Clean up old streams.
178eda14cbcSMatt Macy 	 */
179eda14cbcSMatt Macy 	for (zstream_t *zs = list_head(&zf->zf_stream);
180eda14cbcSMatt Macy 	    zs != NULL; zs = zs_next) {
181eda14cbcSMatt Macy 		zs_next = list_next(&zf->zf_stream, zs);
1827877fdebSMatt Macy 		/*
183*f9693befSMartin Matuska 		 * Skip if still active.  1 -- zf_stream reference.
1847877fdebSMatt Macy 		 */
185*f9693befSMartin Matuska 		if (zfs_refcount_count(&zs->zs_refs) != 1)
1867877fdebSMatt Macy 			continue;
1877877fdebSMatt Macy 		if (((now - zs->zs_atime) / NANOSEC) >
188eda14cbcSMatt Macy 		    zfetch_min_sec_reap)
189eda14cbcSMatt Macy 			dmu_zfetch_stream_remove(zf, zs);
190eda14cbcSMatt Macy 	}
191eda14cbcSMatt Macy 
192eda14cbcSMatt Macy 	/*
193eda14cbcSMatt Macy 	 * The maximum number of streams is normally zfetch_max_streams,
194eda14cbcSMatt Macy 	 * but for small files we lower it such that it's at least possible
195eda14cbcSMatt Macy 	 * for all the streams to be non-overlapping.
196eda14cbcSMatt Macy 	 *
197eda14cbcSMatt Macy 	 * If we are already at the maximum number of streams for this file,
198eda14cbcSMatt Macy 	 * even after removing old streams, then don't create this stream.
199eda14cbcSMatt Macy 	 */
200eda14cbcSMatt Macy 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
201eda14cbcSMatt Macy 	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
202eda14cbcSMatt Macy 	    zfetch_max_distance));
2037877fdebSMatt Macy 	if (zf->zf_numstreams >= max_streams) {
204eda14cbcSMatt Macy 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
205eda14cbcSMatt Macy 		return;
206eda14cbcSMatt Macy 	}
207eda14cbcSMatt Macy 
208eda14cbcSMatt Macy 	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
209eda14cbcSMatt Macy 	zs->zs_blkid = blkid;
210*f9693befSMartin Matuska 	zs->zs_pf_blkid1 = blkid;
211eda14cbcSMatt Macy 	zs->zs_pf_blkid = blkid;
212*f9693befSMartin Matuska 	zs->zs_ipf_blkid1 = blkid;
213eda14cbcSMatt Macy 	zs->zs_ipf_blkid = blkid;
2147877fdebSMatt Macy 	zs->zs_atime = now;
2157877fdebSMatt Macy 	zs->zs_fetch = zf;
216*f9693befSMartin Matuska 	zs->zs_missed = B_FALSE;
217*f9693befSMartin Matuska 	zfs_refcount_create(&zs->zs_callers);
218*f9693befSMartin Matuska 	zfs_refcount_create(&zs->zs_refs);
219*f9693befSMartin Matuska 	/* One reference for zf_stream. */
220*f9693befSMartin Matuska 	zfs_refcount_add(&zs->zs_refs, NULL);
2217877fdebSMatt Macy 	zf->zf_numstreams++;
222eda14cbcSMatt Macy 	list_insert_head(&zf->zf_stream, zs);
223eda14cbcSMatt Macy }
224eda14cbcSMatt Macy 
2257877fdebSMatt Macy static void
2267877fdebSMatt Macy dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
2277877fdebSMatt Macy {
2287877fdebSMatt Macy 	zstream_t *zs = arg;
2297877fdebSMatt Macy 
230*f9693befSMartin Matuska 	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
2317877fdebSMatt Macy 		dmu_zfetch_stream_fini(zs);
2327877fdebSMatt Macy }
2337877fdebSMatt Macy 
234eda14cbcSMatt Macy /*
235*f9693befSMartin Matuska  * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
236*f9693befSMartin Matuska  * associates dnode access specified with blkid and nblks arguments with
237*f9693befSMartin Matuska  * prefetch stream, predicts further accesses based on that stats and returns
238*f9693befSMartin Matuska  * the stream pointer on success.  That pointer must later be passed to
239*f9693befSMartin Matuska  * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
240*f9693befSMartin Matuska  * release it.  dmu_zfetch() is a wrapper for simple cases when window between
241*f9693befSMartin Matuska  * prediction and prefetch initiation is not needed.
242eda14cbcSMatt Macy  * fetch_data argument specifies whether actual data blocks should be fetched:
243eda14cbcSMatt Macy  *   FALSE -- prefetch only indirect blocks for predicted data blocks;
244eda14cbcSMatt Macy  *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
245eda14cbcSMatt Macy  */
246*f9693befSMartin Matuska zstream_t *
247*f9693befSMartin Matuska dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
248*f9693befSMartin Matuska     boolean_t fetch_data, boolean_t have_lock)
249eda14cbcSMatt Macy {
250eda14cbcSMatt Macy 	zstream_t *zs;
251*f9693befSMartin Matuska 	int64_t pf_start, ipf_start;
252eda14cbcSMatt Macy 	int64_t pf_ahead_blks, max_blks;
253*f9693befSMartin Matuska 	int max_dist_blks, pf_nblks, ipf_nblks;
254*f9693befSMartin Matuska 	uint64_t end_of_access_blkid, maxblkid;
255eda14cbcSMatt Macy 	end_of_access_blkid = blkid + nblks;
256eda14cbcSMatt Macy 	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
257eda14cbcSMatt Macy 
258eda14cbcSMatt Macy 	if (zfs_prefetch_disable)
259*f9693befSMartin Matuska 		return (NULL);
260eda14cbcSMatt Macy 	/*
261eda14cbcSMatt Macy 	 * If we haven't yet loaded the indirect vdevs' mappings, we
262eda14cbcSMatt Macy 	 * can only read from blocks that we carefully ensure are on
263eda14cbcSMatt Macy 	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
264eda14cbcSMatt Macy 	 * can't allow the predictive prefetcher to attempt reads of other
265eda14cbcSMatt Macy 	 * blocks (e.g. of the MOS's dnode object).
266eda14cbcSMatt Macy 	 */
267eda14cbcSMatt Macy 	if (!spa_indirect_vdevs_loaded(spa))
268*f9693befSMartin Matuska 		return (NULL);
269eda14cbcSMatt Macy 
270eda14cbcSMatt Macy 	/*
271eda14cbcSMatt Macy 	 * As a fast path for small (single-block) files, ignore access
272eda14cbcSMatt Macy 	 * to the first block.
273eda14cbcSMatt Macy 	 */
2747877fdebSMatt Macy 	if (!have_lock && blkid == 0)
275*f9693befSMartin Matuska 		return (NULL);
276eda14cbcSMatt Macy 
277eda14cbcSMatt Macy 	if (!have_lock)
278eda14cbcSMatt Macy 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
2797877fdebSMatt Macy 
2807877fdebSMatt Macy 	/*
2817877fdebSMatt Macy 	 * A fast path for small files for which no prefetch will
2827877fdebSMatt Macy 	 * happen.
2837877fdebSMatt Macy 	 */
284*f9693befSMartin Matuska 	maxblkid = zf->zf_dnode->dn_maxblkid;
285*f9693befSMartin Matuska 	if (maxblkid < 2) {
2867877fdebSMatt Macy 		if (!have_lock)
2877877fdebSMatt Macy 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
288*f9693befSMartin Matuska 		return (NULL);
2897877fdebSMatt Macy 	}
290eda14cbcSMatt Macy 	mutex_enter(&zf->zf_lock);
291eda14cbcSMatt Macy 
292eda14cbcSMatt Macy 	/*
293eda14cbcSMatt Macy 	 * Find matching prefetch stream.  Depending on whether the accesses
294eda14cbcSMatt Macy 	 * are block-aligned, first block of the new access may either follow
295eda14cbcSMatt Macy 	 * the last block of the previous access, or be equal to it.
296eda14cbcSMatt Macy 	 */
297eda14cbcSMatt Macy 	for (zs = list_head(&zf->zf_stream); zs != NULL;
298eda14cbcSMatt Macy 	    zs = list_next(&zf->zf_stream, zs)) {
299eda14cbcSMatt Macy 		if (blkid == zs->zs_blkid) {
300eda14cbcSMatt Macy 			break;
301eda14cbcSMatt Macy 		} else if (blkid + 1 == zs->zs_blkid) {
302eda14cbcSMatt Macy 			blkid++;
303eda14cbcSMatt Macy 			nblks--;
304eda14cbcSMatt Macy 			break;
305eda14cbcSMatt Macy 		}
306eda14cbcSMatt Macy 	}
307*f9693befSMartin Matuska 
308*f9693befSMartin Matuska 	/*
309*f9693befSMartin Matuska 	 * If the file is ending, remove the matching stream if found.
310*f9693befSMartin Matuska 	 * If not found then it is too late to create a new one now.
311*f9693befSMartin Matuska 	 */
312*f9693befSMartin Matuska 	if (end_of_access_blkid >= maxblkid) {
313*f9693befSMartin Matuska 		if (zs != NULL)
314*f9693befSMartin Matuska 			dmu_zfetch_stream_remove(zf, zs);
315*f9693befSMartin Matuska 		mutex_exit(&zf->zf_lock);
316*f9693befSMartin Matuska 		if (!have_lock)
317*f9693befSMartin Matuska 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
318*f9693befSMartin Matuska 		return (NULL);
319*f9693befSMartin Matuska 	}
320*f9693befSMartin Matuska 
321*f9693befSMartin Matuska 	/* Exit if we already prefetched this block before. */
322*f9693befSMartin Matuska 	if (nblks == 0) {
323*f9693befSMartin Matuska 		mutex_exit(&zf->zf_lock);
324*f9693befSMartin Matuska 		if (!have_lock)
325*f9693befSMartin Matuska 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
326*f9693befSMartin Matuska 		return (NULL);
327eda14cbcSMatt Macy 	}
328eda14cbcSMatt Macy 
329eda14cbcSMatt Macy 	if (zs == NULL) {
330eda14cbcSMatt Macy 		/*
331eda14cbcSMatt Macy 		 * This access is not part of any existing stream.  Create
332eda14cbcSMatt Macy 		 * a new stream for it.
333eda14cbcSMatt Macy 		 */
334eda14cbcSMatt Macy 		dmu_zfetch_stream_create(zf, end_of_access_blkid);
335eda14cbcSMatt Macy 		mutex_exit(&zf->zf_lock);
336eda14cbcSMatt Macy 		if (!have_lock)
337eda14cbcSMatt Macy 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
338*f9693befSMartin Matuska 		ZFETCHSTAT_BUMP(zfetchstat_misses);
339*f9693befSMartin Matuska 		return (NULL);
340eda14cbcSMatt Macy 	}
341eda14cbcSMatt Macy 
342eda14cbcSMatt Macy 	/*
343eda14cbcSMatt Macy 	 * This access was to a block that we issued a prefetch for on
344eda14cbcSMatt Macy 	 * behalf of this stream. Issue further prefetches for this stream.
345eda14cbcSMatt Macy 	 *
346eda14cbcSMatt Macy 	 * Normally, we start prefetching where we stopped
347eda14cbcSMatt Macy 	 * prefetching last (zs_pf_blkid).  But when we get our first
348eda14cbcSMatt Macy 	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
349eda14cbcSMatt Macy 	 * want to prefetch the block we just accessed.  In this case,
350eda14cbcSMatt Macy 	 * start just after the block we just accessed.
351eda14cbcSMatt Macy 	 */
352eda14cbcSMatt Macy 	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
353*f9693befSMartin Matuska 	if (zs->zs_pf_blkid1 < end_of_access_blkid)
354*f9693befSMartin Matuska 		zs->zs_pf_blkid1 = end_of_access_blkid;
355*f9693befSMartin Matuska 	if (zs->zs_ipf_blkid1 < end_of_access_blkid)
356*f9693befSMartin Matuska 		zs->zs_ipf_blkid1 = end_of_access_blkid;
357eda14cbcSMatt Macy 
358eda14cbcSMatt Macy 	/*
359eda14cbcSMatt Macy 	 * Double our amount of prefetched data, but don't let the
360eda14cbcSMatt Macy 	 * prefetch get further ahead than zfetch_max_distance.
361eda14cbcSMatt Macy 	 */
362eda14cbcSMatt Macy 	if (fetch_data) {
363eda14cbcSMatt Macy 		max_dist_blks =
364eda14cbcSMatt Macy 		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
365eda14cbcSMatt Macy 		/*
366eda14cbcSMatt Macy 		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
367eda14cbcSMatt Macy 		 * want to now be double that, so read that amount again,
368eda14cbcSMatt Macy 		 * plus the amount we are catching up by (i.e. the amount
369eda14cbcSMatt Macy 		 * read just now).
370eda14cbcSMatt Macy 		 */
371eda14cbcSMatt Macy 		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
372eda14cbcSMatt Macy 		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
373eda14cbcSMatt Macy 		pf_nblks = MIN(pf_ahead_blks, max_blks);
374eda14cbcSMatt Macy 	} else {
375eda14cbcSMatt Macy 		pf_nblks = 0;
376eda14cbcSMatt Macy 	}
377eda14cbcSMatt Macy 
378eda14cbcSMatt Macy 	zs->zs_pf_blkid = pf_start + pf_nblks;
379eda14cbcSMatt Macy 
380eda14cbcSMatt Macy 	/*
381eda14cbcSMatt Macy 	 * Do the same for indirects, starting from where we stopped last,
382eda14cbcSMatt Macy 	 * or where we will stop reading data blocks (and the indirects
383eda14cbcSMatt Macy 	 * that point to them).
384eda14cbcSMatt Macy 	 */
385eda14cbcSMatt Macy 	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
386eda14cbcSMatt Macy 	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
387eda14cbcSMatt Macy 	/*
388eda14cbcSMatt Macy 	 * We want to double our distance ahead of the data prefetch
389eda14cbcSMatt Macy 	 * (or reader, if we are not prefetching data).  Previously, we
390eda14cbcSMatt Macy 	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
391eda14cbcSMatt Macy 	 * that amount again, plus the amount we are catching up by
392eda14cbcSMatt Macy 	 * (i.e. the amount read now + the amount of data prefetched now).
393eda14cbcSMatt Macy 	 */
394eda14cbcSMatt Macy 	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
395*f9693befSMartin Matuska 	max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid);
396eda14cbcSMatt Macy 	ipf_nblks = MIN(pf_ahead_blks, max_blks);
397eda14cbcSMatt Macy 	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
398eda14cbcSMatt Macy 
399eda14cbcSMatt Macy 	zs->zs_blkid = end_of_access_blkid;
400*f9693befSMartin Matuska 	/* Protect the stream from reclamation. */
401*f9693befSMartin Matuska 	zs->zs_atime = gethrtime();
402*f9693befSMartin Matuska 	zfs_refcount_add(&zs->zs_refs, NULL);
403*f9693befSMartin Matuska 	/* Count concurrent callers. */
404*f9693befSMartin Matuska 	zfs_refcount_add(&zs->zs_callers, NULL);
405eda14cbcSMatt Macy 	mutex_exit(&zf->zf_lock);
406*f9693befSMartin Matuska 
407*f9693befSMartin Matuska 	if (!have_lock)
408*f9693befSMartin Matuska 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
409*f9693befSMartin Matuska 
410*f9693befSMartin Matuska 	ZFETCHSTAT_BUMP(zfetchstat_hits);
411*f9693befSMartin Matuska 	return (zs);
412*f9693befSMartin Matuska }
413*f9693befSMartin Matuska 
414*f9693befSMartin Matuska void
415*f9693befSMartin Matuska dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
416*f9693befSMartin Matuska {
417*f9693befSMartin Matuska 	zfetch_t *zf = zs->zs_fetch;
418*f9693befSMartin Matuska 	int64_t pf_start, pf_end, ipf_start, ipf_end;
419*f9693befSMartin Matuska 	int epbs, issued;
420*f9693befSMartin Matuska 
421*f9693befSMartin Matuska 	if (missed)
422*f9693befSMartin Matuska 		zs->zs_missed = missed;
423eda14cbcSMatt Macy 
424eda14cbcSMatt Macy 	/*
425*f9693befSMartin Matuska 	 * Postpone the prefetch if there are more concurrent callers.
426*f9693befSMartin Matuska 	 * It happens when multiple requests are waiting for the same
427*f9693befSMartin Matuska 	 * indirect block.  The last one will run the prefetch for all.
428eda14cbcSMatt Macy 	 */
429*f9693befSMartin Matuska 	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
430*f9693befSMartin Matuska 		/* Drop reference taken in dmu_zfetch_prepare(). */
431*f9693befSMartin Matuska 		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
432*f9693befSMartin Matuska 			dmu_zfetch_stream_fini(zs);
433*f9693befSMartin Matuska 		return;
434*f9693befSMartin Matuska 	}
435eda14cbcSMatt Macy 
436*f9693befSMartin Matuska 	mutex_enter(&zf->zf_lock);
437*f9693befSMartin Matuska 	if (zs->zs_missed) {
438*f9693befSMartin Matuska 		pf_start = zs->zs_pf_blkid1;
439*f9693befSMartin Matuska 		pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
440*f9693befSMartin Matuska 	} else {
441*f9693befSMartin Matuska 		pf_start = pf_end = 0;
442*f9693befSMartin Matuska 	}
443*f9693befSMartin Matuska 	ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
444*f9693befSMartin Matuska 	ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
445*f9693befSMartin Matuska 	mutex_exit(&zf->zf_lock);
446*f9693befSMartin Matuska 	ASSERT3S(pf_start, <=, pf_end);
447*f9693befSMartin Matuska 	ASSERT3S(ipf_start, <=, ipf_end);
448*f9693befSMartin Matuska 
449*f9693befSMartin Matuska 	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
450*f9693befSMartin Matuska 	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
451*f9693befSMartin Matuska 	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
452*f9693befSMartin Matuska 	ASSERT3S(ipf_start, <=, ipf_end);
453*f9693befSMartin Matuska 	issued = pf_end - pf_start + ipf_end - ipf_start;
454*f9693befSMartin Matuska 	if (issued > 1) {
455*f9693befSMartin Matuska 		/* More references on top of taken in dmu_zfetch_prepare(). */
456*f9693befSMartin Matuska 		zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL);
457*f9693befSMartin Matuska 	} else if (issued == 0) {
458*f9693befSMartin Matuska 		/* Some other thread has done our work, so drop the ref. */
459*f9693befSMartin Matuska 		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
460*f9693befSMartin Matuska 			dmu_zfetch_stream_fini(zs);
461*f9693befSMartin Matuska 		return;
462*f9693befSMartin Matuska 	}
463*f9693befSMartin Matuska 
464*f9693befSMartin Matuska 	if (!have_lock)
465*f9693befSMartin Matuska 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
466*f9693befSMartin Matuska 
467*f9693befSMartin Matuska 	issued = 0;
468*f9693befSMartin Matuska 	for (int64_t blk = pf_start; blk < pf_end; blk++) {
469*f9693befSMartin Matuska 		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
4707877fdebSMatt Macy 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
4717877fdebSMatt Macy 		    dmu_zfetch_stream_done, zs);
472eda14cbcSMatt Macy 	}
473*f9693befSMartin Matuska 	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
4747877fdebSMatt Macy 		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
4757877fdebSMatt Macy 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
4767877fdebSMatt Macy 		    dmu_zfetch_stream_done, zs);
477eda14cbcSMatt Macy 	}
478*f9693befSMartin Matuska 
479eda14cbcSMatt Macy 	if (!have_lock)
480eda14cbcSMatt Macy 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
4817877fdebSMatt Macy 
4827877fdebSMatt Macy 	if (issued)
4837877fdebSMatt Macy 		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
484eda14cbcSMatt Macy }
485eda14cbcSMatt Macy 
486*f9693befSMartin Matuska void
487*f9693befSMartin Matuska dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
488*f9693befSMartin Matuska     boolean_t missed, boolean_t have_lock)
489*f9693befSMartin Matuska {
490*f9693befSMartin Matuska 	zstream_t *zs;
491*f9693befSMartin Matuska 
492*f9693befSMartin Matuska 	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
493*f9693befSMartin Matuska 	if (zs)
494*f9693befSMartin Matuska 		dmu_zfetch_run(zs, missed, have_lock);
495*f9693befSMartin Matuska }
496*f9693befSMartin Matuska 
497eda14cbcSMatt Macy /* BEGIN CSTYLED */
498eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
499eda14cbcSMatt Macy 	"Disable all ZFS prefetching");
500eda14cbcSMatt Macy 
501eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
502eda14cbcSMatt Macy 	"Max number of streams per zfetch");
503eda14cbcSMatt Macy 
504eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
505eda14cbcSMatt Macy 	"Min time before stream reclaim");
506eda14cbcSMatt Macy 
507eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
508180f8225SMatt Macy 	"Max bytes to prefetch per stream");
509180f8225SMatt Macy 
510180f8225SMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
511180f8225SMatt Macy 	"Max bytes to prefetch indirects for per stream");
512eda14cbcSMatt Macy 
513eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
514eda14cbcSMatt Macy 	"Number of bytes in a array_read");
515eda14cbcSMatt Macy /* END CSTYLED */
516