1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9eda14cbcSMatt Macy * or http://www.opensolaris.org/os/licensing. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23eda14cbcSMatt Macy * Use is subject to license terms. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 26eda14cbcSMatt Macy /* 27eda14cbcSMatt Macy * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 
28eda14cbcSMatt Macy */ 29eda14cbcSMatt Macy 30eda14cbcSMatt Macy #include <sys/zfs_context.h> 31eda14cbcSMatt Macy #include <sys/dnode.h> 32eda14cbcSMatt Macy #include <sys/dmu_objset.h> 33eda14cbcSMatt Macy #include <sys/dmu_zfetch.h> 34eda14cbcSMatt Macy #include <sys/dmu.h> 35eda14cbcSMatt Macy #include <sys/dbuf.h> 36eda14cbcSMatt Macy #include <sys/kstat.h> 37eda14cbcSMatt Macy 38eda14cbcSMatt Macy /* 39eda14cbcSMatt Macy * This tunable disables predictive prefetch. Note that it leaves "prescient" 40eda14cbcSMatt Macy * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 41eda14cbcSMatt Macy * prescient prefetch never issues i/os that end up not being needed, 42eda14cbcSMatt Macy * so it can't hurt performance. 43eda14cbcSMatt Macy */ 44eda14cbcSMatt Macy 45eda14cbcSMatt Macy int zfs_prefetch_disable = B_FALSE; 46eda14cbcSMatt Macy 47eda14cbcSMatt Macy /* max # of streams per zfetch */ 48eda14cbcSMatt Macy unsigned int zfetch_max_streams = 8; 49eda14cbcSMatt Macy /* min time before stream reclaim */ 50eda14cbcSMatt Macy unsigned int zfetch_min_sec_reap = 2; 51eda14cbcSMatt Macy /* max bytes to prefetch per stream (default 8MB) */ 52eda14cbcSMatt Macy unsigned int zfetch_max_distance = 8 * 1024 * 1024; 53eda14cbcSMatt Macy /* max bytes to prefetch indirects for per stream (default 64MB) */ 54eda14cbcSMatt Macy unsigned int zfetch_max_idistance = 64 * 1024 * 1024; 55eda14cbcSMatt Macy /* max number of bytes in an array_read in which we allow prefetching (1MB) */ 56eda14cbcSMatt Macy unsigned long zfetch_array_rd_sz = 1024 * 1024; 57eda14cbcSMatt Macy 58eda14cbcSMatt Macy typedef struct zfetch_stats { 59eda14cbcSMatt Macy kstat_named_t zfetchstat_hits; 60eda14cbcSMatt Macy kstat_named_t zfetchstat_misses; 61eda14cbcSMatt Macy kstat_named_t zfetchstat_max_streams; 627877fdebSMatt Macy kstat_named_t zfetchstat_io_issued; 63eda14cbcSMatt Macy } zfetch_stats_t; 64eda14cbcSMatt Macy 65eda14cbcSMatt Macy static zfetch_stats_t zfetch_stats = { 
66eda14cbcSMatt Macy { "hits", KSTAT_DATA_UINT64 }, 67eda14cbcSMatt Macy { "misses", KSTAT_DATA_UINT64 }, 68eda14cbcSMatt Macy { "max_streams", KSTAT_DATA_UINT64 }, 697877fdebSMatt Macy { "io_issued", KSTAT_DATA_UINT64 }, 70eda14cbcSMatt Macy }; 71eda14cbcSMatt Macy 72eda14cbcSMatt Macy #define ZFETCHSTAT_BUMP(stat) \ 737877fdebSMatt Macy atomic_inc_64(&zfetch_stats.stat.value.ui64) 747877fdebSMatt Macy #define ZFETCHSTAT_ADD(stat, val) \ 757877fdebSMatt Macy atomic_add_64(&zfetch_stats.stat.value.ui64, val) 767877fdebSMatt Macy #define ZFETCHSTAT_SET(stat, val) \ 777877fdebSMatt Macy zfetch_stats.stat.value.ui64 = val 787877fdebSMatt Macy #define ZFETCHSTAT_GET(stat) \ 797877fdebSMatt Macy zfetch_stats.stat.value.ui64 807877fdebSMatt Macy 81eda14cbcSMatt Macy 82eda14cbcSMatt Macy kstat_t *zfetch_ksp; 83eda14cbcSMatt Macy 84eda14cbcSMatt Macy void 85eda14cbcSMatt Macy zfetch_init(void) 86eda14cbcSMatt Macy { 87eda14cbcSMatt Macy zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 88eda14cbcSMatt Macy KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 89eda14cbcSMatt Macy KSTAT_FLAG_VIRTUAL); 90eda14cbcSMatt Macy 91eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 92eda14cbcSMatt Macy zfetch_ksp->ks_data = &zfetch_stats; 93eda14cbcSMatt Macy kstat_install(zfetch_ksp); 94eda14cbcSMatt Macy } 95eda14cbcSMatt Macy } 96eda14cbcSMatt Macy 97eda14cbcSMatt Macy void 98eda14cbcSMatt Macy zfetch_fini(void) 99eda14cbcSMatt Macy { 100eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 101eda14cbcSMatt Macy kstat_delete(zfetch_ksp); 102eda14cbcSMatt Macy zfetch_ksp = NULL; 103eda14cbcSMatt Macy } 104eda14cbcSMatt Macy } 105eda14cbcSMatt Macy 106eda14cbcSMatt Macy /* 107eda14cbcSMatt Macy * This takes a pointer to a zfetch structure and a dnode. It performs the 108eda14cbcSMatt Macy * necessary setup for the zfetch structure, grokking data from the 109eda14cbcSMatt Macy * associated dnode. 
/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL)
		return;
	zf->zf_dnode = dno;
	zf->zf_numstreams = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zs_node));

	mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
}

/*
 * Free a stream whose last reference has been dropped.  The stream must
 * already be unlinked from its zfetch's zf_stream list.
 */
static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
	ASSERT(!list_link_active(&zs->zs_node));
	kmem_free(zs, sizeof (*zs));
}

/*
 * Unlink a stream from zf_stream and drop the list's reference on it,
 * freeing the stream if that was the last reference.  Caller must hold
 * zf_lock.
 */
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(MUTEX_HELD(&zf->zf_lock));
	list_remove(&zf->zf_stream, zs);
	zf->zf_numstreams--;
	/* Publish the unlink before the refcount drop below. */
	membar_producer();
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}

/*
 * Clean-up state associated with a zfetch structure (e.g. destroy the
 * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
 */
void
dmu_zfetch_fini(zfetch_t *zf)
{
	zstream_t *zs;

	mutex_enter(&zf->zf_lock);
	while ((zs = list_head(&zf->zf_stream)) != NULL)
		dmu_zfetch_stream_remove(zf, zs);
	mutex_exit(&zf->zf_lock);
	list_destroy(&zf->zf_stream);
	mutex_destroy(&zf->zf_lock);

	zf->zf_dnode = NULL;
}

/*
 * If there aren't too many streams already, create a new stream.
 * The "blkid" argument is the next block that we expect this stream to access.
 * While we're here, clean up old streams (which haven't been
 * accessed for at least zfetch_min_sec_reap seconds).
 */
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
	zstream_t *zs_next;
	hrtime_t now = gethrtime();

	ASSERT(MUTEX_HELD(&zf->zf_lock));

	/*
	 * Clean up old streams.
	 */
	for (zstream_t *zs = list_head(&zf->zf_stream);
	    zs != NULL; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);
		/*
		 * Skip if still active.  1 -- zf_stream reference.
		 */
		if (zfs_refcount_count(&zs->zs_refs) != 1)
			continue;
		if (((now - zs->zs_atime) / NANOSEC) >
		    zfetch_min_sec_reap)
			dmu_zfetch_stream_remove(zf, zs);
	}

	/*
	 * The maximum number of streams is normally zfetch_max_streams,
	 * but for small files we lower it such that it's at least possible
	 * for all the streams to be non-overlapping.
	 *
	 * If we are already at the maximum number of streams for this file,
	 * even after removing old streams, then don't create this stream.
	 */
	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
	    zfetch_max_distance));
	if (zf->zf_numstreams >= max_streams) {
		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
		return;
	}

	/* Start all prefetch cursors at the expected next access. */
	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
	zs->zs_blkid = blkid;
	zs->zs_pf_blkid1 = blkid;
	zs->zs_pf_blkid = blkid;
	zs->zs_ipf_blkid1 = blkid;
	zs->zs_ipf_blkid = blkid;
	zs->zs_atime = now;
	zs->zs_fetch = zf;
	zs->zs_missed = B_FALSE;
	zfs_refcount_create(&zs->zs_callers);
	zfs_refcount_create(&zs->zs_refs);
	/* One reference for zf_stream. */
	zfs_refcount_add(&zs->zs_refs, NULL);
	zf->zf_numstreams++;
	list_insert_head(&zf->zf_stream, zs);
}

/*
 * Prefetch i/o completion callback: drop the i/o's reference on the
 * stream, freeing the stream if that was the last reference.  The
 * io_issued argument is not used here.
 */
static void
dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
{
	zstream_t *zs = arg;

	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}
/*
 * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
 * associates dnode access specified with blkid and nblks arguments with
 * prefetch stream, predicts further accesses based on that stats and returns
 * the stream pointer on success.  That pointer must later be passed to
 * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
 * release it.  dmu_zfetch() is a wrapper for simple cases when window between
 * prediction and prefetch initiation is not needed.
 * fetch_data argument specifies whether actual data blocks should be fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
	zstream_t *zs;
	int64_t pf_start, ipf_start;
	int64_t pf_ahead_blks, max_blks;
	int max_dist_blks, pf_nblks, ipf_nblks;
	uint64_t end_of_access_blkid, maxblkid;
	end_of_access_blkid = blkid + nblks;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;

	if (zfs_prefetch_disable)
		return (NULL);
	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return (NULL);

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (!have_lock && blkid == 0)
		return (NULL);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * A fast path for small files for which no prefetch will
	 * happen.
	 */
	maxblkid = zf->zf_dnode->dn_maxblkid;
	if (maxblkid < 2) {
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}
	mutex_enter(&zf->zf_lock);

	/*
	 * Find matching prefetch stream.  Depending on whether the accesses
	 * are block-aligned, first block of the new access may either follow
	 * the last block of the previous access, or be equal to it.
	 */
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			break;
		} else if (blkid + 1 == zs->zs_blkid) {
			blkid++;
			nblks--;
			break;
		}
	}

	/*
	 * If the file is ending, remove the matching stream if found.
	 * If not found then it is too late to create a new one now.
	 */
	if (end_of_access_blkid >= maxblkid) {
		if (zs != NULL)
			dmu_zfetch_stream_remove(zf, zs);
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	/* Exit if we already prefetched this block before. */
	if (nblks == 0) {
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	if (zs == NULL) {
		/*
		 * This access is not part of any existing stream.  Create
		 * a new stream for it.
		 */
		dmu_zfetch_stream_create(zf, end_of_access_blkid);
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		return (NULL);
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream.  Issue further prefetches for this stream.
	 *
	 * Normally, we start prefetching where we stopped
	 * prefetching last (zs_pf_blkid).  But when we get our first
	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
	 * want to prefetch the block we just accessed.  In this case,
	 * start just after the block we just accessed.
	 */
	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
	/* Never let the "already issued" cursors lag behind the reader. */
	if (zs->zs_pf_blkid1 < end_of_access_blkid)
		zs->zs_pf_blkid1 = end_of_access_blkid;
	if (zs->zs_ipf_blkid1 < end_of_access_blkid)
		zs->zs_ipf_blkid1 = end_of_access_blkid;

	/*
	 * Double our amount of prefetched data, but don't let the
	 * prefetch get further ahead than zfetch_max_distance.
	 */
	if (fetch_data) {
		max_dist_blks =
		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
		/*
		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
		 * want to now be double that, so read that amount again,
		 * plus the amount we are catching up by (i.e. the amount
		 * read just now).
		 */
		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
		pf_nblks = MIN(pf_ahead_blks, max_blks);
	} else {
		pf_nblks = 0;
	}

	zs->zs_pf_blkid = pf_start + pf_nblks;

	/*
	 * Do the same for indirects, starting from where we stopped last,
	 * or where we will stop reading data blocks (and the indirects
	 * that point to them).
	 */
	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
	/*
	 * We want to double our distance ahead of the data prefetch
	 * (or reader, if we are not prefetching data).  Previously, we
	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
	 * that amount again, plus the amount we are catching up by
	 * (i.e. the amount read now + the amount of data prefetched now).
	 */
	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
	max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid);
	ipf_nblks = MIN(pf_ahead_blks, max_blks);
	zs->zs_ipf_blkid = ipf_start + ipf_nblks;

	zs->zs_blkid = end_of_access_blkid;
	/* Protect the stream from reclamation. */
	zs->zs_atime = gethrtime();
	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);
	mutex_exit(&zf->zf_lock);

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	ZFETCHSTAT_BUMP(zfetchstat_hits);
	return (zs);
}

/*
 * Issue the prefetch i/os predicted by dmu_zfetch_prepare() and release
 * the stream reference taken there.  missed indicates whether the caller
 * had to perform a real (non-cached) read for this access.
 */
void
dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
{
	zfetch_t *zf = zs->zs_fetch;
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

	if (missed)
		zs->zs_missed = missed;

	/*
	 * Postpone the prefetch if there are more concurrent callers.
	 * It happens when multiple requests are waiting for the same
	 * indirect block.  The last one will run the prefetch for all.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
		/* Drop reference taken in dmu_zfetch_prepare(). */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	/* Collect the [start, end) ranges to issue under zf_lock. */
	mutex_enter(&zf->zf_lock);
	if (zs->zs_missed) {
		pf_start = zs->zs_pf_blkid1;
		pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
	} else {
		pf_start = pf_end = 0;
	}
	ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
	ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
	mutex_exit(&zf->zf_lock);
	ASSERT3S(pf_start, <=, pf_end);
	ASSERT3S(ipf_start, <=, ipf_end);

	/* Convert the indirect range from data blkids to L1 blkids. */
	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
	ASSERT3S(ipf_start, <=, ipf_end);
	issued = pf_end - pf_start + ipf_end - ipf_start;
	if (issued > 1) {
		/* More references on top of taken in dmu_zfetch_prepare(). */
		zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
		    dmu_zfetch_stream_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
		    dmu_zfetch_stream_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

/*
 * Convenience wrapper: predict and immediately issue the prefetch.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zs, missed, have_lock);
}

/* BEGIN CSTYLED */
/* Tunables exported through the ZFS module parameter interface. */
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
	"Number of bytes in a array_read");
/* END CSTYLED */