/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
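
/*
 * Illustrative sketch, not part of the ARC implementation: one way to
 * free an approximate amount of space from a tail-ordered list of
 * variable-sized, evictable blocks.  The names (evictable, wanted,
 * evict_block) are stand-ins; the real logic lives in arc_evict()
 * below.
 *
 *	uint64_t freed = 0;
 *	arc_buf_hdr_t *ab, *ab_prev;
 *	for (ab = list_tail(evictable); ab != NULL && freed < wanted;
 *	    ab = ab_prev) {
 *		ab_prev = list_prev(evictable, ab);
 *		freed += ab->b_size;
 *		evict_block(ab);
 *	}
 */
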
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */
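
/*
 * Illustrative sketch, assumed caller, not part of this file: the
 * lock-ordering rule above in practice.  While an arc list lock is
 * held, a hash table lock may only be acquired with mutex_tryenter();
 * on failure the buffer is skipped rather than risking a deadlock
 * against a thread holding the locks in the other order.
 *
 *	mutex_enter(&state->arcs_mtx);
 *	for (ab = list_tail(list); ab != NULL; ab = list_prev(list, ab)) {
 *		if (!mutex_tryenter(HDR_LOCK(ab)))
 *			continue;		(skip; never block here)
 *		...
 *		mutex_exit(HDR_LOCK(ab));
 *	}
 *	mutex_exit(&state->arcs_mtx);
 */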

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */
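
/*
 * Illustrative lifecycle, informal and assumed (see arc_access(),
 * declared below and defined later in this file, for the real logic):
 *
 *	first read miss   ->  buffer cached in arc_mru
 *	hit while cached  ->  promoted to arc_mfu
 *	evicted           ->  header kept in arc_mru_ghost / arc_mfu_ghost
 *	ghost-list hit    ->  arc_p target adapted, data re-read into arc_mfu
 */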

typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

typedef struct arc_stats {
	kstat_named_t	arcstat_hits;
	kstat_named_t	arcstat_misses;
	kstat_named_t	arcstat_demand_data_hits;
	kstat_named_t	arcstat_demand_data_misses;
	kstat_named_t	arcstat_demand_metadata_hits;
	kstat_named_t	arcstat_demand_metadata_misses;
	kstat_named_t	arcstat_prefetch_data_hits;
	kstat_named_t	arcstat_prefetch_data_misses;
	kstat_named_t	arcstat_prefetch_metadata_hits;
	kstat_named_t	arcstat_prefetch_metadata_misses;
	kstat_named_t	arcstat_mru_hits;
	kstat_named_t	arcstat_mru_ghost_hits;
	kstat_named_t	arcstat_mfu_hits;
	kstat_named_t	arcstat_mfu_ghost_hits;
	kstat_named_t	arcstat_deleted;
	kstat_named_t	arcstat_recycle_miss;
	kstat_named_t	arcstat_mutex_miss;
	kstat_named_t	arcstat_evict_skip;
	kstat_named_t	arcstat_hash_elements;
	kstat_named_t	arcstat_hash_elements_max;
	kstat_named_t	arcstat_hash_collisions;
	kstat_named_t	arcstat_hash_chains;
	kstat_named_t	arcstat_hash_chain_max;
	kstat_named_t	arcstat_p;
	kstat_named_t	arcstat_c;
	kstat_named_t	arcstat_c_min;
	kstat_named_t	arcstat_c_max;
	kstat_named_t	arcstat_size;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 }
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val));

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
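
/*
 * Example, mirroring the call in arc_buf_add_ref() below: a demand
 * (non-prefetch) hit on a metadata buffer makes cond1 true and cond2
 * false, so the macro expands to
 * ARCSTAT_BUMP(arcstat_demand_metadata_hits):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 */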

kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_meta_used;
static uint64_t		arc_meta_limit;
static uint64_t		arc_meta_max = 0;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_byteswap_func_t	*acb_byteswap;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	arc_callback_t		*b_acb;
	kcondvar_t		b_cv;

	/* immutable */
	arc_buf_contents_t	b_type;
	uint64_t		b_size;
	spa_t			*b_spa;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, int64_t bytes);

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	64

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(buf) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))

uint64_t zfs_crc64_table[256];

static uint64_t
buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
{
	uintptr_t spav = (uintptr_t)spa;
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spav>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static arc_buf_hdr_t *
buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}
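
/*
 * Illustrative sketch, assumed caller, not part of this file: the
 * find-or-insert pattern the routines above support.  buf_hash_find()
 * returns with the bucket lock held on success; buf_hash_insert()
 * returns an already-present equal header instead of inserting one.
 * new_hdr is a stand-in for a freshly constructed header.
 *
 *	kmutex_t *lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa, dva, birth, &lock);
 *	if (hdr == NULL) {
 *		arc_buf_hdr_t *exists = buf_hash_insert(new_hdr, &lock);
 *		ASSERT(exists == NULL);
 *	}
 *	mutex_exit(lock);
 */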

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
	mutex_destroy(&buf->b_freeze_lock);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL ||
	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

static void
arc_cksum_compute(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_thaw(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	if (buf->b_hdr->b_state != arc_anon)
		panic("modifying non-anon buffer!");
	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
		panic("modifying buffer while i/o in progress!");
	arc_cksum_verify(buf);
	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_state == arc_anon);
	arc_cksum_compute(buf);
}

static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc_anon)) {
		uint64_t delta = ab->b_size * ab->b_datacnt;
		list_t *list = &ab->b_state->arcs_list[ab->b_type];
		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
		mutex_enter(&ab->b_state->arcs_mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(list, ab);
		if (GHOST_STATE(ab->b_state)) {
			ASSERT3U(ab->b_datacnt, ==, 0);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(*size, >=, delta);
		atomic_add_64(size, -delta);
		mutex_exit(&ab->b_state->arcs_mtx);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;
	arc_state_t *state = ab->b_state;

	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (state != arc_anon)) {
		uint64_t *size = &state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
		mutex_enter(&state->arcs_mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&state->arcs_list[ab->b_type], ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(size, ab->b_size * ab->b_datacnt);
		mutex_exit(&state->arcs_mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int64_t refcnt = refcount_count(&ab->b_refcnt);
	uint64_t from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));

	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon) {
			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
			uint64_t *size = &old_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&old_state->arcs_mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->arcs_list[ab->b_type], ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-null datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(*size, >=, from_delta);
			atomic_add_64(size, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->arcs_mtx);
		}
		if (new_state != arc_anon) {
			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
			uint64_t *size = &new_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&new_state->arcs_mtx);

			list_insert_head(&new_state->arcs_list[ab->b_type], ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(size, to_delta);

			if (use_mutex)
				mutex_exit(&new_state->arcs_mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	if (new_state == arc_anon && old_state != arc_anon) {
		buf_hash_remove(ab);
	}

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->arcs_size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->arcs_size, >=, from_delta);
		atomic_add_64(&old_state->arcs_size, -from_delta);
	}
	ab->b_state = new_state;
}

void
arc_space_consume(uint64_t space)
{
	atomic_add_64(&arc_meta_used, space);
	atomic_add_64(&arc_size, space);
}

void
arc_space_return(uint64_t space)
{
	ASSERT(arc_meta_used >= space);
	if (arc_meta_max < arc_meta_used)
		arc_meta_max = arc_meta_used;
	atomic_add_64(&arc_meta_used, -space);
	ASSERT(arc_size >= space);
	atomic_add_64(&arc_size, -space);
}

void *
arc_data_buf_alloc(uint64_t size)
{
	if (arc_evict_needed(ARC_BUFC_DATA))
		cv_signal(&arc_reclaim_thr_cv);
	atomic_add_64(&arc_size, size);
	return (zio_data_buf_alloc(size));
}

void
arc_data_buf_free(void *buf, uint64_t size)
{
	zio_data_buf_free(buf, size);
	ASSERT(arc_size >= size);
	atomic_add_64(&arc_size, -size);
}

arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_type = type;
	hdr->b_spa = spa;
	hdr->b_state = arc_anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	return (buf);
}

static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);
	hdr->b_datacnt += 1;
	return (buf);
}
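
/*
 * Illustrative usage, assumed caller, not part of this file:
 * allocating an anonymous buffer and releasing the hold.  The tag
 * passed to arc_buf_alloc() must be the same tag later used to drop
 * the reference (FTAG is the usual convention in ZFS callers); src
 * is a stand-in for caller data.
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, size, FTAG, ARC_BUFC_DATA);
 *	bcopy(src, buf->b_data, size);
 *	arc_buf_free(buf, FTAG);
 */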

void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is currently being evicted via
	 * arc_do_user_evicts().
	 */
	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		mutex_exit(&arc_eviction_mtx);
		return;
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);
	if (buf->b_data == NULL) {
		/*
		 * This buffer is evicted.
		 */
		mutex_exit(hash_lock);
		return;
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
	add_reference(hdr, hash_lock, tag);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
}

static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				zio_buf_free(buf->b_data, size);
				arc_space_return(size);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				zio_data_buf_free(buf->b_data, size);
				atomic_add_64(&arc_size, -size);
			}
		}
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			mutex_enter(&arc_eviction_mtx);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
		}
	}
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0) {
			ASSERT(HDR_IO_ERROR(hdr));
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			arc_hdr_destroy(hdr);
		}
	}
}

int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc_anon) {
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	ASSERT(hdr->b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
 * If the recycle flag is set, then attempt to "recycle" a buffer:
 * - look for a buffer to evict that is `bytes' long.
 * - return the data block from this buffer rather than freeing it.
 * This flag is used by callers that are trying to make space for a
 * new buffer in a full arc cache.
 */
static void *
arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
    arc_buf_contents_t type)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
	arc_buf_hdr_t *ab, *ab_prev = NULL;
	list_t *list = &state->arcs_list[type];
	kmutex_t *hash_lock;
	boolean_t have_lock;
	void *stolen = NULL;

	ASSERT(state == arc_mru || state == arc_mfu);

	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	mutex_enter(&state->arcs_mtx);
	mutex_enter(&evicted_state->arcs_mtx);

	for (ab = list_tail(list); ab; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		/* prefetch buffers have a minimum lifespan */
		if (HDR_IO_IN_PROGRESS(ab) ||
		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
			skipped++;
			continue;
		}
		/* "lookahead" for better eviction candidate */
		if (recycle && ab->b_size != bytes &&
		    ab_prev && ab_prev->b_size == bytes)
			continue;
		hash_lock = HDR_LOCK(ab);
		have_lock = MUTEX_HELD(hash_lock);
		if (have_lock || mutex_tryenter(hash_lock)) {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
			ASSERT(ab->b_datacnt > 0);
			while (ab->b_buf) {
				arc_buf_t *buf = ab->b_buf;
				if (buf->b_data) {
					bytes_evicted += ab->b_size;
					if (recycle && ab->b_type == type &&
					    ab->b_size == bytes) {
						stolen = buf->b_data;
						recycle = FALSE;
					}
				}
				if (buf->b_efunc) {
					mutex_enter(&arc_eviction_mtx);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, FALSE);
					ab->b_buf = buf->b_next;
					buf->b_hdr = &arc_eviction_hdr;
					buf->b_next = arc_eviction_list;
					arc_eviction_list = buf;
					mutex_exit(&arc_eviction_mtx);
				} else {
					arc_buf_destroy(buf,
					    buf->b_data == stolen, TRUE);
				}
			}
			ASSERT(ab->b_datacnt == 0);
			arc_change_state(evicted_state, ab, hash_lock);
			ASSERT(HDR_IN_HASH_TABLE(ab));
			ab->b_flags = ARC_IN_HASH_TABLE;
			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
			if (!have_lock)
				mutex_exit(hash_lock);
			if (bytes >= 0 && bytes_evicted >= bytes)
				break;
		} else {
			missed += 1;
		}
	}

	mutex_exit(&evicted_state->arcs_mtx);
	mutex_exit(&state->arcs_mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %x",
		    (longlong_t)bytes_evicted, state);

	if (skipped)
		ARCSTAT_INCR(arcstat_evict_skip, skipped);

	if (missed)
		ARCSTAT_INCR(arcstat_mutex_miss, missed);

	/*
	 * We have just evicted some data into the ghost state, make
	 * sure we also adjust the ghost state size if necessary.
	 */
	if (arc_no_grow &&
	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
		    arc_mru_ghost->arcs_size - arc_c;

		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
			int64_t todelete =
			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
			arc_evict_ghost(arc_mru_ghost, todelete);
		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
			    arc_mru_ghost->arcs_size +
			    arc_mfu_ghost->arcs_size - arc_c);
			arc_evict_ghost(arc_mfu_ghost, todelete);
		}
	}

	return (stolen);
}

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 */
static void
arc_evict_ghost(arc_state_t *state, int64_t bytes)
{
	arc_buf_hdr_t *ab, *ab_prev;
	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint64_t bufs_skipped = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(list); ab; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(ab));
			ASSERT(ab->b_buf == NULL);
			arc_change_state(arc_anon, ab, hash_lock);
			mutex_exit(hash_lock);
			ARCSTAT_BUMP(arcstat_deleted);
			bytes_deleted += ab->b_size;
			arc_hdr_destroy(ab);
			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				mutex_exit(&state->arcs_mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->arcs_mtx);

	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
	    (bytes < 0 || bytes_deleted < bytes)) {
		list = &state->arcs_list[ARC_BUFC_METADATA];
		goto top;
	}

	if (bufs_skipped) {
		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

static void
arc_adjust(void)
{
	int64_t top_sz, mru_over, arc_over, todelete;

	top_sz = arc_anon->arcs_size + arc_mru->arcs_size;

	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
		int64_t toevict =
		    MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
		(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_DATA);
		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
	}

	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
		int64_t toevict =
		    MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
		(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_METADATA);
		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
	}

	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;

	if (mru_over > 0) {
		if (arc_mru_ghost->arcs_size > 0) {
			todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
			arc_evict_ghost(arc_mru_ghost, todelete);
		}
	}

	if ((arc_over = arc_size - arc_c) > 0) {
		int64_t tbl_over;

		if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
			int64_t toevict =
			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
			(void) arc_evict(arc_mfu, toevict, FALSE,
			    ARC_BUFC_DATA);
			arc_over = arc_size - arc_c;
		}

		if (arc_over > 0 &&
		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
			int64_t toevict =
			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
			    arc_over);
			(void) arc_evict(arc_mfu, toevict, FALSE,
			    ARC_BUFC_METADATA);
		}

		tbl_over = arc_size + arc_mru_ghost->arcs_size +
		    arc_mfu_ghost->arcs_size - arc_c * 2;

		if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
			todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
			arc_evict_ghost(arc_mfu_ghost, todelete);
		}
	}
}

static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_eviction_mtx);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		buf->b_hdr = NULL;
		mutex_exit(&arc_eviction_mtx);

		if (buf->b_efunc != NULL)
			VERIFY(buf->b_efunc(buf) == 0);

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		mutex_enter(&arc_eviction_mtx);
	}
	mutex_exit(&arc_eviction_mtx);
}
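
/*
 * Illustrative sketch, assumed consumer, not part of this file: the
 * eviction-callback contract described at the top of this file.  The
 * callback is invoked by arc_do_user_evicts() with no locks held, must
 * return 0 (the caller VERIFYs this), and must tolerate the buffer's
 * data already being gone.  my_evict_func and my_private_t are
 * hypothetical names.
 *
 *	static int
 *	my_evict_func(arc_buf_t *buf)
 *	{
 *		my_private_t *priv = buf->b_private;
 *		(release priv's hold; buf->b_data may already be NULL)
 *		return (0);
 *	}
 */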
1429789Sahrens */
1430789Sahrens void
1431789Sahrens arc_flush(void)
1432789Sahrens {
14334309Smaybee while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA]))
14344309Smaybee (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_DATA);
14354309Smaybee while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA]))
14364309Smaybee (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_METADATA);
14374309Smaybee while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA]))
14384309Smaybee (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_DATA);
14394309Smaybee while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA]))
14404309Smaybee (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_METADATA);
1441789Sahrens 
14423403Sbmc arc_evict_ghost(arc_mru_ghost, -1);
14433403Sbmc arc_evict_ghost(arc_mfu_ghost, -1);
14441544Seschrock 
14451544Seschrock mutex_enter(&arc_reclaim_thr_lock);
14461544Seschrock arc_do_user_evicts();
14471544Seschrock mutex_exit(&arc_reclaim_thr_lock);
14481544Seschrock ASSERT(arc_eviction_list == NULL);
1449789Sahrens }
1450789Sahrens 
14513158Smaybee int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
14522391Smaybee 
1453789Sahrens void
14543158Smaybee arc_shrink(void)
1455789Sahrens {
14563403Sbmc if (arc_c > arc_c_min) {
14573158Smaybee uint64_t to_free;
1458789Sahrens 
14592048Sstans #ifdef _KERNEL
14603403Sbmc to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
14612048Sstans #else
14623403Sbmc to_free = arc_c >> arc_shrink_shift;
14632048Sstans #endif
14643403Sbmc if (arc_c > arc_c_min + to_free)
14653403Sbmc atomic_add_64(&arc_c, -to_free);
14663158Smaybee else
14673403Sbmc arc_c = arc_c_min;
14682048Sstans 
14693403Sbmc atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
14703403Sbmc if (arc_c > arc_size)
14713403Sbmc arc_c = MAX(arc_size, arc_c_min);
14723403Sbmc if (arc_p > arc_c)
14733403Sbmc arc_p = (arc_c >> 1);
14743403Sbmc ASSERT(arc_c >= arc_c_min);
14753403Sbmc ASSERT((int64_t)arc_p >= 0);
14763158Smaybee }
1477789Sahrens 
14783403Sbmc if (arc_size > arc_c)
14793158Smaybee arc_adjust();
1480789Sahrens }
1481789Sahrens 
1482789Sahrens static int
1483789Sahrens arc_reclaim_needed(void)
1484789Sahrens {
1485789Sahrens uint64_t extra;
1486789Sahrens 
1487789Sahrens #ifdef _KERNEL
14882048Sstans 
14892048Sstans if (needfree)
14902048Sstans return (1);
14912048Sstans 
1492789Sahrens /*
1493789Sahrens * take 'desfree' extra pages, so we reclaim sooner, rather than later
1494789Sahrens */
1495789Sahrens extra = desfree;
1496789Sahrens 
1497789Sahrens /*
1498789Sahrens * check that we're out of range of the pageout scanner. It starts to
1499789Sahrens * schedule paging if freemem is less than lotsfree and needfree.
1500789Sahrens * lotsfree is the high-water mark for pageout, and needfree is the
1501789Sahrens * number of needed free pages. We add extra pages here to make sure
1502789Sahrens * the scanner doesn't start up while we're freeing memory.
1503789Sahrens */
1504789Sahrens if (freemem < lotsfree + needfree + extra)
1505789Sahrens return (1);
1506789Sahrens 
1507789Sahrens /*
1508789Sahrens * check to make sure that swapfs has enough space so that anon
1509789Sahrens * reservations can still succeed. anon_resvmem() checks that the
1510789Sahrens * availrmem is greater than swapfs_minfree plus the number of reserved
1511789Sahrens * swap pages. We also add a bit of extra here just to prevent
1512789Sahrens * circumstances from getting really dire.
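 *
 * Worked example (hypothetical page counts, not tuned defaults):
 * with swapfs_minfree = 1000, swapfs_reserve = 500 and
 * extra = desfree = 2000, the test below requests a reclaim as soon
 * as availrmem drops under 3500 pages.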
1513789Sahrens */
1514789Sahrens if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1515789Sahrens return (1);
1516789Sahrens 
15171936Smaybee #if defined(__i386)
1518789Sahrens /*
1519789Sahrens * If we're on an i386 platform, it's possible that we'll exhaust the
1520789Sahrens * kernel heap space before we ever run out of available physical
1521789Sahrens * memory. Most checks of the size of the heap_area compare against
1522789Sahrens * tune.t_minarmem, which is the minimum available real memory that we
1523789Sahrens * can have in the system. However, this is generally fixed at 25 pages
1524789Sahrens * which is so low that it's useless. In this comparison, we seek to
1525789Sahrens * calculate the total heap-size, and reclaim if more than 3/4ths of the
1526789Sahrens * heap is allocated. (Or, in the calculation, if less than 1/4th is
1527789Sahrens * free)
1528789Sahrens */
1529789Sahrens if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1530789Sahrens (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1531789Sahrens return (1);
1532789Sahrens #endif
1533789Sahrens 
1534789Sahrens #else
1535789Sahrens if (spa_get_random(100) == 0)
1536789Sahrens return (1);
1537789Sahrens #endif
1538789Sahrens return (0);
1539789Sahrens }
1540789Sahrens 
1541789Sahrens static void
1542789Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1543789Sahrens {
1544789Sahrens size_t i;
1545789Sahrens kmem_cache_t *prev_cache = NULL;
15463290Sjohansen kmem_cache_t *prev_data_cache = NULL;
1547789Sahrens extern kmem_cache_t *zio_buf_cache[];
15483290Sjohansen extern kmem_cache_t *zio_data_buf_cache[];
1549789Sahrens 
15501484Sek110237 #ifdef _KERNEL
15514309Smaybee if (arc_meta_used >= arc_meta_limit) {
15524309Smaybee /*
15534309Smaybee * We are exceeding our meta-data cache limit.
15544309Smaybee * Purge some DNLC entries to release holds on meta-data.
15554309Smaybee */
15564309Smaybee dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
15574309Smaybee }
15581936Smaybee #if defined(__i386)
15591936Smaybee /*
15601936Smaybee * Reclaim unused memory from all kmem caches.
15611936Smaybee */
15621936Smaybee kmem_reap();
15631936Smaybee #endif
15641484Sek110237 #endif
15651484Sek110237 
1566789Sahrens /*
15671544Seschrock * An aggressive reclamation will shrink the cache size as well as
15681544Seschrock * reap free buffers from the arc kmem caches.
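 *
 * For example, with the default arc_shrink_shift of 5 an aggressive
 * pass first calls arc_shrink(), trimming the target size arc_c by
 * about arc_c >> 5 (roughly 3%), before reaping the kmem caches; a
 * conservative pass only reaps the caches.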
1569789Sahrens */
1570789Sahrens if (strat == ARC_RECLAIM_AGGR)
15713158Smaybee arc_shrink();
1572789Sahrens 
1573789Sahrens for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1574789Sahrens if (zio_buf_cache[i] != prev_cache) {
1575789Sahrens prev_cache = zio_buf_cache[i];
1576789Sahrens kmem_cache_reap_now(zio_buf_cache[i]);
1577789Sahrens }
15783290Sjohansen if (zio_data_buf_cache[i] != prev_data_cache) {
15793290Sjohansen prev_data_cache = zio_data_buf_cache[i];
15803290Sjohansen kmem_cache_reap_now(zio_data_buf_cache[i]);
15813290Sjohansen }
1582789Sahrens }
15831544Seschrock kmem_cache_reap_now(buf_cache);
15841544Seschrock kmem_cache_reap_now(hdr_cache);
1585789Sahrens }
1586789Sahrens 
1587789Sahrens static void
1588789Sahrens arc_reclaim_thread(void)
1589789Sahrens {
1590789Sahrens clock_t growtime = 0;
1591789Sahrens arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
1592789Sahrens callb_cpr_t cpr;
1593789Sahrens 
1594789Sahrens CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1595789Sahrens 
1596789Sahrens mutex_enter(&arc_reclaim_thr_lock);
1597789Sahrens while (arc_thread_exit == 0) {
1598789Sahrens if (arc_reclaim_needed()) {
1599789Sahrens 
16003403Sbmc if (arc_no_grow) {
1601789Sahrens if (last_reclaim == ARC_RECLAIM_CONS) {
1602789Sahrens last_reclaim = ARC_RECLAIM_AGGR;
1603789Sahrens } else {
1604789Sahrens last_reclaim = ARC_RECLAIM_CONS;
1605789Sahrens }
1606789Sahrens } else {
16073403Sbmc arc_no_grow = TRUE;
1608789Sahrens last_reclaim = ARC_RECLAIM_AGGR;
1609789Sahrens membar_producer();
1610789Sahrens }
1611789Sahrens 
1612789Sahrens /* reset the growth delay for every reclaim */
1613789Sahrens growtime = lbolt + (arc_grow_retry * hz);
1614789Sahrens 
1615789Sahrens arc_kmem_reap_now(last_reclaim);
1616789Sahrens 
16174309Smaybee } else if (arc_no_grow && lbolt >= growtime) {
16183403Sbmc arc_no_grow = FALSE;
1619789Sahrens }
1620789Sahrens 
16213403Sbmc if (2 * arc_c < arc_size +
16223403Sbmc arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
16233298Smaybee arc_adjust();
16243298Smaybee 
16251544Seschrock if (arc_eviction_list != NULL)
16261544Seschrock arc_do_user_evicts();
16271544Seschrock 
1628789Sahrens /* block until needed, or one second, whichever is shorter */
1629789Sahrens CALLB_CPR_SAFE_BEGIN(&cpr);
1630789Sahrens (void) cv_timedwait(&arc_reclaim_thr_cv,
1631789Sahrens &arc_reclaim_thr_lock, (lbolt + hz));
1632789Sahrens CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1633789Sahrens }
1634789Sahrens 
1635789Sahrens arc_thread_exit = 0;
1636789Sahrens cv_broadcast(&arc_reclaim_thr_cv);
1637789Sahrens CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
1638789Sahrens thread_exit();
1639789Sahrens }
1640789Sahrens 
16411544Seschrock /*
16421544Seschrock * Adapt arc info given the number of bytes we are trying to add and
16431544Seschrock * the state that we are coming from. This function is only called
16441544Seschrock * when we are adding new content to the cache.
16451544Seschrock */
1646789Sahrens static void
16471544Seschrock arc_adapt(int bytes, arc_state_t *state)
1648789Sahrens {
16491544Seschrock int mult;
16501544Seschrock 
16511544Seschrock ASSERT(bytes > 0);
1652789Sahrens /*
16531544Seschrock * Adapt the target size of the MRU list:
16541544Seschrock * - if we just hit in the MRU ghost list, then increase
16551544Seschrock * the target size of the MRU list.
16561544Seschrock * - if we just hit in the MFU ghost list, then increase 16571544Seschrock * the target size of the MFU list by decreasing the 16581544Seschrock * target size of the MRU list. 1659789Sahrens */ 16603403Sbmc if (state == arc_mru_ghost) { 16613403Sbmc mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 16623403Sbmc 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 16631544Seschrock 16643403Sbmc arc_p = MIN(arc_c, arc_p + bytes * mult); 16653403Sbmc } else if (state == arc_mfu_ghost) { 16663403Sbmc mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 16673403Sbmc 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 16681544Seschrock 16693403Sbmc arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 16701544Seschrock } 16713403Sbmc ASSERT((int64_t)arc_p >= 0); 1672789Sahrens 1673789Sahrens if (arc_reclaim_needed()) { 1674789Sahrens cv_signal(&arc_reclaim_thr_cv); 1675789Sahrens return; 1676789Sahrens } 1677789Sahrens 16783403Sbmc if (arc_no_grow) 1679789Sahrens return; 1680789Sahrens 16813403Sbmc if (arc_c >= arc_c_max) 16821544Seschrock return; 16831544Seschrock 1684789Sahrens /* 16851544Seschrock * If we're within (2 * maxblocksize) bytes of the target 16861544Seschrock * cache size, increment the target cache size 1687789Sahrens */ 16883403Sbmc if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 16893403Sbmc atomic_add_64(&arc_c, (int64_t)bytes); 16903403Sbmc if (arc_c > arc_c_max) 16913403Sbmc arc_c = arc_c_max; 16923403Sbmc else if (state == arc_anon) 16933403Sbmc atomic_add_64(&arc_p, (int64_t)bytes); 16943403Sbmc if (arc_p > arc_c) 16953403Sbmc arc_p = arc_c; 1696789Sahrens } 16973403Sbmc ASSERT((int64_t)arc_p >= 0); 1698789Sahrens } 1699789Sahrens 1700789Sahrens /* 17011544Seschrock * Check if the cache has reached its limits and eviction is required 17021544Seschrock * prior to insert. 1703789Sahrens */ 1704789Sahrens static int 17054309Smaybee arc_evict_needed(arc_buf_contents_t type) 1706789Sahrens { 17074309Smaybee if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 17084309Smaybee return (1); 17094309Smaybee 17104309Smaybee #ifdef _KERNEL 17114309Smaybee /* 17124309Smaybee * If zio data pages are being allocated out of a separate heap segment, 17134309Smaybee * then enforce that the size of available vmem for this area remains 17144309Smaybee * above about 1/32nd free. 17154309Smaybee */ 17164309Smaybee if (type == ARC_BUFC_DATA && zio_arena != NULL && 17174309Smaybee vmem_size(zio_arena, VMEM_FREE) < 17184309Smaybee (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 17194309Smaybee return (1); 17204309Smaybee #endif 17214309Smaybee 1722789Sahrens if (arc_reclaim_needed()) 1723789Sahrens return (1); 1724789Sahrens 17253403Sbmc return (arc_size > arc_c); 1726789Sahrens } 1727789Sahrens 1728789Sahrens /* 17292688Smaybee * The buffer, supplied as the first argument, needs a data block. 17302688Smaybee * So, if we are at cache max, determine which cache should be victimized. 17312688Smaybee * We have the following cases: 1732789Sahrens * 17333403Sbmc * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1734789Sahrens * In this situation if we're out of space, but the resident size of the MFU is 1735789Sahrens * under the limit, victimize the MFU cache to satisfy this insertion request. 1736789Sahrens * 17373403Sbmc * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 1738789Sahrens * Here, we've used up all of the available space for the MRU, so we need to 1739789Sahrens * evict from our own cache instead. 
Evict from the set of resident MRU 1740789Sahrens * entries. 1741789Sahrens * 17423403Sbmc * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 1743789Sahrens * c minus p represents the MFU space in the cache, since p is the size of the 1744789Sahrens * cache that is dedicated to the MRU. In this situation there's still space on 1745789Sahrens * the MFU side, so the MRU side needs to be victimized. 1746789Sahrens * 17473403Sbmc * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 1748789Sahrens * MFU's resident set is consuming more space than it has been allotted. In 1749789Sahrens * this situation, we must victimize our own cache, the MFU, for this insertion. 1750789Sahrens */ 1751789Sahrens static void 17522688Smaybee arc_get_data_buf(arc_buf_t *buf) 1753789Sahrens { 17543290Sjohansen arc_state_t *state = buf->b_hdr->b_state; 17553290Sjohansen uint64_t size = buf->b_hdr->b_size; 17563290Sjohansen arc_buf_contents_t type = buf->b_hdr->b_type; 17572688Smaybee 17582688Smaybee arc_adapt(size, state); 1759789Sahrens 17602688Smaybee /* 17612688Smaybee * We have not yet reached cache maximum size, 17622688Smaybee * just allocate a new buffer. 17632688Smaybee */ 17644309Smaybee if (!arc_evict_needed(type)) { 17653290Sjohansen if (type == ARC_BUFC_METADATA) { 17663290Sjohansen buf->b_data = zio_buf_alloc(size); 17674309Smaybee arc_space_consume(size); 17683290Sjohansen } else { 17693290Sjohansen ASSERT(type == ARC_BUFC_DATA); 17703290Sjohansen buf->b_data = zio_data_buf_alloc(size); 17714309Smaybee atomic_add_64(&arc_size, size); 17723290Sjohansen } 17732688Smaybee goto out; 17742688Smaybee } 17752688Smaybee 17762688Smaybee /* 17772688Smaybee * If we are prefetching from the mfu ghost list, this buffer 17782688Smaybee * will end up on the mru list; so steal space from there. 17792688Smaybee */ 17803403Sbmc if (state == arc_mfu_ghost) 17813403Sbmc state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 17823403Sbmc else if (state == arc_mru_ghost) 17833403Sbmc state = arc_mru; 1784789Sahrens 17853403Sbmc if (state == arc_mru || state == arc_anon) { 17863403Sbmc uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 17874309Smaybee state = (arc_mfu->arcs_lsize[type] > 0 && 17884309Smaybee arc_p > mru_used) ? arc_mfu : arc_mru; 1789789Sahrens } else { 17902688Smaybee /* MFU cases */ 17913403Sbmc uint64_t mfu_space = arc_c - arc_p; 17924309Smaybee state = (arc_mru->arcs_lsize[type] > 0 && 17934309Smaybee mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 17942688Smaybee } 17953290Sjohansen if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { 17963290Sjohansen if (type == ARC_BUFC_METADATA) { 17973290Sjohansen buf->b_data = zio_buf_alloc(size); 17984309Smaybee arc_space_consume(size); 17993290Sjohansen } else { 18003290Sjohansen ASSERT(type == ARC_BUFC_DATA); 18013290Sjohansen buf->b_data = zio_data_buf_alloc(size); 18024309Smaybee atomic_add_64(&arc_size, size); 18033290Sjohansen } 18043403Sbmc ARCSTAT_BUMP(arcstat_recycle_miss); 18052688Smaybee } 18062688Smaybee ASSERT(buf->b_data != NULL); 18072688Smaybee out: 18082688Smaybee /* 18092688Smaybee * Update the state size. Note that ghost states have a 18102688Smaybee * "ghost size" and so don't need to be updated. 
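 *
 * Put differently: arcs_size counts every resident byte in the
 * state, while arcs_lsize[type] counts only evictable bytes, which
 * is why the latter is bumped solely when the header is linked on a
 * list and holds no active references.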
18112688Smaybee */ 18122688Smaybee if (!GHOST_STATE(buf->b_hdr->b_state)) { 18132688Smaybee arc_buf_hdr_t *hdr = buf->b_hdr; 18142688Smaybee 18153403Sbmc atomic_add_64(&hdr->b_state->arcs_size, size); 18162688Smaybee if (list_link_active(&hdr->b_arc_node)) { 18172688Smaybee ASSERT(refcount_is_zero(&hdr->b_refcnt)); 18184309Smaybee atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 1819789Sahrens } 18203298Smaybee /* 18213298Smaybee * If we are growing the cache, and we are adding anonymous 18223403Sbmc * data, and we have outgrown arc_p, update arc_p 18233298Smaybee */ 18243403Sbmc if (arc_size < arc_c && hdr->b_state == arc_anon && 18253403Sbmc arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 18263403Sbmc arc_p = MIN(arc_c, arc_p + size); 1827789Sahrens } 1828789Sahrens } 1829789Sahrens 1830789Sahrens /* 1831789Sahrens * This routine is called whenever a buffer is accessed. 18321544Seschrock * NOTE: the hash lock is dropped in this function. 1833789Sahrens */ 1834789Sahrens static void 18352688Smaybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1836789Sahrens { 1837789Sahrens ASSERT(MUTEX_HELD(hash_lock)); 1838789Sahrens 18393403Sbmc if (buf->b_state == arc_anon) { 1840789Sahrens /* 1841789Sahrens * This buffer is not in the cache, and does not 1842789Sahrens * appear in our "ghost" list. Add the new buffer 1843789Sahrens * to the MRU state. 1844789Sahrens */ 1845789Sahrens 1846789Sahrens ASSERT(buf->b_arc_access == 0); 1847789Sahrens buf->b_arc_access = lbolt; 18481544Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 18493403Sbmc arc_change_state(arc_mru, buf, hash_lock); 1850789Sahrens 18513403Sbmc } else if (buf->b_state == arc_mru) { 1852789Sahrens /* 18532391Smaybee * If this buffer is here because of a prefetch, then either: 18542391Smaybee * - clear the flag if this is a "referencing" read 18552391Smaybee * (any subsequent access will bump this into the MFU state). 18562391Smaybee * or 18572391Smaybee * - move the buffer to the head of the list if this is 18582391Smaybee * another prefetch (to make it less likely to be evicted). 1859789Sahrens */ 1860789Sahrens if ((buf->b_flags & ARC_PREFETCH) != 0) { 18612391Smaybee if (refcount_count(&buf->b_refcnt) == 0) { 18622391Smaybee ASSERT(list_link_active(&buf->b_arc_node)); 18632391Smaybee } else { 18642391Smaybee buf->b_flags &= ~ARC_PREFETCH; 18653403Sbmc ARCSTAT_BUMP(arcstat_mru_hits); 18662391Smaybee } 18672391Smaybee buf->b_arc_access = lbolt; 1868789Sahrens return; 1869789Sahrens } 1870789Sahrens 1871789Sahrens /* 1872789Sahrens * This buffer has been "accessed" only once so far, 1873789Sahrens * but it is still in the cache. Move it to the MFU 1874789Sahrens * state. 1875789Sahrens */ 1876789Sahrens if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1877789Sahrens /* 1878789Sahrens * More than 125ms have passed since we 1879789Sahrens * instantiated this buffer. Move it to the 1880789Sahrens * most frequently used state. 1881789Sahrens */ 1882789Sahrens buf->b_arc_access = lbolt; 18831544Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 18843403Sbmc arc_change_state(arc_mfu, buf, hash_lock); 1885789Sahrens } 18863403Sbmc ARCSTAT_BUMP(arcstat_mru_hits); 18873403Sbmc } else if (buf->b_state == arc_mru_ghost) { 1888789Sahrens arc_state_t *new_state; 1889789Sahrens /* 1890789Sahrens * This buffer has been "accessed" recently, but 1891789Sahrens * was evicted from the cache. Move it to the 1892789Sahrens * MFU state. 
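 *
 * Rationale: a ghost-list hit implies at least two accesses in
 * total -- one before the block was evicted, plus this one -- so the
 * block is treated as frequently used unless it arrived here only
 * via prefetch.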
1893789Sahrens */ 1894789Sahrens 1895789Sahrens if (buf->b_flags & ARC_PREFETCH) { 18963403Sbmc new_state = arc_mru; 18972391Smaybee if (refcount_count(&buf->b_refcnt) > 0) 18982391Smaybee buf->b_flags &= ~ARC_PREFETCH; 18991544Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1900789Sahrens } else { 19013403Sbmc new_state = arc_mfu; 19021544Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1903789Sahrens } 1904789Sahrens 1905789Sahrens buf->b_arc_access = lbolt; 1906789Sahrens arc_change_state(new_state, buf, hash_lock); 1907789Sahrens 19083403Sbmc ARCSTAT_BUMP(arcstat_mru_ghost_hits); 19093403Sbmc } else if (buf->b_state == arc_mfu) { 1910789Sahrens /* 1911789Sahrens * This buffer has been accessed more than once and is 1912789Sahrens * still in the cache. Keep it in the MFU state. 1913789Sahrens * 19142391Smaybee * NOTE: an add_reference() that occurred when we did 19152391Smaybee * the arc_read() will have kicked this off the list. 19162391Smaybee * If it was a prefetch, we will explicitly move it to 19172391Smaybee * the head of the list now. 1918789Sahrens */ 19192391Smaybee if ((buf->b_flags & ARC_PREFETCH) != 0) { 19202391Smaybee ASSERT(refcount_count(&buf->b_refcnt) == 0); 19212391Smaybee ASSERT(list_link_active(&buf->b_arc_node)); 19222391Smaybee } 19233403Sbmc ARCSTAT_BUMP(arcstat_mfu_hits); 19242391Smaybee buf->b_arc_access = lbolt; 19253403Sbmc } else if (buf->b_state == arc_mfu_ghost) { 19263403Sbmc arc_state_t *new_state = arc_mfu; 1927789Sahrens /* 1928789Sahrens * This buffer has been accessed more than once but has 1929789Sahrens * been evicted from the cache. Move it back to the 1930789Sahrens * MFU state. 1931789Sahrens */ 1932789Sahrens 19332391Smaybee if (buf->b_flags & ARC_PREFETCH) { 19342391Smaybee /* 19352391Smaybee * This is a prefetch access... 19362391Smaybee * move this block back to the MRU state. 
19372391Smaybee */ 19382391Smaybee ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 19393403Sbmc new_state = arc_mru; 19402391Smaybee } 19412391Smaybee 1942789Sahrens buf->b_arc_access = lbolt; 19431544Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 19442391Smaybee arc_change_state(new_state, buf, hash_lock); 1945789Sahrens 19463403Sbmc ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1947789Sahrens } else { 1948789Sahrens ASSERT(!"invalid arc state"); 1949789Sahrens } 1950789Sahrens } 1951789Sahrens 1952789Sahrens /* a generic arc_done_func_t which you can use */ 1953789Sahrens /* ARGSUSED */ 1954789Sahrens void 1955789Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1956789Sahrens { 1957789Sahrens bcopy(buf->b_data, arg, buf->b_hdr->b_size); 19581544Seschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1959789Sahrens } 1960789Sahrens 19614309Smaybee /* a generic arc_done_func_t */ 1962789Sahrens void 1963789Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1964789Sahrens { 1965789Sahrens arc_buf_t **bufp = arg; 1966789Sahrens if (zio && zio->io_error) { 19671544Seschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1968789Sahrens *bufp = NULL; 1969789Sahrens } else { 1970789Sahrens *bufp = buf; 1971789Sahrens } 1972789Sahrens } 1973789Sahrens 1974789Sahrens static void 1975789Sahrens arc_read_done(zio_t *zio) 1976789Sahrens { 19771589Smaybee arc_buf_hdr_t *hdr, *found; 1978789Sahrens arc_buf_t *buf; 1979789Sahrens arc_buf_t *abuf; /* buffer we're assigning to callback */ 1980789Sahrens kmutex_t *hash_lock; 1981789Sahrens arc_callback_t *callback_list, *acb; 1982789Sahrens int freeable = FALSE; 1983789Sahrens 1984789Sahrens buf = zio->io_private; 1985789Sahrens hdr = buf->b_hdr; 1986789Sahrens 19871589Smaybee /* 19881589Smaybee * The hdr was inserted into hash-table and removed from lists 19891589Smaybee * prior to starting I/O. We should find this header, since 19901589Smaybee * it's in the hash table, and it should be legit since it's 19911589Smaybee * not possible to evict it during the I/O. The only possible 19921589Smaybee * reason for it not to be found is if we were freed during the 19931589Smaybee * read. 
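 *
 * The ASSERT below encodes exactly these two outcomes: either the
 * header was freed mid-read (no hash entry, no lock returned), or
 * the lookup returned this same header with a matching DVA.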
19941589Smaybee */ 19951589Smaybee found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 19963093Sahrens &hash_lock); 1997789Sahrens 19981589Smaybee ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 19991589Smaybee (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 2000789Sahrens 2001789Sahrens /* byteswap if necessary */ 2002789Sahrens callback_list = hdr->b_acb; 2003789Sahrens ASSERT(callback_list != NULL); 2004789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 2005789Sahrens callback_list->acb_byteswap(buf->b_data, hdr->b_size); 2006789Sahrens 20073093Sahrens arc_cksum_compute(buf); 20083093Sahrens 2009789Sahrens /* create copies of the data buffer for the callers */ 2010789Sahrens abuf = buf; 2011789Sahrens for (acb = callback_list; acb; acb = acb->acb_next) { 2012789Sahrens if (acb->acb_done) { 20132688Smaybee if (abuf == NULL) 20142688Smaybee abuf = arc_buf_clone(buf); 2015789Sahrens acb->acb_buf = abuf; 2016789Sahrens abuf = NULL; 2017789Sahrens } 2018789Sahrens } 2019789Sahrens hdr->b_acb = NULL; 2020789Sahrens hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 20211544Seschrock ASSERT(!HDR_BUF_AVAILABLE(hdr)); 20221544Seschrock if (abuf == buf) 20231544Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 2024789Sahrens 2025789Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2026789Sahrens 2027789Sahrens if (zio->io_error != 0) { 2028789Sahrens hdr->b_flags |= ARC_IO_ERROR; 20293403Sbmc if (hdr->b_state != arc_anon) 20303403Sbmc arc_change_state(arc_anon, hdr, hash_lock); 20311544Seschrock if (HDR_IN_HASH_TABLE(hdr)) 20321544Seschrock buf_hash_remove(hdr); 2033789Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 20342391Smaybee /* convert checksum errors into IO errors */ 20351544Seschrock if (zio->io_error == ECKSUM) 20361544Seschrock zio->io_error = EIO; 2037789Sahrens } 2038789Sahrens 20391544Seschrock /* 20402391Smaybee * Broadcast before we drop the hash_lock to avoid the possibility 20412391Smaybee * that the hdr (and hence the cv) might be freed before we get to 20422391Smaybee * the cv_broadcast(). 20431544Seschrock */ 20441544Seschrock cv_broadcast(&hdr->b_cv); 20451544Seschrock 20461589Smaybee if (hash_lock) { 2047789Sahrens /* 2048789Sahrens * Only call arc_access on anonymous buffers. This is because 2049789Sahrens * if we've issued an I/O for an evicted buffer, we've already 2050789Sahrens * called arc_access (to prevent any simultaneous readers from 2051789Sahrens * getting confused). 2052789Sahrens */ 20533403Sbmc if (zio->io_error == 0 && hdr->b_state == arc_anon) 20542688Smaybee arc_access(hdr, hash_lock); 20552688Smaybee mutex_exit(hash_lock); 2056789Sahrens } else { 2057789Sahrens /* 2058789Sahrens * This block was freed while we waited for the read to 2059789Sahrens * complete. It has been removed from the hash table and 2060789Sahrens * moved to the anonymous state (so that it won't show up 2061789Sahrens * in the cache). 
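 *
 * "freeable" records whether the last reference is already gone;
 * the actual arc_hdr_destroy() is deferred until after the pending
 * callbacks below have been dispatched.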
2062789Sahrens */
20633403Sbmc ASSERT3P(hdr->b_state, ==, arc_anon);
2064789Sahrens freeable = refcount_is_zero(&hdr->b_refcnt);
2065789Sahrens }
2066789Sahrens 
2067789Sahrens /* execute each callback and free its structure */
2068789Sahrens while ((acb = callback_list) != NULL) {
2069789Sahrens if (acb->acb_done)
2070789Sahrens acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2071789Sahrens 
2072789Sahrens if (acb->acb_zio_dummy != NULL) {
2073789Sahrens acb->acb_zio_dummy->io_error = zio->io_error;
2074789Sahrens zio_nowait(acb->acb_zio_dummy);
2075789Sahrens }
2076789Sahrens 
2077789Sahrens callback_list = acb->acb_next;
2078789Sahrens kmem_free(acb, sizeof (arc_callback_t));
2079789Sahrens }
2080789Sahrens 
2081789Sahrens if (freeable)
20821544Seschrock arc_hdr_destroy(hdr);
2083789Sahrens }
2084789Sahrens 
2085789Sahrens /*
2086789Sahrens * "Read" the block at the specified DVA (in bp) via the
2087789Sahrens * cache. If the block is found in the cache, invoke the provided
2088789Sahrens * callback immediately and return. Note that the `zio' parameter
2089789Sahrens * in the callback will be NULL in this case, since no IO was
2090789Sahrens * required. If the block is not in the cache, pass the read request
2091789Sahrens * on to the spa with a substitute callback function, so that the
2092789Sahrens * requested block will be added to the cache.
2093789Sahrens *
2094789Sahrens * If a read request arrives for a block that has a read in-progress,
2095789Sahrens * either wait for the in-progress read to complete (and return the
2096789Sahrens * results); or, if this is a read with a "done" func, add a record
2097789Sahrens * to the read to invoke the "done" func when the read completes,
2098789Sahrens * and return; or just return.
2099789Sahrens *
2100789Sahrens * arc_read_done() will invoke all the requested "done" functions
2101789Sahrens * for readers of this block.
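 *
 * A minimal usage sketch (illustrative only; my_done, my_private and
 * byteswap are hypothetical caller-supplied names):
 *
 *	uint32_t aflags = ARC_WAIT;
 *	(void) arc_read(NULL, spa, bp, byteswap, my_done, my_private,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * With ARC_WAIT the call returns only once the block is available;
 * with ARC_NOWAIT it returns immediately and my_done fires later.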
2102789Sahrens */ 2103789Sahrens int 2104789Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 2105789Sahrens arc_done_func_t *done, void *private, int priority, int flags, 21062391Smaybee uint32_t *arc_flags, zbookmark_t *zb) 2107789Sahrens { 2108789Sahrens arc_buf_hdr_t *hdr; 2109789Sahrens arc_buf_t *buf; 2110789Sahrens kmutex_t *hash_lock; 2111789Sahrens zio_t *rzio; 2112789Sahrens 2113789Sahrens top: 2114789Sahrens hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 21151544Seschrock if (hdr && hdr->b_datacnt > 0) { 2116789Sahrens 21172391Smaybee *arc_flags |= ARC_CACHED; 21182391Smaybee 2119789Sahrens if (HDR_IO_IN_PROGRESS(hdr)) { 21202391Smaybee 21212391Smaybee if (*arc_flags & ARC_WAIT) { 21222391Smaybee cv_wait(&hdr->b_cv, hash_lock); 21232391Smaybee mutex_exit(hash_lock); 21242391Smaybee goto top; 21252391Smaybee } 21262391Smaybee ASSERT(*arc_flags & ARC_NOWAIT); 21272391Smaybee 21282391Smaybee if (done) { 2129789Sahrens arc_callback_t *acb = NULL; 2130789Sahrens 2131789Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), 2132789Sahrens KM_SLEEP); 2133789Sahrens acb->acb_done = done; 2134789Sahrens acb->acb_private = private; 2135789Sahrens acb->acb_byteswap = swap; 2136789Sahrens if (pio != NULL) 2137789Sahrens acb->acb_zio_dummy = zio_null(pio, 2138789Sahrens spa, NULL, NULL, flags); 2139789Sahrens 2140789Sahrens ASSERT(acb->acb_done != NULL); 2141789Sahrens acb->acb_next = hdr->b_acb; 2142789Sahrens hdr->b_acb = acb; 2143789Sahrens add_reference(hdr, hash_lock, private); 2144789Sahrens mutex_exit(hash_lock); 2145789Sahrens return (0); 2146789Sahrens } 2147789Sahrens mutex_exit(hash_lock); 2148789Sahrens return (0); 2149789Sahrens } 2150789Sahrens 21513403Sbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2152789Sahrens 21531544Seschrock if (done) { 21542688Smaybee add_reference(hdr, hash_lock, private); 21551544Seschrock /* 21561544Seschrock * If this block is already in use, create a new 21571544Seschrock * copy of the data so that we will be guaranteed 21581544Seschrock * that arc_release() will always succeed. 
21591544Seschrock */ 21601544Seschrock buf = hdr->b_buf; 21611544Seschrock ASSERT(buf); 21621544Seschrock ASSERT(buf->b_data); 21632688Smaybee if (HDR_BUF_AVAILABLE(hdr)) { 21641544Seschrock ASSERT(buf->b_efunc == NULL); 21651544Seschrock hdr->b_flags &= ~ARC_BUF_AVAILABLE; 21662688Smaybee } else { 21672688Smaybee buf = arc_buf_clone(buf); 21681544Seschrock } 21692391Smaybee } else if (*arc_flags & ARC_PREFETCH && 21702391Smaybee refcount_count(&hdr->b_refcnt) == 0) { 21712391Smaybee hdr->b_flags |= ARC_PREFETCH; 2172789Sahrens } 2173789Sahrens DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 21742688Smaybee arc_access(hdr, hash_lock); 21752688Smaybee mutex_exit(hash_lock); 21763403Sbmc ARCSTAT_BUMP(arcstat_hits); 21773403Sbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 21783403Sbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 21793403Sbmc data, metadata, hits); 21803403Sbmc 2181789Sahrens if (done) 2182789Sahrens done(NULL, buf, private); 2183789Sahrens } else { 2184789Sahrens uint64_t size = BP_GET_LSIZE(bp); 2185789Sahrens arc_callback_t *acb; 2186789Sahrens 2187789Sahrens if (hdr == NULL) { 2188789Sahrens /* this block is not in the cache */ 2189789Sahrens arc_buf_hdr_t *exists; 21903290Sjohansen arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 21913290Sjohansen buf = arc_buf_alloc(spa, size, private, type); 2192789Sahrens hdr = buf->b_hdr; 2193789Sahrens hdr->b_dva = *BP_IDENTITY(bp); 2194789Sahrens hdr->b_birth = bp->blk_birth; 2195789Sahrens hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2196789Sahrens exists = buf_hash_insert(hdr, &hash_lock); 2197789Sahrens if (exists) { 2198789Sahrens /* somebody beat us to the hash insert */ 2199789Sahrens mutex_exit(hash_lock); 2200789Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 2201789Sahrens hdr->b_birth = 0; 2202789Sahrens hdr->b_cksum0 = 0; 22031544Seschrock (void) arc_buf_remove_ref(buf, private); 2204789Sahrens goto top; /* restart the IO request */ 2205789Sahrens } 22062391Smaybee /* if this is a prefetch, we don't have a reference */ 22072391Smaybee if (*arc_flags & ARC_PREFETCH) { 22082391Smaybee (void) remove_reference(hdr, hash_lock, 22092391Smaybee private); 22102391Smaybee hdr->b_flags |= ARC_PREFETCH; 22112391Smaybee } 22122391Smaybee if (BP_GET_LEVEL(bp) > 0) 22132391Smaybee hdr->b_flags |= ARC_INDIRECT; 2214789Sahrens } else { 2215789Sahrens /* this block is in the ghost cache */ 22161544Seschrock ASSERT(GHOST_STATE(hdr->b_state)); 22171544Seschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 22182391Smaybee ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 22192391Smaybee ASSERT(hdr->b_buf == NULL); 2220789Sahrens 22212391Smaybee /* if this is a prefetch, we don't have a reference */ 22222391Smaybee if (*arc_flags & ARC_PREFETCH) 22232391Smaybee hdr->b_flags |= ARC_PREFETCH; 22242391Smaybee else 22252391Smaybee add_reference(hdr, hash_lock, private); 2226789Sahrens buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 22271544Seschrock buf->b_hdr = hdr; 22282688Smaybee buf->b_data = NULL; 22291544Seschrock buf->b_efunc = NULL; 22301544Seschrock buf->b_private = NULL; 22311544Seschrock buf->b_next = NULL; 22321544Seschrock hdr->b_buf = buf; 22332688Smaybee arc_get_data_buf(buf); 22341544Seschrock ASSERT(hdr->b_datacnt == 0); 22351544Seschrock hdr->b_datacnt = 1; 22362391Smaybee 2237789Sahrens } 2238789Sahrens 2239789Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2240789Sahrens acb->acb_done = done; 2241789Sahrens acb->acb_private = private; 2242789Sahrens acb->acb_byteswap = swap; 2243789Sahrens 2244789Sahrens ASSERT(hdr->b_acb == 
NULL); 2245789Sahrens hdr->b_acb = acb; 2246789Sahrens hdr->b_flags |= ARC_IO_IN_PROGRESS; 2247789Sahrens 2248789Sahrens /* 2249789Sahrens * If the buffer has been evicted, migrate it to a present state 2250789Sahrens * before issuing the I/O. Once we drop the hash-table lock, 2251789Sahrens * the header will be marked as I/O in progress and have an 2252789Sahrens * attached buffer. At this point, anybody who finds this 2253789Sahrens * buffer ought to notice that it's legit but has a pending I/O. 2254789Sahrens */ 2255789Sahrens 22561544Seschrock if (GHOST_STATE(hdr->b_state)) 22572688Smaybee arc_access(hdr, hash_lock); 22582688Smaybee mutex_exit(hash_lock); 2259789Sahrens 2260789Sahrens ASSERT3U(hdr->b_size, ==, size); 22611596Sahrens DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 22621596Sahrens zbookmark_t *, zb); 22633403Sbmc ARCSTAT_BUMP(arcstat_misses); 22643403Sbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 22653403Sbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 22663403Sbmc data, metadata, misses); 22671544Seschrock 2268789Sahrens rzio = zio_read(pio, spa, bp, buf->b_data, size, 22691544Seschrock arc_read_done, buf, priority, flags, zb); 2270789Sahrens 22712391Smaybee if (*arc_flags & ARC_WAIT) 2272789Sahrens return (zio_wait(rzio)); 2273789Sahrens 22742391Smaybee ASSERT(*arc_flags & ARC_NOWAIT); 2275789Sahrens zio_nowait(rzio); 2276789Sahrens } 2277789Sahrens return (0); 2278789Sahrens } 2279789Sahrens 2280789Sahrens /* 2281789Sahrens * arc_read() variant to support pool traversal. If the block is already 2282789Sahrens * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 2283789Sahrens * The idea is that we don't want pool traversal filling up memory, but 2284789Sahrens * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2285789Sahrens */ 2286789Sahrens int 2287789Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2288789Sahrens { 2289789Sahrens arc_buf_hdr_t *hdr; 2290789Sahrens kmutex_t *hash_mtx; 2291789Sahrens int rc = 0; 2292789Sahrens 2293789Sahrens hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2294789Sahrens 22951544Seschrock if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 22961544Seschrock arc_buf_t *buf = hdr->b_buf; 22971544Seschrock 22981544Seschrock ASSERT(buf); 22991544Seschrock while (buf->b_data == NULL) { 23001544Seschrock buf = buf->b_next; 23011544Seschrock ASSERT(buf); 23021544Seschrock } 23031544Seschrock bcopy(buf->b_data, data, hdr->b_size); 23041544Seschrock } else { 2305789Sahrens rc = ENOENT; 23061544Seschrock } 2307789Sahrens 2308789Sahrens if (hash_mtx) 2309789Sahrens mutex_exit(hash_mtx); 2310789Sahrens 2311789Sahrens return (rc); 2312789Sahrens } 2313789Sahrens 23141544Seschrock void 23151544Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 23161544Seschrock { 23171544Seschrock ASSERT(buf->b_hdr != NULL); 23183403Sbmc ASSERT(buf->b_hdr->b_state != arc_anon); 23191544Seschrock ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 23201544Seschrock buf->b_efunc = func; 23211544Seschrock buf->b_private = private; 23221544Seschrock } 23231544Seschrock 23241544Seschrock /* 23251544Seschrock * This is used by the DMU to let the ARC know that a buffer is 23261544Seschrock * being evicted, so the ARC should clean up. If this arc buf 23271544Seschrock * is not yet in the evicted state, it will be put there. 
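 *
 * Return value, as implemented below: 1 when this call invokes (or
 * hands off) the buffer's b_efunc and frees the arc_buf_t, 0 when
 * the buffer was already handled by arc_do_user_evicts().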
23281544Seschrock */
23291544Seschrock int
23301544Seschrock arc_buf_evict(arc_buf_t *buf)
23311544Seschrock {
23322887Smaybee arc_buf_hdr_t *hdr;
23331544Seschrock kmutex_t *hash_lock;
23341544Seschrock arc_buf_t **bufp;
23351544Seschrock 
23362887Smaybee mutex_enter(&arc_eviction_mtx);
23372887Smaybee hdr = buf->b_hdr;
23381544Seschrock if (hdr == NULL) {
23391544Seschrock /*
23401544Seschrock * We are in arc_do_user_evicts().
23411544Seschrock */
23421544Seschrock ASSERT(buf->b_data == NULL);
23432887Smaybee mutex_exit(&arc_eviction_mtx);
23441544Seschrock return (0);
23451544Seschrock }
23462887Smaybee hash_lock = HDR_LOCK(hdr);
23472887Smaybee mutex_exit(&arc_eviction_mtx);
23481544Seschrock 
23491544Seschrock mutex_enter(hash_lock);
23501544Seschrock 
23512724Smaybee if (buf->b_data == NULL) {
23522724Smaybee /*
23532724Smaybee * We are on the eviction list.
23542724Smaybee */
23552724Smaybee mutex_exit(hash_lock);
23562724Smaybee mutex_enter(&arc_eviction_mtx);
23572724Smaybee if (buf->b_hdr == NULL) {
23582724Smaybee /*
23592724Smaybee * We are already in arc_do_user_evicts().
23602724Smaybee */
23612724Smaybee mutex_exit(&arc_eviction_mtx);
23622724Smaybee return (0);
23632724Smaybee } else {
23642724Smaybee arc_buf_t copy = *buf; /* structure assignment */
23652724Smaybee /*
23662724Smaybee * Process this buffer now
23672724Smaybee * but let arc_do_user_evicts() do the reaping.
23682724Smaybee */
23692724Smaybee buf->b_efunc = NULL;
23702724Smaybee mutex_exit(&arc_eviction_mtx);
23712724Smaybee VERIFY(copy.b_efunc(&copy) == 0);
23722724Smaybee return (1);
23732724Smaybee }
23742724Smaybee }
23752724Smaybee 
23762724Smaybee ASSERT(buf->b_hdr == hdr);
23772724Smaybee ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
23783403Sbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
23791544Seschrock 
23801544Seschrock /*
23811544Seschrock * Pull this buffer off of the hdr
23821544Seschrock */
23831544Seschrock bufp = &hdr->b_buf;
23841544Seschrock while (*bufp != buf)
23851544Seschrock bufp = &(*bufp)->b_next;
23861544Seschrock *bufp = buf->b_next;
23871544Seschrock 
23881544Seschrock ASSERT(buf->b_data != NULL);
23892688Smaybee arc_buf_destroy(buf, FALSE, FALSE);
23901544Seschrock 
23911544Seschrock if (hdr->b_datacnt == 0) {
23921544Seschrock arc_state_t *old_state = hdr->b_state;
23931544Seschrock arc_state_t *evicted_state;
23941544Seschrock 
23951544Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt));
23961544Seschrock 
23971544Seschrock evicted_state =
23983403Sbmc (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
23991544Seschrock 
24003403Sbmc mutex_enter(&old_state->arcs_mtx);
24013403Sbmc mutex_enter(&evicted_state->arcs_mtx);
24021544Seschrock 
24031544Seschrock arc_change_state(evicted_state, hdr, hash_lock);
24041544Seschrock ASSERT(HDR_IN_HASH_TABLE(hdr));
24051544Seschrock hdr->b_flags = ARC_IN_HASH_TABLE;
24061544Seschrock 
24073403Sbmc mutex_exit(&evicted_state->arcs_mtx);
24083403Sbmc mutex_exit(&old_state->arcs_mtx);
24091544Seschrock }
24101544Seschrock mutex_exit(hash_lock);
24111819Smaybee 
24121544Seschrock VERIFY(buf->b_efunc(buf) == 0);
24131544Seschrock buf->b_efunc = NULL;
24141544Seschrock buf->b_private = NULL;
24151544Seschrock buf->b_hdr = NULL;
24161544Seschrock kmem_cache_free(buf_cache, buf);
24171544Seschrock return (1);
24181544Seschrock }
24191544Seschrock 
2420789Sahrens /*
2421789Sahrens * Release this buffer from the cache. This must be done
2422789Sahrens * after a read and prior to modifying the buffer contents.
2423789Sahrens * If the buffer has more than one reference, we must
2424789Sahrens * make a new hdr for the buffer.
2425789Sahrens */
2426789Sahrens void
2427789Sahrens arc_release(arc_buf_t *buf, void *tag)
2428789Sahrens {
2429789Sahrens arc_buf_hdr_t *hdr = buf->b_hdr;
2430789Sahrens kmutex_t *hash_lock = HDR_LOCK(hdr);
2431789Sahrens 
2432789Sahrens /* this buffer is not on any list */
2433789Sahrens ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2434789Sahrens 
24353403Sbmc if (hdr->b_state == arc_anon) {
2436789Sahrens /* this buffer is already released */
2437789Sahrens ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2438789Sahrens ASSERT(BUF_EMPTY(hdr));
24391544Seschrock ASSERT(buf->b_efunc == NULL);
24403093Sahrens arc_buf_thaw(buf);
2441789Sahrens return;
2442789Sahrens }
2443789Sahrens 
2444789Sahrens mutex_enter(hash_lock);
2445789Sahrens 
24461544Seschrock /*
24471544Seschrock * Do we have more than one buf?
24481544Seschrock */
24491544Seschrock if (hdr->b_buf != buf || buf->b_next != NULL) {
2450789Sahrens arc_buf_hdr_t *nhdr;
2451789Sahrens arc_buf_t **bufp;
2452789Sahrens uint64_t blksz = hdr->b_size;
2453789Sahrens spa_t *spa = hdr->b_spa;
24543290Sjohansen arc_buf_contents_t type = hdr->b_type;
2455789Sahrens 
24561544Seschrock ASSERT(hdr->b_datacnt > 1);
2457789Sahrens /*
2458789Sahrens * Pull the data off of this buf and attach it to
2459789Sahrens * a new anonymous buf.
2460789Sahrens */
24611544Seschrock (void) remove_reference(hdr, hash_lock, tag);
2462789Sahrens bufp = &hdr->b_buf;
24631544Seschrock while (*bufp != buf)
2464789Sahrens bufp = &(*bufp)->b_next;
2465789Sahrens *bufp = (*bufp)->b_next;
24663897Smaybee buf->b_next = NULL;
24671544Seschrock 
24683403Sbmc ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
24693403Sbmc atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
24701544Seschrock if (refcount_is_zero(&hdr->b_refcnt)) {
24714309Smaybee uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
24724309Smaybee ASSERT3U(*size, >=, hdr->b_size);
24734309Smaybee atomic_add_64(size, -hdr->b_size);
24741544Seschrock }
24751544Seschrock hdr->b_datacnt -= 1;
24763547Smaybee arc_cksum_verify(buf);
24771544Seschrock 
2478789Sahrens mutex_exit(hash_lock);
2479789Sahrens 
2480789Sahrens nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
2481789Sahrens nhdr->b_size = blksz;
2482789Sahrens nhdr->b_spa = spa;
24833290Sjohansen nhdr->b_type = type;
2484789Sahrens nhdr->b_buf = buf;
24853403Sbmc nhdr->b_state = arc_anon;
2486789Sahrens nhdr->b_arc_access = 0;
2487789Sahrens nhdr->b_flags = 0;
24881544Seschrock nhdr->b_datacnt = 1;
24893547Smaybee nhdr->b_freeze_cksum = NULL;
24903897Smaybee (void) refcount_add(&nhdr->b_refcnt, tag);
2491789Sahrens buf->b_hdr = nhdr;
24923403Sbmc atomic_add_64(&arc_anon->arcs_size, blksz);
2493789Sahrens 
2494789Sahrens hdr = nhdr;
2495789Sahrens } else {
24961544Seschrock ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2497789Sahrens ASSERT(!list_link_active(&hdr->b_arc_node));
2498789Sahrens ASSERT(!HDR_IO_IN_PROGRESS(hdr));
24993403Sbmc arc_change_state(arc_anon, hdr, hash_lock);
2500789Sahrens hdr->b_arc_access = 0;
2501789Sahrens mutex_exit(hash_lock);
2502789Sahrens bzero(&hdr->b_dva, sizeof (dva_t));
2503789Sahrens hdr->b_birth = 0;
2504789Sahrens hdr->b_cksum0 = 0;
25053547Smaybee arc_buf_thaw(buf);
2506789Sahrens }
25071544Seschrock buf->b_efunc = NULL;
25081544Seschrock buf->b_private = NULL;
2509789Sahrens }
2510789Sahrens 
2511789Sahrens int
2512789Sahrens arc_released(arc_buf_t *buf)
2513789Sahrens {
25143403Sbmc return (buf->b_data 
!= NULL && buf->b_hdr->b_state == arc_anon);
25151544Seschrock }
25161544Seschrock 
25171544Seschrock int
25181544Seschrock arc_has_callback(arc_buf_t *buf)
25191544Seschrock {
25201544Seschrock return (buf->b_efunc != NULL);
2521789Sahrens }
2522789Sahrens 
25231544Seschrock #ifdef ZFS_DEBUG
25241544Seschrock int
25251544Seschrock arc_referenced(arc_buf_t *buf)
25261544Seschrock {
25271544Seschrock return (refcount_count(&buf->b_hdr->b_refcnt));
25281544Seschrock }
25291544Seschrock #endif
25301544Seschrock 
2531789Sahrens static void
25323547Smaybee arc_write_ready(zio_t *zio)
25333547Smaybee {
25343547Smaybee arc_write_callback_t *callback = zio->io_private;
25353547Smaybee arc_buf_t *buf = callback->awcb_buf;
25363547Smaybee 
25373547Smaybee if (callback->awcb_ready) {
25383547Smaybee ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
25393547Smaybee callback->awcb_ready(zio, buf, callback->awcb_private);
25403547Smaybee }
25413547Smaybee arc_cksum_compute(buf);
25423547Smaybee }
25433547Smaybee 
25443547Smaybee static void
2545789Sahrens arc_write_done(zio_t *zio)
2546789Sahrens {
25473547Smaybee arc_write_callback_t *callback = zio->io_private;
25483547Smaybee arc_buf_t *buf = callback->awcb_buf;
25493547Smaybee arc_buf_hdr_t *hdr = buf->b_hdr;
2550789Sahrens 
2551789Sahrens hdr->b_acb = NULL;
2552789Sahrens 
2553789Sahrens /* this buffer is on no lists and is not in the hash table */
25543403Sbmc ASSERT3P(hdr->b_state, ==, arc_anon);
2555789Sahrens 
2556789Sahrens hdr->b_dva = *BP_IDENTITY(zio->io_bp);
2557789Sahrens hdr->b_birth = zio->io_bp->blk_birth;
2558789Sahrens hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
25591544Seschrock /*
25601544Seschrock * If the block to be written was all-zero, we may have
25611544Seschrock * compressed it away. In this case no write was performed
25621544Seschrock * so there will be no dva/birth-date/checksum. The buffer
25631544Seschrock * must therefore remain anonymous (and uncached).
25641544Seschrock */
2565789Sahrens if (!BUF_EMPTY(hdr)) {
2566789Sahrens arc_buf_hdr_t *exists;
2567789Sahrens kmutex_t *hash_lock;
2568789Sahrens 
25693093Sahrens arc_cksum_verify(buf);
25703093Sahrens 
2571789Sahrens exists = buf_hash_insert(hdr, &hash_lock);
2572789Sahrens if (exists) {
2573789Sahrens /*
2574789Sahrens * This can only happen if we overwrite for
2575789Sahrens * sync-to-convergence, because we remove
2576789Sahrens * buffers from the hash table when we arc_free().
2577789Sahrens */
2578789Sahrens ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
2579789Sahrens BP_IDENTITY(zio->io_bp)));
2580789Sahrens ASSERT3U(zio->io_bp_orig.blk_birth, ==,
2581789Sahrens zio->io_bp->blk_birth);
2582789Sahrens 
2583789Sahrens ASSERT(refcount_is_zero(&exists->b_refcnt));
25843403Sbmc arc_change_state(arc_anon, exists, hash_lock);
2585789Sahrens mutex_exit(hash_lock);
25861544Seschrock arc_hdr_destroy(exists);
2587789Sahrens exists = buf_hash_insert(hdr, &hash_lock);
2588789Sahrens ASSERT3P(exists, ==, NULL);
2589789Sahrens }
25901544Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
25912688Smaybee arc_access(hdr, hash_lock);
25922688Smaybee mutex_exit(hash_lock);
25933547Smaybee } else if (callback->awcb_done == NULL) {
25941544Seschrock int destroy_hdr;
25951544Seschrock /*
25961544Seschrock * This is an anonymous buffer with no user callback;
25971544Seschrock * destroy it if there are no active references.
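 *
 * (Taking arc_eviction_mtx below appears intended to make the
 * refcount check and the ARC_IO_IN_PROGRESS clear atomic with
 * respect to concurrent eviction of this header.)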
25981544Seschrock */ 25991544Seschrock mutex_enter(&arc_eviction_mtx); 26001544Seschrock destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 26011544Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 26021544Seschrock mutex_exit(&arc_eviction_mtx); 26031544Seschrock if (destroy_hdr) 26041544Seschrock arc_hdr_destroy(hdr); 26051544Seschrock } else { 26061544Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2607789Sahrens } 26081544Seschrock 26093547Smaybee if (callback->awcb_done) { 2610789Sahrens ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 26113547Smaybee callback->awcb_done(zio, buf, callback->awcb_private); 2612789Sahrens } 2613789Sahrens 26143547Smaybee kmem_free(callback, sizeof (arc_write_callback_t)); 2615789Sahrens } 2616789Sahrens 26173547Smaybee zio_t * 26181775Sbillm arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2619789Sahrens uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 26203547Smaybee arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 26213547Smaybee int flags, zbookmark_t *zb) 2622789Sahrens { 2623789Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 26243547Smaybee arc_write_callback_t *callback; 26253547Smaybee zio_t *zio; 2626789Sahrens 2627789Sahrens /* this is a private buffer - no locking required */ 26283403Sbmc ASSERT3P(hdr->b_state, ==, arc_anon); 2629789Sahrens ASSERT(BUF_EMPTY(hdr)); 2630789Sahrens ASSERT(!HDR_IO_ERROR(hdr)); 26312237Smaybee ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 26322237Smaybee ASSERT(hdr->b_acb == 0); 26333547Smaybee callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 26343547Smaybee callback->awcb_ready = ready; 26353547Smaybee callback->awcb_done = done; 26363547Smaybee callback->awcb_private = private; 26373547Smaybee callback->awcb_buf = buf; 26381544Seschrock hdr->b_flags |= ARC_IO_IN_PROGRESS; 26393547Smaybee zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, 26403547Smaybee buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 26413547Smaybee priority, flags, zb); 2642789Sahrens 26433547Smaybee return (zio); 2644789Sahrens } 2645789Sahrens 2646789Sahrens int 2647789Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2648789Sahrens zio_done_func_t *done, void *private, uint32_t arc_flags) 2649789Sahrens { 2650789Sahrens arc_buf_hdr_t *ab; 2651789Sahrens kmutex_t *hash_lock; 2652789Sahrens zio_t *zio; 2653789Sahrens 2654789Sahrens /* 2655789Sahrens * If this buffer is in the cache, release it, so it 2656789Sahrens * can be re-used. 2657789Sahrens */ 2658789Sahrens ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2659789Sahrens if (ab != NULL) { 2660789Sahrens /* 2661789Sahrens * The checksum of blocks to free is not always 2662789Sahrens * preserved (eg. on the deadlist). However, if it is 2663789Sahrens * nonzero, it should match what we have in the cache. 2664789Sahrens */ 2665789Sahrens ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2666789Sahrens ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 26673403Sbmc if (ab->b_state != arc_anon) 26683403Sbmc arc_change_state(arc_anon, ab, hash_lock); 26692391Smaybee if (HDR_IO_IN_PROGRESS(ab)) { 26702391Smaybee /* 26712391Smaybee * This should only happen when we prefetch. 
26722391Smaybee */ 26732391Smaybee ASSERT(ab->b_flags & ARC_PREFETCH); 26742391Smaybee ASSERT3U(ab->b_datacnt, ==, 1); 26752391Smaybee ab->b_flags |= ARC_FREED_IN_READ; 26762391Smaybee if (HDR_IN_HASH_TABLE(ab)) 26772391Smaybee buf_hash_remove(ab); 26782391Smaybee ab->b_arc_access = 0; 26792391Smaybee bzero(&ab->b_dva, sizeof (dva_t)); 26802391Smaybee ab->b_birth = 0; 26812391Smaybee ab->b_cksum0 = 0; 26822391Smaybee ab->b_buf->b_efunc = NULL; 26832391Smaybee ab->b_buf->b_private = NULL; 26842391Smaybee mutex_exit(hash_lock); 26852391Smaybee } else if (refcount_is_zero(&ab->b_refcnt)) { 2686789Sahrens mutex_exit(hash_lock); 26871544Seschrock arc_hdr_destroy(ab); 26883403Sbmc ARCSTAT_BUMP(arcstat_deleted); 2689789Sahrens } else { 26901589Smaybee /* 26912391Smaybee * We still have an active reference on this 26922391Smaybee * buffer. This can happen, e.g., from 26932391Smaybee * dbuf_unoverride(). 26941589Smaybee */ 26952391Smaybee ASSERT(!HDR_IN_HASH_TABLE(ab)); 2696789Sahrens ab->b_arc_access = 0; 2697789Sahrens bzero(&ab->b_dva, sizeof (dva_t)); 2698789Sahrens ab->b_birth = 0; 2699789Sahrens ab->b_cksum0 = 0; 27001544Seschrock ab->b_buf->b_efunc = NULL; 27011544Seschrock ab->b_buf->b_private = NULL; 2702789Sahrens mutex_exit(hash_lock); 2703789Sahrens } 2704789Sahrens } 2705789Sahrens 2706789Sahrens zio = zio_free(pio, spa, txg, bp, done, private); 2707789Sahrens 2708789Sahrens if (arc_flags & ARC_WAIT) 2709789Sahrens return (zio_wait(zio)); 2710789Sahrens 2711789Sahrens ASSERT(arc_flags & ARC_NOWAIT); 2712789Sahrens zio_nowait(zio); 2713789Sahrens 2714789Sahrens return (0); 2715789Sahrens } 2716789Sahrens 2717789Sahrens void 2718789Sahrens arc_tempreserve_clear(uint64_t tempreserve) 2719789Sahrens { 2720789Sahrens atomic_add_64(&arc_tempreserve, -tempreserve); 2721789Sahrens ASSERT((int64_t)arc_tempreserve >= 0); 2722789Sahrens } 2723789Sahrens 2724789Sahrens int 2725789Sahrens arc_tempreserve_space(uint64_t tempreserve) 2726789Sahrens { 2727789Sahrens #ifdef ZFS_DEBUG 2728789Sahrens /* 2729789Sahrens * Once in a while, fail for no reason. Everything should cope. 2730789Sahrens */ 2731789Sahrens if (spa_get_random(10000) == 0) { 2732789Sahrens dprintf("forcing random failure\n"); 2733789Sahrens return (ERESTART); 2734789Sahrens } 2735789Sahrens #endif 27363403Sbmc if (tempreserve > arc_c/4 && !arc_no_grow) 27373403Sbmc arc_c = MIN(arc_c_max, tempreserve * 4); 27383403Sbmc if (tempreserve > arc_c) 2739982Smaybee return (ENOMEM); 2740982Smaybee 2741789Sahrens /* 2742982Smaybee * Throttle writes when the amount of dirty data in the cache 2743982Smaybee * gets too large. We try to keep the cache less than half full 2744982Smaybee * of dirty blocks so that our sync times don't grow too large. 2745982Smaybee * Note: if two requests come in concurrently, we might let them 2746982Smaybee * both succeed, when one of them should fail. Not a huge deal. 2747982Smaybee * 2748982Smaybee * XXX The limit should be adjusted dynamically to keep the time 2749982Smaybee * to sync a dataset fixed (around 1-5 seconds?). 
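 *
 * Worked example with a hypothetical arc_c of 1GB: the test below
 * fails the reservation with ERESTART once tempreserve +
 * arc_tempreserve + anonymous bytes exceed 512MB (arc_c / 2) while
 * arc_tempreserve + anonymous bytes alone exceed 256MB (arc_c / 4).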
2750789Sahrens */ 2751789Sahrens 27523403Sbmc if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 27533403Sbmc arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { 27544309Smaybee dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 27554309Smaybee "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 27564309Smaybee arc_tempreserve>>10, 27574309Smaybee arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 27584309Smaybee arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 27593403Sbmc tempreserve>>10, arc_c>>10); 2760789Sahrens return (ERESTART); 2761789Sahrens } 2762789Sahrens atomic_add_64(&arc_tempreserve, tempreserve); 2763789Sahrens return (0); 2764789Sahrens } 2765789Sahrens 2766789Sahrens void 2767789Sahrens arc_init(void) 2768789Sahrens { 2769789Sahrens mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2770789Sahrens cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2771789Sahrens 27722391Smaybee /* Convert seconds to clock ticks */ 27732638Sperrin arc_min_prefetch_lifespan = 1 * hz; 27742391Smaybee 2775789Sahrens /* Start out with 1/8 of all memory */ 27763403Sbmc arc_c = physmem * PAGESIZE / 8; 2777789Sahrens 2778789Sahrens #ifdef _KERNEL 2779789Sahrens /* 2780789Sahrens * On architectures where the physical memory can be larger 2781789Sahrens * than the addressable space (intel in 32-bit mode), we may 2782789Sahrens * need to limit the cache to 1/8 of VM size. 2783789Sahrens */ 27843403Sbmc arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2785789Sahrens #endif 2786789Sahrens 2787982Smaybee /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 27883403Sbmc arc_c_min = MAX(arc_c / 4, 64<<20); 2789982Smaybee /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 27903403Sbmc if (arc_c * 8 >= 1<<30) 27913403Sbmc arc_c_max = (arc_c * 8) - (1<<30); 2792789Sahrens else 27933403Sbmc arc_c_max = arc_c_min; 27943403Sbmc arc_c_max = MAX(arc_c * 6, arc_c_max); 27952885Sahrens 27962885Sahrens /* 27972885Sahrens * Allow the tunables to override our calculations if they are 27982885Sahrens * reasonable (ie. 
over 64MB) 27992885Sahrens */ 28002885Sahrens if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 28013403Sbmc arc_c_max = zfs_arc_max; 28023403Sbmc if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 28033403Sbmc arc_c_min = zfs_arc_min; 28042885Sahrens 28053403Sbmc arc_c = arc_c_max; 28063403Sbmc arc_p = (arc_c >> 1); 2807789Sahrens 28084309Smaybee /* limit meta-data to 1/4 of the arc capacity */ 28094309Smaybee arc_meta_limit = arc_c_max / 4; 28104645Sek110237 28114645Sek110237 /* Allow the tunable to override if it is reasonable */ 28124645Sek110237 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 28134645Sek110237 arc_meta_limit = zfs_arc_meta_limit; 28144645Sek110237 28154309Smaybee if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 28164309Smaybee arc_c_min = arc_meta_limit / 2; 28174309Smaybee 2818789Sahrens /* if kmem_flags are set, lets try to use less memory */ 2819789Sahrens if (kmem_debugging()) 28203403Sbmc arc_c = arc_c / 2; 28213403Sbmc if (arc_c < arc_c_min) 28223403Sbmc arc_c = arc_c_min; 2823789Sahrens 28243403Sbmc arc_anon = &ARC_anon; 28253403Sbmc arc_mru = &ARC_mru; 28263403Sbmc arc_mru_ghost = &ARC_mru_ghost; 28273403Sbmc arc_mfu = &ARC_mfu; 28283403Sbmc arc_mfu_ghost = &ARC_mfu_ghost; 28293403Sbmc arc_size = 0; 2830789Sahrens 28313403Sbmc mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 28323403Sbmc mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 28333403Sbmc mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 28343403Sbmc mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 28353403Sbmc mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 28362688Smaybee 28374309Smaybee list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 28384309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 28394309Smaybee list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 28404309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 28414309Smaybee list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 28424309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 28434309Smaybee list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 28444309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 28454309Smaybee list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 28464309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 28474309Smaybee list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 28484309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 28494309Smaybee list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 28504309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 28514309Smaybee list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 28524309Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2853789Sahrens 2854789Sahrens buf_init(); 2855789Sahrens 2856789Sahrens arc_thread_exit = 0; 28571544Seschrock arc_eviction_list = NULL; 28581544Seschrock mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 28592887Smaybee bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2860789Sahrens 28613403Sbmc arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 28623403Sbmc sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 28633403Sbmc 28643403Sbmc if (arc_ksp != NULL) { 28653403Sbmc arc_ksp->ks_data = &arc_stats; 28663403Sbmc kstat_install(arc_ksp); 28673403Sbmc } 28683403Sbmc 2869789Sahrens (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, 
&p0, 2870789Sahrens TS_RUN, minclsyspri); 28713158Smaybee 28723158Smaybee arc_dead = FALSE; 2873789Sahrens } 2874789Sahrens 2875789Sahrens void 2876789Sahrens arc_fini(void) 2877789Sahrens { 2878789Sahrens mutex_enter(&arc_reclaim_thr_lock); 2879789Sahrens arc_thread_exit = 1; 2880789Sahrens while (arc_thread_exit != 0) 2881789Sahrens cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2882789Sahrens mutex_exit(&arc_reclaim_thr_lock); 2883789Sahrens 2884789Sahrens arc_flush(); 2885789Sahrens 2886789Sahrens arc_dead = TRUE; 2887789Sahrens 28883403Sbmc if (arc_ksp != NULL) { 28893403Sbmc kstat_delete(arc_ksp); 28903403Sbmc arc_ksp = NULL; 28913403Sbmc } 28923403Sbmc 28931544Seschrock mutex_destroy(&arc_eviction_mtx); 2894789Sahrens mutex_destroy(&arc_reclaim_thr_lock); 2895789Sahrens cv_destroy(&arc_reclaim_thr_cv); 2896789Sahrens 28974309Smaybee list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 28984309Smaybee list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 28994309Smaybee list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 29004309Smaybee list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 29014309Smaybee list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 29024309Smaybee list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 29034309Smaybee list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 29044309Smaybee list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 2905789Sahrens 29063403Sbmc mutex_destroy(&arc_anon->arcs_mtx); 29073403Sbmc mutex_destroy(&arc_mru->arcs_mtx); 29083403Sbmc mutex_destroy(&arc_mru_ghost->arcs_mtx); 29093403Sbmc mutex_destroy(&arc_mfu->arcs_mtx); 29103403Sbmc mutex_destroy(&arc_mfu_ghost->arcs_mtx); 29112856Snd150628 2912789Sahrens buf_fini(); 2913789Sahrens } 2914