xref: /onnv-gate/usr/src/uts/common/fs/zfs/arc.c (revision 3700)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic:  we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

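/*
 * Illustrative sketch (not part of the original source): a consumer
 * pins a buffer by holding a reference to it, which makes the buffer
 * un-evictable until the reference is dropped.  Using the interfaces
 * defined later in this file:
 *
 *	arc_buf_add_ref(buf, tag);		buf is now pinned
 *	...					read buf->b_data safely
 *	(void) arc_buf_remove_ref(buf, tag);	buf is evictable again
 *
 * The count lives in the buffer header (b_refcnt); eviction only
 * considers headers whose refcount has dropped to zero.
 */
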
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes; rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */
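
/*
 * Illustrative sketch of the lock ordering described above
 * (hypothetical caller; arc_evict() below follows this pattern):
 *
 *	mutex_enter(&state->arcs_mtx);		arc list lock first
 *	hash_lock = HDR_LOCK(ab);
 *	if (mutex_tryenter(hash_lock)) {	never mutex_enter() here
 *		...				manipulate the header
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(&state->arcs_mtx);
 */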

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to a buffer, it is
 * linked onto one of the lists in arc.  These are the
 * only buffers that can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */

typedef struct arc_state {
	list_t	arcs_list;	/* linked list of evictable buffers in state */
	uint64_t arcs_lsize;	/* total size of buffers in the linked list */
	uint64_t arcs_size;	/* total size of all buffers in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 }
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val));

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
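
/*
 * Usage sketch (these mirror actual calls later in this file): bump a
 * counter on each hash collision and track the longest chain seen:
 *
 *	ARCSTAT_BUMP(arcstat_hash_collisions);
 *	ARCSTAT_MAX(arcstat_hash_chain_max, i);
 *
 * ARCSTAT_MAX() is a lock-free maximum: it retries atomic_cas_64()
 * until either the current value is already >= val or the CAS lands.
 */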

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
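
/*
 * For example, arc_buf_add_ref() below does:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits, or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */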
290789Sahrens 
2913403Sbmc kstat_t			*arc_ksp;
2923403Sbmc static arc_state_t 	*arc_anon;
2933403Sbmc static arc_state_t	*arc_mru;
2943403Sbmc static arc_state_t	*arc_mru_ghost;
2953403Sbmc static arc_state_t	*arc_mfu;
2963403Sbmc static arc_state_t	*arc_mfu_ghost;
2973403Sbmc 
2983403Sbmc /*
2993403Sbmc  * There are several ARC variables that are critical to export as kstats --
3003403Sbmc  * but we don't want to have to grovel around in the kstat whenever we wish to
3013403Sbmc  * manipulate them.  For these variables, we therefore define them to be in
3023403Sbmc  * terms of the statistic variable.  This assures that we are not introducing
3033403Sbmc  * the possibility of inconsistency by having shadow copies of the variables,
3043403Sbmc  * while still allowing the code to be readable.
3053403Sbmc  */
3063403Sbmc #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
3073403Sbmc #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
3083403Sbmc #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
3093403Sbmc #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
3103403Sbmc #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
3113403Sbmc 
3123403Sbmc static int		arc_no_grow;	/* Don't try to grow cache size */
3133403Sbmc static uint64_t		arc_tempreserve;
314789Sahrens 
315789Sahrens typedef struct arc_callback arc_callback_t;
316789Sahrens 
317789Sahrens struct arc_callback {
3183547Smaybee 	void			*acb_private;
319789Sahrens 	arc_done_func_t		*acb_done;
320789Sahrens 	arc_byteswap_func_t	*acb_byteswap;
321789Sahrens 	arc_buf_t		*acb_buf;
322789Sahrens 	zio_t			*acb_zio_dummy;
323789Sahrens 	arc_callback_t		*acb_next;
324789Sahrens };
325789Sahrens 
3263547Smaybee typedef struct arc_write_callback arc_write_callback_t;
3273547Smaybee 
3283547Smaybee struct arc_write_callback {
3293547Smaybee 	void		*awcb_private;
3303547Smaybee 	arc_done_func_t	*awcb_ready;
3313547Smaybee 	arc_done_func_t	*awcb_done;
3323547Smaybee 	arc_buf_t	*awcb_buf;
3333547Smaybee };
3343547Smaybee 
335789Sahrens struct arc_buf_hdr {
336789Sahrens 	/* protected by hash lock */
337789Sahrens 	dva_t			b_dva;
338789Sahrens 	uint64_t		b_birth;
339789Sahrens 	uint64_t		b_cksum0;
340789Sahrens 
3413093Sahrens 	kmutex_t		b_freeze_lock;
3423093Sahrens 	zio_cksum_t		*b_freeze_cksum;
3433093Sahrens 
344789Sahrens 	arc_buf_hdr_t		*b_hash_next;
345789Sahrens 	arc_buf_t		*b_buf;
346789Sahrens 	uint32_t		b_flags;
3471544Seschrock 	uint32_t		b_datacnt;
348789Sahrens 
3493290Sjohansen 	arc_callback_t		*b_acb;
350789Sahrens 	kcondvar_t		b_cv;
3513290Sjohansen 
3523290Sjohansen 	/* immutable */
3533290Sjohansen 	arc_buf_contents_t	b_type;
3543290Sjohansen 	uint64_t		b_size;
3553290Sjohansen 	spa_t			*b_spa;
356789Sahrens 
357789Sahrens 	/* protected by arc state mutex */
358789Sahrens 	arc_state_t		*b_state;
359789Sahrens 	list_node_t		b_arc_node;
360789Sahrens 
361789Sahrens 	/* updated atomically */
362789Sahrens 	clock_t			b_arc_access;
363789Sahrens 
364789Sahrens 	/* self protecting */
365789Sahrens 	refcount_t		b_refcnt;
366789Sahrens };
367789Sahrens 
3681544Seschrock static arc_buf_t *arc_eviction_list;
3691544Seschrock static kmutex_t arc_eviction_mtx;
3702887Smaybee static arc_buf_hdr_t arc_eviction_hdr;
3713552Sjohansen static size_t arc_ziosize;
3722688Smaybee static void arc_get_data_buf(arc_buf_t *buf);
3732688Smaybee static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
3741544Seschrock 
3751544Seschrock #define	GHOST_STATE(state)	\
3763403Sbmc 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
3771544Seschrock 
/*
 * Private ARC flags.  These flags are private ARC-only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	64

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(buf) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))

uint64_t zfs_crc64_table[256];

static uint64_t
buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
{
	uintptr_t spav = (uintptr_t)spa;
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spav>>8) ^ birth;

	return (crc);
}
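
/*
 * Sketch of how the hash is consumed (see BUF_HASH_INDEX above): the
 * CRC64 of the DVA, folded with the spa pointer and the birth txg, is
 * masked down to a table index:
 *
 *	idx = buf_hash(spa, dva, birth) & buf_hash_table.ht_mask;
 *
 * ht_mask is always one less than a power of two, since buf_init()
 * below only ever sizes the table in powers of two.
 */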

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static arc_buf_hdr_t *
buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}
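
/*
 * Caller pattern (sketch): buf_hash_insert() returns with *lockp held
 * whether or not the insert succeeded, so a caller must handle the
 * existing-element case and always drop the lock itself:
 *
 *	fbuf = buf_hash_insert(hdr, &hash_lock);
 *	if (fbuf != NULL) {
 *		...	an equal header already existed; use fbuf
 *	}
 *	mutex_exit(hash_lock);
 */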

static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}
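
/*
 * Worked sizing example (illustrative): with 4GB of physical memory,
 * the loop above doubles hsize until hsize * 64K >= 4GB, giving
 * 65536 buckets.  With 8-byte pointers that is a 512KB table, which
 * matches the 128KB-per-GB estimate in the comment above.
 */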

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL ||
	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

static void
arc_cksum_compute(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_thaw(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	if (buf->b_hdr->b_state != arc_anon)
		panic("modifying non-anon buffer!");
	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
		panic("modifying buffer while i/o in progress!");
	arc_cksum_verify(buf);
	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_state == arc_anon);
	arc_cksum_compute(buf);
}
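
/*
 * Usage sketch for the freeze/thaw debug checksums above (they are
 * no-ops unless ZFS_DEBUG_MODIFY is set in zfs_flags): a writer thaws
 * an anonymous buffer before modifying it and freezes it when done:
 *
 *	arc_buf_thaw(buf);	discard checksum; b_state must be arc_anon
 *	...			modify buf->b_data
 *	arc_buf_freeze(buf);	recompute and store the checksum
 *
 * arc_cksum_verify() will then panic if a frozen buffer was modified.
 */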

static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc_anon)) {
		uint64_t delta = ab->b_size * ab->b_datacnt;

		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
		mutex_enter(&ab->b_state->arcs_mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(&ab->b_state->arcs_list, ab);
		if (GHOST_STATE(ab->b_state)) {
			ASSERT3U(ab->b_datacnt, ==, 0);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
		atomic_add_64(&ab->b_state->arcs_lsize, -delta);
		mutex_exit(&ab->b_state->arcs_mtx);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;
	arc_state_t *state = ab->b_state;

	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (state != arc_anon)) {
		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
		mutex_enter(&state->arcs_mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&state->arcs_list, ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
		ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
		mutex_exit(&state->arcs_mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int64_t refcnt = refcount_count(&ab->b_refcnt);
	uint64_t from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));

	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon) {
			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);

			if (use_mutex)
				mutex_enter(&old_state->arcs_mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->arcs_list, ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-null datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(old_state->arcs_lsize, >=, from_delta);
			atomic_add_64(&old_state->arcs_lsize, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->arcs_mtx);
		}
		if (new_state != arc_anon) {
			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);

			if (use_mutex)
				mutex_enter(&new_state->arcs_mtx);

			list_insert_head(&new_state->arcs_list, ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(&new_state->arcs_lsize, to_delta);
			ASSERT3U(new_state->arcs_size + to_delta, >=,
			    new_state->arcs_lsize);

			if (use_mutex)
				mutex_exit(&new_state->arcs_mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	if (new_state == arc_anon && old_state != arc_anon) {
		buf_hash_remove(ab);
	}

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->arcs_size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->arcs_size, >=, from_delta);
		atomic_add_64(&old_state->arcs_size, -from_delta);
	}
	ab->b_state = new_state;
}

arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_type = type;
	hdr->b_spa = spa;
	hdr->b_state = arc_anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	return (buf);
}
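
/*
 * Allocation sketch (illustrative): a typical consumer pairs
 * arc_buf_alloc() with arc_buf_remove_ref() (or arc_buf_free()),
 * passing the same tag to both:
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, size, tag, ARC_BUFC_DATA);
 *	bcopy(src, buf->b_data, size);
 *	...
 *	(void) arc_buf_remove_ref(buf, tag);
 */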

static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);
	hdr->b_datacnt += 1;
	return (buf);
}

void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is currently being evicted via
	 * arc_do_user_evicts().
	 */
	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		mutex_exit(&arc_eviction_mtx);
		return;
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);
	if (buf->b_data == NULL) {
		/*
		 * This buffer is evicted.
		 */
		mutex_exit(hash_lock);
		return;
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
	add_reference(hdr, hash_lock, tag);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
}

static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				zio_buf_free(buf->b_data, size);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				zio_data_buf_free(buf->b_data, size);
			}
			atomic_add_64(&arc_size, -size);
		}
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);
			ASSERT3U(state->arcs_lsize, >=, size);
			atomic_add_64(&state->arcs_lsize, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			mutex_enter(&arc_eviction_mtx);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
		}
	}
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0) {
			ASSERT(HDR_IO_ERROR(hdr));
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			arc_hdr_destroy(hdr);
		}
	}
}

int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc_anon) {
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	ASSERT(hdr->b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
 * If the recycle flag is set, then attempt to "recycle" a buffer:
 * - look for a buffer to evict that is `bytes' long.
 * - return the data block from this buffer rather than freeing it.
 * This flag is used by callers that are trying to make space for a
 * new buffer in a full arc cache.
 */
static void *
arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
    arc_buf_contents_t type)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
	arc_buf_hdr_t *ab, *ab_prev = NULL;
	kmutex_t *hash_lock;
	boolean_t have_lock;
	void *stolen = NULL;

	ASSERT(state == arc_mru || state == arc_mfu);

	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	mutex_enter(&state->arcs_mtx);
	mutex_enter(&evicted_state->arcs_mtx);

	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->arcs_list, ab);
		/* prefetch buffers have a minimum lifespan */
		if (HDR_IO_IN_PROGRESS(ab) ||
		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
			skipped++;
			continue;
		}
		/* "lookahead" for a better eviction candidate */
		if (recycle && ab->b_size != bytes &&
		    ab_prev && ab_prev->b_size == bytes)
			continue;
		hash_lock = HDR_LOCK(ab);
		have_lock = MUTEX_HELD(hash_lock);
		if (have_lock || mutex_tryenter(hash_lock)) {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
			ASSERT(ab->b_datacnt > 0);
			while (ab->b_buf) {
				arc_buf_t *buf = ab->b_buf;
				if (buf->b_data) {
					bytes_evicted += ab->b_size;
					if (recycle && ab->b_type == type &&
					    ab->b_size == bytes) {
						stolen = buf->b_data;
						recycle = FALSE;
					}
				}
				if (buf->b_efunc) {
					mutex_enter(&arc_eviction_mtx);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, FALSE);
					ab->b_buf = buf->b_next;
					buf->b_hdr = &arc_eviction_hdr;
					buf->b_next = arc_eviction_list;
					arc_eviction_list = buf;
					mutex_exit(&arc_eviction_mtx);
				} else {
					arc_buf_destroy(buf,
					    buf->b_data == stolen, TRUE);
				}
			}
			ASSERT(ab->b_datacnt == 0);
			arc_change_state(evicted_state, ab, hash_lock);
			ASSERT(HDR_IN_HASH_TABLE(ab));
			ab->b_flags = ARC_IN_HASH_TABLE;
			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
			if (!have_lock)
				mutex_exit(hash_lock);
			if (bytes >= 0 && bytes_evicted >= bytes)
				break;
		} else {
			missed += 1;
		}
	}

	mutex_exit(&evicted_state->arcs_mtx);
	mutex_exit(&state->arcs_mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %p",
		    (longlong_t)bytes_evicted, state);

	if (skipped)
		ARCSTAT_INCR(arcstat_evict_skip, skipped);

	if (missed)
		ARCSTAT_INCR(arcstat_mutex_miss, missed);

	return (stolen);
}
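
/*
 * Recycle sketch (illustrative; assumes a zio_data_buf_alloc()
 * counterpart to the zio_data_buf_free() used above): a caller that
 * needs a new block of a given size can ask arc_evict() to hand back
 * an evicted block of exactly that size instead of freeing it:
 *
 *	void *data = arc_evict(arc_mru, size, TRUE, ARC_BUFC_DATA);
 *	if (data == NULL)
 *		data = zio_data_buf_alloc(size);
 *
 * This avoids a free/alloc round trip when the cache is full; see the
 * arc_get_data_buf() forward declaration above for the likely consumer.
 */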

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 */
static void
arc_evict_ghost(arc_state_t *state, int64_t bytes)
{
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint64_t bufs_skipped = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->arcs_list, ab);
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(ab));
			ASSERT(ab->b_buf == NULL);
			arc_change_state(arc_anon, ab, hash_lock);
			mutex_exit(hash_lock);
			ARCSTAT_BUMP(arcstat_deleted);
			bytes_deleted += ab->b_size;
			arc_hdr_destroy(ab);
			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				mutex_exit(&state->arcs_mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->arcs_mtx);

	if (bufs_skipped) {
		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

static void
arc_adjust(void)
{
	int64_t top_sz, mru_over, arc_over, todelete;

	top_sz = arc_anon->arcs_size + arc_mru->arcs_size;

	if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
		int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
		(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
	}

	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;

	if (mru_over > 0) {
		if (arc_mru_ghost->arcs_lsize > 0) {
			todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
			arc_evict_ghost(arc_mru_ghost, todelete);
		}
	}

	if ((arc_over = arc_size - arc_c) > 0) {
		int64_t tbl_over;

		if (arc_mfu->arcs_lsize > 0) {
			int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
			(void) arc_evict(arc_mfu, toevict, FALSE,
			    ARC_BUFC_UNDEF);
		}

		tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
		    arc_mfu_ghost->arcs_lsize - arc_c*2;

		if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
			todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
			arc_evict_ghost(arc_mfu_ghost, todelete);
		}
	}
}
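
/*
 * Summary of the bounds arc_adjust() restores (not original text):
 *
 *	arc_anon + arc_mru				<= arc_p
 *	arc_anon + arc_mru + arc_mru_ghost		<= arc_c
 *	arc_size (all cached data)			<= arc_c
 *	arc_size + arc_mru_ghost + arc_mfu_ghost	<= 2 * arc_c
 *
 * Each clause above evicts or deletes just enough to restore its bound.
 */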
1303789Sahrens 
13041544Seschrock static void
13051544Seschrock arc_do_user_evicts(void)
13061544Seschrock {
13071544Seschrock 	mutex_enter(&arc_eviction_mtx);
13081544Seschrock 	while (arc_eviction_list != NULL) {
13091544Seschrock 		arc_buf_t *buf = arc_eviction_list;
13101544Seschrock 		arc_eviction_list = buf->b_next;
13111544Seschrock 		buf->b_hdr = NULL;
13121544Seschrock 		mutex_exit(&arc_eviction_mtx);
13131544Seschrock 
13141819Smaybee 		if (buf->b_efunc != NULL)
13151819Smaybee 			VERIFY(buf->b_efunc(buf) == 0);
13161544Seschrock 
13171544Seschrock 		buf->b_efunc = NULL;
13181544Seschrock 		buf->b_private = NULL;
13191544Seschrock 		kmem_cache_free(buf_cache, buf);
13201544Seschrock 		mutex_enter(&arc_eviction_mtx);
13211544Seschrock 	}
13221544Seschrock 	mutex_exit(&arc_eviction_mtx);
13231544Seschrock }
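
/*
 * Note that arc_do_user_evicts() clears b_hdr and drops
 * arc_eviction_mtx before invoking each b_efunc callback, so the
 * callback runs unlocked and can take other locks (or call back into
 * the ARC) without deadlocking against the eviction list.
 */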
13241544Seschrock 
1325789Sahrens /*
1326789Sahrens  * Flush all *evictable* data from the cache.
1327789Sahrens  * NOTE: this will not touch "active" (i.e. referenced) data.
1328789Sahrens  */
1329789Sahrens void
1330789Sahrens arc_flush(void)
1331789Sahrens {
13323403Sbmc 	while (list_head(&arc_mru->arcs_list))
13333403Sbmc 		(void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
13343403Sbmc 	while (list_head(&arc_mfu->arcs_list))
13353403Sbmc 		(void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
1336789Sahrens 
13373403Sbmc 	arc_evict_ghost(arc_mru_ghost, -1);
13383403Sbmc 	arc_evict_ghost(arc_mfu_ghost, -1);
13391544Seschrock 
13401544Seschrock 	mutex_enter(&arc_reclaim_thr_lock);
13411544Seschrock 	arc_do_user_evicts();
13421544Seschrock 	mutex_exit(&arc_reclaim_thr_lock);
13431544Seschrock 	ASSERT(arc_eviction_list == NULL);
1344789Sahrens }
1345789Sahrens 
13463158Smaybee int arc_shrink_shift = 5;		/* log2(fraction of arc to reclaim) */
13472391Smaybee 
1348789Sahrens void
13493158Smaybee arc_shrink(void)
1350789Sahrens {
13513403Sbmc 	if (arc_c > arc_c_min) {
13523158Smaybee 		uint64_t to_free;
1353789Sahrens 
13542048Sstans #ifdef _KERNEL
13553403Sbmc 		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
13562048Sstans #else
13573403Sbmc 		to_free = arc_c >> arc_shrink_shift;
13582048Sstans #endif
13593403Sbmc 		if (arc_c > arc_c_min + to_free)
13603403Sbmc 			atomic_add_64(&arc_c, -to_free);
13613158Smaybee 		else
13623403Sbmc 			arc_c = arc_c_min;
13632048Sstans 
13643403Sbmc 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
13653403Sbmc 		if (arc_c > arc_size)
13663403Sbmc 			arc_c = MAX(arc_size, arc_c_min);
13673403Sbmc 		if (arc_p > arc_c)
13683403Sbmc 			arc_p = (arc_c >> 1);
13693403Sbmc 		ASSERT(arc_c >= arc_c_min);
13703403Sbmc 		ASSERT((int64_t)arc_p >= 0);
13713158Smaybee 	}
1372789Sahrens 
13733403Sbmc 	if (arc_size > arc_c)
13743158Smaybee 		arc_adjust();
1375789Sahrens }
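
/*
 * With the default arc_shrink_shift of 5, each arc_shrink() call
 * releases arc_c / 32 (roughly 3%) of the target cache size, e.g.
 * 128MB for a hypothetical 4GB arc_c, or ptob(needfree) in the
 * kernel, whichever is larger, and reduces arc_p by 1/32 of its own
 * value, all subject to the arc_c_min floor.
 */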
1376789Sahrens 
1377789Sahrens static int
1378789Sahrens arc_reclaim_needed(void)
1379789Sahrens {
1380789Sahrens 	uint64_t extra;
1381789Sahrens 
1382789Sahrens #ifdef _KERNEL
13832048Sstans 
13842048Sstans 	if (needfree)
13852048Sstans 		return (1);
13862048Sstans 
1387789Sahrens 	/*
1388789Sahrens 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
1389789Sahrens 	 */
1390789Sahrens 	extra = desfree;
1391789Sahrens 
1392789Sahrens 	/*
1393789Sahrens 	 * check that we're out of range of the pageout scanner.  It starts to
1394789Sahrens 	 * schedule paging if freemem is less than lotsfree and needfree.
1395789Sahrens 	 * lotsfree is the high-water mark for pageout, and needfree is the
1396789Sahrens 	 * number of needed free pages.  We add extra pages here to make sure
1397789Sahrens 	 * the scanner doesn't start up while we're freeing memory.
1398789Sahrens 	 */
1399789Sahrens 	if (freemem < lotsfree + needfree + extra)
1400789Sahrens 		return (1);
1401789Sahrens 
1402789Sahrens 	/*
1403789Sahrens 	 * check to make sure that swapfs has enough space so that anon
1404789Sahrens 	 * reservations can still succeed. anon_resvmem() checks that the
1405789Sahrens 	 * availrmem is greater than swapfs_minfree, and the number of reserved
1406789Sahrens 	 * swap pages.  We also add a bit of extra here just to prevent
1407789Sahrens 	 * circumstances from getting really dire.
1408789Sahrens 	 */
1409789Sahrens 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1410789Sahrens 		return (1);
1411789Sahrens 
14123307Sjohansen 	/*
14133307Sjohansen 	 * If zio data pages are being allocated out of a separate heap segment,
14143307Sjohansen 	 * then check that the size of available vmem for this area remains
14153552Sjohansen 	 * above 1/4th free.  This matters because the size of the
14163307Sjohansen 	 * non-default segment can be smaller than physical memory, so we could
14173307Sjohansen 	 * conceivably run out of VA in that segment before running out of
14183307Sjohansen 	 * physical memory.
14193307Sjohansen 	 */
14203552Sjohansen 	if (zio_arena != NULL) {
14213552Sjohansen 		if ((btop(physmem) > arc_ziosize) &&
14223552Sjohansen 		    (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
14233307Sjohansen 			return (1);
14243552Sjohansen 	}
14253307Sjohansen 
14261936Smaybee #if defined(__i386)
1427789Sahrens 	/*
1428789Sahrens 	 * If we're on an i386 platform, it's possible that we'll exhaust the
1429789Sahrens 	 * kernel heap space before we ever run out of available physical
1430789Sahrens 	 * memory.  Most checks of the size of the heap_area compare against
1431789Sahrens 	 * tune.t_minarmem, which is the minimum available real memory that we
1432789Sahrens 	 * can have in the system.  However, this is generally fixed at 25 pages
1433789Sahrens 	 * which is so low that it's useless.  In this comparison, we seek to
1434789Sahrens 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
1435789Sahrens 	 * heap is allocated.  (Or, in the caclulation, if less than 1/4th is
1436789Sahrens 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
1437789Sahrens 	 * free.)
1438789Sahrens 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1439789Sahrens 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1440789Sahrens 		return (1);
1441789Sahrens #endif
1442789Sahrens 
1443789Sahrens #else
1444789Sahrens 	if (spa_get_random(100) == 0)
1445789Sahrens 		return (1);
1446789Sahrens #endif
1447789Sahrens 	return (0);
1448789Sahrens }
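
/*
 * In the userland (non-_KERNEL) build above, spa_get_random(100) == 0
 * reports memory pressure on roughly 1% of calls, presumably so the
 * reclaim paths still get exercised when testing outside the kernel.
 */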
1449789Sahrens 
1450789Sahrens static void
1451789Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1452789Sahrens {
1453789Sahrens 	size_t			i;
1454789Sahrens 	kmem_cache_t		*prev_cache = NULL;
14553290Sjohansen 	kmem_cache_t		*prev_data_cache = NULL;
1456789Sahrens 	extern kmem_cache_t	*zio_buf_cache[];
14573290Sjohansen 	extern kmem_cache_t	*zio_data_buf_cache[];
1458789Sahrens 
14591484Sek110237 #ifdef _KERNEL
14601484Sek110237 	/*
14611484Sek110237 	 * First purge some DNLC entries, in case the DNLC is using
14621484Sek110237 	 * up too much memory.
14631484Sek110237 	 */
14641505Sek110237 	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
14651936Smaybee 
14661936Smaybee #if defined(__i386)
14671936Smaybee 	/*
14681936Smaybee 	 * Reclaim unused memory from all kmem caches.
14691936Smaybee 	 */
14701936Smaybee 	kmem_reap();
14711936Smaybee #endif
14721484Sek110237 #endif
14731484Sek110237 
1474789Sahrens 	/*
14751544Seschrock 	 * An aggressive reclamation will shrink the cache size as well as
14761544Seschrock 	 * reap free buffers from the arc kmem caches.
1477789Sahrens 	 */
1478789Sahrens 	if (strat == ARC_RECLAIM_AGGR)
14793158Smaybee 		arc_shrink();
1480789Sahrens 
1481789Sahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1482789Sahrens 		if (zio_buf_cache[i] != prev_cache) {
1483789Sahrens 			prev_cache = zio_buf_cache[i];
1484789Sahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
1485789Sahrens 		}
14863290Sjohansen 		if (zio_data_buf_cache[i] != prev_data_cache) {
14873290Sjohansen 			prev_data_cache = zio_data_buf_cache[i];
14883290Sjohansen 			kmem_cache_reap_now(zio_data_buf_cache[i]);
14893290Sjohansen 		}
1490789Sahrens 	}
14911544Seschrock 	kmem_cache_reap_now(buf_cache);
14921544Seschrock 	kmem_cache_reap_now(hdr_cache);
1493789Sahrens }
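
/*
 * A note on the reap loop above: zio_buf_cache[] and
 * zio_data_buf_cache[] have one slot per SPA_MINBLOCKSIZE (512-byte)
 * increment (256 slots for a 128K SPA_MAXBLOCKSIZE), but consecutive
 * slots may point at the same kmem cache, so the
 * prev_cache/prev_data_cache checks avoid reaping a cache twice in
 * a row.
 */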
1494789Sahrens 
1495789Sahrens static void
1496789Sahrens arc_reclaim_thread(void)
1497789Sahrens {
1498789Sahrens 	clock_t			growtime = 0;
1499789Sahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1500789Sahrens 	callb_cpr_t		cpr;
1501789Sahrens 
1502789Sahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1503789Sahrens 
1504789Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1505789Sahrens 	while (arc_thread_exit == 0) {
1506789Sahrens 		if (arc_reclaim_needed()) {
1507789Sahrens 
15083403Sbmc 			if (arc_no_grow) {
1509789Sahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
1510789Sahrens 					last_reclaim = ARC_RECLAIM_AGGR;
1511789Sahrens 				} else {
1512789Sahrens 					last_reclaim = ARC_RECLAIM_CONS;
1513789Sahrens 				}
1514789Sahrens 			} else {
15153403Sbmc 				arc_no_grow = TRUE;
1516789Sahrens 				last_reclaim = ARC_RECLAIM_AGGR;
1517789Sahrens 				membar_producer();
1518789Sahrens 			}
1519789Sahrens 
1520789Sahrens 			/* reset the growth delay for every reclaim */
1521789Sahrens 			growtime = lbolt + (arc_grow_retry * hz);
15222856Snd150628 			ASSERT(growtime > 0);
1523789Sahrens 
1524789Sahrens 			arc_kmem_reap_now(last_reclaim);
1525789Sahrens 
1526789Sahrens 		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
15273403Sbmc 			arc_no_grow = FALSE;
1528789Sahrens 		}
1529789Sahrens 
15303403Sbmc 		if (2 * arc_c < arc_size +
15313403Sbmc 		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
15323298Smaybee 			arc_adjust();
15333298Smaybee 
15341544Seschrock 		if (arc_eviction_list != NULL)
15351544Seschrock 			arc_do_user_evicts();
15361544Seschrock 
1537789Sahrens 		/* block until needed, or one second, whichever is shorter */
1538789Sahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
1539789Sahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
1540789Sahrens 		    &arc_reclaim_thr_lock, (lbolt + hz));
1541789Sahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1542789Sahrens 	}
1543789Sahrens 
1544789Sahrens 	arc_thread_exit = 0;
1545789Sahrens 	cv_broadcast(&arc_reclaim_thr_cv);
1546789Sahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1547789Sahrens 	thread_exit();
1548789Sahrens }
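
/*
 * The reclaim thread above wakes at least once per second (its
 * cv_timedwait() deadline is lbolt + hz), or sooner when arc_adapt()
 * signals arc_reclaim_thr_cv.  While memory stays tight with
 * arc_no_grow already set, it alternates conservative and aggressive
 * reclaims; growth is re-enabled only after arc_grow_retry seconds
 * have passed since the last reclaim.
 */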
1549789Sahrens 
15501544Seschrock /*
15511544Seschrock  * Adapt arc info given the number of bytes we are trying to add and
15521544Seschrock  * the state that we are coming from.  This function is only called
15531544Seschrock  * when we are adding new content to the cache.
15541544Seschrock  */
1555789Sahrens static void
15561544Seschrock arc_adapt(int bytes, arc_state_t *state)
1557789Sahrens {
15581544Seschrock 	int mult;
15591544Seschrock 
15601544Seschrock 	ASSERT(bytes > 0);
1561789Sahrens 	/*
15621544Seschrock 	 * Adapt the target size of the MRU list:
15631544Seschrock 	 *	- if we just hit in the MRU ghost list, then increase
15641544Seschrock 	 *	  the target size of the MRU list.
15651544Seschrock 	 *	- if we just hit in the MFU ghost list, then increase
15661544Seschrock 	 *	  the target size of the MFU list by decreasing the
15671544Seschrock 	 *	  target size of the MRU list.
1568789Sahrens 	 */
15693403Sbmc 	if (state == arc_mru_ghost) {
15703403Sbmc 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
15713403Sbmc 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
15721544Seschrock 
15733403Sbmc 		arc_p = MIN(arc_c, arc_p + bytes * mult);
15743403Sbmc 	} else if (state == arc_mfu_ghost) {
15753403Sbmc 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
15763403Sbmc 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
15771544Seschrock 
15783403Sbmc 		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
15791544Seschrock 	}
15803403Sbmc 	ASSERT((int64_t)arc_p >= 0);
1581789Sahrens 
1582789Sahrens 	if (arc_reclaim_needed()) {
1583789Sahrens 		cv_signal(&arc_reclaim_thr_cv);
1584789Sahrens 		return;
1585789Sahrens 	}
1586789Sahrens 
15873403Sbmc 	if (arc_no_grow)
1588789Sahrens 		return;
1589789Sahrens 
15903403Sbmc 	if (arc_c >= arc_c_max)
15911544Seschrock 		return;
15921544Seschrock 
1593789Sahrens 	/*
15941544Seschrock 	 * If we're within (2 * maxblocksize) bytes of the target
15951544Seschrock 	 * cache size, increment the target cache size
1596789Sahrens 	 */
15973403Sbmc 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
15983403Sbmc 		atomic_add_64(&arc_c, (int64_t)bytes);
15993403Sbmc 		if (arc_c > arc_c_max)
16003403Sbmc 			arc_c = arc_c_max;
16013403Sbmc 		else if (state == arc_anon)
16023403Sbmc 			atomic_add_64(&arc_p, (int64_t)bytes);
16033403Sbmc 		if (arc_p > arc_c)
16043403Sbmc 			arc_p = arc_c;
1605789Sahrens 	}
16063403Sbmc 	ASSERT((int64_t)arc_p >= 0);
1607789Sahrens }
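
/*
 * Example of the ghost-hit adaptation above, with hypothetical sizes:
 * if arc_mru_ghost holds 100MB and arc_mfu_ghost holds 300MB, a hit
 * in the MRU ghost list uses mult = 300 / 100 = 3 and grows arc_p by
 * 3 * bytes (clamped to arc_c), while a hit in the MFU ghost list
 * uses mult = 1 and shrinks arc_p by just 1 * bytes (floored at 0).
 * Hits in the smaller ghost list thus move arc_p more aggressively.
 */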
1608789Sahrens 
1609789Sahrens /*
16101544Seschrock  * Check if the cache has reached its limits and eviction is required
16111544Seschrock  * prior to insert.
1612789Sahrens  */
1613789Sahrens static int
1614789Sahrens arc_evict_needed()
1615789Sahrens {
1616789Sahrens 	if (arc_reclaim_needed())
1617789Sahrens 		return (1);
1618789Sahrens 
16193403Sbmc 	return (arc_size > arc_c);
1620789Sahrens }
1621789Sahrens 
1622789Sahrens /*
16232688Smaybee  * The buffer, supplied as the first argument, needs a data block.
16242688Smaybee  * So, if we are at cache max, determine which cache should be victimized.
16252688Smaybee  * We have the following cases:
1626789Sahrens  *
16273403Sbmc  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
1628789Sahrens  * In this situation if we're out of space, but the resident size of the MFU is
1629789Sahrens  * under the limit, victimize the MFU cache to satisfy this insertion request.
1630789Sahrens  *
16313403Sbmc  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
1632789Sahrens  * Here, we've used up all of the available space for the MRU, so we need to
1633789Sahrens  * evict from our own cache instead.  Evict from the set of resident MRU
1634789Sahrens  * entries.
1635789Sahrens  *
16363403Sbmc  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
1637789Sahrens  * c minus p represents the MFU space in the cache, since p is the size of the
1638789Sahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
1639789Sahrens  * the MFU side, so the MRU side needs to be victimized.
1640789Sahrens  *
16413403Sbmc  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
1642789Sahrens  * MFU's resident set is consuming more space than it has been allotted.  In
1643789Sahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
1644789Sahrens  */
1645789Sahrens static void
16462688Smaybee arc_get_data_buf(arc_buf_t *buf)
1647789Sahrens {
16483290Sjohansen 	arc_state_t		*state = buf->b_hdr->b_state;
16493290Sjohansen 	uint64_t		size = buf->b_hdr->b_size;
16503290Sjohansen 	arc_buf_contents_t	type = buf->b_hdr->b_type;
16512688Smaybee 
16522688Smaybee 	arc_adapt(size, state);
1653789Sahrens 
16542688Smaybee 	/*
16552688Smaybee 	 * We have not yet reached cache maximum size,
16562688Smaybee 	 * just allocate a new buffer.
16572688Smaybee 	 */
16582688Smaybee 	if (!arc_evict_needed()) {
16593290Sjohansen 		if (type == ARC_BUFC_METADATA) {
16603290Sjohansen 			buf->b_data = zio_buf_alloc(size);
16613290Sjohansen 		} else {
16623290Sjohansen 			ASSERT(type == ARC_BUFC_DATA);
16633290Sjohansen 			buf->b_data = zio_data_buf_alloc(size);
16643290Sjohansen 		}
16653403Sbmc 		atomic_add_64(&arc_size, size);
16662688Smaybee 		goto out;
16672688Smaybee 	}
16682688Smaybee 
16692688Smaybee 	/*
16702688Smaybee 	 * If we are prefetching from the mfu ghost list, this buffer
16712688Smaybee 	 * will end up on the mru list; so steal space from there.
16722688Smaybee 	 */
16733403Sbmc 	if (state == arc_mfu_ghost)
16743403Sbmc 		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
16753403Sbmc 	else if (state == arc_mru_ghost)
16763403Sbmc 		state = arc_mru;
1677789Sahrens 
16783403Sbmc 	if (state == arc_mru || state == arc_anon) {
16793403Sbmc 		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
16803403Sbmc 		state = (arc_p > mru_used) ? arc_mfu : arc_mru;
1681789Sahrens 	} else {
16822688Smaybee 		/* MFU cases */
16833403Sbmc 		uint64_t mfu_space = arc_c - arc_p;
16843403Sbmc 		state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
16852688Smaybee 	}
16863290Sjohansen 	if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
16873290Sjohansen 		if (type == ARC_BUFC_METADATA) {
16883290Sjohansen 			buf->b_data = zio_buf_alloc(size);
16893290Sjohansen 		} else {
16903290Sjohansen 			ASSERT(type == ARC_BUFC_DATA);
16913290Sjohansen 			buf->b_data = zio_data_buf_alloc(size);
16923290Sjohansen 		}
16933403Sbmc 		atomic_add_64(&arc_size, size);
16943403Sbmc 		ARCSTAT_BUMP(arcstat_recycle_miss);
16952688Smaybee 	}
16962688Smaybee 	ASSERT(buf->b_data != NULL);
16972688Smaybee out:
16982688Smaybee 	/*
16992688Smaybee 	 * Update the state size.  Note that ghost states have a
17002688Smaybee 	 * "ghost size" and so don't need to be updated.
17012688Smaybee 	 */
17022688Smaybee 	if (!GHOST_STATE(buf->b_hdr->b_state)) {
17032688Smaybee 		arc_buf_hdr_t *hdr = buf->b_hdr;
17042688Smaybee 
17053403Sbmc 		atomic_add_64(&hdr->b_state->arcs_size, size);
17062688Smaybee 		if (list_link_active(&hdr->b_arc_node)) {
17072688Smaybee 			ASSERT(refcount_is_zero(&hdr->b_refcnt));
17083403Sbmc 			atomic_add_64(&hdr->b_state->arcs_lsize, size);
1709789Sahrens 		}
17103298Smaybee 		/*
17113298Smaybee 		 * If we are growing the cache, and we are adding anonymous
17123403Sbmc 		 * data, and we have outgrown arc_p, update arc_p
17133298Smaybee 		 */
17143403Sbmc 		if (arc_size < arc_c && hdr->b_state == arc_anon &&
17153403Sbmc 		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
17163403Sbmc 			arc_p = MIN(arc_c, arc_p + size);
1717789Sahrens 	}
1718789Sahrens }
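
/*
 * Example of the victim selection above, with hypothetical numbers:
 * for an MRU insert with arc_p = 600MB and anon + mru = 500MB,
 * mru_used < arc_p, so the MFU list is victimized (case 1 in the
 * comment above arc_get_data_buf()); with anon + mru = 700MB the MRU
 * list evicts from itself instead (case 2).  MFU inserts compare
 * arc_c - arc_p against arc_mfu->arcs_size the same way (cases 3
 * and 4).
 */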
1719789Sahrens 
1720789Sahrens /*
1721789Sahrens  * This routine is called whenever a buffer is accessed.
17221544Seschrock  * NOTE: the caller must hold the hash lock; it is not dropped here.
1723789Sahrens  */
1724789Sahrens static void
17252688Smaybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1726789Sahrens {
1727789Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1728789Sahrens 
17293403Sbmc 	if (buf->b_state == arc_anon) {
1730789Sahrens 		/*
1731789Sahrens 		 * This buffer is not in the cache, and does not
1732789Sahrens 		 * appear in our "ghost" list.  Add the new buffer
1733789Sahrens 		 * to the MRU state.
1734789Sahrens 		 */
1735789Sahrens 
1736789Sahrens 		ASSERT(buf->b_arc_access == 0);
1737789Sahrens 		buf->b_arc_access = lbolt;
17381544Seschrock 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
17393403Sbmc 		arc_change_state(arc_mru, buf, hash_lock);
1740789Sahrens 
17413403Sbmc 	} else if (buf->b_state == arc_mru) {
1742789Sahrens 		/*
17432391Smaybee 		 * If this buffer is here because of a prefetch, then either:
17442391Smaybee 		 * - clear the flag if this is a "referencing" read
17452391Smaybee 		 *   (any subsequent access will bump this into the MFU state).
17462391Smaybee 		 * or
17472391Smaybee 		 * - move the buffer to the head of the list if this is
17482391Smaybee 		 *   another prefetch (to make it less likely to be evicted).
1749789Sahrens 		 */
1750789Sahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
17512391Smaybee 			if (refcount_count(&buf->b_refcnt) == 0) {
17522391Smaybee 				ASSERT(list_link_active(&buf->b_arc_node));
17533403Sbmc 				mutex_enter(&arc_mru->arcs_mtx);
17543403Sbmc 				list_remove(&arc_mru->arcs_list, buf);
17553403Sbmc 				list_insert_head(&arc_mru->arcs_list, buf);
17563403Sbmc 				mutex_exit(&arc_mru->arcs_mtx);
17572391Smaybee 			} else {
17582391Smaybee 				buf->b_flags &= ~ARC_PREFETCH;
17593403Sbmc 				ARCSTAT_BUMP(arcstat_mru_hits);
17602391Smaybee 			}
17612391Smaybee 			buf->b_arc_access = lbolt;
1762789Sahrens 			return;
1763789Sahrens 		}
1764789Sahrens 
1765789Sahrens 		/*
1766789Sahrens 		 * This buffer has been "accessed" only once so far,
1767789Sahrens 		 * but it is still in the cache. Move it to the MFU
1768789Sahrens 		 * state.
1769789Sahrens 		 */
1770789Sahrens 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1771789Sahrens 			/*
1772789Sahrens 			 * More than 125ms have passed since we
1773789Sahrens 			 * instantiated this buffer.  Move it to the
1774789Sahrens 			 * most frequently used state.
1775789Sahrens 			 */
1776789Sahrens 			buf->b_arc_access = lbolt;
17771544Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
17783403Sbmc 			arc_change_state(arc_mfu, buf, hash_lock);
1779789Sahrens 		}
17803403Sbmc 		ARCSTAT_BUMP(arcstat_mru_hits);
17813403Sbmc 	} else if (buf->b_state == arc_mru_ghost) {
1782789Sahrens 		arc_state_t	*new_state;
1783789Sahrens 		/*
1784789Sahrens 		 * This buffer has been "accessed" recently, but
1785789Sahrens 		 * was evicted from the cache.  Move it to the
1786789Sahrens 		 * MFU state.
1787789Sahrens 		 */
1788789Sahrens 
1789789Sahrens 		if (buf->b_flags & ARC_PREFETCH) {
17903403Sbmc 			new_state = arc_mru;
17912391Smaybee 			if (refcount_count(&buf->b_refcnt) > 0)
17922391Smaybee 				buf->b_flags &= ~ARC_PREFETCH;
17931544Seschrock 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1794789Sahrens 		} else {
17953403Sbmc 			new_state = arc_mfu;
17961544Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1797789Sahrens 		}
1798789Sahrens 
1799789Sahrens 		buf->b_arc_access = lbolt;
1800789Sahrens 		arc_change_state(new_state, buf, hash_lock);
1801789Sahrens 
18023403Sbmc 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
18033403Sbmc 	} else if (buf->b_state == arc_mfu) {
1804789Sahrens 		/*
1805789Sahrens 		 * This buffer has been accessed more than once and is
1806789Sahrens 		 * still in the cache.  Keep it in the MFU state.
1807789Sahrens 		 *
18082391Smaybee 		 * NOTE: an add_reference() that occurred when we did
18092391Smaybee 		 * the arc_read() will have kicked this off the list.
18102391Smaybee 		 * If it was a prefetch, we will explicitly move it to
18112391Smaybee 		 * the head of the list now.
1812789Sahrens 		 */
18132391Smaybee 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
18142391Smaybee 			ASSERT(refcount_count(&buf->b_refcnt) == 0);
18152391Smaybee 			ASSERT(list_link_active(&buf->b_arc_node));
18163403Sbmc 			mutex_enter(&arc_mfu->arcs_mtx);
18173403Sbmc 			list_remove(&arc_mfu->arcs_list, buf);
18183403Sbmc 			list_insert_head(&arc_mfu->arcs_list, buf);
18193403Sbmc 			mutex_exit(&arc_mfu->arcs_mtx);
18202391Smaybee 		}
18213403Sbmc 		ARCSTAT_BUMP(arcstat_mfu_hits);
18222391Smaybee 		buf->b_arc_access = lbolt;
18233403Sbmc 	} else if (buf->b_state == arc_mfu_ghost) {
18243403Sbmc 		arc_state_t	*new_state = arc_mfu;
1825789Sahrens 		/*
1826789Sahrens 		 * This buffer has been accessed more than once but has
1827789Sahrens 		 * been evicted from the cache.  Move it back to the
1828789Sahrens 		 * MFU state.
1829789Sahrens 		 */
1830789Sahrens 
18312391Smaybee 		if (buf->b_flags & ARC_PREFETCH) {
18322391Smaybee 			/*
18332391Smaybee 			 * This is a prefetch access...
18342391Smaybee 			 * move this block back to the MRU state.
18352391Smaybee 			 */
18362391Smaybee 			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
18373403Sbmc 			new_state = arc_mru;
18382391Smaybee 		}
18392391Smaybee 
1840789Sahrens 		buf->b_arc_access = lbolt;
18411544Seschrock 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
18422391Smaybee 		arc_change_state(new_state, buf, hash_lock);
1843789Sahrens 
18443403Sbmc 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
1845789Sahrens 	} else {
1846789Sahrens 		ASSERT(!"invalid arc state");
1847789Sahrens 	}
1848789Sahrens }
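
/*
 * Summary of the state transitions driven by arc_access():
 *
 *	anon      -> mru	on first access
 *	mru       -> mfu	when re-accessed more than ARC_MINTIME
 *				(about 125ms) after instantiation;
 *				prefetch hits stay in mru
 *	mru_ghost -> mfu	(or mru, if the access is a prefetch)
 *	mfu       -> mfu	(prefetch hits just move to the list head)
 *	mfu_ghost -> mfu	(or mru, if the access is a prefetch)
 */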
1849789Sahrens 
1850789Sahrens /* a generic arc_done_func_t which you can use */
1851789Sahrens /* ARGSUSED */
1852789Sahrens void
1853789Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1854789Sahrens {
1855789Sahrens 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
18561544Seschrock 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1857789Sahrens }
1858789Sahrens 
1859789Sahrens /* a generic arc_done_func_t which you can use */
1860789Sahrens void
1861789Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1862789Sahrens {
1863789Sahrens 	arc_buf_t **bufp = arg;
1864789Sahrens 	if (zio && zio->io_error) {
18651544Seschrock 		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1866789Sahrens 		*bufp = NULL;
1867789Sahrens 	} else {
1868789Sahrens 		*bufp = buf;
1869789Sahrens 	}
1870789Sahrens }
1871789Sahrens 
1872789Sahrens static void
1873789Sahrens arc_read_done(zio_t *zio)
1874789Sahrens {
18751589Smaybee 	arc_buf_hdr_t	*hdr, *found;
1876789Sahrens 	arc_buf_t	*buf;
1877789Sahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1878789Sahrens 	kmutex_t	*hash_lock;
1879789Sahrens 	arc_callback_t	*callback_list, *acb;
1880789Sahrens 	int		freeable = FALSE;
1881789Sahrens 
1882789Sahrens 	buf = zio->io_private;
1883789Sahrens 	hdr = buf->b_hdr;
1884789Sahrens 
18851589Smaybee 	/*
18861589Smaybee 	 * The hdr was inserted into hash-table and removed from lists
18871589Smaybee 	 * prior to starting I/O.  We should find this header, since
18881589Smaybee 	 * it's in the hash table, and it should be legit since it's
18891589Smaybee 	 * not possible to evict it during the I/O.  The only possible
18901589Smaybee 	 * reason for it not to be found is if we were freed during the
18911589Smaybee 	 * reason for it not to be found is that it was freed during the
18921589Smaybee 	 */
18931589Smaybee 	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
18943093Sahrens 	    &hash_lock);
1895789Sahrens 
18961589Smaybee 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
18971589Smaybee 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
1898789Sahrens 
1899789Sahrens 	/* byteswap if necessary */
1900789Sahrens 	callback_list = hdr->b_acb;
1901789Sahrens 	ASSERT(callback_list != NULL);
1902789Sahrens 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1903789Sahrens 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1904789Sahrens 
19053093Sahrens 	arc_cksum_compute(buf);
19063093Sahrens 
1907789Sahrens 	/* create copies of the data buffer for the callers */
1908789Sahrens 	abuf = buf;
1909789Sahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
1910789Sahrens 		if (acb->acb_done) {
19112688Smaybee 			if (abuf == NULL)
19122688Smaybee 				abuf = arc_buf_clone(buf);
1913789Sahrens 			acb->acb_buf = abuf;
1914789Sahrens 			abuf = NULL;
1915789Sahrens 		}
1916789Sahrens 	}
1917789Sahrens 	hdr->b_acb = NULL;
1918789Sahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
19191544Seschrock 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
19201544Seschrock 	if (abuf == buf)
19211544Seschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
1922789Sahrens 
1923789Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1924789Sahrens 
1925789Sahrens 	if (zio->io_error != 0) {
1926789Sahrens 		hdr->b_flags |= ARC_IO_ERROR;
19273403Sbmc 		if (hdr->b_state != arc_anon)
19283403Sbmc 			arc_change_state(arc_anon, hdr, hash_lock);
19291544Seschrock 		if (HDR_IN_HASH_TABLE(hdr))
19301544Seschrock 			buf_hash_remove(hdr);
1931789Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
19322391Smaybee 		/* convert checksum errors into IO errors */
19331544Seschrock 		if (zio->io_error == ECKSUM)
19341544Seschrock 			zio->io_error = EIO;
1935789Sahrens 	}
1936789Sahrens 
19371544Seschrock 	/*
19382391Smaybee 	 * Broadcast before we drop the hash_lock to avoid the possibility
19392391Smaybee 	 * that the hdr (and hence the cv) might be freed before we get to
19402391Smaybee 	 * the cv_broadcast().
19411544Seschrock 	 */
19421544Seschrock 	cv_broadcast(&hdr->b_cv);
19431544Seschrock 
19441589Smaybee 	if (hash_lock) {
1945789Sahrens 		/*
1946789Sahrens 		 * Only call arc_access on anonymous buffers.  This is because
1947789Sahrens 		 * if we've issued an I/O for an evicted buffer, we've already
1948789Sahrens 		 * called arc_access (to prevent any simultaneous readers from
1949789Sahrens 		 * getting confused).
1950789Sahrens 		 */
19513403Sbmc 		if (zio->io_error == 0 && hdr->b_state == arc_anon)
19522688Smaybee 			arc_access(hdr, hash_lock);
19532688Smaybee 		mutex_exit(hash_lock);
1954789Sahrens 	} else {
1955789Sahrens 		/*
1956789Sahrens 		 * This block was freed while we waited for the read to
1957789Sahrens 		 * complete.  It has been removed from the hash table and
1958789Sahrens 		 * moved to the anonymous state (so that it won't show up
1959789Sahrens 		 * in the cache).
1960789Sahrens 		 */
19613403Sbmc 		ASSERT3P(hdr->b_state, ==, arc_anon);
1962789Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1963789Sahrens 	}
1964789Sahrens 
1965789Sahrens 	/* execute each callback and free its structure */
1966789Sahrens 	while ((acb = callback_list) != NULL) {
1967789Sahrens 		if (acb->acb_done)
1968789Sahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1969789Sahrens 
1970789Sahrens 		if (acb->acb_zio_dummy != NULL) {
1971789Sahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
1972789Sahrens 			zio_nowait(acb->acb_zio_dummy);
1973789Sahrens 		}
1974789Sahrens 
1975789Sahrens 		callback_list = acb->acb_next;
1976789Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1977789Sahrens 	}
1978789Sahrens 
1979789Sahrens 	if (freeable)
19801544Seschrock 		arc_hdr_destroy(hdr);
1981789Sahrens }
1982789Sahrens 
1983789Sahrens /*
1984789Sahrens  * "Read" the block at the specified DVA (in bp) via the
1985789Sahrens  * cache.  If the block is found in the cache, invoke the provided
1986789Sahrens  * callback immediately and return.  Note that the `zio' parameter
1987789Sahrens  * in the callback will be NULL in this case, since no IO was
1988789Sahrens  * required.  If the block is not in the cache pass the read request
1989789Sahrens  * on to the spa with a substitute callback function, so that the
1990789Sahrens  * requested block will be added to the cache.
1991789Sahrens  *
1992789Sahrens  * If a read request arrives for a block that has a read in-progress,
1993789Sahrens  * either wait for the in-progress read to complete (and return the
1994789Sahrens  * results); or, if this is a read with a "done" func, add a record
1995789Sahrens  * to the read to invoke the "done" func when the read completes,
1996789Sahrens  * and return; or just return.
1997789Sahrens  *
1998789Sahrens  * arc_read_done() will invoke all the requested "done" functions
1999789Sahrens  * for readers of this block.
2000789Sahrens  */
2001789Sahrens int
2002789Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
2003789Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
20042391Smaybee     uint32_t *arc_flags, zbookmark_t *zb)
2005789Sahrens {
2006789Sahrens 	arc_buf_hdr_t *hdr;
2007789Sahrens 	arc_buf_t *buf;
2008789Sahrens 	kmutex_t *hash_lock;
2009789Sahrens 	zio_t	*rzio;
2010789Sahrens 
2011789Sahrens top:
2012789Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
20131544Seschrock 	if (hdr && hdr->b_datacnt > 0) {
2014789Sahrens 
20152391Smaybee 		*arc_flags |= ARC_CACHED;
20162391Smaybee 
2017789Sahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
20182391Smaybee 
20192391Smaybee 			if (*arc_flags & ARC_WAIT) {
20202391Smaybee 				cv_wait(&hdr->b_cv, hash_lock);
20212391Smaybee 				mutex_exit(hash_lock);
20222391Smaybee 				goto top;
20232391Smaybee 			}
20242391Smaybee 			ASSERT(*arc_flags & ARC_NOWAIT);
20252391Smaybee 
20262391Smaybee 			if (done) {
2027789Sahrens 				arc_callback_t	*acb = NULL;
2028789Sahrens 
2029789Sahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
2030789Sahrens 				    KM_SLEEP);
2031789Sahrens 				acb->acb_done = done;
2032789Sahrens 				acb->acb_private = private;
2033789Sahrens 				acb->acb_byteswap = swap;
2034789Sahrens 				if (pio != NULL)
2035789Sahrens 					acb->acb_zio_dummy = zio_null(pio,
2036789Sahrens 					    spa, NULL, NULL, flags);
2037789Sahrens 
2038789Sahrens 				ASSERT(acb->acb_done != NULL);
2039789Sahrens 				acb->acb_next = hdr->b_acb;
2040789Sahrens 				hdr->b_acb = acb;
2041789Sahrens 				add_reference(hdr, hash_lock, private);
2042789Sahrens 				mutex_exit(hash_lock);
2043789Sahrens 				return (0);
2044789Sahrens 			}
2045789Sahrens 			mutex_exit(hash_lock);
2046789Sahrens 			return (0);
2047789Sahrens 		}
2048789Sahrens 
20493403Sbmc 		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2050789Sahrens 
20511544Seschrock 		if (done) {
20522688Smaybee 			add_reference(hdr, hash_lock, private);
20531544Seschrock 			/*
20541544Seschrock 			 * If this block is already in use, create a new
20551544Seschrock 			 * copy of the data so that we will be guaranteed
20561544Seschrock 			 * that arc_release() will always succeed.
20571544Seschrock 			 */
20581544Seschrock 			buf = hdr->b_buf;
20591544Seschrock 			ASSERT(buf);
20601544Seschrock 			ASSERT(buf->b_data);
20612688Smaybee 			if (HDR_BUF_AVAILABLE(hdr)) {
20621544Seschrock 				ASSERT(buf->b_efunc == NULL);
20631544Seschrock 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
20642688Smaybee 			} else {
20652688Smaybee 				buf = arc_buf_clone(buf);
20661544Seschrock 			}
20672391Smaybee 		} else if (*arc_flags & ARC_PREFETCH &&
20682391Smaybee 		    refcount_count(&hdr->b_refcnt) == 0) {
20692391Smaybee 			hdr->b_flags |= ARC_PREFETCH;
2070789Sahrens 		}
2071789Sahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
20722688Smaybee 		arc_access(hdr, hash_lock);
20732688Smaybee 		mutex_exit(hash_lock);
20743403Sbmc 		ARCSTAT_BUMP(arcstat_hits);
20753403Sbmc 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
20763403Sbmc 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
20773403Sbmc 		    data, metadata, hits);
20783403Sbmc 
2079789Sahrens 		if (done)
2080789Sahrens 			done(NULL, buf, private);
2081789Sahrens 	} else {
2082789Sahrens 		uint64_t size = BP_GET_LSIZE(bp);
2083789Sahrens 		arc_callback_t	*acb;
2084789Sahrens 
2085789Sahrens 		if (hdr == NULL) {
2086789Sahrens 			/* this block is not in the cache */
2087789Sahrens 			arc_buf_hdr_t	*exists;
20883290Sjohansen 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
20893290Sjohansen 			buf = arc_buf_alloc(spa, size, private, type);
2090789Sahrens 			hdr = buf->b_hdr;
2091789Sahrens 			hdr->b_dva = *BP_IDENTITY(bp);
2092789Sahrens 			hdr->b_birth = bp->blk_birth;
2093789Sahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2094789Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
2095789Sahrens 			if (exists) {
2096789Sahrens 				/* somebody beat us to the hash insert */
2097789Sahrens 				mutex_exit(hash_lock);
2098789Sahrens 				bzero(&hdr->b_dva, sizeof (dva_t));
2099789Sahrens 				hdr->b_birth = 0;
2100789Sahrens 				hdr->b_cksum0 = 0;
21011544Seschrock 				(void) arc_buf_remove_ref(buf, private);
2102789Sahrens 				goto top; /* restart the IO request */
2103789Sahrens 			}
21042391Smaybee 			/* if this is a prefetch, we don't have a reference */
21052391Smaybee 			if (*arc_flags & ARC_PREFETCH) {
21062391Smaybee 				(void) remove_reference(hdr, hash_lock,
21072391Smaybee 				    private);
21082391Smaybee 				hdr->b_flags |= ARC_PREFETCH;
21092391Smaybee 			}
21102391Smaybee 			if (BP_GET_LEVEL(bp) > 0)
21112391Smaybee 				hdr->b_flags |= ARC_INDIRECT;
2112789Sahrens 		} else {
2113789Sahrens 			/* this block is in the ghost cache */
21141544Seschrock 			ASSERT(GHOST_STATE(hdr->b_state));
21151544Seschrock 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
21162391Smaybee 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
21172391Smaybee 			ASSERT(hdr->b_buf == NULL);
2118789Sahrens 
21192391Smaybee 			/* if this is a prefetch, we don't have a reference */
21202391Smaybee 			if (*arc_flags & ARC_PREFETCH)
21212391Smaybee 				hdr->b_flags |= ARC_PREFETCH;
21222391Smaybee 			else
21232391Smaybee 				add_reference(hdr, hash_lock, private);
2124789Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
21251544Seschrock 			buf->b_hdr = hdr;
21262688Smaybee 			buf->b_data = NULL;
21271544Seschrock 			buf->b_efunc = NULL;
21281544Seschrock 			buf->b_private = NULL;
21291544Seschrock 			buf->b_next = NULL;
21301544Seschrock 			hdr->b_buf = buf;
21312688Smaybee 			arc_get_data_buf(buf);
21321544Seschrock 			ASSERT(hdr->b_datacnt == 0);
21331544Seschrock 			hdr->b_datacnt = 1;
21342391Smaybee 
2135789Sahrens 		}
2136789Sahrens 
2137789Sahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2138789Sahrens 		acb->acb_done = done;
2139789Sahrens 		acb->acb_private = private;
2140789Sahrens 		acb->acb_byteswap = swap;
2141789Sahrens 
2142789Sahrens 		ASSERT(hdr->b_acb == NULL);
2143789Sahrens 		hdr->b_acb = acb;
2144789Sahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
2145789Sahrens 
2146789Sahrens 		/*
2147789Sahrens 		 * If the buffer has been evicted, migrate it to a present state
2148789Sahrens 		 * before issuing the I/O.  Once we drop the hash-table lock,
2149789Sahrens 		 * the header will be marked as I/O in progress and have an
2150789Sahrens 		 * attached buffer.  At this point, anybody who finds this
2151789Sahrens 		 * buffer ought to notice that it's legit but has a pending I/O.
2152789Sahrens 		 */
2153789Sahrens 
21541544Seschrock 		if (GHOST_STATE(hdr->b_state))
21552688Smaybee 			arc_access(hdr, hash_lock);
21562688Smaybee 		mutex_exit(hash_lock);
2157789Sahrens 
2158789Sahrens 		ASSERT3U(hdr->b_size, ==, size);
21591596Sahrens 		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
21601596Sahrens 		    zbookmark_t *, zb);
21613403Sbmc 		ARCSTAT_BUMP(arcstat_misses);
21623403Sbmc 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
21633403Sbmc 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
21643403Sbmc 		    data, metadata, misses);
21651544Seschrock 
2166789Sahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
21671544Seschrock 		    arc_read_done, buf, priority, flags, zb);
2168789Sahrens 
21692391Smaybee 		if (*arc_flags & ARC_WAIT)
2170789Sahrens 			return (zio_wait(rzio));
2171789Sahrens 
21722391Smaybee 		ASSERT(*arc_flags & ARC_NOWAIT);
2173789Sahrens 		zio_nowait(rzio);
2174789Sahrens 	}
2175789Sahrens 	return (0);
2176789Sahrens }
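
/*
 * A minimal sketch of a synchronous arc_read() call; the byteswap
 * function, priority, and flag values below are illustrative choices,
 * not taken from a real caller in this file:
 *
 *	arc_buf_t *buf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, byteswap_uint64_array,
 *	    arc_getbuf_func, &buf, ZIO_PRIORITY_SYNC_READ,
 *	    ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * On success arc_getbuf_func() has stored the buffer in 'buf', and
 * the caller eventually drops its hold with
 * arc_buf_remove_ref(buf, &buf).
 */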
2177789Sahrens 
2178789Sahrens /*
2179789Sahrens  * arc_read() variant to support pool traversal.  If the block is already
2180789Sahrens  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2181789Sahrens  * The idea is that we don't want pool traversal filling up memory, but
2182789Sahrens  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
2183789Sahrens  */
2184789Sahrens int
2185789Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
2186789Sahrens {
2187789Sahrens 	arc_buf_hdr_t *hdr;
2188789Sahrens 	kmutex_t *hash_mtx;
2189789Sahrens 	int rc = 0;
2190789Sahrens 
2191789Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
2192789Sahrens 
21931544Seschrock 	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
21941544Seschrock 		arc_buf_t *buf = hdr->b_buf;
21951544Seschrock 
21961544Seschrock 		ASSERT(buf);
21971544Seschrock 		while (buf->b_data == NULL) {
21981544Seschrock 			buf = buf->b_next;
21991544Seschrock 			ASSERT(buf);
22001544Seschrock 		}
22011544Seschrock 		bcopy(buf->b_data, data, hdr->b_size);
22021544Seschrock 	} else {
2203789Sahrens 		rc = ENOENT;
22041544Seschrock 	}
2205789Sahrens 
2206789Sahrens 	if (hash_mtx)
2207789Sahrens 		mutex_exit(hash_mtx);
2208789Sahrens 
2209789Sahrens 	return (rc);
2210789Sahrens }
2211789Sahrens 
22121544Seschrock void
22131544Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
22141544Seschrock {
22151544Seschrock 	ASSERT(buf->b_hdr != NULL);
22163403Sbmc 	ASSERT(buf->b_hdr->b_state != arc_anon);
22171544Seschrock 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
22181544Seschrock 	buf->b_efunc = func;
22191544Seschrock 	buf->b_private = private;
22201544Seschrock }
22211544Seschrock 
22221544Seschrock /*
22231544Seschrock  * This is used by the DMU to let the ARC know that a buffer is
22241544Seschrock  * being evicted, so the ARC should clean up.  If this arc buf
22251544Seschrock  * is not yet in the evicted state, it will be put there.
22261544Seschrock  */
22271544Seschrock int
22281544Seschrock arc_buf_evict(arc_buf_t *buf)
22291544Seschrock {
22302887Smaybee 	arc_buf_hdr_t *hdr;
22311544Seschrock 	kmutex_t *hash_lock;
22321544Seschrock 	arc_buf_t **bufp;
22331544Seschrock 
22342887Smaybee 	mutex_enter(&arc_eviction_mtx);
22352887Smaybee 	hdr = buf->b_hdr;
22361544Seschrock 	if (hdr == NULL) {
22371544Seschrock 		/*
22381544Seschrock 		 * We are in arc_do_user_evicts().
22391544Seschrock 		 */
22401544Seschrock 		ASSERT(buf->b_data == NULL);
22412887Smaybee 		mutex_exit(&arc_eviction_mtx);
22421544Seschrock 		return (0);
22431544Seschrock 	}
22442887Smaybee 	hash_lock = HDR_LOCK(hdr);
22452887Smaybee 	mutex_exit(&arc_eviction_mtx);
22461544Seschrock 
22471544Seschrock 	mutex_enter(hash_lock);
22481544Seschrock 
22492724Smaybee 	if (buf->b_data == NULL) {
22502724Smaybee 		/*
22512724Smaybee 		 * We are on the eviction list.
22522724Smaybee 		 */
22532724Smaybee 		mutex_exit(hash_lock);
22542724Smaybee 		mutex_enter(&arc_eviction_mtx);
22552724Smaybee 		if (buf->b_hdr == NULL) {
22562724Smaybee 			/*
22572724Smaybee 			 * We are already in arc_do_user_evicts().
22582724Smaybee 			 */
22592724Smaybee 			mutex_exit(&arc_eviction_mtx);
22602724Smaybee 			return (0);
22612724Smaybee 		} else {
22622724Smaybee 			arc_buf_t copy = *buf; /* structure assignment */
22632724Smaybee 			/*
22642724Smaybee 			 * Process this buffer now
22652724Smaybee 			 * but let arc_do_user_evicts() do the reaping.
22662724Smaybee 			 */
22672724Smaybee 			buf->b_efunc = NULL;
22682724Smaybee 			mutex_exit(&arc_eviction_mtx);
22692724Smaybee 			VERIFY(copy.b_efunc(&copy) == 0);
22702724Smaybee 			return (1);
22712724Smaybee 		}
22722724Smaybee 	}
22732724Smaybee 
22742724Smaybee 	ASSERT(buf->b_hdr == hdr);
22752724Smaybee 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
22763403Sbmc 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
22771544Seschrock 
22781544Seschrock 	/*
22791544Seschrock 	 * Pull this buffer off of the hdr
22801544Seschrock 	 */
22811544Seschrock 	bufp = &hdr->b_buf;
22821544Seschrock 	while (*bufp != buf)
22831544Seschrock 		bufp = &(*bufp)->b_next;
22841544Seschrock 	*bufp = buf->b_next;
22851544Seschrock 
22861544Seschrock 	ASSERT(buf->b_data != NULL);
22872688Smaybee 	arc_buf_destroy(buf, FALSE, FALSE);
22881544Seschrock 
22891544Seschrock 	if (hdr->b_datacnt == 0) {
22901544Seschrock 		arc_state_t *old_state = hdr->b_state;
22911544Seschrock 		arc_state_t *evicted_state;
22921544Seschrock 
22931544Seschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
22941544Seschrock 
22951544Seschrock 		evicted_state =
22963403Sbmc 		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
22971544Seschrock 
22983403Sbmc 		mutex_enter(&old_state->arcs_mtx);
22993403Sbmc 		mutex_enter(&evicted_state->arcs_mtx);
23001544Seschrock 
23011544Seschrock 		arc_change_state(evicted_state, hdr, hash_lock);
23021544Seschrock 		ASSERT(HDR_IN_HASH_TABLE(hdr));
23031544Seschrock 		hdr->b_flags = ARC_IN_HASH_TABLE;
23041544Seschrock 
23053403Sbmc 		mutex_exit(&evicted_state->arcs_mtx);
23063403Sbmc 		mutex_exit(&old_state->arcs_mtx);
23071544Seschrock 	}
23081544Seschrock 	mutex_exit(hash_lock);
23091819Smaybee 
23101544Seschrock 	VERIFY(buf->b_efunc(buf) == 0);
23111544Seschrock 	buf->b_efunc = NULL;
23121544Seschrock 	buf->b_private = NULL;
23131544Seschrock 	buf->b_hdr = NULL;
23141544Seschrock 	kmem_cache_free(buf_cache, buf);
23151544Seschrock 	return (1);
23161544Seschrock }
23171544Seschrock 
2318789Sahrens /*
2319789Sahrens  * Release this buffer from the cache.  This must be done
2320789Sahrens  * after a read and prior to modifying the buffer contents.
2321789Sahrens  * If the buffer has more than one reference, we must
2322789Sahrens  * make a new hdr for the buffer.
2323789Sahrens  */
2324789Sahrens void
2325789Sahrens arc_release(arc_buf_t *buf, void *tag)
2326789Sahrens {
2327789Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
2328789Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
2329789Sahrens 
2330789Sahrens 	/* this buffer is not on any list */
2331789Sahrens 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2332789Sahrens 
23333403Sbmc 	if (hdr->b_state == arc_anon) {
2334789Sahrens 		/* this buffer is already released */
2335789Sahrens 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2336789Sahrens 		ASSERT(BUF_EMPTY(hdr));
23371544Seschrock 		ASSERT(buf->b_efunc == NULL);
23383093Sahrens 		arc_buf_thaw(buf);
2339789Sahrens 		return;
2340789Sahrens 	}
2341789Sahrens 
2342789Sahrens 	mutex_enter(hash_lock);
2343789Sahrens 
23441544Seschrock 	/*
23451544Seschrock 	 * Do we have more than one buf?
23461544Seschrock 	 */
23471544Seschrock 	if (hdr->b_buf != buf || buf->b_next != NULL) {
2348789Sahrens 		arc_buf_hdr_t *nhdr;
2349789Sahrens 		arc_buf_t **bufp;
2350789Sahrens 		uint64_t blksz = hdr->b_size;
2351789Sahrens 		spa_t *spa = hdr->b_spa;
23523290Sjohansen 		arc_buf_contents_t type = hdr->b_type;
2353789Sahrens 
23541544Seschrock 		ASSERT(hdr->b_datacnt > 1);
2355789Sahrens 		/*
2356789Sahrens 		 * Pull the data off of this buf and attach it to
2357789Sahrens 		 * a new anonymous buf.
2358789Sahrens 		 */
23591544Seschrock 		(void) remove_reference(hdr, hash_lock, tag);
2360789Sahrens 		bufp = &hdr->b_buf;
23611544Seschrock 		while (*bufp != buf)
2362789Sahrens 			bufp = &(*bufp)->b_next;
2363789Sahrens 		*bufp = (*bufp)->b_next;
23641544Seschrock 
23653403Sbmc 		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
23663403Sbmc 		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
23671544Seschrock 		if (refcount_is_zero(&hdr->b_refcnt)) {
23683403Sbmc 			ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
23693403Sbmc 			atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
23701544Seschrock 		}
23711544Seschrock 		hdr->b_datacnt -= 1;
23723547Smaybee 		arc_cksum_verify(buf);
23731544Seschrock 
2374789Sahrens 		mutex_exit(hash_lock);
2375789Sahrens 
2376789Sahrens 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
2377789Sahrens 		nhdr->b_size = blksz;
2378789Sahrens 		nhdr->b_spa = spa;
23793290Sjohansen 		nhdr->b_type = type;
2380789Sahrens 		nhdr->b_buf = buf;
23813403Sbmc 		nhdr->b_state = arc_anon;
2382789Sahrens 		nhdr->b_arc_access = 0;
2383789Sahrens 		nhdr->b_flags = 0;
23841544Seschrock 		nhdr->b_datacnt = 1;
23853547Smaybee 		nhdr->b_freeze_cksum = NULL;
2386789Sahrens 		buf->b_hdr = nhdr;
2387789Sahrens 		buf->b_next = NULL;
2388789Sahrens 		(void) refcount_add(&nhdr->b_refcnt, tag);
23893403Sbmc 		atomic_add_64(&arc_anon->arcs_size, blksz);
2390789Sahrens 
2391789Sahrens 		hdr = nhdr;
2392789Sahrens 	} else {
23931544Seschrock 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2394789Sahrens 		ASSERT(!list_link_active(&hdr->b_arc_node));
2395789Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
23963403Sbmc 		arc_change_state(arc_anon, hdr, hash_lock);
2397789Sahrens 		hdr->b_arc_access = 0;
2398789Sahrens 		mutex_exit(hash_lock);
2399789Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
2400789Sahrens 		hdr->b_birth = 0;
2401789Sahrens 		hdr->b_cksum0 = 0;
24023547Smaybee 		arc_buf_thaw(buf);
2403789Sahrens 	}
24041544Seschrock 	buf->b_efunc = NULL;
24051544Seschrock 	buf->b_private = NULL;
2406789Sahrens }
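
/*
 * After arc_release() the buffer is anonymous and safe to modify: if
 * the header was shared, the released buf is moved to a freshly
 * allocated anonymous header while the remaining bufs keep the old
 * one; otherwise the existing header is moved to arc_anon and
 * stripped of its identity (dva/birth/cksum0).
 */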
2407789Sahrens 
2408789Sahrens int
2409789Sahrens arc_released(arc_buf_t *buf)
2410789Sahrens {
24113403Sbmc 	return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
24121544Seschrock }
24131544Seschrock 
24141544Seschrock int
24151544Seschrock arc_has_callback(arc_buf_t *buf)
24161544Seschrock {
24171544Seschrock 	return (buf->b_efunc != NULL);
2418789Sahrens }
2419789Sahrens 
24201544Seschrock #ifdef ZFS_DEBUG
24211544Seschrock int
24221544Seschrock arc_referenced(arc_buf_t *buf)
24231544Seschrock {
24241544Seschrock 	return (refcount_count(&buf->b_hdr->b_refcnt));
24251544Seschrock }
24261544Seschrock #endif
24271544Seschrock 
2428789Sahrens static void
24293547Smaybee arc_write_ready(zio_t *zio)
24303547Smaybee {
24313547Smaybee 	arc_write_callback_t *callback = zio->io_private;
24323547Smaybee 	arc_buf_t *buf = callback->awcb_buf;
24333547Smaybee 
24343547Smaybee 	if (callback->awcb_ready) {
24353547Smaybee 		ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
24363547Smaybee 		callback->awcb_ready(zio, buf, callback->awcb_private);
24373547Smaybee 	}
24383547Smaybee 	arc_cksum_compute(buf);
24393547Smaybee }
24403547Smaybee 
24413547Smaybee static void
2442789Sahrens arc_write_done(zio_t *zio)
2443789Sahrens {
24443547Smaybee 	arc_write_callback_t *callback = zio->io_private;
24453547Smaybee 	arc_buf_t *buf = callback->awcb_buf;
24463547Smaybee 	arc_buf_hdr_t *hdr = buf->b_hdr;
2447789Sahrens 
2448789Sahrens 	hdr->b_acb = NULL;
2449789Sahrens 
2450789Sahrens 	/* this buffer is on no lists and is not in the hash table */
24513403Sbmc 	ASSERT3P(hdr->b_state, ==, arc_anon);
2452789Sahrens 
2453789Sahrens 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
2454789Sahrens 	hdr->b_birth = zio->io_bp->blk_birth;
2455789Sahrens 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
24561544Seschrock 	/*
24571544Seschrock 	 * If the block to be written was all-zero, we may have
24581544Seschrock 	 * compressed it away.  In this case no write was performed
24591544Seschrock 	 * so there will be no dva/birth-date/checksum.  The buffer
24601544Seschrock 	 * must therefore remain anonymous (and uncached).
24611544Seschrock 	 */
2462789Sahrens 	if (!BUF_EMPTY(hdr)) {
2463789Sahrens 		arc_buf_hdr_t *exists;
2464789Sahrens 		kmutex_t *hash_lock;
2465789Sahrens 
24663093Sahrens 		arc_cksum_verify(buf);
24673093Sahrens 
2468789Sahrens 		exists = buf_hash_insert(hdr, &hash_lock);
2469789Sahrens 		if (exists) {
2470789Sahrens 			/*
2471789Sahrens 			 * This can only happen if we overwrite for
2472789Sahrens 			 * sync-to-convergence, because we remove
2473789Sahrens 			 * buffers from the hash table when we arc_free().
2474789Sahrens 			 */
2475789Sahrens 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
2476789Sahrens 			    BP_IDENTITY(zio->io_bp)));
2477789Sahrens 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
2478789Sahrens 			    zio->io_bp->blk_birth);
2479789Sahrens 
2480789Sahrens 			ASSERT(refcount_is_zero(&exists->b_refcnt));
24813403Sbmc 			arc_change_state(arc_anon, exists, hash_lock);
2482789Sahrens 			mutex_exit(hash_lock);
24831544Seschrock 			arc_hdr_destroy(exists);
2484789Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
2485789Sahrens 			ASSERT3P(exists, ==, NULL);
2486789Sahrens 		}
24871544Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
24882688Smaybee 		arc_access(hdr, hash_lock);
24892688Smaybee 		mutex_exit(hash_lock);
24903547Smaybee 	} else if (callback->awcb_done == NULL) {
24911544Seschrock 		int destroy_hdr;
24921544Seschrock 		/*
24931544Seschrock 		 * This is an anonymous buffer with no user callback,
24941544Seschrock 		 * destroy it if there are no active references.
24951544Seschrock 		 */
24961544Seschrock 		mutex_enter(&arc_eviction_mtx);
24971544Seschrock 		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
24981544Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
24991544Seschrock 		mutex_exit(&arc_eviction_mtx);
25001544Seschrock 		if (destroy_hdr)
25011544Seschrock 			arc_hdr_destroy(hdr);
25021544Seschrock 	} else {
25031544Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2504789Sahrens 	}
25051544Seschrock 
25063547Smaybee 	if (callback->awcb_done) {
2507789Sahrens 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
25083547Smaybee 		callback->awcb_done(zio, buf, callback->awcb_private);
2509789Sahrens 	}
2510789Sahrens 
25113547Smaybee 	kmem_free(callback, sizeof (arc_write_callback_t));
2512789Sahrens }
2513789Sahrens 
25143547Smaybee zio_t *
25151775Sbillm arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
2516789Sahrens     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
25173547Smaybee     arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
25183547Smaybee     int flags, zbookmark_t *zb)
2519789Sahrens {
2520789Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
25213547Smaybee 	arc_write_callback_t *callback;
25223547Smaybee 	zio_t	*zio;
2523789Sahrens 
2524789Sahrens 	/* this is a private buffer - no locking required */
25253403Sbmc 	ASSERT3P(hdr->b_state, ==, arc_anon);
2526789Sahrens 	ASSERT(BUF_EMPTY(hdr));
2527789Sahrens 	ASSERT(!HDR_IO_ERROR(hdr));
25282237Smaybee 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
25292237Smaybee 	ASSERT(hdr->b_acb == 0);
25303547Smaybee 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
25313547Smaybee 	callback->awcb_ready = ready;
25323547Smaybee 	callback->awcb_done = done;
25333547Smaybee 	callback->awcb_private = private;
25343547Smaybee 	callback->awcb_buf = buf;
25351544Seschrock 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
25363547Smaybee 	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
25373547Smaybee 	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
25383547Smaybee 	    priority, flags, zb);
2539789Sahrens 
25403547Smaybee 	return (zio);
2541789Sahrens }
2542789Sahrens 
2543789Sahrens int
2544789Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
2545789Sahrens     zio_done_func_t *done, void *private, uint32_t arc_flags)
2546789Sahrens {
2547789Sahrens 	arc_buf_hdr_t *ab;
2548789Sahrens 	kmutex_t *hash_lock;
2549789Sahrens 	zio_t	*zio;
2550789Sahrens 
2551789Sahrens 	/*
2552789Sahrens 	 * If this buffer is in the cache, release it, so it
2553789Sahrens 	 * can be re-used.
2554789Sahrens 	 */
2555789Sahrens 	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2556789Sahrens 	if (ab != NULL) {
2557789Sahrens 		/*
2558789Sahrens 		 * The checksum of blocks to free is not always
2559789Sahrens 		 * preserved (e.g. on the deadlist).  However, if it is
2560789Sahrens 		 * nonzero, it should match what we have in the cache.
2561789Sahrens 		 */
2562789Sahrens 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
2563789Sahrens 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
25643403Sbmc 		if (ab->b_state != arc_anon)
25653403Sbmc 			arc_change_state(arc_anon, ab, hash_lock);
25662391Smaybee 		if (HDR_IO_IN_PROGRESS(ab)) {
25672391Smaybee 			/*
25682391Smaybee 			 * This should only happen when we prefetch.
25692391Smaybee 			 */
25702391Smaybee 			ASSERT(ab->b_flags & ARC_PREFETCH);
25712391Smaybee 			ASSERT3U(ab->b_datacnt, ==, 1);
25722391Smaybee 			ab->b_flags |= ARC_FREED_IN_READ;
25732391Smaybee 			if (HDR_IN_HASH_TABLE(ab))
25742391Smaybee 				buf_hash_remove(ab);
25752391Smaybee 			ab->b_arc_access = 0;
25762391Smaybee 			bzero(&ab->b_dva, sizeof (dva_t));
25772391Smaybee 			ab->b_birth = 0;
25782391Smaybee 			ab->b_cksum0 = 0;
25792391Smaybee 			ab->b_buf->b_efunc = NULL;
25802391Smaybee 			ab->b_buf->b_private = NULL;
25812391Smaybee 			mutex_exit(hash_lock);
25822391Smaybee 		} else if (refcount_is_zero(&ab->b_refcnt)) {
2583789Sahrens 			mutex_exit(hash_lock);
25841544Seschrock 			arc_hdr_destroy(ab);
25853403Sbmc 			ARCSTAT_BUMP(arcstat_deleted);
2586789Sahrens 		} else {
25871589Smaybee 			/*
25882391Smaybee 			 * We still have an active reference on this
25892391Smaybee 			 * buffer.  This can happen, e.g., from
25902391Smaybee 			 * dbuf_unoverride().
25911589Smaybee 			 */
25922391Smaybee 			ASSERT(!HDR_IN_HASH_TABLE(ab));
2593789Sahrens 			ab->b_arc_access = 0;
2594789Sahrens 			bzero(&ab->b_dva, sizeof (dva_t));
2595789Sahrens 			ab->b_birth = 0;
2596789Sahrens 			ab->b_cksum0 = 0;
25971544Seschrock 			ab->b_buf->b_efunc = NULL;
25981544Seschrock 			ab->b_buf->b_private = NULL;
2599789Sahrens 			mutex_exit(hash_lock);
2600789Sahrens 		}
2601789Sahrens 	}
2602789Sahrens 
2603789Sahrens 	zio = zio_free(pio, spa, txg, bp, done, private);
2604789Sahrens 
2605789Sahrens 	if (arc_flags & ARC_WAIT)
2606789Sahrens 		return (zio_wait(zio));
2607789Sahrens 
2608789Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
2609789Sahrens 	zio_nowait(zio);
2610789Sahrens 
2611789Sahrens 	return (0);
2612789Sahrens }
2613789Sahrens 
2614789Sahrens void
2615789Sahrens arc_tempreserve_clear(uint64_t tempreserve)
2616789Sahrens {
2617789Sahrens 	atomic_add_64(&arc_tempreserve, -tempreserve);
2618789Sahrens 	ASSERT((int64_t)arc_tempreserve >= 0);
2619789Sahrens }
2620789Sahrens 
2621789Sahrens int
2622789Sahrens arc_tempreserve_space(uint64_t tempreserve)
2623789Sahrens {
2624789Sahrens #ifdef ZFS_DEBUG
2625789Sahrens 	/*
2626789Sahrens 	 * Once in a while, fail for no reason.  Everything should cope.
2627789Sahrens 	 */
2628789Sahrens 	if (spa_get_random(10000) == 0) {
2629789Sahrens 		dprintf("forcing random failure\n");
2630789Sahrens 		return (ERESTART);
2631789Sahrens 	}
2632789Sahrens #endif
26333403Sbmc 	if (tempreserve > arc_c/4 && !arc_no_grow)
26343403Sbmc 		arc_c = MIN(arc_c_max, tempreserve * 4);
26353403Sbmc 	if (tempreserve > arc_c)
2636982Smaybee 		return (ENOMEM);
2637982Smaybee 
2638789Sahrens 	/*
2639982Smaybee 	 * Throttle writes when the amount of dirty data in the cache
2640982Smaybee 	 * gets too large.  We try to keep the cache less than half full
2641982Smaybee 	 * of dirty blocks so that our sync times don't grow too large.
2642982Smaybee 	 * Note: if two requests come in concurrently, we might let them
2643982Smaybee 	 * both succeed, when one of them should fail.  Not a huge deal.
2644982Smaybee 	 *
2645982Smaybee 	 * XXX The limit should be adjusted dynamically to keep the time
2646982Smaybee 	 * to sync a dataset fixed (around 1-5 seconds?).
2647789Sahrens 	 */
2648789Sahrens 
26493403Sbmc 	if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
26503403Sbmc 	    arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
2651789Sahrens 		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
26523403Sbmc 		    "tempreserve=%lluK arc_c=%lluK\n",
26533403Sbmc 		    arc_tempreserve>>10, arc_anon->arcs_size>>10,
26543403Sbmc 		    tempreserve>>10, arc_c>>10);
2655789Sahrens 		return (ERESTART);
2656789Sahrens 	}
2657789Sahrens 	atomic_add_64(&arc_tempreserve, tempreserve);
2658789Sahrens 	return (0);
2659789Sahrens }
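
/*
 * Illustrative sketch only (hypothetical caller): the two functions
 * above are intended to bracket the dirtying of data.  A caller
 * reserves space before dirtying and clears the reservation once the
 * data has been handed off; ERESTART from the throttle above means
 * "back off and retry".
 */
static int
arc_tempreserve_example(uint64_t nbytes)
{
	int error;

	while ((error = arc_tempreserve_space(nbytes)) == ERESTART)
		delay(1);		/* throttled; wait a tick and retry */
	if (error != 0)
		return (error);		/* ENOMEM: reservation too large */

	/* ... dirty up to nbytes of data here ... */

	arc_tempreserve_clear(nbytes);	/* release the reservation */
	return (0);
}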
2660789Sahrens 
2661789Sahrens void
2662789Sahrens arc_init(void)
2663789Sahrens {
2664789Sahrens 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
2665789Sahrens 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
2666789Sahrens 
26672391Smaybee 	/* Convert seconds to clock ticks */
26682638Sperrin 	arc_min_prefetch_lifespan = 1 * hz;
26692391Smaybee 
2670789Sahrens 	/* Start out with 1/8 of all memory */
26713403Sbmc 	arc_c = physmem * PAGESIZE / 8;
2672789Sahrens 
2673789Sahrens #ifdef _KERNEL
2674789Sahrens 	/*
2675789Sahrens 	 * On architectures where the physical memory can be larger
2676789Sahrens 	 * than the addressable space (Intel in 32-bit mode), we may
2677789Sahrens 	 * need to limit the cache to 1/8 of VM size.
2678789Sahrens 	 */
26793403Sbmc 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
2680789Sahrens #endif
2681789Sahrens 
2682982Smaybee 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
26833403Sbmc 	arc_c_min = MAX(arc_c / 4, 64<<20);
2684982Smaybee 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
26853403Sbmc 	if (arc_c * 8 >= 1<<30)
26863403Sbmc 		arc_c_max = (arc_c * 8) - (1<<30);
2687789Sahrens 	else
26883403Sbmc 		arc_c_max = arc_c_min;
26893403Sbmc 	arc_c_max = MAX(arc_c * 6, arc_c_max);
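
	/*
	 * Worked example (illustrative, assuming a 64-bit kernel with
	 * 4GB of physical memory): arc_c starts at 512MB, so
	 * arc_c_min = MAX(128MB, 64MB) = 128MB and
	 * arc_c_max = MAX(6 * 512MB, 4GB - 1GB) = 3GB.
	 */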
26902885Sahrens 
26912885Sahrens 	/*
26922885Sahrens 	 * Allow the tunables to override our calculations if they are
26932885Sahrens 	 * reasonable (i.e., over 64MB)
26942885Sahrens 	 */
26952885Sahrens 	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
26963403Sbmc 		arc_c_max = zfs_arc_max;
26973403Sbmc 	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
26983403Sbmc 		arc_c_min = zfs_arc_min;
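
	/*
	 * For example, an administrator could cap the cache at 1GB by
	 * adding "set zfs:zfs_arc_max = 0x40000000" to /etc/system;
	 * per the checks above, values of 64MB or less, or at or above
	 * physical memory, are ignored.
	 */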
26992885Sahrens 
27003403Sbmc 	arc_c = arc_c_max;
27013403Sbmc 	arc_p = (arc_c >> 1);
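
	/*
	 * Note: arc_p is the adaptive target for the MRU portion of
	 * the cache; starting it at half of arc_c splits the cache
	 * evenly between the MRU and MFU sides until the workload
	 * adapts it.
	 */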
2702789Sahrens 
2703789Sahrens 	/* if kmem_flags are set, let's try to use less memory */
2704789Sahrens 	if (kmem_debugging())
27053403Sbmc 		arc_c = arc_c / 2;
27063403Sbmc 	if (arc_c < arc_c_min)
27073403Sbmc 		arc_c = arc_c_min;
2708789Sahrens 
27093403Sbmc 	arc_anon = &ARC_anon;
27103403Sbmc 	arc_mru = &ARC_mru;
27113403Sbmc 	arc_mru_ghost = &ARC_mru_ghost;
27123403Sbmc 	arc_mfu = &ARC_mfu;
27133403Sbmc 	arc_mfu_ghost = &ARC_mfu_ghost;
27143403Sbmc 	arc_size = 0;
2715789Sahrens 
27163403Sbmc 	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
27173403Sbmc 	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
27183403Sbmc 	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
27193403Sbmc 	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
27203403Sbmc 	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
27212688Smaybee 
27223403Sbmc 	list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
27233403Sbmc 	    offsetof(arc_buf_hdr_t, b_arc_node));
27243403Sbmc 	list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
2725789Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
27263403Sbmc 	list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
2727789Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
27283403Sbmc 	list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
2729789Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
2730789Sahrens 
2731789Sahrens 	buf_init();
2732789Sahrens 
2733789Sahrens 	arc_thread_exit = 0;
27341544Seschrock 	arc_eviction_list = NULL;
27351544Seschrock 	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
27362887Smaybee 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
2737789Sahrens 
27383403Sbmc 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
27393403Sbmc 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
27403403Sbmc 
27413403Sbmc 	if (arc_ksp != NULL) {
27423403Sbmc 		arc_ksp->ks_data = &arc_stats;
27433403Sbmc 		kstat_install(arc_ksp);
27443403Sbmc 	}
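	/*
	 * The statistics installed above are visible from userland,
	 * e.g. via "kstat -m zfs -n arcstats".
	 */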
27453403Sbmc 
2746789Sahrens 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
2747789Sahrens 	    TS_RUN, minclsyspri);
27483158Smaybee 
27493158Smaybee 	arc_dead = FALSE;
27503552Sjohansen 
27513552Sjohansen #ifdef _KERNEL
27523552Sjohansen 	if (zio_arena != NULL)
27533552Sjohansen 		arc_ziosize =
27543552Sjohansen 		    btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
27553552Sjohansen #endif /* _KERNEL */
2756789Sahrens }
2757789Sahrens 
2758789Sahrens void
2759789Sahrens arc_fini(void)
2760789Sahrens {
2761789Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
2762789Sahrens 	arc_thread_exit = 1;
2763789Sahrens 	while (arc_thread_exit != 0)
2764789Sahrens 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
2765789Sahrens 	mutex_exit(&arc_reclaim_thr_lock);
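
	/*
	 * Note: the reclaim thread clears arc_thread_exit and signals
	 * the cv just before exiting, so once the loop above returns
	 * the thread is gone and it is safe to tear down shared state.
	 */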
2766789Sahrens 
2767789Sahrens 	arc_flush();
2768789Sahrens 
2769789Sahrens 	arc_dead = TRUE;
2770789Sahrens 
27713403Sbmc 	if (arc_ksp != NULL) {
27723403Sbmc 		kstat_delete(arc_ksp);
27733403Sbmc 		arc_ksp = NULL;
27743403Sbmc 	}
27753403Sbmc 
27761544Seschrock 	mutex_destroy(&arc_eviction_mtx);
2777789Sahrens 	mutex_destroy(&arc_reclaim_thr_lock);
2778789Sahrens 	cv_destroy(&arc_reclaim_thr_cv);
2779789Sahrens 
27803403Sbmc 	list_destroy(&arc_mru->arcs_list);
27813403Sbmc 	list_destroy(&arc_mru_ghost->arcs_list);
27823403Sbmc 	list_destroy(&arc_mfu->arcs_list);
27833403Sbmc 	list_destroy(&arc_mfu_ghost->arcs_list);
2784789Sahrens 
27853403Sbmc 	mutex_destroy(&arc_anon->arcs_mtx);
27863403Sbmc 	mutex_destroy(&arc_mru->arcs_mtx);
27873403Sbmc 	mutex_destroy(&arc_mru_ghost->arcs_mtx);
27883403Sbmc 	mutex_destroy(&arc_mfu->arcs_mtx);
27893403Sbmc 	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
27902856Snd150628 
2791789Sahrens 	buf_fini();
2792789Sahrens }
2793