xref: /onnv-gate/usr/src/uts/common/fs/zfs/arc.c (revision 789)
1*789Sahrens /*
2*789Sahrens  * CDDL HEADER START
3*789Sahrens  *
4*789Sahrens  * The contents of this file are subject to the terms of the
5*789Sahrens  * Common Development and Distribution License, Version 1.0 only
6*789Sahrens  * (the "License").  You may not use this file except in compliance
7*789Sahrens  * with the License.
8*789Sahrens  *
9*789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*789Sahrens  * or http://www.opensolaris.org/os/licensing.
11*789Sahrens  * See the License for the specific language governing permissions
12*789Sahrens  * and limitations under the License.
13*789Sahrens  *
14*789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*789Sahrens  *
20*789Sahrens  * CDDL HEADER END
21*789Sahrens  */
22*789Sahrens /*
23*789Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*789Sahrens  * Use is subject to license terms.
25*789Sahrens  */
26*789Sahrens 
27*789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*789Sahrens 
29*789Sahrens /*
30*789Sahrens  * DVA-based Adjustable Replacement Cache
31*789Sahrens  *
32*789Sahrens  * While much of the theory of operation and algorithms used here
33*789Sahrens  * are based on the self-tuning, low overhead replacement cache
34*789Sahrens  * presented by Megiddo and Modha at FAST 2003, there are some
35*789Sahrens  * significant differences:
36*789Sahrens  *
37*789Sahrens  * 1. The Megiddo and Modha model assumes any page is evictable.
38*789Sahrens  * Pages in its cache cannot be "locked" into memory.  This makes
39*789Sahrens  * the eviction algorithm simple: evict the last page in the list.
40*789Sahrens  * This also makes the performance characteristics easy to reason
41*789Sahrens  * about.  Our cache is not so simple.  At any given moment, some
42*789Sahrens  * subset of the blocks in the cache are un-evictable because we
43*789Sahrens  * have handed out a reference to them.  Blocks are only evictable
44*789Sahrens  * when there are no external references active.  This makes
45*789Sahrens  * eviction far more problematic:  we choose to evict the evictable
46*789Sahrens  * blocks that are the "lowest" in the list.
47*789Sahrens  *
48*789Sahrens  * There are times when it is not possible to evict the requested
49*789Sahrens  * space.  In these circumstances we are unable to adjust the cache
50*789Sahrens  * size.  To prevent the cache growing unbounded at these times we
51*789Sahrens  * implement a "cache throttle" that slows the flow of new data
52*789Sahrens  * into the cache until we can make space available.
53*789Sahrens  *
54*789Sahrens  * 2. The Megiddo and Modha model assumes a fixed cache size.
55*789Sahrens  * Pages are evicted when the cache is full and there is a cache
56*789Sahrens  * miss.  Our model has a variable sized cache.  It grows with
57*789Sahrens  * high use, but also tries to react to memory pressure from the
58*789Sahrens  * operating system: decreasing its size when system memory is
59*789Sahrens  * tight.
60*789Sahrens  *
61*789Sahrens  * 3. The Megiddo and Modha model assumes a fixed page size. All
62*789Sahrens  * elements of the cache are therefore exactly the same size.  So
63*789Sahrens  * when adjusting the cache size following a cache miss, it's simply
64*789Sahrens  * a matter of choosing a single page to evict.  In our model, we
65*789Sahrens  * have variable-sized cache blocks (ranging from 512 bytes to
66*789Sahrens  * 128K bytes).  We therefore choose a set of blocks to evict to make
67*789Sahrens  * space for a cache miss that approximates as closely as possible
68*789Sahrens  * the space used by the new block.
69*789Sahrens  *
70*789Sahrens  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71*789Sahrens  * by N. Megiddo & D. Modha, FAST 2003
72*789Sahrens  */
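/*
 * Illustrative sketch (not compiled): difference 3 above boils down to
 * walking an eviction list from its coldest end and accumulating
 * evictable blocks until the requested byte count is covered, rather
 * than evicting a single fixed-size page.  This is only a simplified
 * model of what arc_evict_state() does later in this file; block_t and
 * evict_bytes() are hypothetical names, not part of the ARC.
 */
#if 0
typedef struct block {
	struct block	*prev;		/* toward the hotter end of the list */
	uint64_t	size;		/* anywhere from 512 bytes to 128K */
	int		evictable;	/* no external references held */
} block_t;

static uint64_t
evict_bytes(block_t *tail, uint64_t wanted)
{
	uint64_t evicted = 0;
	block_t *b;

	/* Start at the coldest block and skip anything still referenced. */
	for (b = tail; b != NULL && evicted < wanted; b = b->prev) {
		if (!b->evictable)
			continue;
		evicted += b->size;	/* a real implementation frees b here */
	}
	return (evicted);		/* may fall short of 'wanted' */
}
#endif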
73*789Sahrens 
74*789Sahrens /*
75*789Sahrens  * The locking model:
76*789Sahrens  *
77*789Sahrens  * A new reference to a cache buffer can be obtained in two
78*789Sahrens  * ways: 1) via a hash table lookup using the DVA as a key,
79*789Sahrens  * or 2) via one of the ARC lists.  The arc_read() interface
80*789Sahrens  * uses method 1, while the internal arc algorithms for
81*789Sahrens  * adjusting the cache use method 2.  We therefore provide two
82*789Sahrens  * types of locks: 1) the hash table lock array, and 2) the
83*789Sahrens  * arc list locks.
84*789Sahrens  *
85*789Sahrens  * Buffers do not have their own mutexes; rather, they rely on the
86*789Sahrens  * hash table mutexes for the bulk of their protection (i.e. most
87*789Sahrens  * fields in the arc_buf_hdr_t are protected by these mutexes).
88*789Sahrens  *
89*789Sahrens  * buf_hash_find() returns the appropriate mutex (held) when it
90*789Sahrens  * locates the requested buffer in the hash table.  It returns
91*789Sahrens  * NULL for the mutex if the buffer was not in the table.
92*789Sahrens  *
93*789Sahrens  * buf_hash_remove() expects the appropriate hash mutex to be
94*789Sahrens  * already held before it is invoked.
95*789Sahrens  *
96*789Sahrens  * Each arc state also has a mutex which is used to protect the
97*789Sahrens  * buffer list associated with the state.  When attempting to
98*789Sahrens  * obtain a hash table lock while holding an arc list lock, you
99*789Sahrens  * must use mutex_tryenter() to avoid deadlock.  Also note that
100*789Sahrens  * the "top" state mutex must be held before the "bot" state mutex.
101*789Sahrens  *
102*789Sahrens  * Note that the majority of the performance stats are manipulated
103*789Sahrens  * with atomic operations.
104*789Sahrens  */
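/*
 * Illustrative sketch (not compiled): because of the rule above, code
 * that already holds an arc list (state) mutex may only *try* to take a
 * hash table mutex, and must back off rather than block if the attempt
 * fails.  This mirrors the pattern used by arc_evict_state() and
 * arc_delete_state() below; the loop body is a simplified stand-in for
 * the real eviction work, and the types and macros it names are defined
 * later in this file or in the included headers.
 */
#if 0
	mutex_enter(&state->mtx);		/* arc list lock, taken first */
	for (ab = list_tail(&state->list); ab != NULL; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock))	/* never mutex_enter() here */
			continue;		/* skip this buffer, no deadlock */
		/* ... operate on ab with both locks held ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&state->mtx);
#endif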
105*789Sahrens 
106*789Sahrens #include <sys/spa.h>
107*789Sahrens #include <sys/zio.h>
108*789Sahrens #include <sys/zfs_context.h>
109*789Sahrens #include <sys/arc.h>
110*789Sahrens #include <sys/refcount.h>
111*789Sahrens #ifdef _KERNEL
112*789Sahrens #include <sys/vmsystm.h>
113*789Sahrens #include <vm/anon.h>
114*789Sahrens #include <sys/fs/swapnode.h>
115*789Sahrens #endif
116*789Sahrens #include <sys/callb.h>
117*789Sahrens 
118*789Sahrens static kmutex_t		arc_reclaim_thr_lock;
119*789Sahrens static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
120*789Sahrens static uint8_t		arc_thread_exit;
121*789Sahrens 
122*789Sahrens typedef enum arc_reclaim_strategy {
123*789Sahrens 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
124*789Sahrens 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
125*789Sahrens } arc_reclaim_strategy_t;
126*789Sahrens 
127*789Sahrens /* number of seconds before growing cache again */
128*789Sahrens static int		arc_grow_retry = 60;
129*789Sahrens 
130*789Sahrens static kmutex_t arc_reclaim_lock;
131*789Sahrens static int arc_dead;
132*789Sahrens 
133*789Sahrens /*
134*789Sahrens  * Note that buffers can be in one of 5 states:
135*789Sahrens  *	ARC_anon	- anonymous (discussed below)
136*789Sahrens  *	ARC_mru_top	- recently used, currently cached
137*789Sahrens  *	ARC_mru_bot	- recently used, no longer in cache
138*789Sahrens  *	ARC_mfu_top	- frequently used, currently cached
139*789Sahrens  *	ARC_mfu_bot	- frequently used, no longer in cache
140*789Sahrens  * When there are no active references to a buffer, it is
141*789Sahrens  * linked onto one of the lists in arc.  These are the
142*789Sahrens  * only buffers that can be evicted or deleted.
143*789Sahrens  *
144*789Sahrens  * Anonymous buffers are buffers that are not associated with
145*789Sahrens  * a DVA.  These are buffers that hold dirty block copies
146*789Sahrens  * before they are written to stable storage.  By definition,
147*789Sahrens  * they are "ref'd" and are considered part of arc_mru_top
148*789Sahrens  * that cannot be freed.  Generally, they will acquire a DVA
149*789Sahrens  * as they are written and migrate onto the arc_mru_top list.
150*789Sahrens  */
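/*
 * Illustrative sketch (not compiled): the life of an anonymous buffer as
 * described above, seen through the public interface defined later in
 * this file.  The 'spa' and 'tag' arguments and the write step are
 * assumed caller context (e.g. the DMU), not something this file does.
 */
#if 0
	arc_buf_t *buf;

	/* No DVA yet: the buffer sits in ARC_anon, holding one reference. */
	buf = arc_buf_alloc(spa, 4096, tag);

	/* ... the caller fills buf->b_data and writes it out; on write    */
	/* the buffer acquires a DVA and migrates onto ARC_mru_top ...     */

	/* Dropping the last reference makes the buffer evictable (it goes */
	/* back on its state's list), or frees it outright if it is still  */
	/* anonymous.                                                      */
	arc_buf_free(buf, tag);
#endif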
151*789Sahrens 
152*789Sahrens typedef struct arc_state {
153*789Sahrens 	list_t	list;	/* linked list of evictable buffers in state */
154*789Sahrens 	uint64_t lsize;	/* total size of buffers in the linked list */
155*789Sahrens 	uint64_t size;	/* total size of all buffers in this state */
156*789Sahrens 	uint64_t hits;
157*789Sahrens 	kmutex_t mtx;
158*789Sahrens } arc_state_t;
159*789Sahrens 
160*789Sahrens /* The 5 states: */
161*789Sahrens static arc_state_t ARC_anon;
162*789Sahrens static arc_state_t ARC_mru_top;
163*789Sahrens static arc_state_t ARC_mru_bot;
164*789Sahrens static arc_state_t ARC_mfu_top;
165*789Sahrens static arc_state_t ARC_mfu_bot;
166*789Sahrens 
167*789Sahrens static struct arc {
168*789Sahrens 	arc_state_t 	*anon;
169*789Sahrens 	arc_state_t	*mru_top;
170*789Sahrens 	arc_state_t	*mru_bot;
171*789Sahrens 	arc_state_t	*mfu_top;
172*789Sahrens 	arc_state_t	*mfu_bot;
173*789Sahrens 	uint64_t	size;		/* Actual total arc size */
174*789Sahrens 	uint64_t	p;		/* Target size (in bytes) of mru_top */
175*789Sahrens 	uint64_t	c;		/* Target size of cache (in bytes) */
176*789Sahrens 	uint64_t	c_min;		/* Minimum target cache size */
177*789Sahrens 	uint64_t	c_max;		/* Maximum target cache size */
178*789Sahrens 	uint64_t	incr;		/* Size by which to increment arc.c */
179*789Sahrens 	int64_t		size_check;
180*789Sahrens 
181*789Sahrens 	/* performance stats */
182*789Sahrens 	uint64_t	hits;
183*789Sahrens 	uint64_t	misses;
184*789Sahrens 	uint64_t	deleted;
185*789Sahrens 	uint64_t	skipped;
186*789Sahrens 	uint64_t	hash_elements;
187*789Sahrens 	uint64_t	hash_elements_max;
188*789Sahrens 	uint64_t	hash_collisions;
189*789Sahrens 	uint64_t	hash_chains;
190*789Sahrens 	uint32_t	hash_chain_max;
191*789Sahrens 
192*789Sahrens 	int		no_grow;	/* Don't try to grow cache size */
193*789Sahrens } arc;
194*789Sahrens 
195*789Sahrens /* Default amount to grow arc.incr */
196*789Sahrens static int64_t arc_incr_size = 1024;
197*789Sahrens 
198*789Sahrens /* > 0 ==> time to increment arc.c */
199*789Sahrens static int64_t arc_size_check_default = -1000;
200*789Sahrens 
201*789Sahrens static uint64_t arc_tempreserve;
202*789Sahrens 
203*789Sahrens typedef struct arc_callback arc_callback_t;
204*789Sahrens 
205*789Sahrens struct arc_callback {
206*789Sahrens 	arc_done_func_t		*acb_done;
207*789Sahrens 	void			*acb_private;
208*789Sahrens 	arc_byteswap_func_t	*acb_byteswap;
209*789Sahrens 	arc_buf_t		*acb_buf;
210*789Sahrens 	zio_t			*acb_zio_dummy;
211*789Sahrens 	arc_callback_t		*acb_next;
212*789Sahrens };
213*789Sahrens 
214*789Sahrens struct arc_buf_hdr {
215*789Sahrens 	/* immutable */
216*789Sahrens 	uint64_t		b_size;
217*789Sahrens 	spa_t			*b_spa;
218*789Sahrens 
219*789Sahrens 	/* protected by hash lock */
220*789Sahrens 	dva_t			b_dva;
221*789Sahrens 	uint64_t		b_birth;
222*789Sahrens 	uint64_t		b_cksum0;
223*789Sahrens 
224*789Sahrens 	arc_buf_hdr_t		*b_hash_next;
225*789Sahrens 	arc_buf_t		*b_buf;
226*789Sahrens 	uint32_t		b_flags;
227*789Sahrens 
228*789Sahrens 	kcondvar_t		b_cv;
229*789Sahrens 	arc_callback_t		*b_acb;
230*789Sahrens 
231*789Sahrens 	/* protected by arc state mutex */
232*789Sahrens 	arc_state_t		*b_state;
233*789Sahrens 	list_node_t		b_arc_node;
234*789Sahrens 
235*789Sahrens 	/* updated atomically */
236*789Sahrens 	clock_t			b_arc_access;
237*789Sahrens 
238*789Sahrens 	/* self protecting */
239*789Sahrens 	refcount_t		b_refcnt;
240*789Sahrens };
241*789Sahrens 
242*789Sahrens /*
243*789Sahrens  * Private ARC flags.  These are ARC-only flags that will show up
244*789Sahrens  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
245*789Sahrens  * be passed in as arc_flags in things like arc_read.  However, the private
246*789Sahrens  * flags should never be passed in and should only be set by ARC code.  When
247*789Sahrens  * adding new public flags, make sure not to smash the private ones.
248*789Sahrens  */
249*789Sahrens 
250*789Sahrens #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
251*789Sahrens #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
252*789Sahrens #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
253*789Sahrens 
254*789Sahrens #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
255*789Sahrens #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
256*789Sahrens #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
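/*
 * Illustrative sketch (not compiled): the private flags above are set and
 * cleared only by ARC code, with the buffer's hash lock held.  This is a
 * simplified model of how a read is bracketed (arc_read_done() below does
 * the real work); 'io_failed' is a hypothetical placeholder for the zio's
 * error status.
 */
#if 0
	mutex_enter(hash_lock);
	hdr->b_flags |= ARC_IO_IN_PROGRESS;	/* private: never passed in */
	mutex_exit(hash_lock);

	/* ... the I/O completes; with the hash lock reacquired: ... */

	ASSERT(HDR_IO_IN_PROGRESS(hdr));
	if (io_failed)
		hdr->b_flags |= ARC_IO_ERROR;
	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
#endif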
257*789Sahrens 
258*789Sahrens /*
259*789Sahrens  * Hash table routines
260*789Sahrens  */
261*789Sahrens 
262*789Sahrens #define	HT_LOCK_PAD	64
263*789Sahrens 
264*789Sahrens struct ht_lock {
265*789Sahrens 	kmutex_t	ht_lock;
266*789Sahrens #ifdef _KERNEL
267*789Sahrens 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
268*789Sahrens #endif
269*789Sahrens };
270*789Sahrens 
271*789Sahrens #define	BUF_LOCKS 256
272*789Sahrens typedef struct buf_hash_table {
273*789Sahrens 	uint64_t ht_mask;
274*789Sahrens 	arc_buf_hdr_t **ht_table;
275*789Sahrens 	struct ht_lock ht_locks[BUF_LOCKS];
276*789Sahrens } buf_hash_table_t;
277*789Sahrens 
278*789Sahrens static buf_hash_table_t buf_hash_table;
279*789Sahrens 
280*789Sahrens #define	BUF_HASH_INDEX(spa, dva, birth) \
281*789Sahrens 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
282*789Sahrens #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
283*789Sahrens #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
284*789Sahrens #define	HDR_LOCK(buf) \
285*789Sahrens 	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
286*789Sahrens 
287*789Sahrens uint64_t zfs_crc64_table[256];
288*789Sahrens 
289*789Sahrens static uint64_t
290*789Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
291*789Sahrens {
292*789Sahrens 	uintptr_t spav = (uintptr_t)spa;
293*789Sahrens 	uint8_t *vdva = (uint8_t *)dva;
294*789Sahrens 	uint64_t crc = -1ULL;
295*789Sahrens 	int i;
296*789Sahrens 
297*789Sahrens 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
298*789Sahrens 
299*789Sahrens 	for (i = 0; i < sizeof (dva_t); i++)
300*789Sahrens 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
301*789Sahrens 
302*789Sahrens 	crc ^= (spav>>8) ^ birth;
303*789Sahrens 
304*789Sahrens 	return (crc);
305*789Sahrens }
306*789Sahrens 
307*789Sahrens #define	BUF_EMPTY(buf)						\
308*789Sahrens 	((buf)->b_dva.dva_word[0] == 0 &&			\
309*789Sahrens 	(buf)->b_dva.dva_word[1] == 0 &&			\
310*789Sahrens 	(buf)->b_birth == 0)
311*789Sahrens 
312*789Sahrens #define	BUF_EQUAL(spa, dva, birth, buf)				\
313*789Sahrens 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
314*789Sahrens 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
315*789Sahrens 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
316*789Sahrens 
317*789Sahrens static arc_buf_hdr_t *
318*789Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
319*789Sahrens {
320*789Sahrens 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
321*789Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
322*789Sahrens 	arc_buf_hdr_t *buf;
323*789Sahrens 
324*789Sahrens 	mutex_enter(hash_lock);
325*789Sahrens 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
326*789Sahrens 	    buf = buf->b_hash_next) {
327*789Sahrens 		if (BUF_EQUAL(spa, dva, birth, buf)) {
328*789Sahrens 			*lockp = hash_lock;
329*789Sahrens 			return (buf);
330*789Sahrens 		}
331*789Sahrens 	}
332*789Sahrens 	mutex_exit(hash_lock);
333*789Sahrens 	*lockp = NULL;
334*789Sahrens 	return (NULL);
335*789Sahrens }
336*789Sahrens 
337*789Sahrens /*
338*789Sahrens  * Insert an entry into the hash table.  If there is already an element
339*789Sahrens  * equal to elem in the hash table, then the already existing element
340*789Sahrens  * will be returned and the new element will not be inserted.
341*789Sahrens  * Otherwise returns NULL.
342*789Sahrens  */
343*789Sahrens static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
344*789Sahrens static kthread_t *fbufs_lastthread;
345*789Sahrens static arc_buf_hdr_t *
346*789Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
347*789Sahrens {
348*789Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
349*789Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
350*789Sahrens 	arc_buf_hdr_t *fbuf;
351*789Sahrens 	uint32_t max, i;
352*789Sahrens 
353*789Sahrens 	fbufs_lastthread = curthread;
354*789Sahrens 	*lockp = hash_lock;
355*789Sahrens 	mutex_enter(hash_lock);
356*789Sahrens 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
357*789Sahrens 	    fbuf = fbuf->b_hash_next, i++) {
358*789Sahrens 		if (i < sizeof (fbufs) / sizeof (fbufs[0]))
359*789Sahrens 			fbufs[i] = fbuf;
360*789Sahrens 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
361*789Sahrens 			return (fbuf);
362*789Sahrens 	}
363*789Sahrens 
364*789Sahrens 	buf->b_hash_next = buf_hash_table.ht_table[idx];
365*789Sahrens 	buf_hash_table.ht_table[idx] = buf;
366*789Sahrens 
367*789Sahrens 	/* collect some hash table performance data */
368*789Sahrens 	if (i > 0) {
369*789Sahrens 		atomic_add_64(&arc.hash_collisions, 1);
370*789Sahrens 		if (i == 1)
371*789Sahrens 			atomic_add_64(&arc.hash_chains, 1);
372*789Sahrens 	}
373*789Sahrens 	while (i > (max = arc.hash_chain_max) &&
374*789Sahrens 	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
375*789Sahrens 		continue;
376*789Sahrens 	}
377*789Sahrens 	atomic_add_64(&arc.hash_elements, 1);
378*789Sahrens 	if (arc.hash_elements > arc.hash_elements_max)
379*789Sahrens 		atomic_add_64(&arc.hash_elements_max, 1);
380*789Sahrens 
381*789Sahrens 	return (NULL);
382*789Sahrens }
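/*
 * Illustrative sketch (not compiled): typical use of buf_hash_insert()
 * as described in the comment preceding it.  In either outcome the hash
 * lock is returned held; 'hdr' and the handling of a pre-existing header
 * are assumed caller context (arc_read-style), not a real code path
 * copied from this file.
 */
#if 0
	arc_buf_hdr_t *exists;
	kmutex_t *hash_lock;

	exists = buf_hash_insert(hdr, &hash_lock);
	if (exists != NULL) {
		/* An equal header is already cached; use it, drop ours. */
		mutex_exit(hash_lock);
	} else {
		/* hdr is now in the table; finish setup, then unlock.   */
		mutex_exit(hash_lock);
	}
#endif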
383*789Sahrens 
384*789Sahrens static void
385*789Sahrens buf_hash_remove(arc_buf_hdr_t *buf)
386*789Sahrens {
387*789Sahrens 	arc_buf_hdr_t *fbuf, **bufp;
388*789Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
389*789Sahrens 
390*789Sahrens 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
391*789Sahrens 
392*789Sahrens 	bufp = &buf_hash_table.ht_table[idx];
393*789Sahrens 	while ((fbuf = *bufp) != buf) {
394*789Sahrens 		ASSERT(fbuf != NULL);
395*789Sahrens 		bufp = &fbuf->b_hash_next;
396*789Sahrens 	}
397*789Sahrens 	*bufp = buf->b_hash_next;
398*789Sahrens 	buf->b_hash_next = NULL;
399*789Sahrens 
400*789Sahrens 	/* collect some hash table performance data */
401*789Sahrens 	atomic_add_64(&arc.hash_elements, -1);
402*789Sahrens 	if (buf_hash_table.ht_table[idx] &&
403*789Sahrens 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
404*789Sahrens 		atomic_add_64(&arc.hash_chains, -1);
405*789Sahrens }
406*789Sahrens 
407*789Sahrens /*
408*789Sahrens  * Global data structures and functions for the buf kmem cache.
409*789Sahrens  */
410*789Sahrens static kmem_cache_t *hdr_cache;
411*789Sahrens static kmem_cache_t *buf_cache;
412*789Sahrens 
413*789Sahrens static void
414*789Sahrens buf_fini(void)
415*789Sahrens {
416*789Sahrens 	int i;
417*789Sahrens 
418*789Sahrens 	kmem_free(buf_hash_table.ht_table,
419*789Sahrens 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
420*789Sahrens 	for (i = 0; i < BUF_LOCKS; i++)
421*789Sahrens 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
422*789Sahrens 	kmem_cache_destroy(hdr_cache);
423*789Sahrens 	kmem_cache_destroy(buf_cache);
424*789Sahrens }
425*789Sahrens 
426*789Sahrens /*
427*789Sahrens  * Constructor callback - called when the cache is empty
428*789Sahrens  * and a new buf is requested.
429*789Sahrens  */
430*789Sahrens /* ARGSUSED */
431*789Sahrens static int
432*789Sahrens hdr_cons(void *vbuf, void *unused, int kmflag)
433*789Sahrens {
434*789Sahrens 	arc_buf_hdr_t *buf = vbuf;
435*789Sahrens 
436*789Sahrens 	bzero(buf, sizeof (arc_buf_hdr_t));
437*789Sahrens 	refcount_create(&buf->b_refcnt);
438*789Sahrens 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
439*789Sahrens 	return (0);
440*789Sahrens }
441*789Sahrens 
442*789Sahrens /*
443*789Sahrens  * Destructor callback - called when a cached buf is
444*789Sahrens  * no longer required.
445*789Sahrens  */
446*789Sahrens /* ARGSUSED */
447*789Sahrens static void
448*789Sahrens hdr_dest(void *vbuf, void *unused)
449*789Sahrens {
450*789Sahrens 	arc_buf_hdr_t *buf = vbuf;
451*789Sahrens 
452*789Sahrens 	refcount_destroy(&buf->b_refcnt);
453*789Sahrens 	cv_destroy(&buf->b_cv);
454*789Sahrens }
455*789Sahrens 
456*789Sahrens void arc_kmem_reclaim(void);
457*789Sahrens 
458*789Sahrens /*
459*789Sahrens  * Reclaim callback -- invoked when memory is low.
460*789Sahrens  */
461*789Sahrens /* ARGSUSED */
462*789Sahrens static void
463*789Sahrens hdr_recl(void *unused)
464*789Sahrens {
465*789Sahrens 	dprintf("hdr_recl called\n");
466*789Sahrens 	arc_kmem_reclaim();
467*789Sahrens }
468*789Sahrens 
469*789Sahrens static void
470*789Sahrens buf_init(void)
471*789Sahrens {
472*789Sahrens 	uint64_t *ct;
473*789Sahrens 	uint64_t hsize = 1ULL << 10;
474*789Sahrens 	int i, j;
475*789Sahrens 
476*789Sahrens 	/*
477*789Sahrens 	 * The hash table is big enough to fill all of physical memory
478*789Sahrens 	 * with an average 4k block size.  The table will take up
479*789Sahrens 	 * totalmem*sizeof(void*)/4k bytes (e.g. 2MB/GB with 8-byte
480*789Sahrens 	 * pointers).
481*789Sahrens 	 */
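	/*
	 * Worked example (illustrative): with 4 GB of physical memory,
	 * physmem * PAGESIZE == 1ULL << 32, so the loop below stops at
	 * hsize == 1ULL << 20 (since (1ULL << 20) * 4096 covers 4 GB).
	 * The table is then (1ULL << 20) * sizeof (void *) == 8 MB with
	 * 8-byte pointers -- the 2MB/GB figure quoted above.
	 */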
482*789Sahrens 	while (hsize * 4096 < physmem * PAGESIZE)
483*789Sahrens 		hsize <<= 1;
484*789Sahrens 
485*789Sahrens 	buf_hash_table.ht_mask = hsize - 1;
486*789Sahrens 	buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
487*789Sahrens 
488*789Sahrens 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
489*789Sahrens 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
490*789Sahrens 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
491*789Sahrens 	    0, NULL, NULL, NULL, NULL, NULL, 0);
492*789Sahrens 
493*789Sahrens 	for (i = 0; i < 256; i++)
494*789Sahrens 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
495*789Sahrens 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
496*789Sahrens 
497*789Sahrens 	for (i = 0; i < BUF_LOCKS; i++) {
498*789Sahrens 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
499*789Sahrens 		    NULL, MUTEX_DEFAULT, NULL);
500*789Sahrens 	}
501*789Sahrens }
502*789Sahrens 
503*789Sahrens #define	ARC_MINTIME	(hz>>4) /* 62 ms */
504*789Sahrens 
505*789Sahrens #define	ARC_TAG		(void *)0x05201962
506*789Sahrens 
507*789Sahrens static void
508*789Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
509*789Sahrens {
510*789Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
511*789Sahrens 
512*789Sahrens 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
513*789Sahrens 	    (ab->b_state != arc.anon)) {
514*789Sahrens 
515*789Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
516*789Sahrens 		mutex_enter(&ab->b_state->mtx);
517*789Sahrens 		ASSERT(!refcount_is_zero(&ab->b_refcnt));
518*789Sahrens 		ASSERT(list_link_active(&ab->b_arc_node));
519*789Sahrens 		list_remove(&ab->b_state->list, ab);
520*789Sahrens 		ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
521*789Sahrens 		ab->b_state->lsize -= ab->b_size;
522*789Sahrens 		mutex_exit(&ab->b_state->mtx);
523*789Sahrens 	}
524*789Sahrens }
525*789Sahrens 
526*789Sahrens static int
527*789Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
528*789Sahrens {
529*789Sahrens 	int cnt;
530*789Sahrens 
531*789Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
532*789Sahrens 
533*789Sahrens 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
534*789Sahrens 	    (ab->b_state != arc.anon)) {
535*789Sahrens 
536*789Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
537*789Sahrens 		mutex_enter(&ab->b_state->mtx);
538*789Sahrens 		ASSERT(!list_link_active(&ab->b_arc_node));
539*789Sahrens 		list_insert_head(&ab->b_state->list, ab);
540*789Sahrens 		ASSERT(ab->b_buf != NULL);
541*789Sahrens 		ab->b_state->lsize += ab->b_size;
542*789Sahrens 		mutex_exit(&ab->b_state->mtx);
543*789Sahrens 	}
544*789Sahrens 	return (cnt);
545*789Sahrens }
546*789Sahrens 
547*789Sahrens /*
548*789Sahrens  * Move the supplied buffer to the indicated state.  The mutex
549*789Sahrens  * for the buffer must be held by the caller.
550*789Sahrens  */
551*789Sahrens static void
552*789Sahrens arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
553*789Sahrens     kmutex_t *hash_lock)
554*789Sahrens {
555*789Sahrens 	arc_buf_t *buf;
556*789Sahrens 
557*789Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
558*789Sahrens 
559*789Sahrens 	/*
560*789Sahrens 	 * If this buffer is evictable, transfer it from the
561*789Sahrens 	 * old state list to the new state list.
562*789Sahrens 	 */
563*789Sahrens 	if (refcount_is_zero(&ab->b_refcnt)) {
564*789Sahrens 		if (ab->b_state != arc.anon) {
565*789Sahrens 			int drop_mutex = FALSE;
566*789Sahrens 
567*789Sahrens 			if (!MUTEX_HELD(&ab->b_state->mtx)) {
568*789Sahrens 				mutex_enter(&ab->b_state->mtx);
569*789Sahrens 				drop_mutex = TRUE;
570*789Sahrens 			}
571*789Sahrens 			ASSERT(list_link_active(&ab->b_arc_node));
572*789Sahrens 			list_remove(&ab->b_state->list, ab);
573*789Sahrens 			ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
574*789Sahrens 			ab->b_state->lsize -= ab->b_size;
575*789Sahrens 			if (drop_mutex)
576*789Sahrens 				mutex_exit(&ab->b_state->mtx);
577*789Sahrens 		}
578*789Sahrens 		if (new_state != arc.anon) {
579*789Sahrens 			int drop_mutex = FALSE;
580*789Sahrens 
581*789Sahrens 			if (!MUTEX_HELD(&new_state->mtx)) {
582*789Sahrens 				mutex_enter(&new_state->mtx);
583*789Sahrens 				drop_mutex = TRUE;
584*789Sahrens 			}
585*789Sahrens 			list_insert_head(&new_state->list, ab);
586*789Sahrens 			ASSERT(ab->b_buf != NULL);
587*789Sahrens 			new_state->lsize += ab->b_size;
588*789Sahrens 			if (drop_mutex)
589*789Sahrens 				mutex_exit(&new_state->mtx);
590*789Sahrens 		}
591*789Sahrens 	}
592*789Sahrens 
593*789Sahrens 	ASSERT(!BUF_EMPTY(ab));
594*789Sahrens 	if (new_state == arc.anon && ab->b_state != arc.anon) {
595*789Sahrens 		buf_hash_remove(ab);
596*789Sahrens 	}
597*789Sahrens 
598*789Sahrens 	/*
599*789Sahrens 	 * If this buffer isn't being transferred to one of the MRU
600*789Sahrens 	 * states, it's safe to clear its prefetch flag.
601*789Sahrens 	 */
602*789Sahrens 	if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
603*789Sahrens 		ab->b_flags &= ~ARC_PREFETCH;
604*789Sahrens 	}
605*789Sahrens 
606*789Sahrens 	buf = ab->b_buf;
607*789Sahrens 	if (buf == NULL) {
608*789Sahrens 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
609*789Sahrens 		atomic_add_64(&ab->b_state->size, -ab->b_size);
610*789Sahrens 		/* we should only be here if we are deleting state */
611*789Sahrens 		ASSERT(new_state == arc.anon &&
612*789Sahrens 		    (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
613*789Sahrens 	} else while (buf) {
614*789Sahrens 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
615*789Sahrens 		atomic_add_64(&ab->b_state->size, -ab->b_size);
616*789Sahrens 		atomic_add_64(&new_state->size, ab->b_size);
617*789Sahrens 		buf = buf->b_next;
618*789Sahrens 	}
619*789Sahrens 	ab->b_state = new_state;
620*789Sahrens }
621*789Sahrens 
622*789Sahrens arc_buf_t *
623*789Sahrens arc_buf_alloc(spa_t *spa, int size, void *tag)
624*789Sahrens {
625*789Sahrens 	arc_buf_hdr_t *hdr;
626*789Sahrens 	arc_buf_t *buf;
627*789Sahrens 
628*789Sahrens 	ASSERT3U(size, >, 0);
629*789Sahrens 	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
630*789Sahrens 	ASSERT(BUF_EMPTY(hdr));
631*789Sahrens 	hdr->b_size = size;
632*789Sahrens 	hdr->b_spa = spa;
633*789Sahrens 	hdr->b_state = arc.anon;
634*789Sahrens 	hdr->b_arc_access = 0;
635*789Sahrens 	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
636*789Sahrens 	buf->b_hdr = hdr;
637*789Sahrens 	buf->b_next = NULL;
638*789Sahrens 	buf->b_data = zio_buf_alloc(size);
639*789Sahrens 	hdr->b_buf = buf;
640*789Sahrens 	hdr->b_flags = 0;
641*789Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
642*789Sahrens 	(void) refcount_add(&hdr->b_refcnt, tag);
643*789Sahrens 
644*789Sahrens 	atomic_add_64(&arc.size, size);
645*789Sahrens 	atomic_add_64(&arc.anon->size, size);
646*789Sahrens 
647*789Sahrens 	return (buf);
648*789Sahrens }
649*789Sahrens 
650*789Sahrens static void
651*789Sahrens arc_hdr_free(arc_buf_hdr_t *hdr)
652*789Sahrens {
653*789Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
654*789Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
655*789Sahrens 
656*789Sahrens 	if (!BUF_EMPTY(hdr)) {
657*789Sahrens 		/*
658*789Sahrens 		 * We can be called with an arc state lock held,
659*789Sahrens 		 * so we can't hold a hash lock here.
660*789Sahrens 		 * ASSERT(not in hash table)
661*789Sahrens 		 */
662*789Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
663*789Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
664*789Sahrens 		hdr->b_birth = 0;
665*789Sahrens 		hdr->b_cksum0 = 0;
666*789Sahrens 	}
667*789Sahrens 	if (hdr->b_buf) {
668*789Sahrens 		arc_buf_t *buf = hdr->b_buf;
669*789Sahrens 
670*789Sahrens 		ASSERT3U(hdr->b_size, >, 0);
671*789Sahrens 		zio_buf_free(buf->b_data, hdr->b_size);
672*789Sahrens 		atomic_add_64(&arc.size, -hdr->b_size);
673*789Sahrens 		ASSERT3U(arc.anon->size, >=, hdr->b_size);
674*789Sahrens 		atomic_add_64(&arc.anon->size, -hdr->b_size);
675*789Sahrens 		ASSERT3P(buf->b_next, ==, NULL);
676*789Sahrens 		kmem_cache_free(buf_cache, buf);
677*789Sahrens 		hdr->b_buf = NULL;
678*789Sahrens 	}
679*789Sahrens 	ASSERT(!list_link_active(&hdr->b_arc_node));
680*789Sahrens 	ASSERT3P(hdr->b_hash_next, ==, NULL);
681*789Sahrens 	ASSERT3P(hdr->b_acb, ==, NULL);
682*789Sahrens 	kmem_cache_free(hdr_cache, hdr);
683*789Sahrens }
684*789Sahrens 
685*789Sahrens void
686*789Sahrens arc_buf_free(arc_buf_t *buf, void *tag)
687*789Sahrens {
688*789Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
689*789Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
690*789Sahrens 	int freeable;
691*789Sahrens 
692*789Sahrens 	mutex_enter(hash_lock);
693*789Sahrens 	if (remove_reference(hdr, hash_lock, tag) > 0) {
694*789Sahrens 		arc_buf_t **bufp = &hdr->b_buf;
695*789Sahrens 		arc_state_t *state = hdr->b_state;
696*789Sahrens 		uint64_t size = hdr->b_size;
697*789Sahrens 
698*789Sahrens 		ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
699*789Sahrens 		while (*bufp != buf) {
700*789Sahrens 			ASSERT(*bufp);
701*789Sahrens 			bufp = &(*bufp)->b_next;
702*789Sahrens 		}
703*789Sahrens 		*bufp = buf->b_next;
704*789Sahrens 		mutex_exit(hash_lock);
705*789Sahrens 		zio_buf_free(buf->b_data, size);
706*789Sahrens 		atomic_add_64(&arc.size, -size);
707*789Sahrens 		kmem_cache_free(buf_cache, buf);
708*789Sahrens 		ASSERT3U(state->size, >=, size);
709*789Sahrens 		atomic_add_64(&state->size, -size);
710*789Sahrens 		return;
711*789Sahrens 	}
712*789Sahrens 
713*789Sahrens 	/* don't free buffers that are in the middle of an async write */
714*789Sahrens 	freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
715*789Sahrens 	mutex_exit(hash_lock);
716*789Sahrens 
717*789Sahrens 	if (freeable)
718*789Sahrens 		arc_hdr_free(hdr);
719*789Sahrens }
720*789Sahrens 
721*789Sahrens int
722*789Sahrens arc_buf_size(arc_buf_t *buf)
723*789Sahrens {
724*789Sahrens 	return (buf->b_hdr->b_size);
725*789Sahrens }
726*789Sahrens 
727*789Sahrens /*
728*789Sahrens  * Evict buffers from list until we've removed the specified number of
729*789Sahrens  * bytes.  Move the removed buffers to the appropriate evict state.
730*789Sahrens  */
731*789Sahrens static uint64_t
732*789Sahrens arc_evict_state(arc_state_t *state, int64_t bytes)
733*789Sahrens {
734*789Sahrens 	arc_state_t *evicted_state;
735*789Sahrens 	uint64_t bytes_evicted = 0;
736*789Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
737*789Sahrens 	kmutex_t *hash_lock;
738*789Sahrens 
739*789Sahrens 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
740*789Sahrens 
741*789Sahrens 	if (state == arc.mru_top)
742*789Sahrens 		evicted_state = arc.mru_bot;
743*789Sahrens 	else
744*789Sahrens 		evicted_state = arc.mfu_bot;
745*789Sahrens 
746*789Sahrens 	mutex_enter(&state->mtx);
747*789Sahrens 	mutex_enter(&evicted_state->mtx);
748*789Sahrens 
749*789Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
750*789Sahrens 		ab_prev = list_prev(&state->list, ab);
751*789Sahrens 		hash_lock = HDR_LOCK(ab);
752*789Sahrens 		if (mutex_tryenter(hash_lock)) {
753*789Sahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
754*789Sahrens 			arc_change_state(evicted_state, ab, hash_lock);
755*789Sahrens 			zio_buf_free(ab->b_buf->b_data, ab->b_size);
756*789Sahrens 			atomic_add_64(&arc.size, -ab->b_size);
757*789Sahrens 			ASSERT3P(ab->b_buf->b_next, ==, NULL);
758*789Sahrens 			kmem_cache_free(buf_cache, ab->b_buf);
759*789Sahrens 			ab->b_buf = NULL;
760*789Sahrens 			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
761*789Sahrens 			bytes_evicted += ab->b_size;
762*789Sahrens 			mutex_exit(hash_lock);
763*789Sahrens 			if (bytes_evicted >= bytes)
764*789Sahrens 				break;
765*789Sahrens 		} else {
766*789Sahrens 			atomic_add_64(&arc.skipped, 1);
767*789Sahrens 		}
768*789Sahrens 	}
769*789Sahrens 	mutex_exit(&evicted_state->mtx);
770*789Sahrens 	mutex_exit(&state->mtx);
771*789Sahrens 
772*789Sahrens 	if (bytes_evicted < bytes)
773*789Sahrens 		dprintf("only evicted %lld bytes from %p",
774*789Sahrens 		    (longlong_t)bytes_evicted, state);
775*789Sahrens 
776*789Sahrens 	return (bytes_evicted);
777*789Sahrens }
778*789Sahrens 
779*789Sahrens /*
780*789Sahrens  * Remove buffers from list until we've removed the specified number of
781*789Sahrens  * bytes.  Destroy the buffers that are removed.
782*789Sahrens  */
783*789Sahrens static void
784*789Sahrens arc_delete_state(arc_state_t *state, int64_t bytes)
785*789Sahrens {
786*789Sahrens 	uint_t bufs_skipped = 0;
787*789Sahrens 	uint64_t bytes_deleted = 0;
788*789Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
789*789Sahrens 	kmutex_t *hash_lock;
790*789Sahrens 
791*789Sahrens top:
792*789Sahrens 	mutex_enter(&state->mtx);
793*789Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
794*789Sahrens 		ab_prev = list_prev(&state->list, ab);
795*789Sahrens 		hash_lock = HDR_LOCK(ab);
796*789Sahrens 		if (mutex_tryenter(hash_lock)) {
797*789Sahrens 			arc_change_state(arc.anon, ab, hash_lock);
798*789Sahrens 			mutex_exit(hash_lock);
799*789Sahrens 			atomic_add_64(&arc.deleted, 1);
800*789Sahrens 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
801*789Sahrens 			bytes_deleted += ab->b_size;
802*789Sahrens 			arc_hdr_free(ab);
803*789Sahrens 			if (bytes >= 0 && bytes_deleted >= bytes)
804*789Sahrens 				break;
805*789Sahrens 		} else {
806*789Sahrens 			if (bytes < 0) {
807*789Sahrens 				mutex_exit(&state->mtx);
808*789Sahrens 				mutex_enter(hash_lock);
809*789Sahrens 				mutex_exit(hash_lock);
810*789Sahrens 				goto top;
811*789Sahrens 			}
812*789Sahrens 			bufs_skipped += 1;
813*789Sahrens 		}
814*789Sahrens 	}
815*789Sahrens 	mutex_exit(&state->mtx);
816*789Sahrens 
817*789Sahrens 	if (bufs_skipped) {
818*789Sahrens 		atomic_add_64(&arc.skipped, bufs_skipped);
819*789Sahrens 		ASSERT(bytes >= 0);
820*789Sahrens 	}
821*789Sahrens 
822*789Sahrens 	if (bytes_deleted < bytes)
823*789Sahrens 		dprintf("only deleted %lld bytes from %p",
824*789Sahrens 		    (longlong_t)bytes_deleted, state);
825*789Sahrens }
826*789Sahrens 
827*789Sahrens static void
828*789Sahrens arc_adjust(void)
829*789Sahrens {
830*789Sahrens 	int64_t top_sz, mru_over, arc_over;
831*789Sahrens 
832*789Sahrens 	top_sz = arc.anon->size + arc.mru_top->size;
833*789Sahrens 
834*789Sahrens 	if (top_sz > arc.p && arc.mru_top->lsize > 0) {
835*789Sahrens 		int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
836*789Sahrens 		(void) arc_evict_state(arc.mru_top, toevict);
837*789Sahrens 		top_sz = arc.anon->size + arc.mru_top->size;
838*789Sahrens 	}
839*789Sahrens 
840*789Sahrens 	mru_over = top_sz + arc.mru_bot->size - arc.c;
841*789Sahrens 
842*789Sahrens 	if (mru_over > 0) {
843*789Sahrens 		if (arc.mru_bot->lsize > 0) {
844*789Sahrens 			int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
845*789Sahrens 			arc_delete_state(arc.mru_bot, todelete);
846*789Sahrens 		}
847*789Sahrens 	}
848*789Sahrens 
849*789Sahrens 	if ((arc_over = arc.size - arc.c) > 0) {
850*789Sahrens 		int64_t table_over;
851*789Sahrens 
852*789Sahrens 		if (arc.mfu_top->lsize > 0) {
853*789Sahrens 			int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
854*789Sahrens 			(void) arc_evict_state(arc.mfu_top, toevict);
855*789Sahrens 		}
856*789Sahrens 
857*789Sahrens 		table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
858*789Sahrens 		    - arc.c*2;
859*789Sahrens 
860*789Sahrens 		if (table_over > 0 && arc.mfu_bot->lsize > 0) {
861*789Sahrens 			int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
862*789Sahrens 			arc_delete_state(arc.mfu_bot, todelete);
863*789Sahrens 		}
864*789Sahrens 	}
865*789Sahrens }
866*789Sahrens 
867*789Sahrens /*
868*789Sahrens  * Flush all *evictable* data from the cache.
869*789Sahrens  * NOTE: this will not touch "active" (i.e. referenced) data.
870*789Sahrens  */
871*789Sahrens void
872*789Sahrens arc_flush(void)
873*789Sahrens {
874*789Sahrens 	arc_delete_state(arc.mru_top, -1);
875*789Sahrens 	arc_delete_state(arc.mfu_top, -1);
876*789Sahrens 
877*789Sahrens 	arc_delete_state(arc.mru_bot, -1);
878*789Sahrens 	arc_delete_state(arc.mfu_bot, -1);
879*789Sahrens }
880*789Sahrens 
881*789Sahrens void
882*789Sahrens arc_kmem_reclaim(void)
883*789Sahrens {
884*789Sahrens 	/* Remove 6.25% */
885*789Sahrens 	/*
886*789Sahrens 	 * We need arc_reclaim_lock because we don't want multiple
887*789Sahrens 	 * threads trying to reclaim concurrently.
888*789Sahrens 	 */
889*789Sahrens 
890*789Sahrens 	/*
891*789Sahrens 	 * umem calls the reclaim func when we destroy the buf cache,
892*789Sahrens 	 * which is after we do arc_fini().  So we set a flag to prevent
893*789Sahrens 	 * accessing the destroyed mutexes and lists.
894*789Sahrens 	 */
895*789Sahrens 	if (arc_dead)
896*789Sahrens 		return;
897*789Sahrens 
898*789Sahrens 	mutex_enter(&arc_reclaim_lock);
899*789Sahrens 
900*789Sahrens 	atomic_add_64(&arc.c, -(arc.c >> 4));
901*789Sahrens 	if (arc.c < arc.c_min)
902*789Sahrens 		arc.c = arc.c_min;
903*789Sahrens 	atomic_add_64(&arc.p, -(arc.p >> 4));
904*789Sahrens 
905*789Sahrens 	arc_adjust();
906*789Sahrens 
907*789Sahrens 	/* Cool it for a while */
908*789Sahrens 	arc.incr = 0;
909*789Sahrens 	arc.size_check = arc_size_check_default << 3;
910*789Sahrens 
911*789Sahrens 	mutex_exit(&arc_reclaim_lock);
912*789Sahrens }
913*789Sahrens 
914*789Sahrens static int
915*789Sahrens arc_reclaim_needed(void)
916*789Sahrens {
917*789Sahrens 	uint64_t extra;
918*789Sahrens 
919*789Sahrens #ifdef _KERNEL
920*789Sahrens 	/*
921*789Sahrens 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
922*789Sahrens 	 */
923*789Sahrens 	extra = desfree;
924*789Sahrens 
925*789Sahrens 	/*
926*789Sahrens 	 * check that we're out of range of the pageout scanner.  It starts to
927*789Sahrens 	 * schedule paging if freemem is less than lotsfree and needfree.
928*789Sahrens 	 * lotsfree is the high-water mark for pageout, and needfree is the
929*789Sahrens 	 * number of needed free pages.  We add extra pages here to make sure
930*789Sahrens 	 * the scanner doesn't start up while we're freeing memory.
931*789Sahrens 	 */
932*789Sahrens 	if (freemem < lotsfree + needfree + extra)
933*789Sahrens 		return (1);
934*789Sahrens 
935*789Sahrens 	/*
936*789Sahrens 	 * check to make sure that swapfs has enough space so that anon
937*789Sahrens 	 * reservations can still succeed. anon_resvmem() checks that the
938*789Sahrens 	 * availrmem is greater than swapfs_minfree, and the number of reserved
939*789Sahrens 	 * swap pages.  We also add a bit of extra here just to prevent
940*789Sahrens 	 * circumstances from getting really dire.
941*789Sahrens 	 */
942*789Sahrens 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
943*789Sahrens 		return (1);
944*789Sahrens 
945*789Sahrens 	/*
946*789Sahrens 	 * If we're on an i386 platform, it's possible that we'll exhaust the
947*789Sahrens 	 * kernel heap space before we ever run out of available physical
948*789Sahrens 	 * memory.  Most checks of the size of the heap_area compare against
949*789Sahrens 	 * tune.t_minarmem, which is the minimum available real memory that we
950*789Sahrens 	 * can have in the system.  However, this is generally fixed at 25 pages
951*789Sahrens 	 * which is so low that it's useless.  In this comparison, we seek to
952*789Sahrens 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
953*789Sahrens 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
954*789Sahrens 	 * free)
955*789Sahrens 	 */
956*789Sahrens #if defined(__i386)
957*789Sahrens 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
958*789Sahrens 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
959*789Sahrens 		return (1);
960*789Sahrens #endif
961*789Sahrens 
962*789Sahrens #else
963*789Sahrens 	if (spa_get_random(100) == 0)
964*789Sahrens 		return (1);
965*789Sahrens #endif
966*789Sahrens 	return (0);
967*789Sahrens }
968*789Sahrens 
969*789Sahrens static void
970*789Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
971*789Sahrens {
972*789Sahrens 	size_t			i;
973*789Sahrens 	kmem_cache_t		*prev_cache = NULL;
974*789Sahrens 	extern kmem_cache_t	*zio_buf_cache[];
975*789Sahrens 
976*789Sahrens 	/*
977*789Sahrens 	 * an aggressive reclamation will shrink the cache size as well as reap
978*789Sahrens 	 * free kmem buffers.  The arc_kmem_reclaim function is called when the
979*789Sahrens 	 * header-cache is reaped, so we only reap the header cache if we're
980*789Sahrens 	 * performing an aggressive reclaim.  If we're not, just clean the kmem
981*789Sahrens 	 * buffer caches.
982*789Sahrens 	 */
983*789Sahrens 	if (strat == ARC_RECLAIM_AGGR)
984*789Sahrens 		kmem_cache_reap_now(hdr_cache);
985*789Sahrens 
986*789Sahrens 	kmem_cache_reap_now(buf_cache);
987*789Sahrens 
988*789Sahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
989*789Sahrens 		if (zio_buf_cache[i] != prev_cache) {
990*789Sahrens 			prev_cache = zio_buf_cache[i];
991*789Sahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
992*789Sahrens 		}
993*789Sahrens 	}
994*789Sahrens }
995*789Sahrens 
996*789Sahrens static void
997*789Sahrens arc_reclaim_thread(void)
998*789Sahrens {
999*789Sahrens 	clock_t			growtime = 0;
1000*789Sahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1001*789Sahrens 	callb_cpr_t		cpr;
1002*789Sahrens 
1003*789Sahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1004*789Sahrens 
1005*789Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1006*789Sahrens 	while (arc_thread_exit == 0) {
1007*789Sahrens 		if (arc_reclaim_needed()) {
1008*789Sahrens 
1009*789Sahrens 			if (arc.no_grow) {
1010*789Sahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
1011*789Sahrens 					last_reclaim = ARC_RECLAIM_AGGR;
1012*789Sahrens 				} else {
1013*789Sahrens 					last_reclaim = ARC_RECLAIM_CONS;
1014*789Sahrens 				}
1015*789Sahrens 			} else {
1016*789Sahrens 				arc.no_grow = TRUE;
1017*789Sahrens 				last_reclaim = ARC_RECLAIM_AGGR;
1018*789Sahrens 				membar_producer();
1019*789Sahrens 			}
1020*789Sahrens 
1021*789Sahrens 			/* reset the growth delay for every reclaim */
1022*789Sahrens 			growtime = lbolt + (arc_grow_retry * hz);
1023*789Sahrens 
1024*789Sahrens 			arc_kmem_reap_now(last_reclaim);
1025*789Sahrens 
1026*789Sahrens 		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1027*789Sahrens 			arc.no_grow = FALSE;
1028*789Sahrens 		}
1029*789Sahrens 
1030*789Sahrens 		/* block until needed, or one second, whichever is shorter */
1031*789Sahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
1032*789Sahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
1033*789Sahrens 		    &arc_reclaim_thr_lock, (lbolt + hz));
1034*789Sahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1035*789Sahrens 	}
1036*789Sahrens 
1037*789Sahrens 	arc_thread_exit = 0;
1038*789Sahrens 	cv_broadcast(&arc_reclaim_thr_cv);
1039*789Sahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1040*789Sahrens 	thread_exit();
1041*789Sahrens }
1042*789Sahrens 
1043*789Sahrens static void
1044*789Sahrens arc_try_grow(int64_t bytes)
1045*789Sahrens {
1046*789Sahrens 	/*
1047*789Sahrens 	 * If we're within (2 * maxblocksize) bytes of the target
1048*789Sahrens 	 * cache size, increment the target cache size
1049*789Sahrens 	 */
1050*789Sahrens 	atomic_add_64((uint64_t *)&arc.size_check, 1);
1051*789Sahrens 
1052*789Sahrens 	if (arc_reclaim_needed()) {
1053*789Sahrens 		cv_signal(&arc_reclaim_thr_cv);
1054*789Sahrens 		return;
1055*789Sahrens 	}
1056*789Sahrens 
1057*789Sahrens 	if (arc.no_grow)
1058*789Sahrens 		return;
1059*789Sahrens 
1060*789Sahrens 	/*
1061*789Sahrens 	 * Grow the cache if we're close to the target size or have exceeded it;
1062*789Sahrens 	 * otherwise there is enough headroom to leave it alone.  Above, we bail
1063*789Sahrens 	 * out if we can't grow, or if we shouldn't because a reclaim is in progress.
1064*789Sahrens 	 */
1065*789Sahrens 	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
1066*789Sahrens 		if (arc.size_check > 0) {
1067*789Sahrens 			arc.size_check = arc_size_check_default;
1068*789Sahrens 			atomic_add_64(&arc.incr, arc_incr_size);
1069*789Sahrens 		}
1070*789Sahrens 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1071*789Sahrens 		if (arc.c > arc.c_max)
1072*789Sahrens 			arc.c = arc.c_max;
1073*789Sahrens 		else
1074*789Sahrens 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1075*789Sahrens 	} else if (arc.size > arc.c) {
1076*789Sahrens 		if (arc.size_check > 0) {
1077*789Sahrens 			arc.size_check = arc_size_check_default;
1078*789Sahrens 			atomic_add_64(&arc.incr, arc_incr_size);
1079*789Sahrens 		}
1080*789Sahrens 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1081*789Sahrens 		if (arc.c > arc.c_max)
1082*789Sahrens 			arc.c = arc.c_max;
1083*789Sahrens 		else
1084*789Sahrens 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1085*789Sahrens 	}
1086*789Sahrens }
1087*789Sahrens 
1088*789Sahrens /*
1089*789Sahrens  * check if the cache has reached its limits and eviction is required prior to
1090*789Sahrens  * insert.  In this situation, we want to evict if no_grow is set.  Otherwise,
1091*789Sahrens  * the cache is either big enough that we can insert, or an arc_try_grow will
1092*789Sahrens  * result in more space being made available.
1093*789Sahrens  */
1094*789Sahrens 
1095*789Sahrens static int
1096*789Sahrens arc_evict_needed()
1097*789Sahrens {
1098*789Sahrens 
1099*789Sahrens 	if (arc_reclaim_needed())
1100*789Sahrens 		return (1);
1101*789Sahrens 
1102*789Sahrens 	if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
1103*789Sahrens 		return (1);
1104*789Sahrens 
1105*789Sahrens 	return (0);
1106*789Sahrens }
1107*789Sahrens 
1108*789Sahrens /*
1109*789Sahrens  * The state, supplied as the first argument, is going to have something
1110*789Sahrens  * inserted on its behalf. So, determine which cache must be victimized to
1111*789Sahrens  * satisfy an insertion for this state.  We have the following cases:
1112*789Sahrens  *
1113*789Sahrens  * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
1114*789Sahrens  * In this situation, if we're out of space but the resident size of the MFU is
1115*789Sahrens  * under the limit, victimize the MFU cache to satisfy this insertion request.
1116*789Sahrens  *
1117*789Sahrens  * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
1118*789Sahrens  * Here, we've used up all of the available space for the MRU, so we need to
1119*789Sahrens  * evict from our own cache instead.  Evict from the set of resident MRU
1120*789Sahrens  * entries.
1121*789Sahrens  *
1122*789Sahrens  * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
1123*789Sahrens  * c minus p represents the MFU space in the cache, since p is the size of the
1124*789Sahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
1125*789Sahrens  * the MFU side, so the MRU side needs to be victimized.
1126*789Sahrens  *
1127*789Sahrens  * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
1128*789Sahrens  * MFU's resident set is consuming more space than it has been allotted.  In
1129*789Sahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
1130*789Sahrens  */
1131*789Sahrens static void
1132*789Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes)
1133*789Sahrens {
1134*789Sahrens 	uint64_t	mru_used;
1135*789Sahrens 	uint64_t	mfu_space;
1136*789Sahrens 	uint64_t	evicted;
1137*789Sahrens 
1138*789Sahrens 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
1139*789Sahrens 
1140*789Sahrens 	if (state == arc.mru_top) {
1141*789Sahrens 		mru_used = arc.anon->size + arc.mru_top->size;
1142*789Sahrens 		if (arc.p > mru_used) {
1143*789Sahrens 			/* case 1 */
1144*789Sahrens 			evicted = arc_evict_state(arc.mfu_top, bytes);
1145*789Sahrens 			if (evicted < bytes) {
1146*789Sahrens 				arc_adjust();
1147*789Sahrens 			}
1148*789Sahrens 		} else {
1149*789Sahrens 			/* case 2 */
1150*789Sahrens 			evicted = arc_evict_state(arc.mru_top, bytes);
1151*789Sahrens 			if (evicted < bytes) {
1152*789Sahrens 				arc_adjust();
1153*789Sahrens 			}
1154*789Sahrens 		}
1155*789Sahrens 	} else {
1156*789Sahrens 		/* MFU_top case */
1157*789Sahrens 		mfu_space = arc.c - arc.p;
1158*789Sahrens 		if (mfu_space > arc.mfu_top->size) {
1159*789Sahrens 			/* case 3 */
1160*789Sahrens 			evicted = arc_evict_state(arc.mru_top, bytes);
1161*789Sahrens 			if (evicted < bytes) {
1162*789Sahrens 				arc_adjust();
1163*789Sahrens 			}
1164*789Sahrens 		} else {
1165*789Sahrens 			/* case 4 */
1166*789Sahrens 			evicted = arc_evict_state(arc.mfu_top, bytes);
1167*789Sahrens 			if (evicted < bytes) {
1168*789Sahrens 				arc_adjust();
1169*789Sahrens 			}
1170*789Sahrens 		}
1171*789Sahrens 	}
1172*789Sahrens }
1173*789Sahrens 
1174*789Sahrens /*
1175*789Sahrens  * This routine is called whenever a buffer is accessed.
1176*789Sahrens  */
1177*789Sahrens static void
1178*789Sahrens arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1179*789Sahrens {
1180*789Sahrens 	int		blksz, mult;
1181*789Sahrens 
1182*789Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1183*789Sahrens 
1184*789Sahrens 	blksz = buf->b_size;
1185*789Sahrens 
1186*789Sahrens 	if (buf->b_state == arc.anon) {
1187*789Sahrens 		/*
1188*789Sahrens 		 * This buffer is not in the cache, and does not
1189*789Sahrens 		 * appear in our "ghost" list.  Add the new buffer
1190*789Sahrens 		 * to the MRU state.
1191*789Sahrens 		 */
1192*789Sahrens 
1193*789Sahrens 		arc_try_grow(blksz);
1194*789Sahrens 		if (arc_evict_needed()) {
1195*789Sahrens 			arc_evict_for_state(arc.mru_top, blksz);
1196*789Sahrens 		}
1197*789Sahrens 
1198*789Sahrens 		ASSERT(buf->b_arc_access == 0);
1199*789Sahrens 		buf->b_arc_access = lbolt;
1200*789Sahrens 		DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
1201*789Sahrens 		    buf);
1202*789Sahrens 		arc_change_state(arc.mru_top, buf, hash_lock);
1203*789Sahrens 
1204*789Sahrens 		/*
1205*789Sahrens 		 * If we are using less than 2/3 of our total target
1206*789Sahrens 		 * cache size, bump up the target size for the MRU
1207*789Sahrens 		 * list.
1208*789Sahrens 		 */
1209*789Sahrens 		if (arc.size < arc.c*2/3) {
1210*789Sahrens 			arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
1211*789Sahrens 		}
1212*789Sahrens 
1213*789Sahrens 	} else if (buf->b_state == arc.mru_top) {
1214*789Sahrens 		/*
1215*789Sahrens 		 * If this buffer is in the MRU-top state and has the prefetch
1216*789Sahrens 		 * flag, the first read was actually part of a prefetch.  In
1217*789Sahrens 		 * this situation, we simply want to clear the flag and return.
1218*789Sahrens 		 * A subsequent access should bump this into the MFU state.
1219*789Sahrens 		 */
1220*789Sahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1221*789Sahrens 			buf->b_flags &= ~ARC_PREFETCH;
1222*789Sahrens 			atomic_add_64(&arc.mru_top->hits, 1);
1223*789Sahrens 			return;
1224*789Sahrens 		}
1225*789Sahrens 
1226*789Sahrens 		/*
1227*789Sahrens 		 * This buffer has been "accessed" only once so far,
1228*789Sahrens 		 * but it is still in the cache. Move it to the MFU
1229*789Sahrens 		 * state.
1230*789Sahrens 		 */
1231*789Sahrens 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1232*789Sahrens 			/*
1233*789Sahrens 			 * More than ARC_MINTIME has passed since we
1234*789Sahrens 			 * instantiated this buffer.  Move it to the
1235*789Sahrens 			 * most frequently used state.
1236*789Sahrens 			 */
1237*789Sahrens 			buf->b_arc_access = lbolt;
1238*789Sahrens 			DTRACE_PROBE1(new_state__mfu_top,
1239*789Sahrens 			    arc_buf_hdr_t *, buf);
1240*789Sahrens 			arc_change_state(arc.mfu_top, buf, hash_lock);
1241*789Sahrens 		}
1242*789Sahrens 		atomic_add_64(&arc.mru_top->hits, 1);
1243*789Sahrens 	} else if (buf->b_state == arc.mru_bot) {
1244*789Sahrens 		arc_state_t	*new_state;
1245*789Sahrens 		/*
1246*789Sahrens 		 * This buffer has been "accessed" recently, but
1247*789Sahrens 		 * was evicted from the cache.  Move it to the
1248*789Sahrens 		 * MFU state.
1249*789Sahrens 		 */
1250*789Sahrens 
1251*789Sahrens 		if (buf->b_flags & ARC_PREFETCH) {
1252*789Sahrens 			new_state = arc.mru_top;
1253*789Sahrens 			DTRACE_PROBE1(new_state__mru_top,
1254*789Sahrens 			    arc_buf_hdr_t *, buf);
1255*789Sahrens 		} else {
1256*789Sahrens 			new_state = arc.mfu_top;
1257*789Sahrens 			DTRACE_PROBE1(new_state__mfu_top,
1258*789Sahrens 			    arc_buf_hdr_t *, buf);
1259*789Sahrens 		}
1260*789Sahrens 
1261*789Sahrens 		arc_try_grow(blksz);
1262*789Sahrens 		if (arc_evict_needed()) {
1263*789Sahrens 			arc_evict_for_state(new_state, blksz);
1264*789Sahrens 		}
1265*789Sahrens 
1266*789Sahrens 		/* Bump up the target size of the MRU list */
1267*789Sahrens 		mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
1268*789Sahrens 		    1 : (arc.mfu_bot->size/arc.mru_bot->size));
1269*789Sahrens 		arc.p = MIN(arc.c, arc.p + blksz * mult);
1270*789Sahrens 
1271*789Sahrens 		buf->b_arc_access = lbolt;
1272*789Sahrens 		arc_change_state(new_state, buf, hash_lock);
1273*789Sahrens 
1274*789Sahrens 		atomic_add_64(&arc.mru_bot->hits, 1);
1275*789Sahrens 	} else if (buf->b_state == arc.mfu_top) {
1276*789Sahrens 		/*
1277*789Sahrens 		 * This buffer has been accessed more than once and is
1278*789Sahrens 		 * still in the cache.  Keep it in the MFU state.
1279*789Sahrens 		 *
1280*789Sahrens 		 * NOTE: the add_reference() that occurred when we did
1281*789Sahrens 		 * the arc_read() should have kicked this off the list,
1282*789Sahrens 		 * so even if it was a prefetch, it will be put back at
1283*789Sahrens 		 * the head of the list when we remove_reference().
1284*789Sahrens 		 */
1285*789Sahrens 		atomic_add_64(&arc.mfu_top->hits, 1);
1286*789Sahrens 	} else if (buf->b_state == arc.mfu_bot) {
1287*789Sahrens 		/*
1288*789Sahrens 		 * This buffer has been accessed more than once but has
1289*789Sahrens 		 * been evicted from the cache.  Move it back to the
1290*789Sahrens 		 * MFU state.
1291*789Sahrens 		 */
1292*789Sahrens 
1293*789Sahrens 		arc_try_grow(blksz);
1294*789Sahrens 		if (arc_evict_needed()) {
1295*789Sahrens 			arc_evict_for_state(arc.mfu_top, blksz);
1296*789Sahrens 		}
1297*789Sahrens 
1298*789Sahrens 		/* Bump up the target size for the MFU list */
1299*789Sahrens 		mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
1300*789Sahrens 		    1 : (arc.mru_bot->size/arc.mfu_bot->size));
1301*789Sahrens 		arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
1302*789Sahrens 
1303*789Sahrens 		buf->b_arc_access = lbolt;
1304*789Sahrens 		DTRACE_PROBE1(new_state__mfu_top,
1305*789Sahrens 		    arc_buf_hdr_t *, buf);
1306*789Sahrens 		arc_change_state(arc.mfu_top, buf, hash_lock);
1307*789Sahrens 
1308*789Sahrens 		atomic_add_64(&arc.mfu_bot->hits, 1);
1309*789Sahrens 	} else {
1310*789Sahrens 		ASSERT(!"invalid arc state");
1311*789Sahrens 	}
1312*789Sahrens 
1313*789Sahrens }
1314*789Sahrens 
1315*789Sahrens /* a generic arc_done_func_t which you can use */
1316*789Sahrens /* ARGSUSED */
1317*789Sahrens void
1318*789Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1319*789Sahrens {
1320*789Sahrens 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1321*789Sahrens 	arc_buf_free(buf, arg);
1322*789Sahrens }
1323*789Sahrens 
1324*789Sahrens /* a generic arc_done_func_t which you can use */
1325*789Sahrens void
1326*789Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1327*789Sahrens {
1328*789Sahrens 	arc_buf_t **bufp = arg;
1329*789Sahrens 	if (zio && zio->io_error) {
1330*789Sahrens 		arc_buf_free(buf, arg);
1331*789Sahrens 		*bufp = NULL;
1332*789Sahrens 	} else {
1333*789Sahrens 		*bufp = buf;
1334*789Sahrens 	}
1335*789Sahrens }
1336*789Sahrens 
1337*789Sahrens static void
1338*789Sahrens arc_read_done(zio_t *zio)
1339*789Sahrens {
1340*789Sahrens 	arc_buf_hdr_t	*hdr;
1341*789Sahrens 	arc_buf_t	*buf;
1342*789Sahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1343*789Sahrens 	kmutex_t	*hash_lock;
1344*789Sahrens 	arc_callback_t	*callback_list, *acb;
1345*789Sahrens 	int		freeable = FALSE;
1346*789Sahrens 
1347*789Sahrens 	buf = zio->io_private;
1348*789Sahrens 	hdr = buf->b_hdr;
1349*789Sahrens 
1350*789Sahrens 	if (!HDR_FREED_IN_READ(hdr)) {
1351*789Sahrens 		arc_buf_hdr_t *found;
1352*789Sahrens 
1353*789Sahrens 		found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1354*789Sahrens 		    &hash_lock);
1355*789Sahrens 
1356*789Sahrens 		/*
1357*789Sahrens 		 * Buffer was inserted into hash-table and removed from lists
1358*789Sahrens 		 * prior to starting I/O.  We should find this header, since
1359*789Sahrens 		 * it's in the hash table, and it should be legit since it's
1360*789Sahrens 		 * not possible to evict it during the I/O.
1361*789Sahrens 		 */
1362*789Sahrens 
1363*789Sahrens 		ASSERT(found);
1364*789Sahrens 		ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
1365*789Sahrens 	}
1366*789Sahrens 
1367*789Sahrens 	/* byteswap if necessary */
1368*789Sahrens 	callback_list = hdr->b_acb;
1369*789Sahrens 	ASSERT(callback_list != NULL);
1370*789Sahrens 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1371*789Sahrens 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1372*789Sahrens 
1373*789Sahrens 	/* create copies of the data buffer for the callers */
1374*789Sahrens 	abuf = buf;
1375*789Sahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
1376*789Sahrens 		if (acb->acb_done) {
1377*789Sahrens 			if (abuf == NULL) {
1378*789Sahrens 				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1379*789Sahrens 				abuf->b_data = zio_buf_alloc(hdr->b_size);
1380*789Sahrens 				atomic_add_64(&arc.size, hdr->b_size);
1381*789Sahrens 				bcopy(buf->b_data, abuf->b_data, hdr->b_size);
1382*789Sahrens 				abuf->b_hdr = hdr;
1383*789Sahrens 				abuf->b_next = hdr->b_buf;
1384*789Sahrens 				hdr->b_buf = abuf;
1385*789Sahrens 				atomic_add_64(&hdr->b_state->size, hdr->b_size);
1386*789Sahrens 			}
1387*789Sahrens 			acb->acb_buf = abuf;
1388*789Sahrens 			abuf = NULL;
1389*789Sahrens 		} else {
1390*789Sahrens 			/*
1391*789Sahrens 			 * The caller did not provide a callback function.
1392*789Sahrens 			 * In this case, we should just remove the reference.
1393*789Sahrens 			 */
1394*789Sahrens 			if (HDR_FREED_IN_READ(hdr)) {
1395*789Sahrens 				ASSERT3P(hdr->b_state, ==, arc.anon);
1396*789Sahrens 				(void) refcount_remove(&hdr->b_refcnt,
1397*789Sahrens 				    acb->acb_private);
1398*789Sahrens 			} else {
1399*789Sahrens 				(void) remove_reference(hdr, hash_lock,
1400*789Sahrens 				    acb->acb_private);
1401*789Sahrens 			}
1402*789Sahrens 		}
1403*789Sahrens 	}
1404*789Sahrens 	hdr->b_acb = NULL;
1405*789Sahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1406*789Sahrens 
1407*789Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1408*789Sahrens 
1409*789Sahrens 	if (zio->io_error != 0) {
1410*789Sahrens 		hdr->b_flags |= ARC_IO_ERROR;
1411*789Sahrens 		if (hdr->b_state != arc.anon)
1412*789Sahrens 			arc_change_state(arc.anon, hdr, hash_lock);
1413*789Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1414*789Sahrens 	}
1415*789Sahrens 
1416*789Sahrens 	if (!HDR_FREED_IN_READ(hdr)) {
1417*789Sahrens 		/*
1418*789Sahrens 		 * Only call arc_access on anonymous buffers.  This is because
1419*789Sahrens 		 * if we've issued an I/O for an evicted buffer, we've already
1420*789Sahrens 		 * called arc_access (to prevent any simultaneous readers from
1421*789Sahrens 		 * getting confused).
1422*789Sahrens 		 */
1423*789Sahrens 		if (zio->io_error == 0 && hdr->b_state == arc.anon)
1424*789Sahrens 			arc_access(hdr, hash_lock);
1425*789Sahrens 		mutex_exit(hash_lock);
1426*789Sahrens 	} else {
1427*789Sahrens 		/*
1428*789Sahrens 		 * This block was freed while we waited for the read to
1429*789Sahrens 		 * complete.  It has been removed from the hash table and
1430*789Sahrens 		 * moved to the anonymous state (so that it won't show up
1431*789Sahrens 		 * in the cache).
1432*789Sahrens 		 */
1433*789Sahrens 		ASSERT3P(hdr->b_state, ==, arc.anon);
1434*789Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1435*789Sahrens 	}
1436*789Sahrens 
1437*789Sahrens 	cv_broadcast(&hdr->b_cv);
1438*789Sahrens 
1439*789Sahrens 	/* execute each callback and free its structure */
1440*789Sahrens 	while ((acb = callback_list) != NULL) {
1441*789Sahrens 		if (acb->acb_done)
1442*789Sahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1443*789Sahrens 
1444*789Sahrens 		if (acb->acb_zio_dummy != NULL) {
1445*789Sahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
1446*789Sahrens 			zio_nowait(acb->acb_zio_dummy);
1447*789Sahrens 		}
1448*789Sahrens 
1449*789Sahrens 		callback_list = acb->acb_next;
1450*789Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1451*789Sahrens 	}
1452*789Sahrens 
1453*789Sahrens 	if (freeable)
1454*789Sahrens 		arc_hdr_free(hdr);
1455*789Sahrens }
1456*789Sahrens 
1457*789Sahrens /*
1458*789Sahrens  * "Read" the block at the specified DVA (in bp) via the
1459*789Sahrens  * cache.  If the block is found in the cache, invoke the provided
1460*789Sahrens  * callback immediately and return.  Note that the `zio' parameter
1461*789Sahrens  * in the callback will be NULL in this case, since no IO was
1462*789Sahrens  * required.  If the block is not in the cache, pass the read request
1463*789Sahrens  * on to the spa with a substitute callback function, so that the
1464*789Sahrens  * requested block will be added to the cache.
1465*789Sahrens  *
1466*789Sahrens  * If a read request arrives for a block that has a read in-progress,
1467*789Sahrens  * either wait for the in-progress read to complete (and return the
1468*789Sahrens  * results); or, if this is a read with a "done" func, add a record
1469*789Sahrens  * to the read to invoke the "done" func when the read completes,
1470*789Sahrens  * and return; or just return.
1471*789Sahrens  *
1472*789Sahrens  * arc_read_done() will invoke all the requested "done" functions
1473*789Sahrens  * for readers of this block.
1474*789Sahrens  */
1475*789Sahrens int
1476*789Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
1477*789Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
1478*789Sahrens     uint32_t arc_flags)
1479*789Sahrens {
1480*789Sahrens 	arc_buf_hdr_t *hdr;
1481*789Sahrens 	arc_buf_t *buf;
1482*789Sahrens 	kmutex_t *hash_lock;
1483*789Sahrens 	zio_t	*rzio;
1484*789Sahrens 
1485*789Sahrens top:
1486*789Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1487*789Sahrens 	if (hdr && hdr->b_buf) {
1488*789Sahrens 
1489*789Sahrens 		ASSERT((hdr->b_state == arc.mru_top) ||
1490*789Sahrens 		    (hdr->b_state == arc.mfu_top) ||
1491*789Sahrens 		    ((hdr->b_state == arc.anon) &&
1492*789Sahrens 		    (HDR_IO_IN_PROGRESS(hdr))));
1493*789Sahrens 
1494*789Sahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
1495*789Sahrens 
1496*789Sahrens 			if ((arc_flags & ARC_NOWAIT) && done) {
1497*789Sahrens 				arc_callback_t	*acb = NULL;
1498*789Sahrens 
1499*789Sahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
1500*789Sahrens 				    KM_SLEEP);
1501*789Sahrens 				acb->acb_done = done;
1502*789Sahrens 				acb->acb_private = private;
1503*789Sahrens 				acb->acb_byteswap = swap;
1504*789Sahrens 				if (pio != NULL)
1505*789Sahrens 					acb->acb_zio_dummy = zio_null(pio,
1506*789Sahrens 					    spa, NULL, NULL, flags);
1507*789Sahrens 
1508*789Sahrens 				ASSERT(acb->acb_done != NULL);
1509*789Sahrens 				acb->acb_next = hdr->b_acb;
1510*789Sahrens 				hdr->b_acb = acb;
1511*789Sahrens 				add_reference(hdr, hash_lock, private);
1512*789Sahrens 				mutex_exit(hash_lock);
1513*789Sahrens 				return (0);
1514*789Sahrens 			} else if (arc_flags & ARC_WAIT) {
1515*789Sahrens 				cv_wait(&hdr->b_cv, hash_lock);
1516*789Sahrens 				mutex_exit(hash_lock);
1517*789Sahrens 				goto top;
1518*789Sahrens 			}
1519*789Sahrens 
1520*789Sahrens 			mutex_exit(hash_lock);
1521*789Sahrens 			return (0);
1522*789Sahrens 		}
1523*789Sahrens 
1524*789Sahrens 		/*
1525*789Sahrens 		 * If there is already a reference on this block, create
1526*789Sahrens 		 * a new copy of the data so that we will be guaranteed
1527*789Sahrens 		 * that arc_release() will always succeed.
1528*789Sahrens 		 */
1529*789Sahrens 
1530*789Sahrens 		if (done)
1531*789Sahrens 			add_reference(hdr, hash_lock, private);
1532*789Sahrens 		if (done && refcount_count(&hdr->b_refcnt) > 1) {
1533*789Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1534*789Sahrens 			buf->b_data = zio_buf_alloc(hdr->b_size);
1535*789Sahrens 			ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
1536*789Sahrens 			atomic_add_64(&arc.size, hdr->b_size);
1537*789Sahrens 			bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
1538*789Sahrens 			buf->b_hdr = hdr;
1539*789Sahrens 			buf->b_next = hdr->b_buf;
1540*789Sahrens 			hdr->b_buf = buf;
1541*789Sahrens 			atomic_add_64(&hdr->b_state->size, hdr->b_size);
1542*789Sahrens 		} else {
1543*789Sahrens 			buf = hdr->b_buf;
1544*789Sahrens 		}
1545*789Sahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1546*789Sahrens 		arc_access(hdr, hash_lock);
1547*789Sahrens 		mutex_exit(hash_lock);
1548*789Sahrens 		atomic_add_64(&arc.hits, 1);
1549*789Sahrens 		if (done)
1550*789Sahrens 			done(NULL, buf, private);
1551*789Sahrens 	} else {
1552*789Sahrens 		uint64_t size = BP_GET_LSIZE(bp);
1553*789Sahrens 		arc_callback_t	*acb;
1554*789Sahrens 
1555*789Sahrens 		if (hdr == NULL) {
1556*789Sahrens 			/* this block is not in the cache */
1557*789Sahrens 			arc_buf_hdr_t	*exists;
1558*789Sahrens 
1559*789Sahrens 			buf = arc_buf_alloc(spa, size, private);
1560*789Sahrens 			hdr = buf->b_hdr;
1561*789Sahrens 			hdr->b_dva = *BP_IDENTITY(bp);
1562*789Sahrens 			hdr->b_birth = bp->blk_birth;
1563*789Sahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
1564*789Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1565*789Sahrens 			if (exists) {
1566*789Sahrens 				/* somebody beat us to the hash insert */
1567*789Sahrens 				mutex_exit(hash_lock);
1568*789Sahrens 				bzero(&hdr->b_dva, sizeof (dva_t));
1569*789Sahrens 				hdr->b_birth = 0;
1570*789Sahrens 				hdr->b_cksum0 = 0;
1571*789Sahrens 				arc_buf_free(buf, private);
1572*789Sahrens 				goto top; /* restart the IO request */
1573*789Sahrens 			}
1574*789Sahrens 
1575*789Sahrens 		} else {
1576*789Sahrens 			/* this block is in the ghost cache */
1577*789Sahrens 			ASSERT((hdr->b_state == arc.mru_bot) ||
1578*789Sahrens 			    (hdr->b_state == arc.mfu_bot));
1579*789Sahrens 			add_reference(hdr, hash_lock, private);
1580*789Sahrens 
1581*789Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1582*789Sahrens 			buf->b_data = zio_buf_alloc(hdr->b_size);
1583*789Sahrens 			atomic_add_64(&arc.size, hdr->b_size);
1584*789Sahrens 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1585*789Sahrens 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1586*789Sahrens 			buf->b_hdr = hdr;
1587*789Sahrens 			buf->b_next = NULL;
1588*789Sahrens 			hdr->b_buf = buf;
1589*789Sahrens 		}
1590*789Sahrens 
1591*789Sahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1592*789Sahrens 		acb->acb_done = done;
1593*789Sahrens 		acb->acb_private = private;
1594*789Sahrens 		acb->acb_byteswap = swap;
1595*789Sahrens 
1596*789Sahrens 		ASSERT(hdr->b_acb == NULL);
1597*789Sahrens 		hdr->b_acb = acb;
1598*789Sahrens 
1599*789Sahrens 		/*
1600*789Sahrens 		 * If this DVA is part of a prefetch, mark the buf
1601*789Sahrens 		 * header with the prefetch flag
1602*789Sahrens 		 */
1603*789Sahrens 		if (arc_flags & ARC_PREFETCH)
1604*789Sahrens 			hdr->b_flags |= ARC_PREFETCH;
1605*789Sahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
1606*789Sahrens 
1607*789Sahrens 		/*
1608*789Sahrens 		 * If the buffer has been evicted, migrate it to a present state
1609*789Sahrens 		 * before issuing the I/O.  Once we drop the hash-table lock,
1610*789Sahrens 		 * the header will be marked as I/O in progress and have an
1611*789Sahrens 		 * attached buffer.  At this point, anybody who finds this
1612*789Sahrens 		 * buffer ought to notice that it's legit but has a pending I/O.
1613*789Sahrens 		 */
1614*789Sahrens 
1615*789Sahrens 		if ((hdr->b_state == arc.mru_bot) ||
1616*789Sahrens 		    (hdr->b_state == arc.mfu_bot))
1617*789Sahrens 			arc_access(hdr, hash_lock);
1618*789Sahrens 
1619*789Sahrens 		mutex_exit(hash_lock);
1620*789Sahrens 
1621*789Sahrens 		ASSERT3U(hdr->b_size, ==, size);
1622*789Sahrens 		DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
1623*789Sahrens 		    uint64_t, size);
1624*789Sahrens 		atomic_add_64(&arc.misses, 1);
1625*789Sahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
1626*789Sahrens 		    arc_read_done, buf, priority, flags);
1627*789Sahrens 
1628*789Sahrens 		if (arc_flags & ARC_WAIT)
1629*789Sahrens 			return (zio_wait(rzio));
1630*789Sahrens 
1631*789Sahrens 		ASSERT(arc_flags & ARC_NOWAIT);
1632*789Sahrens 		zio_nowait(rzio);
1633*789Sahrens 	}
1634*789Sahrens 	return (0);
1635*789Sahrens }
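
/*
 * Illustrative sketch of a minimal synchronous caller of arc_read(),
 * built on the generic arc_getbuf_func() callback above.  Passing NULL
 * for the byteswap function assumes no byteswap is needed, and the
 * ZIO_PRIORITY_SYNC_READ / ZIO_FLAG_CANFAIL names are assumed from
 * zio.h; error handling beyond the return code is left to the caller.
 */
static int
example_read_block(spa_t *spa, blkptr_t *bp, arc_buf_t **bufp)
{
	int err;

	*bufp = NULL;
	err = arc_read(NULL, spa, bp, NULL, arc_getbuf_func, bufp,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT);
	/* on error, arc_getbuf_func() freed the buffer and left *bufp NULL */
	return (err);
}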
1636*789Sahrens 
1637*789Sahrens /*
1638*789Sahrens  * arc_read() variant to support pool traversal.  If the block is already
1639*789Sahrens  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
1640*789Sahrens  * The idea is that we don't want pool traversal filling up memory, but
1641*789Sahrens  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
1642*789Sahrens  */
1643*789Sahrens int
1644*789Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
1645*789Sahrens {
1646*789Sahrens 	arc_buf_hdr_t *hdr;
1647*789Sahrens 	kmutex_t *hash_mtx;
1648*789Sahrens 	int rc = 0;
1649*789Sahrens 
1650*789Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
1651*789Sahrens 
1652*789Sahrens 	if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
1653*789Sahrens 		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
1654*789Sahrens 	else
1655*789Sahrens 		rc = ENOENT;
1656*789Sahrens 
1657*789Sahrens 	if (hash_mtx)
1658*789Sahrens 		mutex_exit(hash_mtx);
1659*789Sahrens 
1660*789Sahrens 	return (rc);
1661*789Sahrens }
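
/*
 * Illustrative sketch of the pool-traversal pattern arc_tryread() is
 * meant for: take the cached copy when it is available, otherwise fall
 * back to the caller's own read path.  example_traverse_io() is a
 * hypothetical stand-in for that fallback, not a function in this tree.
 */
static int
example_traverse_read(spa_t *spa, blkptr_t *bp, void *data)
{
	if (arc_tryread(spa, bp, data) == 0)
		return (0);	/* served from the ARC, no I/O issued */

	/* ENOENT: not cached (or a read is in flight); do our own I/O */
	return (example_traverse_io(spa, bp, data));
}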
1662*789Sahrens 
1663*789Sahrens /*
1664*789Sahrens  * Release this buffer from the cache.  This must be done
1665*789Sahrens  * after a read and prior to modifying the buffer contents.
1666*789Sahrens  * If the buffer has more than one reference, we must make
1667*789Sahrens  * a new hdr for the buffer.
1668*789Sahrens  */
1669*789Sahrens void
1670*789Sahrens arc_release(arc_buf_t *buf, void *tag)
1671*789Sahrens {
1672*789Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
1673*789Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
1674*789Sahrens 
1675*789Sahrens 	/* this buffer is not on any list */
1676*789Sahrens 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
1677*789Sahrens 
1678*789Sahrens 	if (hdr->b_state == arc.anon) {
1679*789Sahrens 		/* this buffer is already released */
1680*789Sahrens 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1681*789Sahrens 		ASSERT(BUF_EMPTY(hdr));
1682*789Sahrens 		return;
1683*789Sahrens 	}
1684*789Sahrens 
1685*789Sahrens 	mutex_enter(hash_lock);
1686*789Sahrens 
1687*789Sahrens 	if (refcount_count(&hdr->b_refcnt) > 1) {
1688*789Sahrens 		arc_buf_hdr_t *nhdr;
1689*789Sahrens 		arc_buf_t **bufp;
1690*789Sahrens 		uint64_t blksz = hdr->b_size;
1691*789Sahrens 		spa_t *spa = hdr->b_spa;
1692*789Sahrens 
1693*789Sahrens 		/*
1694*789Sahrens 		 * Pull the data off of this buf and attach it to
1695*789Sahrens 		 * a new anonymous buf.
1696*789Sahrens 		 */
1697*789Sahrens 		bufp = &hdr->b_buf;
1698*789Sahrens 		while (*bufp != buf) {
1699*789Sahrens 			ASSERT(*bufp);
1700*789Sahrens 			bufp = &(*bufp)->b_next;
1701*789Sahrens 		}
1702*789Sahrens 		*bufp = (*bufp)->b_next;
1703*789Sahrens 		(void) refcount_remove(&hdr->b_refcnt, tag);
1704*789Sahrens 		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
1705*789Sahrens 		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
1706*789Sahrens 		mutex_exit(hash_lock);
1707*789Sahrens 
1708*789Sahrens 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1709*789Sahrens 		nhdr->b_size = blksz;
1710*789Sahrens 		nhdr->b_spa = spa;
1711*789Sahrens 		nhdr->b_buf = buf;
1712*789Sahrens 		nhdr->b_state = arc.anon;
1713*789Sahrens 		nhdr->b_arc_access = 0;
1714*789Sahrens 		nhdr->b_flags = 0;
1715*789Sahrens 		buf->b_hdr = nhdr;
1716*789Sahrens 		buf->b_next = NULL;
1717*789Sahrens 		(void) refcount_add(&nhdr->b_refcnt, tag);
1718*789Sahrens 		atomic_add_64(&arc.anon->size, blksz);
1719*789Sahrens 
1720*789Sahrens 		hdr = nhdr;
1721*789Sahrens 	} else {
1722*789Sahrens 		ASSERT(!list_link_active(&hdr->b_arc_node));
1723*789Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1724*789Sahrens 		arc_change_state(arc.anon, hdr, hash_lock);
1725*789Sahrens 		hdr->b_arc_access = 0;
1726*789Sahrens 		mutex_exit(hash_lock);
1727*789Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
1728*789Sahrens 		hdr->b_birth = 0;
1729*789Sahrens 		hdr->b_cksum0 = 0;
1730*789Sahrens 	}
1731*789Sahrens }
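
/*
 * Illustrative sketch of the read-modify-write pattern arc_release()
 * exists for: detach the buffer into the anonymous state before touching
 * its contents, then hand it to arc_write().  example_modify() is a
 * hypothetical stand-in, and the ZIO_* checksum/compress/priority names
 * are assumed from zio.h.
 */
static int
example_modify_block(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, void *tag)
{
	/* make the buffer private so cached readers never see the edits */
	arc_release(buf, tag);

	example_modify(buf->b_data, buf->b_hdr->b_size);	/* hypothetical */

	return (arc_write(pio, spa, ZIO_CHECKSUM_FLETCHER_2, ZIO_COMPRESS_OFF,
	    txg, bp, buf, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
	    ZIO_FLAG_CANFAIL, ARC_WAIT));
}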
1732*789Sahrens 
1733*789Sahrens int
1734*789Sahrens arc_released(arc_buf_t *buf)
1735*789Sahrens {
1736*789Sahrens 	return (buf->b_hdr->b_state == arc.anon);
1737*789Sahrens }
1738*789Sahrens 
1739*789Sahrens static void
1740*789Sahrens arc_write_done(zio_t *zio)
1741*789Sahrens {
1742*789Sahrens 	arc_buf_t *buf;
1743*789Sahrens 	arc_buf_hdr_t *hdr;
1744*789Sahrens 	arc_callback_t *acb;
1745*789Sahrens 
1746*789Sahrens 	buf = zio->io_private;
1747*789Sahrens 	hdr = buf->b_hdr;
1748*789Sahrens 	acb = hdr->b_acb;
1749*789Sahrens 	hdr->b_acb = NULL;
1750*789Sahrens 
1751*789Sahrens 	/* this buffer is on no lists and is not in the hash table */
1752*789Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
1753*789Sahrens 
1754*789Sahrens 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
1755*789Sahrens 	hdr->b_birth = zio->io_bp->blk_birth;
1756*789Sahrens 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
1757*789Sahrens 	/* clear the "in-write" flag */
1758*789Sahrens 	hdr->b_hash_next = NULL;
1759*789Sahrens 	/* This write may be all-zero */
1760*789Sahrens 	if (!BUF_EMPTY(hdr)) {
1761*789Sahrens 		arc_buf_hdr_t *exists;
1762*789Sahrens 		kmutex_t *hash_lock;
1763*789Sahrens 
1764*789Sahrens 		exists = buf_hash_insert(hdr, &hash_lock);
1765*789Sahrens 		if (exists) {
1766*789Sahrens 			/*
1767*789Sahrens 			 * This can only happen if we overwrite for
1768*789Sahrens 			 * sync-to-convergence, because we remove
1769*789Sahrens 			 * buffers from the hash table when we arc_free().
1770*789Sahrens 			 */
1771*789Sahrens 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
1772*789Sahrens 			    BP_IDENTITY(zio->io_bp)));
1773*789Sahrens 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
1774*789Sahrens 			    zio->io_bp->blk_birth);
1775*789Sahrens 
1776*789Sahrens 			ASSERT(refcount_is_zero(&exists->b_refcnt));
1777*789Sahrens 			arc_change_state(arc.anon, exists, hash_lock);
1778*789Sahrens 			mutex_exit(hash_lock);
1779*789Sahrens 			arc_hdr_free(exists);
1780*789Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1781*789Sahrens 			ASSERT3P(exists, ==, NULL);
1782*789Sahrens 		}
1783*789Sahrens 		arc_access(hdr, hash_lock);
1784*789Sahrens 		mutex_exit(hash_lock);
1785*789Sahrens 	}
1786*789Sahrens 	if (acb && acb->acb_done) {
1787*789Sahrens 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
1788*789Sahrens 		acb->acb_done(zio, buf, acb->acb_private);
1789*789Sahrens 	}
1790*789Sahrens 
1791*789Sahrens 	if (acb)
1792*789Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1793*789Sahrens }
1794*789Sahrens 
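/*
 * Write the contents of an anonymous (released) buffer via zio_write().
 * When the write completes, arc_write_done() fills in the header's DVA,
 * birth txg and checksum word from the resulting block pointer and, unless
 * the block pointer is empty (an all-zero write), inserts the header into
 * the hash table so the newly written block is visible to arc_read().
 */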
1795*789Sahrens int
1796*789Sahrens arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
1797*789Sahrens     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
1798*789Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
1799*789Sahrens     uint32_t arc_flags)
1800*789Sahrens {
1801*789Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
1802*789Sahrens 	arc_callback_t	*acb;
1803*789Sahrens 	zio_t	*rzio;
1804*789Sahrens 
1805*789Sahrens 	/* this is a private buffer - no locking required */
1806*789Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
1807*789Sahrens 	ASSERT(BUF_EMPTY(hdr));
1808*789Sahrens 	ASSERT(!HDR_IO_ERROR(hdr));
1809*789Sahrens 	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1810*789Sahrens 	acb->acb_done = done;
1811*789Sahrens 	acb->acb_private = private;
1812*789Sahrens 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
1813*789Sahrens 	hdr->b_acb = acb;
1814*789Sahrens 	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
1815*789Sahrens 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
1816*789Sahrens 
1817*789Sahrens 	if (arc_flags & ARC_WAIT)
1818*789Sahrens 		return (zio_wait(rzio));
1819*789Sahrens 
1820*789Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
1821*789Sahrens 	zio_nowait(rzio);
1822*789Sahrens 
1823*789Sahrens 	return (0);
1824*789Sahrens }
1825*789Sahrens 
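/*
 * Free the block described by bp.  If a copy is present in the cache,
 * move it to the anonymous state (destroying it immediately when it has
 * no references) so stale data cannot be returned for a freed block,
 * then issue the actual free via zio_free().
 */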
1826*789Sahrens int
1827*789Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
1828*789Sahrens     zio_done_func_t *done, void *private, uint32_t arc_flags)
1829*789Sahrens {
1830*789Sahrens 	arc_buf_hdr_t *ab;
1831*789Sahrens 	kmutex_t *hash_lock;
1832*789Sahrens 	zio_t	*zio;
1833*789Sahrens 
1834*789Sahrens 	/*
1835*789Sahrens 	 * If this buffer is in the cache, release it, so it
1836*789Sahrens 	 * can be re-used.
1837*789Sahrens 	 */
1838*789Sahrens 	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1839*789Sahrens 	if (ab != NULL) {
1840*789Sahrens 		/*
1841*789Sahrens 		 * The checksum of blocks to free is not always
1842*789Sahrens 	 * preserved (e.g., on the deadlist).  However, if it is
1843*789Sahrens 		 * nonzero, it should match what we have in the cache.
1844*789Sahrens 		 */
1845*789Sahrens 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
1846*789Sahrens 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
1847*789Sahrens 		arc_change_state(arc.anon, ab, hash_lock);
1848*789Sahrens 		if (refcount_is_zero(&ab->b_refcnt)) {
1849*789Sahrens 			mutex_exit(hash_lock);
1850*789Sahrens 			arc_hdr_free(ab);
1851*789Sahrens 			atomic_add_64(&arc.deleted, 1);
1852*789Sahrens 		} else {
1853*789Sahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
1854*789Sahrens 			if (HDR_IO_IN_PROGRESS(ab))
1855*789Sahrens 				ab->b_flags |= ARC_FREED_IN_READ;
1856*789Sahrens 			ab->b_arc_access = 0;
1857*789Sahrens 			bzero(&ab->b_dva, sizeof (dva_t));
1858*789Sahrens 			ab->b_birth = 0;
1859*789Sahrens 			ab->b_cksum0 = 0;
1860*789Sahrens 			mutex_exit(hash_lock);
1861*789Sahrens 		}
1862*789Sahrens 	}
1863*789Sahrens 
1864*789Sahrens 	zio = zio_free(pio, spa, txg, bp, done, private);
1865*789Sahrens 
1866*789Sahrens 	if (arc_flags & ARC_WAIT)
1867*789Sahrens 		return (zio_wait(zio));
1868*789Sahrens 
1869*789Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
1870*789Sahrens 	zio_nowait(zio);
1871*789Sahrens 
1872*789Sahrens 	return (0);
1873*789Sahrens }
1874*789Sahrens 
1875*789Sahrens void
1876*789Sahrens arc_tempreserve_clear(uint64_t tempreserve)
1877*789Sahrens {
1878*789Sahrens 	atomic_add_64(&arc_tempreserve, -tempreserve);
1879*789Sahrens 	ASSERT((int64_t)arc_tempreserve >= 0);
1880*789Sahrens }
1881*789Sahrens 
1882*789Sahrens int
1883*789Sahrens arc_tempreserve_space(uint64_t tempreserve)
1884*789Sahrens {
1885*789Sahrens #ifdef ZFS_DEBUG
1886*789Sahrens 	/*
1887*789Sahrens 	 * Once in a while, fail for no reason.  Everything should cope.
1888*789Sahrens 	 */
1889*789Sahrens 	if (spa_get_random(10000) == 0) {
1890*789Sahrens 		dprintf("forcing random failure\n");
1891*789Sahrens 		return (ERESTART);
1892*789Sahrens 	}
1893*789Sahrens #endif
1894*789Sahrens 	/*
1895*789Sahrens 	 * XXX This is kind of hacky.  The limit should be adjusted
1896*789Sahrens 	 * dynamically to keep the time to sync a dataset fixed (around
1897*789Sahrens 	 * 1-5 seconds?).
1898*789Sahrens 	 * Maybe should have some sort of locking?  If two requests come
1899*789Sahrens 	 * in concurrently, we might let them both succeed, when one of
1900*789Sahrens 	 * them should fail.  Not a huge deal.
1901*789Sahrens 	 */
1902*789Sahrens 
1903*789Sahrens 	ASSERT3U(tempreserve, <, arc.c/4); /* otherwise we'll loop forever */
1904*789Sahrens 
1905*789Sahrens 	if (arc_tempreserve + tempreserve + arc.anon->size > arc.c / 4) {
1906*789Sahrens 		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
1907*789Sahrens 		    "tempreserve=%lluK arc.c=%lluK\n",
1908*789Sahrens 		    arc_tempreserve>>10, arc.anon->lsize>>10,
1909*789Sahrens 		    tempreserve>>10, arc.c>>10);
1910*789Sahrens 		return (ERESTART);
1911*789Sahrens 	}
1912*789Sahrens 	atomic_add_64(&arc_tempreserve, tempreserve);
1913*789Sahrens 	return (0);
1914*789Sahrens }
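
/*
 * Illustrative sketch of how the reserve/clear pair above is meant to be
 * used: reserve before generating anonymous dirty data, back off and retry
 * on ERESTART, and clear the reservation once the data has been handed to
 * the syncer.  example_wait_for_sync() is a hypothetical back-off; real
 * callers split the reserve and the clear across the transaction lifecycle.
 */
static void
example_reserve_and_dirty(uint64_t nbytes)
{
	/* back off (e.g. wait for the open txg to sync) and retry */
	while (arc_tempreserve_space(nbytes) == ERESTART)
		example_wait_for_sync();	/* hypothetical */

	/* ... create up to nbytes of anonymous ARC buffers here ... */

	arc_tempreserve_clear(nbytes);
}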
1915*789Sahrens 
1916*789Sahrens void
1917*789Sahrens arc_init(void)
1918*789Sahrens {
1919*789Sahrens 	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
1920*789Sahrens 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
1921*789Sahrens 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
1922*789Sahrens 
1923*789Sahrens 	/* Start out with 1/8 of all memory */
1924*789Sahrens 	arc.c = physmem * PAGESIZE / 8;
1925*789Sahrens 
1926*789Sahrens #ifdef _KERNEL
1927*789Sahrens 	/*
1928*789Sahrens 	 * On architectures where the physical memory can be larger
1929*789Sahrens 	 * than the addressable space (e.g., Intel in 32-bit mode), we may
1930*789Sahrens 	 * need to limit the cache to 1/8 of VM size.
1931*789Sahrens 	 */
1932*789Sahrens 	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
1933*789Sahrens #endif
1934*789Sahrens 
1935*789Sahrens 	/* use at least 1/32 of all memory, or 64MB, whichever is more */
1936*789Sahrens 	arc.c_min = MAX(arc.c / 4, 64<<20);
1937*789Sahrens 	/* use at most 3/4 of all memory, or all but 1GB, whichever is more */
1938*789Sahrens 	if (arc.c * 8 >= 1<<30)
1939*789Sahrens 		arc.c_max = (arc.c * 8) - (1<<30);
1940*789Sahrens 	else
1941*789Sahrens 		arc.c_max = arc.c_min;
1942*789Sahrens 	arc.c_max = MAX(arc.c * 6, arc.c_max);
1943*789Sahrens 	arc.c = arc.c_max;
1944*789Sahrens 	arc.p = (arc.c >> 1);
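	/*
	 * Worked example (4GB of physical memory, before the
	 * kmem_debugging() adjustment below): arc.c starts at 512MB, so
	 * arc.c_min = MAX(128MB, 64MB) = 128MB and
	 * arc.c_max = MAX(512MB * 6, 4GB - 1GB) = 3GB; arc.c is then
	 * raised to arc.c_max and the MRU target arc.p starts at half of
	 * that, 1.5GB.
	 */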
1945*789Sahrens 
1946*789Sahrens 	/* if kmem_flags are set, let's try to use less memory */
1947*789Sahrens 	if (kmem_debugging())
1948*789Sahrens 		arc.c = arc.c / 2;
1949*789Sahrens 	if (arc.c < arc.c_min)
1950*789Sahrens 		arc.c = arc.c_min;
1951*789Sahrens 
1952*789Sahrens 	arc.anon = &ARC_anon;
1953*789Sahrens 	arc.mru_top = &ARC_mru_top;
1954*789Sahrens 	arc.mru_bot = &ARC_mru_bot;
1955*789Sahrens 	arc.mfu_top = &ARC_mfu_top;
1956*789Sahrens 	arc.mfu_bot = &ARC_mfu_bot;
1957*789Sahrens 
1958*789Sahrens 	list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
1959*789Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1960*789Sahrens 	list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
1961*789Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1962*789Sahrens 	list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
1963*789Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1964*789Sahrens 	list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
1965*789Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1966*789Sahrens 
1967*789Sahrens 	buf_init();
1968*789Sahrens 
1969*789Sahrens 	arc_thread_exit = 0;
1970*789Sahrens 
1971*789Sahrens 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
1972*789Sahrens 	    TS_RUN, minclsyspri);
1973*789Sahrens }
1974*789Sahrens 
1975*789Sahrens void
1976*789Sahrens arc_fini(void)
1977*789Sahrens {
1978*789Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1979*789Sahrens 	arc_thread_exit = 1;
1980*789Sahrens 	while (arc_thread_exit != 0)
1981*789Sahrens 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
1982*789Sahrens 	mutex_exit(&arc_reclaim_thr_lock);
1983*789Sahrens 
1984*789Sahrens 	arc_flush();
1985*789Sahrens 
1986*789Sahrens 	arc_dead = TRUE;
1987*789Sahrens 
1988*789Sahrens 	mutex_destroy(&arc_reclaim_lock);
1989*789Sahrens 	mutex_destroy(&arc_reclaim_thr_lock);
1990*789Sahrens 	cv_destroy(&arc_reclaim_thr_cv);
1991*789Sahrens 
1992*789Sahrens 	list_destroy(&arc.mru_top->list);
1993*789Sahrens 	list_destroy(&arc.mru_bot->list);
1994*789Sahrens 	list_destroy(&arc.mfu_top->list);
1995*789Sahrens 	list_destroy(&arc.mfu_bot->list);
1996*789Sahrens 
1997*789Sahrens 	buf_fini();
1998*789Sahrens }
1999