/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation and algorithms used here
 * are based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the "top" state mutex must be held before the "bot" state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */
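
/*
 * Illustrative sketch only (editorial addition, not part of the original
 * code): the two lock-acquisition patterns described above, using routines
 * defined later in this file.  The caller "tag" and the elided body are
 * hypothetical.
 *
 *	// Method 1: hash table lookup; buf_hash_find() returns with the
 *	// per-bucket hash mutex held (or sets *lockp to NULL on a miss).
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa, dva, birth, &hash_lock);
 *	if (hdr != NULL) {
 *		add_reference(hdr, hash_lock, tag);
 *		mutex_exit(hash_lock);
 *	}
 *
 *	// Method 2: walking an arc state list while holding its list mutex;
 *	// the hash lock must be taken with mutex_tryenter() to avoid
 *	// deadlocking against method 1.
 *	mutex_enter(&state->mtx);
 *	for (ab = list_tail(&state->list); ab; ab = list_prev(&state->list, ab)) {
 *		if (mutex_tryenter(HDR_LOCK(ab))) {
 *			// ... e.g. arc_change_state() ...
 *			mutex_exit(HDR_LOCK(ab));
 *		}
 *	}
 *	mutex_exit(&state->mtx);
 */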

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#endif
#include <sys/callb.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

static kmutex_t arc_reclaim_lock;
static int arc_dead;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru_top	- recently used, currently cached
 *	ARC_mru_bot	- recently used, no longer in cache
 *	ARC_mfu_top	- frequently used, currently cached
 *	ARC_mfu_bot	- frequently used, no longer in cache
 * When there are no active references to the buffer, they
 * are linked onto one of the lists in arc.  These are the
 * only buffers that can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru_top
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru_top list.
 */

typedef struct arc_state {
	list_t		list;	/* linked list of evictable buffer in state */
	uint64_t	lsize;	/* total size of buffers in the linked list */
	uint64_t	size;	/* total size of all buffers in this state */
	uint64_t	hits;
	kmutex_t	mtx;
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru_top;
static arc_state_t ARC_mru_bot;
static arc_state_t ARC_mfu_top;
static arc_state_t ARC_mfu_bot;

static struct arc {
	arc_state_t	*anon;
	arc_state_t	*mru_top;
	arc_state_t	*mru_bot;
	arc_state_t	*mfu_top;
	arc_state_t	*mfu_bot;
	uint64_t	size;		/* Actual total arc size */
	uint64_t	p;		/* Target size (in bytes) of mru_top */
	uint64_t	c;		/* Target size of cache (in bytes) */
	uint64_t	c_min;		/* Minimum target cache size */
	uint64_t	c_max;		/* Maximum target cache size */
	uint64_t	incr;		/* Size by which to increment arc.c */
	int64_t		size_check;

	/* performance stats */
	uint64_t	hits;
	uint64_t	misses;
	uint64_t	deleted;
	uint64_t	skipped;
	uint64_t	hash_elements;
	uint64_t	hash_elements_max;
	uint64_t	hash_collisions;
	uint64_t	hash_chains;
	uint32_t	hash_chain_max;

	int		no_grow;	/* Don't try to grow cache size */
} arc;

/* Default amount to grow arc.incr */
static int64_t arc_incr_size = 1024;

/* > 0 ==> time to increment arc.c */
static int64_t arc_size_check_default = -1000;

static uint64_t arc_tempreserve;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	arc_done_func_t		*acb_done;
	void			*acb_private;
	arc_byteswap_func_t	*acb_byteswap;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

struct arc_buf_hdr {
	/* immutable */
	uint64_t		b_size;
	spa_t			*b_spa;

	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;

	kcondvar_t		b_cv;
	arc_callback_t		*b_acb;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;
};
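
/*
 * Editorial note (not part of the original source): an arc_buf_hdr_t
 * describes a single block identity (b_dva, b_birth, b_size) and anchors
 * a singly linked list of arc_buf_t structures through b_buf/b_next.
 * Each arc_buf_t on that list carries its own b_data copy of the same
 * b_size bytes; arc_read_done() below clones the data once per caller
 * that supplied a "done" callback.
 */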

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
 */

#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */

#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	64

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(buf) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))

uint64_t zfs_crc64_table[256];

static uint64_t
buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
{
	uintptr_t spav = (uintptr_t)spa;
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spav>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static arc_buf_hdr_t *
buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
static kthread_t *fbufs_lastthread;
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t max, i;

	fbufs_lastthread = curthread;
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (i < sizeof (fbufs) / sizeof (fbufs[0]))
			fbufs[i] = fbuf;
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;

	/* collect some hash table performance data */
	if (i > 0) {
		atomic_add_64(&arc.hash_collisions, 1);
		if (i == 1)
			atomic_add_64(&arc.hash_chains, 1);
	}
	while (i > (max = arc.hash_chain_max) &&
	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
		continue;
	}
	atomic_add_64(&arc.hash_elements, 1);
	if (arc.hash_elements > arc.hash_elements_max)
		atomic_add_64(&arc.hash_elements_max, 1);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;

	/* collect some hash table performance data */
	atomic_add_64(&arc.hash_elements, -1);
	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		atomic_add_64(&arc.hash_chains, -1);
}
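
/*
 * Illustrative sketch only (editorial addition): the find-or-insert pattern
 * the routines above are used for later in this file (see arc_read()).  The
 * setup of "hdr" is hypothetical.
 *
 *	kmutex_t *hash_lock;
 *
 *	if (buf_hash_find(spa, dva, birth, &hash_lock) == NULL) {
 *		// miss: build a new header, then publish it.
 *		// buf_hash_insert() returns with *lockp (= hash_lock) held
 *		// either way, and hands back any identical header that
 *		// raced us into the table.
 *		arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *	}
 *	mutex_exit(hash_lock);
 */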

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
}

void arc_kmem_reclaim(void);

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	arc_kmem_reclaim();
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 10;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4k block size.  The table will take up
	 * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
	 * pointers).
	 */
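	/*
	 * Worked example (editorial addition): with 4GB of physical memory
	 * and 8-byte pointers, the loop below settles on
	 * hsize = 4GB / 4K = 1M buckets, i.e. an 8MB table (1M * 8 bytes);
	 * ht_mask = hsize - 1 works because hsize is a power of two.
	 */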
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

#define	ARC_TAG		(void *)0x05201962

static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc.anon)) {

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(!refcount_is_zero(&ab->b_refcnt));
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(&ab->b_state->list, ab);
		ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
		ab->b_state->lsize -= ab->b_size;
		mutex_exit(&ab->b_state->mtx);
	}
}

static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;

	ASSERT(MUTEX_HELD(hash_lock));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (ab->b_state != arc.anon)) {

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&ab->b_state->list, ab);
		ASSERT(ab->b_buf != NULL);
		ab->b_state->lsize += ab->b_size;
		mutex_exit(&ab->b_state->mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
    kmutex_t *hash_lock)
{
	arc_buf_t *buf;

	ASSERT(MUTEX_HELD(hash_lock));

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcount_is_zero(&ab->b_refcnt)) {
		if (ab->b_state != arc.anon) {
			int drop_mutex = FALSE;

			if (!MUTEX_HELD(&ab->b_state->mtx)) {
				mutex_enter(&ab->b_state->mtx);
				drop_mutex = TRUE;
			}
			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&ab->b_state->list, ab);
			ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
			ab->b_state->lsize -= ab->b_size;
			if (drop_mutex)
				mutex_exit(&ab->b_state->mtx);
		}
		if (new_state != arc.anon) {
			int drop_mutex = FALSE;

			if (!MUTEX_HELD(&new_state->mtx)) {
				mutex_enter(&new_state->mtx);
				drop_mutex = TRUE;
			}
			list_insert_head(&new_state->list, ab);
			ASSERT(ab->b_buf != NULL);
			new_state->lsize += ab->b_size;
			if (drop_mutex)
				mutex_exit(&new_state->mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	if (new_state == arc.anon && ab->b_state != arc.anon) {
		buf_hash_remove(ab);
	}

	/*
	 * If this buffer isn't being transferred to the MRU-top
	 * state, it's safe to clear its prefetch flag
	 */
	if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
		ab->b_flags &= ~ARC_PREFETCH;
	}

	buf = ab->b_buf;
	if (buf == NULL) {
		ASSERT3U(ab->b_state->size, >=, ab->b_size);
		atomic_add_64(&ab->b_state->size, -ab->b_size);
		/* we should only be here if we are deleting state */
		ASSERT(new_state == arc.anon &&
		    (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
	} else while (buf) {
		ASSERT3U(ab->b_state->size, >=, ab->b_size);
		atomic_add_64(&ab->b_state->size, -ab->b_size);
		atomic_add_64(&new_state->size, ab->b_size);
		buf = buf->b_next;
	}
	ab->b_state = new_state;
}

arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_spa = spa;
	hdr->b_state = arc.anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_next = NULL;
	buf->b_data = zio_buf_alloc(size);
	hdr->b_buf = buf;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	atomic_add_64(&arc.size, size);
	atomic_add_64(&arc.anon->size, size);

	return (buf);
}

static void
arc_hdr_free(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc.anon);

	if (!BUF_EMPTY(hdr)) {
		/*
		 * We can be called with an arc state lock held,
		 * so we can't hold a hash lock here.
		 * ASSERT(not in hash table)
		 */
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	if (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		ASSERT3U(hdr->b_size, >, 0);
		zio_buf_free(buf->b_data, hdr->b_size);
		atomic_add_64(&arc.size, -hdr->b_size);
		ASSERT3U(arc.anon->size, >=, hdr->b_size);
		atomic_add_64(&arc.anon->size, -hdr->b_size);
		ASSERT3P(buf->b_next, ==, NULL);
		kmem_cache_free(buf_cache, buf);
		hdr->b_buf = NULL;
	}
	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int freeable;

	mutex_enter(hash_lock);
	if (remove_reference(hdr, hash_lock, tag) > 0) {
		arc_buf_t **bufp = &hdr->b_buf;
		arc_state_t *state = hdr->b_state;
		uint64_t size = hdr->b_size;

		ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
		while (*bufp != buf) {
			ASSERT(*bufp);
			bufp = &(*bufp)->b_next;
		}
		*bufp = buf->b_next;
		mutex_exit(hash_lock);
		zio_buf_free(buf->b_data, size);
		atomic_add_64(&arc.size, -size);
		kmem_cache_free(buf_cache, buf);
		ASSERT3U(state->size, >=, size);
		atomic_add_64(&state->size, -size);
		return;
	}

	/* don't free buffers that are in the middle of an async write */
	freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
	mutex_exit(hash_lock);

	if (freeable)
		arc_hdr_free(hdr);
}

int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}
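
/*
 * Illustrative sketch only (editorial addition): the anonymous-buffer
 * lifecycle from a caller's point of view, using the interfaces above.
 * The "db" tag is hypothetical; any stable pointer can serve as a tag.
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, SPA_MAXBLOCKSIZE, db);
 *	// ... fill buf->b_data with the dirty block contents ...
 *	ASSERT(arc_buf_size(buf) == SPA_MAXBLOCKSIZE);
 *	arc_buf_free(buf, db);		// drops the "db" reference
 */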

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
 */
static uint64_t
arc_evict_state(arc_state_t *state, int64_t bytes)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0;
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;

	ASSERT(state == arc.mru_top || state == arc.mfu_top);

	if (state == arc.mru_top)
		evicted_state = arc.mru_bot;
	else
		evicted_state = arc.mfu_bot;

	mutex_enter(&state->mtx);
	mutex_enter(&evicted_state->mtx);

	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
			arc_change_state(evicted_state, ab, hash_lock);
			zio_buf_free(ab->b_buf->b_data, ab->b_size);
			atomic_add_64(&arc.size, -ab->b_size);
			ASSERT3P(ab->b_buf->b_next, ==, NULL);
			kmem_cache_free(buf_cache, ab->b_buf);
			ab->b_buf = NULL;
			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
			bytes_evicted += ab->b_size;
			mutex_exit(hash_lock);
			if (bytes_evicted >= bytes)
				break;
		} else {
			atomic_add_64(&arc.skipped, 1);
		}
	}
	mutex_exit(&evicted_state->mtx);
	mutex_exit(&state->mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %x",
		    (longlong_t)bytes_evicted, state);

	return (bytes_evicted);
}

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 */
static void
arc_delete_state(arc_state_t *state, int64_t bytes)
{
	uint_t bufs_skipped = 0;
	uint64_t bytes_deleted = 0;
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;

top:
	mutex_enter(&state->mtx);
	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			arc_change_state(arc.anon, ab, hash_lock);
			mutex_exit(hash_lock);
			atomic_add_64(&arc.deleted, 1);
			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			bytes_deleted += ab->b_size;
			arc_hdr_free(ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				mutex_exit(&state->mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->mtx);

	if (bufs_skipped) {
		atomic_add_64(&arc.skipped, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

static void
arc_adjust(void)
{
	int64_t top_sz, mru_over, arc_over;

	top_sz = arc.anon->size + arc.mru_top->size;

	if (top_sz > arc.p && arc.mru_top->lsize > 0) {
		int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
		(void) arc_evict_state(arc.mru_top, toevict);
		top_sz = arc.anon->size + arc.mru_top->size;
	}

	mru_over = top_sz + arc.mru_bot->size - arc.c;

	if (mru_over > 0) {
		if (arc.mru_bot->lsize > 0) {
			int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
			arc_delete_state(arc.mru_bot, todelete);
		}
	}

	if ((arc_over = arc.size - arc.c) > 0) {
		int64_t table_over;

		if (arc.mfu_top->lsize > 0) {
			int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
			(void) arc_evict_state(arc.mfu_top, toevict);
		}

		table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
		    - arc.c*2;

		if (table_over > 0 && arc.mfu_bot->lsize > 0) {
			int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
			arc_delete_state(arc.mfu_bot, todelete);
		}
	}
}

/*
 * Flush all *evictable* data from the cache.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
arc_flush(void)
{
	arc_delete_state(arc.mru_top, -1);
	arc_delete_state(arc.mfu_top, -1);

	arc_delete_state(arc.mru_bot, -1);
	arc_delete_state(arc.mfu_bot, -1);
}

void
arc_kmem_reclaim(void)
{
	/* Remove 6.25% */
	/*
	 * We need arc_reclaim_lock because we don't want multiple
	 * threads trying to reclaim concurrently.
	 */

	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().  So we set a flag to prevent
	 * accessing the destroyed mutexes and lists.
	 */
	if (arc_dead)
		return;

	mutex_enter(&arc_reclaim_lock);

	atomic_add_64(&arc.c, -(arc.c >> 4));
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;
	atomic_add_64(&arc.p, -(arc.p >> 4));

	arc_adjust();

	/* Cool it for a while */
	arc.incr = 0;
	arc.size_check = arc_size_check_default << 3;

	mutex_exit(&arc_reclaim_lock);
}

static int
arc_reclaim_needed(void)
{
	uint64_t extra;

#ifdef _KERNEL
	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed.  anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
#if defined(__i386)
	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
		return (1);
#endif

#else
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}

static void
arc_kmem_reap_now(arc_reclaim_strategy_t strat)
{
	size_t			i;
	kmem_cache_t		*prev_cache = NULL;
	extern kmem_cache_t	*zio_buf_cache[];

	/*
	 * an aggressive reclamation will shrink the cache size as well as
	 * reap free kmem buffers.  The arc_kmem_reclaim function is called
	 * when the header-cache is reaped, so we only reap the header cache
	 * if we're performing an aggressive reclaim.  If we're not, just
	 * clean the kmem buffer caches.
	 */
	if (strat == ARC_RECLAIM_AGGR)
		kmem_cache_reap_now(hdr_cache);

	kmem_cache_reap_now(buf_cache);

	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_now(zio_buf_cache[i]);
		}
	}
}

static void
arc_reclaim_thread(void)
{
	clock_t			growtime = 0;
	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		if (arc_reclaim_needed()) {

			if (arc.no_grow) {
				if (last_reclaim == ARC_RECLAIM_CONS) {
					last_reclaim = ARC_RECLAIM_AGGR;
				} else {
					last_reclaim = ARC_RECLAIM_CONS;
				}
			} else {
				arc.no_grow = TRUE;
				last_reclaim = ARC_RECLAIM_AGGR;
				membar_producer();
			}

			/* reset the growth delay for every reclaim */
			growtime = lbolt + (arc_grow_retry * hz);

			arc_kmem_reap_now(last_reclaim);

		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
			arc.no_grow = FALSE;
		}

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, (lbolt + hz));
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}

static void
arc_try_grow(int64_t bytes)
{
	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	atomic_add_64((uint64_t *)&arc.size_check, 1);

	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thr_cv);
		return;
	}

	if (arc.no_grow)
		return;

	/*
	 * return true if we successfully grow, or if there's enough space that
	 * we don't have to grow.  Above, we return false if we can't grow, or
	 * if we shouldn't because a reclaim is in progress.
	 */
	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
		if (arc.size_check > 0) {
			arc.size_check = arc_size_check_default;
			atomic_add_64(&arc.incr, arc_incr_size);
		}
		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
		if (arc.c > arc.c_max)
			arc.c = arc.c_max;
		else
			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
	} else if (arc.size > arc.c) {
		if (arc.size_check > 0) {
			arc.size_check = arc_size_check_default;
			atomic_add_64(&arc.incr, arc_incr_size);
		}
		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
		if (arc.c > arc.c_max)
			arc.c = arc.c_max;
		else
			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
	}
}

/*
 * check if the cache has reached its limits and eviction is required prior to
 * insert.  In this situation, we want to evict if no_grow is set.  Otherwise,
 * the cache is either big enough that we can insert, or an arc_try_grow will
 * result in more space being made available.
 */

static int
arc_evict_needed()
{

	if (arc_reclaim_needed())
		return (1);

	if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
		return (1);

	return (0);
}

/*
 * The state, supplied as the first argument, is going to have something
 * inserted on its behalf.  So, determine which cache must be victimized to
 * satisfy an insertion for this state.  We have the following cases:
 *
 * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
 * In this situation if we're out of space, but the resident size of the MFU is
 * under the limit, victimize the MFU cache to satisfy this insertion request.
 *
 * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
 * Here, we've used up all of the available space for the MRU, so we need to
 * evict from our own cache instead.  Evict from the set of resident MRU
 * entries.
 *
 * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
 * c minus p represents the MFU space in the cache, since p is the size of the
 * cache that is dedicated to the MRU.  In this situation there's still space on
In this situation there's still space on 1125*789Sahrens * the MFU side, so the MRU side needs to be victimized. 1126*789Sahrens * 1127*789Sahrens * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) -> 1128*789Sahrens * MFU's resident set is consuming more space than it has been allotted. In 1129*789Sahrens * this situation, we must victimize our own cache, the MFU, for this insertion. 1130*789Sahrens */ 1131*789Sahrens static void 1132*789Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes) 1133*789Sahrens { 1134*789Sahrens uint64_t mru_used; 1135*789Sahrens uint64_t mfu_space; 1136*789Sahrens uint64_t evicted; 1137*789Sahrens 1138*789Sahrens ASSERT(state == arc.mru_top || state == arc.mfu_top); 1139*789Sahrens 1140*789Sahrens if (state == arc.mru_top) { 1141*789Sahrens mru_used = arc.anon->size + arc.mru_top->size; 1142*789Sahrens if (arc.p > mru_used) { 1143*789Sahrens /* case 1 */ 1144*789Sahrens evicted = arc_evict_state(arc.mfu_top, bytes); 1145*789Sahrens if (evicted < bytes) { 1146*789Sahrens arc_adjust(); 1147*789Sahrens } 1148*789Sahrens } else { 1149*789Sahrens /* case 2 */ 1150*789Sahrens evicted = arc_evict_state(arc.mru_top, bytes); 1151*789Sahrens if (evicted < bytes) { 1152*789Sahrens arc_adjust(); 1153*789Sahrens } 1154*789Sahrens } 1155*789Sahrens } else { 1156*789Sahrens /* MFU_top case */ 1157*789Sahrens mfu_space = arc.c - arc.p; 1158*789Sahrens if (mfu_space > arc.mfu_top->size) { 1159*789Sahrens /* case 3 */ 1160*789Sahrens evicted = arc_evict_state(arc.mru_top, bytes); 1161*789Sahrens if (evicted < bytes) { 1162*789Sahrens arc_adjust(); 1163*789Sahrens } 1164*789Sahrens } else { 1165*789Sahrens /* case 4 */ 1166*789Sahrens evicted = arc_evict_state(arc.mfu_top, bytes); 1167*789Sahrens if (evicted < bytes) { 1168*789Sahrens arc_adjust(); 1169*789Sahrens } 1170*789Sahrens } 1171*789Sahrens } 1172*789Sahrens } 1173*789Sahrens 1174*789Sahrens /* 1175*789Sahrens * This routine is called whenever a buffer is accessed. 1176*789Sahrens */ 1177*789Sahrens static void 1178*789Sahrens arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1179*789Sahrens { 1180*789Sahrens int blksz, mult; 1181*789Sahrens 1182*789Sahrens ASSERT(MUTEX_HELD(hash_lock)); 1183*789Sahrens 1184*789Sahrens blksz = buf->b_size; 1185*789Sahrens 1186*789Sahrens if (buf->b_state == arc.anon) { 1187*789Sahrens /* 1188*789Sahrens * This buffer is not in the cache, and does not 1189*789Sahrens * appear in our "ghost" list. Add the new buffer 1190*789Sahrens * to the MRU state. 1191*789Sahrens */ 1192*789Sahrens 1193*789Sahrens arc_try_grow(blksz); 1194*789Sahrens if (arc_evict_needed()) { 1195*789Sahrens arc_evict_for_state(arc.mru_top, blksz); 1196*789Sahrens } 1197*789Sahrens 1198*789Sahrens ASSERT(buf->b_arc_access == 0); 1199*789Sahrens buf->b_arc_access = lbolt; 1200*789Sahrens DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *, 1201*789Sahrens buf); 1202*789Sahrens arc_change_state(arc.mru_top, buf, hash_lock); 1203*789Sahrens 1204*789Sahrens /* 1205*789Sahrens * If we are using less than 2/3 of our total target 1206*789Sahrens * cache size, bump up the target size for the MRU 1207*789Sahrens * list. 1208*789Sahrens */ 1209*789Sahrens if (arc.size < arc.c*2/3) { 1210*789Sahrens arc.p = arc.anon->size + arc.mru_top->size + arc.c/6; 1211*789Sahrens } 1212*789Sahrens 1213*789Sahrens } else if (buf->b_state == arc.mru_top) { 1214*789Sahrens /* 1215*789Sahrens * If this buffer is in the MRU-top state and has the prefetch 1216*789Sahrens * flag, the first read was actually part of a prefetch. 
In 1217*789Sahrens * this situation, we simply want to clear the flag and return. 1218*789Sahrens * A subsequent access should bump this into the MFU state. 1219*789Sahrens */ 1220*789Sahrens if ((buf->b_flags & ARC_PREFETCH) != 0) { 1221*789Sahrens buf->b_flags &= ~ARC_PREFETCH; 1222*789Sahrens atomic_add_64(&arc.mru_top->hits, 1); 1223*789Sahrens return; 1224*789Sahrens } 1225*789Sahrens 1226*789Sahrens /* 1227*789Sahrens * This buffer has been "accessed" only once so far, 1228*789Sahrens * but it is still in the cache. Move it to the MFU 1229*789Sahrens * state. 1230*789Sahrens */ 1231*789Sahrens if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1232*789Sahrens /* 1233*789Sahrens * More than 125ms have passed since we 1234*789Sahrens * instantiated this buffer. Move it to the 1235*789Sahrens * most frequently used state. 1236*789Sahrens */ 1237*789Sahrens buf->b_arc_access = lbolt; 1238*789Sahrens DTRACE_PROBE1(new_state__mfu_top, 1239*789Sahrens arc_buf_hdr_t *, buf); 1240*789Sahrens arc_change_state(arc.mfu_top, buf, hash_lock); 1241*789Sahrens } 1242*789Sahrens atomic_add_64(&arc.mru_top->hits, 1); 1243*789Sahrens } else if (buf->b_state == arc.mru_bot) { 1244*789Sahrens arc_state_t *new_state; 1245*789Sahrens /* 1246*789Sahrens * This buffer has been "accessed" recently, but 1247*789Sahrens * was evicted from the cache. Move it to the 1248*789Sahrens * MFU state. 1249*789Sahrens */ 1250*789Sahrens 1251*789Sahrens if (buf->b_flags & ARC_PREFETCH) { 1252*789Sahrens new_state = arc.mru_top; 1253*789Sahrens DTRACE_PROBE1(new_state__mru_top, 1254*789Sahrens arc_buf_hdr_t *, buf); 1255*789Sahrens } else { 1256*789Sahrens new_state = arc.mfu_top; 1257*789Sahrens DTRACE_PROBE1(new_state__mfu_top, 1258*789Sahrens arc_buf_hdr_t *, buf); 1259*789Sahrens } 1260*789Sahrens 1261*789Sahrens arc_try_grow(blksz); 1262*789Sahrens if (arc_evict_needed()) { 1263*789Sahrens arc_evict_for_state(new_state, blksz); 1264*789Sahrens } 1265*789Sahrens 1266*789Sahrens /* Bump up the target size of the MRU list */ 1267*789Sahrens mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ? 1268*789Sahrens 1 : (arc.mfu_bot->size/arc.mru_bot->size)); 1269*789Sahrens arc.p = MIN(arc.c, arc.p + blksz * mult); 1270*789Sahrens 1271*789Sahrens buf->b_arc_access = lbolt; 1272*789Sahrens arc_change_state(new_state, buf, hash_lock); 1273*789Sahrens 1274*789Sahrens atomic_add_64(&arc.mru_bot->hits, 1); 1275*789Sahrens } else if (buf->b_state == arc.mfu_top) { 1276*789Sahrens /* 1277*789Sahrens * This buffer has been accessed more than once and is 1278*789Sahrens * still in the cache. Keep it in the MFU state. 1279*789Sahrens * 1280*789Sahrens * NOTE: the add_reference() that occurred when we did 1281*789Sahrens * the arc_read() should have kicked this off the list, 1282*789Sahrens * so even if it was a prefetch, it will be put back at 1283*789Sahrens * the head of the list when we remove_reference(). 1284*789Sahrens */ 1285*789Sahrens atomic_add_64(&arc.mfu_top->hits, 1); 1286*789Sahrens } else if (buf->b_state == arc.mfu_bot) { 1287*789Sahrens /* 1288*789Sahrens * This buffer has been accessed more than once but has 1289*789Sahrens * been evicted from the cache. Move it back to the 1290*789Sahrens * MFU state. 
1291*789Sahrens */ 1292*789Sahrens 1293*789Sahrens arc_try_grow(blksz); 1294*789Sahrens if (arc_evict_needed()) { 1295*789Sahrens arc_evict_for_state(arc.mfu_top, blksz); 1296*789Sahrens } 1297*789Sahrens 1298*789Sahrens /* Bump up the target size for the MFU list */ 1299*789Sahrens mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ? 1300*789Sahrens 1 : (arc.mru_bot->size/arc.mfu_bot->size)); 1301*789Sahrens arc.p = MAX(0, (int64_t)arc.p - blksz * mult); 1302*789Sahrens 1303*789Sahrens buf->b_arc_access = lbolt; 1304*789Sahrens DTRACE_PROBE1(new_state__mfu_top, 1305*789Sahrens arc_buf_hdr_t *, buf); 1306*789Sahrens arc_change_state(arc.mfu_top, buf, hash_lock); 1307*789Sahrens 1308*789Sahrens atomic_add_64(&arc.mfu_bot->hits, 1); 1309*789Sahrens } else { 1310*789Sahrens ASSERT(!"invalid arc state"); 1311*789Sahrens } 1312*789Sahrens 1313*789Sahrens } 1314*789Sahrens 1315*789Sahrens /* a generic arc_done_func_t which you can use */ 1316*789Sahrens /* ARGSUSED */ 1317*789Sahrens void 1318*789Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1319*789Sahrens { 1320*789Sahrens bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1321*789Sahrens arc_buf_free(buf, arg); 1322*789Sahrens } 1323*789Sahrens 1324*789Sahrens /* a generic arc_done_func_t which you can use */ 1325*789Sahrens void 1326*789Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1327*789Sahrens { 1328*789Sahrens arc_buf_t **bufp = arg; 1329*789Sahrens if (zio && zio->io_error) { 1330*789Sahrens arc_buf_free(buf, arg); 1331*789Sahrens *bufp = NULL; 1332*789Sahrens } else { 1333*789Sahrens *bufp = buf; 1334*789Sahrens } 1335*789Sahrens } 1336*789Sahrens 1337*789Sahrens static void 1338*789Sahrens arc_read_done(zio_t *zio) 1339*789Sahrens { 1340*789Sahrens arc_buf_hdr_t *hdr; 1341*789Sahrens arc_buf_t *buf; 1342*789Sahrens arc_buf_t *abuf; /* buffer we're assigning to callback */ 1343*789Sahrens kmutex_t *hash_lock; 1344*789Sahrens arc_callback_t *callback_list, *acb; 1345*789Sahrens int freeable = FALSE; 1346*789Sahrens 1347*789Sahrens buf = zio->io_private; 1348*789Sahrens hdr = buf->b_hdr; 1349*789Sahrens 1350*789Sahrens if (!HDR_FREED_IN_READ(hdr)) { 1351*789Sahrens arc_buf_hdr_t *found; 1352*789Sahrens 1353*789Sahrens found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1354*789Sahrens &hash_lock); 1355*789Sahrens 1356*789Sahrens /* 1357*789Sahrens * Buffer was inserted into hash-table and removed from lists 1358*789Sahrens * prior to starting I/O. We should find this header, since 1359*789Sahrens * it's in the hash table, and it should be legit since it's 1360*789Sahrens * not possible to evict it during the I/O. 
1361*789Sahrens */ 1362*789Sahrens 1363*789Sahrens ASSERT(found); 1364*789Sahrens ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))); 1365*789Sahrens } 1366*789Sahrens 1367*789Sahrens /* byteswap if necessary */ 1368*789Sahrens callback_list = hdr->b_acb; 1369*789Sahrens ASSERT(callback_list != NULL); 1370*789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1371*789Sahrens callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1372*789Sahrens 1373*789Sahrens /* create copies of the data buffer for the callers */ 1374*789Sahrens abuf = buf; 1375*789Sahrens for (acb = callback_list; acb; acb = acb->acb_next) { 1376*789Sahrens if (acb->acb_done) { 1377*789Sahrens if (abuf == NULL) { 1378*789Sahrens abuf = kmem_cache_alloc(buf_cache, KM_SLEEP); 1379*789Sahrens abuf->b_data = zio_buf_alloc(hdr->b_size); 1380*789Sahrens atomic_add_64(&arc.size, hdr->b_size); 1381*789Sahrens bcopy(buf->b_data, abuf->b_data, hdr->b_size); 1382*789Sahrens abuf->b_hdr = hdr; 1383*789Sahrens abuf->b_next = hdr->b_buf; 1384*789Sahrens hdr->b_buf = abuf; 1385*789Sahrens atomic_add_64(&hdr->b_state->size, hdr->b_size); 1386*789Sahrens } 1387*789Sahrens acb->acb_buf = abuf; 1388*789Sahrens abuf = NULL; 1389*789Sahrens } else { 1390*789Sahrens /* 1391*789Sahrens * The caller did not provide a callback function. 1392*789Sahrens * In this case, we should just remove the reference. 1393*789Sahrens */ 1394*789Sahrens if (HDR_FREED_IN_READ(hdr)) { 1395*789Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 1396*789Sahrens (void) refcount_remove(&hdr->b_refcnt, 1397*789Sahrens acb->acb_private); 1398*789Sahrens } else { 1399*789Sahrens (void) remove_reference(hdr, hash_lock, 1400*789Sahrens acb->acb_private); 1401*789Sahrens } 1402*789Sahrens } 1403*789Sahrens } 1404*789Sahrens hdr->b_acb = NULL; 1405*789Sahrens hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1406*789Sahrens 1407*789Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 1408*789Sahrens 1409*789Sahrens if (zio->io_error != 0) { 1410*789Sahrens hdr->b_flags |= ARC_IO_ERROR; 1411*789Sahrens if (hdr->b_state != arc.anon) 1412*789Sahrens arc_change_state(arc.anon, hdr, hash_lock); 1413*789Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 1414*789Sahrens } 1415*789Sahrens 1416*789Sahrens if (!HDR_FREED_IN_READ(hdr)) { 1417*789Sahrens /* 1418*789Sahrens * Only call arc_access on anonymous buffers. This is because 1419*789Sahrens * if we've issued an I/O for an evicted buffer, we've already 1420*789Sahrens * called arc_access (to prevent any simultaneous readers from 1421*789Sahrens * getting confused). 1422*789Sahrens */ 1423*789Sahrens if (zio->io_error == 0 && hdr->b_state == arc.anon) 1424*789Sahrens arc_access(hdr, hash_lock); 1425*789Sahrens mutex_exit(hash_lock); 1426*789Sahrens } else { 1427*789Sahrens /* 1428*789Sahrens * This block was freed while we waited for the read to 1429*789Sahrens * complete. It has been removed from the hash table and 1430*789Sahrens * moved to the anonymous state (so that it won't show up 1431*789Sahrens * in the cache). 
		 */
		ASSERT3P(hdr->b_state, ==, arc.anon);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	cv_broadcast(&hdr->b_cv);

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_free(hdr);
}

/*
 * "Read" the block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no IO was
 * required.  If the block is not in the cache, pass the read request
 * on to the spa with a substitute callback function, so that the
 * requested block will be added to the cache.
 *
 * If a read request arrives for a block that has a read in-progress,
 * either wait for the in-progress read to complete (and return the
 * results); or, if this is a read with a "done" func, add a record
 * to the read to invoke the "done" func when the read completes,
 * and return; or just return.
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
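 *
 * As an illustrative sketch only (the spa, bp, byteswap function and
 * zio priority/flag values are assumed to come from the caller), a
 * reader that wants the data synchronously can use the generic
 * arc_getbuf_func() callback defined above:
 *
 *	arc_buf_t *abuf = NULL;
 *
 *	error = arc_read(NULL, spa, bp, byteswap, arc_getbuf_func,
 *	    &abuf, priority, flags, ARC_WAIT);
 *	if (error == 0 && abuf != NULL) {
 *		... use abuf->b_data ...
 *		arc_buf_free(abuf, &abuf);
 *	}
 *
 * The &abuf passed as `private' doubles as the reference tag, so the
 * same tag is used when the buffer is eventually freed.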
 */
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t arc_flags)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	zio_t *rzio;

top:
	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (hdr && hdr->b_buf) {

		ASSERT((hdr->b_state == arc.mru_top) ||
		    (hdr->b_state == arc.mfu_top) ||
		    ((hdr->b_state == arc.anon) &&
		    (HDR_IO_IN_PROGRESS(hdr))));

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if ((arc_flags & ARC_NOWAIT) && done) {
				arc_callback_t *acb = NULL;

				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				acb->acb_byteswap = swap;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_acb;
				hdr->b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			} else if (arc_flags & ARC_WAIT) {
				cv_wait(&hdr->b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}

			mutex_exit(hash_lock);
			return (0);
		}

		/*
		 * If there is already a reference on this block, create
		 * a new copy of the data so that we will be guaranteed
		 * that arc_release() will always succeed.
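		 * (The copy shares this header; arc_release() can later
		 * unlink it and give it a private anonymous header without
		 * disturbing the other references.)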
		 */

		if (done)
			add_reference(hdr, hash_lock, private);
		if (done && refcount_count(&hdr->b_refcnt) > 1) {
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_data = zio_buf_alloc(hdr->b_size);
			ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
			atomic_add_64(&arc.size, hdr->b_size);
			bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
			buf->b_hdr = hdr;
			buf->b_next = hdr->b_buf;
			hdr->b_buf = buf;
			atomic_add_64(&hdr->b_state->size, hdr->b_size);
		} else {
			buf = hdr->b_buf;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		atomic_add_64(&arc.hits, 1);
		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists;

			buf = arc_buf_alloc(spa, size, private);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				arc_buf_free(buf, private);
				goto top; /* restart the IO request */
			}

		} else {
			/* this block is in the ghost cache */
			ASSERT((hdr->b_state == arc.mru_bot) ||
			    (hdr->b_state == arc.mfu_bot));
			add_reference(hdr, hash_lock, private);

			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_data = zio_buf_alloc(hdr->b_size);
			atomic_add_64(&arc.size, hdr->b_size);
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
			buf->b_hdr = hdr;
			buf->b_next = NULL;
			hdr->b_buf = buf;
		}

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;

		/*
		 * If this DVA is part of a prefetch, mark the buf
		 * header with the prefetch flag.
		 */
		if (arc_flags & ARC_PREFETCH)
			hdr->b_flags |= ARC_PREFETCH;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present
		 * state before issuing the I/O.  Once we drop the hash-table
		 * lock, the header will be marked as I/O in progress and
		 * have an attached buffer.
		 * At this point, anybody who finds this buffer ought to
		 * notice that it's legit but has a pending I/O.
		 */

		if ((hdr->b_state == arc.mru_bot) ||
		    (hdr->b_state == arc.mfu_bot))
			arc_access(hdr, hash_lock);

		mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
		    uint64_t, size);
		atomic_add_64(&arc.misses, 1);
		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags);

		if (arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}

/*
 * arc_read() variant to support pool traversal.  If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
 * The idea is that we don't want pool traversal filling up memory, but
 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
 */
int
arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_mtx;
	int rc = 0;

	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);

	if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
	else
		rc = ENOENT;

	if (hash_mtx)
		mutex_exit(hash_mtx);

	return (rc);
}

/*
 * Release this buffer from the cache.  This must be done
 * after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc.anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		return;
	}

	mutex_enter(hash_lock);

	if (refcount_count(&hdr->b_refcnt) > 1) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;

		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
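		 * (That is, unlink this arc_buf_t from the shared header's
		 * b_buf list and hand it a freshly allocated header in the
		 * anon state; the remaining references keep the old header.)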
		 */
		bufp = &hdr->b_buf;
		while (*bufp != buf) {
			ASSERT(*bufp);
			bufp = &(*bufp)->b_next;
		}
		*bufp = (*bufp)->b_next;
		(void) refcount_remove(&hdr->b_refcnt, tag);
		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
		mutex_exit(hash_lock);

		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_buf = buf;
		nhdr->b_state = arc.anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		buf->b_hdr = nhdr;
		buf->b_next = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		atomic_add_64(&arc.anon->size, blksz);

		hdr = nhdr;
	} else {
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc.anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
}

int
arc_released(arc_buf_t *buf)
{
	return (buf->b_hdr->b_state == arc.anon);
}

static void
arc_write_done(zio_t *zio)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr;
	arc_callback_t *acb;

	buf = zio->io_private;
	hdr = buf->b_hdr;
	acb = hdr->b_acb;
	hdr->b_acb = NULL;

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc.anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/* clear the "in-write" flag */
	hdr->b_hash_next = NULL;
	/* This write may be all-zero */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
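			 * The stale header is moved to the anon state and
			 * freed below, after which the insert is retried
			 * and must succeed.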
			 */
			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
			    BP_IDENTITY(zio->io_bp)));
			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
			    zio->io_bp->blk_birth);

			ASSERT(refcount_is_zero(&exists->b_refcnt));
			arc_change_state(arc.anon, exists, hash_lock);
			mutex_exit(hash_lock);
			arc_hdr_free(exists);
			exists = buf_hash_insert(hdr, &hash_lock);
			ASSERT3P(exists, ==, NULL);
		}
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	}
	if (acb && acb->acb_done) {
		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
		acb->acb_done(zio, buf, acb->acb_private);
	}

	if (acb)
		kmem_free(acb, sizeof (arc_callback_t));
}

int
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t arc_flags)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_callback_t *acb;
	zio_t *rzio;

	/* this is a private buffer - no locking required */
	ASSERT3P(hdr->b_state, ==, arc.anon);
	ASSERT(BUF_EMPTY(hdr));
	ASSERT(!HDR_IO_ERROR(hdr));
	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
	acb->acb_done = done;
	acb->acb_private = private;
	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
	hdr->b_acb = acb;
	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(rzio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(rzio);

	return (0);
}

int
arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, uint32_t arc_flags)
{
	arc_buf_hdr_t *ab;
	kmutex_t *hash_lock;
	zio_t *zio;

	/*
	 * If this buffer is in the cache, release it, so it
	 * can be re-used.
	 */
	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (ab != NULL) {
		/*
		 * The checksum of blocks to free is not always
		 * preserved (e.g. on the deadlist).  However, if it is
		 * nonzero, it should match what we have in the cache.
		 */
		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
		arc_change_state(arc.anon, ab, hash_lock);
		if (refcount_is_zero(&ab->b_refcnt)) {
			mutex_exit(hash_lock);
			arc_hdr_free(ab);
			atomic_add_64(&arc.deleted, 1);
		} else {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
			if (HDR_IO_IN_PROGRESS(ab))
				ab->b_flags |= ARC_FREED_IN_READ;
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			mutex_exit(hash_lock);
		}
	}

	zio = zio_free(pio, spa, txg, bp, done, private);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(zio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(zio);

	return (0);
}

void
arc_tempreserve_clear(uint64_t tempreserve)
{
	atomic_add_64(&arc_tempreserve, -tempreserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

int
arc_tempreserve_space(uint64_t tempreserve)
{
#ifdef ZFS_DEBUG
	/*
	 * Once in a while, fail for no reason.  Everything should cope.
	 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (ERESTART);
	}
#endif
	/*
	 * XXX This is kind of hacky.  The limit should be adjusted
	 * dynamically to keep the time to sync a dataset fixed (around
	 * 1-5 seconds?).
	 * Maybe we should have some sort of locking?  If two requests come
	 * in concurrently, we might let them both succeed, when one of
	 * them should fail.  Not a huge deal.
	 */

	ASSERT3U(tempreserve, <, arc.c/4); /* otherwise we'll loop forever */

	if (arc_tempreserve + tempreserve + arc.anon->size > arc.c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
		    "tempreserve=%lluK arc.c=%lluK\n",
		    arc_tempreserve>>10, arc.anon->lsize>>10,
		    tempreserve>>10, arc.c>>10);
		return (ERESTART);
	}
	atomic_add_64(&arc_tempreserve, tempreserve);
	return (0);
}

void
arc_init(void)
{
	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);

	/* Start out with 1/8 of all memory */
	arc.c = physmem * PAGESIZE / 8;

#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
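	 *
	 * (As a purely illustrative example of the sizing below, ignoring
	 * this limit and with kmem debugging off: given 4GB of physical
	 * memory, arc.c starts at 512MB, arc.c_min becomes
	 * MAX(128MB, 64MB) = 128MB, arc.c_max becomes
	 * MAX(6 * 512MB, 8 * 512MB - 1GB) = 3GB, and arc.c/arc.p are
	 * then initialized to 3GB and 1.5GB.)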
	 */
	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif

	/* use at least 1/32 of all memory, or 64MB, whichever is more */
	arc.c_min = MAX(arc.c / 4, 64<<20);
	/* use at most 3/4 of all memory, or all but 1GB, whichever is more */
	if (arc.c * 8 >= 1<<30)
		arc.c_max = (arc.c * 8) - (1<<30);
	else
		arc.c_max = arc.c_min;
	arc.c_max = MAX(arc.c * 6, arc.c_max);
	arc.c = arc.c_max;
	arc.p = (arc.c >> 1);

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc.c = arc.c / 2;
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;

	arc.anon = &ARC_anon;
	arc.mru_top = &ARC_mru_top;
	arc.mru_bot = &ARC_mru_bot;
	arc.mfu_top = &ARC_mfu_top;
	arc.mfu_bot = &ARC_mfu_bot;

	list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));

	buf_init();

	arc_thread_exit = 0;

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
arc_fini(void)
{
	mutex_enter(&arc_reclaim_thr_lock);
	arc_thread_exit = 1;
	while (arc_thread_exit != 0)
		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
	mutex_exit(&arc_reclaim_thr_lock);

	arc_flush();

	arc_dead = TRUE;

	mutex_destroy(&arc_reclaim_lock);
	mutex_destroy(&arc_reclaim_thr_lock);
	cv_destroy(&arc_reclaim_thr_cv);

	list_destroy(&arc.mru_top->list);
	list_destroy(&arc.mru_bot->list);
	list_destroy(&arc.mfu_top->list);
	list_destroy(&arc.mfu_bot->list);

	buf_fini();
}
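
/*
 * Illustrative usage sketches for the interfaces above.  These are
 * examples only, not part of the ARC itself: the spa, txg, bp, buf,
 * checksum, compress, priority and flag values are assumed to be
 * supplied by the caller, and error handling is elided.
 *
 * Synchronously write out a released (anonymous) buffer:
 *
 *	ASSERT(arc_released(buf));
 *	error = arc_write(NULL, spa, checksum, compress, txg, bp,
 *	    buf, NULL, NULL, priority, flags, ARC_WAIT);
 *
 * Synchronously free a block, dropping any cached copy first:
 *
 *	error = arc_free(NULL, spa, txg, bp, NULL, NULL, ARC_WAIT);
 *
 * A caller about to dirty anonymous buffers might bracket that work
 * with the throttle above; ERESTART from arc_tempreserve_space()
 * means the reservation does not currently fit and should be retried
 * later:
 *
 *	if (arc_tempreserve_space(nbytes) == 0) {
 *		... dirty up to nbytes of anonymous buffers ...
 *		arc_tempreserve_clear(nbytes);
 *	}
 */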