/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.
 * We therefore provide two types of locks: 1) the hash table
 * lock array, and 2) the arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>

#ifdef __NetBSD__
#include <uvm/uvm.h>
#ifndef btop
#define	btop(x)		((x) / PAGE_SIZE)
#endif
#define	needfree	(uvmexp.free < uvmexp.freetarg ? uvmexp.freetarg : 0)
#define	buf_init	arc_buf_init
#define	freemem		uvmexp.free
#define	minfree		uvmexp.freemin
#define	desfree		uvmexp.freetarg
#define	lotsfree	(desfree * 2)
#define	availrmem	desfree
#define	swapfs_minfree	0
#define	swapfs_reserve	0
#undef	curproc
#define	curproc		curlwp
#define	proc_pageout	uvm.pagedaemon_lwp

#define	heap_arena	kernel_map
#define	VMEM_ALLOC	1
#define	VMEM_FREE	2
static inline size_t
vmem_size(struct vm_map *map, int flag)
{
	switch (flag) {
	case VMEM_ALLOC:
		return map->size;
	case VMEM_FREE:
		return vm_map_max(map) - vm_map_min(map) - map->size;
	case VMEM_FREE|VMEM_ALLOC:
		return vm_map_max(map) - vm_map_min(map);
	default:
		panic("vmem_size");
	}
}
static void *zio_arena;
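
/*
 * A minimal sketch (kept out of the build) of how the vmem_size() shim
 * above can be used.  The reclaim logic later in this file shrinks the
 * cache when most of the kernel heap address space is allocated; the
 * helper name below is only for the sake of this example, and the 3/4
 * threshold mirrors the comment in arc_reclaim_needed().
 */
#if 0
static int
example_heap_mostly_allocated(void)
{
	size_t total = vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE);
	size_t inuse = vmem_size(heap_arena, VMEM_ALLOC);

	/* "reclaim if more than 3/4ths of the heap is allocated" */
	return (inuse > (total / 4) * 3);
}
#endif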

#include <sys/callback.h>
/* Structures used for memory and kva space reclaim. */
static struct callback_entry arc_kva_reclaim_entry;
static struct uvm_reclaim_hook arc_hook;

#endif	/* __NetBSD__ */

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are the
 * only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
259 */ 260 261 typedef struct arc_state { 262 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 263 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 264 uint64_t arcs_size; /* total amount of data in this state */ 265 kmutex_t arcs_mtx; 266 } arc_state_t; 267 268 /* The 6 states: */ 269 static arc_state_t ARC_anon; 270 static arc_state_t ARC_mru; 271 static arc_state_t ARC_mru_ghost; 272 static arc_state_t ARC_mfu; 273 static arc_state_t ARC_mfu_ghost; 274 static arc_state_t ARC_l2c_only; 275 276 typedef struct arc_stats { 277 kstat_named_t arcstat_hits; 278 kstat_named_t arcstat_misses; 279 kstat_named_t arcstat_demand_data_hits; 280 kstat_named_t arcstat_demand_data_misses; 281 kstat_named_t arcstat_demand_metadata_hits; 282 kstat_named_t arcstat_demand_metadata_misses; 283 kstat_named_t arcstat_prefetch_data_hits; 284 kstat_named_t arcstat_prefetch_data_misses; 285 kstat_named_t arcstat_prefetch_metadata_hits; 286 kstat_named_t arcstat_prefetch_metadata_misses; 287 kstat_named_t arcstat_mru_hits; 288 kstat_named_t arcstat_mru_ghost_hits; 289 kstat_named_t arcstat_mfu_hits; 290 kstat_named_t arcstat_mfu_ghost_hits; 291 kstat_named_t arcstat_deleted; 292 kstat_named_t arcstat_recycle_miss; 293 kstat_named_t arcstat_mutex_miss; 294 kstat_named_t arcstat_evict_skip; 295 kstat_named_t arcstat_evict_l2_cached; 296 kstat_named_t arcstat_evict_l2_eligible; 297 kstat_named_t arcstat_evict_l2_ineligible; 298 kstat_named_t arcstat_hash_elements; 299 kstat_named_t arcstat_hash_elements_max; 300 kstat_named_t arcstat_hash_collisions; 301 kstat_named_t arcstat_hash_chains; 302 kstat_named_t arcstat_hash_chain_max; 303 kstat_named_t arcstat_p; 304 kstat_named_t arcstat_c; 305 kstat_named_t arcstat_c_min; 306 kstat_named_t arcstat_c_max; 307 kstat_named_t arcstat_size; 308 kstat_named_t arcstat_hdr_size; 309 kstat_named_t arcstat_data_size; 310 kstat_named_t arcstat_other_size; 311 kstat_named_t arcstat_l2_hits; 312 kstat_named_t arcstat_l2_misses; 313 kstat_named_t arcstat_l2_feeds; 314 kstat_named_t arcstat_l2_rw_clash; 315 kstat_named_t arcstat_l2_read_bytes; 316 kstat_named_t arcstat_l2_write_bytes; 317 kstat_named_t arcstat_l2_writes_sent; 318 kstat_named_t arcstat_l2_writes_done; 319 kstat_named_t arcstat_l2_writes_error; 320 kstat_named_t arcstat_l2_writes_hdr_miss; 321 kstat_named_t arcstat_l2_evict_lock_retry; 322 kstat_named_t arcstat_l2_evict_reading; 323 kstat_named_t arcstat_l2_free_on_write; 324 kstat_named_t arcstat_l2_abort_lowmem; 325 kstat_named_t arcstat_l2_cksum_bad; 326 kstat_named_t arcstat_l2_io_error; 327 kstat_named_t arcstat_l2_size; 328 kstat_named_t arcstat_l2_hdr_size; 329 kstat_named_t arcstat_memory_throttle_count; 330 } arc_stats_t; 331 332 static arc_stats_t arc_stats = { 333 { "hits", KSTAT_DATA_UINT64 }, 334 { "misses", KSTAT_DATA_UINT64 }, 335 { "demand_data_hits", KSTAT_DATA_UINT64 }, 336 { "demand_data_misses", KSTAT_DATA_UINT64 }, 337 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 338 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 339 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 340 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 341 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 342 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 343 { "mru_hits", KSTAT_DATA_UINT64 }, 344 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 345 { "mfu_hits", KSTAT_DATA_UINT64 }, 346 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 347 { "deleted", KSTAT_DATA_UINT64 }, 348 { "recycle_miss", KSTAT_DATA_UINT64 }, 349 { "mutex_miss", KSTAT_DATA_UINT64 }, 
350 { "evict_skip", KSTAT_DATA_UINT64 }, 351 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 352 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 353 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 354 { "hash_elements", KSTAT_DATA_UINT64 }, 355 { "hash_elements_max", KSTAT_DATA_UINT64 }, 356 { "hash_collisions", KSTAT_DATA_UINT64 }, 357 { "hash_chains", KSTAT_DATA_UINT64 }, 358 { "hash_chain_max", KSTAT_DATA_UINT64 }, 359 { "p", KSTAT_DATA_UINT64 }, 360 { "c", KSTAT_DATA_UINT64 }, 361 { "c_min", KSTAT_DATA_UINT64 }, 362 { "c_max", KSTAT_DATA_UINT64 }, 363 { "size", KSTAT_DATA_UINT64 }, 364 { "hdr_size", KSTAT_DATA_UINT64 }, 365 { "data_size", KSTAT_DATA_UINT64 }, 366 { "other_size", KSTAT_DATA_UINT64 }, 367 { "l2_hits", KSTAT_DATA_UINT64 }, 368 { "l2_misses", KSTAT_DATA_UINT64 }, 369 { "l2_feeds", KSTAT_DATA_UINT64 }, 370 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 371 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 372 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 373 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 374 { "l2_writes_done", KSTAT_DATA_UINT64 }, 375 { "l2_writes_error", KSTAT_DATA_UINT64 }, 376 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 377 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 378 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 379 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 380 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 381 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 382 { "l2_io_error", KSTAT_DATA_UINT64 }, 383 { "l2_size", KSTAT_DATA_UINT64 }, 384 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 385 { "memory_throttle_count", KSTAT_DATA_UINT64 } 386 }; 387 388 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 389 390 #define ARCSTAT_INCR(stat, val) \ 391 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 392 393 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 394 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 395 396 #define ARCSTAT_MAX(stat, val) { \ 397 uint64_t m; \ 398 while ((val) > (m = arc_stats.stat.value.ui64) && \ 399 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 400 continue; \ 401 } 402 403 #define ARCSTAT_MAXSTAT(stat) \ 404 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 405 406 /* 407 * We define a macro to allow ARC hits/misses to be easily broken down by 408 * two separate conditions, giving a total of four different subtypes for 409 * each of hits and misses (so eight statistics total). 410 */ 411 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 412 if (cond1) { \ 413 if (cond2) { \ 414 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 415 } else { \ 416 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 417 } \ 418 } else { \ 419 if (cond2) { \ 420 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 421 } else { \ 422 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 423 } \ 424 } 425 426 kstat_t *arc_ksp; 427 static arc_state_t *arc_anon; 428 static arc_state_t *arc_mru; 429 static arc_state_t *arc_mru_ghost; 430 static arc_state_t *arc_mfu; 431 static arc_state_t *arc_mfu_ghost; 432 static arc_state_t *arc_l2c_only; 433 434 /* 435 * There are several ARC variables that are critical to export as kstats -- 436 * but we don't want to have to grovel around in the kstat whenever we wish to 437 * manipulate them. For these variables, we therefore define them to be in 438 * terms of the statistic variable. This assures that we are not introducing 439 * the possibility of inconsistency by having shadow copies of the variables, 440 * while still allowing the code to be readable. 
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;
static uint64_t		arc_meta_used;
static uint64_t		arc_meta_limit;
static uint64_t		arc_meta_max = 0;

typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	arc_callback_t		*b_acb;
	kcondvar_t		b_cv;

	/* immutable */
	arc_buf_contents_t	b_type;
	uint64_t		b_size;
	uint64_t		b_spa;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	l2arc_buf_hdr_t		*b_l2hdr;
	list_node_t		b_l2node;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);

static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
532 */ 533 534 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 535 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 536 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 537 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 538 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 539 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 540 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 541 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 542 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 543 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 544 545 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 546 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 547 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 548 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 549 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 550 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 551 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 552 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 553 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 554 (hdr)->b_l2hdr != NULL) 555 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 556 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 557 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 558 559 /* 560 * Other sizes 561 */ 562 563 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 564 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 565 566 /* 567 * Hash table routines 568 */ 569 570 #define HT_LOCK_PAD 64 571 572 struct ht_lock { 573 kmutex_t ht_lock; 574 #ifdef _KERNEL 575 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 576 #endif 577 }; 578 579 #define BUF_LOCKS 256 580 typedef struct buf_hash_table { 581 uint64_t ht_mask; 582 arc_buf_hdr_t **ht_table; 583 struct ht_lock ht_locks[BUF_LOCKS]; 584 } buf_hash_table_t; 585 586 static buf_hash_table_t buf_hash_table; 587 588 #define BUF_HASH_INDEX(spa, dva, birth) \ 589 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 590 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 591 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 592 #define HDR_LOCK(buf) \ 593 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 594 595 uint64_t zfs_crc64_table[256]; 596 597 /* 598 * Level 2 ARC 599 */ 600 601 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 602 #define L2ARC_HEADROOM 2 /* num of writes */ 603 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 604 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 605 606 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 607 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 608 609 /* 610 * L2ARC Performance Tunables 611 */ 612 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 613 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 614 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 615 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 616 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 617 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 618 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 619 boolean_t l2arc_norw = 
B_TRUE; /* no reads during writes */ 620 621 /* 622 * L2ARC Internals 623 */ 624 typedef struct l2arc_dev { 625 vdev_t *l2ad_vdev; /* vdev */ 626 spa_t *l2ad_spa; /* spa */ 627 uint64_t l2ad_hand; /* next write location */ 628 uint64_t l2ad_write; /* desired write size, bytes */ 629 uint64_t l2ad_boost; /* warmup write boost, bytes */ 630 uint64_t l2ad_start; /* first addr on device */ 631 uint64_t l2ad_end; /* last addr on device */ 632 uint64_t l2ad_evict; /* last addr eviction reached */ 633 boolean_t l2ad_first; /* first sweep through */ 634 boolean_t l2ad_writing; /* currently writing */ 635 list_t *l2ad_buflist; /* buffer list */ 636 list_node_t l2ad_node; /* device list node */ 637 } l2arc_dev_t; 638 639 static list_t L2ARC_dev_list; /* device list */ 640 static list_t *l2arc_dev_list; /* device list pointer */ 641 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 642 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 643 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 644 static list_t L2ARC_free_on_write; /* free after write buf list */ 645 static list_t *l2arc_free_on_write; /* free after write list ptr */ 646 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 647 static uint64_t l2arc_ndev; /* number of devices */ 648 649 typedef struct l2arc_read_callback { 650 arc_buf_t *l2rcb_buf; /* read buffer */ 651 spa_t *l2rcb_spa; /* spa */ 652 blkptr_t l2rcb_bp; /* original blkptr */ 653 zbookmark_t l2rcb_zb; /* original bookmark */ 654 int l2rcb_flags; /* original flags */ 655 } l2arc_read_callback_t; 656 657 typedef struct l2arc_write_callback { 658 l2arc_dev_t *l2wcb_dev; /* device info */ 659 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 660 } l2arc_write_callback_t; 661 662 struct l2arc_buf_hdr { 663 /* protected by arc_buf_hdr mutex */ 664 l2arc_dev_t *b_dev; /* L2ARC device */ 665 uint64_t b_daddr; /* disk address, offset byte */ 666 }; 667 668 typedef struct l2arc_data_free { 669 /* protected by l2arc_free_on_write_mtx */ 670 void *l2df_data; 671 size_t l2df_size; 672 void (*l2df_func)(void *, size_t); 673 list_node_t l2df_list_node; 674 } l2arc_data_free_t; 675 676 static kmutex_t l2arc_feed_thr_lock; 677 static kcondvar_t l2arc_feed_thr_cv; 678 static uint8_t l2arc_thread_exit; 679 680 static void l2arc_read_done(zio_t *zio); 681 static void l2arc_hdr_stat_add(void); 682 static void l2arc_hdr_stat_remove(void); 683 684 static uint64_t 685 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 686 { 687 uint8_t *vdva = (uint8_t *)dva; 688 uint64_t crc = -1ULL; 689 int i; 690 691 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 692 693 for (i = 0; i < sizeof (dva_t); i++) 694 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 695 696 crc ^= (spa>>8) ^ birth; 697 698 return (crc); 699 } 700 701 #define BUF_EMPTY(buf) \ 702 ((buf)->b_dva.dva_word[0] == 0 && \ 703 (buf)->b_dva.dva_word[1] == 0 && \ 704 (buf)->b_birth == 0) 705 706 #define BUF_EQUAL(spa, dva, birth, buf) \ 707 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 708 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 709 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 710 711 static arc_buf_hdr_t * 712 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 713 { 714 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 715 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 716 arc_buf_hdr_t *buf; 717 718 mutex_enter(hash_lock); 719 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 720 buf = buf->b_hash_next) { 721 if (BUF_EQUAL(spa, dva, 
		    birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}
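
/*
 * A short usage sketch for the hash routines above (kept out of the
 * build); the function and variable names are only illustrative.
 * buf_hash_find() returns with the bucket lock held when it finds a
 * header, so the caller is responsible for dropping it; on a miss the
 * lock has already been dropped and *lockp is set to NULL.
 */
#if 0
static void
example_hash_lookup(uint64_t guid, const dva_t *dva, uint64_t birth)
{
	kmutex_t *hash_lock;
	arc_buf_hdr_t *hdr;

	hdr = buf_hash_find(guid, dva, birth, &hash_lock);
	if (hdr != NULL) {
		/* hdr fields are stable while hash_lock is held */
		mutex_exit(hash_lock);
	}
}
#endif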

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	ASSERT(BUF_EMPTY(buf));
	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
	mutex_destroy(&buf->b_freeze_lock);
	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	rw_destroy(&buf->b_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL ||
	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

static int
arc_cksum_equal(arc_buf_t *buf)
{
	zio_cksum_t zc;
	int equal;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
	mutex_exit(&buf->b_hdr->b_freeze_lock);

	return (equal);
}

static void
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
986 arc_buf_thaw(arc_buf_t *buf) 987 { 988 if (zfs_flags & ZFS_DEBUG_MODIFY) { 989 if (buf->b_hdr->b_state != arc_anon) 990 panic("modifying non-anon buffer!"); 991 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 992 panic("modifying buffer while i/o in progress!"); 993 arc_cksum_verify(buf); 994 } 995 996 mutex_enter(&buf->b_hdr->b_freeze_lock); 997 if (buf->b_hdr->b_freeze_cksum != NULL) { 998 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 999 buf->b_hdr->b_freeze_cksum = NULL; 1000 } 1001 mutex_exit(&buf->b_hdr->b_freeze_lock); 1002 } 1003 1004 void 1005 arc_buf_freeze(arc_buf_t *buf) 1006 { 1007 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1008 return; 1009 1010 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1011 buf->b_hdr->b_state == arc_anon); 1012 arc_cksum_compute(buf, B_FALSE); 1013 } 1014 1015 static void 1016 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1017 { 1018 ASSERT(MUTEX_HELD(hash_lock)); 1019 1020 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1021 (ab->b_state != arc_anon)) { 1022 uint64_t delta = ab->b_size * ab->b_datacnt; 1023 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 1024 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1025 1026 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 1027 mutex_enter(&ab->b_state->arcs_mtx); 1028 ASSERT(list_link_active(&ab->b_arc_node)); 1029 list_remove(list, ab); 1030 if (GHOST_STATE(ab->b_state)) { 1031 ASSERT3U(ab->b_datacnt, ==, 0); 1032 ASSERT3P(ab->b_buf, ==, NULL); 1033 delta = ab->b_size; 1034 } 1035 ASSERT(delta > 0); 1036 ASSERT3U(*size, >=, delta); 1037 atomic_add_64(size, -delta); 1038 mutex_exit(&ab->b_state->arcs_mtx); 1039 /* remove the prefetch flag if we get a reference */ 1040 if (ab->b_flags & ARC_PREFETCH) 1041 ab->b_flags &= ~ARC_PREFETCH; 1042 } 1043 } 1044 1045 static int 1046 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1047 { 1048 int cnt; 1049 arc_state_t *state = ab->b_state; 1050 1051 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1052 ASSERT(!GHOST_STATE(state)); 1053 1054 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1055 (state != arc_anon)) { 1056 uint64_t *size = &state->arcs_lsize[ab->b_type]; 1057 1058 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1059 mutex_enter(&state->arcs_mtx); 1060 ASSERT(!list_link_active(&ab->b_arc_node)); 1061 list_insert_head(&state->arcs_list[ab->b_type], ab); 1062 ASSERT(ab->b_datacnt > 0); 1063 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1064 mutex_exit(&state->arcs_mtx); 1065 } 1066 return (cnt); 1067 } 1068 1069 /* 1070 * Move the supplied buffer to the indicated state. The mutex 1071 * for the buffer must be held by the caller. 1072 */ 1073 static void 1074 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1075 { 1076 arc_state_t *old_state = ab->b_state; 1077 int64_t refcnt = refcount_count(&ab->b_refcnt); 1078 uint64_t from_delta, to_delta; 1079 1080 ASSERT(MUTEX_HELD(hash_lock)); 1081 ASSERT(new_state != old_state); 1082 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1083 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1084 ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon); 1085 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1086 1087 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1088 1089 /* 1090 * If this buffer is evictable, transfer it from the 1091 * old state list to the new state list. 
1092 */ 1093 if (refcnt == 0) { 1094 if (old_state != arc_anon) { 1095 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1096 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1097 1098 if (use_mutex) 1099 mutex_enter(&old_state->arcs_mtx); 1100 1101 ASSERT(list_link_active(&ab->b_arc_node)); 1102 list_remove(&old_state->arcs_list[ab->b_type], ab); 1103 1104 /* 1105 * If prefetching out of the ghost cache, 1106 * we will have a non-null datacnt. 1107 */ 1108 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1109 /* ghost elements have a ghost size */ 1110 ASSERT(ab->b_buf == NULL); 1111 from_delta = ab->b_size; 1112 } 1113 ASSERT3U(*size, >=, from_delta); 1114 atomic_add_64(size, -from_delta); 1115 1116 if (use_mutex) 1117 mutex_exit(&old_state->arcs_mtx); 1118 } 1119 if (new_state != arc_anon) { 1120 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1121 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1122 1123 if (use_mutex) 1124 mutex_enter(&new_state->arcs_mtx); 1125 1126 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1127 1128 /* ghost elements have a ghost size */ 1129 if (GHOST_STATE(new_state)) { 1130 ASSERT(ab->b_datacnt == 0); 1131 ASSERT(ab->b_buf == NULL); 1132 to_delta = ab->b_size; 1133 } 1134 atomic_add_64(size, to_delta); 1135 1136 if (use_mutex) 1137 mutex_exit(&new_state->arcs_mtx); 1138 } 1139 } 1140 1141 ASSERT(!BUF_EMPTY(ab)); 1142 if (new_state == arc_anon) { 1143 buf_hash_remove(ab); 1144 } 1145 1146 /* adjust state sizes */ 1147 if (to_delta) 1148 atomic_add_64(&new_state->arcs_size, to_delta); 1149 if (from_delta) { 1150 ASSERT3U(old_state->arcs_size, >=, from_delta); 1151 atomic_add_64(&old_state->arcs_size, -from_delta); 1152 } 1153 ab->b_state = new_state; 1154 1155 /* adjust l2arc hdr stats */ 1156 if (new_state == arc_l2c_only) 1157 l2arc_hdr_stat_add(); 1158 else if (old_state == arc_l2c_only) 1159 l2arc_hdr_stat_remove(); 1160 } 1161 1162 void 1163 arc_space_consume(uint64_t space, arc_space_type_t type) 1164 { 1165 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1166 1167 switch (type) { 1168 case ARC_SPACE_DATA: 1169 ARCSTAT_INCR(arcstat_data_size, space); 1170 break; 1171 case ARC_SPACE_OTHER: 1172 ARCSTAT_INCR(arcstat_other_size, space); 1173 break; 1174 case ARC_SPACE_HDRS: 1175 ARCSTAT_INCR(arcstat_hdr_size, space); 1176 break; 1177 case ARC_SPACE_L2HDRS: 1178 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1179 break; 1180 } 1181 1182 atomic_add_64(&arc_meta_used, space); 1183 atomic_add_64(&arc_size, space); 1184 } 1185 1186 void 1187 arc_space_return(uint64_t space, arc_space_type_t type) 1188 { 1189 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1190 1191 switch (type) { 1192 case ARC_SPACE_DATA: 1193 ARCSTAT_INCR(arcstat_data_size, -space); 1194 break; 1195 case ARC_SPACE_OTHER: 1196 ARCSTAT_INCR(arcstat_other_size, -space); 1197 break; 1198 case ARC_SPACE_HDRS: 1199 ARCSTAT_INCR(arcstat_hdr_size, -space); 1200 break; 1201 case ARC_SPACE_L2HDRS: 1202 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1203 break; 1204 } 1205 1206 ASSERT(arc_meta_used >= space); 1207 if (arc_meta_max < arc_meta_used) 1208 arc_meta_max = arc_meta_used; 1209 atomic_add_64(&arc_meta_used, -space); 1210 ASSERT(arc_size >= space); 1211 atomic_add_64(&arc_size, -space); 1212 } 1213 1214 void * 1215 arc_data_buf_alloc(uint64_t size) 1216 { 1217 if (arc_evict_needed(ARC_BUFC_DATA)) 1218 cv_signal(&arc_reclaim_thr_cv); 1219 atomic_add_64(&arc_size, size); 1220 return (zio_data_buf_alloc(size)); 1221 } 1222 1223 void 1224 arc_data_buf_free(void *buf, uint64_t size) 
1225 { 1226 zio_data_buf_free(buf, size); 1227 ASSERT(arc_size >= size); 1228 atomic_add_64(&arc_size, -size); 1229 } 1230 1231 arc_buf_t * 1232 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1233 { 1234 arc_buf_hdr_t *hdr; 1235 arc_buf_t *buf; 1236 1237 ASSERT3U(size, >, 0); 1238 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1239 ASSERT(BUF_EMPTY(hdr)); 1240 hdr->b_size = size; 1241 hdr->b_type = type; 1242 hdr->b_spa = spa_guid(spa); 1243 hdr->b_state = arc_anon; 1244 hdr->b_arc_access = 0; 1245 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1246 buf->b_hdr = hdr; 1247 buf->b_data = NULL; 1248 buf->b_efunc = NULL; 1249 buf->b_private = NULL; 1250 buf->b_next = NULL; 1251 hdr->b_buf = buf; 1252 arc_get_data_buf(buf); 1253 hdr->b_datacnt = 1; 1254 hdr->b_flags = 0; 1255 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1256 (void) refcount_add(&hdr->b_refcnt, tag); 1257 1258 return (buf); 1259 } 1260 1261 static char *arc_onloan_tag = "onloan"; 1262 1263 /* 1264 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1265 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1266 * buffers must be returned to the arc before they can be used by the DMU or 1267 * freed. 1268 */ 1269 arc_buf_t * 1270 arc_loan_buf(spa_t *spa, int size) 1271 { 1272 arc_buf_t *buf; 1273 1274 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1275 1276 atomic_add_64(&arc_loaned_bytes, size); 1277 return (buf); 1278 } 1279 1280 /* 1281 * Return a loaned arc buffer to the arc. 1282 */ 1283 void 1284 arc_return_buf(arc_buf_t *buf, void *tag) 1285 { 1286 arc_buf_hdr_t *hdr = buf->b_hdr; 1287 1288 ASSERT(buf->b_data != NULL); 1289 (void) refcount_add(&hdr->b_refcnt, tag); 1290 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1291 1292 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1293 } 1294 1295 /* Detach an arc_buf from a dbuf (tag) */ 1296 void 1297 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1298 { 1299 arc_buf_hdr_t *hdr; 1300 1301 rw_enter(&buf->b_lock, RW_WRITER); 1302 ASSERT(buf->b_data != NULL); 1303 hdr = buf->b_hdr; 1304 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1305 (void) refcount_remove(&hdr->b_refcnt, tag); 1306 buf->b_efunc = NULL; 1307 buf->b_private = NULL; 1308 1309 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1310 rw_exit(&buf->b_lock); 1311 } 1312 1313 static arc_buf_t * 1314 arc_buf_clone(arc_buf_t *from) 1315 { 1316 arc_buf_t *buf; 1317 arc_buf_hdr_t *hdr = from->b_hdr; 1318 uint64_t size = hdr->b_size; 1319 1320 ASSERT(hdr->b_state != arc_anon); 1321 1322 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1323 buf->b_hdr = hdr; 1324 buf->b_data = NULL; 1325 buf->b_efunc = NULL; 1326 buf->b_private = NULL; 1327 buf->b_next = hdr->b_buf; 1328 hdr->b_buf = buf; 1329 arc_get_data_buf(buf); 1330 bcopy(from->b_data, buf->b_data, size); 1331 hdr->b_datacnt += 1; 1332 return (buf); 1333 } 1334 1335 void 1336 arc_buf_add_ref(arc_buf_t *buf, void* tag) 1337 { 1338 arc_buf_hdr_t *hdr; 1339 kmutex_t *hash_lock; 1340 1341 /* 1342 * Check to see if this buffer is evicted. Callers 1343 * must verify b_data != NULL to know if the add_ref 1344 * was successful. 
1345 */ 1346 rw_enter(&buf->b_lock, RW_READER); 1347 if (buf->b_data == NULL) { 1348 rw_exit(&buf->b_lock); 1349 return; 1350 } 1351 hdr = buf->b_hdr; 1352 ASSERT(hdr != NULL); 1353 hash_lock = HDR_LOCK(hdr); 1354 mutex_enter(hash_lock); 1355 rw_exit(&buf->b_lock); 1356 1357 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1358 add_reference(hdr, hash_lock, tag); 1359 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1360 arc_access(hdr, hash_lock); 1361 mutex_exit(hash_lock); 1362 ARCSTAT_BUMP(arcstat_hits); 1363 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1364 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1365 data, metadata, hits); 1366 } 1367 1368 /* 1369 * Free the arc data buffer. If it is an l2arc write in progress, 1370 * the buffer is placed on l2arc_free_on_write to be freed later. 1371 */ 1372 static void 1373 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), 1374 void *data, size_t size) 1375 { 1376 if (HDR_L2_WRITING(hdr)) { 1377 l2arc_data_free_t *df; 1378 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1379 df->l2df_data = data; 1380 df->l2df_size = size; 1381 df->l2df_func = free_func; 1382 mutex_enter(&l2arc_free_on_write_mtx); 1383 list_insert_head(l2arc_free_on_write, df); 1384 mutex_exit(&l2arc_free_on_write_mtx); 1385 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1386 } else { 1387 free_func(data, size); 1388 } 1389 } 1390 1391 static void 1392 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1393 { 1394 arc_buf_t **bufp; 1395 1396 /* free up data associated with the buf */ 1397 if (buf->b_data) { 1398 arc_state_t *state = buf->b_hdr->b_state; 1399 uint64_t size = buf->b_hdr->b_size; 1400 arc_buf_contents_t type = buf->b_hdr->b_type; 1401 1402 arc_cksum_verify(buf); 1403 1404 if (!recycle) { 1405 if (type == ARC_BUFC_METADATA) { 1406 arc_buf_data_free(buf->b_hdr, zio_buf_free, 1407 buf->b_data, size); 1408 arc_space_return(size, ARC_SPACE_DATA); 1409 } else { 1410 ASSERT(type == ARC_BUFC_DATA); 1411 arc_buf_data_free(buf->b_hdr, 1412 zio_data_buf_free, buf->b_data, size); 1413 ARCSTAT_INCR(arcstat_data_size, -size); 1414 atomic_add_64(&arc_size, -size); 1415 } 1416 } 1417 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1418 uint64_t *cnt = &state->arcs_lsize[type]; 1419 1420 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1421 ASSERT(state != arc_anon); 1422 1423 ASSERT3U(*cnt, >=, size); 1424 atomic_add_64(cnt, -size); 1425 } 1426 ASSERT3U(state->arcs_size, >=, size); 1427 atomic_add_64(&state->arcs_size, -size); 1428 buf->b_data = NULL; 1429 ASSERT(buf->b_hdr->b_datacnt > 0); 1430 buf->b_hdr->b_datacnt -= 1; 1431 } 1432 1433 /* only remove the buf if requested */ 1434 if (!all) 1435 return; 1436 1437 /* remove the buf from the hdr list */ 1438 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1439 continue; 1440 *bufp = buf->b_next; 1441 1442 ASSERT(buf->b_efunc == NULL); 1443 1444 /* clean up the buf */ 1445 buf->b_hdr = NULL; 1446 kmem_cache_free(buf_cache, buf); 1447 } 1448 1449 static void 1450 arc_hdr_destroy(arc_buf_hdr_t *hdr) 1451 { 1452 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1453 ASSERT3P(hdr->b_state, ==, arc_anon); 1454 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1455 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1456 1457 if (l2hdr != NULL) { 1458 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1459 /* 1460 * To prevent arc_free() and l2arc_evict() from 1461 * attempting to free the same buffer at the same time, 1462 * a FREE_IN_PROGRESS flag is given to arc_free() to 1463 * 
give it priority. l2arc_evict() can't destroy this 1464 * header while we are waiting on l2arc_buflist_mtx. 1465 * 1466 * The hdr may be removed from l2ad_buflist before we 1467 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 1468 */ 1469 if (!buflist_held) { 1470 mutex_enter(&l2arc_buflist_mtx); 1471 l2hdr = hdr->b_l2hdr; 1472 } 1473 1474 if (l2hdr != NULL) { 1475 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1476 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1477 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1478 if (hdr->b_state == arc_l2c_only) 1479 l2arc_hdr_stat_remove(); 1480 hdr->b_l2hdr = NULL; 1481 } 1482 1483 if (!buflist_held) 1484 mutex_exit(&l2arc_buflist_mtx); 1485 } 1486 1487 if (!BUF_EMPTY(hdr)) { 1488 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1489 bzero(&hdr->b_dva, sizeof (dva_t)); 1490 hdr->b_birth = 0; 1491 hdr->b_cksum0 = 0; 1492 } 1493 while (hdr->b_buf) { 1494 arc_buf_t *buf = hdr->b_buf; 1495 1496 if (buf->b_efunc) { 1497 mutex_enter(&arc_eviction_mtx); 1498 rw_enter(&buf->b_lock, RW_WRITER); 1499 ASSERT(buf->b_hdr != NULL); 1500 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1501 hdr->b_buf = buf->b_next; 1502 buf->b_hdr = &arc_eviction_hdr; 1503 buf->b_next = arc_eviction_list; 1504 arc_eviction_list = buf; 1505 rw_exit(&buf->b_lock); 1506 mutex_exit(&arc_eviction_mtx); 1507 } else { 1508 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1509 } 1510 } 1511 if (hdr->b_freeze_cksum != NULL) { 1512 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1513 hdr->b_freeze_cksum = NULL; 1514 } 1515 1516 ASSERT(!list_link_active(&hdr->b_arc_node)); 1517 ASSERT3P(hdr->b_hash_next, ==, NULL); 1518 ASSERT3P(hdr->b_acb, ==, NULL); 1519 kmem_cache_free(hdr_cache, hdr); 1520 } 1521 1522 void 1523 arc_buf_free(arc_buf_t *buf, void *tag) 1524 { 1525 arc_buf_hdr_t *hdr = buf->b_hdr; 1526 int hashed = hdr->b_state != arc_anon; 1527 1528 ASSERT(buf->b_efunc == NULL); 1529 ASSERT(buf->b_data != NULL); 1530 1531 if (hashed) { 1532 kmutex_t *hash_lock = HDR_LOCK(hdr); 1533 1534 mutex_enter(hash_lock); 1535 (void) remove_reference(hdr, hash_lock, tag); 1536 if (hdr->b_datacnt > 1) { 1537 arc_buf_destroy(buf, FALSE, TRUE); 1538 } else { 1539 ASSERT(buf == hdr->b_buf); 1540 ASSERT(buf->b_efunc == NULL); 1541 hdr->b_flags |= ARC_BUF_AVAILABLE; 1542 } 1543 mutex_exit(hash_lock); 1544 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1545 int destroy_hdr; 1546 /* 1547 * We are in the middle of an async write. Don't destroy 1548 * this buffer unless the write completes before we finish 1549 * decrementing the reference count. 
1550 */ 1551 mutex_enter(&arc_eviction_mtx); 1552 (void) remove_reference(hdr, NULL, tag); 1553 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1554 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1555 mutex_exit(&arc_eviction_mtx); 1556 if (destroy_hdr) 1557 arc_hdr_destroy(hdr); 1558 } else { 1559 if (remove_reference(hdr, NULL, tag) > 0) { 1560 ASSERT(HDR_IO_ERROR(hdr)); 1561 arc_buf_destroy(buf, FALSE, TRUE); 1562 } else { 1563 arc_hdr_destroy(hdr); 1564 } 1565 } 1566 } 1567 1568 int 1569 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1570 { 1571 arc_buf_hdr_t *hdr = buf->b_hdr; 1572 kmutex_t *hash_lock = HDR_LOCK(hdr); 1573 int no_callback = (buf->b_efunc == NULL); 1574 1575 if (hdr->b_state == arc_anon) { 1576 ASSERT(hdr->b_datacnt == 1); 1577 arc_buf_free(buf, tag); 1578 return (no_callback); 1579 } 1580 1581 mutex_enter(hash_lock); 1582 ASSERT(hdr->b_state != arc_anon); 1583 ASSERT(buf->b_data != NULL); 1584 1585 (void) remove_reference(hdr, hash_lock, tag); 1586 if (hdr->b_datacnt > 1) { 1587 if (no_callback) 1588 arc_buf_destroy(buf, FALSE, TRUE); 1589 } else if (no_callback) { 1590 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1591 ASSERT(buf->b_efunc == NULL); 1592 hdr->b_flags |= ARC_BUF_AVAILABLE; 1593 } 1594 ASSERT(no_callback || hdr->b_datacnt > 1 || 1595 refcount_is_zero(&hdr->b_refcnt)); 1596 mutex_exit(hash_lock); 1597 return (no_callback); 1598 } 1599 1600 int 1601 arc_buf_size(arc_buf_t *buf) 1602 { 1603 return (buf->b_hdr->b_size); 1604 } 1605 1606 /* 1607 * Evict buffers from list until we've removed the specified number of 1608 * bytes. Move the removed buffers to the appropriate evict state. 1609 * If the recycle flag is set, then attempt to "recycle" a buffer: 1610 * - look for a buffer to evict that is `bytes' long. 1611 * - return the data block from this buffer rather than freeing it. 1612 * This flag is used by callers that are trying to make space for a 1613 * new buffer in a full arc cache. 1614 * 1615 * This function makes a "best effort". It skips over any buffers 1616 * it can't get a hash_lock on, and so may not catch all candidates. 1617 * It may also return without evicting as much space as requested. 1618 */ 1619 static void * 1620 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1621 arc_buf_contents_t type) 1622 { 1623 arc_state_t *evicted_state; 1624 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1625 arc_buf_hdr_t *ab, *ab_prev = NULL; 1626 list_t *list = &state->arcs_list[type]; 1627 kmutex_t *hash_lock; 1628 boolean_t have_lock; 1629 void *stolen = NULL; 1630 1631 ASSERT(state == arc_mru || state == arc_mfu); 1632 1633 evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 1634 1635 mutex_enter(&state->arcs_mtx); 1636 mutex_enter(&evicted_state->arcs_mtx); 1637 1638 for (ab = list_tail(list); ab; ab = ab_prev) { 1639 ab_prev = list_prev(list, ab); 1640 /* prefetch buffers have a minimum lifespan */ 1641 if (HDR_IO_IN_PROGRESS(ab) || 1642 (spa && ab->b_spa != spa) || 1643 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1644 ddi_get_lbolt() - ab->b_arc_access < 1645 arc_min_prefetch_lifespan)) { 1646 skipped++; 1647 continue; 1648 } 1649 /* "lookahead" for better eviction candidate */ 1650 if (recycle && ab->b_size != bytes && 1651 ab_prev && ab_prev->b_size == bytes) 1652 continue; 1653 hash_lock = HDR_LOCK(ab); 1654 have_lock = MUTEX_HELD(hash_lock); 1655 if (have_lock || mutex_tryenter(hash_lock)) { 1656 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1657 ASSERT(ab->b_datacnt > 0); 1658 while (ab->b_buf) { 1659 arc_buf_t *buf = ab->b_buf; 1660 if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { 1661 missed += 1; 1662 break; 1663 } 1664 if (buf->b_data) { 1665 bytes_evicted += ab->b_size; 1666 if (recycle && ab->b_type == type && 1667 ab->b_size == bytes && 1668 !HDR_L2_WRITING(ab)) { 1669 stolen = buf->b_data; 1670 recycle = FALSE; 1671 } 1672 } 1673 if (buf->b_efunc) { 1674 mutex_enter(&arc_eviction_mtx); 1675 arc_buf_destroy(buf, 1676 buf->b_data == stolen, FALSE); 1677 ab->b_buf = buf->b_next; 1678 buf->b_hdr = &arc_eviction_hdr; 1679 buf->b_next = arc_eviction_list; 1680 arc_eviction_list = buf; 1681 mutex_exit(&arc_eviction_mtx); 1682 rw_exit(&buf->b_lock); 1683 } else { 1684 rw_exit(&buf->b_lock); 1685 arc_buf_destroy(buf, 1686 buf->b_data == stolen, TRUE); 1687 } 1688 } 1689 1690 if (ab->b_l2hdr) { 1691 ARCSTAT_INCR(arcstat_evict_l2_cached, 1692 ab->b_size); 1693 } else { 1694 if (l2arc_write_eligible(ab->b_spa, ab)) { 1695 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1696 ab->b_size); 1697 } else { 1698 ARCSTAT_INCR( 1699 arcstat_evict_l2_ineligible, 1700 ab->b_size); 1701 } 1702 } 1703 1704 if (ab->b_datacnt == 0) { 1705 arc_change_state(evicted_state, ab, hash_lock); 1706 ASSERT(HDR_IN_HASH_TABLE(ab)); 1707 ab->b_flags |= ARC_IN_HASH_TABLE; 1708 ab->b_flags &= ~ARC_BUF_AVAILABLE; 1709 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1710 } 1711 if (!have_lock) 1712 mutex_exit(hash_lock); 1713 if (bytes >= 0 && bytes_evicted >= bytes) 1714 break; 1715 } else { 1716 missed += 1; 1717 } 1718 } 1719 1720 mutex_exit(&evicted_state->arcs_mtx); 1721 mutex_exit(&state->arcs_mtx); 1722 1723 if (bytes_evicted < bytes) 1724 dprintf("only evicted %lld bytes from %x", 1725 (longlong_t)bytes_evicted, state); 1726 1727 if (skipped) 1728 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1729 1730 if (missed) 1731 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1732 1733 /* 1734 * We have just evicted some date into the ghost state, make 1735 * sure we also adjust the ghost state size if necessary. 
1736 */ 1737 if (arc_no_grow && 1738 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 1739 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 1740 arc_mru_ghost->arcs_size - arc_c; 1741 1742 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 1743 int64_t todelete = 1744 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 1745 arc_evict_ghost(arc_mru_ghost, NULL, todelete); 1746 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 1747 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 1748 arc_mru_ghost->arcs_size + 1749 arc_mfu_ghost->arcs_size - arc_c); 1750 arc_evict_ghost(arc_mfu_ghost, NULL, todelete); 1751 } 1752 } 1753 1754 return (stolen); 1755 } 1756 1757 /* 1758 * Remove buffers from list until we've removed the specified number of 1759 * bytes. Destroy the buffers that are removed. 1760 */ 1761 static void 1762 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 1763 { 1764 arc_buf_hdr_t *ab, *ab_prev; 1765 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1766 kmutex_t *hash_lock; 1767 uint64_t bytes_deleted = 0; 1768 uint64_t bufs_skipped = 0; 1769 boolean_t have_lock; 1770 1771 ASSERT(GHOST_STATE(state)); 1772 top: 1773 mutex_enter(&state->arcs_mtx); 1774 for (ab = list_tail(list); ab; ab = ab_prev) { 1775 ab_prev = list_prev(list, ab); 1776 if (spa && ab->b_spa != spa) 1777 continue; 1778 hash_lock = HDR_LOCK(ab); 1779 have_lock = MUTEX_HELD(hash_lock); 1780 if (have_lock || mutex_tryenter(hash_lock)) { 1781 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1782 ASSERT(ab->b_buf == NULL); 1783 ARCSTAT_BUMP(arcstat_deleted); 1784 bytes_deleted += ab->b_size; 1785 1786 if (ab->b_l2hdr != NULL) { 1787 /* 1788 * This buffer is cached on the 2nd Level ARC; 1789 * don't destroy the header. 1790 */ 1791 arc_change_state(arc_l2c_only, ab, hash_lock); 1792 if (!have_lock) 1793 mutex_exit(hash_lock); 1794 } else { 1795 arc_change_state(arc_anon, ab, hash_lock); 1796 if (!have_lock) 1797 mutex_exit(hash_lock); 1798 arc_hdr_destroy(ab); 1799 } 1800 1801 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1802 if (bytes >= 0 && bytes_deleted >= bytes) 1803 break; 1804 } else { 1805 if (bytes < 0) { 1806 mutex_exit(&state->arcs_mtx); 1807 mutex_enter(hash_lock); 1808 mutex_exit(hash_lock); 1809 goto top; 1810 } 1811 bufs_skipped += 1; 1812 } 1813 } 1814 mutex_exit(&state->arcs_mtx); 1815 1816 if (list == &state->arcs_list[ARC_BUFC_DATA] && 1817 (bytes < 0 || bytes_deleted < bytes)) { 1818 list = &state->arcs_list[ARC_BUFC_METADATA]; 1819 goto top; 1820 } 1821 1822 if (bufs_skipped) { 1823 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1824 ASSERT(bytes >= 0); 1825 } 1826 1827 if (bytes_deleted < bytes) 1828 dprintf("only deleted %lld bytes from %p", 1829 (longlong_t)bytes_deleted, state); 1830 } 1831 1832 static void 1833 arc_adjust(void) 1834 { 1835 int64_t adjustment, delta; 1836 1837 /* 1838 * Adjust MRU size 1839 */ 1840 1841 adjustment = MIN(arc_size - arc_c, 1842 arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); 1843 1844 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 1845 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 1846 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 1847 adjustment -= delta; 1848 } 1849 1850 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1851 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 1852 (void) arc_evict(arc_mru, NULL, delta, FALSE, 1853 ARC_BUFC_METADATA); 1854 } 1855 1856 /* 1857 * Adjust MFU size 1858 */ 1859 1860 adjustment = 
arc_size - arc_c; 1861 1862 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 1863 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 1864 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 1865 adjustment -= delta; 1866 } 1867 1868 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1869 int64_t delta = MIN(adjustment, 1870 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 1871 (void) arc_evict(arc_mfu, NULL, delta, FALSE, 1872 ARC_BUFC_METADATA); 1873 } 1874 1875 /* 1876 * Adjust ghost lists 1877 */ 1878 1879 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 1880 1881 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 1882 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 1883 arc_evict_ghost(arc_mru_ghost, NULL, delta); 1884 } 1885 1886 adjustment = 1887 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 1888 1889 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 1890 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 1891 arc_evict_ghost(arc_mfu_ghost, NULL, delta); 1892 } 1893 } 1894 1895 static void 1896 arc_do_user_evicts(void) 1897 { 1898 mutex_enter(&arc_eviction_mtx); 1899 while (arc_eviction_list != NULL) { 1900 arc_buf_t *buf = arc_eviction_list; 1901 arc_eviction_list = buf->b_next; 1902 rw_enter(&buf->b_lock, RW_WRITER); 1903 buf->b_hdr = NULL; 1904 rw_exit(&buf->b_lock); 1905 mutex_exit(&arc_eviction_mtx); 1906 1907 if (buf->b_efunc != NULL) 1908 VERIFY(buf->b_efunc(buf) == 0); 1909 1910 buf->b_efunc = NULL; 1911 buf->b_private = NULL; 1912 kmem_cache_free(buf_cache, buf); 1913 mutex_enter(&arc_eviction_mtx); 1914 } 1915 mutex_exit(&arc_eviction_mtx); 1916 } 1917 1918 /* 1919 * Flush all *evictable* data from the cache for the given spa. 1920 * NOTE: this will not touch "active" (i.e. referenced) data. 
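 *
 * Usage note (informal): arc_fini() below calls arc_flush(NULL) to
 * drop every evictable buffer at teardown, while a caller that is
 * discarding a single pool passes that pool's spa_t so that only
 * buffers belonging to it (matched by spa guid) are flushed.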
1921 */ 1922 void 1923 arc_flush(spa_t *spa) 1924 { 1925 uint64_t guid = 0; 1926 1927 if (spa) 1928 guid = spa_guid(spa); 1929 1930 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 1931 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 1932 if (spa) 1933 break; 1934 } 1935 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 1936 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 1937 if (spa) 1938 break; 1939 } 1940 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 1941 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 1942 if (spa) 1943 break; 1944 } 1945 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 1946 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 1947 if (spa) 1948 break; 1949 } 1950 1951 arc_evict_ghost(arc_mru_ghost, guid, -1); 1952 arc_evict_ghost(arc_mfu_ghost, guid, -1); 1953 1954 mutex_enter(&arc_reclaim_thr_lock); 1955 arc_do_user_evicts(); 1956 mutex_exit(&arc_reclaim_thr_lock); 1957 ASSERT(spa || arc_eviction_list == NULL); 1958 } 1959 1960 void 1961 arc_shrink(void) 1962 { 1963 if (arc_c > arc_c_min) { 1964 uint64_t to_free; 1965 1966 #ifdef _KERNEL 1967 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 1968 #else 1969 to_free = arc_c >> arc_shrink_shift; 1970 #endif 1971 if (arc_c > arc_c_min + to_free) 1972 atomic_add_64(&arc_c, -to_free); 1973 else 1974 arc_c = arc_c_min; 1975 1976 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1977 if (arc_c > arc_size) 1978 arc_c = MAX(arc_size, arc_c_min); 1979 if (arc_p > arc_c) 1980 arc_p = (arc_c >> 1); 1981 ASSERT(arc_c >= arc_c_min); 1982 ASSERT((int64_t)arc_p >= 0); 1983 } 1984 1985 if (arc_size > arc_c) 1986 arc_adjust(); 1987 } 1988 1989 static int 1990 arc_reclaim_needed(void) 1991 { 1992 uint64_t extra; 1993 1994 #ifdef _KERNEL 1995 1996 if (needfree) 1997 return (1); 1998 1999 /* 2000 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2001 */ 2002 extra = desfree; 2003 2004 /* 2005 * check that we're out of range of the pageout scanner. It starts to 2006 * schedule paging if freemem is less than lotsfree and needfree. 2007 * lotsfree is the high-water mark for pageout, and needfree is the 2008 * number of needed free pages. We add extra pages here to make sure 2009 * the scanner doesn't start up while we're freeing memory. 2010 */ 2011 if (freemem < lotsfree + needfree + extra) 2012 return (1); 2013 2014 /* 2015 * check to make sure that swapfs has enough space so that anon 2016 * reservations can still succeed. anon_resvmem() checks that the 2017 * availrmem is greater than swapfs_minfree, and the number of reserved 2018 * swap pages. We also add a bit of extra here just to prevent 2019 * circumstances from getting really dire. 2020 */ 2021 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2022 return (1); 2023 2024 #if defined(__i386) 2025 /* 2026 * If we're on an i386 platform, it's possible that we'll exhaust the 2027 * kernel heap space before we ever run out of available physical 2028 * memory. Most checks of the size of the heap_area compare against 2029 * tune.t_minarmem, which is the minimum available real memory that we 2030 * can have in the system. However, this is generally fixed at 25 pages 2031 * which is so low that it's useless. In this comparison, we seek to 2032 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2033 * heap is allocated. 
(Or, in the calculation, if less than 1/4th is 2034 * free) 2035 */ 2036 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 2037 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 2038 return (1); 2039 #endif 2040 2041 #else 2042 if (spa_get_random(100) == 0) 2043 return (1); 2044 #endif 2045 return (0); 2046 } 2047 2048 static void 2049 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2050 { 2051 size_t i; 2052 kmem_cache_t *prev_cache = NULL; 2053 kmem_cache_t *prev_data_cache = NULL; 2054 extern kmem_cache_t *zio_buf_cache[]; 2055 extern kmem_cache_t *zio_data_buf_cache[]; 2056 2057 #ifdef _KERNEL 2058 if (arc_meta_used >= arc_meta_limit) { 2059 /* 2060 * We are exceeding our meta-data cache limit. 2061 * Purge some DNLC entries to release holds on meta-data. 2062 */ 2063 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 2064 } 2065 #if defined(__i386) 2066 /* 2067 * Reclaim unused memory from all kmem caches. 2068 */ 2069 kmem_reap(); 2070 #endif 2071 #endif 2072 2073 /* 2074 * An aggressive reclamation will shrink the cache size as well as 2075 * reap free buffers from the arc kmem caches. 2076 */ 2077 if (strat == ARC_RECLAIM_AGGR) 2078 arc_shrink(); 2079 2080 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2081 if (zio_buf_cache[i] != prev_cache) { 2082 prev_cache = zio_buf_cache[i]; 2083 kmem_cache_reap_now(zio_buf_cache[i]); 2084 } 2085 if (zio_data_buf_cache[i] != prev_data_cache) { 2086 prev_data_cache = zio_data_buf_cache[i]; 2087 kmem_cache_reap_now(zio_data_buf_cache[i]); 2088 } 2089 } 2090 kmem_cache_reap_now(buf_cache); 2091 kmem_cache_reap_now(hdr_cache); 2092 } 2093 2094 static void 2095 arc_reclaim_thread(void) 2096 { 2097 clock_t growtime = 0; 2098 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2099 callb_cpr_t cpr; 2100 2101 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2102 2103 mutex_enter(&arc_reclaim_thr_lock); 2104 while (arc_thread_exit == 0) { 2105 if (arc_reclaim_needed()) { 2106 2107 if (arc_no_grow) { 2108 if (last_reclaim == ARC_RECLAIM_CONS) { 2109 last_reclaim = ARC_RECLAIM_AGGR; 2110 } else { 2111 last_reclaim = ARC_RECLAIM_CONS; 2112 } 2113 } else { 2114 arc_no_grow = TRUE; 2115 last_reclaim = ARC_RECLAIM_AGGR; 2116 membar_producer(); 2117 } 2118 2119 /* reset the growth delay for every reclaim */ 2120 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2121 2122 arc_kmem_reap_now(last_reclaim); 2123 arc_warm = B_TRUE; 2124 2125 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2126 arc_no_grow = FALSE; 2127 } 2128 2129 if (2 * arc_c < arc_size + 2130 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) 2131 arc_adjust(); 2132 2133 if (arc_eviction_list != NULL) 2134 arc_do_user_evicts(); 2135 2136 /* block until needed, or one second, whichever is shorter */ 2137 CALLB_CPR_SAFE_BEGIN(&cpr); 2138 (void) cv_timedwait(&arc_reclaim_thr_cv, 2139 &arc_reclaim_thr_lock, (hz)); 2140 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2141 } 2142 2143 arc_thread_exit = 0; 2144 cv_broadcast(&arc_reclaim_thr_cv); 2145 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2146 thread_exit(); 2147 } 2148 2149 /* 2150 * Adapt arc info given the number of bytes we are trying to add and 2151 * the state that we are comming from. This function is only called 2152 * when we are adding new content to the cache. 
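 *
 * A hypothetical example of the adjustment below (figures are
 * illustrative only): if a read hits in the MRU ghost list while
 * arc_mfu_ghost is four times the size of arc_mru_ghost, the
 * multiplier is 4 and arc_p grows by 4 * bytes, clamped to
 * arc_c - arc_p_min.  A hit in the MFU ghost list applies the
 * mirror-image change, shrinking arc_p (never below arc_p_min) and
 * thereby giving the MFU side a larger share of the cache.  In both
 * cases, hits in the smaller ghost list move arc_p more aggressively.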
2153 */ 2154 static void 2155 arc_adapt(int bytes, arc_state_t *state) 2156 { 2157 int mult; 2158 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2159 2160 if (state == arc_l2c_only) 2161 return; 2162 2163 ASSERT(bytes > 0); 2164 /* 2165 * Adapt the target size of the MRU list: 2166 * - if we just hit in the MRU ghost list, then increase 2167 * the target size of the MRU list. 2168 * - if we just hit in the MFU ghost list, then increase 2169 * the target size of the MFU list by decreasing the 2170 * target size of the MRU list. 2171 */ 2172 if (state == arc_mru_ghost) { 2173 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2174 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2175 2176 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2177 } else if (state == arc_mfu_ghost) { 2178 uint64_t delta; 2179 2180 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 2181 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2182 2183 delta = MIN(bytes * mult, arc_p); 2184 arc_p = MAX(arc_p_min, arc_p - delta); 2185 } 2186 ASSERT((int64_t)arc_p >= 0); 2187 2188 if (arc_reclaim_needed()) { 2189 cv_signal(&arc_reclaim_thr_cv); 2190 return; 2191 } 2192 2193 if (arc_no_grow) 2194 return; 2195 2196 if (arc_c >= arc_c_max) 2197 return; 2198 2199 /* 2200 * If we're within (2 * maxblocksize) bytes of the target 2201 * cache size, increment the target cache size 2202 */ 2203 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2204 atomic_add_64(&arc_c, (int64_t)bytes); 2205 if (arc_c > arc_c_max) 2206 arc_c = arc_c_max; 2207 else if (state == arc_anon) 2208 atomic_add_64(&arc_p, (int64_t)bytes); 2209 if (arc_p > arc_c) 2210 arc_p = arc_c; 2211 } 2212 ASSERT((int64_t)arc_p >= 0); 2213 } 2214 2215 /* 2216 * Check if the cache has reached its limits and eviction is required 2217 * prior to insert. 2218 */ 2219 static int 2220 arc_evict_needed(arc_buf_contents_t type) 2221 { 2222 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2223 return (1); 2224 2225 #ifdef _KERNEL 2226 /* 2227 * If zio data pages are being allocated out of a separate heap segment, 2228 * then enforce that the size of available vmem for this area remains 2229 * above about 1/32nd free. 2230 */ 2231 if (type == ARC_BUFC_DATA && zio_arena != NULL && 2232 vmem_size(zio_arena, VMEM_FREE) < 2233 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2234 return (1); 2235 #endif 2236 2237 if (arc_reclaim_needed()) 2238 return (1); 2239 2240 return (arc_size > arc_c); 2241 } 2242 2243 /* 2244 * The buffer, supplied as the first argument, needs a data block. 2245 * So, if we are at cache max, determine which cache should be victimized. 2246 * We have the following cases: 2247 * 2248 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2249 * In this situation if we're out of space, but the resident size of the MFU is 2250 * under the limit, victimize the MFU cache to satisfy this insertion request. 2251 * 2252 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2253 * Here, we've used up all of the available space for the MRU, so we need to 2254 * evict from our own cache instead. Evict from the set of resident MRU 2255 * entries. 2256 * 2257 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2258 * c minus p represents the MFU space in the cache, since p is the size of the 2259 * cache that is dedicated to the MRU. In this situation there's still space on 2260 * the MFU side, so the MRU side needs to be victimized. 2261 * 2262 * 4. 
Insert for MFU (c - p) < sizeof(arc_mfu) -> 2263 * MFU's resident set is consuming more space than it has been allotted. In 2264 * this situation, we must victimize our own cache, the MFU, for this insertion. 2265 */ 2266 static void 2267 arc_get_data_buf(arc_buf_t *buf) 2268 { 2269 arc_state_t *state = buf->b_hdr->b_state; 2270 uint64_t size = buf->b_hdr->b_size; 2271 arc_buf_contents_t type = buf->b_hdr->b_type; 2272 2273 arc_adapt(size, state); 2274 2275 /* 2276 * We have not yet reached cache maximum size, 2277 * just allocate a new buffer. 2278 */ 2279 if (!arc_evict_needed(type)) { 2280 if (type == ARC_BUFC_METADATA) { 2281 buf->b_data = zio_buf_alloc(size); 2282 arc_space_consume(size, ARC_SPACE_DATA); 2283 } else { 2284 ASSERT(type == ARC_BUFC_DATA); 2285 buf->b_data = zio_data_buf_alloc(size); 2286 ARCSTAT_INCR(arcstat_data_size, size); 2287 atomic_add_64(&arc_size, size); 2288 } 2289 goto out; 2290 } 2291 2292 /* 2293 * If we are prefetching from the mfu ghost list, this buffer 2294 * will end up on the mru list; so steal space from there. 2295 */ 2296 if (state == arc_mfu_ghost) 2297 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2298 else if (state == arc_mru_ghost) 2299 state = arc_mru; 2300 2301 if (state == arc_mru || state == arc_anon) { 2302 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2303 state = (arc_mfu->arcs_lsize[type] >= size && 2304 arc_p > mru_used) ? arc_mfu : arc_mru; 2305 } else { 2306 /* MFU cases */ 2307 uint64_t mfu_space = arc_c - arc_p; 2308 state = (arc_mru->arcs_lsize[type] >= size && 2309 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2310 } 2311 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2312 if (type == ARC_BUFC_METADATA) { 2313 buf->b_data = zio_buf_alloc(size); 2314 arc_space_consume(size, ARC_SPACE_DATA); 2315 } else { 2316 ASSERT(type == ARC_BUFC_DATA); 2317 buf->b_data = zio_data_buf_alloc(size); 2318 ARCSTAT_INCR(arcstat_data_size, size); 2319 atomic_add_64(&arc_size, size); 2320 } 2321 ARCSTAT_BUMP(arcstat_recycle_miss); 2322 } 2323 ASSERT(buf->b_data != NULL); 2324 out: 2325 /* 2326 * Update the state size. Note that ghost states have a 2327 * "ghost size" and so don't need to be updated. 2328 */ 2329 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2330 arc_buf_hdr_t *hdr = buf->b_hdr; 2331 2332 atomic_add_64(&hdr->b_state->arcs_size, size); 2333 if (list_link_active(&hdr->b_arc_node)) { 2334 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2335 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2336 } 2337 /* 2338 * If we are growing the cache, and we are adding anonymous 2339 * data, and we have outgrown arc_p, update arc_p 2340 */ 2341 if (arc_size < arc_c && hdr->b_state == arc_anon && 2342 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2343 arc_p = MIN(arc_c, arc_p + size); 2344 } 2345 } 2346 2347 /* 2348 * This routine is called whenever a buffer is accessed. 2349 * NOTE: the hash lock is dropped in this function. 2350 */ 2351 static void 2352 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2353 { 2354 clock_t now; 2355 2356 ASSERT(MUTEX_HELD(hash_lock)); 2357 2358 if (buf->b_state == arc_anon) { 2359 /* 2360 * This buffer is not in the cache, and does not 2361 * appear in our "ghost" list. Add the new buffer 2362 * to the MRU state. 
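 *
 * (For orientation, an informal summary of the transitions this
 * function performs: anon -> mru on first insertion; mru -> mfu
 * once a buffer is referenced again more than ARC_MINTIME after its
 * previous access; mru_ghost -> mfu, or back to mru for prefetches;
 * mfu stays mfu; mfu_ghost -> mfu, or mru for prefetches; and
 * l2c_only -> mfu.  Each case is handled in turn below.)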
2363 */ 2364 2365 ASSERT(buf->b_arc_access == 0); 2366 buf->b_arc_access = ddi_get_lbolt(); 2367 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2368 arc_change_state(arc_mru, buf, hash_lock); 2369 2370 } else if (buf->b_state == arc_mru) { 2371 now = ddi_get_lbolt(); 2372 2373 /* 2374 * If this buffer is here because of a prefetch, then either: 2375 * - clear the flag if this is a "referencing" read 2376 * (any subsequent access will bump this into the MFU state). 2377 * or 2378 * - move the buffer to the head of the list if this is 2379 * another prefetch (to make it less likely to be evicted). 2380 */ 2381 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2382 if (refcount_count(&buf->b_refcnt) == 0) { 2383 ASSERT(list_link_active(&buf->b_arc_node)); 2384 } else { 2385 buf->b_flags &= ~ARC_PREFETCH; 2386 ARCSTAT_BUMP(arcstat_mru_hits); 2387 } 2388 buf->b_arc_access = now; 2389 return; 2390 } 2391 2392 /* 2393 * This buffer has been "accessed" only once so far, 2394 * but it is still in the cache. Move it to the MFU 2395 * state. 2396 */ 2397 if (now > buf->b_arc_access + ARC_MINTIME) { 2398 /* 2399 * More than 125ms have passed since we 2400 * instantiated this buffer. Move it to the 2401 * most frequently used state. 2402 */ 2403 buf->b_arc_access = now; 2404 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2405 arc_change_state(arc_mfu, buf, hash_lock); 2406 } 2407 ARCSTAT_BUMP(arcstat_mru_hits); 2408 } else if (buf->b_state == arc_mru_ghost) { 2409 arc_state_t *new_state; 2410 /* 2411 * This buffer has been "accessed" recently, but 2412 * was evicted from the cache. Move it to the 2413 * MFU state. 2414 */ 2415 2416 if (buf->b_flags & ARC_PREFETCH) { 2417 new_state = arc_mru; 2418 if (refcount_count(&buf->b_refcnt) > 0) 2419 buf->b_flags &= ~ARC_PREFETCH; 2420 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2421 } else { 2422 new_state = arc_mfu; 2423 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2424 } 2425 2426 buf->b_arc_access = ddi_get_lbolt(); 2427 arc_change_state(new_state, buf, hash_lock); 2428 2429 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2430 } else if (buf->b_state == arc_mfu) { 2431 /* 2432 * This buffer has been accessed more than once and is 2433 * still in the cache. Keep it in the MFU state. 2434 * 2435 * NOTE: an add_reference() that occurred when we did 2436 * the arc_read() will have kicked this off the list. 2437 * If it was a prefetch, we will explicitly move it to 2438 * the head of the list now. 2439 */ 2440 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2441 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2442 ASSERT(list_link_active(&buf->b_arc_node)); 2443 } 2444 ARCSTAT_BUMP(arcstat_mfu_hits); 2445 buf->b_arc_access = ddi_get_lbolt(); 2446 } else if (buf->b_state == arc_mfu_ghost) { 2447 arc_state_t *new_state = arc_mfu; 2448 /* 2449 * This buffer has been accessed more than once but has 2450 * been evicted from the cache. Move it back to the 2451 * MFU state. 2452 */ 2453 2454 if (buf->b_flags & ARC_PREFETCH) { 2455 /* 2456 * This is a prefetch access... 2457 * move this block back to the MRU state. 2458 */ 2459 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 2460 new_state = arc_mru; 2461 } 2462 2463 buf->b_arc_access = ddi_get_lbolt(); 2464 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2465 arc_change_state(new_state, buf, hash_lock); 2466 2467 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2468 } else if (buf->b_state == arc_l2c_only) { 2469 /* 2470 * This buffer is on the 2nd Level ARC. 
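 *
 * (That is, the data currently lives only on an L2ARC device.  The
 * header is moved to the MFU state so the in-memory ARC tracks it
 * again once the read that triggered this access completes.)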
2471 */ 2472 2473 buf->b_arc_access = ddi_get_lbolt(); 2474 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2475 arc_change_state(arc_mfu, buf, hash_lock); 2476 } else { 2477 ASSERT(!"invalid arc state"); 2478 } 2479 } 2480 2481 /* a generic arc_done_func_t which you can use */ 2482 /* ARGSUSED */ 2483 void 2484 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2485 { 2486 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2487 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2488 } 2489 2490 /* a generic arc_done_func_t */ 2491 void 2492 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2493 { 2494 arc_buf_t **bufp = arg; 2495 if (zio && zio->io_error) { 2496 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2497 *bufp = NULL; 2498 } else { 2499 *bufp = buf; 2500 } 2501 } 2502 2503 static void 2504 arc_read_done(zio_t *zio) 2505 { 2506 arc_buf_hdr_t *hdr, *found; 2507 arc_buf_t *buf; 2508 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2509 kmutex_t *hash_lock; 2510 arc_callback_t *callback_list, *acb; 2511 int freeable = FALSE; 2512 2513 buf = zio->io_private; 2514 hdr = buf->b_hdr; 2515 2516 /* 2517 * The hdr was inserted into hash-table and removed from lists 2518 * prior to starting I/O. We should find this header, since 2519 * it's in the hash table, and it should be legit since it's 2520 * not possible to evict it during the I/O. The only possible 2521 * reason for it not to be found is if we were freed during the 2522 * read. 2523 */ 2524 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2525 &hash_lock); 2526 2527 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2528 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2529 (found == hdr && HDR_L2_READING(hdr))); 2530 2531 hdr->b_flags &= ~ARC_L2_EVICTED; 2532 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2533 hdr->b_flags &= ~ARC_L2CACHE; 2534 2535 /* byteswap if necessary */ 2536 callback_list = hdr->b_acb; 2537 ASSERT(callback_list != NULL); 2538 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2539 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2540 byteswap_uint64_array : 2541 dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; 2542 func(buf->b_data, hdr->b_size); 2543 } 2544 2545 arc_cksum_compute(buf, B_FALSE); 2546 2547 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2548 /* 2549 * Only call arc_access on anonymous buffers. This is because 2550 * if we've issued an I/O for an evicted buffer, we've already 2551 * called arc_access (to prevent any simultaneous readers from 2552 * getting confused). 
2553 */ 2554 arc_access(hdr, hash_lock); 2555 } 2556 2557 /* create copies of the data buffer for the callers */ 2558 abuf = buf; 2559 for (acb = callback_list; acb; acb = acb->acb_next) { 2560 if (acb->acb_done) { 2561 if (abuf == NULL) 2562 abuf = arc_buf_clone(buf); 2563 acb->acb_buf = abuf; 2564 abuf = NULL; 2565 } 2566 } 2567 hdr->b_acb = NULL; 2568 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2569 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2570 if (abuf == buf) { 2571 ASSERT(buf->b_efunc == NULL); 2572 ASSERT(hdr->b_datacnt == 1); 2573 hdr->b_flags |= ARC_BUF_AVAILABLE; 2574 } 2575 2576 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2577 2578 if (zio->io_error != 0) { 2579 hdr->b_flags |= ARC_IO_ERROR; 2580 if (hdr->b_state != arc_anon) 2581 arc_change_state(arc_anon, hdr, hash_lock); 2582 if (HDR_IN_HASH_TABLE(hdr)) 2583 buf_hash_remove(hdr); 2584 freeable = refcount_is_zero(&hdr->b_refcnt); 2585 } 2586 2587 /* 2588 * Broadcast before we drop the hash_lock to avoid the possibility 2589 * that the hdr (and hence the cv) might be freed before we get to 2590 * the cv_broadcast(). 2591 */ 2592 cv_broadcast(&hdr->b_cv); 2593 2594 if (hash_lock) { 2595 mutex_exit(hash_lock); 2596 } else { 2597 /* 2598 * This block was freed while we waited for the read to 2599 * complete. It has been removed from the hash table and 2600 * moved to the anonymous state (so that it won't show up 2601 * in the cache). 2602 */ 2603 ASSERT3P(hdr->b_state, ==, arc_anon); 2604 freeable = refcount_is_zero(&hdr->b_refcnt); 2605 } 2606 2607 /* execute each callback and free its structure */ 2608 while ((acb = callback_list) != NULL) { 2609 if (acb->acb_done) 2610 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2611 2612 if (acb->acb_zio_dummy != NULL) { 2613 acb->acb_zio_dummy->io_error = zio->io_error; 2614 zio_nowait(acb->acb_zio_dummy); 2615 } 2616 2617 callback_list = acb->acb_next; 2618 kmem_free(acb, sizeof (arc_callback_t)); 2619 } 2620 2621 if (freeable) 2622 arc_hdr_destroy(hdr); 2623 } 2624 2625 /* 2626 * "Read" the block block at the specified DVA (in bp) via the 2627 * cache. If the block is found in the cache, invoke the provided 2628 * callback immediately and return. Note that the `zio' parameter 2629 * in the callback will be NULL in this case, since no IO was 2630 * required. If the block is not in the cache pass the read request 2631 * on to the spa with a substitute callback function, so that the 2632 * requested block will be added to the cache. 2633 * 2634 * If a read request arrives for a block that has a read in-progress, 2635 * either wait for the in-progress read to complete (and return the 2636 * results); or, if this is a read with a "done" func, add a record 2637 * to the read to invoke the "done" func when the read completes, 2638 * and return; or just return. 2639 * 2640 * arc_read_done() will invoke all the requested "done" functions 2641 * for readers of this block. 2642 * 2643 * Normal callers should use arc_read and pass the arc buffer and offset 2644 * for the bp. But if you know you don't need locking, you can use 2645 * arc_read_bp. 
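 *
 * As a brief illustration, a typical synchronous lookup might look
 * like the sketch below (hedged: pbuf, bp, spa, zb and error are
 * hypothetical caller state; the flags, priorities and functions
 * named are the ones defined in this file and the zio layer):
 *
 *	uint32_t aflags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *
 *	error = arc_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * With ARC_WAIT the call returns only after the "done" function has
 * run, so on success abuf refers to a held buffer which the caller
 * eventually drops with arc_buf_remove_ref(abuf, &abuf).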
2646 */ 2647 int 2648 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, 2649 arc_done_func_t *done, void *private, int priority, int zio_flags, 2650 uint32_t *arc_flags, const zbookmark_t *zb) 2651 { 2652 int err; 2653 2654 ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); 2655 ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); 2656 rw_enter(&pbuf->b_lock, RW_READER); 2657 2658 err = arc_read_nolock(pio, spa, bp, done, private, priority, 2659 zio_flags, arc_flags, zb); 2660 rw_exit(&pbuf->b_lock); 2661 2662 return (err); 2663 } 2664 2665 int 2666 arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, 2667 arc_done_func_t *done, void *private, int priority, int zio_flags, 2668 uint32_t *arc_flags, const zbookmark_t *zb) 2669 { 2670 arc_buf_hdr_t *hdr; 2671 arc_buf_t *buf; 2672 kmutex_t *hash_lock; 2673 zio_t *rzio; 2674 uint64_t guid = spa_guid(spa); 2675 2676 top: 2677 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 2678 &hash_lock); 2679 if (hdr && hdr->b_datacnt > 0) { 2680 2681 *arc_flags |= ARC_CACHED; 2682 2683 if (HDR_IO_IN_PROGRESS(hdr)) { 2684 2685 if (*arc_flags & ARC_WAIT) { 2686 cv_wait(&hdr->b_cv, hash_lock); 2687 mutex_exit(hash_lock); 2688 goto top; 2689 } 2690 ASSERT(*arc_flags & ARC_NOWAIT); 2691 2692 if (done) { 2693 arc_callback_t *acb = NULL; 2694 2695 acb = kmem_zalloc(sizeof (arc_callback_t), 2696 KM_SLEEP); 2697 acb->acb_done = done; 2698 acb->acb_private = private; 2699 if (pio != NULL) 2700 acb->acb_zio_dummy = zio_null(pio, 2701 spa, NULL, NULL, NULL, zio_flags); 2702 2703 ASSERT(acb->acb_done != NULL); 2704 acb->acb_next = hdr->b_acb; 2705 hdr->b_acb = acb; 2706 add_reference(hdr, hash_lock, private); 2707 mutex_exit(hash_lock); 2708 return (0); 2709 } 2710 mutex_exit(hash_lock); 2711 return (0); 2712 } 2713 2714 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2715 2716 if (done) { 2717 add_reference(hdr, hash_lock, private); 2718 /* 2719 * If this block is already in use, create a new 2720 * copy of the data so that we will be guaranteed 2721 * that arc_release() will always succeed. 
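 *
 * (Concretely: if ARC_BUF_AVAILABLE is still set we hand out the
 * existing buffer and clear the flag; otherwise this caller gets
 * its own clone, so a later arc_release() by any one consumer only
 * affects that consumer's arc_buf_t.)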
2722 */ 2723 buf = hdr->b_buf; 2724 ASSERT(buf); 2725 ASSERT(buf->b_data); 2726 if (HDR_BUF_AVAILABLE(hdr)) { 2727 ASSERT(buf->b_efunc == NULL); 2728 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2729 } else { 2730 buf = arc_buf_clone(buf); 2731 } 2732 2733 } else if (*arc_flags & ARC_PREFETCH && 2734 refcount_count(&hdr->b_refcnt) == 0) { 2735 hdr->b_flags |= ARC_PREFETCH; 2736 } 2737 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2738 arc_access(hdr, hash_lock); 2739 if (*arc_flags & ARC_L2CACHE) 2740 hdr->b_flags |= ARC_L2CACHE; 2741 mutex_exit(hash_lock); 2742 ARCSTAT_BUMP(arcstat_hits); 2743 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2744 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2745 data, metadata, hits); 2746 2747 if (done) 2748 done(NULL, buf, private); 2749 } else { 2750 uint64_t size = BP_GET_LSIZE(bp); 2751 arc_callback_t *acb; 2752 vdev_t *vd = NULL; 2753 uint64_t addr; 2754 boolean_t devw = B_FALSE; 2755 2756 if (hdr == NULL) { 2757 /* this block is not in the cache */ 2758 arc_buf_hdr_t *exists; 2759 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2760 buf = arc_buf_alloc(spa, size, private, type); 2761 hdr = buf->b_hdr; 2762 hdr->b_dva = *BP_IDENTITY(bp); 2763 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 2764 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2765 exists = buf_hash_insert(hdr, &hash_lock); 2766 if (exists) { 2767 /* somebody beat us to the hash insert */ 2768 mutex_exit(hash_lock); 2769 bzero(&hdr->b_dva, sizeof (dva_t)); 2770 hdr->b_birth = 0; 2771 hdr->b_cksum0 = 0; 2772 (void) arc_buf_remove_ref(buf, private); 2773 goto top; /* restart the IO request */ 2774 } 2775 /* if this is a prefetch, we don't have a reference */ 2776 if (*arc_flags & ARC_PREFETCH) { 2777 (void) remove_reference(hdr, hash_lock, 2778 private); 2779 hdr->b_flags |= ARC_PREFETCH; 2780 } 2781 if (*arc_flags & ARC_L2CACHE) 2782 hdr->b_flags |= ARC_L2CACHE; 2783 if (BP_GET_LEVEL(bp) > 0) 2784 hdr->b_flags |= ARC_INDIRECT; 2785 } else { 2786 /* this block is in the ghost cache */ 2787 ASSERT(GHOST_STATE(hdr->b_state)); 2788 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2789 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2790 ASSERT(hdr->b_buf == NULL); 2791 2792 /* if this is a prefetch, we don't have a reference */ 2793 if (*arc_flags & ARC_PREFETCH) 2794 hdr->b_flags |= ARC_PREFETCH; 2795 else 2796 add_reference(hdr, hash_lock, private); 2797 if (*arc_flags & ARC_L2CACHE) 2798 hdr->b_flags |= ARC_L2CACHE; 2799 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2800 buf->b_hdr = hdr; 2801 buf->b_data = NULL; 2802 buf->b_efunc = NULL; 2803 buf->b_private = NULL; 2804 buf->b_next = NULL; 2805 hdr->b_buf = buf; 2806 arc_get_data_buf(buf); 2807 ASSERT(hdr->b_datacnt == 0); 2808 hdr->b_datacnt = 1; 2809 } 2810 2811 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2812 acb->acb_done = done; 2813 acb->acb_private = private; 2814 2815 ASSERT(hdr->b_acb == NULL); 2816 hdr->b_acb = acb; 2817 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2818 2819 /* 2820 * If the buffer has been evicted, migrate it to a present state 2821 * before issuing the I/O. Once we drop the hash-table lock, 2822 * the header will be marked as I/O in progress and have an 2823 * attached buffer. At this point, anybody who finds this 2824 * buffer ought to notice that it's legit but has a pending I/O. 
2825 */ 2826 2827 if (GHOST_STATE(hdr->b_state)) 2828 arc_access(hdr, hash_lock); 2829 2830 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 2831 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 2832 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 2833 addr = hdr->b_l2hdr->b_daddr; 2834 /* 2835 * Lock out device removal. 2836 */ 2837 if (vdev_is_dead(vd) || 2838 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 2839 vd = NULL; 2840 } 2841 2842 mutex_exit(hash_lock); 2843 2844 ASSERT3U(hdr->b_size, ==, size); 2845 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 2846 uint64_t, size, zbookmark_t *, zb); 2847 ARCSTAT_BUMP(arcstat_misses); 2848 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2849 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2850 data, metadata, misses); 2851 2852 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 2853 /* 2854 * Read from the L2ARC if the following are true: 2855 * 1. The L2ARC vdev was previously cached. 2856 * 2. This buffer still has L2ARC metadata. 2857 * 3. This buffer isn't currently writing to the L2ARC. 2858 * 4. The L2ARC entry wasn't evicted, which may 2859 * also have invalidated the vdev. 2860 * 5. This isn't prefetch and l2arc_noprefetch is set. 2861 */ 2862 if (hdr->b_l2hdr != NULL && 2863 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 2864 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 2865 l2arc_read_callback_t *cb; 2866 2867 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 2868 ARCSTAT_BUMP(arcstat_l2_hits); 2869 2870 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 2871 KM_SLEEP); 2872 cb->l2rcb_buf = buf; 2873 cb->l2rcb_spa = spa; 2874 cb->l2rcb_bp = *bp; 2875 cb->l2rcb_zb = *zb; 2876 cb->l2rcb_flags = zio_flags; 2877 2878 /* 2879 * l2arc read. The SCL_L2ARC lock will be 2880 * released by l2arc_read_done(). 
2881 */ 2882 rzio = zio_read_phys(pio, vd, addr, size, 2883 buf->b_data, ZIO_CHECKSUM_OFF, 2884 l2arc_read_done, cb, priority, zio_flags | 2885 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 2886 ZIO_FLAG_DONT_PROPAGATE | 2887 ZIO_FLAG_DONT_RETRY, B_FALSE); 2888 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 2889 zio_t *, rzio); 2890 ARCSTAT_INCR(arcstat_l2_read_bytes, size); 2891 2892 if (*arc_flags & ARC_NOWAIT) { 2893 zio_nowait(rzio); 2894 return (0); 2895 } 2896 2897 ASSERT(*arc_flags & ARC_WAIT); 2898 if (zio_wait(rzio) == 0) 2899 return (0); 2900 2901 /* l2arc read error; goto zio_read() */ 2902 } else { 2903 DTRACE_PROBE1(l2arc__miss, 2904 arc_buf_hdr_t *, hdr); 2905 ARCSTAT_BUMP(arcstat_l2_misses); 2906 if (HDR_L2_WRITING(hdr)) 2907 ARCSTAT_BUMP(arcstat_l2_rw_clash); 2908 spa_config_exit(spa, SCL_L2ARC, vd); 2909 } 2910 } else { 2911 if (vd != NULL) 2912 spa_config_exit(spa, SCL_L2ARC, vd); 2913 if (l2arc_ndev != 0) { 2914 DTRACE_PROBE1(l2arc__miss, 2915 arc_buf_hdr_t *, hdr); 2916 ARCSTAT_BUMP(arcstat_l2_misses); 2917 } 2918 } 2919 2920 rzio = zio_read(pio, spa, bp, buf->b_data, size, 2921 arc_read_done, buf, priority, zio_flags, zb); 2922 2923 if (*arc_flags & ARC_WAIT) 2924 return (zio_wait(rzio)); 2925 2926 ASSERT(*arc_flags & ARC_NOWAIT); 2927 zio_nowait(rzio); 2928 } 2929 return (0); 2930 } 2931 2932 void 2933 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2934 { 2935 ASSERT(buf->b_hdr != NULL); 2936 ASSERT(buf->b_hdr->b_state != arc_anon); 2937 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2938 ASSERT(buf->b_efunc == NULL); 2939 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 2940 2941 buf->b_efunc = func; 2942 buf->b_private = private; 2943 } 2944 2945 /* 2946 * This is used by the DMU to let the ARC know that a buffer is 2947 * being evicted, so the ARC should clean up. If this arc buf 2948 * is not yet in the evicted state, it will be put there. 2949 */ 2950 int 2951 arc_buf_evict(arc_buf_t *buf) 2952 { 2953 arc_buf_hdr_t *hdr; 2954 kmutex_t *hash_lock; 2955 arc_buf_t **bufp; 2956 2957 rw_enter(&buf->b_lock, RW_WRITER); 2958 hdr = buf->b_hdr; 2959 if (hdr == NULL) { 2960 /* 2961 * We are in arc_do_user_evicts(). 2962 */ 2963 ASSERT(buf->b_data == NULL); 2964 rw_exit(&buf->b_lock); 2965 return (0); 2966 } else if (buf->b_data == NULL) { 2967 arc_buf_t copy = *buf; /* structure assignment */ 2968 /* 2969 * We are on the eviction list; process this buffer now 2970 * but let arc_do_user_evicts() do the reaping. 2971 */ 2972 buf->b_efunc = NULL; 2973 rw_exit(&buf->b_lock); 2974 VERIFY(copy.b_efunc(&copy) == 0); 2975 return (1); 2976 } 2977 hash_lock = HDR_LOCK(hdr); 2978 mutex_enter(hash_lock); 2979 2980 ASSERT(buf->b_hdr == hdr); 2981 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2982 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2983 2984 /* 2985 * Pull this buffer off of the hdr 2986 */ 2987 bufp = &hdr->b_buf; 2988 while (*bufp != buf) 2989 bufp = &(*bufp)->b_next; 2990 *bufp = buf->b_next; 2991 2992 ASSERT(buf->b_data != NULL); 2993 arc_buf_destroy(buf, FALSE, FALSE); 2994 2995 if (hdr->b_datacnt == 0) { 2996 arc_state_t *old_state = hdr->b_state; 2997 arc_state_t *evicted_state; 2998 2999 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 3000 3001 evicted_state = 3002 (old_state == arc_mru) ?
arc_mru_ghost : arc_mfu_ghost; 3003 3004 mutex_enter(&old_state->arcs_mtx); 3005 mutex_enter(&evicted_state->arcs_mtx); 3006 3007 arc_change_state(evicted_state, hdr, hash_lock); 3008 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3009 hdr->b_flags |= ARC_IN_HASH_TABLE; 3010 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3011 3012 mutex_exit(&evicted_state->arcs_mtx); 3013 mutex_exit(&old_state->arcs_mtx); 3014 } 3015 mutex_exit(hash_lock); 3016 rw_exit(&buf->b_lock); 3017 3018 VERIFY(buf->b_efunc(buf) == 0); 3019 buf->b_efunc = NULL; 3020 buf->b_private = NULL; 3021 buf->b_hdr = NULL; 3022 kmem_cache_free(buf_cache, buf); 3023 return (1); 3024 } 3025 3026 /* 3027 * Release this buffer from the cache. This must be done 3028 * after a read and prior to modifying the buffer contents. 3029 * If the buffer has more than one reference, we must make 3030 * a new hdr for the buffer. 3031 */ 3032 void 3033 arc_release(arc_buf_t *buf, void *tag) 3034 { 3035 arc_buf_hdr_t *hdr; 3036 kmutex_t *hash_lock; 3037 l2arc_buf_hdr_t *l2hdr; 3038 uint64_t buf_size; 3039 boolean_t released = B_FALSE; 3040 3041 rw_enter(&buf->b_lock, RW_WRITER); 3042 hdr = buf->b_hdr; 3043 3044 /* this buffer is not on any list */ 3045 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 3046 3047 if (hdr->b_state == arc_anon) { 3048 /* this buffer is already released */ 3049 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 3050 ASSERT(BUF_EMPTY(hdr)); 3051 ASSERT(buf->b_efunc == NULL); 3052 arc_buf_thaw(buf); 3053 rw_exit(&buf->b_lock); 3054 released = B_TRUE; 3055 } else { 3056 hash_lock = HDR_LOCK(hdr); 3057 mutex_enter(hash_lock); 3058 } 3059 3060 l2hdr = hdr->b_l2hdr; 3061 if (l2hdr) { 3062 mutex_enter(&l2arc_buflist_mtx); 3063 hdr->b_l2hdr = NULL; 3064 buf_size = hdr->b_size; 3065 } 3066 3067 if (released) 3068 goto out; 3069 3070 /* 3071 * Do we have more than one buf? 3072 */ 3073 if (hdr->b_datacnt > 1) { 3074 arc_buf_hdr_t *nhdr; 3075 arc_buf_t **bufp; 3076 uint64_t blksz = hdr->b_size; 3077 uint64_t spa = hdr->b_spa; 3078 arc_buf_contents_t type = hdr->b_type; 3079 uint32_t flags = hdr->b_flags; 3080 3081 ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 3082 /* 3083 * Pull the data off of this buf and attach it to 3084 * a new anonymous buf. 
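 *
 * (Sketch of the steps below: drop this caller's reference on the
 * shared header, unlink this buf from the header's buffer chain,
 * fix up the old state's size accounting, then allocate a fresh
 * header in the arc_anon state that owns only this buf, so the
 * caller can modify the data without disturbing the other readers.)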
3085 */ 3086 (void) remove_reference(hdr, hash_lock, tag); 3087 bufp = &hdr->b_buf; 3088 while (*bufp != buf) 3089 bufp = &(*bufp)->b_next; 3090 *bufp = (*bufp)->b_next; 3091 buf->b_next = NULL; 3092 3093 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3094 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3095 if (refcount_is_zero(&hdr->b_refcnt)) { 3096 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3097 ASSERT3U(*size, >=, hdr->b_size); 3098 atomic_add_64(size, -hdr->b_size); 3099 } 3100 hdr->b_datacnt -= 1; 3101 arc_cksum_verify(buf); 3102 3103 mutex_exit(hash_lock); 3104 3105 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3106 nhdr->b_size = blksz; 3107 nhdr->b_spa = spa; 3108 nhdr->b_type = type; 3109 nhdr->b_buf = buf; 3110 nhdr->b_state = arc_anon; 3111 nhdr->b_arc_access = 0; 3112 nhdr->b_flags = flags & ARC_L2_WRITING; 3113 nhdr->b_l2hdr = NULL; 3114 nhdr->b_datacnt = 1; 3115 nhdr->b_freeze_cksum = NULL; 3116 (void) refcount_add(&nhdr->b_refcnt, tag); 3117 buf->b_hdr = nhdr; 3118 rw_exit(&buf->b_lock); 3119 atomic_add_64(&arc_anon->arcs_size, blksz); 3120 } else { 3121 rw_exit(&buf->b_lock); 3122 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3123 ASSERT(!list_link_active(&hdr->b_arc_node)); 3124 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3125 arc_change_state(arc_anon, hdr, hash_lock); 3126 hdr->b_arc_access = 0; 3127 mutex_exit(hash_lock); 3128 3129 bzero(&hdr->b_dva, sizeof (dva_t)); 3130 hdr->b_birth = 0; 3131 hdr->b_cksum0 = 0; 3132 arc_buf_thaw(buf); 3133 } 3134 buf->b_efunc = NULL; 3135 buf->b_private = NULL; 3136 3137 out: 3138 if (l2hdr) { 3139 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3140 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3141 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3142 mutex_exit(&l2arc_buflist_mtx); 3143 } 3144 } 3145 3146 int 3147 arc_released(arc_buf_t *buf) 3148 { 3149 int released; 3150 3151 rw_enter(&buf->b_lock, RW_READER); 3152 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3153 rw_exit(&buf->b_lock); 3154 return (released); 3155 } 3156 3157 int 3158 arc_has_callback(arc_buf_t *buf) 3159 { 3160 int callback; 3161 3162 rw_enter(&buf->b_lock, RW_READER); 3163 callback = (buf->b_efunc != NULL); 3164 rw_exit(&buf->b_lock); 3165 return (callback); 3166 } 3167 3168 #ifdef ZFS_DEBUG 3169 int 3170 arc_referenced(arc_buf_t *buf) 3171 { 3172 int referenced; 3173 3174 rw_enter(&buf->b_lock, RW_READER); 3175 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3176 rw_exit(&buf->b_lock); 3177 return (referenced); 3178 } 3179 #endif 3180 3181 static void 3182 arc_write_ready(zio_t *zio) 3183 { 3184 arc_write_callback_t *callback = zio->io_private; 3185 arc_buf_t *buf = callback->awcb_buf; 3186 arc_buf_hdr_t *hdr = buf->b_hdr; 3187 3188 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3189 callback->awcb_ready(zio, buf, callback->awcb_private); 3190 3191 /* 3192 * If the IO is already in progress, then this is a re-write 3193 * attempt, so we need to thaw and re-compute the cksum. 3194 * It is the responsibility of the callback to handle the 3195 * accounting for any re-write attempt. 
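 *
 * (Informally: this case arises when the zio pipeline issues the
 * write for this buffer more than once, e.g. an overwrite for
 * sync-to-convergence as handled in arc_write_done() below.
 * Discarding the stale freeze checksum lets arc_cksum_compute()
 * record a fresh value for the data actually being written.)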
3196 */ 3197 if (HDR_IO_IN_PROGRESS(hdr)) { 3198 mutex_enter(&hdr->b_freeze_lock); 3199 if (hdr->b_freeze_cksum != NULL) { 3200 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3201 hdr->b_freeze_cksum = NULL; 3202 } 3203 mutex_exit(&hdr->b_freeze_lock); 3204 } 3205 arc_cksum_compute(buf, B_FALSE); 3206 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3207 } 3208 3209 static void 3210 arc_write_done(zio_t *zio) 3211 { 3212 arc_write_callback_t *callback = zio->io_private; 3213 arc_buf_t *buf = callback->awcb_buf; 3214 arc_buf_hdr_t *hdr = buf->b_hdr; 3215 3216 ASSERT(hdr->b_acb == NULL); 3217 3218 if (zio->io_error == 0) { 3219 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3220 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3221 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3222 } else { 3223 ASSERT(BUF_EMPTY(hdr)); 3224 } 3225 3226 /* 3227 * If the block to be written was all-zero, we may have 3228 * compressed it away. In this case no write was performed 3229 * so there will be no dva/birth-date/checksum. The buffer 3230 * must therefor remain anonymous (and uncached). 3231 */ 3232 if (!BUF_EMPTY(hdr)) { 3233 arc_buf_hdr_t *exists; 3234 kmutex_t *hash_lock; 3235 3236 ASSERT(zio->io_error == 0); 3237 3238 arc_cksum_verify(buf); 3239 3240 exists = buf_hash_insert(hdr, &hash_lock); 3241 if (exists) { 3242 /* 3243 * This can only happen if we overwrite for 3244 * sync-to-convergence, because we remove 3245 * buffers from the hash table when we arc_free(). 3246 */ 3247 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3248 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3249 panic("bad overwrite, hdr=%p exists=%p", 3250 (void *)hdr, (void *)exists); 3251 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3252 arc_change_state(arc_anon, exists, hash_lock); 3253 mutex_exit(hash_lock); 3254 arc_hdr_destroy(exists); 3255 exists = buf_hash_insert(hdr, &hash_lock); 3256 ASSERT3P(exists, ==, NULL); 3257 } else { 3258 /* Dedup */ 3259 ASSERT(hdr->b_datacnt == 1); 3260 ASSERT(hdr->b_state == arc_anon); 3261 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3262 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3263 } 3264 } 3265 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3266 /* if it's not anon, we are doing a scrub */ 3267 if (!exists && hdr->b_state == arc_anon) 3268 arc_access(hdr, hash_lock); 3269 mutex_exit(hash_lock); 3270 } else { 3271 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3272 } 3273 3274 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3275 callback->awcb_done(zio, buf, callback->awcb_private); 3276 3277 kmem_free(callback, sizeof (arc_write_callback_t)); 3278 } 3279 3280 zio_t * 3281 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 3282 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, 3283 arc_done_func_t *ready, arc_done_func_t *done, void *private, 3284 int priority, int zio_flags, const zbookmark_t *zb) 3285 { 3286 arc_buf_hdr_t *hdr = buf->b_hdr; 3287 arc_write_callback_t *callback; 3288 zio_t *zio; 3289 3290 ASSERT(ready != NULL); 3291 ASSERT(done != NULL); 3292 ASSERT(!HDR_IO_ERROR(hdr)); 3293 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3294 ASSERT(hdr->b_acb == NULL); 3295 if (l2arc) 3296 hdr->b_flags |= ARC_L2CACHE; 3297 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3298 callback->awcb_ready = ready; 3299 callback->awcb_done = done; 3300 callback->awcb_private = private; 3301 callback->awcb_buf = buf; 3302 3303 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3304 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3305 3306 return (zio); 3307 } 3308 3309 void 3310 
arc_free(spa_t *spa, const blkptr_t *bp) 3311 { 3312 arc_buf_hdr_t *ab; 3313 kmutex_t *hash_lock; 3314 uint64_t guid = spa_guid(spa); 3315 3316 /* 3317 * If this buffer is in the cache, release it, so it can be re-used. 3318 */ 3319 ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 3320 &hash_lock); 3321 if (ab != NULL) { 3322 if (ab->b_state != arc_anon) 3323 arc_change_state(arc_anon, ab, hash_lock); 3324 if (HDR_IO_IN_PROGRESS(ab)) { 3325 /* 3326 * This should only happen when we prefetch. 3327 */ 3328 ASSERT(ab->b_flags & ARC_PREFETCH); 3329 ASSERT3U(ab->b_datacnt, ==, 1); 3330 ab->b_flags |= ARC_FREED_IN_READ; 3331 if (HDR_IN_HASH_TABLE(ab)) 3332 buf_hash_remove(ab); 3333 ab->b_arc_access = 0; 3334 bzero(&ab->b_dva, sizeof (dva_t)); 3335 ab->b_birth = 0; 3336 ab->b_cksum0 = 0; 3337 ab->b_buf->b_efunc = NULL; 3338 ab->b_buf->b_private = NULL; 3339 mutex_exit(hash_lock); 3340 } else { 3341 ASSERT(refcount_is_zero(&ab->b_refcnt)); 3342 ab->b_flags |= ARC_FREE_IN_PROGRESS; 3343 mutex_exit(hash_lock); 3344 arc_hdr_destroy(ab); 3345 ARCSTAT_BUMP(arcstat_deleted); 3346 } 3347 } 3348 } 3349 3350 static int 3351 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 3352 { 3353 #ifdef _KERNEL 3354 uint64_t available_memory = ptob(freemem); 3355 static uint64_t page_load = 0; 3356 static uint64_t last_txg = 0; 3357 3358 available_memory = 3359 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3360 if (available_memory >= zfs_write_limit_max) 3361 return (0); 3362 3363 if (txg > last_txg) { 3364 last_txg = txg; 3365 page_load = 0; 3366 } 3367 /* 3368 * If we are in pageout, we know that memory is already tight, 3369 * the arc is already going to be evicting, so we just want to 3370 * continue to let page writes occur as quickly as possible. 3371 */ 3372 if (curproc == proc_pageout) { 3373 if (page_load > MAX(ptob(minfree), available_memory) / 4) 3374 return (ERESTART); 3375 /* Note: reserve is inflated, so we deflate */ 3376 page_load += reserve / 8; 3377 return (0); 3378 } else if (page_load > 0 && arc_reclaim_needed()) { 3379 /* memory is low, delay before restarting */ 3380 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3381 return (EAGAIN); 3382 } 3383 page_load = 0; 3384 3385 if (arc_size > arc_c_min) { 3386 uint64_t evictable_memory = 3387 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3388 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3389 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3390 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3391 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3392 } 3393 3394 if (inflight_data > available_memory / 4) { 3395 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3396 return (ERESTART); 3397 } 3398 #endif 3399 return (0); 3400 } 3401 3402 void 3403 arc_tempreserve_clear(uint64_t reserve) 3404 { 3405 atomic_add_64(&arc_tempreserve, -reserve); 3406 ASSERT((int64_t)arc_tempreserve >= 0); 3407 } 3408 3409 int 3410 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3411 { 3412 int error; 3413 uint64_t anon_size; 3414 3415 #ifdef ZFS_DEBUG 3416 /* 3417 * Once in a while, fail for no reason. Everything should cope. 
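 *
 * (The expectation is that callers treat an ERESTART from this
 * function as a transient, retryable condition, which is what makes
 * injecting an occasional spurious failure here safe.)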
3418 */ 3419 if (spa_get_random(10000) == 0) { 3420 dprintf("forcing random failure\n"); 3421 return (ERESTART); 3422 } 3423 #endif 3424 if (reserve > arc_c/4 && !arc_no_grow) 3425 arc_c = MIN(arc_c_max, reserve * 4); 3426 if (reserve > arc_c) 3427 return (ENOMEM); 3428 3429 /* 3430 * Don't count loaned bufs as in flight dirty data to prevent long 3431 * network delays from blocking transactions that are ready to be 3432 * assigned to a txg. 3433 */ 3434 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3435 3436 /* 3437 * Writes will, almost always, require additional memory allocations 3438 * in order to compress/encrypt/etc the data. We therefor need to 3439 * make sure that there is sufficient available memory for this. 3440 */ 3441 if (error = arc_memory_throttle(reserve, anon_size, txg)) 3442 return (error); 3443 3444 /* 3445 * Throttle writes when the amount of dirty data in the cache 3446 * gets too large. We try to keep the cache less than half full 3447 * of dirty blocks so that our sync times don't grow too large. 3448 * Note: if two requests come in concurrently, we might let them 3449 * both succeed, when one of them should fail. Not a huge deal. 3450 */ 3451 3452 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3453 anon_size > arc_c / 4) { 3454 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3455 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3456 arc_tempreserve>>10, 3457 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3458 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3459 reserve>>10, arc_c>>10); 3460 return (ERESTART); 3461 } 3462 atomic_add_64(&arc_tempreserve, reserve); 3463 return (0); 3464 } 3465 3466 #if defined(__NetBSD__) && defined(_KERNEL) 3467 /* Reclaim hook registered to uvm for reclaiming KVM and memory */ 3468 static void 3469 arc_uvm_reclaim_hook(void) 3470 { 3471 3472 if (mutex_tryenter(&arc_reclaim_thr_lock)) { 3473 cv_broadcast(&arc_reclaim_thr_cv); 3474 mutex_exit(&arc_reclaim_thr_lock); 3475 } 3476 } 3477 3478 static int 3479 arc_kva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg) 3480 { 3481 3482 3483 if (mutex_tryenter(&arc_reclaim_thr_lock)) { 3484 cv_broadcast(&arc_reclaim_thr_cv); 3485 mutex_exit(&arc_reclaim_thr_lock); 3486 } 3487 3488 return CALLBACK_CHAIN_CONTINUE; 3489 } 3490 3491 #endif /* __NetBSD__ */ 3492 3493 void 3494 arc_init(void) 3495 { 3496 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3497 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3498 3499 /* Convert seconds to clock ticks */ 3500 arc_min_prefetch_lifespan = 1 * hz; 3501 3502 /* Start out with 1/8 of all memory */ 3503 arc_c = physmem * PAGESIZE / 8; 3504 3505 #ifdef _KERNEL 3506 /* 3507 * On architectures where the physical memory can be larger 3508 * than the addressable space (intel in 32-bit mode), we may 3509 * need to limit the cache to 1/8 of VM size. 3510 */ 3511 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3512 #endif 3513 3514 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 3515 arc_c_min = MAX(arc_c / 4, 64<<20); 3516 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 3517 if (arc_c * 8 >= 1<<30) 3518 arc_c_max = (arc_c * 8) - (1<<30); 3519 else 3520 arc_c_max = arc_c_min; 3521 arc_c_max = MAX(arc_c * 6, arc_c_max); 3522 3523 /* 3524 * Allow the tunables to override our calculations if they are 3525 * reasonable (ie. 
over 64MB) 3526 */ 3527 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 3528 arc_c_max = zfs_arc_max; 3529 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 3530 arc_c_min = zfs_arc_min; 3531 3532 arc_c = arc_c_max; 3533 arc_p = (arc_c >> 1); 3534 3535 /* limit meta-data to 1/4 of the arc capacity */ 3536 arc_meta_limit = arc_c_max / 4; 3537 3538 /* Allow the tunable to override if it is reasonable */ 3539 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3540 arc_meta_limit = zfs_arc_meta_limit; 3541 3542 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3543 arc_c_min = arc_meta_limit / 2; 3544 3545 if (zfs_arc_grow_retry > 0) 3546 arc_grow_retry = zfs_arc_grow_retry; 3547 3548 if (zfs_arc_shrink_shift > 0) 3549 arc_shrink_shift = zfs_arc_shrink_shift; 3550 3551 if (zfs_arc_p_min_shift > 0) 3552 arc_p_min_shift = zfs_arc_p_min_shift; 3553 3554 /* if kmem_flags are set, lets try to use less memory */ 3555 if (kmem_debugging()) 3556 arc_c = arc_c / 2; 3557 if (arc_c < arc_c_min) 3558 arc_c = arc_c_min; 3559 3560 arc_anon = &ARC_anon; 3561 arc_mru = &ARC_mru; 3562 arc_mru_ghost = &ARC_mru_ghost; 3563 arc_mfu = &ARC_mfu; 3564 arc_mfu_ghost = &ARC_mfu_ghost; 3565 arc_l2c_only = &ARC_l2c_only; 3566 arc_size = 0; 3567 3568 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3569 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3570 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3571 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3572 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3573 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3574 3575 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 3576 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3577 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 3578 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3579 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 3580 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3581 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 3582 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3583 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 3584 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3585 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 3586 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3587 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 3588 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3589 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 3590 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3591 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 3592 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3593 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 3594 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3595 3596 buf_init(); 3597 3598 arc_thread_exit = 0; 3599 arc_eviction_list = NULL; 3600 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3601 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3602 3603 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3604 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3605 3606 if (arc_ksp != NULL) { 3607 arc_ksp->ks_data = &arc_stats; 3608 kstat_install(arc_ksp); 3609 } 3610 3611 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3612 TS_RUN, maxclsyspri); 3613 3614 #if defined(__NetBSD__) && defined(_KERNEL) 3615 
arc_hook.uvm_reclaim_hook = &arc_uvm_reclaim_hook; 3616 3617 uvm_reclaim_hook_add(&arc_hook); 3618 callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, 3619 &arc_kva_reclaim_entry, NULL, arc_kva_reclaim_callback); 3620 3621 #endif 3622 3623 arc_dead = FALSE; 3624 arc_warm = B_FALSE; 3625 3626 if (zfs_write_limit_max == 0) 3627 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 3628 else 3629 zfs_write_limit_shift = 0; 3630 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 3631 } 3632 3633 void 3634 arc_fini(void) 3635 { 3636 mutex_enter(&arc_reclaim_thr_lock); 3637 arc_thread_exit = 1; 3638 while (arc_thread_exit != 0) 3639 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 3640 mutex_exit(&arc_reclaim_thr_lock); 3641 3642 arc_flush(NULL); 3643 3644 arc_dead = TRUE; 3645 3646 if (arc_ksp != NULL) { 3647 kstat_delete(arc_ksp); 3648 arc_ksp = NULL; 3649 } 3650 3651 mutex_destroy(&arc_eviction_mtx); 3652 mutex_destroy(&arc_reclaim_thr_lock); 3653 cv_destroy(&arc_reclaim_thr_cv); 3654 3655 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 3656 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 3657 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 3658 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 3659 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 3660 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 3661 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 3662 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 3663 3664 mutex_destroy(&arc_anon->arcs_mtx); 3665 mutex_destroy(&arc_mru->arcs_mtx); 3666 mutex_destroy(&arc_mru_ghost->arcs_mtx); 3667 mutex_destroy(&arc_mfu->arcs_mtx); 3668 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 3669 mutex_destroy(&arc_l2c_only->arcs_mtx); 3670 3671 mutex_destroy(&zfs_write_limit_lock); 3672 3673 #if defined(__NetBSD__) && defined(_KERNEL) 3674 uvm_reclaim_hook_del(&arc_hook); 3675 callback_unregister(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, 3676 &arc_kva_reclaim_entry); 3677 #endif 3678 3679 buf_fini(); 3680 3681 ASSERT(arc_loaned_bytes == 0); 3682 } 3683 3684 /* 3685 * Level 2 ARC 3686 * 3687 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 3688 * It uses dedicated storage devices to hold cached data, which are populated 3689 * using large infrequent writes. The main role of this cache is to boost 3690 * the performance of random read workloads. The intended L2ARC devices 3691 * include short-stroked disks, solid state disks, and other media with 3692 * substantially faster read latency than disk. 3693 * 3694 * +-----------------------+ 3695 * | ARC | 3696 * +-----------------------+ 3697 * | ^ ^ 3698 * | | | 3699 * l2arc_feed_thread() arc_read() 3700 * | | | 3701 * | l2arc read | 3702 * V | | 3703 * +---------------+ | 3704 * | L2ARC | | 3705 * +---------------+ | 3706 * | ^ | 3707 * l2arc_write() | | 3708 * | | | 3709 * V | | 3710 * +-------+ +-------+ 3711 * | vdev | | vdev | 3712 * | cache | | cache | 3713 * +-------+ +-------+ 3714 * +=========+ .-----. 3715 * : L2ARC : |-_____-| 3716 * : devices : | Disks | 3717 * +=========+ `-_____-' 3718 * 3719 * Read requests are satisfied from the following sources, in order: 3720 * 3721 * 1) ARC 3722 * 2) vdev cache of L2ARC devices 3723 * 3) L2ARC devices 3724 * 4) vdev cache of disks 3725 * 5) disks 3726 * 3727 * Some L2ARC device types exhibit extremely slow write performance. 
3728 * To accommodate for this there are some significant differences between 3729 * the L2ARC and traditional cache design: 3730 * 3731 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 3732 * the ARC behave as usual, freeing buffers and placing headers on ghost 3733 * lists. The ARC does not send buffers to the L2ARC during eviction as 3734 * this would add inflated write latencies for all ARC memory pressure. 3735 * 3736 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 3737 * It does this by periodically scanning buffers from the eviction-end of 3738 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 3739 * not already there. It scans until a headroom of buffers is satisfied, 3740 * which itself is a buffer for ARC eviction. The thread that does this is 3741 * l2arc_feed_thread(), illustrated below; example sizes are included to 3742 * provide a better sense of ratio than this diagram: 3743 * 3744 * head --> tail 3745 * +---------------------+----------+ 3746 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 3747 * +---------------------+----------+ | o L2ARC eligible 3748 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 3749 * +---------------------+----------+ | 3750 * 15.9 Gbytes ^ 32 Mbytes | 3751 * headroom | 3752 * l2arc_feed_thread() 3753 * | 3754 * l2arc write hand <--[oooo]--' 3755 * | 8 Mbyte 3756 * | write max 3757 * V 3758 * +==============================+ 3759 * L2ARC dev |####|#|###|###| |####| ... | 3760 * +==============================+ 3761 * 32 Gbytes 3762 * 3763 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 3764 * evicted, then the L2ARC has cached a buffer much sooner than it probably 3765 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 3766 * safe to say that this is an uncommon case, since buffers at the end of 3767 * the ARC lists have moved there due to inactivity. 3768 * 3769 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 3770 * then the L2ARC simply misses copying some buffers. This serves as a 3771 * pressure valve to prevent heavy read workloads from both stalling the ARC 3772 * with waits and clogging the L2ARC with writes. This also helps prevent 3773 * the potential for the L2ARC to churn if it attempts to cache content too 3774 * quickly, such as during backups of the entire pool. 3775 * 3776 * 5. After system boot and before the ARC has filled main memory, there are 3777 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 3778 * lists can remain mostly static. Instead of searching from tail of these 3779 * lists as pictured, the l2arc_feed_thread() will search from the list heads 3780 * for eligible buffers, greatly increasing its chance of finding them. 3781 * 3782 * The L2ARC device write speed is also boosted during this time so that 3783 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 3784 * there are no L2ARC reads, and no fear of degrading read performance 3785 * through increased writes. 3786 * 3787 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 3788 * the vdev queue can aggregate them into larger and fewer writes. Each 3789 * device is written to in a rotor fashion, sweeping writes through 3790 * available space then repeating. 3791 * 3792 * 7. The L2ARC does not store dirty content. It never needs to flush 3793 * write buffers back to disk based storage. 3794 * 3795 * 8. 
If an ARC buffer is written (and dirtied) which also exists in the 3796 * L2ARC, the now stale L2ARC buffer is immediately dropped. 3797 * 3798 * The performance of the L2ARC can be tweaked by a number of tunables, which 3799 * may be necessary for different workloads: 3800 * 3801 * l2arc_write_max max write bytes per interval 3802 * l2arc_write_boost extra write bytes during device warmup 3803 * l2arc_noprefetch skip caching prefetched buffers 3804 * l2arc_headroom number of max device writes to precache 3805 * l2arc_feed_secs seconds between L2ARC writing 3806 * 3807 * Tunables may be removed or added as future performance improvements are 3808 * integrated, and also may become zpool properties. 3809 * 3810 * There are three key functions that control how the L2ARC warms up: 3811 * 3812 * l2arc_write_eligible() check if a buffer is eligible to cache 3813 * l2arc_write_size() calculate how much to write 3814 * l2arc_write_interval() calculate sleep delay between writes 3815 * 3816 * These three functions determine what to write, how much, and how quickly 3817 * to send writes. 3818 */ 3819 3820 static boolean_t 3821 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 3822 { 3823 /* 3824 * A buffer is *not* eligible for the L2ARC if it: 3825 * 1. belongs to a different spa. 3826 * 2. is already cached on the L2ARC. 3827 * 3. has an I/O in progress (it may be an incomplete read). 3828 * 4. is flagged not eligible (zfs property). 3829 */ 3830 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || 3831 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) 3832 return (B_FALSE); 3833 3834 return (B_TRUE); 3835 } 3836 3837 static uint64_t 3838 l2arc_write_size(l2arc_dev_t *dev) 3839 { 3840 uint64_t size; 3841 3842 size = dev->l2ad_write; 3843 3844 if (arc_warm == B_FALSE) 3845 size += dev->l2ad_boost; 3846 3847 return (size); 3848 3849 } 3850 3851 static clock_t 3852 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 3853 { 3854 clock_t interval, next, now; 3855 3856 /* 3857 * If the ARC lists are busy, increase our write rate; if the 3858 * lists are stale, idle back. This is achieved by checking 3859 * how much we previously wrote - if it was more than half of 3860 * what we wanted, schedule the next write much sooner. 3861 */ 3862 if (l2arc_feed_again && wrote > (wanted / 2)) 3863 interval = (hz * l2arc_feed_min_ms) / 1000; 3864 else 3865 interval = hz * l2arc_feed_secs; 3866 3867 now = ddi_get_lbolt(); 3868 next = MAX(now, MIN(now + interval, began + interval)); 3869 3870 return (next); 3871 } 3872 3873 static void 3874 l2arc_hdr_stat_add(void) 3875 { 3876 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 3877 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 3878 } 3879 3880 static void 3881 l2arc_hdr_stat_remove(void) 3882 { 3883 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 3884 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 3885 } 3886 3887 /* 3888 * Cycle through L2ARC devices. This is how L2ARC load balances. 3889 * If a device is returned, this also returns holding the spa config lock. 3890 */ 3891 static l2arc_dev_t * 3892 l2arc_dev_get_next(void) 3893 { 3894 l2arc_dev_t *first, *next = NULL; 3895 3896 /* 3897 * Lock out the removal of spas (spa_namespace_lock), then removal 3898 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 3899 * both locks will be dropped and a spa config lock held instead. 
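	 *
	 * A caller-side sketch of the resulting contract (illustrative
	 * only, mirroring l2arc_feed_thread()):
	 *
	 *	if ((dev = l2arc_dev_get_next()) != NULL) {
	 *		size = l2arc_write_size(dev);
	 *		l2arc_evict(dev, size, B_FALSE);
	 *		(void) l2arc_write_buffers(dev->l2ad_spa, dev, size);
	 *		spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
	 *	}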
3900 */ 3901 mutex_enter(&spa_namespace_lock); 3902 mutex_enter(&l2arc_dev_mtx); 3903 3904 /* if there are no vdevs, there is nothing to do */ 3905 if (l2arc_ndev == 0) 3906 goto out; 3907 3908 first = NULL; 3909 next = l2arc_dev_last; 3910 do { 3911 /* loop around the list looking for a non-faulted vdev */ 3912 if (next == NULL) { 3913 next = list_head(l2arc_dev_list); 3914 } else { 3915 next = list_next(l2arc_dev_list, next); 3916 if (next == NULL) 3917 next = list_head(l2arc_dev_list); 3918 } 3919 3920 /* if we have come back to the start, bail out */ 3921 if (first == NULL) 3922 first = next; 3923 else if (next == first) 3924 break; 3925 3926 } while (vdev_is_dead(next->l2ad_vdev)); 3927 3928 /* if we were unable to find any usable vdevs, return NULL */ 3929 if (vdev_is_dead(next->l2ad_vdev)) 3930 next = NULL; 3931 3932 l2arc_dev_last = next; 3933 3934 out: 3935 mutex_exit(&l2arc_dev_mtx); 3936 3937 /* 3938 * Grab the config lock to prevent the 'next' device from being 3939 * removed while we are writing to it. 3940 */ 3941 if (next != NULL) 3942 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 3943 mutex_exit(&spa_namespace_lock); 3944 3945 return (next); 3946 } 3947 3948 /* 3949 * Free buffers that were tagged for destruction. 3950 */ 3951 static void 3952 l2arc_do_free_on_write() 3953 { 3954 list_t *buflist; 3955 l2arc_data_free_t *df, *df_prev; 3956 3957 mutex_enter(&l2arc_free_on_write_mtx); 3958 buflist = l2arc_free_on_write; 3959 3960 for (df = list_tail(buflist); df; df = df_prev) { 3961 df_prev = list_prev(buflist, df); 3962 ASSERT(df->l2df_data != NULL); 3963 ASSERT(df->l2df_func != NULL); 3964 df->l2df_func(df->l2df_data, df->l2df_size); 3965 list_remove(buflist, df); 3966 kmem_free(df, sizeof (l2arc_data_free_t)); 3967 } 3968 3969 mutex_exit(&l2arc_free_on_write_mtx); 3970 } 3971 3972 /* 3973 * A write to a cache device has completed. Update all headers to allow 3974 * reads from these buffers to begin. 3975 */ 3976 static void 3977 l2arc_write_done(zio_t *zio) 3978 { 3979 l2arc_write_callback_t *cb; 3980 l2arc_dev_t *dev; 3981 list_t *buflist; 3982 arc_buf_hdr_t *head, *ab, *ab_prev; 3983 l2arc_buf_hdr_t *abl2; 3984 kmutex_t *hash_lock; 3985 3986 cb = zio->io_private; 3987 ASSERT(cb != NULL); 3988 dev = cb->l2wcb_dev; 3989 ASSERT(dev != NULL); 3990 head = cb->l2wcb_head; 3991 ASSERT(head != NULL); 3992 buflist = dev->l2ad_buflist; 3993 ASSERT(buflist != NULL); 3994 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 3995 l2arc_write_callback_t *, cb); 3996 3997 if (zio->io_error != 0) 3998 ARCSTAT_BUMP(arcstat_l2_writes_error); 3999 4000 mutex_enter(&l2arc_buflist_mtx); 4001 4002 /* 4003 * All writes completed, or an error was hit. 4004 */ 4005 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4006 ab_prev = list_prev(buflist, ab); 4007 4008 hash_lock = HDR_LOCK(ab); 4009 if (!mutex_tryenter(hash_lock)) { 4010 /* 4011 * This buffer misses out. It may be in a stage 4012 * of eviction. Its ARC_L2_WRITING flag will be 4013 * left set, denying reads to this buffer. 4014 */ 4015 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4016 continue; 4017 } 4018 4019 if (zio->io_error != 0) { 4020 /* 4021 * Error - drop L2ARC entry. 4022 */ 4023 list_remove(buflist, ab); 4024 abl2 = ab->b_l2hdr; 4025 ab->b_l2hdr = NULL; 4026 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4027 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4028 } 4029 4030 /* 4031 * Allow ARC to begin reads to this L2ARC entry. 
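		 * The ARC_L2_WRITING flag was set in l2arc_write_buffers()
		 * to keep readers away while the write was in flight;
		 * clearing it is what makes the new L2ARC copy visible.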
4032 */ 4033 ab->b_flags &= ~ARC_L2_WRITING; 4034 4035 mutex_exit(hash_lock); 4036 } 4037 4038 atomic_inc_64(&l2arc_writes_done); 4039 list_remove(buflist, head); 4040 kmem_cache_free(hdr_cache, head); 4041 mutex_exit(&l2arc_buflist_mtx); 4042 4043 l2arc_do_free_on_write(); 4044 4045 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4046 } 4047 4048 /* 4049 * A read to a cache device completed. Validate buffer contents before 4050 * handing over to the regular ARC routines. 4051 */ 4052 static void 4053 l2arc_read_done(zio_t *zio) 4054 { 4055 l2arc_read_callback_t *cb; 4056 arc_buf_hdr_t *hdr; 4057 arc_buf_t *buf; 4058 kmutex_t *hash_lock; 4059 int equal; 4060 4061 ASSERT(zio->io_vd != NULL); 4062 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4063 4064 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4065 4066 cb = zio->io_private; 4067 ASSERT(cb != NULL); 4068 buf = cb->l2rcb_buf; 4069 ASSERT(buf != NULL); 4070 hdr = buf->b_hdr; 4071 ASSERT(hdr != NULL); 4072 4073 hash_lock = HDR_LOCK(hdr); 4074 mutex_enter(hash_lock); 4075 4076 /* 4077 * Check this survived the L2ARC journey. 4078 */ 4079 equal = arc_cksum_equal(buf); 4080 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4081 mutex_exit(hash_lock); 4082 zio->io_private = buf; 4083 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4084 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4085 arc_read_done(zio); 4086 } else { 4087 mutex_exit(hash_lock); 4088 /* 4089 * Buffer didn't survive caching. Increment stats and 4090 * reissue to the original storage device. 4091 */ 4092 if (zio->io_error != 0) { 4093 ARCSTAT_BUMP(arcstat_l2_io_error); 4094 } else { 4095 zio->io_error = EIO; 4096 } 4097 if (!equal) 4098 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4099 4100 /* 4101 * If there's no waiter, issue an async i/o to the primary 4102 * storage now. If there *is* a waiter, the caller must 4103 * issue the i/o in a context where it's OK to block. 4104 */ 4105 if (zio->io_waiter == NULL) { 4106 zio_t *pio = zio_unique_parent(zio); 4107 4108 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 4109 4110 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4111 buf->b_data, zio->io_size, arc_read_done, buf, 4112 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4113 } 4114 } 4115 4116 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4117 } 4118 4119 /* 4120 * This is the list priority from which the L2ARC will search for pages to 4121 * cache. This is used within loops (0..3) to cycle through lists in the 4122 * desired order. This order can have a significant effect on cache 4123 * performance. 4124 * 4125 * Currently the metadata lists are hit first, MFU then MRU, followed by 4126 * the data lists. This function returns a locked list, and also returns 4127 * the lock pointer. 
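 *
 * A typical caller cycles through all four lists and drops the lock
 * when it is done with each one; roughly (see l2arc_write_buffers()):
 *
 *	for (try = 0; try <= 3; try++) {
 *		list = l2arc_list_locked(try, &list_lock);
 *		... scan list for eligible buffers ...
 *		mutex_exit(list_lock);
 *	}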
4128 */ 4129 static list_t * 4130 l2arc_list_locked(int list_num, kmutex_t **lock) 4131 { 4132 list_t *list; 4133 4134 ASSERT(list_num >= 0 && list_num <= 3); 4135 4136 switch (list_num) { 4137 case 0: 4138 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 4139 *lock = &arc_mfu->arcs_mtx; 4140 break; 4141 case 1: 4142 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 4143 *lock = &arc_mru->arcs_mtx; 4144 break; 4145 case 2: 4146 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 4147 *lock = &arc_mfu->arcs_mtx; 4148 break; 4149 case 3: 4150 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 4151 *lock = &arc_mru->arcs_mtx; 4152 break; 4153 } 4154 4155 ASSERT(!(MUTEX_HELD(*lock))); 4156 mutex_enter(*lock); 4157 return (list); 4158 } 4159 4160 /* 4161 * Evict buffers from the device write hand to the distance specified in 4162 * bytes. This distance may span populated buffers, it may span nothing. 4163 * This is clearing a region on the L2ARC device ready for writing. 4164 * If the 'all' boolean is set, every buffer is evicted. 4165 */ 4166 static void 4167 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 4168 { 4169 list_t *buflist; 4170 l2arc_buf_hdr_t *abl2; 4171 arc_buf_hdr_t *ab, *ab_prev; 4172 kmutex_t *hash_lock; 4173 uint64_t taddr; 4174 4175 buflist = dev->l2ad_buflist; 4176 4177 if (buflist == NULL) 4178 return; 4179 4180 if (!all && dev->l2ad_first) { 4181 /* 4182 * This is the first sweep through the device. There is 4183 * nothing to evict. 4184 */ 4185 return; 4186 } 4187 4188 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4189 /* 4190 * When nearing the end of the device, evict to the end 4191 * before the device write hand jumps to the start. 4192 */ 4193 taddr = dev->l2ad_end; 4194 } else { 4195 taddr = dev->l2ad_hand + distance; 4196 } 4197 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 4198 uint64_t, taddr, boolean_t, all); 4199 4200 top: 4201 mutex_enter(&l2arc_buflist_mtx); 4202 for (ab = list_tail(buflist); ab; ab = ab_prev) { 4203 ab_prev = list_prev(buflist, ab); 4204 4205 hash_lock = HDR_LOCK(ab); 4206 if (!mutex_tryenter(hash_lock)) { 4207 /* 4208 * Missed the hash lock. Retry. 4209 */ 4210 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4211 mutex_exit(&l2arc_buflist_mtx); 4212 mutex_enter(hash_lock); 4213 mutex_exit(hash_lock); 4214 goto top; 4215 } 4216 4217 if (HDR_L2_WRITE_HEAD(ab)) { 4218 /* 4219 * We hit a write head node. Leave it for 4220 * l2arc_write_done(). 4221 */ 4222 list_remove(buflist, ab); 4223 mutex_exit(hash_lock); 4224 continue; 4225 } 4226 4227 if (!all && ab->b_l2hdr != NULL && 4228 (ab->b_l2hdr->b_daddr > taddr || 4229 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4230 /* 4231 * We've evicted to the target address, 4232 * or the end of the device. 4233 */ 4234 mutex_exit(hash_lock); 4235 break; 4236 } 4237 4238 if (HDR_FREE_IN_PROGRESS(ab)) { 4239 /* 4240 * Already on the path to destruction. 4241 */ 4242 mutex_exit(hash_lock); 4243 continue; 4244 } 4245 4246 if (ab->b_state == arc_l2c_only) { 4247 ASSERT(!HDR_L2_READING(ab)); 4248 /* 4249 * This doesn't exist in the ARC. Destroy. 4250 * arc_hdr_destroy() will call list_remove() 4251 * and decrement arcstat_l2_size. 4252 */ 4253 arc_change_state(arc_anon, ab, hash_lock); 4254 arc_hdr_destroy(ab); 4255 } else { 4256 /* 4257 * Invalidate issued or about to be issued 4258 * reads, since we may be about to write 4259 * over this location. 
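			 * Setting ARC_L2_EVICTED below makes
			 * l2arc_read_done() treat any such read as failed,
			 * so the data is re-read from the original pool
			 * devices instead.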
4260 */ 4261 if (HDR_L2_READING(ab)) { 4262 ARCSTAT_BUMP(arcstat_l2_evict_reading); 4263 ab->b_flags |= ARC_L2_EVICTED; 4264 } 4265 4266 /* 4267 * Tell ARC this no longer exists in L2ARC. 4268 */ 4269 if (ab->b_l2hdr != NULL) { 4270 abl2 = ab->b_l2hdr; 4271 ab->b_l2hdr = NULL; 4272 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4273 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4274 } 4275 list_remove(buflist, ab); 4276 4277 /* 4278 * This may have been leftover after a 4279 * failed write. 4280 */ 4281 ab->b_flags &= ~ARC_L2_WRITING; 4282 } 4283 mutex_exit(hash_lock); 4284 } 4285 mutex_exit(&l2arc_buflist_mtx); 4286 4287 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); 4288 dev->l2ad_evict = taddr; 4289 } 4290 4291 /* 4292 * Find and write ARC buffers to the L2ARC device. 4293 * 4294 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4295 * for reading until they have completed writing. 4296 */ 4297 static uint64_t 4298 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 4299 { 4300 arc_buf_hdr_t *ab, *ab_prev, *head; 4301 l2arc_buf_hdr_t *hdrl2; 4302 list_t *list; 4303 uint64_t passed_sz, write_sz, buf_sz, headroom; 4304 void *buf_data; 4305 kmutex_t *hash_lock, *list_lock; 4306 boolean_t have_lock, full; 4307 l2arc_write_callback_t *cb; 4308 zio_t *pio, *wzio; 4309 uint64_t guid = spa_guid(spa); 4310 4311 ASSERT(dev->l2ad_vdev != NULL); 4312 4313 pio = NULL; 4314 write_sz = 0; 4315 full = B_FALSE; 4316 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4317 head->b_flags |= ARC_L2_WRITE_HEAD; 4318 4319 /* 4320 * Copy buffers for L2ARC writing. 4321 */ 4322 mutex_enter(&l2arc_buflist_mtx); 4323 for (int try = 0; try <= 3; try++) { 4324 list = l2arc_list_locked(try, &list_lock); 4325 passed_sz = 0; 4326 4327 /* 4328 * L2ARC fast warmup. 4329 * 4330 * Until the ARC is warm and starts to evict, read from the 4331 * head of the ARC lists rather than the tail. 4332 */ 4333 headroom = target_sz * l2arc_headroom; 4334 if (arc_warm == B_FALSE) 4335 ab = list_head(list); 4336 else 4337 ab = list_tail(list); 4338 4339 for (; ab; ab = ab_prev) { 4340 if (arc_warm == B_FALSE) 4341 ab_prev = list_next(list, ab); 4342 else 4343 ab_prev = list_prev(list, ab); 4344 4345 hash_lock = HDR_LOCK(ab); 4346 have_lock = MUTEX_HELD(hash_lock); 4347 if (!have_lock && !mutex_tryenter(hash_lock)) { 4348 /* 4349 * Skip this buffer rather than waiting. 4350 */ 4351 continue; 4352 } 4353 4354 passed_sz += ab->b_size; 4355 if (passed_sz > headroom) { 4356 /* 4357 * Searched too far. 4358 */ 4359 mutex_exit(hash_lock); 4360 break; 4361 } 4362 4363 if (!l2arc_write_eligible(guid, ab)) { 4364 mutex_exit(hash_lock); 4365 continue; 4366 } 4367 4368 if ((write_sz + ab->b_size) > target_sz) { 4369 full = B_TRUE; 4370 mutex_exit(hash_lock); 4371 break; 4372 } 4373 4374 if (pio == NULL) { 4375 /* 4376 * Insert a dummy header on the buflist so 4377 * l2arc_write_done() can find where the 4378 * write buffers begin without searching. 4379 */ 4380 list_insert_head(dev->l2ad_buflist, head); 4381 4382 cb = kmem_alloc( 4383 sizeof (l2arc_write_callback_t), KM_SLEEP); 4384 cb->l2wcb_dev = dev; 4385 cb->l2wcb_head = head; 4386 pio = zio_root(spa, l2arc_write_done, cb, 4387 ZIO_FLAG_CANFAIL); 4388 } 4389 4390 /* 4391 * Create and add a new L2ARC header. 
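			 * The header records which cache device holds the
			 * copy and the byte offset (the current write hand)
			 * at which it is about to be written.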
4392 */ 4393 hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4394 hdrl2->b_dev = dev; 4395 hdrl2->b_daddr = dev->l2ad_hand; 4396 4397 ab->b_flags |= ARC_L2_WRITING; 4398 ab->b_l2hdr = hdrl2; 4399 list_insert_head(dev->l2ad_buflist, ab); 4400 buf_data = ab->b_buf->b_data; 4401 buf_sz = ab->b_size; 4402 4403 /* 4404 * Compute and store the buffer cksum before 4405 * writing. On debug the cksum is verified first. 4406 */ 4407 arc_cksum_verify(ab->b_buf); 4408 arc_cksum_compute(ab->b_buf, B_TRUE); 4409 4410 mutex_exit(hash_lock); 4411 4412 wzio = zio_write_phys(pio, dev->l2ad_vdev, 4413 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4414 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4415 ZIO_FLAG_CANFAIL, B_FALSE); 4416 4417 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 4418 zio_t *, wzio); 4419 (void) zio_nowait(wzio); 4420 4421 /* 4422 * Keep the clock hand suitably device-aligned. 4423 */ 4424 buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 4425 4426 write_sz += buf_sz; 4427 dev->l2ad_hand += buf_sz; 4428 } 4429 4430 mutex_exit(list_lock); 4431 4432 if (full == B_TRUE) 4433 break; 4434 } 4435 mutex_exit(&l2arc_buflist_mtx); 4436 4437 if (pio == NULL) { 4438 ASSERT3U(write_sz, ==, 0); 4439 kmem_cache_free(hdr_cache, head); 4440 return (0); 4441 } 4442 4443 ASSERT3U(write_sz, <=, target_sz); 4444 ARCSTAT_BUMP(arcstat_l2_writes_sent); 4445 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); 4446 ARCSTAT_INCR(arcstat_l2_size, write_sz); 4447 vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); 4448 4449 /* 4450 * Bump device hand to the device start if it is approaching the end. 4451 * l2arc_evict() will already have evicted ahead for this case. 4452 */ 4453 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4454 vdev_space_update(dev->l2ad_vdev, 4455 dev->l2ad_end - dev->l2ad_hand, 0, 0); 4456 dev->l2ad_hand = dev->l2ad_start; 4457 dev->l2ad_evict = dev->l2ad_start; 4458 dev->l2ad_first = B_FALSE; 4459 } 4460 4461 dev->l2ad_writing = B_TRUE; 4462 (void) zio_wait(pio); 4463 dev->l2ad_writing = B_FALSE; 4464 4465 return (write_sz); 4466 } 4467 4468 /* 4469 * This thread feeds the L2ARC at regular intervals. This is the beating 4470 * heart of the L2ARC. 4471 */ 4472 static void 4473 l2arc_feed_thread(void) 4474 { 4475 callb_cpr_t cpr; 4476 l2arc_dev_t *dev; 4477 spa_t *spa; 4478 uint64_t size, wrote; 4479 clock_t begin, next = ddi_get_lbolt(); 4480 4481 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4482 4483 mutex_enter(&l2arc_feed_thr_lock); 4484 4485 while (l2arc_thread_exit == 0) { 4486 CALLB_CPR_SAFE_BEGIN(&cpr); 4487 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4488 (hz * l2arc_feed_secs)); 4489 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4490 next = ddi_get_lbolt() + hz; 4491 4492 /* 4493 * Quick check for L2ARC devices. 4494 */ 4495 mutex_enter(&l2arc_dev_mtx); 4496 if (l2arc_ndev == 0) { 4497 mutex_exit(&l2arc_dev_mtx); 4498 continue; 4499 } 4500 mutex_exit(&l2arc_dev_mtx); 4501 begin = ddi_get_lbolt(); 4502 4503 /* 4504 * This selects the next l2arc device to write to, and in 4505 * doing so the next spa to feed from: dev->l2ad_spa. This 4506 * will return NULL if there are now no l2arc devices or if 4507 * they are all faulted. 4508 * 4509 * If a device is returned, its spa's config lock is also 4510 * held to prevent device removal. l2arc_dev_get_next() 4511 * will grab and release l2arc_dev_mtx. 
4512 */ 4513 if ((dev = l2arc_dev_get_next()) == NULL) 4514 continue; 4515 4516 spa = dev->l2ad_spa; 4517 ASSERT(spa != NULL); 4518 4519 /* 4520 * Avoid contributing to memory pressure. 4521 */ 4522 if (arc_reclaim_needed()) { 4523 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4524 spa_config_exit(spa, SCL_L2ARC, dev); 4525 continue; 4526 } 4527 4528 ARCSTAT_BUMP(arcstat_l2_feeds); 4529 4530 size = l2arc_write_size(dev); 4531 4532 /* 4533 * Evict L2ARC buffers that will be overwritten. 4534 */ 4535 l2arc_evict(dev, size, B_FALSE); 4536 4537 /* 4538 * Write ARC buffers. 4539 */ 4540 wrote = l2arc_write_buffers(spa, dev, size); 4541 4542 /* 4543 * Calculate interval between writes. 4544 */ 4545 next = l2arc_write_interval(begin, size, wrote); 4546 spa_config_exit(spa, SCL_L2ARC, dev); 4547 } 4548 4549 l2arc_thread_exit = 0; 4550 cv_broadcast(&l2arc_feed_thr_cv); 4551 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 4552 thread_exit(); 4553 } 4554 4555 boolean_t 4556 l2arc_vdev_present(vdev_t *vd) 4557 { 4558 l2arc_dev_t *dev; 4559 4560 mutex_enter(&l2arc_dev_mtx); 4561 for (dev = list_head(l2arc_dev_list); dev != NULL; 4562 dev = list_next(l2arc_dev_list, dev)) { 4563 if (dev->l2ad_vdev == vd) 4564 break; 4565 } 4566 mutex_exit(&l2arc_dev_mtx); 4567 4568 return (dev != NULL); 4569 } 4570 4571 /* 4572 * Add a vdev for use by the L2ARC. By this point the spa has already 4573 * validated the vdev and opened it. 4574 */ 4575 void 4576 l2arc_add_vdev(spa_t *spa, vdev_t *vd) 4577 { 4578 l2arc_dev_t *adddev; 4579 4580 ASSERT(!l2arc_vdev_present(vd)); 4581 4582 /* 4583 * Create a new l2arc device entry. 4584 */ 4585 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 4586 adddev->l2ad_spa = spa; 4587 adddev->l2ad_vdev = vd; 4588 adddev->l2ad_write = l2arc_write_max; 4589 adddev->l2ad_boost = l2arc_write_boost; 4590 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 4591 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 4592 adddev->l2ad_hand = adddev->l2ad_start; 4593 adddev->l2ad_evict = adddev->l2ad_start; 4594 adddev->l2ad_first = B_TRUE; 4595 adddev->l2ad_writing = B_FALSE; 4596 ASSERT3U(adddev->l2ad_write, >, 0); 4597 4598 /* 4599 * This is a list of all ARC buffers that are still valid on the 4600 * device. 4601 */ 4602 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 4603 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 4604 offsetof(arc_buf_hdr_t, b_l2node)); 4605 4606 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 4607 4608 /* 4609 * Add device to global list 4610 */ 4611 mutex_enter(&l2arc_dev_mtx); 4612 list_insert_head(l2arc_dev_list, adddev); 4613 atomic_inc_64(&l2arc_ndev); 4614 mutex_exit(&l2arc_dev_mtx); 4615 } 4616 4617 /* 4618 * Remove a vdev from the L2ARC. 4619 */ 4620 void 4621 l2arc_remove_vdev(vdev_t *vd) 4622 { 4623 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 4624 4625 /* 4626 * Find the device by vdev 4627 */ 4628 mutex_enter(&l2arc_dev_mtx); 4629 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 4630 nextdev = list_next(l2arc_dev_list, dev); 4631 if (vd == dev->l2ad_vdev) { 4632 remdev = dev; 4633 break; 4634 } 4635 } 4636 ASSERT(remdev != NULL); 4637 4638 /* 4639 * Remove device from global list 4640 */ 4641 list_remove(l2arc_dev_list, remdev); 4642 l2arc_dev_last = NULL; /* may have been invalidated */ 4643 atomic_dec_64(&l2arc_ndev); 4644 mutex_exit(&l2arc_dev_mtx); 4645 4646 /* 4647 * Clear all buflists and ARC references. L2ARC device flush. 
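	 * Passing B_TRUE for 'all' makes l2arc_evict() drop every buffer
	 * on the device regardless of the write hand position.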
4648 */ 4649 l2arc_evict(remdev, 0, B_TRUE); 4650 list_destroy(remdev->l2ad_buflist); 4651 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 4652 kmem_free(remdev, sizeof (l2arc_dev_t)); 4653 } 4654 4655 void 4656 l2arc_init(void) 4657 { 4658 l2arc_thread_exit = 0; 4659 l2arc_ndev = 0; 4660 l2arc_writes_sent = 0; 4661 l2arc_writes_done = 0; 4662 4663 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4664 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 4665 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 4666 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 4667 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 4668 4669 l2arc_dev_list = &L2ARC_dev_list; 4670 l2arc_free_on_write = &L2ARC_free_on_write; 4671 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 4672 offsetof(l2arc_dev_t, l2ad_node)); 4673 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 4674 offsetof(l2arc_data_free_t, l2df_list_node)); 4675 } 4676 4677 void 4678 l2arc_fini(void) 4679 { 4680 /* 4681 * This is called from dmu_fini(), which is called from spa_fini(); 4682 * Because of this, we can assume that all l2arc devices have 4683 * already been removed when the pools themselves were removed. 4684 */ 4685 4686 l2arc_do_free_on_write(); 4687 4688 mutex_destroy(&l2arc_feed_thr_lock); 4689 cv_destroy(&l2arc_feed_thr_cv); 4690 mutex_destroy(&l2arc_dev_mtx); 4691 mutex_destroy(&l2arc_buflist_mtx); 4692 mutex_destroy(&l2arc_free_on_write_mtx); 4693 4694 list_destroy(l2arc_dev_list); 4695 list_destroy(l2arc_free_on_write); 4696 } 4697 4698 void 4699 l2arc_start(void) 4700 { 4701 if (!(spa_mode_global & FWRITE)) 4702 return; 4703 4704 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 4705 TS_RUN, minclsyspri); 4706 } 4707 4708 void 4709 l2arc_stop(void) 4710 { 4711 if (!(spa_mode_global & FWRITE)) 4712 return; 4713 4714 mutex_enter(&l2arc_feed_thr_lock); 4715 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 4716 l2arc_thread_exit = 1; 4717 while (l2arc_thread_exit != 0) 4718 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 4719 mutex_exit(&l2arc_feed_thr_lock); 4720 } 4721
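/*
 * A worked example of one feed cycle, as driven by l2arc_feed_thread()
 * above (illustrative only; the 8 Mbyte figure is taken from the write
 * max shown in the L2ARC diagram, and the warmup boost is assumed to be
 * the same size):
 *
 *	size = l2arc_write_size(dev)
 *	     = l2ad_write + l2ad_boost = 8 MB + 8 MB	before ARC warmup
 *	     = l2ad_write              = 8 MB		once arc_warm is set
 *
 *	l2arc_evict(dev, size, B_FALSE)		clear 'size' bytes ahead of
 *						the write hand
 *	wrote = l2arc_write_buffers(spa, dev, size)
 *	next = l2arc_write_interval(begin, size, wrote)
 *
 * If more than half of 'size' was written (and l2arc_feed_again is set),
 * the lists are considered busy and the next feed is scheduled after
 * l2arc_feed_min_ms; otherwise the thread idles for l2arc_feed_secs.
 */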