/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/spa_boot.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
#include <sys/zfeature.h>

#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

/*
 * SPA locking
 *
 * There are four basic locks for managing spa_t structures:
 *
 * spa_namespace_lock (global mutex)
 *
 *	This lock must be acquired to do any of the following:
 *
 *	- Lookup a spa_t by name
 *	- Add or remove a spa_t from the namespace
 *	- Increase spa_refcount from zero
 *	- Check if spa_refcount is zero
 *	- Rename a spa_t
 *	- add/remove/attach/detach devices
 *	- Held for the duration of create/destroy/import/export
 *
 *	It does not need to handle recursion.  A create or destroy may
 *	reference objects (files or zvols) in other pools, but by
 *	definition they must have an existing reference, and will never need
 *	to lookup a spa_t by name.
 *
 * spa_refcount (per-spa refcount_t protected by mutex)
 *
 *	This reference count keeps track of any active users of the spa_t.
 *	The spa_t cannot be destroyed or freed while this is non-zero.
 *	Internally, the refcount is never really 'zero' - opening a pool
 *	implicitly keeps some references in the DMU.  Internally we check
 *	against spa_minref, but present the image of a zero/non-zero value
 *	to consumers.
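 *
 *	As a minimal illustrative sketch (not lifted from any real caller,
 *	and using a hypothetical pool name "tank"), the rules above combine
 *	into a pattern like the following; FTAG is the usual tag macro used
 *	throughout this file:
 *
 *		mutex_enter(&spa_namespace_lock);
 *		spa_t *spa = spa_lookup("tank");
 *		if (spa != NULL)
 *			spa_open_ref(spa, FTAG);
 *		mutex_exit(&spa_namespace_lock);
 *		...
 *		use the spa_t; it cannot be freed while the reference is held
 *		...
 *		spa_close(spa, FTAG);	(no locking required to drop it)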
 *
 * spa_config_lock[] (per-spa array of rwlocks)
 *
 *	This protects the spa_t from config changes, and must be held in
 *	the following circumstances:
 *
 *	- RW_READER to perform I/O to the spa
 *	- RW_WRITER to change the vdev config
 *
 * The locking order is fairly straightforward:
 *
 *		spa_namespace_lock	->	spa_refcount
 *
 *	The namespace lock must be acquired to increase the refcount from 0
 *	or to check if it is zero.
 *
 *		spa_refcount		->	spa_config_lock[]
 *
 *	There must be at least one valid reference on the spa_t to acquire
 *	the config lock.
 *
 *		spa_namespace_lock	->	spa_config_lock[]
 *
 *	The namespace lock must always be taken before the config lock.
 *
 *
 * The spa_namespace_lock can be acquired directly and is globally visible.
 *
 * The namespace is manipulated using the following functions, all of which
 * require the spa_namespace_lock to be held.
 *
 *	spa_lookup()		Lookup a spa_t by name.
 *
 *	spa_add()		Create a new spa_t in the namespace.
 *
 *	spa_remove()		Remove a spa_t from the namespace.  This also
 *				frees up any memory associated with the spa_t.
 *
 *	spa_next()		Returns the next spa_t in the system, or the
 *				first if NULL is passed.
 *
 *	spa_evict_all()		Shutdown and remove all spa_t structures in
 *				the system.
 *
 *	spa_guid_exists()	Determine whether a pool/device guid exists.
 *
 * The spa_refcount is manipulated using the following functions:
 *
 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
 *				called with spa_namespace_lock held if the
 *				refcount is currently zero.
 *
 *	spa_close()		Remove a reference from the spa_t.  This will
 *				not free the spa_t or remove it from the
 *				namespace.  No locking is required.
 *
 *	spa_refcount_zero()	Returns true if the refcount is currently
 *				zero.  Must be called with spa_namespace_lock
 *				held.
 *
 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 *
 * To read the configuration, it suffices to hold one of these locks as reader.
 * To modify the configuration, you must hold all locks as writer.  To modify
 * vdev state without altering the vdev tree's topology (e.g. online/offline),
 * you must hold SCL_STATE and SCL_ZIO as writer.
 *
 * We use these distinct config locks to avoid recursive lock entry.
 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 * block allocations (SCL_ALLOC), which may require reading space maps
 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 *
 * The spa config locks cannot be normal rwlocks because we need the
 * ability to hand off ownership.  For example, SCL_ZIO is acquired
 * by the issuing thread and later released by an interrupt thread.
 * They do, however, obey the usual write-wanted semantics to prevent
 * writer (i.e. system administrator) starvation.
 *
 * The lock acquisition rules are as follows:
 *
 * SCL_CONFIG
 *	Protects changes to the vdev tree topology, such as vdev
 *	add/remove/attach/detach.  Protects the dirty config list
 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
 *
 * SCL_STATE
 *	Protects changes to pool state and vdev state, such as vdev
 *	online/offline/fault/degrade/clear.
 *	Protects the dirty state list (spa_state_dirty_list) and global
 *	pool state (spa_state).
 *
 * SCL_ALLOC
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_alloc() and metaslab_claim().
 *
 * SCL_ZIO
 *	Held by bp-level zios (those which have no io_vd upon entry)
 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
 *
 * SCL_FREE
 *	Protects changes to metaslab groups and classes.
 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 *	blocks in zio_done() while another i/o that holds either
 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 *
 * SCL_VDEV
 *	Held as reader to prevent changes to the vdev tree during trivial
 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 *	other locks, and lower than all of them, to ensure that it's safe
 *	to acquire regardless of caller context.
 *
 * In addition, the following rules apply:
 *
 * (a)	spa_props_lock protects pool properties, spa_config and
 *	spa_config_list.  The lock ordering is SCL_CONFIG > spa_props_lock.
 *
 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 *	or zio_write_phys() -- the caller must ensure that the config cannot
 *	change in the interim, and that the vdev cannot be reopened.
 *	SCL_STATE as reader suffices for both.
 *
 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 *
 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
 *				for writing.
 *
 *	spa_vdev_exit()		Release the config lock, wait for all I/O
 *				to complete, sync the updated configs to the
 *				cache, and release the namespace lock.
 *
 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 * locking is always based on spa_namespace_lock and spa_config_lock[].
 *
 * spa_rename() is also implemented within this file since it requires
 * manipulation of the namespace.
 */

static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
int spa_max_replication_override = SPA_DVAS_PER_BP;

static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;

kmem_cache_t *spa_buffer_pool;
int spa_mode_global;

#ifdef ZFS_DEBUG
/* Everything except dprintf and spa is on by default in debug builds */
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
#else
int zfs_flags = 0;
#endif

/*
 * zfs_recover can be set to nonzero to attempt to recover from
 * otherwise-fatal errors, typically caused by on-disk corruption.  When
 * set, calls to zfs_panic_recover() will turn into warning messages.
 * This should only be used as a last resort, as it typically results
 * in leaked space, or worse.
 */
boolean_t zfs_recover = B_FALSE;

/*
 * If destroy encounters an EIO while reading metadata (e.g. indirect
 * blocks), space referenced by the missing metadata cannot be freed.
 * Normally this causes the background destroy to become "stalled", as
 * it is unable to make forward progress.  While in this stalled state,
 * all remaining space to free from the error-encountering filesystem is
 * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
 * permanently leak the space from indirect blocks that cannot be read,
 * and continue to free everything else that it can.
 *
 * The default, "stalling" behavior is useful if the storage partially
 * fails (i.e. some but not all i/os fail), and then later recovers.  In
 * this case, we will be able to continue pool operations while it is
 * partially failed, and when it recovers, we can continue to free the
 * space, with no leaks.  However, note that this case is actually
 * fairly rare.
 *
 * Typically pools either (a) fail completely (but perhaps temporarily,
 * e.g. a top-level vdev going offline), or (b) have localized,
 * permanent errors (e.g. disk returns the wrong data due to bit flip or
 * firmware bug).  In case (a), this setting does not matter because the
 * pool will be suspended and the sync thread will not be able to make
 * forward progress regardless.  In case (b), because the error is
 * permanent, the best we can do is leak the minimum amount of space,
 * which is what setting this flag will do.  Therefore, it is reasonable
 * for this flag to normally be set, but we chose the more conservative
 * approach of not setting it, so that there is no possibility of
 * leaking space in the "partial temporary" failure case.
 */
boolean_t zfs_free_leak_on_eio = B_FALSE;

/*
 * Expiration time in milliseconds.  This value has two meanings.  First it is
 * used to determine when the spa_deadman() logic should fire.  By default the
 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
 * Secondly, the value determines if an I/O is considered "hung".  Any I/O that
 * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
 * in a system panic.
 */
uint64_t zfs_deadman_synctime_ms = 1000000ULL;

/*
 * Check time in milliseconds.  This defines the frequency at which we check
 * for hung I/O.
 */
uint64_t zfs_deadman_checktime_ms = 5000ULL;

/*
 * Default value of -1 for zfs_deadman_enabled is resolved in
 * zfs_deadman_init().
 */
int zfs_deadman_enabled = -1;

/*
 * The worst case is single-sector max-parity RAID-Z blocks, in which
 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 * times the size; so just assume that.  Add to this the fact that
 * we can have up to 3 DVAs per bp, and one more factor of 2 because
 * the block may be dittoed with up to 3 DVAs by ddt_sync().
All together, 323 * the worst case is: 324 * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 325 */ 326 int spa_asize_inflation = 24; 327 328 #if defined(__FreeBSD__) && defined(_KERNEL) 329 SYSCTL_DECL(_vfs_zfs); 330 SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, 331 "Try to recover from otherwise-fatal errors."); 332 333 static int 334 sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) 335 { 336 int err, val; 337 338 val = zfs_flags; 339 err = sysctl_handle_int(oidp, &val, 0, req); 340 if (err != 0 || req->newptr == NULL) 341 return (err); 342 343 /* 344 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all 345 * arc buffers in the system have the necessary additional 346 * checksum data. However, it is safe to disable at any 347 * time. 348 */ 349 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 350 val &= ~ZFS_DEBUG_MODIFY; 351 zfs_flags = val; 352 353 return (0); 354 } 355 356 SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags, 357 CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), 358 sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); 359 360 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN, 361 &zfs_deadman_synctime_ms, 0, 362 "Stalled ZFS I/O expiration time in milliseconds"); 363 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN, 364 &zfs_deadman_checktime_ms, 0, 365 "Period of checks for stalled ZFS I/O in milliseconds"); 366 SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN, 367 &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); 368 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, 369 &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); 370 #endif 371 372 373 #ifdef __FreeBSD__ 374 #ifdef _KERNEL 375 static void 376 zfs_deadman_init(void) 377 { 378 /* 379 * If we are not i386 or amd64 or in a virtual machine, 380 * disable ZFS deadman thread by default 381 */ 382 if (zfs_deadman_enabled == -1) { 383 #if defined(__amd64__) || defined(__i386__) 384 zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; 385 #else 386 zfs_deadman_enabled = 0; 387 #endif 388 } 389 } 390 #endif /* _KERNEL */ 391 #endif /* __FreeBSD__ */ 392 393 #ifdef __NetBSD__ 394 #ifdef _HARDKERNEL 395 static struct workqueue *spa_workqueue; 396 397 static void spa_deadman(void *arg); 398 399 static void 400 spa_deadman_wq(struct work *wk, void *arg) 401 { 402 spa_t *spa = container_of(wk, struct spa, spa_deadman_work); 403 404 spa_deadman(spa); 405 } 406 407 static void 408 zfs_deadman_init(void) 409 { 410 int error; 411 412 error = workqueue_create(&spa_workqueue, "spa_deadman", 413 spa_deadman_wq, NULL, PRI_NONE, IPL_NONE, WQ_MPSAFE); 414 VERIFY0(error); 415 } 416 417 static void 418 zfs_deadman_fini(void) 419 { 420 workqueue_destroy(spa_workqueue); 421 spa_workqueue = NULL; 422 } 423 #else /* !_HARDKERNEL */ 424 #define zfs_deadman_init() /* nothing */ 425 #define zfs_deadman_fini() /* nothing */ 426 #endif /* !_HARDKERNEL */ 427 #endif /* __NetBSD__ */ 428 429 /* 430 * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in 431 * the pool to be consumed. This ensures that we don't run the pool 432 * completely out of space, due to unaccounted changes (e.g. to the MOS). 433 * It also limits the worst-case time to allocate space. If we have 434 * less than this amount of free space, most ZPL operations (e.g. write, 435 * create) will return ENOSPC. 436 * 437 * Certain operations (e.g. 
file removal, most administrative actions) can 438 * use half the slop space. They will only return ENOSPC if less than half 439 * the slop space is free. Typically, once the pool has less than the slop 440 * space free, the user will use these operations to free up space in the pool. 441 * These are the operations that call dsl_pool_adjustedsize() with the netfree 442 * argument set to TRUE. 443 * 444 * A very restricted set of operations are always permitted, regardless of 445 * the amount of free space. These are the operations that call 446 * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these 447 * operations result in a net increase in the amount of space used, 448 * it is possible to run the pool completely out of space, causing it to 449 * be permanently read-only. 450 * 451 * Note that on very small pools, the slop space will be larger than 452 * 3.2%, in an effort to have it be at least spa_min_slop (128MB), 453 * but we never allow it to be more than half the pool size. 454 * 455 * See also the comments in zfs_space_check_t. 456 */ 457 int spa_slop_shift = 5; 458 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN, 459 &spa_slop_shift, 0, 460 "Shift value of reserved space (1/(2^spa_slop_shift))."); 461 uint64_t spa_min_slop = 128 * 1024 * 1024; 462 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, 463 &spa_min_slop, 0, 464 "Minimal value of reserved space"); 465 466 /* 467 * ========================================================================== 468 * SPA config locking 469 * ========================================================================== 470 */ 471 static void 472 spa_config_lock_init(spa_t *spa) 473 { 474 for (int i = 0; i < SCL_LOCKS; i++) { 475 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 476 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); 477 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); 478 refcount_create_untracked(&scl->scl_count); 479 scl->scl_writer = NULL; 480 scl->scl_write_wanted = 0; 481 } 482 } 483 484 static void 485 spa_config_lock_destroy(spa_t *spa) 486 { 487 for (int i = 0; i < SCL_LOCKS; i++) { 488 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 489 mutex_destroy(&scl->scl_lock); 490 cv_destroy(&scl->scl_cv); 491 refcount_destroy(&scl->scl_count); 492 ASSERT(scl->scl_writer == NULL); 493 ASSERT(scl->scl_write_wanted == 0); 494 } 495 } 496 497 int 498 spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) 499 { 500 for (int i = 0; i < SCL_LOCKS; i++) { 501 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 502 if (!(locks & (1 << i))) 503 continue; 504 mutex_enter(&scl->scl_lock); 505 if (rw == RW_READER) { 506 if (scl->scl_writer || scl->scl_write_wanted) { 507 mutex_exit(&scl->scl_lock); 508 spa_config_exit(spa, locks & ((1 << i) - 1), 509 tag); 510 return (0); 511 } 512 } else { 513 ASSERT(scl->scl_writer != curthread); 514 if (!refcount_is_zero(&scl->scl_count)) { 515 mutex_exit(&scl->scl_lock); 516 spa_config_exit(spa, locks & ((1 << i) - 1), 517 tag); 518 return (0); 519 } 520 scl->scl_writer = curthread; 521 } 522 (void) refcount_add(&scl->scl_count, tag); 523 mutex_exit(&scl->scl_lock); 524 } 525 return (1); 526 } 527 528 void 529 spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) 530 { 531 int wlocks_held = 0; 532 533 ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); 534 535 for (int i = 0; i < SCL_LOCKS; i++) { 536 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 537 if (scl->scl_writer == curthread) 538 wlocks_held |= (1 << i); 539 if (!(locks & (1 << i))) 
540 continue; 541 mutex_enter(&scl->scl_lock); 542 if (rw == RW_READER) { 543 while (scl->scl_writer || scl->scl_write_wanted) { 544 cv_wait(&scl->scl_cv, &scl->scl_lock); 545 } 546 } else { 547 ASSERT(scl->scl_writer != curthread); 548 while (!refcount_is_zero(&scl->scl_count)) { 549 scl->scl_write_wanted++; 550 cv_wait(&scl->scl_cv, &scl->scl_lock); 551 scl->scl_write_wanted--; 552 } 553 scl->scl_writer = curthread; 554 } 555 (void) refcount_add(&scl->scl_count, tag); 556 mutex_exit(&scl->scl_lock); 557 } 558 ASSERT(wlocks_held <= locks); 559 } 560 561 void 562 spa_config_exit(spa_t *spa, int locks, void *tag) 563 { 564 for (int i = SCL_LOCKS - 1; i >= 0; i--) { 565 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 566 if (!(locks & (1 << i))) 567 continue; 568 mutex_enter(&scl->scl_lock); 569 ASSERT(!refcount_is_zero(&scl->scl_count)); 570 if (refcount_remove(&scl->scl_count, tag) == 0) { 571 ASSERT(scl->scl_writer == NULL || 572 scl->scl_writer == curthread); 573 scl->scl_writer = NULL; /* OK in either case */ 574 cv_broadcast(&scl->scl_cv); 575 } 576 mutex_exit(&scl->scl_lock); 577 } 578 } 579 580 int 581 spa_config_held(spa_t *spa, int locks, krw_t rw) 582 { 583 int locks_held = 0; 584 585 for (int i = 0; i < SCL_LOCKS; i++) { 586 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 587 if (!(locks & (1 << i))) 588 continue; 589 if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || 590 (rw == RW_WRITER && scl->scl_writer == curthread)) 591 locks_held |= 1 << i; 592 } 593 594 return (locks_held); 595 } 596 597 /* 598 * ========================================================================== 599 * SPA namespace functions 600 * ========================================================================== 601 */ 602 603 /* 604 * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. 605 * Returns NULL if no matching spa_t is found. 606 */ 607 spa_t * 608 spa_lookup(const char *name) 609 { 610 static spa_t search; /* spa_t is large; don't allocate on stack */ 611 spa_t *spa; 612 avl_index_t where; 613 char *cp; 614 615 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 616 617 (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); 618 619 /* 620 * If it's a full dataset name, figure out the pool name and 621 * just use that. 622 */ 623 cp = strpbrk(search.spa_name, "/@#"); 624 if (cp != NULL) 625 *cp = '\0'; 626 627 spa = avl_find(&spa_namespace_avl, &search, &where); 628 629 return (spa); 630 } 631 632 /* 633 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. 634 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues 635 * looking for potentially hung I/Os. 636 */ 637 static void 638 spa_deadman(void *arg) 639 { 640 spa_t *spa = arg; 641 642 /* 643 * Disable the deadman timer if the pool is suspended. 644 */ 645 if (spa_suspended(spa)) { 646 #ifdef illumos 647 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 648 #else 649 /* Nothing. just don't schedule any future callouts. 
*/ 650 #endif 651 return; 652 } 653 654 zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", 655 (gethrtime() - spa->spa_sync_starttime) / NANOSEC, 656 ++spa->spa_deadman_calls); 657 if (zfs_deadman_enabled) 658 vdev_deadman(spa->spa_root_vdev); 659 #ifndef illumos 660 #ifdef _KERNEL 661 callout_schedule(&spa->spa_deadman_cycid, 662 hz * zfs_deadman_checktime_ms / MILLISEC); 663 #endif 664 #endif 665 } 666 667 #ifdef _HARDKERNEL 668 static void 669 spa_deadman_timeout(void *arg) 670 { 671 spa_t *spa = arg; 672 673 #ifdef __FreeBSD__ 674 taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task); 675 #endif 676 #ifdef __NetBSD__ 677 workqueue_enqueue(spa_workqueue, &spa->spa_deadman_work, NULL); 678 #endif 679 } 680 #endif /* _KERNEL */ 681 682 /* 683 * Create an uninitialized spa_t with the given name. Requires 684 * spa_namespace_lock. The caller must ensure that the spa_t doesn't already 685 * exist by calling spa_lookup() first. 686 */ 687 spa_t * 688 spa_add(const char *name, nvlist_t *config, const char *altroot) 689 { 690 spa_t *spa; 691 spa_config_dirent_t *dp; 692 #ifndef __FreeBSD__ 693 cyc_handler_t hdlr; 694 cyc_time_t when; 695 #endif 696 697 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 698 699 spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); 700 701 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 702 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 703 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 704 mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); 705 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 706 mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); 707 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 708 mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); 709 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 710 mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); 711 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); 712 mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); 713 714 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); 715 cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); 716 cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); 717 cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); 718 cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); 719 720 for (int t = 0; t < TXG_SIZE; t++) 721 bplist_create(&spa->spa_free_bplist[t]); 722 723 (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); 724 spa->spa_state = POOL_STATE_UNINITIALIZED; 725 spa->spa_freeze_txg = UINT64_MAX; 726 spa->spa_final_txg = UINT64_MAX; 727 spa->spa_load_max_txg = UINT64_MAX; 728 spa->spa_proc = &p0; 729 spa->spa_proc_state = SPA_PROC_NONE; 730 731 #ifndef __FreeBSD__ 732 hdlr.cyh_func = spa_deadman; 733 hdlr.cyh_arg = spa; 734 hdlr.cyh_level = CY_LOW_LEVEL; 735 #endif 736 737 spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); 738 739 #ifdef illumos 740 /* 741 * This determines how often we need to check for hung I/Os after 742 * the cyclic has already fired. Since checking for hung I/Os is 743 * an expensive operation we don't want to check too frequently. 744 * Instead wait for 5 seconds before checking again. 
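	 * (The 5 second wait referred to here is zfs_deadman_checktime_ms,
	 * which defaults to 5000 ms above.)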
745 */ 746 when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); 747 when.cyt_when = CY_INFINITY; 748 mutex_enter(&cpu_lock); 749 spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); 750 mutex_exit(&cpu_lock); 751 #endif 752 #ifdef __FreeBSD__ 753 #ifdef _KERNEL 754 /* 755 * callout(9) does not provide a way to initialize a callout with 756 * a function and an argument, so we use callout_reset() to schedule 757 * the callout in the very distant future. Even if that event ever 758 * fires, it should be okayas we won't have any active zio-s. 759 * But normally spa_sync() will reschedule the callout with a proper 760 * timeout. 761 * callout(9) does not allow the callback function to sleep but 762 * vdev_deadman() needs to acquire vq_lock and illumos mutexes are 763 * emulated using sx(9). For this reason spa_deadman_timeout() 764 * will schedule spa_deadman() as task on a taskqueue that allows 765 * sleeping. 766 */ 767 TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa); 768 callout_init(&spa->spa_deadman_cycid, 1); 769 callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0, 770 spa_deadman_timeout, spa, 0); 771 #endif 772 #endif 773 #ifdef __NetBSD__ 774 #ifdef _HARDKERNEL 775 callout_init(&spa->spa_deadman_cycid, 0); 776 callout_setfunc(&spa->spa_deadman_cycid, spa_deadman_timeout, spa); 777 #endif 778 #endif 779 780 refcount_create(&spa->spa_refcount); 781 spa_config_lock_init(spa); 782 783 avl_add(&spa_namespace_avl, spa); 784 785 /* 786 * Set the alternate root, if there is one. 787 */ 788 if (altroot) { 789 spa->spa_root = spa_strdup(altroot); 790 spa_active_count++; 791 } 792 793 avl_create(&spa->spa_alloc_tree, zio_timestamp_compare, 794 sizeof (zio_t), offsetof(zio_t, io_alloc_node)); 795 796 /* 797 * Every pool starts with the default cachefile 798 */ 799 list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), 800 offsetof(spa_config_dirent_t, scd_link)); 801 802 dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); 803 dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); 804 list_insert_head(&spa->spa_config_list, dp); 805 806 VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, 807 KM_SLEEP) == 0); 808 809 if (config != NULL) { 810 nvlist_t *features; 811 812 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, 813 &features) == 0) { 814 VERIFY(nvlist_dup(features, &spa->spa_label_features, 815 0) == 0); 816 } 817 818 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 819 } 820 821 if (spa->spa_label_features == NULL) { 822 VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, 823 KM_SLEEP) == 0); 824 } 825 826 spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); 827 828 spa->spa_min_ashift = INT_MAX; 829 spa->spa_max_ashift = 0; 830 831 /* 832 * As a pool is being created, treat all features as disabled by 833 * setting SPA_FEATURE_DISABLED for all entries in the feature 834 * refcount cache. 835 */ 836 for (int i = 0; i < SPA_FEATURES; i++) { 837 spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; 838 } 839 840 return (spa); 841 } 842 843 /* 844 * Removes a spa_t from the namespace, freeing up any memory used. Requires 845 * spa_namespace_lock. This is called only after the spa_t has been closed and 846 * deactivated. 
847 */ 848 void 849 spa_remove(spa_t *spa) 850 { 851 spa_config_dirent_t *dp; 852 853 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 854 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 855 ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); 856 857 nvlist_free(spa->spa_config_splitting); 858 859 avl_remove(&spa_namespace_avl, spa); 860 cv_broadcast(&spa_namespace_cv); 861 862 if (spa->spa_root) { 863 spa_strfree(spa->spa_root); 864 spa_active_count--; 865 } 866 867 while ((dp = list_head(&spa->spa_config_list)) != NULL) { 868 list_remove(&spa->spa_config_list, dp); 869 if (dp->scd_path != NULL) 870 spa_strfree(dp->scd_path); 871 kmem_free(dp, sizeof (spa_config_dirent_t)); 872 } 873 874 avl_destroy(&spa->spa_alloc_tree); 875 list_destroy(&spa->spa_config_list); 876 877 nvlist_free(spa->spa_label_features); 878 nvlist_free(spa->spa_load_info); 879 spa_config_set(spa, NULL); 880 881 #ifdef illumos 882 mutex_enter(&cpu_lock); 883 if (spa->spa_deadman_cycid != CYCLIC_NONE) 884 cyclic_remove(spa->spa_deadman_cycid); 885 mutex_exit(&cpu_lock); 886 spa->spa_deadman_cycid = CYCLIC_NONE; 887 #endif /* !illumos */ 888 #ifdef __FreeBSD__ 889 #ifdef _KERNEL 890 callout_drain(&spa->spa_deadman_cycid); 891 taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task); 892 #endif 893 #endif 894 #ifdef __NetBSD__ 895 #ifdef _HARDKERNEL 896 callout_drain(&spa->spa_deadman_cycid); 897 #endif 898 #endif 899 900 refcount_destroy(&spa->spa_refcount); 901 902 spa_config_lock_destroy(spa); 903 904 for (int t = 0; t < TXG_SIZE; t++) 905 bplist_destroy(&spa->spa_free_bplist[t]); 906 907 zio_checksum_templates_free(spa); 908 909 cv_destroy(&spa->spa_async_cv); 910 cv_destroy(&spa->spa_evicting_os_cv); 911 cv_destroy(&spa->spa_proc_cv); 912 cv_destroy(&spa->spa_scrub_io_cv); 913 cv_destroy(&spa->spa_suspend_cv); 914 915 mutex_destroy(&spa->spa_alloc_lock); 916 mutex_destroy(&spa->spa_async_lock); 917 mutex_destroy(&spa->spa_errlist_lock); 918 mutex_destroy(&spa->spa_errlog_lock); 919 mutex_destroy(&spa->spa_evicting_os_lock); 920 mutex_destroy(&spa->spa_history_lock); 921 mutex_destroy(&spa->spa_proc_lock); 922 mutex_destroy(&spa->spa_props_lock); 923 mutex_destroy(&spa->spa_cksum_tmpls_lock); 924 mutex_destroy(&spa->spa_scrub_lock); 925 mutex_destroy(&spa->spa_suspend_lock); 926 mutex_destroy(&spa->spa_vdev_top_lock); 927 928 kmem_free(spa, sizeof (spa_t)); 929 } 930 931 /* 932 * Given a pool, return the next pool in the namespace, or NULL if there is 933 * none. If 'prev' is NULL, return the first pool. 934 */ 935 spa_t * 936 spa_next(spa_t *prev) 937 { 938 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 939 940 if (prev) 941 return (AVL_NEXT(&spa_namespace_avl, prev)); 942 else 943 return (avl_first(&spa_namespace_avl)); 944 } 945 946 /* 947 * ========================================================================== 948 * SPA refcount functions 949 * ========================================================================== 950 */ 951 952 /* 953 * Add a reference to the given spa_t. Must have at least one reference, or 954 * have the namespace lock held. 955 */ 956 void 957 spa_open_ref(spa_t *spa, void *tag) 958 { 959 ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || 960 MUTEX_HELD(&spa_namespace_lock)); 961 (void) refcount_add(&spa->spa_refcount, tag); 962 } 963 964 /* 965 * Remove a reference to the given spa_t. Must have at least one reference, or 966 * have the namespace lock held. 
967 */ 968 void 969 spa_close(spa_t *spa, void *tag) 970 { 971 ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || 972 MUTEX_HELD(&spa_namespace_lock)); 973 (void) refcount_remove(&spa->spa_refcount, tag); 974 } 975 976 /* 977 * Remove a reference to the given spa_t held by a dsl dir that is 978 * being asynchronously released. Async releases occur from a taskq 979 * performing eviction of dsl datasets and dirs. The namespace lock 980 * isn't held and the hold by the object being evicted may contribute to 981 * spa_minref (e.g. dataset or directory released during pool export), 982 * so the asserts in spa_close() do not apply. 983 */ 984 void 985 spa_async_close(spa_t *spa, void *tag) 986 { 987 (void) refcount_remove(&spa->spa_refcount, tag); 988 } 989 990 /* 991 * Check to see if the spa refcount is zero. Must be called with 992 * spa_namespace_lock held. We really compare against spa_minref, which is the 993 * number of references acquired when opening a pool 994 */ 995 boolean_t 996 spa_refcount_zero(spa_t *spa) 997 { 998 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 999 1000 return (refcount_count(&spa->spa_refcount) == spa->spa_minref); 1001 } 1002 1003 /* 1004 * ========================================================================== 1005 * SPA spare and l2cache tracking 1006 * ========================================================================== 1007 */ 1008 1009 /* 1010 * Hot spares and cache devices are tracked using the same code below, 1011 * for 'auxiliary' devices. 1012 */ 1013 1014 typedef struct spa_aux { 1015 uint64_t aux_guid; 1016 uint64_t aux_pool; 1017 avl_node_t aux_avl; 1018 int aux_count; 1019 } spa_aux_t; 1020 1021 static int 1022 spa_aux_compare(const void *a, const void *b) 1023 { 1024 const spa_aux_t *sa = a; 1025 const spa_aux_t *sb = b; 1026 1027 if (sa->aux_guid < sb->aux_guid) 1028 return (-1); 1029 else if (sa->aux_guid > sb->aux_guid) 1030 return (1); 1031 else 1032 return (0); 1033 } 1034 1035 void 1036 spa_aux_add(vdev_t *vd, avl_tree_t *avl) 1037 { 1038 avl_index_t where; 1039 spa_aux_t search; 1040 spa_aux_t *aux; 1041 1042 search.aux_guid = vd->vdev_guid; 1043 if ((aux = avl_find(avl, &search, &where)) != NULL) { 1044 aux->aux_count++; 1045 } else { 1046 aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); 1047 aux->aux_guid = vd->vdev_guid; 1048 aux->aux_count = 1; 1049 avl_insert(avl, aux, where); 1050 } 1051 } 1052 1053 void 1054 spa_aux_remove(vdev_t *vd, avl_tree_t *avl) 1055 { 1056 spa_aux_t search; 1057 spa_aux_t *aux; 1058 avl_index_t where; 1059 1060 search.aux_guid = vd->vdev_guid; 1061 aux = avl_find(avl, &search, &where); 1062 1063 ASSERT(aux != NULL); 1064 1065 if (--aux->aux_count == 0) { 1066 avl_remove(avl, aux); 1067 kmem_free(aux, sizeof (spa_aux_t)); 1068 } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { 1069 aux->aux_pool = 0ULL; 1070 } 1071 } 1072 1073 boolean_t 1074 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) 1075 { 1076 spa_aux_t search, *found; 1077 1078 search.aux_guid = guid; 1079 found = avl_find(avl, &search, NULL); 1080 1081 if (pool) { 1082 if (found) 1083 *pool = found->aux_pool; 1084 else 1085 *pool = 0ULL; 1086 } 1087 1088 if (refcnt) { 1089 if (found) 1090 *refcnt = found->aux_count; 1091 else 1092 *refcnt = 0; 1093 } 1094 1095 return (found != NULL); 1096 } 1097 1098 void 1099 spa_aux_activate(vdev_t *vd, avl_tree_t *avl) 1100 { 1101 spa_aux_t search, *found; 1102 avl_index_t where; 1103 1104 search.aux_guid = vd->vdev_guid; 1105 found = avl_find(avl, &search, &where); 1106 
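	/*
	 * The device is expected to have been registered earlier via
	 * spa_aux_add(); activation only records which pool the aux
	 * device now belongs to, as the ASSERTs below check.
	 */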
	ASSERT(found != NULL);
	ASSERT(found->aux_pool == 0ULL);

	found->aux_pool = spa_guid(vd->vdev_spa);
}

/*
 * Spares are tracked globally due to the following constraints:
 *
 *	- A spare may be part of multiple pools.
 *	- A spare may be added to a pool even if it's actively in use within
 *	  another pool.
 *	- A spare in use in any pool can only be the source of a replacement if
 *	  the target is a spare in the same pool.
 *
 * We keep track of all spares on the system through the use of a reference
 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
 * spare, then we bump the reference count in the AVL tree.  In addition, we
 * set the 'vdev_isspare' member to indicate that the device is a spare (active
 * or inactive).  When a spare is made active (used to replace a device in the
 * pool), we also keep track of which pool it's been made a part of.
 *
 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 * called under the spa_namespace lock as part of vdev reconfiguration.  The
 * separate spare lock exists for the status query path, which does not need to
 * be completely consistent with respect to other vdev configuration changes.
 */

static int
spa_spare_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

void
spa_spare_add(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(!vd->vdev_isspare);
	spa_aux_add(vd, &spa_spare_avl);
	vd->vdev_isspare = B_TRUE;
	mutex_exit(&spa_spare_lock);
}

void
spa_spare_remove(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_remove(vd, &spa_spare_avl);
	vd->vdev_isspare = B_FALSE;
	mutex_exit(&spa_spare_lock);
}

boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
	boolean_t found;

	mutex_enter(&spa_spare_lock);
	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);

	return (found);
}

void
spa_spare_activate(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_activate(vd, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);
}

/*
 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is currently unused beyond 1.
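 *
 * A rough illustrative sketch of the lifecycle, where "vd" is a cache
 * vdev being wired into a pool (the exact call sites live elsewhere in
 * the SPA code):
 *
 *	spa_l2cache_add(vd);		registers the device globally
 *	spa_l2cache_activate(vd);	records the owning pool's guid
 *	...
 *	spa_l2cache_remove(vd);		drops the global registration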
1185 */ 1186 1187 static int 1188 spa_l2cache_compare(const void *a, const void *b) 1189 { 1190 return (spa_aux_compare(a, b)); 1191 } 1192 1193 void 1194 spa_l2cache_add(vdev_t *vd) 1195 { 1196 mutex_enter(&spa_l2cache_lock); 1197 ASSERT(!vd->vdev_isl2cache); 1198 spa_aux_add(vd, &spa_l2cache_avl); 1199 vd->vdev_isl2cache = B_TRUE; 1200 mutex_exit(&spa_l2cache_lock); 1201 } 1202 1203 void 1204 spa_l2cache_remove(vdev_t *vd) 1205 { 1206 mutex_enter(&spa_l2cache_lock); 1207 ASSERT(vd->vdev_isl2cache); 1208 spa_aux_remove(vd, &spa_l2cache_avl); 1209 vd->vdev_isl2cache = B_FALSE; 1210 mutex_exit(&spa_l2cache_lock); 1211 } 1212 1213 boolean_t 1214 spa_l2cache_exists(uint64_t guid, uint64_t *pool) 1215 { 1216 boolean_t found; 1217 1218 mutex_enter(&spa_l2cache_lock); 1219 found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); 1220 mutex_exit(&spa_l2cache_lock); 1221 1222 return (found); 1223 } 1224 1225 void 1226 spa_l2cache_activate(vdev_t *vd) 1227 { 1228 mutex_enter(&spa_l2cache_lock); 1229 ASSERT(vd->vdev_isl2cache); 1230 spa_aux_activate(vd, &spa_l2cache_avl); 1231 mutex_exit(&spa_l2cache_lock); 1232 } 1233 1234 /* 1235 * ========================================================================== 1236 * SPA vdev locking 1237 * ========================================================================== 1238 */ 1239 1240 /* 1241 * Lock the given spa_t for the purpose of adding or removing a vdev. 1242 * Grabs the global spa_namespace_lock plus the spa config lock for writing. 1243 * It returns the next transaction group for the spa_t. 1244 */ 1245 uint64_t 1246 spa_vdev_enter(spa_t *spa) 1247 { 1248 mutex_enter(&spa->spa_vdev_top_lock); 1249 mutex_enter(&spa_namespace_lock); 1250 return (spa_vdev_config_enter(spa)); 1251 } 1252 1253 /* 1254 * Internal implementation for spa_vdev_enter(). Used when a vdev 1255 * operation requires multiple syncs (i.e. removing a device) while 1256 * keeping the spa_namespace_lock held. 1257 */ 1258 uint64_t 1259 spa_vdev_config_enter(spa_t *spa) 1260 { 1261 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1262 1263 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1264 1265 return (spa_last_synced_txg(spa) + 1); 1266 } 1267 1268 /* 1269 * Used in combination with spa_vdev_config_enter() to allow the syncing 1270 * of multiple transactions without releasing the spa_namespace_lock. 1271 */ 1272 void 1273 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) 1274 { 1275 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1276 1277 int config_changed = B_FALSE; 1278 1279 ASSERT(txg > spa_last_synced_txg(spa)); 1280 1281 spa->spa_pending_vdev = NULL; 1282 1283 /* 1284 * Reassess the DTLs. 1285 */ 1286 vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); 1287 1288 if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { 1289 config_changed = B_TRUE; 1290 spa->spa_config_generation++; 1291 } 1292 1293 /* 1294 * Verify the metaslab classes. 1295 */ 1296 ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); 1297 ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); 1298 1299 spa_config_exit(spa, SCL_ALL, spa); 1300 1301 /* 1302 * Panic the system if the specified tag requires it. This 1303 * is useful for ensuring that configurations are updated 1304 * transactionally. 1305 */ 1306 if (zio_injection_enabled) 1307 zio_handle_panic_injection(spa, tag, 0); 1308 1309 /* 1310 * Note: this txg_wait_synced() is important because it ensures 1311 * that there won't be more than one config change per txg. 
	 * This allows us to use the txg as the generation number.
	 */
	if (error == 0)
		txg_wait_synced(spa->spa_dsl_pool, txg);

	if (vd != NULL) {
		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
		vdev_free(vd);
		spa_config_exit(spa, SCL_ALL, spa);
	}

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed)
		spa_config_sync(spa, B_FALSE, B_TRUE);
}

/*
 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions
 * have synced to disk, and then update the global configuration cache with
 * the new information.
 */
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Lock the given spa_t for the purpose of changing vdev state.
 */
void
spa_vdev_state_enter(spa_t *spa, int oplocks)
{
	int locks = SCL_STATE_ALL | oplocks;

	/*
	 * Root pools may need to read from the underlying devfs filesystem
	 * when opening up a vdev.  Unfortunately if we're holding the
	 * SCL_ZIO lock it will result in a deadlock when we try to issue
	 * the read from the root filesystem.  Instead we "prefetch"
	 * the associated vnodes that we need prior to opening the
	 * underlying devices and cache them so that we can prevent
	 * any I/O when we are doing the actual open.
	 */
	if (spa_is_root(spa)) {
		int low = locks & ~(SCL_ZIO - 1);
		int high = locks & ~low;

		spa_config_enter(spa, high, spa, RW_WRITER);
		vdev_hold(spa->spa_root_vdev);
		spa_config_enter(spa, low, spa, RW_WRITER);
	} else {
		spa_config_enter(spa, locks, spa, RW_WRITER);
	}
	spa->spa_vdev_locks = locks;
}

int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
	boolean_t config_changed = B_FALSE;

	if (vd != NULL || error == 0)
		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
		    0, 0, B_FALSE);

	if (vd != NULL) {
		vdev_state_dirty(vd->vdev_top);
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	if (spa_is_root(spa))
		vdev_rele(spa->spa_root_vdev);

	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
	spa_config_exit(spa, spa->spa_vdev_locks, spa);

	/*
	 * If anything changed, wait for it to sync.  This ensures that,
	 * from the system administrator's perspective, zpool(1M) commands
	 * are synchronous.  This is important for things like zpool offline:
	 * when the command completes, you expect no further I/O from ZFS.
	 */
	if (vd != NULL)
		txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * If the config changed, update the config cache.
1409 */ 1410 if (config_changed) { 1411 mutex_enter(&spa_namespace_lock); 1412 spa_config_sync(spa, B_FALSE, B_TRUE); 1413 mutex_exit(&spa_namespace_lock); 1414 } 1415 1416 return (error); 1417 } 1418 1419 /* 1420 * ========================================================================== 1421 * Miscellaneous functions 1422 * ========================================================================== 1423 */ 1424 1425 void 1426 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) 1427 { 1428 if (!nvlist_exists(spa->spa_label_features, feature)) { 1429 fnvlist_add_boolean(spa->spa_label_features, feature); 1430 /* 1431 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't 1432 * dirty the vdev config because lock SCL_CONFIG is not held. 1433 * Thankfully, in this case we don't need to dirty the config 1434 * because it will be written out anyway when we finish 1435 * creating the pool. 1436 */ 1437 if (tx->tx_txg != TXG_INITIAL) 1438 vdev_config_dirty(spa->spa_root_vdev); 1439 } 1440 } 1441 1442 void 1443 spa_deactivate_mos_feature(spa_t *spa, const char *feature) 1444 { 1445 if (nvlist_remove_all(spa->spa_label_features, feature) == 0) 1446 vdev_config_dirty(spa->spa_root_vdev); 1447 } 1448 1449 /* 1450 * Rename a spa_t. 1451 */ 1452 int 1453 spa_rename(const char *name, const char *newname) 1454 { 1455 spa_t *spa; 1456 int err; 1457 1458 /* 1459 * Lookup the spa_t and grab the config lock for writing. We need to 1460 * actually open the pool so that we can sync out the necessary labels. 1461 * It's OK to call spa_open() with the namespace lock held because we 1462 * allow recursive calls for other reasons. 1463 */ 1464 mutex_enter(&spa_namespace_lock); 1465 if ((err = spa_open(name, &spa, FTAG)) != 0) { 1466 mutex_exit(&spa_namespace_lock); 1467 return (err); 1468 } 1469 1470 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1471 1472 avl_remove(&spa_namespace_avl, spa); 1473 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); 1474 avl_add(&spa_namespace_avl, spa); 1475 1476 /* 1477 * Sync all labels to disk with the new names by marking the root vdev 1478 * dirty and waiting for it to sync. It will pick up the new pool name 1479 * during the sync. 1480 */ 1481 vdev_config_dirty(spa->spa_root_vdev); 1482 1483 spa_config_exit(spa, SCL_ALL, FTAG); 1484 1485 txg_wait_synced(spa->spa_dsl_pool, 0); 1486 1487 /* 1488 * Sync the updated config cache. 1489 */ 1490 spa_config_sync(spa, B_FALSE, B_TRUE); 1491 1492 spa_close(spa, FTAG); 1493 1494 mutex_exit(&spa_namespace_lock); 1495 1496 return (0); 1497 } 1498 1499 /* 1500 * Return the spa_t associated with given pool_guid, if it exists. If 1501 * device_guid is non-zero, determine whether the pool exists *and* contains 1502 * a device with the specified device_guid. 1503 */ 1504 spa_t * 1505 spa_by_guid(uint64_t pool_guid, uint64_t device_guid) 1506 { 1507 spa_t *spa; 1508 avl_tree_t *t = &spa_namespace_avl; 1509 1510 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1511 1512 for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { 1513 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1514 continue; 1515 if (spa->spa_root_vdev == NULL) 1516 continue; 1517 if (spa_guid(spa) == pool_guid) { 1518 if (device_guid == 0) 1519 break; 1520 1521 if (vdev_lookup_by_guid(spa->spa_root_vdev, 1522 device_guid) != NULL) 1523 break; 1524 1525 /* 1526 * Check any devices we may be in the process of adding. 
1527 */ 1528 if (spa->spa_pending_vdev) { 1529 if (vdev_lookup_by_guid(spa->spa_pending_vdev, 1530 device_guid) != NULL) 1531 break; 1532 } 1533 } 1534 } 1535 1536 return (spa); 1537 } 1538 1539 /* 1540 * Determine whether a pool with the given pool_guid exists. 1541 */ 1542 boolean_t 1543 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) 1544 { 1545 return (spa_by_guid(pool_guid, device_guid) != NULL); 1546 } 1547 1548 char * 1549 spa_strdup(const char *s) 1550 { 1551 size_t len; 1552 char *new; 1553 1554 len = strlen(s); 1555 new = kmem_alloc(len + 1, KM_SLEEP); 1556 bcopy(s, new, len); 1557 new[len] = '\0'; 1558 1559 return (new); 1560 } 1561 1562 void 1563 spa_strfree(char *s) 1564 { 1565 kmem_free(s, strlen(s) + 1); 1566 } 1567 1568 uint64_t 1569 spa_get_random(uint64_t range) 1570 { 1571 uint64_t r; 1572 1573 ASSERT(range != 0); 1574 1575 (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); 1576 1577 return (r % range); 1578 } 1579 1580 uint64_t 1581 spa_generate_guid(spa_t *spa) 1582 { 1583 uint64_t guid = spa_get_random(-1ULL); 1584 1585 if (spa != NULL) { 1586 while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) 1587 guid = spa_get_random(-1ULL); 1588 } else { 1589 while (guid == 0 || spa_guid_exists(guid, 0)) 1590 guid = spa_get_random(-1ULL); 1591 } 1592 1593 return (guid); 1594 } 1595 1596 void 1597 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) 1598 { 1599 char type[256]; 1600 char *checksum = NULL; 1601 char *compress = NULL; 1602 1603 if (bp != NULL) { 1604 if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { 1605 dmu_object_byteswap_t bswap = 1606 DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); 1607 (void) snprintf(type, sizeof (type), "bswap %s %s", 1608 DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? 1609 "metadata" : "data", 1610 dmu_ot_byteswap[bswap].ob_name); 1611 } else { 1612 (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, 1613 sizeof (type)); 1614 } 1615 if (!BP_IS_EMBEDDED(bp)) { 1616 checksum = 1617 zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; 1618 } 1619 compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; 1620 } 1621 1622 SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, 1623 compress); 1624 } 1625 1626 void 1627 spa_freeze(spa_t *spa) 1628 { 1629 uint64_t freeze_txg = 0; 1630 1631 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1632 if (spa->spa_freeze_txg == UINT64_MAX) { 1633 freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; 1634 spa->spa_freeze_txg = freeze_txg; 1635 } 1636 spa_config_exit(spa, SCL_ALL, FTAG); 1637 if (freeze_txg != 0) 1638 txg_wait_synced(spa_get_dsl(spa), freeze_txg); 1639 } 1640 1641 void 1642 zfs_panic_recover(const char *fmt, ...) 1643 { 1644 va_list adx; 1645 1646 va_start(adx, fmt); 1647 vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); 1648 va_end(adx); 1649 } 1650 1651 /* 1652 * This is a stripped-down version of strtoull, suitable only for converting 1653 * lowercase hexadecimal numbers that don't overflow. 
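 *
 * For example, zfs_strtonum("1a2b", NULL) returns 0x1a2b (6699); parsing
 * stops at the first character that is not 0-9 or a-f, and *nptr (when
 * non-NULL) is set to point at that character.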
1654 */ 1655 uint64_t 1656 zfs_strtonum(const char *str, char **nptr) 1657 { 1658 uint64_t val = 0; 1659 char c; 1660 int digit; 1661 1662 while ((c = *str) != '\0') { 1663 if (c >= '0' && c <= '9') 1664 digit = c - '0'; 1665 else if (c >= 'a' && c <= 'f') 1666 digit = 10 + c - 'a'; 1667 else 1668 break; 1669 1670 val *= 16; 1671 val += digit; 1672 1673 str++; 1674 } 1675 1676 if (nptr) 1677 *nptr = (char *)str; 1678 1679 return (val); 1680 } 1681 1682 /* 1683 * ========================================================================== 1684 * Accessor functions 1685 * ========================================================================== 1686 */ 1687 1688 boolean_t 1689 spa_shutting_down(spa_t *spa) 1690 { 1691 return (spa->spa_async_suspended); 1692 } 1693 1694 dsl_pool_t * 1695 spa_get_dsl(spa_t *spa) 1696 { 1697 return (spa->spa_dsl_pool); 1698 } 1699 1700 boolean_t 1701 spa_is_initializing(spa_t *spa) 1702 { 1703 return (spa->spa_is_initializing); 1704 } 1705 1706 blkptr_t * 1707 spa_get_rootblkptr(spa_t *spa) 1708 { 1709 return (&spa->spa_ubsync.ub_rootbp); 1710 } 1711 1712 void 1713 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) 1714 { 1715 spa->spa_uberblock.ub_rootbp = *bp; 1716 } 1717 1718 void 1719 spa_altroot(spa_t *spa, char *buf, size_t buflen) 1720 { 1721 if (spa->spa_root == NULL) 1722 buf[0] = '\0'; 1723 else 1724 (void) strncpy(buf, spa->spa_root, buflen); 1725 } 1726 1727 int 1728 spa_sync_pass(spa_t *spa) 1729 { 1730 return (spa->spa_sync_pass); 1731 } 1732 1733 char * 1734 spa_name(spa_t *spa) 1735 { 1736 return (spa->spa_name); 1737 } 1738 1739 uint64_t 1740 spa_guid(spa_t *spa) 1741 { 1742 dsl_pool_t *dp = spa_get_dsl(spa); 1743 uint64_t guid; 1744 1745 /* 1746 * If we fail to parse the config during spa_load(), we can go through 1747 * the error path (which posts an ereport) and end up here with no root 1748 * vdev. We stash the original pool guid in 'spa_config_guid' to handle 1749 * this case. 1750 */ 1751 if (spa->spa_root_vdev == NULL) 1752 return (spa->spa_config_guid); 1753 1754 guid = spa->spa_last_synced_guid != 0 ? 1755 spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; 1756 1757 /* 1758 * Return the most recently synced out guid unless we're 1759 * in syncing context. 1760 */ 1761 if (dp && dsl_pool_sync_context(dp)) 1762 return (spa->spa_root_vdev->vdev_guid); 1763 else 1764 return (guid); 1765 } 1766 1767 uint64_t 1768 spa_load_guid(spa_t *spa) 1769 { 1770 /* 1771 * This is a GUID that exists solely as a reference for the 1772 * purposes of the arc. It is generated at load time, and 1773 * is never written to persistent storage. 1774 */ 1775 return (spa->spa_load_guid); 1776 } 1777 1778 uint64_t 1779 spa_last_synced_txg(spa_t *spa) 1780 { 1781 return (spa->spa_ubsync.ub_txg); 1782 } 1783 1784 uint64_t 1785 spa_first_txg(spa_t *spa) 1786 { 1787 return (spa->spa_first_txg); 1788 } 1789 1790 uint64_t 1791 spa_syncing_txg(spa_t *spa) 1792 { 1793 return (spa->spa_syncing_txg); 1794 } 1795 1796 pool_state_t 1797 spa_state(spa_t *spa) 1798 { 1799 return (spa->spa_state); 1800 } 1801 1802 spa_load_state_t 1803 spa_load_state(spa_t *spa) 1804 { 1805 return (spa->spa_load_state); 1806 } 1807 1808 uint64_t 1809 spa_freeze_txg(spa_t *spa) 1810 { 1811 return (spa->spa_freeze_txg); 1812 } 1813 1814 /* ARGSUSED */ 1815 uint64_t 1816 spa_get_asize(spa_t *spa, uint64_t lsize) 1817 { 1818 return (lsize * spa_asize_inflation); 1819 } 1820 1821 /* 1822 * Return the amount of slop space in bytes. 
It is 1/32 of the pool (3.2%), 1823 * or at least 128MB, unless that would cause it to be more than half the 1824 * pool size. 1825 * 1826 * See the comment above spa_slop_shift for details. 1827 */ 1828 uint64_t 1829 spa_get_slop_space(spa_t *spa) 1830 { 1831 uint64_t space = spa_get_dspace(spa); 1832 return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); 1833 } 1834 1835 uint64_t 1836 spa_get_dspace(spa_t *spa) 1837 { 1838 return (spa->spa_dspace); 1839 } 1840 1841 void 1842 spa_update_dspace(spa_t *spa) 1843 { 1844 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + 1845 ddt_get_dedup_dspace(spa); 1846 } 1847 1848 /* 1849 * Return the failure mode that has been set to this pool. The default 1850 * behavior will be to block all I/Os when a complete failure occurs. 1851 */ 1852 uint8_t 1853 spa_get_failmode(spa_t *spa) 1854 { 1855 return (spa->spa_failmode); 1856 } 1857 1858 boolean_t 1859 spa_suspended(spa_t *spa) 1860 { 1861 return (spa->spa_suspended); 1862 } 1863 1864 uint64_t 1865 spa_version(spa_t *spa) 1866 { 1867 return (spa->spa_ubsync.ub_version); 1868 } 1869 1870 boolean_t 1871 spa_deflate(spa_t *spa) 1872 { 1873 return (spa->spa_deflate); 1874 } 1875 1876 metaslab_class_t * 1877 spa_normal_class(spa_t *spa) 1878 { 1879 return (spa->spa_normal_class); 1880 } 1881 1882 metaslab_class_t * 1883 spa_log_class(spa_t *spa) 1884 { 1885 return (spa->spa_log_class); 1886 } 1887 1888 void 1889 spa_evicting_os_register(spa_t *spa, objset_t *os) 1890 { 1891 mutex_enter(&spa->spa_evicting_os_lock); 1892 list_insert_head(&spa->spa_evicting_os_list, os); 1893 mutex_exit(&spa->spa_evicting_os_lock); 1894 } 1895 1896 void 1897 spa_evicting_os_deregister(spa_t *spa, objset_t *os) 1898 { 1899 mutex_enter(&spa->spa_evicting_os_lock); 1900 list_remove(&spa->spa_evicting_os_list, os); 1901 cv_broadcast(&spa->spa_evicting_os_cv); 1902 mutex_exit(&spa->spa_evicting_os_lock); 1903 } 1904 1905 void 1906 spa_evicting_os_wait(spa_t *spa) 1907 { 1908 mutex_enter(&spa->spa_evicting_os_lock); 1909 while (!list_is_empty(&spa->spa_evicting_os_list)) 1910 cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); 1911 mutex_exit(&spa->spa_evicting_os_lock); 1912 1913 dmu_buf_user_evict_wait(); 1914 } 1915 1916 int 1917 spa_max_replication(spa_t *spa) 1918 { 1919 /* 1920 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to 1921 * handle BPs with more than one DVA allocated. Set our max 1922 * replication level accordingly. 
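 * With SPA_DVAS_PER_BP == 3 and the default spa_max_replication_override
 * (initialized to SPA_DVAS_PER_BP at the top of this file), this works
 * out to 3.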

int
spa_max_replication(spa_t *spa)
{
	/*
	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
	 * handle BPs with more than one DVA allocated.  Set our max
	 * replication level accordingly.
	 */
	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
		return (1);
	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}

int
spa_prev_software_version(spa_t *spa)
{
	return (spa->spa_prev_software_version);
}

uint64_t
spa_deadman_synctime(spa_t *spa)
{
	return (spa->spa_deadman_synctime);
}

uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
	uint64_t asize = DVA_GET_ASIZE(dva);
	uint64_t dsize = asize;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (asize != 0 && spa->spa_deflate) {
		uint64_t vdev = DVA_GET_VDEV(dva);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		if (vd == NULL) {
			panic(
			    "dva_get_dsize_sync(): bad DVA %llu:%llu",
			    (u_longlong_t)vdev, (u_longlong_t)asize);
		}
		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
	}

	return (dsize);
}

uint64_t
bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	return (dsize);
}

uint64_t
bp_get_dsize(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (dsize);
}
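
/*
 * The two variants above differ only in locking: bp_get_dsize_sync() assumes
 * the caller already holds one of the SCL_* locks as reader (enforced by the
 * ASSERT in dva_get_dsize_sync()), while bp_get_dsize() acquires SCL_VDEV
 * itself.  Both simply sum the deflated allocated size of every DVA in the
 * block pointer.
 */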

/*
 * ==========================================================================
 * Initialization and Termination
 * ==========================================================================
 */

static int
spa_name_compare(const void *a1, const void *a2)
{
	const spa_t *s1 = a1;
	const spa_t *s2 = a2;
	int s;

	s = strcmp(s1->spa_name, s2->spa_name);
	if (s > 0)
		return (1);
	if (s < 0)
		return (-1);
	return (0);
}

int
spa_busy(void)
{
	return (spa_active_count);
}

void
spa_boot_init()
{
	spa_config_load();
}

#ifdef __FreeBSD__
#ifdef _KERNEL
EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
#endif
#endif

void
spa_init(int mode)
{
	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
	    offsetof(spa_t, spa_avl));

	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	spa_mode_global = mode;

#ifdef illumos
#ifdef _KERNEL
	spa_arch_init();
#else
	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
		arc_procfd = open("/proc/self/ctl", O_WRONLY);
		if (arc_procfd == -1) {
			perror("could not enable watchpoints: "
			    "opening /proc/self/ctl failed: ");
		} else {
			arc_watch = B_TRUE;
		}
	}
#endif
#endif	/* illumos */
	refcount_sysinit();
	unique_init();
	range_tree_init();
	metaslab_alloc_trace_init();
	zio_init();
	lz4_init();
	dmu_init();
	zil_init();
	vdev_cache_stat_init();
	zfs_prop_init();
	zpool_prop_init();
	zpool_feature_init();
	spa_config_load();
	l2arc_start();
#ifdef __FreeBSD__
#ifdef _KERNEL
	zfs_deadman_init();
#endif
#endif	/* __FreeBSD__ */
#ifdef __NetBSD__
	zfs_deadman_init();
#endif
}

void
spa_fini(void)
{
#ifdef __NetBSD__
	zfs_deadman_fini();
#endif
	l2arc_stop();

	spa_evict_all();

	vdev_cache_stat_fini();
	zil_fini();
	dmu_fini();
	lz4_fini();
	zio_fini();
	metaslab_alloc_trace_fini();
	range_tree_fini();
	unique_fini();
	refcount_fini();

	avl_destroy(&spa_namespace_avl);
	avl_destroy(&spa_spare_avl);
	avl_destroy(&spa_l2cache_avl);

	cv_destroy(&spa_namespace_cv);
	mutex_destroy(&spa_namespace_lock);
	mutex_destroy(&spa_spare_lock);
	mutex_destroy(&spa_l2cache_lock);
}
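
/*
 * Note that spa_fini() above tears the subsystems down in the reverse order
 * of their initialization in spa_init(), after first evicting any remaining
 * spa_t structures via spa_evict_all().
 */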

/*
 * Return whether this pool has slogs.  No locking needed.
 * It's not a problem if the wrong answer is returned as it's only for
 * performance and not correctness.
 */
boolean_t
spa_has_slogs(spa_t *spa)
{
	return (spa->spa_log_class->mc_rotor != NULL);
}

spa_log_state_t
spa_get_log_state(spa_t *spa)
{
	return (spa->spa_log_state);
}

void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
	spa->spa_log_state = state;
}

boolean_t
spa_is_root(spa_t *spa)
{
	return (spa->spa_is_root);
}

boolean_t
spa_writeable(spa_t *spa)
{
	return (!!(spa->spa_mode & FWRITE));
}

/*
 * Returns true if there is a pending sync task in any of the current
 * syncing txg, the current quiescing txg, or the current open txg.
 */
boolean_t
spa_has_pending_synctask(spa_t *spa)
{
	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
}

int
spa_mode(spa_t *spa)
{
	return (spa->spa_mode);
}

uint64_t
spa_bootfs(spa_t *spa)
{
	return (spa->spa_bootfs);
}

uint64_t
spa_delegation(spa_t *spa)
{
	return (spa->spa_delegation);
}

objset_t *
spa_meta_objset(spa_t *spa)
{
	return (spa->spa_meta_objset);
}

enum zio_checksum
spa_dedup_checksum(spa_t *spa)
{
	return (spa->spa_dedup_checksum);
}

/*
 * Reset pool scan stats per scan pass (or reboot).
 */
void
spa_scan_stat_init(spa_t *spa)
{
	/* data not stored on disk */
	spa->spa_scan_pass_start = gethrestime_sec();
	spa->spa_scan_pass_exam = 0;
	vdev_scan_stat_init(spa->spa_root_vdev);
}

/*
 * Get scan stats for zpool status reports.
 */
int
spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
	dsl_scan_t *scn = spa->spa_dsl_pool ?
	    spa->spa_dsl_pool->dp_scan : NULL;

	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOENT));
	bzero(ps, sizeof (pool_scan_stat_t));

	/* data stored on disk */
	ps->pss_func = scn->scn_phys.scn_func;
	ps->pss_start_time = scn->scn_phys.scn_start_time;
	ps->pss_end_time = scn->scn_phys.scn_end_time;
	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
	ps->pss_examined = scn->scn_phys.scn_examined;
	ps->pss_to_process = scn->scn_phys.scn_to_process;
	ps->pss_processed = scn->scn_phys.scn_processed;
	ps->pss_errors = scn->scn_phys.scn_errors;
	ps->pss_state = scn->scn_phys.scn_state;

	/* data not stored on disk */
	ps->pss_pass_start = spa->spa_scan_pass_start;
	ps->pss_pass_exam = spa->spa_scan_pass_exam;

	return (0);
}

boolean_t
spa_debug_enabled(spa_t *spa)
{
	return (spa->spa_debug);
}

int
spa_maxblocksize(spa_t *spa)
{
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SPA_MAXBLOCKSIZE);
	else
		return (SPA_OLD_MAXBLOCKSIZE);
}
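
/*
 * With SPA_FEATURE_LARGE_BLOCKS enabled, spa_maxblocksize() above permits
 * blocks up to SPA_MAXBLOCKSIZE (nominally 16MB); otherwise the pool remains
 * limited to the legacy SPA_OLD_MAXBLOCKSIZE of 128KB.
 */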