/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_os.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <vm/vm_page.h>
#include <geom/geom.h>
#include <geom/geom_disk.h>
#include <geom/geom_int.h>

#ifndef g_topology_locked
#define	g_topology_locked()	sx_xlocked(&topology_lock)
#endif

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem)	elems;
	vdev_t	*vd;
};

SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
/* BEGIN CSTYLED */
_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
    == sizeof (struct consumer_priv_t*),
    "consumer_priv_t* can't be stored in g_consumer.private");

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
	&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
	&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
/* END CSTYLED */

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
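
/*
 * A minimal sketch of how downstack code can honor this key (assuming the
 * usual ZFS tsd_get() interface; the actual consumers live outside this
 * file):
 *
 *	if (tsd_get(zfs_geom_probe_vdev_key) != NULL)
 *		return (SET_ERROR(EOPNOTSUPP));  (thread is tasting; skip)
 */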

static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
    boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	char *physpath;
	int error, physpath_len;

	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
			    vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;
		if (strcmp(attr, "GEOM::physpath") == 0) {
			vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
			return;
		}
	}
}

static void
vdev_geom_resize(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;
	spa_t *spa;
	vdev_t *vd;

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vd = elem->vd;
		if (vd->vdev_state != VDEV_STATE_HEALTHY)
			continue;
		spa = vd->vdev_spa;
		if (!spa->spa_autoexpand)
			continue;
		vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
	}
}
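
/*
 * The orphan method runs on the GEOM event thread when our provider goes
 * away (e.g. on disk removal); it is the first notification this vdev
 * gets that the underlying device is gone.
 */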
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	// cppcheck-suppress uninitvar
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t *)&cp->private;
	if (SLIST_EMPTY(priv))
		/* Vdev close in progress.  Ignore the event. */
		return;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	SLIST_FOREACH(elem, priv, elems) {
		// cppcheck-suppress uninitvar
		vdev_t *vd = elem->vd;

		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (sanity) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
			ZFS_LOG(1, "Failing attach of %s. "
			    "Incompatible sectorsize %d\n",
			    pp->name, pp->sectorsize);
			return (NULL);
		} else if (pp->mediasize < SPA_MINDEVSIZE) {
			ZFS_LOG(1, "Failing attach of %s. "
			    "Incompatible mediasize %ju\n",
			    pp->name, pp->mediasize);
			return (NULL);
		}
	}

	/* Do we have geom already?  No?  Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		gp->resize = vdev_geom_resize;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	if (vd != NULL)
		vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}
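
/*
 * Undo vdev_geom_attach(): drop the access counts taken at attach time,
 * destroy the consumer once its last reference is released, and wither
 * the geom when no consumers remain.
 */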
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ?
			    cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem, *elem_temp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
	priv = (struct consumer_priv_t *)&cp->private;
	vd->vdev_tsd = NULL;
	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
		if (elem->vd == vd) {
			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
			g_free(elem);
		}
	}

	vdev_geom_detach(cp, B_TRUE);
}

/*
 * Issue one or more bios to the vdev in parallel.  cmds, datas, offsets,
 * errors, and sizes are arrays of length ncmds.  Each I/O operation is
 * described by parallel entries from each array.  There may be more bios
 * actually issued than entries in the arrays.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	uint8_t *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

#if __FreeBSD_version > 1300130
	maxio = maxphys - (maxphys % cp->provider->sectorsize);
#else
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
#endif
	n_bios = 0;

	/* How many bios are required for all commands? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof (struct bio *);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT0(off % cp->provider->sectorsize);
		ASSERT0(s % cp->provider->sectorsize);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = (caddr_t)p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT3S(j, ==, n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") ||
			    errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}
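
/*
 * Example of the splitting vdev_geom_io() performs: with a 512-byte
 * sectorsize and a maxio of 128 KiB, one 1 MiB command becomes
 * (1 MiB + maxio - 1) / maxio = 8 bios, issued in parallel and then
 * reaped in order with biowait().
 */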
/*
 * Read the vdev config from a device.  Return the number of valid labels
 * that were found.  The vdev config will be returned in *configp if and
 * only if at least one valid label was found.
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
{
	struct g_provider *pp;
	nvlist_t *config;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, nlabels;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));

	size = sizeof (*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof (vdev_lists[0]->vp_nvlist);

	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT0(offsets[l] % pp->sectorsize);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	config = *configp = NULL;
	nlabels = 0;
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(config);
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(config);
			continue;
		}

		if (*configp != NULL)
			nvlist_free(*configp);
		*configp = config;
		nlabels++;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (nlabels);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof (void *));
	*configs = new_configs;
	*count = id + 1;
}
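
/*
 * Fold one label config into the array of per-top-level-vdev configs for
 * the named pool: the config is kept only if the pool name and guid match
 * and it is newer (higher txg) than whatever is already stored for the
 * same vdev id; otherwise it is freed.
 */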
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		known_txg = fnvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int nlabels;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (nlabels == 0)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

enum match {
	NO_MATCH = 0,		/* No matching labels found */
	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
	ZERO_MATCH = 1,		/* Should never be returned */
	ONE_MATCH = 2,		/* 1 label matches the vdev_guid */
	TWO_MATCH = 3,		/* 2 labels match the vdev_guid */
	THREE_MATCH = 4,	/* 3 labels match the vdev_guid */
	FULL_MATCH = 5		/* all labels match the vdev_guid */
};
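
/*
 * Taste a provider and grade how well its label data matches the given
 * vdev.  nlabels matching labels map to ZERO_MATCH + nlabels, so a device
 * with all four labels intact yields FULL_MATCH (5).
 */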
static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;
	int nlabels;

	cp = vdev_geom_attach(pp, NULL, B_TRUE);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);
	if (nlabels == 0) {
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles a possible race on vdev detach, where
	 * the remaining vdev receives the GUID of the destroyed top-level
	 * mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (ZERO_MATCH + nlabels);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOPGUID_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp, *best_pp;
	struct g_consumer *cp;
	const char *vdpath;
	enum match match, best_match;

	g_topology_assert();

	vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
	cp = NULL;
	best_pp = NULL;
	best_match = NO_MATCH;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				match = vdev_attach_ok(vd, pp);
				if (match > best_match) {
					best_match = match;
					best_pp = pp;
				} else if (match == best_match) {
					if (strcmp(pp->name, vdpath) == 0) {
						best_pp = pp;
					}
				}
				if (match == FULL_MATCH)
					goto out;
			}
		}
	}

out:
	if (best_pp) {
		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
		if (cp == NULL) {
			printf("ZFS WARNING: Unable to attach to %s.\n",
			    best_pp->name);
		}
	}
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, cp->provider->name);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
			cp = vdev_geom_attach(pp, vd, B_FALSE);
	}

	return (cp);
}
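
/*
 * Open the GEOM provider backing this vdev.  New or split vdevs are looked
 * up by path alone; existing vdevs must also prove their identity by GUID,
 * falling back to a search of every provider in the system.  On success a
 * write access count is taken if the pool is opened for writing, and the
 * device's geometry is collected for the ZIO pipeline.
 */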
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error, has_trim;
	uint16_t rate;

	/*
	 * Set the TLS to indicate downstack that we
	 * should not access zvols
	 */
	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd));

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if ((cp = vd->vdev_tsd) != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_is_splitting ||
	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user really wants to do this, and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs.  The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL));

	if (cp == NULL) {
		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
		error = ENOENT;
	} else {
		struct consumer_priv_t *priv;
		struct consumer_vdev_elem *elem;
		int spamode;

		priv = (struct consumer_priv_t *)&cp->private;
		if (cp->private == NULL)
			SLIST_INIT(priv);
		elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
		elem->vd = vd;
		SLIST_INSERT_HEAD(priv, elem, elems);

		spamode = spa_mode(vd->vdev_spa);
		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
		    !ISP2(cp->provider->sectorsize)) {
			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
			    cp->provider->name);

			vdev_geom_close_locked(vd);
			error = EINVAL;
			cp = NULL;
		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
			int i;

			for (i = 0; i < 5; i++) {
				error = g_access(cp, 0, 1, 0);
				if (error == 0)
					break;
				g_topology_unlock();
				tsleep(vd, 0, "vdev", hz / 2);
				g_topology_lock();
			}
			if (error != 0) {
				printf("ZFS WARNING: Unable to open %s for "
				    "writing (error=%d).\n",
				    cp->provider->name, error);
				vdev_geom_close_locked(vd);
				cp = NULL;
			}
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL) {
		vdev_geom_attrchanged(cp, "GEOM::physpath");

		/* Set other GEOM characteristics */
		vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
		    error);
		return (error);
	}
skip_open:
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
	    ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
	    pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * E.g. a 512-byte sectorsize yields a logical_ashift of 9, and a
	 * 4 KiB stripesize (with zero stripeoffset) a physical_ashift of 12.
	 */

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational. */
	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0 && rate == DISK_RR_NON_ROTATING)
		vd->vdev_nonrot = B_TRUE;
	else
		vd->vdev_nonrot = B_FALSE;

	/* Set when device reports it supports TRIM. */
	error = g_getattr("GEOM::candelete", cp, &has_trim);
	vd->vdev_has_trim = (error == 0 && has_trim);

	/* Set when device reports it supports secure TRIM. */
	/* unavailable on FreeBSD */
	vd->vdev_has_securetrim = B_FALSE;

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{
	struct g_consumer *cp;
	boolean_t locked;

	cp = vd->vdev_tsd;

	DROP_GIANT();
	locked = g_topology_locked();
	if (!locked)
		g_topology_lock();

	if (!vd->vdev_reopening ||
	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
	    (cp->provider != NULL && cp->provider->error != 0))))
		vdev_geom_close_locked(vd);

	if (!locked)
		g_topology_unlock();
	PICKUP_GIANT();
}
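
/*
 * bio completion callback, invoked from GEOM's completion context (or
 * directly from the issuing thread when G_CF_DIRECT_RECEIVE applies).
 * It propagates the bio's error into the zio and flags persistent
 * conditions: ENOTSUP on BIO_FLUSH disables future cache flushes, and
 * ENXIO with a provider error requests asynchronous removal of the vdev.
 */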
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed.  In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}

	/*
	 * We have to split bio freeing into two parts, because the ABD code
	 * cannot be called in this context and vdev_op_io_done is not called
	 * for ZIO_TYPE_IOCTL zios.
	 */
	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
		g_destroy_bio(bp);
		zio->io_bio = NULL;
	}
	zio_delay_interrupt(zio);
}

struct vdev_geom_check_unmapped_cb_state {
	int	pages;
	uint_t	end;
};

/*
 * Callback to check the ABD segment size/alignment and count the pages.
 * GEOM requires the data buffer to look virtually contiguous.  That means
 * only the first page of the buffer may start, and only the last page may
 * end, part-way through a page; all other physical pages must be full.
 */
static int
vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv)
{
	struct vdev_geom_check_unmapped_cb_state *s = priv;
	vm_offset_t off = (vm_offset_t)buf & PAGE_MASK;

	if (s->pages != 0 && off != 0)
		return (1);
	if (s->end != 0)
		return (1);
	s->end = (off + len) & PAGE_MASK;
	s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT;
	return (0);
}

/*
 * Check whether we can use unmapped I/O for this ZIO on this device to
 * avoid data copying between scattered and/or gang ABD buffers and a
 * linear one.
 */
static int
vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp)
{
	struct vdev_geom_check_unmapped_cb_state s;

	/* If unmapped I/O is administratively disabled, respect that. */
	if (!unmapped_buf_allowed)
		return (0);

	/* If the buffer is already linear, then nothing to do here. */
	if (abd_is_linear(zio->io_abd))
		return (0);

	/*
	 * If unmapped I/O is not supported by the GEOM provider,
	 * then we can't do anything and have to copy the data.
	 */
	if ((cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0)
		return (0);

	/* Check the buffer chunks sizes/alignments and count pages. */
	s.pages = s.end = 0;
	if (abd_iterate_func(zio->io_abd, 0, zio->io_size,
	    vdev_geom_check_unmapped_cb, &s))
		return (0);
	return (s.pages);
}
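
/*
 * A return of 0 from vdev_geom_check_unmapped() means "use a mapped,
 * linear buffer"; a positive return is the page count vdev_geom_io_start()
 * uses to size the bio_ma array for an unmapped bio.
 */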
/*
 * Callback to translate the ABD segment into an array of physical pages.
 */
static int
vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv)
{
	struct bio *bp = priv;
	vm_offset_t addr = (vm_offset_t)buf;
	vm_offset_t end = addr + len;

	if (bp->bio_ma_n == 0)
		bp->bio_ma_offset = addr & PAGE_MASK;
	do {
		bp->bio_ma[bp->bio_ma_n++] =
		    PHYS_TO_VM_PAGE(pmap_kextract(addr));
		addr += PAGE_SIZE;
	} while (addr < end);
	return (0);
}
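
/*
 * I/O entry point for the vdev: translate the zio into a bio (BIO_READ,
 * BIO_WRITE, BIO_DELETE, or BIO_FLUSH), attach the data buffer either as
 * an unmapped page array or as a borrowed linear buffer, and hand the bio
 * to GEOM; completion is reported through vdev_geom_io_intr().
 */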
static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_TRIM:
		if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	default:
		;
		/* PASSTHROUGH --- placate compiler */
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_TRIM ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		if (zio->io_type == ZIO_TYPE_READ)
			bp->bio_cmd = BIO_READ;
		else
			bp->bio_cmd = BIO_WRITE;

		/*
		 * If possible, represent a scattered and/or gang ABD buffer
		 * to GEOM as an array of physical pages.  This satisfies
		 * GEOM's requirement of a virtually contiguous buffer
		 * without copying.
		 */
		int pgs = vdev_geom_check_unmapped(zio, cp);
		if (pgs > 0) {
			bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs,
			    M_DEVBUF, M_WAITOK);
			bp->bio_ma_n = 0;
			bp->bio_ma_offset = 0;
			abd_iterate_func(zio->io_abd, 0, zio->io_size,
			    vdev_geom_fill_unmap_cb, bp);
			bp->bio_data = unmapped_buf;
			bp->bio_flags |= BIO_UNMAPPED;
		} else {
			if (zio->io_type == ZIO_TYPE_READ) {
				bp->bio_data = abd_borrow_buf(zio->io_abd,
				    zio->io_size);
			} else {
				bp->bio_data = abd_borrow_buf_copy(zio->io_abd,
				    zio->io_size);
			}
		}
		break;
	case ZIO_TYPE_TRIM:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	default:
		panic("invalid zio->io_type: %d\n", zio->io_type);
	}
	bp->bio_done = vdev_geom_io_intr;
	zio->io_bio = bp;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
	struct bio *bp = zio->io_bio;

	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
		ASSERT3P(bp, ==, NULL);
		return;
	}

	if (bp == NULL) {
		ASSERT3S(zio->io_error, ==, ENXIO);
		return;
	}

	if (bp->bio_ma != NULL) {
		free(bp->bio_ma, M_DEVBUF);
	} else {
		if (zio->io_type == ZIO_TYPE_READ) {
			abd_return_buf_copy(zio->io_abd, bp->bio_data,
			    zio->io_size);
		} else {
			abd_return_buf(zio->io_abd, bp->bio_data,
			    zio->io_size);
		}
	}

	g_destroy_bio(bp);
	zio->io_bio = NULL;
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_geom_open,
	.vdev_op_close = vdev_geom_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_geom_io_start,
	.vdev_op_io_done = vdev_geom_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_geom_hold,
	.vdev_op_rele = vdev_geom_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE			/* leaf vdev */
};