1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include "spdk/stdinc.h" 36 37 #include "bdev_nvme.h" 38 39 #include "spdk/accel_engine.h" 40 #include "spdk/config.h" 41 #include "spdk/endian.h" 42 #include "spdk/bdev.h" 43 #include "spdk/json.h" 44 #include "spdk/likely.h" 45 #include "spdk/nvme.h" 46 #include "spdk/nvme_ocssd.h" 47 #include "spdk/nvme_zns.h" 48 #include "spdk/opal.h" 49 #include "spdk/thread.h" 50 #include "spdk/string.h" 51 #include "spdk/util.h" 52 53 #include "spdk/bdev_module.h" 54 #include "spdk/log.h" 55 56 #include "spdk_internal/usdt.h" 57 58 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 59 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 60 61 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 62 63 struct nvme_bdev_io { 64 /** array of iovecs to transfer. */ 65 struct iovec *iovs; 66 67 /** Number of iovecs in iovs array. */ 68 int iovcnt; 69 70 /** Current iovec position. */ 71 int iovpos; 72 73 /** Offset in current iovec. */ 74 uint32_t iov_offset; 75 76 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 77 * being reset in a reset I/O. 78 */ 79 struct nvme_io_path *io_path; 80 81 /** array of iovecs to transfer. */ 82 struct iovec *fused_iovs; 83 84 /** Number of iovecs in iovs array. */ 85 int fused_iovcnt; 86 87 /** Current iovec position. */ 88 int fused_iovpos; 89 90 /** Offset in current iovec. */ 91 uint32_t fused_iov_offset; 92 93 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 94 struct spdk_nvme_cpl cpl; 95 96 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 97 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 98 99 /** Originating thread */ 100 struct spdk_thread *orig_thread; 101 102 /** Keeps track if first of fused commands was submitted */ 103 bool first_fused_submitted; 104 105 /** Temporary pointer to zone report buffer */ 106 struct spdk_nvme_zns_zone_report *zone_report_buf; 107 108 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 109 uint64_t handled_zones; 110 111 /** Expiration value in ticks to retry the current I/O. */ 112 uint64_t retry_ticks; 113 114 /* How many times the current I/O was retried. */ 115 int32_t retry_count; 116 }; 117 118 struct nvme_probe_skip_entry { 119 struct spdk_nvme_transport_id trid; 120 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 121 }; 122 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 123 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 124 g_skipped_nvme_ctrlrs); 125 126 static struct spdk_bdev_nvme_opts g_opts = { 127 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 128 .timeout_us = 0, 129 .timeout_admin_us = 0, 130 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 131 .transport_retry_count = 4, 132 .arbitration_burst = 0, 133 .low_priority_weight = 0, 134 .medium_priority_weight = 0, 135 .high_priority_weight = 0, 136 .nvme_adminq_poll_period_us = 10000ULL, 137 .nvme_ioq_poll_period_us = 0, 138 .io_queue_requests = 0, 139 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 140 .bdev_retry_count = 3, 141 .transport_ack_timeout = 0, 142 }; 143 144 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 145 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 146 147 static int g_hot_insert_nvme_controller_index = 0; 148 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 149 static bool g_nvme_hotplug_enabled = false; 150 static struct spdk_thread *g_bdev_nvme_init_thread; 151 static struct spdk_poller *g_hotplug_poller; 152 static struct spdk_poller *g_hotplug_probe_poller; 153 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 154 155 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 156 struct nvme_async_probe_ctx *ctx); 157 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 158 struct nvme_async_probe_ctx *ctx); 159 static int bdev_nvme_library_init(void); 160 static void bdev_nvme_library_fini(void); 161 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 162 struct spdk_bdev_io *bdev_io); 163 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 164 void *md, uint64_t lba_count, uint64_t lba, 165 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); 166 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 167 void *md, uint64_t lba_count, uint64_t lba); 168 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 169 void *md, uint64_t lba_count, uint64_t lba, 170 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); 171 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 172 void *md, uint64_t lba_count, 173 uint64_t zslba, uint32_t flags); 174 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 175 void *md, uint64_t lba_count, uint64_t lba, 176 uint32_t flags); 177 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 178 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 179 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 180 uint32_t flags); 181 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 182 uint32_t num_zones, struct spdk_bdev_zone_info *info); 183 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 184 enum spdk_bdev_zone_action action); 185 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 186 struct nvme_bdev_io *bio, 187 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 188 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 189 void *buf, size_t nbytes); 190 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 191 void *buf, size_t nbytes, void *md_buf, size_t md_len); 192 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 193 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 194 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 195 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr); 196 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove); 197 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 198 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 199 200 static int 201 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 202 { 203 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 204 } 205 206 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 207 208 struct spdk_nvme_qpair * 209 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 210 { 211 struct nvme_ctrlr_channel *ctrlr_ch; 212 213 assert(ctrlr_io_ch != NULL); 214 215 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 216 217 return ctrlr_ch->qpair; 218 } 219 220 static int 221 bdev_nvme_get_ctx_size(void) 222 { 223 return sizeof(struct nvme_bdev_io); 224 } 225 226 static struct spdk_bdev_module nvme_if = { 227 .name = "nvme", 228 .async_fini = true, 229 .module_init = bdev_nvme_library_init, 230 .module_fini = bdev_nvme_library_fini, 231 .config_json = bdev_nvme_config_json, 232 .get_ctx_size = bdev_nvme_get_ctx_size, 233 234 }; 235 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 236 237 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 238 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 239 bool g_bdev_nvme_module_finish; 240 241 struct nvme_bdev_ctrlr * 242 nvme_bdev_ctrlr_get_by_name(const char *name) 243 { 244 struct nvme_bdev_ctrlr *nbdev_ctrlr; 245 246 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 247 if (strcmp(name, nbdev_ctrlr->name) == 0) { 248 break; 249 } 250 } 251 252 return nbdev_ctrlr; 253 } 254 255 static struct nvme_ctrlr * 256 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 257 const struct spdk_nvme_transport_id *trid) 258 { 259 struct nvme_ctrlr *nvme_ctrlr; 260 261 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 262 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 263 break; 264 } 265 } 266 267 return nvme_ctrlr; 268 } 269 270 static struct nvme_bdev * 271 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 272 { 273 struct nvme_bdev *bdev; 274 275 pthread_mutex_lock(&g_bdev_nvme_mutex); 276 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 277 if (bdev->nsid == nsid) { 278 break; 279 } 280 } 281 pthread_mutex_unlock(&g_bdev_nvme_mutex); 282 283 return bdev; 284 } 285 286 struct nvme_ns * 287 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 288 { 289 struct nvme_ns ns; 290 291 assert(nsid > 0); 292 293 ns.id = nsid; 294 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 295 } 296 297 struct nvme_ns * 298 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 299 { 300 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 301 } 302 303 struct nvme_ns * 304 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 305 { 306 if (ns == NULL) { 307 return NULL; 308 } 309 310 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 311 } 312 313 static struct nvme_ctrlr * 314 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 315 { 316 struct nvme_bdev_ctrlr *nbdev_ctrlr; 317 struct nvme_ctrlr *nvme_ctrlr = NULL; 318 319 pthread_mutex_lock(&g_bdev_nvme_mutex); 320 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 321 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 322 if (nvme_ctrlr != NULL) { 323 break; 324 } 325 } 326 pthread_mutex_unlock(&g_bdev_nvme_mutex); 327 328 return nvme_ctrlr; 329 } 330 331 struct nvme_ctrlr * 332 nvme_ctrlr_get_by_name(const char *name) 333 { 334 struct nvme_bdev_ctrlr *nbdev_ctrlr; 335 struct nvme_ctrlr *nvme_ctrlr = NULL; 336 337 if (name == NULL) { 338 return NULL; 339 } 340 341 pthread_mutex_lock(&g_bdev_nvme_mutex); 342 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 343 if (nbdev_ctrlr != NULL) { 344 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 345 } 346 pthread_mutex_unlock(&g_bdev_nvme_mutex); 347 348 return nvme_ctrlr; 349 } 350 351 void 352 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 353 { 354 struct nvme_bdev_ctrlr *nbdev_ctrlr; 355 356 pthread_mutex_lock(&g_bdev_nvme_mutex); 357 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 358 fn(nbdev_ctrlr, ctx); 359 } 360 pthread_mutex_unlock(&g_bdev_nvme_mutex); 361 } 362 363 void 364 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 365 { 366 const char *trtype_str; 367 const char *adrfam_str; 368 369 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 370 if (trtype_str) { 371 spdk_json_write_named_string(w, "trtype", trtype_str); 372 } 373 374 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 375 if (adrfam_str) { 376 spdk_json_write_named_string(w, "adrfam", adrfam_str); 377 } 378 379 if (trid->traddr[0] != '\0') { 380 spdk_json_write_named_string(w, "traddr", trid->traddr); 381 } 382 383 if (trid->trsvcid[0] != '\0') { 384 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 385 } 386 387 if (trid->subnqn[0] != '\0') { 388 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 389 } 390 } 391 392 static void 393 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 394 struct nvme_ctrlr *nvme_ctrlr) 395 { 396 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 397 pthread_mutex_lock(&g_bdev_nvme_mutex); 398 399 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 400 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 401 pthread_mutex_unlock(&g_bdev_nvme_mutex); 402 403 return; 404 } 405 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 406 407 pthread_mutex_unlock(&g_bdev_nvme_mutex); 408 409 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 410 411 free(nbdev_ctrlr->name); 412 free(nbdev_ctrlr); 413 } 414 415 static void 416 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 417 { 418 struct nvme_path_id *path_id, *tmp_path; 419 struct nvme_ns *ns, *tmp_ns; 420 421 free(nvme_ctrlr->copied_ana_desc); 422 spdk_free(nvme_ctrlr->ana_log_page); 423 424 if (nvme_ctrlr->opal_dev) { 425 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 426 nvme_ctrlr->opal_dev = NULL; 427 } 428 429 if (nvme_ctrlr->nbdev_ctrlr) { 430 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 431 } 432 433 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 434 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 435 free(ns); 436 } 437 438 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 439 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 440 free(path_id); 441 } 442 443 pthread_mutex_destroy(&nvme_ctrlr->mutex); 444 445 free(nvme_ctrlr); 446 447 pthread_mutex_lock(&g_bdev_nvme_mutex); 448 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 449 pthread_mutex_unlock(&g_bdev_nvme_mutex); 450 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 451 spdk_bdev_module_fini_done(); 452 return; 453 } 454 pthread_mutex_unlock(&g_bdev_nvme_mutex); 455 } 456 457 static int 458 nvme_detach_poller(void *arg) 459 { 460 struct nvme_ctrlr *nvme_ctrlr = arg; 461 int rc; 462 463 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 464 if (rc != -EAGAIN) { 465 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 466 _nvme_ctrlr_delete(nvme_ctrlr); 467 } 468 469 return SPDK_POLLER_BUSY; 470 } 471 472 static void 473 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 474 { 475 int rc; 476 477 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 478 479 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 480 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 481 482 /* If we got here, the reset/detach poller cannot be active */ 483 assert(nvme_ctrlr->reset_detach_poller == NULL); 484 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 485 nvme_ctrlr, 1000); 486 if (nvme_ctrlr->reset_detach_poller == NULL) { 487 SPDK_ERRLOG("Failed to register detach poller\n"); 488 goto error; 489 } 490 491 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 492 if (rc != 0) { 493 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 494 goto error; 495 } 496 497 return; 498 error: 499 /* We don't have a good way to handle errors here, so just do what we can and delete the 500 * controller without detaching the underlying NVMe device. 501 */ 502 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 503 _nvme_ctrlr_delete(nvme_ctrlr); 504 } 505 506 static void 507 nvme_ctrlr_unregister_cb(void *io_device) 508 { 509 struct nvme_ctrlr *nvme_ctrlr = io_device; 510 511 nvme_ctrlr_delete(nvme_ctrlr); 512 } 513 514 static void 515 nvme_ctrlr_unregister(struct nvme_ctrlr *nvme_ctrlr) 516 { 517 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 518 } 519 520 static bool 521 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 522 { 523 if (!nvme_ctrlr->destruct) { 524 return false; 525 } 526 527 if (nvme_ctrlr->ref > 0) { 528 return false; 529 } 530 531 if (nvme_ctrlr->resetting) { 532 return false; 533 } 534 535 if (nvme_ctrlr->ana_log_page_updating) { 536 return false; 537 } 538 539 return true; 540 } 541 542 static void 543 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 544 { 545 pthread_mutex_lock(&nvme_ctrlr->mutex); 546 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 547 548 assert(nvme_ctrlr->ref > 0); 549 nvme_ctrlr->ref--; 550 551 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 552 pthread_mutex_unlock(&nvme_ctrlr->mutex); 553 return; 554 } 555 556 pthread_mutex_unlock(&nvme_ctrlr->mutex); 557 558 nvme_ctrlr_unregister(nvme_ctrlr); 559 } 560 561 static struct nvme_io_path * 562 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 563 { 564 struct nvme_io_path *io_path; 565 566 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 567 if (io_path->nvme_ns == nvme_ns) { 568 break; 569 } 570 } 571 572 return io_path; 573 } 574 575 static int 576 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 577 { 578 struct nvme_io_path *io_path; 579 struct spdk_io_channel *ch; 580 581 io_path = calloc(1, sizeof(*io_path)); 582 if (io_path == NULL) { 583 SPDK_ERRLOG("Failed to alloc io_path.\n"); 584 return -ENOMEM; 585 } 586 587 ch = spdk_get_io_channel(nvme_ns->ctrlr); 588 if (ch == NULL) { 589 free(io_path); 590 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 591 return -ENOMEM; 592 } 593 594 io_path->ctrlr_ch = spdk_io_channel_get_ctx(ch); 595 TAILQ_INSERT_TAIL(&io_path->ctrlr_ch->io_path_list, io_path, tailq); 596 597 io_path->nvme_ns = nvme_ns; 598 599 io_path->nbdev_ch = nbdev_ch; 600 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 601 602 nbdev_ch->current_io_path = NULL; 603 604 return 0; 605 } 606 607 static void 608 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 609 { 610 struct spdk_io_channel *ch; 611 612 nbdev_ch->current_io_path = NULL; 613 614 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 615 616 TAILQ_REMOVE(&io_path->ctrlr_ch->io_path_list, io_path, tailq); 617 ch = spdk_io_channel_from_ctx(io_path->ctrlr_ch); 618 spdk_put_io_channel(ch); 619 620 free(io_path); 621 } 622 623 static void 624 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 625 { 626 struct nvme_io_path *io_path, *tmp_io_path; 627 628 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 629 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 630 } 631 } 632 633 static int 634 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 635 { 636 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 637 struct nvme_bdev *nbdev = io_device; 638 struct nvme_ns *nvme_ns; 639 int rc; 640 641 STAILQ_INIT(&nbdev_ch->io_path_list); 642 TAILQ_INIT(&nbdev_ch->retry_io_list); 643 644 pthread_mutex_lock(&nbdev->mutex); 645 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 646 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 647 if (rc != 0) { 648 pthread_mutex_unlock(&nbdev->mutex); 649 650 _bdev_nvme_delete_io_paths(nbdev_ch); 651 return rc; 652 } 653 } 654 pthread_mutex_unlock(&nbdev->mutex); 655 656 return 0; 657 } 658 659 static void 660 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 661 { 662 struct spdk_bdev_io *bdev_io, *tmp_io; 663 664 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 665 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 666 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 667 } 668 669 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 670 } 671 672 static void 673 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 674 { 675 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 676 677 bdev_nvme_abort_retry_ios(nbdev_ch); 678 _bdev_nvme_delete_io_paths(nbdev_ch); 679 } 680 681 static inline bool 682 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 683 { 684 switch (io_type) { 685 case SPDK_BDEV_IO_TYPE_RESET: 686 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 687 case SPDK_BDEV_IO_TYPE_ABORT: 688 return true; 689 default: 690 break; 691 } 692 693 return false; 694 } 695 696 static inline bool 697 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 698 { 699 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 700 return false; 701 } 702 703 switch (nvme_ns->ana_state) { 704 case SPDK_NVME_ANA_OPTIMIZED_STATE: 705 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 706 return true; 707 default: 708 break; 709 } 710 711 return false; 712 } 713 714 static inline bool 715 nvme_io_path_is_connected(struct nvme_io_path *io_path) 716 { 717 return io_path->ctrlr_ch->qpair != NULL; 718 } 719 720 static inline bool 721 nvme_io_path_is_available(struct nvme_io_path *io_path) 722 { 723 if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { 724 return false; 725 } 726 727 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 728 return false; 729 } 730 731 return true; 732 } 733 734 static inline bool 735 nvme_io_path_is_failed(struct nvme_io_path *io_path) 736 { 737 struct nvme_ctrlr *nvme_ctrlr; 738 739 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch); 740 741 if (nvme_ctrlr->destruct) { 742 return true; 743 } 744 745 if (nvme_ctrlr->fast_io_fail_timedout) { 746 return true; 747 } 748 749 if (nvme_ctrlr->resetting) { 750 if (nvme_ctrlr->reconnect_delay_sec != 0) { 751 return false; 752 } else { 753 return true; 754 } 755 } 756 757 if (nvme_ctrlr->reconnect_is_delayed) { 758 return false; 759 } 760 761 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 762 return true; 763 } else { 764 return false; 765 } 766 } 767 768 static bool 769 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 770 { 771 if (nvme_ctrlr->destruct) { 772 return false; 773 } 774 775 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 776 return false; 777 } 778 779 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 780 return false; 781 } 782 783 return true; 784 } 785 786 static inline struct nvme_io_path * 787 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 788 { 789 struct nvme_io_path *io_path, *non_optimized = NULL; 790 791 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 792 return nbdev_ch->current_io_path; 793 } 794 795 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 796 if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { 797 /* The device is currently resetting. */ 798 continue; 799 } 800 801 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 802 continue; 803 } 804 805 switch (io_path->nvme_ns->ana_state) { 806 case SPDK_NVME_ANA_OPTIMIZED_STATE: 807 nbdev_ch->current_io_path = io_path; 808 return io_path; 809 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 810 if (non_optimized == NULL) { 811 non_optimized = io_path; 812 } 813 break; 814 default: 815 break; 816 } 817 } 818 819 return non_optimized; 820 } 821 822 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 823 * or false otherwise. 824 * 825 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 826 * is likely to be non-accessible now but may become accessible. 827 * 828 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 829 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 830 * when starting to reset it but it is set to failed when the reset failed. Hence, if 831 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 832 */ 833 static bool 834 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 835 { 836 struct nvme_io_path *io_path; 837 838 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 839 if (nvme_io_path_is_connected(io_path) || 840 !nvme_io_path_is_failed(io_path)) { 841 return true; 842 } 843 } 844 845 return false; 846 } 847 848 static bool 849 any_ctrlr_may_become_available(struct nvme_bdev_channel *nbdev_ch) 850 { 851 struct nvme_io_path *io_path; 852 853 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 854 if (!nvme_io_path_is_failed(io_path)) { 855 return true; 856 } 857 } 858 859 return false; 860 } 861 862 static int 863 bdev_nvme_retry_ios(void *arg) 864 { 865 struct nvme_bdev_channel *nbdev_ch = arg; 866 struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch); 867 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 868 struct nvme_bdev_io *bio; 869 uint64_t now, delay_us; 870 871 now = spdk_get_ticks(); 872 873 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 874 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 875 if (bio->retry_ticks > now) { 876 break; 877 } 878 879 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 880 881 bdev_nvme_submit_request(ch, bdev_io); 882 } 883 884 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 885 886 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 887 if (bdev_io != NULL) { 888 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 889 890 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 891 892 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 893 delay_us); 894 } 895 896 return SPDK_POLLER_BUSY; 897 } 898 899 static void 900 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 901 struct nvme_bdev_io *bio, uint64_t delay_ms) 902 { 903 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 904 struct spdk_bdev_io *tmp_bdev_io; 905 struct nvme_bdev_io *tmp_bio; 906 907 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 908 909 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 910 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 911 912 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 913 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 914 module_link); 915 return; 916 } 917 } 918 919 /* No earlier I/Os were found. This I/O must be the new head. */ 920 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 921 922 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 923 924 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 925 delay_ms * 1000ULL); 926 } 927 928 static inline void 929 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 930 const struct spdk_nvme_cpl *cpl) 931 { 932 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 933 struct nvme_bdev_channel *nbdev_ch; 934 struct nvme_ctrlr *nvme_ctrlr; 935 const struct spdk_nvme_ctrlr_data *cdata; 936 uint64_t delay_ms; 937 938 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 939 940 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 941 goto complete; 942 } 943 944 if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 && 945 bio->retry_count >= g_opts.bdev_retry_count)) { 946 goto complete; 947 } 948 949 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 950 951 assert(bio->io_path != NULL); 952 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch); 953 954 if (spdk_nvme_cpl_is_path_error(cpl) || 955 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 956 !nvme_io_path_is_available(bio->io_path) || 957 !nvme_ctrlr_is_available(nvme_ctrlr)) { 958 nbdev_ch->current_io_path = NULL; 959 if (spdk_nvme_cpl_is_ana_error(cpl)) { 960 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 961 bio->io_path->nvme_ns->ana_state_updating = true; 962 } 963 } 964 delay_ms = 0; 965 } else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) { 966 goto complete; 967 } else { 968 bio->retry_count++; 969 970 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 971 972 if (cpl->status.crd != 0) { 973 delay_ms = cdata->crdt[cpl->status.crd] * 100; 974 } else { 975 delay_ms = 0; 976 } 977 } 978 979 if (any_io_path_may_become_available(nbdev_ch)) { 980 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 981 return; 982 } 983 984 complete: 985 bio->retry_count = 0; 986 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 987 } 988 989 static inline void 990 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 991 { 992 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 993 struct nvme_bdev_channel *nbdev_ch; 994 enum spdk_bdev_io_status io_status; 995 996 switch (rc) { 997 case 0: 998 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 999 break; 1000 case -ENOMEM: 1001 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1002 break; 1003 case -ENXIO: 1004 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1005 1006 nbdev_ch->current_io_path = NULL; 1007 1008 if (any_io_path_may_become_available(nbdev_ch)) { 1009 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1010 return; 1011 } 1012 1013 /* fallthrough */ 1014 default: 1015 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1016 break; 1017 } 1018 1019 bio->retry_count = 0; 1020 spdk_bdev_io_complete(bdev_io, io_status); 1021 } 1022 1023 static inline void 1024 bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc) 1025 { 1026 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1027 struct nvme_bdev_channel *nbdev_ch; 1028 enum spdk_bdev_io_status io_status; 1029 1030 switch (rc) { 1031 case 0: 1032 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1033 break; 1034 case -ENOMEM: 1035 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1036 break; 1037 case -ENXIO: 1038 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1039 1040 if (any_ctrlr_may_become_available(nbdev_ch)) { 1041 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1042 return; 1043 } 1044 1045 /* fallthrough */ 1046 default: 1047 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1048 break; 1049 } 1050 1051 bio->retry_count = 0; 1052 spdk_bdev_io_complete(bdev_io, io_status); 1053 } 1054 1055 static void 1056 _bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel *ctrlr_ch) 1057 { 1058 struct nvme_io_path *io_path; 1059 1060 TAILQ_FOREACH(io_path, &ctrlr_ch->io_path_list, tailq) { 1061 io_path->nbdev_ch->current_io_path = NULL; 1062 } 1063 } 1064 1065 static struct nvme_ctrlr_channel * 1066 nvme_poll_group_get_ctrlr_channel(struct nvme_poll_group *group, 1067 struct spdk_nvme_qpair *qpair) 1068 { 1069 struct nvme_ctrlr_channel *ctrlr_ch; 1070 1071 TAILQ_FOREACH(ctrlr_ch, &group->ctrlr_ch_list, tailq) { 1072 if (ctrlr_ch->qpair == qpair) { 1073 break; 1074 } 1075 } 1076 1077 return ctrlr_ch; 1078 } 1079 1080 static void 1081 bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch) 1082 { 1083 struct nvme_ctrlr *nvme_ctrlr __attribute__((unused)); 1084 1085 if (ctrlr_ch->qpair != NULL) { 1086 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch); 1087 SPDK_DTRACE_PROBE2(bdev_nvme_destroy_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1088 spdk_nvme_qpair_get_id(ctrlr_ch->qpair)); 1089 spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair); 1090 ctrlr_ch->qpair = NULL; 1091 } 1092 1093 _bdev_nvme_clear_io_path_cache(ctrlr_ch); 1094 } 1095 1096 static void 1097 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1098 { 1099 struct nvme_poll_group *group = poll_group_ctx; 1100 struct nvme_ctrlr_channel *ctrlr_ch; 1101 struct nvme_ctrlr *nvme_ctrlr; 1102 1103 SPDK_NOTICELOG("qpair %p is disconnected, free the qpair and reset controller.\n", qpair); 1104 /* 1105 * Free the I/O qpair and reset the nvme_ctrlr. 1106 */ 1107 ctrlr_ch = nvme_poll_group_get_ctrlr_channel(group, qpair); 1108 if (ctrlr_ch != NULL) { 1109 bdev_nvme_destroy_qpair(ctrlr_ch); 1110 1111 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch); 1112 bdev_nvme_reset(nvme_ctrlr); 1113 } 1114 } 1115 1116 static int 1117 bdev_nvme_poll(void *arg) 1118 { 1119 struct nvme_poll_group *group = arg; 1120 int64_t num_completions; 1121 1122 if (group->collect_spin_stat && group->start_ticks == 0) { 1123 group->start_ticks = spdk_get_ticks(); 1124 } 1125 1126 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1127 bdev_nvme_disconnected_qpair_cb); 1128 if (group->collect_spin_stat) { 1129 if (num_completions > 0) { 1130 if (group->end_ticks != 0) { 1131 group->spin_ticks += (group->end_ticks - group->start_ticks); 1132 group->end_ticks = 0; 1133 } 1134 group->start_ticks = 0; 1135 } else { 1136 group->end_ticks = spdk_get_ticks(); 1137 } 1138 } 1139 1140 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1141 } 1142 1143 static int 1144 bdev_nvme_poll_adminq(void *arg) 1145 { 1146 int32_t rc; 1147 struct nvme_ctrlr *nvme_ctrlr = arg; 1148 1149 assert(nvme_ctrlr != NULL); 1150 1151 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1152 if (rc < 0) { 1153 bdev_nvme_failover(nvme_ctrlr, false); 1154 } 1155 1156 return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1157 } 1158 1159 static void 1160 _bdev_nvme_unregister_dev_cb(void *io_device) 1161 { 1162 struct nvme_bdev *nvme_disk = io_device; 1163 1164 free(nvme_disk->disk.name); 1165 free(nvme_disk); 1166 } 1167 1168 static int 1169 bdev_nvme_destruct(void *ctx) 1170 { 1171 struct nvme_bdev *nvme_disk = ctx; 1172 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1173 1174 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1175 1176 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1177 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1178 1179 nvme_ns->bdev = NULL; 1180 1181 assert(nvme_ns->id > 0); 1182 1183 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1184 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1185 1186 nvme_ctrlr_release(nvme_ns->ctrlr); 1187 free(nvme_ns); 1188 } else { 1189 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1190 } 1191 } 1192 1193 pthread_mutex_lock(&g_bdev_nvme_mutex); 1194 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1195 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1196 1197 spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb); 1198 1199 return 0; 1200 } 1201 1202 static int 1203 bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 1204 { 1205 bdev_nvme_io_complete(bio, 0); 1206 1207 return 0; 1208 } 1209 1210 static int 1211 bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch) 1212 { 1213 struct nvme_ctrlr *nvme_ctrlr; 1214 struct spdk_nvme_io_qpair_opts opts; 1215 struct spdk_nvme_qpair *qpair; 1216 int rc; 1217 1218 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch); 1219 1220 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1221 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1222 opts.create_only = true; 1223 opts.async_mode = true; 1224 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1225 g_opts.io_queue_requests = opts.io_queue_requests; 1226 1227 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1228 if (qpair == NULL) { 1229 return -1; 1230 } 1231 1232 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1233 spdk_nvme_qpair_get_id(ctrlr_ch->qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1234 1235 assert(ctrlr_ch->group != NULL); 1236 1237 rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair); 1238 if (rc != 0) { 1239 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1240 goto err; 1241 } 1242 1243 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1244 if (rc != 0) { 1245 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1246 goto err; 1247 } 1248 1249 ctrlr_ch->qpair = qpair; 1250 1251 _bdev_nvme_clear_io_path_cache(ctrlr_ch); 1252 1253 return 0; 1254 1255 err: 1256 spdk_nvme_ctrlr_free_io_qpair(qpair); 1257 1258 return rc; 1259 } 1260 1261 static void 1262 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1263 { 1264 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1265 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1266 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1267 struct spdk_bdev_io *bdev_io; 1268 1269 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1270 status = SPDK_BDEV_IO_STATUS_FAILED; 1271 } 1272 1273 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1274 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1275 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1276 spdk_bdev_io_complete(bdev_io, status); 1277 } 1278 1279 spdk_for_each_channel_continue(i, 0); 1280 } 1281 1282 static void 1283 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1284 { 1285 struct nvme_path_id *path_id, *next_path; 1286 int rc __attribute__((unused)); 1287 1288 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1289 assert(path_id); 1290 assert(path_id == nvme_ctrlr->active_path_id); 1291 next_path = TAILQ_NEXT(path_id, link); 1292 1293 path_id->is_failed = true; 1294 1295 if (next_path) { 1296 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1297 1298 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1299 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1300 1301 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1302 nvme_ctrlr->active_path_id = next_path; 1303 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1304 assert(rc == 0); 1305 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1306 if (!remove) { 1307 /** Shuffle the old trid to the end of the list and use the new one. 1308 * Allows for round robin through multiple connections. 1309 */ 1310 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1311 } else { 1312 free(path_id); 1313 } 1314 } 1315 } 1316 1317 static bool 1318 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1319 { 1320 int32_t elapsed; 1321 1322 if (nvme_ctrlr->ctrlr_loss_timeout_sec == 0 || 1323 nvme_ctrlr->ctrlr_loss_timeout_sec == -1) { 1324 return false; 1325 } 1326 1327 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1328 if (elapsed >= nvme_ctrlr->ctrlr_loss_timeout_sec) { 1329 return true; 1330 } else { 1331 return false; 1332 } 1333 } 1334 1335 static bool 1336 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1337 { 1338 uint32_t elapsed; 1339 1340 if (nvme_ctrlr->fast_io_fail_timeout_sec == 0) { 1341 return false; 1342 } 1343 1344 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1345 if (elapsed >= nvme_ctrlr->fast_io_fail_timeout_sec) { 1346 return true; 1347 } else { 1348 return false; 1349 } 1350 } 1351 1352 enum bdev_nvme_op_after_reset { 1353 OP_NONE, 1354 OP_COMPLETE_PENDING_DESTRUCT, 1355 OP_DESTRUCT, 1356 OP_DELAYED_RECONNECT, 1357 }; 1358 1359 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1360 1361 static _bdev_nvme_op_after_reset 1362 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1363 { 1364 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1365 /* Complete pending destruct after reset completes. */ 1366 return OP_COMPLETE_PENDING_DESTRUCT; 1367 } else if (success || nvme_ctrlr->reconnect_delay_sec == 0) { 1368 nvme_ctrlr->reset_start_tsc = 0; 1369 return OP_NONE; 1370 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1371 return OP_DESTRUCT; 1372 } else { 1373 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1374 nvme_ctrlr->fast_io_fail_timedout = true; 1375 } 1376 bdev_nvme_failover_trid(nvme_ctrlr, false); 1377 return OP_DELAYED_RECONNECT; 1378 } 1379 } 1380 1381 static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1382 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1383 1384 static int 1385 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1386 { 1387 struct nvme_ctrlr *nvme_ctrlr = ctx; 1388 1389 pthread_mutex_lock(&nvme_ctrlr->mutex); 1390 1391 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1392 1393 assert(nvme_ctrlr->reconnect_is_delayed == true); 1394 nvme_ctrlr->reconnect_is_delayed = false; 1395 1396 if (nvme_ctrlr->destruct) { 1397 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1398 return SPDK_POLLER_BUSY; 1399 } 1400 1401 assert(nvme_ctrlr->resetting == false); 1402 nvme_ctrlr->resetting = true; 1403 1404 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1405 1406 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1407 1408 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1409 return SPDK_POLLER_BUSY; 1410 } 1411 1412 static void 1413 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1414 { 1415 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1416 1417 assert(nvme_ctrlr->reconnect_is_delayed == false); 1418 nvme_ctrlr->reconnect_is_delayed = true; 1419 1420 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1421 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1422 nvme_ctrlr, 1423 nvme_ctrlr->reconnect_delay_sec * SPDK_SEC_TO_USEC); 1424 } 1425 1426 static void 1427 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status) 1428 { 1429 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1430 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 1431 struct nvme_path_id *path_id; 1432 bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn; 1433 void *reset_cb_arg = nvme_ctrlr->reset_cb_arg; 1434 enum bdev_nvme_op_after_reset op_after_reset; 1435 1436 assert(nvme_ctrlr->thread == spdk_get_thread()); 1437 1438 nvme_ctrlr->reset_cb_fn = NULL; 1439 nvme_ctrlr->reset_cb_arg = NULL; 1440 1441 if (!success) { 1442 SPDK_ERRLOG("Resetting controller failed.\n"); 1443 } else { 1444 SPDK_NOTICELOG("Resetting controller successful.\n"); 1445 } 1446 1447 pthread_mutex_lock(&nvme_ctrlr->mutex); 1448 nvme_ctrlr->resetting = false; 1449 1450 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1451 assert(path_id != NULL); 1452 assert(path_id == nvme_ctrlr->active_path_id); 1453 1454 path_id->is_failed = !success; 1455 1456 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 1457 1458 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1459 1460 if (reset_cb_fn) { 1461 reset_cb_fn(reset_cb_arg, success); 1462 } 1463 1464 switch (op_after_reset) { 1465 case OP_COMPLETE_PENDING_DESTRUCT: 1466 nvme_ctrlr_unregister(nvme_ctrlr); 1467 break; 1468 case OP_DESTRUCT: 1469 _bdev_nvme_delete(nvme_ctrlr, false); 1470 break; 1471 case OP_DELAYED_RECONNECT: 1472 spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1473 bdev_nvme_start_reconnect_delay_timer(nvme_ctrlr); 1474 break; 1475 default: 1476 break; 1477 } 1478 } 1479 1480 static void 1481 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 1482 { 1483 /* Make sure we clear any pending resets before returning. */ 1484 spdk_for_each_channel(nvme_ctrlr, 1485 bdev_nvme_complete_pending_resets, 1486 success ? NULL : (void *)0x1, 1487 _bdev_nvme_reset_complete); 1488 } 1489 1490 static void 1491 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 1492 { 1493 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1494 1495 bdev_nvme_reset_complete(nvme_ctrlr, false); 1496 } 1497 1498 static void 1499 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 1500 { 1501 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1502 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 1503 1504 bdev_nvme_destroy_qpair(ctrlr_ch); 1505 1506 spdk_for_each_channel_continue(i, 0); 1507 } 1508 1509 static void 1510 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 1511 { 1512 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1513 1514 if (status == 0) { 1515 bdev_nvme_reset_complete(nvme_ctrlr, true); 1516 } else { 1517 /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ 1518 spdk_for_each_channel(nvme_ctrlr, 1519 bdev_nvme_reset_destroy_qpair, 1520 NULL, 1521 bdev_nvme_reset_create_qpairs_failed); 1522 } 1523 } 1524 1525 static void 1526 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 1527 { 1528 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1529 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1530 int rc; 1531 1532 rc = bdev_nvme_create_qpair(ctrlr_ch); 1533 1534 spdk_for_each_channel_continue(i, rc); 1535 } 1536 1537 static int 1538 bdev_nvme_reconnect_ctrlr_poll(void *arg) 1539 { 1540 struct nvme_ctrlr *nvme_ctrlr = arg; 1541 int rc = -ETIMEDOUT; 1542 1543 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1544 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 1545 if (rc == -EAGAIN) { 1546 return SPDK_POLLER_BUSY; 1547 } 1548 } 1549 1550 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 1551 if (rc == 0) { 1552 /* Recreate all of the I/O queue pairs */ 1553 spdk_for_each_channel(nvme_ctrlr, 1554 bdev_nvme_reset_create_qpair, 1555 NULL, 1556 bdev_nvme_reset_create_qpairs_done); 1557 } else { 1558 bdev_nvme_reset_complete(nvme_ctrlr, false); 1559 } 1560 return SPDK_POLLER_BUSY; 1561 } 1562 1563 static void 1564 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 1565 { 1566 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 1567 1568 assert(nvme_ctrlr->reset_detach_poller == NULL); 1569 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 1570 nvme_ctrlr, 0); 1571 } 1572 1573 static void 1574 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 1575 { 1576 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1577 int rc __attribute__((unused)); 1578 1579 assert(status == 0); 1580 1581 /* Disconnect fails if ctrlr is already resetting or removed. Both cases are 1582 * not possible. Reset is controlled and the callback to hot remove is called 1583 * when ctrlr is hot removed. 1584 */ 1585 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1586 assert(rc == 0); 1587 1588 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1589 } 1590 1591 static void 1592 _bdev_nvme_reset(void *ctx) 1593 { 1594 struct nvme_ctrlr *nvme_ctrlr = ctx; 1595 1596 assert(nvme_ctrlr->resetting == true); 1597 assert(nvme_ctrlr->thread == spdk_get_thread()); 1598 1599 spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr); 1600 1601 /* First, delete all NVMe I/O queue pairs. */ 1602 spdk_for_each_channel(nvme_ctrlr, 1603 bdev_nvme_reset_destroy_qpair, 1604 NULL, 1605 bdev_nvme_reset_ctrlr); 1606 } 1607 1608 static int 1609 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) 1610 { 1611 pthread_mutex_lock(&nvme_ctrlr->mutex); 1612 if (nvme_ctrlr->destruct) { 1613 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1614 return -ENXIO; 1615 } 1616 1617 if (nvme_ctrlr->resetting) { 1618 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1619 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 1620 return -EBUSY; 1621 } 1622 1623 if (nvme_ctrlr->reconnect_is_delayed) { 1624 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1625 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 1626 return -EBUSY; 1627 } 1628 1629 nvme_ctrlr->resetting = true; 1630 1631 assert(nvme_ctrlr->reset_start_tsc == 0); 1632 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 1633 1634 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1635 1636 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 1637 return 0; 1638 } 1639 1640 int 1641 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) 1642 { 1643 int rc; 1644 1645 rc = bdev_nvme_reset(nvme_ctrlr); 1646 if (rc == 0) { 1647 nvme_ctrlr->reset_cb_fn = cb_fn; 1648 nvme_ctrlr->reset_cb_arg = cb_arg; 1649 } 1650 return rc; 1651 } 1652 1653 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 1654 1655 static void 1656 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 1657 { 1658 enum spdk_bdev_io_status io_status; 1659 1660 if (bio->cpl.cdw0 == 0) { 1661 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1662 } else { 1663 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1664 } 1665 1666 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status); 1667 } 1668 1669 static void 1670 _bdev_nvme_reset_io_continue(void *ctx) 1671 { 1672 struct nvme_bdev_io *bio = ctx; 1673 struct nvme_io_path *prev_io_path, *next_io_path; 1674 int rc; 1675 1676 prev_io_path = bio->io_path; 1677 bio->io_path = NULL; 1678 1679 if (bio->cpl.cdw0 != 0) { 1680 goto complete; 1681 } 1682 1683 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 1684 if (next_io_path == NULL) { 1685 goto complete; 1686 } 1687 1688 rc = _bdev_nvme_reset_io(next_io_path, bio); 1689 if (rc == 0) { 1690 return; 1691 } 1692 1693 bio->cpl.cdw0 = 1; 1694 1695 complete: 1696 bdev_nvme_reset_io_complete(bio); 1697 } 1698 1699 static void 1700 bdev_nvme_reset_io_continue(void *cb_arg, bool success) 1701 { 1702 struct nvme_bdev_io *bio = cb_arg; 1703 1704 bio->cpl.cdw0 = !success; 1705 1706 spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); 1707 } 1708 1709 static int 1710 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 1711 { 1712 struct nvme_ctrlr_channel *ctrlr_ch = io_path->ctrlr_ch; 1713 struct nvme_ctrlr *nvme_ctrlr; 1714 struct spdk_bdev_io *bdev_io; 1715 int rc; 1716 1717 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(ctrlr_ch); 1718 1719 rc = bdev_nvme_reset(nvme_ctrlr); 1720 if (rc == 0) { 1721 assert(bio->io_path == NULL); 1722 bio->io_path = io_path; 1723 1724 assert(nvme_ctrlr->reset_cb_fn == NULL); 1725 assert(nvme_ctrlr->reset_cb_arg == NULL); 1726 nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; 1727 nvme_ctrlr->reset_cb_arg = bio; 1728 } else if (rc == -EBUSY) { 1729 /* 1730 * Reset call is queued only if it is from the app framework. This is on purpose so that 1731 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 1732 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 1733 */ 1734 bdev_io = spdk_bdev_io_from_ctx(bio); 1735 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 1736 } else { 1737 return rc; 1738 } 1739 1740 return 0; 1741 } 1742 1743 static void 1744 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 1745 { 1746 struct nvme_io_path *io_path; 1747 int rc; 1748 1749 bio->cpl.cdw0 = 0; 1750 bio->orig_thread = spdk_get_thread(); 1751 1752 /* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now. 1753 * 1754 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially. 1755 * This will be done in the following patches. 1756 */ 1757 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 1758 assert(io_path != NULL); 1759 1760 rc = _bdev_nvme_reset_io(io_path, bio); 1761 if (rc != 0) { 1762 bio->cpl.cdw0 = 1; 1763 bdev_nvme_reset_io_complete(bio); 1764 } 1765 } 1766 1767 static int 1768 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1769 { 1770 pthread_mutex_lock(&nvme_ctrlr->mutex); 1771 if (nvme_ctrlr->destruct) { 1772 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1773 /* Don't bother resetting if the controller is in the process of being destructed. */ 1774 return -ENXIO; 1775 } 1776 1777 if (nvme_ctrlr->resetting) { 1778 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1779 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 1780 return -EBUSY; 1781 } 1782 1783 bdev_nvme_failover_trid(nvme_ctrlr, remove); 1784 1785 if (nvme_ctrlr->reconnect_is_delayed) { 1786 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1787 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 1788 1789 /* We rely on the next reconnect for the failover. */ 1790 return 0; 1791 } 1792 1793 nvme_ctrlr->resetting = true; 1794 1795 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1796 1797 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 1798 return 0; 1799 } 1800 1801 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 1802 uint64_t num_blocks); 1803 1804 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 1805 uint64_t num_blocks); 1806 1807 static void 1808 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1809 bool success) 1810 { 1811 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1812 struct spdk_bdev *bdev = bdev_io->bdev; 1813 int ret; 1814 1815 if (!success) { 1816 ret = -EINVAL; 1817 goto exit; 1818 } 1819 1820 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 1821 ret = -ENXIO; 1822 goto exit; 1823 } 1824 1825 ret = bdev_nvme_readv(bio, 1826 bdev_io->u.bdev.iovs, 1827 bdev_io->u.bdev.iovcnt, 1828 bdev_io->u.bdev.md_buf, 1829 bdev_io->u.bdev.num_blocks, 1830 bdev_io->u.bdev.offset_blocks, 1831 bdev->dif_check_flags, 1832 bdev_io->internal.ext_opts); 1833 1834 exit: 1835 if (spdk_unlikely(ret != 0)) { 1836 bdev_nvme_io_complete(bio, ret); 1837 } 1838 } 1839 1840 static void 1841 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 1842 { 1843 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 1844 struct spdk_bdev *bdev = bdev_io->bdev; 1845 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1846 struct nvme_bdev_io *nbdev_io_to_abort; 1847 int rc = 0; 1848 1849 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 1850 if (spdk_unlikely(!nbdev_io->io_path)) { 1851 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 1852 rc = -ENXIO; 1853 goto exit; 1854 } 1855 1856 /* Admin commands do not use the optimal I/O path. 1857 * Simply fall through even if it is not found. 1858 */ 1859 } 1860 1861 switch (bdev_io->type) { 1862 case SPDK_BDEV_IO_TYPE_READ: 1863 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 1864 rc = bdev_nvme_readv(nbdev_io, 1865 bdev_io->u.bdev.iovs, 1866 bdev_io->u.bdev.iovcnt, 1867 bdev_io->u.bdev.md_buf, 1868 bdev_io->u.bdev.num_blocks, 1869 bdev_io->u.bdev.offset_blocks, 1870 bdev->dif_check_flags, 1871 bdev_io->internal.ext_opts); 1872 } else { 1873 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 1874 bdev_io->u.bdev.num_blocks * bdev->blocklen); 1875 rc = 0; 1876 } 1877 break; 1878 case SPDK_BDEV_IO_TYPE_WRITE: 1879 rc = bdev_nvme_writev(nbdev_io, 1880 bdev_io->u.bdev.iovs, 1881 bdev_io->u.bdev.iovcnt, 1882 bdev_io->u.bdev.md_buf, 1883 bdev_io->u.bdev.num_blocks, 1884 bdev_io->u.bdev.offset_blocks, 1885 bdev->dif_check_flags, 1886 bdev_io->internal.ext_opts); 1887 break; 1888 case SPDK_BDEV_IO_TYPE_COMPARE: 1889 rc = bdev_nvme_comparev(nbdev_io, 1890 bdev_io->u.bdev.iovs, 1891 bdev_io->u.bdev.iovcnt, 1892 bdev_io->u.bdev.md_buf, 1893 bdev_io->u.bdev.num_blocks, 1894 bdev_io->u.bdev.offset_blocks, 1895 bdev->dif_check_flags); 1896 break; 1897 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 1898 rc = bdev_nvme_comparev_and_writev(nbdev_io, 1899 bdev_io->u.bdev.iovs, 1900 bdev_io->u.bdev.iovcnt, 1901 bdev_io->u.bdev.fused_iovs, 1902 bdev_io->u.bdev.fused_iovcnt, 1903 bdev_io->u.bdev.md_buf, 1904 bdev_io->u.bdev.num_blocks, 1905 bdev_io->u.bdev.offset_blocks, 1906 bdev->dif_check_flags); 1907 break; 1908 case SPDK_BDEV_IO_TYPE_UNMAP: 1909 rc = bdev_nvme_unmap(nbdev_io, 1910 bdev_io->u.bdev.offset_blocks, 1911 bdev_io->u.bdev.num_blocks); 1912 break; 1913 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 1914 rc = bdev_nvme_write_zeroes(nbdev_io, 1915 bdev_io->u.bdev.offset_blocks, 1916 bdev_io->u.bdev.num_blocks); 1917 break; 1918 case SPDK_BDEV_IO_TYPE_RESET: 1919 nbdev_io->io_path = NULL; 1920 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 1921 break; 1922 case SPDK_BDEV_IO_TYPE_FLUSH: 1923 rc = bdev_nvme_flush(nbdev_io, 1924 bdev_io->u.bdev.offset_blocks, 1925 bdev_io->u.bdev.num_blocks); 1926 break; 1927 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 1928 rc = bdev_nvme_zone_appendv(nbdev_io, 1929 bdev_io->u.bdev.iovs, 1930 bdev_io->u.bdev.iovcnt, 1931 bdev_io->u.bdev.md_buf, 1932 bdev_io->u.bdev.num_blocks, 1933 bdev_io->u.bdev.offset_blocks, 1934 bdev->dif_check_flags); 1935 break; 1936 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 1937 rc = bdev_nvme_get_zone_info(nbdev_io, 1938 bdev_io->u.zone_mgmt.zone_id, 1939 bdev_io->u.zone_mgmt.num_zones, 1940 bdev_io->u.zone_mgmt.buf); 1941 break; 1942 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 1943 rc = bdev_nvme_zone_management(nbdev_io, 1944 bdev_io->u.zone_mgmt.zone_id, 1945 bdev_io->u.zone_mgmt.zone_action); 1946 break; 1947 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 1948 nbdev_io->io_path = NULL; 1949 bdev_nvme_admin_passthru(nbdev_ch, 1950 nbdev_io, 1951 &bdev_io->u.nvme_passthru.cmd, 1952 bdev_io->u.nvme_passthru.buf, 1953 bdev_io->u.nvme_passthru.nbytes); 1954 break; 1955 case SPDK_BDEV_IO_TYPE_NVME_IO: 1956 rc = bdev_nvme_io_passthru(nbdev_io, 1957 &bdev_io->u.nvme_passthru.cmd, 1958 bdev_io->u.nvme_passthru.buf, 1959 bdev_io->u.nvme_passthru.nbytes); 1960 break; 1961 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1962 rc = bdev_nvme_io_passthru_md(nbdev_io, 1963 &bdev_io->u.nvme_passthru.cmd, 1964 bdev_io->u.nvme_passthru.buf, 1965 bdev_io->u.nvme_passthru.nbytes, 1966 bdev_io->u.nvme_passthru.md_buf, 1967 bdev_io->u.nvme_passthru.md_len); 1968 break; 1969 case SPDK_BDEV_IO_TYPE_ABORT: 1970 nbdev_io->io_path = NULL; 1971 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 1972 bdev_nvme_abort(nbdev_ch, 1973 nbdev_io, 1974 nbdev_io_to_abort); 1975 break; 1976 default: 1977 rc = -EINVAL; 1978 break; 1979 } 1980 1981 exit: 1982 if (spdk_unlikely(rc != 0)) { 1983 bdev_nvme_io_complete(nbdev_io, rc); 1984 } 1985 } 1986 1987 static bool 1988 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 1989 { 1990 struct nvme_bdev *nbdev = ctx; 1991 struct nvme_ns *nvme_ns; 1992 struct spdk_nvme_ns *ns; 1993 struct spdk_nvme_ctrlr *ctrlr; 1994 const struct spdk_nvme_ctrlr_data *cdata; 1995 1996 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 1997 assert(nvme_ns != NULL); 1998 ns = nvme_ns->ns; 1999 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2000 2001 switch (io_type) { 2002 case SPDK_BDEV_IO_TYPE_READ: 2003 case SPDK_BDEV_IO_TYPE_WRITE: 2004 case SPDK_BDEV_IO_TYPE_RESET: 2005 case SPDK_BDEV_IO_TYPE_FLUSH: 2006 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2007 case SPDK_BDEV_IO_TYPE_NVME_IO: 2008 case SPDK_BDEV_IO_TYPE_ABORT: 2009 return true; 2010 2011 case SPDK_BDEV_IO_TYPE_COMPARE: 2012 return spdk_nvme_ns_supports_compare(ns); 2013 2014 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2015 return spdk_nvme_ns_get_md_size(ns) ? true : false; 2016 2017 case SPDK_BDEV_IO_TYPE_UNMAP: 2018 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2019 return cdata->oncs.dsm; 2020 2021 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2022 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2023 return cdata->oncs.write_zeroes; 2024 2025 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2026 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2027 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2028 return true; 2029 } 2030 return false; 2031 2032 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2033 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2034 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2035 2036 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2037 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2038 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2039 2040 default: 2041 return false; 2042 } 2043 } 2044 2045 static int 2046 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2047 { 2048 struct nvme_ctrlr *nvme_ctrlr = io_device; 2049 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2050 struct spdk_io_channel *pg_ch; 2051 int rc; 2052 2053 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2054 if (!pg_ch) { 2055 return -1; 2056 } 2057 2058 ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch); 2059 TAILQ_INSERT_TAIL(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq); 2060 2061 #ifdef SPDK_CONFIG_VTUNE 2062 ctrlr_ch->group->collect_spin_stat = true; 2063 #else 2064 ctrlr_ch->group->collect_spin_stat = false; 2065 #endif 2066 2067 TAILQ_INIT(&ctrlr_ch->pending_resets); 2068 TAILQ_INIT(&ctrlr_ch->io_path_list); 2069 2070 rc = bdev_nvme_create_qpair(ctrlr_ch); 2071 if (rc != 0) { 2072 /* nvme ctrlr can't create IO qpair during reset. In that case ctrlr_ch->qpair 2073 * pointer will be NULL and IO qpair will be created when reset completes. 2074 * If the user submits IO requests during reset, they will be queued and resubmitted later */ 2075 if (!nvme_ctrlr->resetting) { 2076 goto err_qpair; 2077 } 2078 } 2079 2080 return 0; 2081 2082 err_qpair: 2083 TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq); 2084 spdk_put_io_channel(pg_ch); 2085 2086 return rc; 2087 } 2088 2089 static void 2090 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2091 { 2092 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2093 2094 assert(ctrlr_ch->group != NULL); 2095 2096 bdev_nvme_destroy_qpair(ctrlr_ch); 2097 2098 TAILQ_REMOVE(&ctrlr_ch->group->ctrlr_ch_list, ctrlr_ch, tailq); 2099 2100 spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group)); 2101 } 2102 2103 static void 2104 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2105 uint32_t iov_cnt, uint32_t seed, 2106 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2107 { 2108 struct nvme_poll_group *group = ctx; 2109 int rc; 2110 2111 assert(group->accel_channel != NULL); 2112 assert(cb_fn != NULL); 2113 2114 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2115 if (rc) { 2116 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2117 if (rc == -ENOMEM || rc == -EINVAL) { 2118 cb_fn(cb_arg, rc); 2119 } 2120 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2121 } 2122 } 2123 2124 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2125 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2126 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2127 }; 2128 2129 static int 2130 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2131 { 2132 struct nvme_poll_group *group = ctx_buf; 2133 2134 TAILQ_INIT(&group->ctrlr_ch_list); 2135 2136 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2137 if (group->group == NULL) { 2138 return -1; 2139 } 2140 2141 group->accel_channel = spdk_accel_engine_get_io_channel(); 2142 if (!group->accel_channel) { 2143 spdk_nvme_poll_group_destroy(group->group); 2144 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2145 group); 2146 return -1; 2147 } 2148 2149 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2150 2151 if (group->poller == NULL) { 2152 spdk_put_io_channel(group->accel_channel); 2153 spdk_nvme_poll_group_destroy(group->group); 2154 return -1; 2155 } 2156 2157 return 0; 2158 } 2159 2160 static void 2161 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2162 { 2163 struct nvme_poll_group *group = ctx_buf; 2164 2165 assert(TAILQ_EMPTY(&group->ctrlr_ch_list)); 2166 2167 if (group->accel_channel) { 2168 spdk_put_io_channel(group->accel_channel); 2169 } 2170 2171 spdk_poller_unregister(&group->poller); 2172 if (spdk_nvme_poll_group_destroy(group->group)) { 2173 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2174 assert(false); 2175 } 2176 } 2177 2178 static struct spdk_io_channel * 2179 bdev_nvme_get_io_channel(void *ctx) 2180 { 2181 struct nvme_bdev *nvme_bdev = ctx; 2182 2183 return spdk_get_io_channel(nvme_bdev); 2184 } 2185 2186 static void * 2187 bdev_nvme_get_module_ctx(void *ctx) 2188 { 2189 struct nvme_bdev *nvme_bdev = ctx; 2190 struct nvme_ns *nvme_ns; 2191 2192 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2193 return NULL; 2194 } 2195 2196 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2197 if (!nvme_ns) { 2198 return NULL; 2199 } 2200 2201 return nvme_ns->ns; 2202 } 2203 2204 static const char * 2205 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2206 { 2207 switch (ana_state) { 2208 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2209 return "optimized"; 2210 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2211 return "non_optimized"; 2212 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2213 return "inaccessible"; 2214 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2215 return "persistent_loss"; 2216 case SPDK_NVME_ANA_CHANGE_STATE: 2217 return "change"; 2218 default: 2219 return NULL; 2220 } 2221 } 2222 2223 static int 2224 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 2225 { 2226 struct nvme_bdev *nbdev = ctx; 2227 struct nvme_ns *nvme_ns; 2228 2229 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2230 assert(nvme_ns != NULL); 2231 2232 return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size); 2233 } 2234 2235 static void 2236 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2237 struct nvme_ns *nvme_ns) 2238 { 2239 struct spdk_nvme_ns *ns; 2240 struct spdk_nvme_ctrlr *ctrlr; 2241 const struct spdk_nvme_ctrlr_data *cdata; 2242 const struct spdk_nvme_transport_id *trid; 2243 union spdk_nvme_vs_register vs; 2244 char buf[128]; 2245 2246 ns = nvme_ns->ns; 2247 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2248 2249 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2250 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2251 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2252 2253 spdk_json_write_object_begin(w); 2254 2255 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2256 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2257 } 2258 2259 spdk_json_write_named_object_begin(w, "trid"); 2260 2261 nvme_bdev_dump_trid_json(trid, w); 2262 2263 spdk_json_write_object_end(w); 2264 2265 #ifdef SPDK_CONFIG_NVME_CUSE 2266 size_t cuse_name_size = 128; 2267 char cuse_name[cuse_name_size]; 2268 2269 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2270 cuse_name, &cuse_name_size); 2271 if (rc == 0) { 2272 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2273 } 2274 #endif 2275 2276 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2277 2278 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2279 2280 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2281 spdk_str_trim(buf); 2282 spdk_json_write_named_string(w, "model_number", buf); 2283 2284 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2285 spdk_str_trim(buf); 2286 spdk_json_write_named_string(w, "serial_number", buf); 2287 2288 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2289 spdk_str_trim(buf); 2290 spdk_json_write_named_string(w, "firmware_revision", buf); 2291 2292 if (cdata->subnqn[0] != '\0') { 2293 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2294 } 2295 2296 spdk_json_write_named_object_begin(w, "oacs"); 2297 2298 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2299 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2300 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2301 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2302 2303 spdk_json_write_object_end(w); 2304 2305 spdk_json_write_object_end(w); 2306 2307 spdk_json_write_named_object_begin(w, "vs"); 2308 2309 spdk_json_write_name(w, "nvme_version"); 2310 if (vs.bits.ter) { 2311 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2312 } else { 2313 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2314 } 2315 2316 spdk_json_write_object_end(w); 2317 2318 spdk_json_write_named_object_begin(w, "ns_data"); 2319 2320 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 2321 2322 if (cdata->cmic.ana_reporting) { 2323 spdk_json_write_named_string(w, "ana_state", 2324 _nvme_ana_state_str(nvme_ns->ana_state)); 2325 } 2326 2327 spdk_json_write_object_end(w); 2328 2329 if (cdata->oacs.security) { 2330 spdk_json_write_named_object_begin(w, "security"); 2331 2332 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 2333 2334 spdk_json_write_object_end(w); 2335 } 2336 2337 spdk_json_write_object_end(w); 2338 } 2339 2340 static int 2341 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 2342 { 2343 struct nvme_bdev *nvme_bdev = ctx; 2344 struct nvme_ns *nvme_ns; 2345 2346 pthread_mutex_lock(&nvme_bdev->mutex); 2347 spdk_json_write_named_array_begin(w, "nvme"); 2348 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 2349 nvme_namespace_info_json(w, nvme_ns); 2350 } 2351 spdk_json_write_array_end(w); 2352 pthread_mutex_unlock(&nvme_bdev->mutex); 2353 2354 return 0; 2355 } 2356 2357 static void 2358 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2359 { 2360 /* No config per bdev needed */ 2361 } 2362 2363 static uint64_t 2364 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 2365 { 2366 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2367 struct nvme_io_path *io_path; 2368 struct nvme_poll_group *group; 2369 uint64_t spin_time = 0; 2370 2371 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 2372 group = io_path->ctrlr_ch->group; 2373 2374 if (!group || !group->collect_spin_stat) { 2375 continue; 2376 } 2377 2378 if (group->end_ticks != 0) { 2379 group->spin_ticks += (group->end_ticks - group->start_ticks); 2380 group->end_ticks = 0; 2381 } 2382 2383 spin_time += group->spin_ticks; 2384 group->start_ticks = 0; 2385 group->spin_ticks = 0; 2386 } 2387 2388 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 2389 } 2390 2391 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 2392 .destruct = bdev_nvme_destruct, 2393 .submit_request = bdev_nvme_submit_request, 2394 .io_type_supported = bdev_nvme_io_type_supported, 2395 .get_io_channel = bdev_nvme_get_io_channel, 2396 .dump_info_json = bdev_nvme_dump_info_json, 2397 .write_config_json = bdev_nvme_write_config_json, 2398 .get_spin_time = bdev_nvme_get_spin_time, 2399 .get_module_ctx = bdev_nvme_get_module_ctx, 2400 .get_memory_domains = bdev_nvme_get_memory_domains, 2401 }; 2402 2403 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 2404 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 2405 2406 static int 2407 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2408 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 2409 { 2410 struct spdk_nvme_ana_group_descriptor *copied_desc; 2411 uint8_t *orig_desc; 2412 uint32_t i, desc_size, copy_len; 2413 int rc = 0; 2414 2415 if (nvme_ctrlr->ana_log_page == NULL) { 2416 return -EINVAL; 2417 } 2418 2419 copied_desc = nvme_ctrlr->copied_ana_desc; 2420 2421 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 2422 copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 2423 2424 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 2425 memcpy(copied_desc, orig_desc, copy_len); 2426 2427 rc = cb_fn(copied_desc, cb_arg); 2428 if (rc != 0) { 2429 break; 2430 } 2431 2432 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 2433 copied_desc->num_of_nsid * sizeof(uint32_t); 2434 orig_desc += desc_size; 2435 copy_len -= desc_size; 2436 } 2437 2438 return rc; 2439 } 2440 2441 static int 2442 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 2443 { 2444 struct nvme_ns *nvme_ns = cb_arg; 2445 uint32_t i; 2446 2447 for (i = 0; i < desc->num_of_nsid; i++) { 2448 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 2449 continue; 2450 } 2451 nvme_ns->ana_group_id = desc->ana_group_id; 2452 nvme_ns->ana_state = desc->ana_state; 2453 return 1; 2454 } 2455 2456 return 0; 2457 } 2458 2459 static int 2460 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 2461 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 2462 uint32_t prchk_flags, void *ctx) 2463 { 2464 const struct spdk_uuid *uuid; 2465 const uint8_t *nguid; 2466 const struct spdk_nvme_ctrlr_data *cdata; 2467 const struct spdk_nvme_ns_data *nsdata; 2468 enum spdk_nvme_csi csi; 2469 uint32_t atomic_bs, phys_bs, bs; 2470 2471 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2472 csi = spdk_nvme_ns_get_csi(ns); 2473 2474 switch (csi) { 2475 case SPDK_NVME_CSI_NVM: 2476 disk->product_name = "NVMe disk"; 2477 break; 2478 case SPDK_NVME_CSI_ZNS: 2479 disk->product_name = "NVMe ZNS disk"; 2480 disk->zoned = true; 2481 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 2482 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 2483 spdk_nvme_ns_get_extended_sector_size(ns); 2484 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 2485 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 2486 break; 2487 default: 2488 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 2489 return -ENOTSUP; 2490 } 2491 2492 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 2493 if (!disk->name) { 2494 return -ENOMEM; 2495 } 2496 2497 disk->write_cache = 0; 2498 if (cdata->vwc.present) { 2499 /* Enable if the Volatile Write Cache exists */ 2500 disk->write_cache = 1; 2501 } 2502 if (cdata->oncs.write_zeroes) { 2503 disk->max_write_zeroes = UINT16_MAX + 1; 2504 } 2505 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 2506 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 2507 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 2508 2509 nguid = spdk_nvme_ns_get_nguid(ns); 2510 if (!nguid) { 2511 uuid = spdk_nvme_ns_get_uuid(ns); 2512 if (uuid) { 2513 disk->uuid = *uuid; 2514 } 2515 } else { 2516 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 2517 } 2518 2519 nsdata = spdk_nvme_ns_get_data(ns); 2520 bs = spdk_nvme_ns_get_sector_size(ns); 2521 atomic_bs = bs; 2522 phys_bs = bs; 2523 if (nsdata->nabo == 0) { 2524 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 2525 atomic_bs = bs * (1 + nsdata->nawupf); 2526 } else { 2527 atomic_bs = bs * (1 + cdata->awupf); 2528 } 2529 } 2530 if (nsdata->nsfeat.optperf) { 2531 phys_bs = bs * (1 + nsdata->npwg); 2532 } 2533 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 2534 2535 disk->md_len = spdk_nvme_ns_get_md_size(ns); 2536 if (disk->md_len != 0) { 2537 disk->md_interleave = nsdata->flbas.extended; 2538 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 2539 if (disk->dif_type != SPDK_DIF_DISABLE) { 2540 disk->dif_is_head_of_md = nsdata->dps.md_start; 2541 disk->dif_check_flags = prchk_flags; 2542 } 2543 } 2544 2545 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 2546 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 2547 disk->acwu = 0; 2548 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 2549 disk->acwu = nsdata->nacwu; 2550 } else { 2551 disk->acwu = cdata->acwu; 2552 } 2553 2554 disk->ctxt = ctx; 2555 disk->fn_table = &nvmelib_fn_table; 2556 disk->module = &nvme_if; 2557 2558 return 0; 2559 } 2560 2561 static int 2562 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 2563 { 2564 struct nvme_bdev *bdev; 2565 int rc; 2566 2567 bdev = calloc(1, sizeof(*bdev)); 2568 if (!bdev) { 2569 SPDK_ERRLOG("bdev calloc() failed\n"); 2570 return -ENOMEM; 2571 } 2572 2573 rc = pthread_mutex_init(&bdev->mutex, NULL); 2574 if (rc != 0) { 2575 free(bdev); 2576 return rc; 2577 } 2578 2579 bdev->ref = 1; 2580 TAILQ_INIT(&bdev->nvme_ns_list); 2581 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 2582 bdev->opal = nvme_ctrlr->opal_dev != NULL; 2583 2584 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 2585 nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev); 2586 if (rc != 0) { 2587 SPDK_ERRLOG("Failed to create NVMe disk\n"); 2588 pthread_mutex_destroy(&bdev->mutex); 2589 free(bdev); 2590 return rc; 2591 } 2592 2593 spdk_io_device_register(bdev, 2594 bdev_nvme_create_bdev_channel_cb, 2595 bdev_nvme_destroy_bdev_channel_cb, 2596 sizeof(struct nvme_bdev_channel), 2597 bdev->disk.name); 2598 2599 rc = spdk_bdev_register(&bdev->disk); 2600 if (rc != 0) { 2601 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 2602 spdk_io_device_unregister(bdev, NULL); 2603 pthread_mutex_destroy(&bdev->mutex); 2604 free(bdev->disk.name); 2605 free(bdev); 2606 return rc; 2607 } 2608 2609 nvme_ns->bdev = bdev; 2610 bdev->nsid = nvme_ns->id; 2611 2612 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 2613 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 2614 2615 return 0; 2616 } 2617 2618 static bool 2619 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 2620 { 2621 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 2622 const struct spdk_uuid *uuid1, *uuid2; 2623 2624 nsdata1 = spdk_nvme_ns_get_data(ns1); 2625 nsdata2 = spdk_nvme_ns_get_data(ns2); 2626 uuid1 = spdk_nvme_ns_get_uuid(ns1); 2627 uuid2 = spdk_nvme_ns_get_uuid(ns2); 2628 2629 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 2630 nsdata1->eui64 == nsdata2->eui64 && 2631 ((uuid1 == NULL && uuid2 == NULL) || 2632 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 2633 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 2634 } 2635 2636 static bool 2637 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2638 struct spdk_nvme_ctrlr_opts *opts) 2639 { 2640 struct nvme_probe_skip_entry *entry; 2641 2642 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 2643 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2644 return false; 2645 } 2646 } 2647 2648 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 2649 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 2650 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 2651 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 2652 opts->disable_read_ana_log_page = true; 2653 2654 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 2655 2656 return true; 2657 } 2658 2659 static void 2660 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 2661 { 2662 struct nvme_ctrlr *nvme_ctrlr = ctx; 2663 2664 if (spdk_nvme_cpl_is_error(cpl)) { 2665 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 2666 cpl->status.sct); 2667 bdev_nvme_reset(nvme_ctrlr); 2668 } else if (cpl->cdw0 & 0x1) { 2669 SPDK_WARNLOG("Specified command could not be aborted.\n"); 2670 bdev_nvme_reset(nvme_ctrlr); 2671 } 2672 } 2673 2674 static void 2675 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 2676 struct spdk_nvme_qpair *qpair, uint16_t cid) 2677 { 2678 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 2679 union spdk_nvme_csts_register csts; 2680 int rc; 2681 2682 assert(nvme_ctrlr->ctrlr == ctrlr); 2683 2684 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 2685 2686 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 2687 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 2688 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 2689 * completion recursively. 2690 */ 2691 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 2692 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 2693 if (csts.bits.cfs) { 2694 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 2695 bdev_nvme_reset(nvme_ctrlr); 2696 return; 2697 } 2698 } 2699 2700 switch (g_opts.action_on_timeout) { 2701 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 2702 if (qpair) { 2703 /* Don't send abort to ctrlr when ctrlr is not available. */ 2704 pthread_mutex_lock(&nvme_ctrlr->mutex); 2705 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 2706 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2707 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 2708 return; 2709 } 2710 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2711 2712 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 2713 nvme_abort_cpl, nvme_ctrlr); 2714 if (rc == 0) { 2715 return; 2716 } 2717 2718 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 2719 } 2720 2721 /* FALLTHROUGH */ 2722 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 2723 bdev_nvme_reset(nvme_ctrlr); 2724 break; 2725 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 2726 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 2727 break; 2728 default: 2729 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 2730 break; 2731 } 2732 } 2733 2734 static void 2735 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 2736 { 2737 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 2738 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 2739 2740 if (rc == 0) { 2741 nvme_ns->probe_ctx = NULL; 2742 pthread_mutex_lock(&nvme_ctrlr->mutex); 2743 nvme_ctrlr->ref++; 2744 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2745 } else { 2746 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 2747 free(nvme_ns); 2748 } 2749 2750 if (ctx) { 2751 ctx->populates_in_progress--; 2752 if (ctx->populates_in_progress == 0) { 2753 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 2754 } 2755 } 2756 } 2757 2758 static void 2759 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 2760 { 2761 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2762 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2763 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 2764 int rc; 2765 2766 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 2767 if (rc != 0) { 2768 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 2769 } 2770 2771 spdk_for_each_channel_continue(i, rc); 2772 } 2773 2774 static void 2775 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 2776 { 2777 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2778 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2779 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 2780 struct nvme_io_path *io_path; 2781 2782 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 2783 if (io_path != NULL) { 2784 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 2785 } 2786 2787 spdk_for_each_channel_continue(i, 0); 2788 } 2789 2790 static void 2791 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 2792 { 2793 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 2794 2795 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 2796 } 2797 2798 static void 2799 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 2800 { 2801 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 2802 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 2803 2804 if (status == 0) { 2805 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 2806 } else { 2807 /* Delete the added io_paths and fail populating the namespace. */ 2808 spdk_for_each_channel(bdev, 2809 bdev_nvme_delete_io_path, 2810 nvme_ns, 2811 bdev_nvme_add_io_path_failed); 2812 } 2813 } 2814 2815 static int 2816 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 2817 { 2818 struct nvme_ns *tmp_ns; 2819 const struct spdk_nvme_ns_data *nsdata; 2820 2821 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 2822 if (!nsdata->nmic.can_share) { 2823 SPDK_ERRLOG("Namespace cannot be shared.\n"); 2824 return -EINVAL; 2825 } 2826 2827 pthread_mutex_lock(&bdev->mutex); 2828 2829 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 2830 assert(tmp_ns != NULL); 2831 2832 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 2833 pthread_mutex_unlock(&bdev->mutex); 2834 SPDK_ERRLOG("Namespaces are not identical.\n"); 2835 return -EINVAL; 2836 } 2837 2838 bdev->ref++; 2839 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 2840 nvme_ns->bdev = bdev; 2841 2842 pthread_mutex_unlock(&bdev->mutex); 2843 2844 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 2845 spdk_for_each_channel(bdev, 2846 bdev_nvme_add_io_path, 2847 nvme_ns, 2848 bdev_nvme_add_io_path_done); 2849 2850 return 0; 2851 } 2852 2853 static void 2854 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 2855 { 2856 struct spdk_nvme_ns *ns; 2857 struct nvme_bdev *bdev; 2858 int rc = 0; 2859 2860 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 2861 if (!ns) { 2862 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 2863 rc = -EINVAL; 2864 goto done; 2865 } 2866 2867 nvme_ns->ns = ns; 2868 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 2869 2870 if (nvme_ctrlr->ana_log_page != NULL) { 2871 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 2872 } 2873 2874 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 2875 if (bdev == NULL) { 2876 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 2877 } else { 2878 rc = nvme_bdev_add_ns(bdev, nvme_ns); 2879 if (rc == 0) { 2880 return; 2881 } 2882 } 2883 done: 2884 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 2885 } 2886 2887 static void 2888 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 2889 { 2890 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 2891 2892 assert(nvme_ctrlr != NULL); 2893 2894 pthread_mutex_lock(&nvme_ctrlr->mutex); 2895 2896 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 2897 2898 if (nvme_ns->bdev != NULL) { 2899 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2900 return; 2901 } 2902 2903 free(nvme_ns); 2904 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2905 2906 nvme_ctrlr_release(nvme_ctrlr); 2907 } 2908 2909 static void 2910 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 2911 { 2912 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 2913 2914 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 2915 } 2916 2917 static void 2918 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 2919 { 2920 struct nvme_bdev *bdev; 2921 2922 bdev = nvme_ns->bdev; 2923 if (bdev != NULL) { 2924 pthread_mutex_lock(&bdev->mutex); 2925 2926 assert(bdev->ref > 0); 2927 bdev->ref--; 2928 if (bdev->ref == 0) { 2929 pthread_mutex_unlock(&bdev->mutex); 2930 2931 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 2932 } else { 2933 /* spdk_bdev_unregister() is not called until the last nvme_ns is 2934 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 2935 * and clear nvme_ns->bdev here. 2936 */ 2937 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 2938 nvme_ns->bdev = NULL; 2939 2940 pthread_mutex_unlock(&bdev->mutex); 2941 2942 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 2943 * we call depopulate_namespace_done() to avoid use-after-free. 2944 */ 2945 spdk_for_each_channel(bdev, 2946 bdev_nvme_delete_io_path, 2947 nvme_ns, 2948 bdev_nvme_delete_io_path_done); 2949 return; 2950 } 2951 } 2952 2953 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 2954 } 2955 2956 static void 2957 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 2958 struct nvme_async_probe_ctx *ctx) 2959 { 2960 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2961 struct nvme_ns *nvme_ns, *next; 2962 struct spdk_nvme_ns *ns; 2963 struct nvme_bdev *bdev; 2964 uint32_t nsid; 2965 int rc; 2966 uint64_t num_sectors; 2967 2968 if (ctx) { 2969 /* Initialize this count to 1 to handle the populate functions 2970 * calling nvme_ctrlr_populate_namespace_done() immediately. 2971 */ 2972 ctx->populates_in_progress = 1; 2973 } 2974 2975 /* First loop over our existing namespaces and see if they have been 2976 * removed. */ 2977 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2978 while (nvme_ns != NULL) { 2979 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 2980 2981 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2982 /* NS is still there but attributes may have changed */ 2983 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 2984 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 2985 bdev = nvme_ns->bdev; 2986 assert(bdev != NULL); 2987 if (bdev->disk.blockcnt != num_sectors) { 2988 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 2989 nvme_ns->id, 2990 bdev->disk.name, 2991 bdev->disk.blockcnt, 2992 num_sectors); 2993 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 2994 if (rc != 0) { 2995 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 2996 bdev->disk.name, rc); 2997 } 2998 } 2999 } else { 3000 /* Namespace was removed */ 3001 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3002 } 3003 3004 nvme_ns = next; 3005 } 3006 3007 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3008 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3009 while (nsid != 0) { 3010 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3011 3012 if (nvme_ns == NULL) { 3013 /* Found a new one */ 3014 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3015 if (nvme_ns == NULL) { 3016 SPDK_ERRLOG("Failed to allocate namespace\n"); 3017 /* This just fails to attach the namespace. It may work on a future attempt. */ 3018 continue; 3019 } 3020 3021 nvme_ns->id = nsid; 3022 nvme_ns->ctrlr = nvme_ctrlr; 3023 3024 nvme_ns->bdev = NULL; 3025 3026 if (ctx) { 3027 ctx->populates_in_progress++; 3028 } 3029 nvme_ns->probe_ctx = ctx; 3030 3031 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3032 3033 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3034 } 3035 3036 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3037 } 3038 3039 if (ctx) { 3040 /* Decrement this count now that the loop is over to account 3041 * for the one we started with. If the count is then 0, we 3042 * know any populate_namespace functions completed immediately, 3043 * so we'll kick the callback here. 3044 */ 3045 ctx->populates_in_progress--; 3046 if (ctx->populates_in_progress == 0) { 3047 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3048 } 3049 } 3050 3051 } 3052 3053 static void 3054 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 3055 { 3056 struct nvme_ns *nvme_ns, *tmp; 3057 3058 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 3059 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3060 } 3061 } 3062 3063 static int 3064 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 3065 void *cb_arg) 3066 { 3067 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3068 struct nvme_ns *nvme_ns; 3069 uint32_t i, nsid; 3070 3071 for (i = 0; i < desc->num_of_nsid; i++) { 3072 nsid = desc->nsid[i]; 3073 if (nsid == 0) { 3074 continue; 3075 } 3076 3077 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3078 3079 assert(nvme_ns != NULL); 3080 if (nvme_ns == NULL) { 3081 /* Target told us that an inactive namespace had an ANA change */ 3082 continue; 3083 } 3084 3085 nvme_ns->ana_group_id = desc->ana_group_id; 3086 nvme_ns->ana_state = desc->ana_state; 3087 nvme_ns->ana_state_updating = false; 3088 } 3089 3090 return 0; 3091 } 3092 3093 static void 3094 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 3095 { 3096 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3097 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 3098 3099 _bdev_nvme_clear_io_path_cache(ctrlr_ch); 3100 3101 spdk_for_each_channel_continue(i, 0); 3102 } 3103 3104 static void 3105 bdev_nvme_clear_io_path_cache_done(struct spdk_io_channel_iter *i, int status) 3106 { 3107 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 3108 3109 pthread_mutex_lock(&nvme_ctrlr->mutex); 3110 3111 assert(nvme_ctrlr->ana_log_page_updating == true); 3112 nvme_ctrlr->ana_log_page_updating = false; 3113 3114 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 3115 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3116 return; 3117 } 3118 3119 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3120 3121 nvme_ctrlr_unregister(nvme_ctrlr); 3122 } 3123 3124 static void 3125 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3126 { 3127 struct nvme_ns *nvme_ns; 3128 3129 spdk_free(nvme_ctrlr->ana_log_page); 3130 nvme_ctrlr->ana_log_page = NULL; 3131 3132 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3133 nvme_ns != NULL; 3134 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 3135 nvme_ns->ana_state_updating = false; 3136 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3137 } 3138 } 3139 3140 static void 3141 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 3142 { 3143 struct nvme_ctrlr *nvme_ctrlr = ctx; 3144 3145 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 3146 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 3147 nvme_ctrlr); 3148 } else { 3149 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 3150 } 3151 3152 spdk_for_each_channel(nvme_ctrlr, 3153 bdev_nvme_clear_io_path_cache, 3154 NULL, 3155 bdev_nvme_clear_io_path_cache_done); 3156 } 3157 3158 static int 3159 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3160 { 3161 int rc; 3162 3163 if (nvme_ctrlr->ana_log_page == NULL) { 3164 return -EINVAL; 3165 } 3166 3167 pthread_mutex_lock(&nvme_ctrlr->mutex); 3168 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 3169 nvme_ctrlr->ana_log_page_updating) { 3170 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3171 return -EBUSY; 3172 } 3173 3174 nvme_ctrlr->ana_log_page_updating = true; 3175 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3176 3177 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 3178 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3179 SPDK_NVME_GLOBAL_NS_TAG, 3180 nvme_ctrlr->ana_log_page, 3181 nvme_ctrlr->ana_log_page_size, 0, 3182 nvme_ctrlr_read_ana_log_page_done, 3183 nvme_ctrlr); 3184 if (rc != 0) { 3185 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 3186 } 3187 3188 return rc; 3189 } 3190 3191 static void 3192 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 3193 { 3194 struct nvme_ctrlr *nvme_ctrlr = arg; 3195 union spdk_nvme_async_event_completion event; 3196 3197 if (spdk_nvme_cpl_is_error(cpl)) { 3198 SPDK_WARNLOG("AER request execute failed"); 3199 return; 3200 } 3201 3202 event.raw = cpl->cdw0; 3203 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3204 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 3205 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 3206 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3207 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 3208 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 3209 } 3210 } 3211 3212 static void 3213 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 3214 { 3215 if (ctx->cb_fn) { 3216 ctx->cb_fn(ctx->cb_ctx, count, rc); 3217 } 3218 3219 ctx->namespaces_populated = true; 3220 if (ctx->probe_done) { 3221 /* The probe was already completed, so we need to free the context 3222 * here. This can happen for cases like OCSSD, where we need to 3223 * send additional commands to the SSD after attach. 3224 */ 3225 free(ctx); 3226 } 3227 } 3228 3229 static void 3230 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 3231 struct nvme_async_probe_ctx *ctx) 3232 { 3233 spdk_io_device_register(nvme_ctrlr, 3234 bdev_nvme_create_ctrlr_channel_cb, 3235 bdev_nvme_destroy_ctrlr_channel_cb, 3236 sizeof(struct nvme_ctrlr_channel), 3237 nvme_ctrlr->nbdev_ctrlr->name); 3238 3239 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 3240 } 3241 3242 static void 3243 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 3244 { 3245 struct nvme_ctrlr *nvme_ctrlr = _ctx; 3246 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 3247 3248 nvme_ctrlr->probe_ctx = NULL; 3249 3250 if (spdk_nvme_cpl_is_error(cpl)) { 3251 nvme_ctrlr_delete(nvme_ctrlr); 3252 3253 if (ctx != NULL) { 3254 populate_namespaces_cb(ctx, 0, -1); 3255 } 3256 return; 3257 } 3258 3259 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 3260 } 3261 3262 static int 3263 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3264 struct nvme_async_probe_ctx *ctx) 3265 { 3266 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3267 const struct spdk_nvme_ctrlr_data *cdata; 3268 uint32_t ana_log_page_size; 3269 3270 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3271 3272 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3273 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn * 3274 sizeof(uint32_t); 3275 3276 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 3277 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3278 if (nvme_ctrlr->ana_log_page == NULL) { 3279 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 3280 return -ENXIO; 3281 } 3282 3283 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 3284 * Hence copy each descriptor to a temporary area when parsing it. 3285 * 3286 * Allocate a buffer whose size is as large as ANA log page buffer because 3287 * we do not know the size of a descriptor until actually reading it. 3288 */ 3289 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 3290 if (nvme_ctrlr->copied_ana_desc == NULL) { 3291 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 3292 return -ENOMEM; 3293 } 3294 3295 nvme_ctrlr->ana_log_page_size = ana_log_page_size; 3296 3297 nvme_ctrlr->probe_ctx = ctx; 3298 3299 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 3300 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3301 SPDK_NVME_GLOBAL_NS_TAG, 3302 nvme_ctrlr->ana_log_page, 3303 nvme_ctrlr->ana_log_page_size, 0, 3304 nvme_ctrlr_init_ana_log_page_done, 3305 nvme_ctrlr); 3306 } 3307 3308 /* hostnqn and subnqn were already verified before attaching a controller. 3309 * Hence check only the multipath capability and cntlid here. 3310 */ 3311 static bool 3312 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 3313 { 3314 struct nvme_ctrlr *tmp; 3315 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 3316 3317 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3318 3319 if (!cdata->cmic.multi_ctrlr) { 3320 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 3321 return false; 3322 } 3323 3324 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 3325 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 3326 3327 if (!tmp_cdata->cmic.multi_ctrlr) { 3328 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 3329 return false; 3330 } 3331 if (cdata->cntlid == tmp_cdata->cntlid) { 3332 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 3333 return false; 3334 } 3335 } 3336 3337 return true; 3338 } 3339 3340 static int 3341 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 3342 { 3343 struct nvme_bdev_ctrlr *nbdev_ctrlr; 3344 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3345 int rc = 0; 3346 3347 pthread_mutex_lock(&g_bdev_nvme_mutex); 3348 3349 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 3350 if (nbdev_ctrlr != NULL) { 3351 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 3352 rc = -EINVAL; 3353 goto exit; 3354 } 3355 } else { 3356 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 3357 if (nbdev_ctrlr == NULL) { 3358 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 3359 rc = -ENOMEM; 3360 goto exit; 3361 } 3362 nbdev_ctrlr->name = strdup(name); 3363 if (nbdev_ctrlr->name == NULL) { 3364 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 3365 free(nbdev_ctrlr); 3366 goto exit; 3367 } 3368 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 3369 TAILQ_INIT(&nbdev_ctrlr->bdevs); 3370 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 3371 } 3372 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 3373 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 3374 exit: 3375 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3376 return rc; 3377 } 3378 3379 static int 3380 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 3381 const char *name, 3382 const struct spdk_nvme_transport_id *trid, 3383 struct nvme_async_probe_ctx *ctx) 3384 { 3385 struct nvme_ctrlr *nvme_ctrlr; 3386 struct nvme_path_id *path_id; 3387 const struct spdk_nvme_ctrlr_data *cdata; 3388 int rc; 3389 3390 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 3391 if (nvme_ctrlr == NULL) { 3392 SPDK_ERRLOG("Failed to allocate device struct\n"); 3393 return -ENOMEM; 3394 } 3395 3396 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 3397 if (rc != 0) { 3398 free(nvme_ctrlr); 3399 return rc; 3400 } 3401 3402 TAILQ_INIT(&nvme_ctrlr->trids); 3403 3404 RB_INIT(&nvme_ctrlr->namespaces); 3405 3406 path_id = calloc(1, sizeof(*path_id)); 3407 if (path_id == NULL) { 3408 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 3409 rc = -ENOMEM; 3410 goto err; 3411 } 3412 3413 path_id->trid = *trid; 3414 if (ctx != NULL) { 3415 memcpy(path_id->hostid.hostaddr, ctx->opts.src_addr, sizeof(path_id->hostid.hostaddr)); 3416 memcpy(path_id->hostid.hostsvcid, ctx->opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 3417 } 3418 nvme_ctrlr->active_path_id = path_id; 3419 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 3420 3421 nvme_ctrlr->thread = spdk_get_thread(); 3422 nvme_ctrlr->ctrlr = ctrlr; 3423 nvme_ctrlr->ref = 1; 3424 3425 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 3426 SPDK_ERRLOG("OCSSDs are not supported"); 3427 rc = -ENOTSUP; 3428 goto err; 3429 } 3430 3431 if (ctx != NULL) { 3432 nvme_ctrlr->prchk_flags = ctx->prchk_flags; 3433 nvme_ctrlr->ctrlr_loss_timeout_sec = ctx->ctrlr_loss_timeout_sec; 3434 nvme_ctrlr->reconnect_delay_sec = ctx->reconnect_delay_sec; 3435 nvme_ctrlr->fast_io_fail_timeout_sec = ctx->fast_io_fail_timeout_sec; 3436 } 3437 3438 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 3439 g_opts.nvme_adminq_poll_period_us); 3440 3441 if (g_opts.timeout_us > 0) { 3442 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 3443 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 3444 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 3445 g_opts.timeout_us : g_opts.timeout_admin_us; 3446 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 3447 adm_timeout_us, timeout_cb, nvme_ctrlr); 3448 } 3449 3450 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 3451 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 3452 3453 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3454 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 3455 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 3456 } 3457 3458 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 3459 if (rc != 0) { 3460 goto err; 3461 } 3462 3463 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3464 3465 if (cdata->cmic.ana_reporting) { 3466 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 3467 if (rc == 0) { 3468 return 0; 3469 } 3470 } else { 3471 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 3472 return 0; 3473 } 3474 3475 err: 3476 nvme_ctrlr_delete(nvme_ctrlr); 3477 return rc; 3478 } 3479 3480 static void 3481 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3482 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 3483 { 3484 char *name; 3485 3486 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 3487 if (!name) { 3488 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 3489 return; 3490 } 3491 3492 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 3493 3494 nvme_ctrlr_create(ctrlr, name, trid, NULL); 3495 3496 free(name); 3497 } 3498 3499 static void 3500 _nvme_ctrlr_destruct(void *ctx) 3501 { 3502 struct nvme_ctrlr *nvme_ctrlr = ctx; 3503 3504 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 3505 nvme_ctrlr_release(nvme_ctrlr); 3506 } 3507 3508 static int 3509 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 3510 { 3511 struct nvme_probe_skip_entry *entry; 3512 3513 pthread_mutex_lock(&nvme_ctrlr->mutex); 3514 3515 /* The controller's destruction was already started */ 3516 if (nvme_ctrlr->destruct) { 3517 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3518 return 0; 3519 } 3520 3521 if (!hotplug && 3522 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 3523 entry = calloc(1, sizeof(*entry)); 3524 if (!entry) { 3525 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3526 return -ENOMEM; 3527 } 3528 entry->trid = nvme_ctrlr->active_path_id->trid; 3529 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 3530 } 3531 3532 nvme_ctrlr->destruct = true; 3533 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3534 3535 _nvme_ctrlr_destruct(nvme_ctrlr); 3536 3537 return 0; 3538 } 3539 3540 static void 3541 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 3542 { 3543 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 3544 3545 _bdev_nvme_delete(nvme_ctrlr, true); 3546 } 3547 3548 static int 3549 bdev_nvme_hotplug_probe(void *arg) 3550 { 3551 if (g_hotplug_probe_ctx == NULL) { 3552 spdk_poller_unregister(&g_hotplug_probe_poller); 3553 return SPDK_POLLER_IDLE; 3554 } 3555 3556 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 3557 g_hotplug_probe_ctx = NULL; 3558 spdk_poller_unregister(&g_hotplug_probe_poller); 3559 } 3560 3561 return SPDK_POLLER_BUSY; 3562 } 3563 3564 static int 3565 bdev_nvme_hotplug(void *arg) 3566 { 3567 struct spdk_nvme_transport_id trid_pcie; 3568 3569 if (g_hotplug_probe_ctx) { 3570 return SPDK_POLLER_BUSY; 3571 } 3572 3573 memset(&trid_pcie, 0, sizeof(trid_pcie)); 3574 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 3575 3576 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 3577 hotplug_probe_cb, attach_cb, NULL); 3578 3579 if (g_hotplug_probe_ctx) { 3580 assert(g_hotplug_probe_poller == NULL); 3581 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 3582 } 3583 3584 return SPDK_POLLER_BUSY; 3585 } 3586 3587 void 3588 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 3589 { 3590 *opts = g_opts; 3591 } 3592 3593 static int 3594 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 3595 { 3596 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 3597 /* Can't set timeout_admin_us without also setting timeout_us */ 3598 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 3599 return -EINVAL; 3600 } 3601 3602 if (opts->bdev_retry_count < -1) { 3603 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 3604 return -EINVAL; 3605 } 3606 3607 return 0; 3608 } 3609 3610 int 3611 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 3612 { 3613 int ret = bdev_nvme_validate_opts(opts); 3614 if (ret) { 3615 SPDK_WARNLOG("Failed to set nvme opts.\n"); 3616 return ret; 3617 } 3618 3619 if (g_bdev_nvme_init_thread != NULL) { 3620 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 3621 return -EPERM; 3622 } 3623 } 3624 3625 g_opts = *opts; 3626 3627 return 0; 3628 } 3629 3630 struct set_nvme_hotplug_ctx { 3631 uint64_t period_us; 3632 bool enabled; 3633 spdk_msg_fn fn; 3634 void *fn_ctx; 3635 }; 3636 3637 static void 3638 set_nvme_hotplug_period_cb(void *_ctx) 3639 { 3640 struct set_nvme_hotplug_ctx *ctx = _ctx; 3641 3642 spdk_poller_unregister(&g_hotplug_poller); 3643 if (ctx->enabled) { 3644 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 3645 } 3646 3647 g_nvme_hotplug_poll_period_us = ctx->period_us; 3648 g_nvme_hotplug_enabled = ctx->enabled; 3649 if (ctx->fn) { 3650 ctx->fn(ctx->fn_ctx); 3651 } 3652 3653 free(ctx); 3654 } 3655 3656 int 3657 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 3658 { 3659 struct set_nvme_hotplug_ctx *ctx; 3660 3661 if (enabled == true && !spdk_process_is_primary()) { 3662 return -EPERM; 3663 } 3664 3665 ctx = calloc(1, sizeof(*ctx)); 3666 if (ctx == NULL) { 3667 return -ENOMEM; 3668 } 3669 3670 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 3671 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 3672 ctx->enabled = enabled; 3673 ctx->fn = cb; 3674 ctx->fn_ctx = cb_ctx; 3675 3676 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 3677 return 0; 3678 } 3679 3680 static void 3681 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 3682 struct nvme_async_probe_ctx *ctx) 3683 { 3684 struct nvme_ns *nvme_ns; 3685 struct nvme_bdev *nvme_bdev; 3686 size_t j; 3687 3688 assert(nvme_ctrlr != NULL); 3689 3690 if (ctx->names == NULL) { 3691 populate_namespaces_cb(ctx, 0, 0); 3692 return; 3693 } 3694 3695 /* 3696 * Report the new bdevs that were created in this call. 3697 * There can be more than one bdev per NVMe controller. 3698 */ 3699 j = 0; 3700 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3701 while (nvme_ns != NULL) { 3702 nvme_bdev = nvme_ns->bdev; 3703 if (j < ctx->count) { 3704 ctx->names[j] = nvme_bdev->disk.name; 3705 j++; 3706 } else { 3707 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 3708 ctx->count); 3709 populate_namespaces_cb(ctx, 0, -ERANGE); 3710 return; 3711 } 3712 3713 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3714 } 3715 3716 populate_namespaces_cb(ctx, j, 0); 3717 } 3718 3719 static int 3720 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr, 3721 struct spdk_nvme_ctrlr *new_ctrlr, 3722 struct spdk_nvme_transport_id *trid) 3723 { 3724 struct nvme_path_id *tmp_trid; 3725 3726 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3727 SPDK_ERRLOG("PCIe failover is not supported.\n"); 3728 return -ENOTSUP; 3729 } 3730 3731 /* Currently we only support failover to the same transport type. */ 3732 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 3733 return -EINVAL; 3734 } 3735 3736 /* Currently we only support failover to the same NQN. */ 3737 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 3738 return -EINVAL; 3739 } 3740 3741 /* Skip all the other checks if we've already registered this path. */ 3742 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 3743 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 3744 return -EEXIST; 3745 } 3746 } 3747 3748 return 0; 3749 } 3750 3751 static int 3752 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3753 struct spdk_nvme_ctrlr *new_ctrlr) 3754 { 3755 struct nvme_ns *nvme_ns; 3756 struct spdk_nvme_ns *new_ns; 3757 3758 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3759 while (nvme_ns != NULL) { 3760 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 3761 assert(new_ns != NULL); 3762 3763 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 3764 return -EINVAL; 3765 } 3766 3767 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3768 } 3769 3770 return 0; 3771 } 3772 3773 static int 3774 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 3775 struct spdk_nvme_transport_id *trid) 3776 { 3777 struct nvme_path_id *new_trid, *tmp_trid; 3778 3779 new_trid = calloc(1, sizeof(*new_trid)); 3780 if (new_trid == NULL) { 3781 return -ENOMEM; 3782 } 3783 new_trid->trid = *trid; 3784 new_trid->is_failed = false; 3785 3786 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 3787 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 3788 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 3789 return 0; 3790 } 3791 } 3792 3793 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 3794 return 0; 3795 } 3796 3797 /* This is the case that a secondary path is added to an existing 3798 * nvme_ctrlr for failover. After checking if it can access the same 3799 * namespaces as the primary path, it is disconnected until failover occurs. 3800 */ 3801 static int 3802 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 3803 struct spdk_nvme_ctrlr *new_ctrlr, 3804 struct spdk_nvme_transport_id *trid) 3805 { 3806 int rc; 3807 3808 assert(nvme_ctrlr != NULL); 3809 3810 pthread_mutex_lock(&nvme_ctrlr->mutex); 3811 3812 rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid); 3813 if (rc != 0) { 3814 goto exit; 3815 } 3816 3817 rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr); 3818 if (rc != 0) { 3819 goto exit; 3820 } 3821 3822 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 3823 3824 exit: 3825 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3826 3827 spdk_nvme_detach(new_ctrlr); 3828 3829 return rc; 3830 } 3831 3832 static void 3833 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3834 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 3835 { 3836 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 3837 struct nvme_async_probe_ctx *ctx; 3838 int rc; 3839 3840 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 3841 ctx->ctrlr_attached = true; 3842 3843 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 3844 if (rc != 0) { 3845 populate_namespaces_cb(ctx, 0, rc); 3846 } 3847 } 3848 3849 static void 3850 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3851 struct spdk_nvme_ctrlr *ctrlr, 3852 const struct spdk_nvme_ctrlr_opts *opts) 3853 { 3854 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 3855 struct nvme_ctrlr *nvme_ctrlr; 3856 struct nvme_async_probe_ctx *ctx; 3857 int rc; 3858 3859 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 3860 ctx->ctrlr_attached = true; 3861 3862 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 3863 if (nvme_ctrlr) { 3864 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 3865 } else { 3866 rc = -ENODEV; 3867 } 3868 3869 populate_namespaces_cb(ctx, 0, rc); 3870 } 3871 3872 static int 3873 bdev_nvme_async_poll(void *arg) 3874 { 3875 struct nvme_async_probe_ctx *ctx = arg; 3876 int rc; 3877 3878 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 3879 if (spdk_unlikely(rc != -EAGAIN)) { 3880 ctx->probe_done = true; 3881 spdk_poller_unregister(&ctx->poller); 3882 if (!ctx->ctrlr_attached) { 3883 /* The probe is done, but no controller was attached. 3884 * That means we had a failure, so report -EIO back to 3885 * the caller (usually the RPC). populate_namespaces_cb() 3886 * will take care of freeing the nvme_async_probe_ctx. 3887 */ 3888 populate_namespaces_cb(ctx, 0, -EIO); 3889 } else if (ctx->namespaces_populated) { 3890 /* The namespaces for the attached controller were all 3891 * populated and the response was already sent to the 3892 * caller (usually the RPC). So free the context here. 3893 */ 3894 free(ctx); 3895 } 3896 } 3897 3898 return SPDK_POLLER_BUSY; 3899 } 3900 3901 static bool 3902 bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec, 3903 uint32_t reconnect_delay_sec, 3904 uint32_t fast_io_fail_timeout_sec) 3905 { 3906 if (ctrlr_loss_timeout_sec < -1) { 3907 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 3908 return false; 3909 } else if (ctrlr_loss_timeout_sec == -1) { 3910 if (reconnect_delay_sec == 0) { 3911 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 3912 return false; 3913 } else if (fast_io_fail_timeout_sec != 0 && 3914 fast_io_fail_timeout_sec < reconnect_delay_sec) { 3915 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 3916 return false; 3917 } 3918 } else if (ctrlr_loss_timeout_sec != 0) { 3919 if (reconnect_delay_sec == 0) { 3920 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 3921 return false; 3922 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 3923 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 3924 return false; 3925 } else if (fast_io_fail_timeout_sec != 0) { 3926 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 3927 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 3928 return false; 3929 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 3930 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 3931 return false; 3932 } 3933 } 3934 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 3935 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 3936 return false; 3937 } 3938 3939 return true; 3940 } 3941 3942 int 3943 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 3944 const char *base_name, 3945 const char **names, 3946 uint32_t count, 3947 uint32_t prchk_flags, 3948 spdk_bdev_create_nvme_fn cb_fn, 3949 void *cb_ctx, 3950 struct spdk_nvme_ctrlr_opts *opts, 3951 bool multipath, 3952 int32_t ctrlr_loss_timeout_sec, 3953 uint32_t reconnect_delay_sec, 3954 uint32_t fast_io_fail_timeout_sec) 3955 { 3956 struct nvme_probe_skip_entry *entry, *tmp; 3957 struct nvme_async_probe_ctx *ctx; 3958 spdk_nvme_attach_cb attach_cb; 3959 3960 /* TODO expand this check to include both the host and target TRIDs. 3961 * Only if both are the same should we fail. 3962 */ 3963 if (nvme_ctrlr_get(trid) != NULL) { 3964 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 3965 return -EEXIST; 3966 } 3967 3968 if (!bdev_nvme_check_multipath_params(ctrlr_loss_timeout_sec, reconnect_delay_sec, 3969 fast_io_fail_timeout_sec)) { 3970 return -EINVAL; 3971 } 3972 3973 ctx = calloc(1, sizeof(*ctx)); 3974 if (!ctx) { 3975 return -ENOMEM; 3976 } 3977 ctx->base_name = base_name; 3978 ctx->names = names; 3979 ctx->count = count; 3980 ctx->cb_fn = cb_fn; 3981 ctx->cb_ctx = cb_ctx; 3982 ctx->prchk_flags = prchk_flags; 3983 ctx->trid = *trid; 3984 ctx->ctrlr_loss_timeout_sec = ctrlr_loss_timeout_sec; 3985 ctx->reconnect_delay_sec = reconnect_delay_sec; 3986 ctx->fast_io_fail_timeout_sec = fast_io_fail_timeout_sec; 3987 3988 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3989 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 3990 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3991 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 3992 free(entry); 3993 break; 3994 } 3995 } 3996 } 3997 3998 if (opts) { 3999 memcpy(&ctx->opts, opts, sizeof(*opts)); 4000 } else { 4001 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 4002 } 4003 4004 ctx->opts.transport_retry_count = g_opts.transport_retry_count; 4005 ctx->opts.transport_ack_timeout = g_opts.transport_ack_timeout; 4006 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 4007 ctx->opts.disable_read_ana_log_page = true; 4008 4009 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 4010 attach_cb = connect_attach_cb; 4011 } else { 4012 attach_cb = connect_set_failover_cb; 4013 } 4014 4015 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb); 4016 if (ctx->probe_ctx == NULL) { 4017 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 4018 free(ctx); 4019 return -ENODEV; 4020 } 4021 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 4022 4023 return 0; 4024 } 4025 4026 int 4027 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 4028 { 4029 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4030 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 4031 struct nvme_path_id *p, *t; 4032 int rc = -ENXIO; 4033 4034 if (name == NULL || path_id == NULL) { 4035 return -EINVAL; 4036 } 4037 4038 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4039 if (nbdev_ctrlr == NULL) { 4040 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 4041 return -ENODEV; 4042 } 4043 4044 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 4045 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 4046 if (path_id->trid.trtype != 0) { 4047 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 4048 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 4049 continue; 4050 } 4051 } else { 4052 if (path_id->trid.trtype != p->trid.trtype) { 4053 continue; 4054 } 4055 } 4056 } 4057 4058 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 4059 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 4060 continue; 4061 } 4062 } 4063 4064 if (path_id->trid.adrfam != 0) { 4065 if (path_id->trid.adrfam != p->trid.adrfam) { 4066 continue; 4067 } 4068 } 4069 4070 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 4071 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 4072 continue; 4073 } 4074 } 4075 4076 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 4077 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 4078 continue; 4079 } 4080 } 4081 4082 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 4083 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 4084 continue; 4085 } 4086 } 4087 4088 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 4089 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 4090 continue; 4091 } 4092 } 4093 4094 /* If we made it here, then this path is a match! Now we need to remove it. */ 4095 if (p == nvme_ctrlr->active_path_id) { 4096 /* This is the active path in use right now. The active path is always the first in the list. */ 4097 4098 if (!TAILQ_NEXT(p, link)) { 4099 /* The current path is the only path. */ 4100 rc = _bdev_nvme_delete(nvme_ctrlr, false); 4101 } else { 4102 /* There is an alternative path. */ 4103 rc = bdev_nvme_failover(nvme_ctrlr, true); 4104 } 4105 } else { 4106 /* We are not using the specified path. */ 4107 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 4108 free(p); 4109 rc = 0; 4110 } 4111 4112 if (rc < 0 && rc != -ENXIO) { 4113 return rc; 4114 } 4115 4116 4117 } 4118 } 4119 4120 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 4121 return rc; 4122 } 4123 4124 #define DISCOVERY_DEBUGLOG(ctx, format, ...) \ 4125 SPDK_DEBUGLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4126 4127 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 4128 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4129 4130 struct discovery_entry_ctx { 4131 char name[128]; 4132 struct spdk_nvme_transport_id trid; 4133 struct spdk_nvme_ctrlr_opts opts; 4134 struct spdk_nvmf_discovery_log_page_entry entry; 4135 TAILQ_ENTRY(discovery_entry_ctx) tailq; 4136 struct discovery_ctx *ctx; 4137 }; 4138 4139 struct discovery_ctx { 4140 char *name; 4141 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 4142 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 4143 void *cb_ctx; 4144 struct spdk_nvme_probe_ctx *probe_ctx; 4145 struct spdk_nvme_detach_ctx *detach_ctx; 4146 struct spdk_nvme_ctrlr *ctrlr; 4147 struct spdk_nvme_transport_id trid; 4148 struct spdk_poller *poller; 4149 struct spdk_nvme_ctrlr_opts opts; 4150 struct spdk_nvmf_discovery_log_page *log_page; 4151 TAILQ_ENTRY(discovery_ctx) tailq; 4152 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 4153 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 4154 int rc; 4155 /* Denotes if a discovery is currently in progress for this context. 4156 * That includes connecting to newly discovered subsystems. Used to 4157 * ensure we do not start a new discovery until an existing one is 4158 * complete. 4159 */ 4160 bool in_progress; 4161 4162 /* Denotes if another discovery is needed after the one in progress 4163 * completes. Set when we receive an AER completion while a discovery 4164 * is already in progress. 4165 */ 4166 bool pending; 4167 4168 /* Signal to the discovery context poller that it should detach from 4169 * the discovery controller. 4170 */ 4171 bool detach; 4172 4173 struct spdk_thread *calling_thread; 4174 uint32_t index; 4175 uint32_t attach_in_progress; 4176 char *hostnqn; 4177 }; 4178 4179 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 4180 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 4181 4182 static void get_discovery_log_page(struct discovery_ctx *ctx); 4183 4184 static void 4185 free_discovery_ctx(struct discovery_ctx *ctx) 4186 { 4187 free(ctx->hostnqn); 4188 free(ctx->name); 4189 free(ctx); 4190 } 4191 4192 static void 4193 discovery_complete(struct discovery_ctx *ctx) 4194 { 4195 ctx->in_progress = false; 4196 if (ctx->pending) { 4197 ctx->pending = false; 4198 get_discovery_log_page(ctx); 4199 } 4200 } 4201 4202 static void 4203 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 4204 struct spdk_nvmf_discovery_log_page_entry *entry) 4205 { 4206 char *space; 4207 4208 trid->trtype = entry->trtype; 4209 trid->adrfam = entry->adrfam; 4210 memcpy(trid->traddr, entry->traddr, sizeof(trid->traddr)); 4211 memcpy(trid->trsvcid, entry->trsvcid, sizeof(trid->trsvcid)); 4212 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 4213 4214 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 4215 * But the log page entries typically pad them with spaces, not zeroes. 4216 * So add a NULL terminator to each of these fields at the appropriate 4217 * location. 4218 */ 4219 space = strchr(trid->traddr, ' '); 4220 if (space) { 4221 *space = 0; 4222 } 4223 space = strchr(trid->trsvcid, ' '); 4224 if (space) { 4225 *space = 0; 4226 } 4227 space = strchr(trid->subnqn, ' '); 4228 if (space) { 4229 *space = 0; 4230 } 4231 } 4232 4233 static void 4234 discovery_remove_controllers(struct discovery_ctx *ctx) 4235 { 4236 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 4237 struct discovery_entry_ctx *entry_ctx, *tmp; 4238 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 4239 struct spdk_nvme_transport_id old_trid; 4240 uint64_t numrec, i; 4241 bool found; 4242 4243 numrec = from_le64(&log_page->numrec); 4244 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 4245 found = false; 4246 old_entry = &entry_ctx->entry; 4247 build_trid_from_log_page_entry(&old_trid, old_entry); 4248 for (i = 0; i < numrec; i++) { 4249 new_entry = &log_page->entries[i]; 4250 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 4251 DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s found again\n", 4252 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 4253 found = true; 4254 break; 4255 } 4256 } 4257 if (!found) { 4258 struct nvme_path_id path = {}; 4259 4260 DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s not found\n", 4261 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 4262 4263 path.trid = entry_ctx->trid; 4264 bdev_nvme_delete(entry_ctx->name, &path); 4265 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 4266 free(entry_ctx); 4267 } 4268 } 4269 free(log_page); 4270 ctx->log_page = NULL; 4271 discovery_complete(ctx); 4272 } 4273 4274 static void 4275 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 4276 { 4277 struct discovery_entry_ctx *entry_ctx = cb_ctx; 4278 struct discovery_ctx *ctx = entry_ctx->ctx;; 4279 4280 DISCOVERY_DEBUGLOG(ctx, "attach %s done\n", entry_ctx->name); 4281 ctx->attach_in_progress--; 4282 if (ctx->attach_in_progress == 0) { 4283 discovery_remove_controllers(ctx); 4284 } 4285 } 4286 4287 static void 4288 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 4289 struct spdk_nvmf_discovery_log_page *log_page) 4290 { 4291 struct discovery_ctx *ctx = cb_arg; 4292 struct discovery_entry_ctx *entry_ctx, *tmp; 4293 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 4294 uint64_t numrec, i; 4295 bool found; 4296 4297 if (rc || spdk_nvme_cpl_is_error(cpl)) { 4298 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 4299 return; 4300 } 4301 4302 ctx->log_page = log_page; 4303 assert(ctx->attach_in_progress == 0); 4304 numrec = from_le64(&log_page->numrec); 4305 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 4306 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 4307 free(entry_ctx); 4308 } 4309 for (i = 0; i < numrec; i++) { 4310 found = false; 4311 new_entry = &log_page->entries[i]; 4312 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 4313 struct discovery_entry_ctx *new_ctx; 4314 4315 new_ctx = calloc(1, sizeof(*new_ctx)); 4316 if (new_ctx == NULL) { 4317 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 4318 break; 4319 } 4320 4321 new_ctx->ctx = ctx; 4322 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 4323 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 4324 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->opts, sizeof(new_ctx->opts)); 4325 snprintf(new_ctx->opts.hostnqn, sizeof(new_ctx->opts.hostnqn), "%s", ctx->hostnqn); 4326 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 4327 continue; 4328 } 4329 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 4330 old_entry = &entry_ctx->entry; 4331 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 4332 found = true; 4333 break; 4334 } 4335 } 4336 if (!found) { 4337 struct discovery_entry_ctx *subnqn_ctx, *new_ctx; 4338 4339 TAILQ_FOREACH(subnqn_ctx, &ctx->nvm_entry_ctxs, tailq) { 4340 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 4341 sizeof(new_entry->subnqn))) { 4342 break; 4343 } 4344 } 4345 4346 new_ctx = calloc(1, sizeof(*new_ctx)); 4347 if (new_ctx == NULL) { 4348 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 4349 break; 4350 } 4351 4352 new_ctx->ctx = ctx; 4353 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 4354 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 4355 if (subnqn_ctx) { 4356 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 4357 DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s new path for %s\n", 4358 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 4359 new_ctx->name); 4360 } else { 4361 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 4362 DISCOVERY_DEBUGLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 4363 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 4364 new_ctx->name); 4365 } 4366 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->opts, sizeof(new_ctx->opts)); 4367 snprintf(new_ctx->opts.hostnqn, sizeof(new_ctx->opts.hostnqn), "%s", ctx->hostnqn); 4368 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 0, 4369 discovery_attach_controller_done, new_ctx, 4370 &new_ctx->opts, true, 0, 0, 0); 4371 if (rc == 0) { 4372 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 4373 ctx->attach_in_progress++; 4374 } else { 4375 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 4376 } 4377 } 4378 } 4379 4380 if (ctx->attach_in_progress == 0) { 4381 discovery_remove_controllers(ctx); 4382 } 4383 } 4384 4385 static void 4386 get_discovery_log_page(struct discovery_ctx *ctx) 4387 { 4388 int rc; 4389 4390 assert(ctx->in_progress == false); 4391 ctx->in_progress = true; 4392 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 4393 if (rc != 0) { 4394 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 4395 } 4396 DISCOVERY_DEBUGLOG(ctx, "sent discovery log page command\n"); 4397 } 4398 4399 static void 4400 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 4401 { 4402 struct discovery_ctx *ctx = arg; 4403 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 4404 4405 if (spdk_nvme_cpl_is_error(cpl)) { 4406 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 4407 return; 4408 } 4409 4410 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 4411 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 4412 return; 4413 } 4414 4415 DISCOVERY_DEBUGLOG(ctx, "got aer\n"); 4416 if (ctx->in_progress) { 4417 ctx->pending = true; 4418 return; 4419 } 4420 4421 get_discovery_log_page(ctx); 4422 } 4423 4424 static void 4425 start_discovery_done(void *cb_ctx) 4426 { 4427 struct discovery_ctx *ctx = cb_ctx; 4428 4429 DISCOVERY_DEBUGLOG(ctx, "start discovery done\n"); 4430 ctx->start_cb_fn(ctx->cb_ctx, ctx->rc); 4431 if (ctx->rc != 0) { 4432 DISCOVERY_ERRLOG(ctx, "could not connect to discovery ctrlr\n"); 4433 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 4434 free_discovery_ctx(ctx); 4435 } 4436 } 4437 4438 static void 4439 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4440 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 4441 { 4442 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4443 struct discovery_ctx *ctx; 4444 4445 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, opts); 4446 4447 DISCOVERY_DEBUGLOG(ctx, "discovery ctrlr attached\n"); 4448 ctx->probe_ctx = NULL; 4449 ctx->ctrlr = ctrlr; 4450 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 4451 } 4452 4453 static int 4454 discovery_poller(void *arg) 4455 { 4456 struct discovery_ctx *ctx = arg; 4457 int rc; 4458 4459 if (ctx->detach) { 4460 bool detach_done = false; 4461 4462 if (ctx->detach_ctx == NULL) { 4463 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 4464 if (rc != 0) { 4465 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 4466 detach_done = true; 4467 } 4468 } else { 4469 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 4470 if (rc != -EAGAIN) { 4471 detach_done = true; 4472 } 4473 } 4474 if (detach_done) { 4475 spdk_poller_unregister(&ctx->poller); 4476 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 4477 ctx->stop_cb_fn(ctx->cb_ctx); 4478 free_discovery_ctx(ctx); 4479 } 4480 } else if (ctx->probe_ctx) { 4481 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 4482 if (rc != -EAGAIN) { 4483 DISCOVERY_DEBUGLOG(ctx, "discovery ctrlr connected\n"); 4484 ctx->rc = rc; 4485 spdk_thread_send_msg(ctx->calling_thread, start_discovery_done, ctx); 4486 if (rc == 0) { 4487 get_discovery_log_page(ctx); 4488 } 4489 } 4490 } else { 4491 spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 4492 } 4493 4494 return SPDK_POLLER_BUSY; 4495 } 4496 4497 static void 4498 start_discovery_poller(void *arg) 4499 { 4500 struct discovery_ctx *ctx = arg; 4501 4502 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 4503 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 4504 } 4505 4506 int 4507 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 4508 const char *base_name, 4509 struct spdk_nvme_ctrlr_opts *opts, 4510 spdk_bdev_nvme_start_discovery_fn cb_fn, 4511 void *cb_ctx) 4512 { 4513 struct discovery_ctx *ctx; 4514 4515 ctx = calloc(1, sizeof(*ctx)); 4516 if (ctx == NULL) { 4517 return -ENOMEM; 4518 } 4519 4520 ctx->name = strdup(base_name); 4521 if (ctx->name == NULL) { 4522 free_discovery_ctx(ctx); 4523 return -ENOMEM; 4524 } 4525 ctx->start_cb_fn = cb_fn; 4526 ctx->cb_ctx = cb_ctx; 4527 memcpy(&ctx->opts, opts, sizeof(*opts)); 4528 ctx->calling_thread = spdk_get_thread(); 4529 TAILQ_INIT(&ctx->nvm_entry_ctxs); 4530 TAILQ_INIT(&ctx->discovery_entry_ctxs); 4531 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 4532 memcpy(&ctx->trid, trid, sizeof(*trid)); 4533 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 4534 ctx->hostnqn = strdup(ctx->opts.hostnqn); 4535 if (ctx->hostnqn == NULL) { 4536 free_discovery_ctx(ctx); 4537 return -ENOMEM; 4538 } 4539 ctx->probe_ctx = spdk_nvme_connect_async(&ctx->trid, &ctx->opts, discovery_attach_cb); 4540 if (ctx->probe_ctx == NULL) { 4541 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 4542 free_discovery_ctx(ctx); 4543 return -EIO; 4544 } 4545 4546 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 4547 return 0; 4548 } 4549 4550 int 4551 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 4552 { 4553 struct discovery_ctx *ctx; 4554 4555 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 4556 if (strcmp(name, ctx->name) == 0) { 4557 if (ctx->detach) { 4558 return -EALREADY; 4559 } 4560 ctx->detach = true; 4561 ctx->stop_cb_fn = cb_fn; 4562 ctx->cb_ctx = cb_ctx; 4563 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 4564 struct discovery_entry_ctx *entry_ctx; 4565 struct nvme_path_id path = {}; 4566 4567 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 4568 path.trid = entry_ctx->trid; 4569 bdev_nvme_delete(entry_ctx->name, &path); 4570 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 4571 free(entry_ctx); 4572 } 4573 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 4574 struct discovery_entry_ctx *entry_ctx; 4575 4576 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 4577 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 4578 free(entry_ctx); 4579 } 4580 return 0; 4581 } 4582 } 4583 4584 return -ENOENT; 4585 } 4586 4587 static int 4588 bdev_nvme_library_init(void) 4589 { 4590 g_bdev_nvme_init_thread = spdk_get_thread(); 4591 4592 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 4593 bdev_nvme_destroy_poll_group_cb, 4594 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 4595 4596 return 0; 4597 } 4598 4599 static void 4600 bdev_nvme_fini_destruct_ctrlrs(void) 4601 { 4602 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4603 struct nvme_ctrlr *nvme_ctrlr; 4604 4605 pthread_mutex_lock(&g_bdev_nvme_mutex); 4606 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 4607 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 4608 pthread_mutex_lock(&nvme_ctrlr->mutex); 4609 if (nvme_ctrlr->destruct) { 4610 /* This controller's destruction was already started 4611 * before the application started shutting down 4612 */ 4613 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4614 continue; 4615 } 4616 nvme_ctrlr->destruct = true; 4617 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4618 4619 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 4620 nvme_ctrlr); 4621 } 4622 } 4623 4624 g_bdev_nvme_module_finish = true; 4625 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4626 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4627 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 4628 spdk_bdev_module_fini_done(); 4629 return; 4630 } 4631 4632 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4633 } 4634 4635 static void 4636 check_discovery_fini(void *arg) 4637 { 4638 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 4639 bdev_nvme_fini_destruct_ctrlrs(); 4640 } 4641 } 4642 4643 static void 4644 bdev_nvme_library_fini(void) 4645 { 4646 struct nvme_probe_skip_entry *entry, *entry_tmp; 4647 struct discovery_ctx *ctx; 4648 4649 spdk_poller_unregister(&g_hotplug_poller); 4650 free(g_hotplug_probe_ctx); 4651 g_hotplug_probe_ctx = NULL; 4652 4653 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 4654 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 4655 free(entry); 4656 } 4657 4658 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 4659 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 4660 bdev_nvme_fini_destruct_ctrlrs(); 4661 } else { 4662 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 4663 ctx->detach = true; 4664 ctx->stop_cb_fn = check_discovery_fini; 4665 } 4666 } 4667 } 4668 4669 static void 4670 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 4671 { 4672 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 4673 struct spdk_bdev *bdev = bdev_io->bdev; 4674 struct spdk_dif_ctx dif_ctx; 4675 struct spdk_dif_error err_blk = {}; 4676 int rc; 4677 4678 rc = spdk_dif_ctx_init(&dif_ctx, 4679 bdev->blocklen, bdev->md_len, bdev->md_interleave, 4680 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 4681 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 4682 if (rc != 0) { 4683 SPDK_ERRLOG("Initialization of DIF context failed\n"); 4684 return; 4685 } 4686 4687 if (bdev->md_interleave) { 4688 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 4689 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 4690 } else { 4691 struct iovec md_iov = { 4692 .iov_base = bdev_io->u.bdev.md_buf, 4693 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 4694 }; 4695 4696 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 4697 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 4698 } 4699 4700 if (rc != 0) { 4701 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 4702 err_blk.err_type, err_blk.err_offset); 4703 } else { 4704 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 4705 } 4706 } 4707 4708 static void 4709 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 4710 { 4711 struct nvme_bdev_io *bio = ref; 4712 4713 if (spdk_nvme_cpl_is_success(cpl)) { 4714 /* Run PI verification for read data buffer. */ 4715 bdev_nvme_verify_pi_error(bio); 4716 } 4717 4718 /* Return original completion status */ 4719 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 4720 } 4721 4722 static void 4723 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 4724 { 4725 struct nvme_bdev_io *bio = ref; 4726 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 4727 int ret; 4728 4729 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 4730 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 4731 cpl->status.sct, cpl->status.sc); 4732 4733 /* Save completion status to use after verifying PI error. */ 4734 bio->cpl = *cpl; 4735 4736 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 4737 /* Read without PI checking to verify PI error. */ 4738 ret = bdev_nvme_no_pi_readv(bio, 4739 bdev_io->u.bdev.iovs, 4740 bdev_io->u.bdev.iovcnt, 4741 bdev_io->u.bdev.md_buf, 4742 bdev_io->u.bdev.num_blocks, 4743 bdev_io->u.bdev.offset_blocks); 4744 if (ret == 0) { 4745 return; 4746 } 4747 } 4748 } 4749 4750 bdev_nvme_io_complete_nvme_status(bio, cpl); 4751 } 4752 4753 static void 4754 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 4755 { 4756 struct nvme_bdev_io *bio = ref; 4757 4758 if (spdk_nvme_cpl_is_pi_error(cpl)) { 4759 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 4760 cpl->status.sct, cpl->status.sc); 4761 /* Run PI verification for write data buffer if PI error is detected. */ 4762 bdev_nvme_verify_pi_error(bio); 4763 } 4764 4765 bdev_nvme_io_complete_nvme_status(bio, cpl); 4766 } 4767 4768 static void 4769 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 4770 { 4771 struct nvme_bdev_io *bio = ref; 4772 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 4773 4774 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 4775 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 4776 */ 4777 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 4778 4779 if (spdk_nvme_cpl_is_pi_error(cpl)) { 4780 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 4781 cpl->status.sct, cpl->status.sc); 4782 /* Run PI verification for zone append data buffer if PI error is detected. */ 4783 bdev_nvme_verify_pi_error(bio); 4784 } 4785 4786 bdev_nvme_io_complete_nvme_status(bio, cpl); 4787 } 4788 4789 static void 4790 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 4791 { 4792 struct nvme_bdev_io *bio = ref; 4793 4794 if (spdk_nvme_cpl_is_pi_error(cpl)) { 4795 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 4796 cpl->status.sct, cpl->status.sc); 4797 /* Run PI verification for compare data buffer if PI error is detected. */ 4798 bdev_nvme_verify_pi_error(bio); 4799 } 4800 4801 bdev_nvme_io_complete_nvme_status(bio, cpl); 4802 } 4803 4804 static void 4805 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 4806 { 4807 struct nvme_bdev_io *bio = ref; 4808 4809 /* Compare operation completion */ 4810 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 4811 /* Save compare result for write callback */ 4812 bio->cpl = *cpl; 4813 return; 4814 } 4815 4816 /* Write operation completion */ 4817 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 4818 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 4819 * complete the IO with the compare operation's status. 4820 */ 4821 if (!spdk_nvme_cpl_is_error(cpl)) { 4822 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 4823 } 4824 4825 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 4826 } else { 4827 bdev_nvme_io_complete_nvme_status(bio, cpl); 4828 } 4829 } 4830 4831 static void 4832 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 4833 { 4834 struct nvme_bdev_io *bio = ref; 4835 4836 bdev_nvme_io_complete_nvme_status(bio, cpl); 4837 } 4838 4839 static int 4840 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 4841 { 4842 switch (desc->zs) { 4843 case SPDK_NVME_ZONE_STATE_EMPTY: 4844 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 4845 break; 4846 case SPDK_NVME_ZONE_STATE_IOPEN: 4847 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 4848 break; 4849 case SPDK_NVME_ZONE_STATE_EOPEN: 4850 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 4851 break; 4852 case SPDK_NVME_ZONE_STATE_CLOSED: 4853 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 4854 break; 4855 case SPDK_NVME_ZONE_STATE_RONLY: 4856 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 4857 break; 4858 case SPDK_NVME_ZONE_STATE_FULL: 4859 info->state = SPDK_BDEV_ZONE_STATE_FULL; 4860 break; 4861 case SPDK_NVME_ZONE_STATE_OFFLINE: 4862 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 4863 break; 4864 default: 4865 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 4866 return -EIO; 4867 } 4868 4869 info->zone_id = desc->zslba; 4870 info->write_pointer = desc->wp; 4871 info->capacity = desc->zcap; 4872 4873 return 0; 4874 } 4875 4876 static void 4877 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 4878 { 4879 struct nvme_bdev_io *bio = ref; 4880 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 4881 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 4882 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 4883 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 4884 uint64_t max_zones_per_buf, i; 4885 uint32_t zone_report_bufsize; 4886 struct spdk_nvme_ns *ns; 4887 struct spdk_nvme_qpair *qpair; 4888 int ret; 4889 4890 if (spdk_nvme_cpl_is_error(cpl)) { 4891 goto out_complete_io_nvme_cpl; 4892 } 4893 4894 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 4895 ret = -ENXIO; 4896 goto out_complete_io_ret; 4897 } 4898 4899 ns = bio->io_path->nvme_ns->ns; 4900 qpair = bio->io_path->ctrlr_ch->qpair; 4901 4902 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 4903 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 4904 sizeof(bio->zone_report_buf->descs[0]); 4905 4906 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 4907 ret = -EINVAL; 4908 goto out_complete_io_ret; 4909 } 4910 4911 if (!bio->zone_report_buf->nr_zones) { 4912 ret = -EINVAL; 4913 goto out_complete_io_ret; 4914 } 4915 4916 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 4917 ret = fill_zone_from_report(&info[bio->handled_zones], 4918 &bio->zone_report_buf->descs[i]); 4919 if (ret) { 4920 goto out_complete_io_ret; 4921 } 4922 bio->handled_zones++; 4923 } 4924 4925 if (bio->handled_zones < zones_to_copy) { 4926 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4927 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 4928 4929 memset(bio->zone_report_buf, 0, zone_report_bufsize); 4930 ret = spdk_nvme_zns_report_zones(ns, qpair, 4931 bio->zone_report_buf, zone_report_bufsize, 4932 slba, SPDK_NVME_ZRA_LIST_ALL, true, 4933 bdev_nvme_get_zone_info_done, bio); 4934 if (!ret) { 4935 return; 4936 } else { 4937 goto out_complete_io_ret; 4938 } 4939 } 4940 4941 out_complete_io_nvme_cpl: 4942 free(bio->zone_report_buf); 4943 bio->zone_report_buf = NULL; 4944 bdev_nvme_io_complete_nvme_status(bio, cpl); 4945 return; 4946 4947 out_complete_io_ret: 4948 free(bio->zone_report_buf); 4949 bio->zone_report_buf = NULL; 4950 bdev_nvme_io_complete(bio, ret); 4951 } 4952 4953 static void 4954 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 4955 { 4956 struct nvme_bdev_io *bio = ref; 4957 4958 bdev_nvme_io_complete_nvme_status(bio, cpl); 4959 } 4960 4961 static void 4962 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 4963 { 4964 struct nvme_bdev_io *bio = ctx; 4965 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 4966 const struct spdk_nvme_cpl *cpl = &bio->cpl; 4967 struct nvme_bdev_channel *nbdev_ch; 4968 struct nvme_ctrlr *nvme_ctrlr; 4969 const struct spdk_nvme_ctrlr_data *cdata; 4970 uint64_t delay_ms; 4971 4972 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 4973 4974 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 4975 goto complete; 4976 } 4977 4978 if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 && 4979 bio->retry_count >= g_opts.bdev_retry_count)) { 4980 goto complete; 4981 } 4982 4983 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 4984 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(bio->io_path->ctrlr_ch); 4985 4986 if (spdk_nvme_cpl_is_path_error(cpl) || 4987 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 4988 !nvme_ctrlr_is_available(nvme_ctrlr)) { 4989 delay_ms = 0; 4990 } else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) { 4991 goto complete; 4992 } else { 4993 bio->retry_count++; 4994 4995 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 4996 4997 if (cpl->status.crd != 0) { 4998 delay_ms = cdata->crdt[cpl->status.crd] * 100; 4999 } else { 5000 delay_ms = 0; 5001 } 5002 } 5003 5004 if (any_ctrlr_may_become_available(nbdev_ch)) { 5005 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 5006 return; 5007 } 5008 5009 complete: 5010 bio->retry_count = 0; 5011 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 5012 } 5013 5014 static void 5015 bdev_nvme_abort_complete(void *ctx) 5016 { 5017 struct nvme_bdev_io *bio = ctx; 5018 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5019 5020 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 5021 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5022 } else { 5023 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5024 } 5025 } 5026 5027 static void 5028 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 5029 { 5030 struct nvme_bdev_io *bio = ref; 5031 5032 bio->cpl = *cpl; 5033 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 5034 } 5035 5036 static void 5037 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 5038 { 5039 struct nvme_bdev_io *bio = ref; 5040 5041 bio->cpl = *cpl; 5042 spdk_thread_send_msg(bio->orig_thread, 5043 bdev_nvme_admin_passthru_complete_nvme_status, bio); 5044 } 5045 5046 static void 5047 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 5048 { 5049 struct nvme_bdev_io *bio = ref; 5050 struct iovec *iov; 5051 5052 bio->iov_offset = sgl_offset; 5053 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 5054 iov = &bio->iovs[bio->iovpos]; 5055 if (bio->iov_offset < iov->iov_len) { 5056 break; 5057 } 5058 5059 bio->iov_offset -= iov->iov_len; 5060 } 5061 } 5062 5063 static int 5064 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 5065 { 5066 struct nvme_bdev_io *bio = ref; 5067 struct iovec *iov; 5068 5069 assert(bio->iovpos < bio->iovcnt); 5070 5071 iov = &bio->iovs[bio->iovpos]; 5072 5073 *address = iov->iov_base; 5074 *length = iov->iov_len; 5075 5076 if (bio->iov_offset) { 5077 assert(bio->iov_offset <= iov->iov_len); 5078 *address += bio->iov_offset; 5079 *length -= bio->iov_offset; 5080 } 5081 5082 bio->iov_offset += *length; 5083 if (bio->iov_offset == iov->iov_len) { 5084 bio->iovpos++; 5085 bio->iov_offset = 0; 5086 } 5087 5088 return 0; 5089 } 5090 5091 static void 5092 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 5093 { 5094 struct nvme_bdev_io *bio = ref; 5095 struct iovec *iov; 5096 5097 bio->fused_iov_offset = sgl_offset; 5098 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 5099 iov = &bio->fused_iovs[bio->fused_iovpos]; 5100 if (bio->fused_iov_offset < iov->iov_len) { 5101 break; 5102 } 5103 5104 bio->fused_iov_offset -= iov->iov_len; 5105 } 5106 } 5107 5108 static int 5109 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 5110 { 5111 struct nvme_bdev_io *bio = ref; 5112 struct iovec *iov; 5113 5114 assert(bio->fused_iovpos < bio->fused_iovcnt); 5115 5116 iov = &bio->fused_iovs[bio->fused_iovpos]; 5117 5118 *address = iov->iov_base; 5119 *length = iov->iov_len; 5120 5121 if (bio->fused_iov_offset) { 5122 assert(bio->fused_iov_offset <= iov->iov_len); 5123 *address += bio->fused_iov_offset; 5124 *length -= bio->fused_iov_offset; 5125 } 5126 5127 bio->fused_iov_offset += *length; 5128 if (bio->fused_iov_offset == iov->iov_len) { 5129 bio->fused_iovpos++; 5130 bio->fused_iov_offset = 0; 5131 } 5132 5133 return 0; 5134 } 5135 5136 static int 5137 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5138 void *md, uint64_t lba_count, uint64_t lba) 5139 { 5140 int rc; 5141 5142 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 5143 lba_count, lba); 5144 5145 bio->iovs = iov; 5146 bio->iovcnt = iovcnt; 5147 bio->iovpos = 0; 5148 bio->iov_offset = 0; 5149 5150 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 5151 bio->io_path->ctrlr_ch->qpair, 5152 lba, lba_count, 5153 bdev_nvme_no_pi_readv_done, bio, 0, 5154 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5155 md, 0, 0); 5156 5157 if (rc != 0 && rc != -ENOMEM) { 5158 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 5159 } 5160 return rc; 5161 } 5162 5163 static int 5164 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5165 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 5166 struct spdk_bdev_ext_io_opts *ext_opts) 5167 { 5168 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5169 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5170 int rc; 5171 5172 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 5173 lba_count, lba); 5174 5175 bio->iovs = iov; 5176 bio->iovcnt = iovcnt; 5177 bio->iovpos = 0; 5178 bio->iov_offset = 0; 5179 5180 if (ext_opts) { 5181 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 5182 bio->ext_opts.memory_domain = ext_opts->memory_domain; 5183 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 5184 bio->ext_opts.io_flags = flags; 5185 bio->ext_opts.metadata = md; 5186 5187 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 5188 bdev_nvme_readv_done, bio, 5189 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5190 &bio->ext_opts); 5191 } else if (iovcnt == 1) { 5192 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 5193 lba_count, 5194 bdev_nvme_readv_done, bio, 5195 flags, 5196 0, 0); 5197 } else { 5198 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 5199 bdev_nvme_readv_done, bio, flags, 5200 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5201 md, 0, 0); 5202 } 5203 5204 if (rc != 0 && rc != -ENOMEM) { 5205 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 5206 } 5207 return rc; 5208 } 5209 5210 static int 5211 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5212 void *md, uint64_t lba_count, uint64_t lba, 5213 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts) 5214 { 5215 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5216 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5217 int rc; 5218 5219 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 5220 lba_count, lba); 5221 5222 bio->iovs = iov; 5223 bio->iovcnt = iovcnt; 5224 bio->iovpos = 0; 5225 bio->iov_offset = 0; 5226 5227 if (ext_opts) { 5228 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 5229 bio->ext_opts.memory_domain = ext_opts->memory_domain; 5230 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 5231 bio->ext_opts.io_flags = flags; 5232 bio->ext_opts.metadata = md; 5233 5234 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 5235 bdev_nvme_writev_done, bio, 5236 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5237 &bio->ext_opts); 5238 } else if (iovcnt == 1) { 5239 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 5240 lba_count, 5241 bdev_nvme_writev_done, bio, 5242 flags, 5243 0, 0); 5244 } else { 5245 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 5246 bdev_nvme_writev_done, bio, flags, 5247 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5248 md, 0, 0); 5249 } 5250 5251 if (rc != 0 && rc != -ENOMEM) { 5252 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 5253 } 5254 return rc; 5255 } 5256 5257 static int 5258 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5259 void *md, uint64_t lba_count, uint64_t zslba, 5260 uint32_t flags) 5261 { 5262 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5263 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5264 int rc; 5265 5266 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 5267 lba_count, zslba); 5268 5269 bio->iovs = iov; 5270 bio->iovcnt = iovcnt; 5271 bio->iovpos = 0; 5272 bio->iov_offset = 0; 5273 5274 if (iovcnt == 1) { 5275 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 5276 lba_count, 5277 bdev_nvme_zone_appendv_done, bio, 5278 flags, 5279 0, 0); 5280 } else { 5281 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 5282 bdev_nvme_zone_appendv_done, bio, flags, 5283 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5284 md, 0, 0); 5285 } 5286 5287 if (rc != 0 && rc != -ENOMEM) { 5288 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 5289 } 5290 return rc; 5291 } 5292 5293 static int 5294 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5295 void *md, uint64_t lba_count, uint64_t lba, 5296 uint32_t flags) 5297 { 5298 int rc; 5299 5300 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 5301 lba_count, lba); 5302 5303 bio->iovs = iov; 5304 bio->iovcnt = iovcnt; 5305 bio->iovpos = 0; 5306 bio->iov_offset = 0; 5307 5308 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 5309 bio->io_path->ctrlr_ch->qpair, 5310 lba, lba_count, 5311 bdev_nvme_comparev_done, bio, flags, 5312 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5313 md, 0, 0); 5314 5315 if (rc != 0 && rc != -ENOMEM) { 5316 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 5317 } 5318 return rc; 5319 } 5320 5321 static int 5322 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 5323 struct iovec *write_iov, int write_iovcnt, 5324 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 5325 { 5326 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5327 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5328 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5329 int rc; 5330 5331 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 5332 lba_count, lba); 5333 5334 bio->iovs = cmp_iov; 5335 bio->iovcnt = cmp_iovcnt; 5336 bio->iovpos = 0; 5337 bio->iov_offset = 0; 5338 bio->fused_iovs = write_iov; 5339 bio->fused_iovcnt = write_iovcnt; 5340 bio->fused_iovpos = 0; 5341 bio->fused_iov_offset = 0; 5342 5343 if (bdev_io->num_retries == 0) { 5344 bio->first_fused_submitted = false; 5345 } 5346 5347 if (!bio->first_fused_submitted) { 5348 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 5349 memset(&bio->cpl, 0, sizeof(bio->cpl)); 5350 5351 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 5352 bdev_nvme_comparev_and_writev_done, bio, flags, 5353 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 5354 if (rc == 0) { 5355 bio->first_fused_submitted = true; 5356 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 5357 } else { 5358 if (rc != -ENOMEM) { 5359 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 5360 } 5361 return rc; 5362 } 5363 } 5364 5365 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 5366 5367 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 5368 bdev_nvme_comparev_and_writev_done, bio, flags, 5369 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 5370 if (rc != 0 && rc != -ENOMEM) { 5371 SPDK_ERRLOG("write failed: rc = %d\n", rc); 5372 rc = 0; 5373 } 5374 5375 return rc; 5376 } 5377 5378 static int 5379 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 5380 { 5381 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 5382 struct spdk_nvme_dsm_range *range; 5383 uint64_t offset, remaining; 5384 uint64_t num_ranges_u64; 5385 uint16_t num_ranges; 5386 int rc; 5387 5388 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 5389 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 5390 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 5391 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 5392 return -EINVAL; 5393 } 5394 num_ranges = (uint16_t)num_ranges_u64; 5395 5396 offset = offset_blocks; 5397 remaining = num_blocks; 5398 range = &dsm_ranges[0]; 5399 5400 /* Fill max-size ranges until the remaining blocks fit into one range */ 5401 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 5402 range->attributes.raw = 0; 5403 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 5404 range->starting_lba = offset; 5405 5406 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 5407 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 5408 range++; 5409 } 5410 5411 /* Final range describes the remaining blocks */ 5412 range->attributes.raw = 0; 5413 range->length = remaining; 5414 range->starting_lba = offset; 5415 5416 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 5417 bio->io_path->ctrlr_ch->qpair, 5418 SPDK_NVME_DSM_ATTR_DEALLOCATE, 5419 dsm_ranges, num_ranges, 5420 bdev_nvme_queued_done, bio); 5421 5422 return rc; 5423 } 5424 5425 static int 5426 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 5427 { 5428 if (num_blocks > UINT16_MAX + 1) { 5429 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 5430 return -EINVAL; 5431 } 5432 5433 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 5434 bio->io_path->ctrlr_ch->qpair, 5435 offset_blocks, num_blocks, 5436 bdev_nvme_queued_done, bio, 5437 0); 5438 } 5439 5440 static int 5441 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 5442 struct spdk_bdev_zone_info *info) 5443 { 5444 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5445 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5446 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 5447 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 5448 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 5449 5450 if (zone_id % zone_size != 0) { 5451 return -EINVAL; 5452 } 5453 5454 if (num_zones > total_zones || !num_zones) { 5455 return -EINVAL; 5456 } 5457 5458 assert(!bio->zone_report_buf); 5459 bio->zone_report_buf = calloc(1, zone_report_bufsize); 5460 if (!bio->zone_report_buf) { 5461 return -ENOMEM; 5462 } 5463 5464 bio->handled_zones = 0; 5465 5466 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 5467 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 5468 bdev_nvme_get_zone_info_done, bio); 5469 } 5470 5471 static int 5472 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 5473 enum spdk_bdev_zone_action action) 5474 { 5475 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5476 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5477 5478 switch (action) { 5479 case SPDK_BDEV_ZONE_CLOSE: 5480 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 5481 bdev_nvme_zone_management_done, bio); 5482 case SPDK_BDEV_ZONE_FINISH: 5483 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 5484 bdev_nvme_zone_management_done, bio); 5485 case SPDK_BDEV_ZONE_OPEN: 5486 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 5487 bdev_nvme_zone_management_done, bio); 5488 case SPDK_BDEV_ZONE_RESET: 5489 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 5490 bdev_nvme_zone_management_done, bio); 5491 case SPDK_BDEV_ZONE_OFFLINE: 5492 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 5493 bdev_nvme_zone_management_done, bio); 5494 default: 5495 return -EINVAL; 5496 } 5497 } 5498 5499 static void 5500 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 5501 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 5502 { 5503 struct nvme_io_path *io_path; 5504 struct nvme_ctrlr *nvme_ctrlr; 5505 uint32_t max_xfer_size; 5506 int rc = -ENXIO; 5507 5508 /* Choose the first ctrlr which is not failed. */ 5509 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5510 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch); 5511 5512 /* We should skip any unavailable nvme_ctrlr rather than checking 5513 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 5514 */ 5515 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 5516 continue; 5517 } 5518 5519 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 5520 5521 if (nbytes > max_xfer_size) { 5522 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 5523 rc = -EINVAL; 5524 goto err; 5525 } 5526 5527 bio->io_path = io_path; 5528 bio->orig_thread = spdk_get_thread(); 5529 5530 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 5531 bdev_nvme_admin_passthru_done, bio); 5532 if (rc == 0) { 5533 return; 5534 } 5535 } 5536 5537 err: 5538 bdev_nvme_admin_passthru_complete(bio, rc); 5539 } 5540 5541 static int 5542 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 5543 void *buf, size_t nbytes) 5544 { 5545 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5546 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5547 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 5548 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 5549 5550 if (nbytes > max_xfer_size) { 5551 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 5552 return -EINVAL; 5553 } 5554 5555 /* 5556 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 5557 * so fill it out automatically. 5558 */ 5559 cmd->nsid = spdk_nvme_ns_get_id(ns); 5560 5561 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 5562 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 5563 } 5564 5565 static int 5566 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 5567 void *buf, size_t nbytes, void *md_buf, size_t md_len) 5568 { 5569 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5570 struct spdk_nvme_qpair *qpair = bio->io_path->ctrlr_ch->qpair; 5571 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 5572 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 5573 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 5574 5575 if (nbytes > max_xfer_size) { 5576 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 5577 return -EINVAL; 5578 } 5579 5580 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 5581 SPDK_ERRLOG("invalid meta data buffer size\n"); 5582 return -EINVAL; 5583 } 5584 5585 /* 5586 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 5587 * so fill it out automatically. 5588 */ 5589 cmd->nsid = spdk_nvme_ns_get_id(ns); 5590 5591 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 5592 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 5593 } 5594 5595 static void 5596 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 5597 struct nvme_bdev_io *bio_to_abort) 5598 { 5599 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5600 struct spdk_bdev_io *bdev_io_to_abort; 5601 struct nvme_io_path *io_path; 5602 struct nvme_ctrlr *nvme_ctrlr; 5603 int rc = 0; 5604 5605 bio->orig_thread = spdk_get_thread(); 5606 5607 /* Traverse the retry_io_list first. */ 5608 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 5609 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 5610 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 5611 spdk_bdev_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 5612 5613 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5614 return; 5615 } 5616 } 5617 5618 /* Even admin commands, they were submitted to only nvme_ctrlrs which were 5619 * on any io_path. So traverse the io_path list for not only I/O commands 5620 * but also admin commands. 5621 */ 5622 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5623 nvme_ctrlr = nvme_ctrlr_channel_get_ctrlr(io_path->ctrlr_ch); 5624 5625 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 5626 io_path->ctrlr_ch->qpair, 5627 bio_to_abort, 5628 bdev_nvme_abort_done, bio); 5629 if (rc == -ENOENT) { 5630 /* If no command was found in I/O qpair, the target command may be 5631 * admin command. 5632 */ 5633 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 5634 NULL, 5635 bio_to_abort, 5636 bdev_nvme_abort_done, bio); 5637 } 5638 5639 if (rc != -ENOENT) { 5640 break; 5641 } 5642 } 5643 5644 if (rc != 0) { 5645 /* If no command was found or there was any error, complete the abort 5646 * request with failure. 5647 */ 5648 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5649 } 5650 } 5651 5652 static void 5653 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 5654 { 5655 const char *action; 5656 5657 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 5658 action = "reset"; 5659 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 5660 action = "abort"; 5661 } else { 5662 action = "none"; 5663 } 5664 5665 spdk_json_write_object_begin(w); 5666 5667 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 5668 5669 spdk_json_write_named_object_begin(w, "params"); 5670 spdk_json_write_named_string(w, "action_on_timeout", action); 5671 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 5672 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 5673 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 5674 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 5675 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 5676 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 5677 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 5678 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 5679 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 5680 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 5681 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 5682 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 5683 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 5684 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 5685 spdk_json_write_object_end(w); 5686 5687 spdk_json_write_object_end(w); 5688 } 5689 5690 static void 5691 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 5692 struct nvme_ctrlr *nvme_ctrlr) 5693 { 5694 struct spdk_nvme_transport_id *trid; 5695 5696 trid = &nvme_ctrlr->active_path_id->trid; 5697 5698 spdk_json_write_object_begin(w); 5699 5700 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 5701 5702 spdk_json_write_named_object_begin(w, "params"); 5703 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 5704 nvme_bdev_dump_trid_json(trid, w); 5705 spdk_json_write_named_bool(w, "prchk_reftag", 5706 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 5707 spdk_json_write_named_bool(w, "prchk_guard", 5708 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 5709 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->ctrlr_loss_timeout_sec); 5710 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->reconnect_delay_sec); 5711 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", nvme_ctrlr->fast_io_fail_timeout_sec); 5712 5713 spdk_json_write_object_end(w); 5714 5715 spdk_json_write_object_end(w); 5716 } 5717 5718 static void 5719 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 5720 { 5721 spdk_json_write_object_begin(w); 5722 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 5723 5724 spdk_json_write_named_object_begin(w, "params"); 5725 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 5726 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 5727 spdk_json_write_object_end(w); 5728 5729 spdk_json_write_object_end(w); 5730 } 5731 5732 static int 5733 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 5734 { 5735 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5736 struct nvme_ctrlr *nvme_ctrlr; 5737 5738 bdev_nvme_opts_config_json(w); 5739 5740 pthread_mutex_lock(&g_bdev_nvme_mutex); 5741 5742 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 5743 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5744 nvme_ctrlr_config_json(w, nvme_ctrlr); 5745 } 5746 } 5747 5748 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 5749 * before enabling hotplug poller. 5750 */ 5751 bdev_nvme_hotplug_config_json(w); 5752 5753 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5754 return 0; 5755 } 5756 5757 struct spdk_nvme_ctrlr * 5758 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 5759 { 5760 struct nvme_bdev *nbdev; 5761 struct nvme_ns *nvme_ns; 5762 5763 if (!bdev || bdev->module != &nvme_if) { 5764 return NULL; 5765 } 5766 5767 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5768 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 5769 assert(nvme_ns != NULL); 5770 5771 return nvme_ns->ctrlr->ctrlr; 5772 } 5773 5774 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 5775