1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "bdev_nvme.h" 10 11 #include "spdk/accel_engine.h" 12 #include "spdk/config.h" 13 #include "spdk/endian.h" 14 #include "spdk/bdev.h" 15 #include "spdk/json.h" 16 #include "spdk/likely.h" 17 #include "spdk/nvme.h" 18 #include "spdk/nvme_ocssd.h" 19 #include "spdk/nvme_zns.h" 20 #include "spdk/opal.h" 21 #include "spdk/thread.h" 22 #include "spdk/string.h" 23 #include "spdk/util.h" 24 25 #include "spdk/bdev_module.h" 26 #include "spdk/log.h" 27 28 #include "spdk_internal/usdt.h" 29 30 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 31 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 32 33 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 34 35 struct nvme_bdev_io { 36 /** array of iovecs to transfer. */ 37 struct iovec *iovs; 38 39 /** Number of iovecs in iovs array. */ 40 int iovcnt; 41 42 /** Current iovec position. */ 43 int iovpos; 44 45 /** Offset in current iovec. */ 46 uint32_t iov_offset; 47 48 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 49 * being reset in a reset I/O. 50 */ 51 struct nvme_io_path *io_path; 52 53 /** array of iovecs to transfer. */ 54 struct iovec *fused_iovs; 55 56 /** Number of iovecs in iovs array. */ 57 int fused_iovcnt; 58 59 /** Current iovec position. */ 60 int fused_iovpos; 61 62 /** Offset in current iovec. */ 63 uint32_t fused_iov_offset; 64 65 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 66 struct spdk_nvme_cpl cpl; 67 68 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 69 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 70 71 /** Originating thread */ 72 struct spdk_thread *orig_thread; 73 74 /** Keeps track if first of fused commands was submitted */ 75 bool first_fused_submitted; 76 77 /** Keeps track if first of fused commands was completed */ 78 bool first_fused_completed; 79 80 /** Temporary pointer to zone report buffer */ 81 struct spdk_nvme_zns_zone_report *zone_report_buf; 82 83 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 84 uint64_t handled_zones; 85 86 /** Expiration value in ticks to retry the current I/O. */ 87 uint64_t retry_ticks; 88 89 /* How many times the current I/O was retried. 
*/ 90 int32_t retry_count; 91 }; 92 93 struct nvme_probe_skip_entry { 94 struct spdk_nvme_transport_id trid; 95 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 96 }; 97 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 98 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 99 g_skipped_nvme_ctrlrs); 100 101 static struct spdk_bdev_nvme_opts g_opts = { 102 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 103 .timeout_us = 0, 104 .timeout_admin_us = 0, 105 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 106 .transport_retry_count = 4, 107 .arbitration_burst = 0, 108 .low_priority_weight = 0, 109 .medium_priority_weight = 0, 110 .high_priority_weight = 0, 111 .nvme_adminq_poll_period_us = 10000ULL, 112 .nvme_ioq_poll_period_us = 0, 113 .io_queue_requests = 0, 114 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 115 .bdev_retry_count = 3, 116 .transport_ack_timeout = 0, 117 .ctrlr_loss_timeout_sec = 0, 118 .reconnect_delay_sec = 0, 119 .fast_io_fail_timeout_sec = 0, 120 .disable_auto_failback = false, 121 }; 122 123 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 124 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 125 126 static int g_hot_insert_nvme_controller_index = 0; 127 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 128 static bool g_nvme_hotplug_enabled = false; 129 static struct spdk_thread *g_bdev_nvme_init_thread; 130 static struct spdk_poller *g_hotplug_poller; 131 static struct spdk_poller *g_hotplug_probe_poller; 132 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 133 134 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 135 struct nvme_async_probe_ctx *ctx); 136 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 137 struct nvme_async_probe_ctx *ctx); 138 static int bdev_nvme_library_init(void); 139 static void bdev_nvme_library_fini(void); 140 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 141 struct spdk_bdev_io *bdev_io); 142 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 143 void *md, uint64_t lba_count, uint64_t lba, 144 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); 145 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 146 void *md, uint64_t lba_count, uint64_t lba); 147 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 148 void *md, uint64_t lba_count, uint64_t lba, 149 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); 150 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 151 void *md, uint64_t lba_count, 152 uint64_t zslba, uint32_t flags); 153 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 154 void *md, uint64_t lba_count, uint64_t lba, 155 uint32_t flags); 156 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 157 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 158 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 159 uint32_t flags); 160 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 161 uint32_t num_zones, struct spdk_bdev_zone_info *info); 162 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 163 enum spdk_bdev_zone_action action); 164 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 165 struct nvme_bdev_io 
*bio, 166 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 167 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 168 void *buf, size_t nbytes); 169 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 170 void *buf, size_t nbytes, void *md_buf, size_t md_len); 171 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 172 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 173 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 174 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr); 175 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove); 176 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 177 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 178 179 static int 180 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 181 { 182 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 183 } 184 185 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 186 187 struct spdk_nvme_qpair * 188 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 189 { 190 struct nvme_ctrlr_channel *ctrlr_ch; 191 192 assert(ctrlr_io_ch != NULL); 193 194 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 195 196 return ctrlr_ch->qpair->qpair; 197 } 198 199 static int 200 bdev_nvme_get_ctx_size(void) 201 { 202 return sizeof(struct nvme_bdev_io); 203 } 204 205 static struct spdk_bdev_module nvme_if = { 206 .name = "nvme", 207 .async_fini = true, 208 .module_init = bdev_nvme_library_init, 209 .module_fini = bdev_nvme_library_fini, 210 .config_json = bdev_nvme_config_json, 211 .get_ctx_size = bdev_nvme_get_ctx_size, 212 213 }; 214 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 215 216 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 217 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 218 bool g_bdev_nvme_module_finish; 219 220 struct nvme_bdev_ctrlr * 221 nvme_bdev_ctrlr_get_by_name(const char *name) 222 { 223 struct nvme_bdev_ctrlr *nbdev_ctrlr; 224 225 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 226 if (strcmp(name, nbdev_ctrlr->name) == 0) { 227 break; 228 } 229 } 230 231 return nbdev_ctrlr; 232 } 233 234 static struct nvme_ctrlr * 235 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 236 const struct spdk_nvme_transport_id *trid) 237 { 238 struct nvme_ctrlr *nvme_ctrlr; 239 240 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 241 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 242 break; 243 } 244 } 245 246 return nvme_ctrlr; 247 } 248 249 static struct nvme_bdev * 250 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 251 { 252 struct nvme_bdev *bdev; 253 254 pthread_mutex_lock(&g_bdev_nvme_mutex); 255 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 256 if (bdev->nsid == nsid) { 257 break; 258 } 259 } 260 pthread_mutex_unlock(&g_bdev_nvme_mutex); 261 262 return bdev; 263 } 264 265 struct nvme_ns * 266 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 267 { 268 struct nvme_ns ns; 269 270 assert(nsid > 0); 271 272 ns.id = nsid; 273 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 274 } 275 276 struct nvme_ns * 277 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 278 { 279 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 280 } 281 282 struct nvme_ns * 283 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, 
struct nvme_ns *ns) 284 { 285 if (ns == NULL) { 286 return NULL; 287 } 288 289 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 290 } 291 292 static struct nvme_ctrlr * 293 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 294 { 295 struct nvme_bdev_ctrlr *nbdev_ctrlr; 296 struct nvme_ctrlr *nvme_ctrlr = NULL; 297 298 pthread_mutex_lock(&g_bdev_nvme_mutex); 299 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 300 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 301 if (nvme_ctrlr != NULL) { 302 break; 303 } 304 } 305 pthread_mutex_unlock(&g_bdev_nvme_mutex); 306 307 return nvme_ctrlr; 308 } 309 310 struct nvme_ctrlr * 311 nvme_ctrlr_get_by_name(const char *name) 312 { 313 struct nvme_bdev_ctrlr *nbdev_ctrlr; 314 struct nvme_ctrlr *nvme_ctrlr = NULL; 315 316 if (name == NULL) { 317 return NULL; 318 } 319 320 pthread_mutex_lock(&g_bdev_nvme_mutex); 321 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 322 if (nbdev_ctrlr != NULL) { 323 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 324 } 325 pthread_mutex_unlock(&g_bdev_nvme_mutex); 326 327 return nvme_ctrlr; 328 } 329 330 void 331 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 332 { 333 struct nvme_bdev_ctrlr *nbdev_ctrlr; 334 335 pthread_mutex_lock(&g_bdev_nvme_mutex); 336 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 337 fn(nbdev_ctrlr, ctx); 338 } 339 pthread_mutex_unlock(&g_bdev_nvme_mutex); 340 } 341 342 void 343 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 344 { 345 const char *trtype_str; 346 const char *adrfam_str; 347 348 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 349 if (trtype_str) { 350 spdk_json_write_named_string(w, "trtype", trtype_str); 351 } 352 353 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 354 if (adrfam_str) { 355 spdk_json_write_named_string(w, "adrfam", adrfam_str); 356 } 357 358 if (trid->traddr[0] != '\0') { 359 spdk_json_write_named_string(w, "traddr", trid->traddr); 360 } 361 362 if (trid->trsvcid[0] != '\0') { 363 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 364 } 365 366 if (trid->subnqn[0] != '\0') { 367 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 368 } 369 } 370 371 static void 372 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 373 struct nvme_ctrlr *nvme_ctrlr) 374 { 375 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 376 pthread_mutex_lock(&g_bdev_nvme_mutex); 377 378 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 379 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 380 pthread_mutex_unlock(&g_bdev_nvme_mutex); 381 382 return; 383 } 384 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 385 386 pthread_mutex_unlock(&g_bdev_nvme_mutex); 387 388 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 389 390 free(nbdev_ctrlr->name); 391 free(nbdev_ctrlr); 392 } 393 394 static void 395 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 396 { 397 struct nvme_path_id *path_id, *tmp_path; 398 struct nvme_ns *ns, *tmp_ns; 399 400 free(nvme_ctrlr->copied_ana_desc); 401 spdk_free(nvme_ctrlr->ana_log_page); 402 403 if (nvme_ctrlr->opal_dev) { 404 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 405 nvme_ctrlr->opal_dev = NULL; 406 } 407 408 if (nvme_ctrlr->nbdev_ctrlr) { 409 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 410 } 411 412 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 413 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 414 free(ns); 415 } 416 417 
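/*
 * Illustrative sketch, not compiled into the driver: the namespace RB-tree
 * torn down above is normally walked with nvme_ctrlr_get_first_active_ns()
 * and nvme_ctrlr_get_next_active_ns() defined earlier in this file.  The
 * helper below is hypothetical and only shows the intended iteration pattern.
 */
#if 0
static uint32_t
nvme_ctrlr_count_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_ns *nvme_ns;
	uint32_t count = 0;

	/* The accessors are RB_MIN()/RB_NEXT() wrappers, so namespaces come
	 * back in ascending nsid order.
	 */
	for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	     nvme_ns != NULL;
	     nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
		count++;
	}

	return count;
}
#endif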
TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 418 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 419 free(path_id); 420 } 421 422 pthread_mutex_destroy(&nvme_ctrlr->mutex); 423 424 free(nvme_ctrlr); 425 426 pthread_mutex_lock(&g_bdev_nvme_mutex); 427 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 428 pthread_mutex_unlock(&g_bdev_nvme_mutex); 429 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 430 spdk_bdev_module_fini_done(); 431 return; 432 } 433 pthread_mutex_unlock(&g_bdev_nvme_mutex); 434 } 435 436 static int 437 nvme_detach_poller(void *arg) 438 { 439 struct nvme_ctrlr *nvme_ctrlr = arg; 440 int rc; 441 442 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 443 if (rc != -EAGAIN) { 444 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 445 _nvme_ctrlr_delete(nvme_ctrlr); 446 } 447 448 return SPDK_POLLER_BUSY; 449 } 450 451 static void 452 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 453 { 454 int rc; 455 456 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 457 458 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 459 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 460 461 /* If we got here, the reset/detach poller cannot be active */ 462 assert(nvme_ctrlr->reset_detach_poller == NULL); 463 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 464 nvme_ctrlr, 1000); 465 if (nvme_ctrlr->reset_detach_poller == NULL) { 466 SPDK_ERRLOG("Failed to register detach poller\n"); 467 goto error; 468 } 469 470 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 471 if (rc != 0) { 472 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 473 goto error; 474 } 475 476 return; 477 error: 478 /* We don't have a good way to handle errors here, so just do what we can and delete the 479 * controller without detaching the underlying NVMe device. 
480 */ 481 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 482 _nvme_ctrlr_delete(nvme_ctrlr); 483 } 484 485 static void 486 nvme_ctrlr_unregister_cb(void *io_device) 487 { 488 struct nvme_ctrlr *nvme_ctrlr = io_device; 489 490 nvme_ctrlr_delete(nvme_ctrlr); 491 } 492 493 static void 494 nvme_ctrlr_unregister(void *ctx) 495 { 496 struct nvme_ctrlr *nvme_ctrlr = ctx; 497 498 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 499 } 500 501 static bool 502 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 503 { 504 if (!nvme_ctrlr->destruct) { 505 return false; 506 } 507 508 if (nvme_ctrlr->ref > 0) { 509 return false; 510 } 511 512 if (nvme_ctrlr->resetting) { 513 return false; 514 } 515 516 if (nvme_ctrlr->ana_log_page_updating) { 517 return false; 518 } 519 520 return true; 521 } 522 523 static void 524 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 525 { 526 pthread_mutex_lock(&nvme_ctrlr->mutex); 527 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 528 529 assert(nvme_ctrlr->ref > 0); 530 nvme_ctrlr->ref--; 531 532 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 533 pthread_mutex_unlock(&nvme_ctrlr->mutex); 534 return; 535 } 536 537 pthread_mutex_unlock(&nvme_ctrlr->mutex); 538 539 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 540 } 541 542 static struct nvme_io_path * 543 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 544 { 545 struct nvme_io_path *io_path; 546 547 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 548 if (io_path->nvme_ns == nvme_ns) { 549 break; 550 } 551 } 552 553 return io_path; 554 } 555 556 static int 557 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 558 { 559 struct nvme_io_path *io_path; 560 struct spdk_io_channel *ch; 561 struct nvme_ctrlr_channel *ctrlr_ch; 562 struct nvme_qpair *nvme_qpair; 563 564 io_path = calloc(1, sizeof(*io_path)); 565 if (io_path == NULL) { 566 SPDK_ERRLOG("Failed to alloc io_path.\n"); 567 return -ENOMEM; 568 } 569 570 io_path->nvme_ns = nvme_ns; 571 572 ch = spdk_get_io_channel(nvme_ns->ctrlr); 573 if (ch == NULL) { 574 free(io_path); 575 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 576 return -ENOMEM; 577 } 578 579 ctrlr_ch = spdk_io_channel_get_ctx(ch); 580 581 nvme_qpair = ctrlr_ch->qpair; 582 assert(nvme_qpair != NULL); 583 584 io_path->qpair = nvme_qpair; 585 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 586 587 io_path->nbdev_ch = nbdev_ch; 588 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 589 590 nbdev_ch->current_io_path = NULL; 591 592 return 0; 593 } 594 595 static void 596 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 597 { 598 struct spdk_io_channel *ch; 599 struct nvme_qpair *nvme_qpair; 600 struct nvme_ctrlr_channel *ctrlr_ch; 601 602 nbdev_ch->current_io_path = NULL; 603 604 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 605 606 nvme_qpair = io_path->qpair; 607 assert(nvme_qpair != NULL); 608 609 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 610 611 ctrlr_ch = nvme_qpair->ctrlr_ch; 612 assert(ctrlr_ch != NULL); 613 614 ch = spdk_io_channel_from_ctx(ctrlr_ch); 615 spdk_put_io_channel(ch); 616 617 free(io_path); 618 } 619 620 static void 621 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 622 { 623 struct nvme_io_path *io_path, *tmp_io_path; 624 625 STAILQ_FOREACH_SAFE(io_path, 
&nbdev_ch->io_path_list, stailq, tmp_io_path) { 626 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 627 } 628 } 629 630 static int 631 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 632 { 633 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 634 struct nvme_bdev *nbdev = io_device; 635 struct nvme_ns *nvme_ns; 636 int rc; 637 638 STAILQ_INIT(&nbdev_ch->io_path_list); 639 TAILQ_INIT(&nbdev_ch->retry_io_list); 640 641 pthread_mutex_lock(&nbdev->mutex); 642 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 643 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 644 if (rc != 0) { 645 pthread_mutex_unlock(&nbdev->mutex); 646 647 _bdev_nvme_delete_io_paths(nbdev_ch); 648 return rc; 649 } 650 } 651 pthread_mutex_unlock(&nbdev->mutex); 652 653 return 0; 654 } 655 656 static void 657 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 658 { 659 struct spdk_bdev_io *bdev_io, *tmp_io; 660 661 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 662 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 663 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 664 } 665 666 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 667 } 668 669 static void 670 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 671 { 672 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 673 674 bdev_nvme_abort_retry_ios(nbdev_ch); 675 _bdev_nvme_delete_io_paths(nbdev_ch); 676 } 677 678 static inline bool 679 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 680 { 681 switch (io_type) { 682 case SPDK_BDEV_IO_TYPE_RESET: 683 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 684 case SPDK_BDEV_IO_TYPE_ABORT: 685 return true; 686 default: 687 break; 688 } 689 690 return false; 691 } 692 693 static inline bool 694 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 695 { 696 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 697 return false; 698 } 699 700 switch (nvme_ns->ana_state) { 701 case SPDK_NVME_ANA_OPTIMIZED_STATE: 702 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 703 return true; 704 default: 705 break; 706 } 707 708 return false; 709 } 710 711 static inline bool 712 nvme_io_path_is_connected(struct nvme_io_path *io_path) 713 { 714 if (spdk_unlikely(io_path->qpair->qpair == NULL)) { 715 return false; 716 } 717 718 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) != 719 SPDK_NVME_QPAIR_FAILURE_NONE)) { 720 return false; 721 } 722 723 if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) { 724 return false; 725 } 726 727 if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) != 728 SPDK_NVME_QPAIR_FAILURE_NONE) { 729 return false; 730 } 731 732 return true; 733 } 734 735 static inline bool 736 nvme_io_path_is_available(struct nvme_io_path *io_path) 737 { 738 if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { 739 return false; 740 } 741 742 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 743 return false; 744 } 745 746 return true; 747 } 748 749 static inline bool 750 nvme_io_path_is_failed(struct nvme_io_path *io_path) 751 { 752 struct nvme_ctrlr *nvme_ctrlr; 753 754 nvme_ctrlr = io_path->qpair->ctrlr; 755 756 if (nvme_ctrlr->destruct) { 757 return true; 758 } 759 760 if (nvme_ctrlr->fast_io_fail_timedout) { 761 return true; 762 } 763 764 if (nvme_ctrlr->resetting) { 765 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 766 return false; 767 } else { 768 return true; 769 } 770 } 771 772 if (nvme_ctrlr->reconnect_is_delayed) { 773 return false; 774 } 775 776 if 
(spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 777 return true; 778 } else { 779 return false; 780 } 781 } 782 783 static bool 784 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 785 { 786 if (nvme_ctrlr->destruct) { 787 return false; 788 } 789 790 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 791 return false; 792 } 793 794 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 795 return false; 796 } 797 798 return true; 799 } 800 801 /* Simulate circular linked list. */ 802 static inline struct nvme_io_path * 803 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 804 { 805 struct nvme_io_path *next_path; 806 807 next_path = STAILQ_NEXT(prev_path, stailq); 808 if (next_path != NULL) { 809 return next_path; 810 } else { 811 return STAILQ_FIRST(&nbdev_ch->io_path_list); 812 } 813 } 814 815 static struct nvme_io_path * 816 bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch, 817 struct nvme_io_path *prev) 818 { 819 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 820 821 start = nvme_io_path_get_next(nbdev_ch, prev); 822 823 io_path = start; 824 do { 825 if (spdk_likely(nvme_io_path_is_connected(io_path) && 826 !io_path->nvme_ns->ana_state_updating)) { 827 switch (io_path->nvme_ns->ana_state) { 828 case SPDK_NVME_ANA_OPTIMIZED_STATE: 829 nbdev_ch->current_io_path = io_path; 830 return io_path; 831 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 832 if (non_optimized == NULL) { 833 non_optimized = io_path; 834 } 835 break; 836 default: 837 break; 838 } 839 } 840 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 841 } while (io_path != start); 842 843 /* We come here only if there is no optimized path. Cache even non_optimized 844 * path for load balance across multiple non_optimized paths. 845 */ 846 nbdev_ch->current_io_path = non_optimized; 847 return non_optimized; 848 } 849 850 static struct nvme_io_path * 851 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 852 { 853 struct nvme_io_path *io_path, *non_optimized = NULL; 854 855 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 856 if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { 857 /* The device is currently resetting. */ 858 continue; 859 } 860 861 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 862 continue; 863 } 864 865 switch (io_path->nvme_ns->ana_state) { 866 case SPDK_NVME_ANA_OPTIMIZED_STATE: 867 nbdev_ch->current_io_path = io_path; 868 return io_path; 869 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 870 if (non_optimized == NULL) { 871 non_optimized = io_path; 872 } 873 break; 874 default: 875 break; 876 } 877 } 878 879 return non_optimized; 880 } 881 882 static inline struct nvme_io_path * 883 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 884 { 885 if (spdk_unlikely(nbdev_ch->current_io_path == NULL)) { 886 return _bdev_nvme_find_io_path(nbdev_ch); 887 } 888 889 if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) { 890 return nbdev_ch->current_io_path; 891 } else { 892 return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path); 893 } 894 } 895 896 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 897 * or false otherwise. 898 * 899 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 900 * is likely to be non-accessible now but may become accessible. 
901 * 902 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 903 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 904 * when starting to reset it but it is set to failed when the reset failed. Hence, if 905 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 906 */ 907 static bool 908 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 909 { 910 struct nvme_io_path *io_path; 911 912 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 913 if (io_path->nvme_ns->ana_transition_timedout) { 914 continue; 915 } 916 917 if (nvme_io_path_is_connected(io_path) || 918 !nvme_io_path_is_failed(io_path)) { 919 return true; 920 } 921 } 922 923 return false; 924 } 925 926 static bool 927 any_ctrlr_may_become_available(struct nvme_bdev_channel *nbdev_ch) 928 { 929 struct nvme_io_path *io_path; 930 931 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 932 if (!nvme_io_path_is_failed(io_path)) { 933 return true; 934 } 935 } 936 937 return false; 938 } 939 940 static int 941 bdev_nvme_retry_ios(void *arg) 942 { 943 struct nvme_bdev_channel *nbdev_ch = arg; 944 struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch); 945 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 946 struct nvme_bdev_io *bio; 947 uint64_t now, delay_us; 948 949 now = spdk_get_ticks(); 950 951 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 952 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 953 if (bio->retry_ticks > now) { 954 break; 955 } 956 957 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 958 959 bdev_nvme_submit_request(ch, bdev_io); 960 } 961 962 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 963 964 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 965 if (bdev_io != NULL) { 966 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 967 968 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 969 970 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 971 delay_us); 972 } 973 974 return SPDK_POLLER_BUSY; 975 } 976 977 static void 978 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 979 struct nvme_bdev_io *bio, uint64_t delay_ms) 980 { 981 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 982 struct spdk_bdev_io *tmp_bdev_io; 983 struct nvme_bdev_io *tmp_bio; 984 985 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 986 987 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 988 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 989 990 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 991 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 992 module_link); 993 return; 994 } 995 } 996 997 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 998 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 999 1000 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1001 1002 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1003 delay_ms * 1000ULL); 1004 } 1005 1006 static inline void 1007 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1008 const struct spdk_nvme_cpl *cpl) 1009 { 1010 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1011 struct nvme_bdev_channel *nbdev_ch; 1012 struct nvme_ctrlr *nvme_ctrlr; 1013 const struct spdk_nvme_ctrlr_data *cdata; 1014 uint64_t delay_ms; 1015 1016 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1017 1018 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1019 goto complete; 1020 } 1021 1022 if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 && 1023 bio->retry_count >= g_opts.bdev_retry_count)) { 1024 goto complete; 1025 } 1026 1027 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1028 1029 assert(bio->io_path != NULL); 1030 nvme_ctrlr = bio->io_path->qpair->ctrlr; 1031 1032 if (spdk_nvme_cpl_is_path_error(cpl) || 1033 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1034 !nvme_io_path_is_available(bio->io_path) || 1035 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1036 nbdev_ch->current_io_path = NULL; 1037 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1038 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1039 bio->io_path->nvme_ns->ana_state_updating = true; 1040 } 1041 } 1042 delay_ms = 0; 1043 } else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) { 1044 goto complete; 1045 } else { 1046 bio->retry_count++; 1047 1048 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1049 1050 if (cpl->status.crd != 0) { 1051 delay_ms = cdata->crdt[cpl->status.crd] * 100; 1052 } else { 1053 delay_ms = 0; 1054 } 1055 } 1056 1057 if (any_io_path_may_become_available(nbdev_ch)) { 1058 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1059 return; 1060 } 1061 1062 complete: 1063 bio->retry_count = 0; 1064 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 1065 } 1066 1067 static inline void 1068 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1069 { 1070 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1071 struct nvme_bdev_channel *nbdev_ch; 1072 enum spdk_bdev_io_status io_status; 1073 1074 switch (rc) { 1075 case 0: 1076 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1077 break; 1078 case -ENOMEM: 1079 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1080 break; 1081 case -ENXIO: 1082 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1083 1084 nbdev_ch->current_io_path = NULL; 1085 1086 if (any_io_path_may_become_available(nbdev_ch)) { 1087 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1088 return; 1089 } 1090 1091 /* fallthrough */ 1092 default: 1093 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1094 break; 1095 } 1096 1097 bio->retry_count = 0; 1098 spdk_bdev_io_complete(bdev_io, io_status); 1099 } 1100 1101 static inline void 1102 bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc) 1103 { 1104 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1105 struct nvme_bdev_channel *nbdev_ch; 1106 enum spdk_bdev_io_status io_status; 1107 1108 switch (rc) { 1109 case 0: 1110 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1111 break; 1112 case -ENOMEM: 1113 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1114 break; 1115 case -ENXIO: 1116 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1117 1118 if 
(any_ctrlr_may_become_available(nbdev_ch)) { 1119 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1120 return; 1121 } 1122 1123 /* fallthrough */ 1124 default: 1125 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1126 break; 1127 } 1128 1129 bio->retry_count = 0; 1130 spdk_bdev_io_complete(bdev_io, io_status); 1131 } 1132 1133 static void 1134 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1135 { 1136 struct nvme_io_path *io_path; 1137 1138 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1139 io_path->nbdev_ch->current_io_path = NULL; 1140 } 1141 } 1142 1143 static void 1144 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1145 { 1146 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1147 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1148 1149 assert(ctrlr_ch->qpair != NULL); 1150 1151 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1152 1153 spdk_for_each_channel_continue(i, 0); 1154 } 1155 1156 static void 1157 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr, 1158 spdk_channel_for_each_cpl cpl) 1159 { 1160 spdk_for_each_channel(nvme_ctrlr, 1161 bdev_nvme_clear_io_path_cache, 1162 NULL, 1163 cpl); 1164 } 1165 1166 static struct nvme_qpair * 1167 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1168 { 1169 struct nvme_qpair *nvme_qpair; 1170 1171 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1172 if (nvme_qpair->qpair == qpair) { 1173 break; 1174 } 1175 } 1176 1177 return nvme_qpair; 1178 } 1179 1180 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1181 1182 static void 1183 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1184 { 1185 struct nvme_poll_group *group = poll_group_ctx; 1186 struct nvme_qpair *nvme_qpair; 1187 struct nvme_ctrlr_channel *ctrlr_ch; 1188 1189 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1190 if (nvme_qpair == NULL) { 1191 return; 1192 } 1193 1194 if (nvme_qpair->qpair != NULL) { 1195 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1196 nvme_qpair->qpair = NULL; 1197 } 1198 1199 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1200 1201 ctrlr_ch = nvme_qpair->ctrlr_ch; 1202 1203 if (ctrlr_ch != NULL) { 1204 if (ctrlr_ch->reset_iter != NULL) { 1205 /* If we are already in a full reset sequence, we do not have 1206 * to restart it. Just move to the next ctrlr_channel. 1207 */ 1208 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1209 qpair); 1210 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 1211 ctrlr_ch->reset_iter = NULL; 1212 } else { 1213 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1214 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1215 bdev_nvme_failover(nvme_qpair->ctrlr, false); 1216 } 1217 } else { 1218 /* In this case, ctrlr_channel is already deleted. */ 1219 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1220 nvme_qpair_delete(nvme_qpair); 1221 } 1222 } 1223 1224 static void 1225 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1226 { 1227 struct nvme_qpair *nvme_qpair; 1228 1229 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1230 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1231 continue; 1232 } 1233 1234 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1235 SPDK_NVME_QPAIR_FAILURE_NONE) { 1236 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1237 } 1238 } 1239 } 1240 1241 static int 1242 bdev_nvme_poll(void *arg) 1243 { 1244 struct nvme_poll_group *group = arg; 1245 int64_t num_completions; 1246 1247 if (group->collect_spin_stat && group->start_ticks == 0) { 1248 group->start_ticks = spdk_get_ticks(); 1249 } 1250 1251 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1252 bdev_nvme_disconnected_qpair_cb); 1253 if (group->collect_spin_stat) { 1254 if (num_completions > 0) { 1255 if (group->end_ticks != 0) { 1256 group->spin_ticks += (group->end_ticks - group->start_ticks); 1257 group->end_ticks = 0; 1258 } 1259 group->start_ticks = 0; 1260 } else { 1261 group->end_ticks = spdk_get_ticks(); 1262 } 1263 } 1264 1265 if (spdk_unlikely(num_completions < 0)) { 1266 bdev_nvme_check_io_qpairs(group); 1267 } 1268 1269 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1270 } 1271 1272 static int bdev_nvme_poll_adminq(void *arg); 1273 1274 static void 1275 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1276 { 1277 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1278 1279 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1280 nvme_ctrlr, new_period_us); 1281 } 1282 1283 static int 1284 bdev_nvme_poll_adminq(void *arg) 1285 { 1286 int32_t rc; 1287 struct nvme_ctrlr *nvme_ctrlr = arg; 1288 nvme_ctrlr_disconnected_cb disconnected_cb; 1289 1290 assert(nvme_ctrlr != NULL); 1291 1292 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1293 if (rc < 0) { 1294 disconnected_cb = nvme_ctrlr->disconnected_cb; 1295 nvme_ctrlr->disconnected_cb = NULL; 1296 1297 if (rc == -ENXIO && disconnected_cb != NULL) { 1298 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1299 g_opts.nvme_adminq_poll_period_us); 1300 disconnected_cb(nvme_ctrlr); 1301 } else { 1302 bdev_nvme_failover(nvme_ctrlr, false); 1303 } 1304 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1305 SPDK_NVME_QPAIR_FAILURE_NONE) { 1306 bdev_nvme_clear_io_path_caches(nvme_ctrlr, NULL); 1307 } 1308 1309 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1310 } 1311 1312 static void 1313 _bdev_nvme_unregister_dev_cb(void *io_device) 1314 { 1315 struct nvme_bdev *nvme_disk = io_device; 1316 1317 free(nvme_disk->disk.name); 1318 free(nvme_disk); 1319 } 1320 1321 static int 1322 bdev_nvme_destruct(void *ctx) 1323 { 1324 struct nvme_bdev *nvme_disk = ctx; 1325 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1326 1327 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1328 1329 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1330 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1331 1332 nvme_ns->bdev = NULL; 1333 1334 assert(nvme_ns->id > 0); 1335 1336 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1337 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1338 1339 nvme_ctrlr_release(nvme_ns->ctrlr); 1340 free(nvme_ns); 1341 } else { 1342 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1343 } 1344 } 1345 1346 pthread_mutex_lock(&g_bdev_nvme_mutex); 1347 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1348 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1349 1350 spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb); 1351 1352 return 0; 1353 } 1354 1355 static int 1356 bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 1357 { 1358 bdev_nvme_io_complete(bio, 0); 1359 1360 return 0; 1361 } 1362 1363 static int 1364 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1365 { 1366 struct nvme_ctrlr *nvme_ctrlr; 1367 struct spdk_nvme_io_qpair_opts opts; 1368 struct spdk_nvme_qpair *qpair; 1369 int rc; 1370 1371 nvme_ctrlr = nvme_qpair->ctrlr; 1372 1373 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1374 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1375 opts.create_only = true; 1376 opts.async_mode = true; 1377 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1378 g_opts.io_queue_requests = opts.io_queue_requests; 1379 1380 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1381 if (qpair == NULL) { 1382 return -1; 1383 } 1384 1385 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1386 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1387 1388 assert(nvme_qpair->group != NULL); 1389 1390 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1391 if (rc != 0) { 1392 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1393 goto err; 1394 } 1395 1396 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1397 if (rc != 0) { 1398 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1399 goto err; 1400 } 1401 1402 nvme_qpair->qpair = qpair; 1403 1404 if (!g_opts.disable_auto_failback) { 1405 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1406 } 1407 1408 return 0; 1409 1410 err: 1411 spdk_nvme_ctrlr_free_io_qpair(qpair); 1412 1413 return rc; 1414 } 1415 1416 static void 1417 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1418 { 1419 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1420 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1421 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1422 struct spdk_bdev_io *bdev_io; 1423 1424 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1425 status = SPDK_BDEV_IO_STATUS_FAILED; 1426 } 1427 1428 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1429 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1430 
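/*
 * Illustrative sketch, not compiled: bdev_nvme_complete_pending_resets() here
 * follows the same spdk_for_each_channel() pattern this file uses for every
 * per-channel operation (clearing I/O path caches, destroying and creating
 * qpairs, completing resets).  The placeholder functions below only show the
 * shape of that pattern for an nvme_ctrlr io_device.
 */
#if 0
static void
example_per_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	/* Runs once per nvme_ctrlr_channel, on the thread that owns it. */
	(void)ctrlr_ch;

	/* Hand control back so the iterator moves on to the next channel. */
	spdk_for_each_channel_continue(i, 0);
}

static void
example_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	/* Runs once, after every channel has been visited. */
	(void)nvme_ctrlr;
	(void)status;
}

static void
example_start(struct nvme_ctrlr *nvme_ctrlr)
{
	/* nvme_ctrlr is the io_device its ctrlr channels were registered with. */
	spdk_for_each_channel(nvme_ctrlr, example_per_channel, NULL, example_done);
}
#endif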
TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1431 spdk_bdev_io_complete(bdev_io, status); 1432 } 1433 1434 spdk_for_each_channel_continue(i, 0); 1435 } 1436 1437 static void 1438 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1439 { 1440 struct nvme_path_id *path_id, *next_path; 1441 int rc __attribute__((unused)); 1442 1443 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1444 assert(path_id); 1445 assert(path_id == nvme_ctrlr->active_path_id); 1446 next_path = TAILQ_NEXT(path_id, link); 1447 1448 path_id->is_failed = true; 1449 1450 if (next_path) { 1451 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1452 1453 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1454 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1455 1456 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1457 nvme_ctrlr->active_path_id = next_path; 1458 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1459 assert(rc == 0); 1460 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1461 if (!remove) { 1462 /** Shuffle the old trid to the end of the list and use the new one. 1463 * Allows for round robin through multiple connections. 1464 */ 1465 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1466 } else { 1467 free(path_id); 1468 } 1469 } 1470 } 1471 1472 static bool 1473 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1474 { 1475 int32_t elapsed; 1476 1477 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1478 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1479 return false; 1480 } 1481 1482 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1483 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1484 return true; 1485 } else { 1486 return false; 1487 } 1488 } 1489 1490 static bool 1491 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1492 { 1493 uint32_t elapsed; 1494 1495 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1496 return false; 1497 } 1498 1499 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1500 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1501 return true; 1502 } else { 1503 return false; 1504 } 1505 } 1506 1507 static void 1508 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1509 { 1510 int rc __attribute__((unused)); 1511 1512 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1513 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1514 */ 1515 assert(nvme_ctrlr->disconnected_cb == NULL); 1516 nvme_ctrlr->disconnected_cb = cb_fn; 1517 1518 /* Disconnect fails if ctrlr is already resetting or removed. Both cases are 1519 * not possible. Reset is controlled and the callback to hot remove is called 1520 * when ctrlr is hot removed. 1521 */ 1522 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1523 assert(rc == 0); 1524 1525 /* During disconnection, reduce the period to poll adminq more often. 
*/ 1526 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1527 } 1528 1529 enum bdev_nvme_op_after_reset { 1530 OP_NONE, 1531 OP_COMPLETE_PENDING_DESTRUCT, 1532 OP_DESTRUCT, 1533 OP_DELAYED_RECONNECT, 1534 }; 1535 1536 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1537 1538 static _bdev_nvme_op_after_reset 1539 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1540 { 1541 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1542 /* Complete pending destruct after reset completes. */ 1543 return OP_COMPLETE_PENDING_DESTRUCT; 1544 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1545 nvme_ctrlr->reset_start_tsc = 0; 1546 return OP_NONE; 1547 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1548 return OP_DESTRUCT; 1549 } else { 1550 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1551 nvme_ctrlr->fast_io_fail_timedout = true; 1552 } 1553 bdev_nvme_failover_trid(nvme_ctrlr, false); 1554 return OP_DELAYED_RECONNECT; 1555 } 1556 } 1557 1558 static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1559 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1560 1561 static int 1562 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1563 { 1564 struct nvme_ctrlr *nvme_ctrlr = ctx; 1565 1566 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1567 pthread_mutex_lock(&nvme_ctrlr->mutex); 1568 1569 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1570 1571 assert(nvme_ctrlr->reconnect_is_delayed == true); 1572 nvme_ctrlr->reconnect_is_delayed = false; 1573 1574 if (nvme_ctrlr->destruct) { 1575 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1576 return SPDK_POLLER_BUSY; 1577 } 1578 1579 assert(nvme_ctrlr->resetting == false); 1580 nvme_ctrlr->resetting = true; 1581 1582 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1583 1584 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1585 1586 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1587 return SPDK_POLLER_BUSY; 1588 } 1589 1590 static void 1591 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1592 { 1593 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1594 1595 assert(nvme_ctrlr->reconnect_is_delayed == false); 1596 nvme_ctrlr->reconnect_is_delayed = true; 1597 1598 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1599 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1600 nvme_ctrlr, 1601 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 1602 } 1603 1604 static void 1605 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status) 1606 { 1607 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1608 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 1609 struct nvme_path_id *path_id; 1610 bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn; 1611 void *reset_cb_arg = nvme_ctrlr->reset_cb_arg; 1612 enum bdev_nvme_op_after_reset op_after_reset; 1613 1614 assert(nvme_ctrlr->thread == spdk_get_thread()); 1615 1616 nvme_ctrlr->reset_cb_fn = NULL; 1617 nvme_ctrlr->reset_cb_arg = NULL; 1618 1619 if (!success) { 1620 SPDK_ERRLOG("Resetting controller failed.\n"); 1621 } else { 1622 SPDK_NOTICELOG("Resetting controller successful.\n"); 1623 } 1624 1625 pthread_mutex_lock(&nvme_ctrlr->mutex); 1626 nvme_ctrlr->resetting = false; 1627 1628 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1629 assert(path_id != NULL); 1630 assert(path_id == nvme_ctrlr->active_path_id); 1631 1632 path_id->is_failed = !success; 
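/*
 * Worked example, illustrative only: with opts.reconnect_delay_sec = 2,
 * opts.fast_io_fail_timeout_sec = 10 and opts.ctrlr_loss_timeout_sec = 30,
 * a controller that keeps failing to reconnect is retried roughly every
 * 2 seconds (OP_DELAYED_RECONNECT); once about 10 seconds have elapsed since
 * reset_start_tsc, fast_io_fail_timedout is set and queued I/O starts failing
 * instead of being retried; once about 30 seconds have elapsed,
 * bdev_nvme_check_op_after_reset() returns OP_DESTRUCT and the controller is
 * deleted.  The hypothetical helper below uses the same tick arithmetic as
 * bdev_nvme_check_ctrlr_loss_timeout() above.
 */
#if 0
static int32_t
example_ctrlr_loss_seconds_left(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec <= 0) {
		/* 0 and -1 are special values handled by the checks above. */
		return -1;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();

	return nvme_ctrlr->opts.ctrlr_loss_timeout_sec - elapsed;
}
#endif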
1633 1634 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 1635 1636 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1637 1638 if (reset_cb_fn) { 1639 reset_cb_fn(reset_cb_arg, success); 1640 } 1641 1642 switch (op_after_reset) { 1643 case OP_COMPLETE_PENDING_DESTRUCT: 1644 nvme_ctrlr_unregister(nvme_ctrlr); 1645 break; 1646 case OP_DESTRUCT: 1647 _bdev_nvme_delete(nvme_ctrlr, false); 1648 break; 1649 case OP_DELAYED_RECONNECT: 1650 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 1651 break; 1652 default: 1653 break; 1654 } 1655 } 1656 1657 static void 1658 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 1659 { 1660 /* Make sure we clear any pending resets before returning. */ 1661 spdk_for_each_channel(nvme_ctrlr, 1662 bdev_nvme_complete_pending_resets, 1663 success ? NULL : (void *)0x1, 1664 _bdev_nvme_reset_complete); 1665 } 1666 1667 static void 1668 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 1669 { 1670 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1671 1672 bdev_nvme_reset_complete(nvme_ctrlr, false); 1673 } 1674 1675 static void 1676 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 1677 { 1678 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1679 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 1680 struct nvme_qpair *nvme_qpair; 1681 1682 nvme_qpair = ctrlr_ch->qpair; 1683 assert(nvme_qpair != NULL); 1684 1685 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1686 1687 if (nvme_qpair->qpair != NULL) { 1688 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 1689 1690 /* The current full reset sequence will move to the next 1691 * ctrlr_channel after the qpair is actually disconnected. 1692 */ 1693 assert(ctrlr_ch->reset_iter == NULL); 1694 ctrlr_ch->reset_iter = i; 1695 } else { 1696 spdk_for_each_channel_continue(i, 0); 1697 } 1698 } 1699 1700 static void 1701 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 1702 { 1703 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1704 1705 if (status == 0) { 1706 bdev_nvme_reset_complete(nvme_ctrlr, true); 1707 } else { 1708 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 1709 spdk_for_each_channel(nvme_ctrlr, 1710 bdev_nvme_reset_destroy_qpair, 1711 NULL, 1712 bdev_nvme_reset_create_qpairs_failed); 1713 } 1714 } 1715 1716 static void 1717 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 1718 { 1719 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1720 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1721 int rc; 1722 1723 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 1724 1725 spdk_for_each_channel_continue(i, rc); 1726 } 1727 1728 static int 1729 bdev_nvme_reconnect_ctrlr_poll(void *arg) 1730 { 1731 struct nvme_ctrlr *nvme_ctrlr = arg; 1732 int rc = -ETIMEDOUT; 1733 1734 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1735 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 1736 if (rc == -EAGAIN) { 1737 return SPDK_POLLER_BUSY; 1738 } 1739 } 1740 1741 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 1742 if (rc == 0) { 1743 /* Recreate all of the I/O queue pairs */ 1744 spdk_for_each_channel(nvme_ctrlr, 1745 bdev_nvme_reset_create_qpair, 1746 NULL, 1747 bdev_nvme_reset_create_qpairs_done); 1748 } else { 1749 bdev_nvme_reset_complete(nvme_ctrlr, false); 1750 } 1751 return SPDK_POLLER_BUSY; 1752 } 1753 1754 static void 1755 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 1756 { 1757 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 1758 1759 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 1760 assert(nvme_ctrlr->reset_detach_poller == NULL); 1761 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 1762 nvme_ctrlr, 0); 1763 } 1764 1765 static void 1766 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 1767 { 1768 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1769 1770 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 1771 assert(status == 0); 1772 1773 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 1774 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1775 } else { 1776 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 1777 } 1778 } 1779 1780 static void 1781 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 1782 { 1783 spdk_for_each_channel(nvme_ctrlr, 1784 bdev_nvme_reset_destroy_qpair, 1785 NULL, 1786 bdev_nvme_reset_ctrlr); 1787 } 1788 1789 static void 1790 _bdev_nvme_reset(void *ctx) 1791 { 1792 struct nvme_ctrlr *nvme_ctrlr = ctx; 1793 1794 assert(nvme_ctrlr->resetting == true); 1795 assert(nvme_ctrlr->thread == spdk_get_thread()); 1796 1797 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 1798 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 1799 } else { 1800 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 1801 } 1802 } 1803 1804 static int 1805 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) 1806 { 1807 pthread_mutex_lock(&nvme_ctrlr->mutex); 1808 if (nvme_ctrlr->destruct) { 1809 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1810 return -ENXIO; 1811 } 1812 1813 if (nvme_ctrlr->resetting) { 1814 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1815 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 1816 return -EBUSY; 1817 } 1818 1819 if (nvme_ctrlr->reconnect_is_delayed) { 1820 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1821 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 1822 return -EBUSY; 1823 } 1824 1825 nvme_ctrlr->resetting = true; 1826 1827 assert(nvme_ctrlr->reset_start_tsc == 0); 1828 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 1829 1830 
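/*
 * Illustrative sketch, not compiled: how an RPC-style caller might drive this
 * reset path through bdev_nvme_reset_rpc(), defined just below.  The function
 * names are placeholders; a real RPC handler would complete the JSON-RPC
 * request from the callback instead of just logging.
 */
#if 0
static void
example_reset_done(void *cb_arg, bool success)
{
	/* Called on the nvme_ctrlr's thread once the reset sequence finishes. */
	SPDK_NOTICELOG("reset %s\n", success ? "succeeded" : "failed");
	(void)cb_arg;
}

static int
example_reset_by_name(const char *name)
{
	struct nvme_ctrlr *nvme_ctrlr;

	nvme_ctrlr = nvme_ctrlr_get_by_name(name);
	if (nvme_ctrlr == NULL) {
		return -ENODEV;
	}

	/* Returns -EBUSY if a reset or a delayed reconnect is already pending. */
	return bdev_nvme_reset_rpc(nvme_ctrlr, example_reset_done, NULL);
}
#endif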
pthread_mutex_unlock(&nvme_ctrlr->mutex); 1831 1832 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 1833 return 0; 1834 } 1835 1836 int 1837 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) 1838 { 1839 int rc; 1840 1841 rc = bdev_nvme_reset(nvme_ctrlr); 1842 if (rc == 0) { 1843 nvme_ctrlr->reset_cb_fn = cb_fn; 1844 nvme_ctrlr->reset_cb_arg = cb_arg; 1845 } 1846 return rc; 1847 } 1848 1849 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 1850 1851 static void 1852 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 1853 { 1854 enum spdk_bdev_io_status io_status; 1855 1856 if (bio->cpl.cdw0 == 0) { 1857 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1858 } else { 1859 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1860 } 1861 1862 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status); 1863 } 1864 1865 static void 1866 _bdev_nvme_reset_io_continue(void *ctx) 1867 { 1868 struct nvme_bdev_io *bio = ctx; 1869 struct nvme_io_path *prev_io_path, *next_io_path; 1870 int rc; 1871 1872 prev_io_path = bio->io_path; 1873 bio->io_path = NULL; 1874 1875 if (bio->cpl.cdw0 != 0) { 1876 goto complete; 1877 } 1878 1879 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 1880 if (next_io_path == NULL) { 1881 goto complete; 1882 } 1883 1884 rc = _bdev_nvme_reset_io(next_io_path, bio); 1885 if (rc == 0) { 1886 return; 1887 } 1888 1889 bio->cpl.cdw0 = 1; 1890 1891 complete: 1892 bdev_nvme_reset_io_complete(bio); 1893 } 1894 1895 static void 1896 bdev_nvme_reset_io_continue(void *cb_arg, bool success) 1897 { 1898 struct nvme_bdev_io *bio = cb_arg; 1899 1900 bio->cpl.cdw0 = !success; 1901 1902 spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); 1903 } 1904 1905 static int 1906 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 1907 { 1908 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1909 struct nvme_ctrlr_channel *ctrlr_ch; 1910 struct spdk_bdev_io *bdev_io; 1911 int rc; 1912 1913 rc = bdev_nvme_reset(nvme_ctrlr); 1914 if (rc == 0) { 1915 assert(bio->io_path == NULL); 1916 bio->io_path = io_path; 1917 1918 assert(nvme_ctrlr->reset_cb_fn == NULL); 1919 assert(nvme_ctrlr->reset_cb_arg == NULL); 1920 nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; 1921 nvme_ctrlr->reset_cb_arg = bio; 1922 } else if (rc == -EBUSY) { 1923 ctrlr_ch = io_path->qpair->ctrlr_ch; 1924 assert(ctrlr_ch != NULL); 1925 /* 1926 * Reset call is queued only if it is from the app framework. This is on purpose so that 1927 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 1928 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 1929 */ 1930 bdev_io = spdk_bdev_io_from_ctx(bio); 1931 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 1932 } else { 1933 return rc; 1934 } 1935 1936 return 0; 1937 } 1938 1939 static void 1940 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 1941 { 1942 struct nvme_io_path *io_path; 1943 int rc; 1944 1945 bio->cpl.cdw0 = 0; 1946 bio->orig_thread = spdk_get_thread(); 1947 1948 /* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now. 1949 * 1950 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially. 1951 * This will be done in the following patches. 
1952 */ 1953 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 1954 assert(io_path != NULL); 1955 1956 rc = _bdev_nvme_reset_io(io_path, bio); 1957 if (rc != 0) { 1958 bio->cpl.cdw0 = 1; 1959 bdev_nvme_reset_io_complete(bio); 1960 } 1961 } 1962 1963 static int 1964 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1965 { 1966 pthread_mutex_lock(&nvme_ctrlr->mutex); 1967 if (nvme_ctrlr->destruct) { 1968 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1969 /* Don't bother resetting if the controller is in the process of being destructed. */ 1970 return -ENXIO; 1971 } 1972 1973 if (nvme_ctrlr->resetting) { 1974 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1975 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 1976 return -EBUSY; 1977 } 1978 1979 bdev_nvme_failover_trid(nvme_ctrlr, remove); 1980 1981 if (nvme_ctrlr->reconnect_is_delayed) { 1982 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1983 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 1984 1985 /* We rely on the next reconnect for the failover. */ 1986 return 0; 1987 } 1988 1989 nvme_ctrlr->resetting = true; 1990 1991 assert(nvme_ctrlr->reset_start_tsc == 0); 1992 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 1993 1994 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1995 1996 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 1997 return 0; 1998 } 1999 2000 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2001 uint64_t num_blocks); 2002 2003 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2004 uint64_t num_blocks); 2005 2006 static void 2007 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2008 bool success) 2009 { 2010 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2011 struct spdk_bdev *bdev = bdev_io->bdev; 2012 int ret; 2013 2014 if (!success) { 2015 ret = -EINVAL; 2016 goto exit; 2017 } 2018 2019 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2020 ret = -ENXIO; 2021 goto exit; 2022 } 2023 2024 ret = bdev_nvme_readv(bio, 2025 bdev_io->u.bdev.iovs, 2026 bdev_io->u.bdev.iovcnt, 2027 bdev_io->u.bdev.md_buf, 2028 bdev_io->u.bdev.num_blocks, 2029 bdev_io->u.bdev.offset_blocks, 2030 bdev->dif_check_flags, 2031 bdev_io->u.bdev.ext_opts); 2032 2033 exit: 2034 if (spdk_unlikely(ret != 0)) { 2035 bdev_nvme_io_complete(bio, ret); 2036 } 2037 } 2038 2039 static void 2040 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2041 { 2042 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2043 struct spdk_bdev *bdev = bdev_io->bdev; 2044 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2045 struct nvme_bdev_io *nbdev_io_to_abort; 2046 int rc = 0; 2047 2048 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 2049 if (spdk_unlikely(!nbdev_io->io_path)) { 2050 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 2051 rc = -ENXIO; 2052 goto exit; 2053 } 2054 2055 /* Admin commands do not use the optimal I/O path. 2056 * Simply fall through even if it is not found. 
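		 * bdev_nvme_admin_passthru() is expected to select a usable controller on
		 * its own, so a missing I/O path is not treated as an error for admin
		 * commands.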
2057 */ 2058 } 2059 2060 switch (bdev_io->type) { 2061 case SPDK_BDEV_IO_TYPE_READ: 2062 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2063 rc = bdev_nvme_readv(nbdev_io, 2064 bdev_io->u.bdev.iovs, 2065 bdev_io->u.bdev.iovcnt, 2066 bdev_io->u.bdev.md_buf, 2067 bdev_io->u.bdev.num_blocks, 2068 bdev_io->u.bdev.offset_blocks, 2069 bdev->dif_check_flags, 2070 bdev_io->u.bdev.ext_opts); 2071 } else { 2072 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2073 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2074 rc = 0; 2075 } 2076 break; 2077 case SPDK_BDEV_IO_TYPE_WRITE: 2078 rc = bdev_nvme_writev(nbdev_io, 2079 bdev_io->u.bdev.iovs, 2080 bdev_io->u.bdev.iovcnt, 2081 bdev_io->u.bdev.md_buf, 2082 bdev_io->u.bdev.num_blocks, 2083 bdev_io->u.bdev.offset_blocks, 2084 bdev->dif_check_flags, 2085 bdev_io->u.bdev.ext_opts); 2086 break; 2087 case SPDK_BDEV_IO_TYPE_COMPARE: 2088 rc = bdev_nvme_comparev(nbdev_io, 2089 bdev_io->u.bdev.iovs, 2090 bdev_io->u.bdev.iovcnt, 2091 bdev_io->u.bdev.md_buf, 2092 bdev_io->u.bdev.num_blocks, 2093 bdev_io->u.bdev.offset_blocks, 2094 bdev->dif_check_flags); 2095 break; 2096 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2097 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2098 bdev_io->u.bdev.iovs, 2099 bdev_io->u.bdev.iovcnt, 2100 bdev_io->u.bdev.fused_iovs, 2101 bdev_io->u.bdev.fused_iovcnt, 2102 bdev_io->u.bdev.md_buf, 2103 bdev_io->u.bdev.num_blocks, 2104 bdev_io->u.bdev.offset_blocks, 2105 bdev->dif_check_flags); 2106 break; 2107 case SPDK_BDEV_IO_TYPE_UNMAP: 2108 rc = bdev_nvme_unmap(nbdev_io, 2109 bdev_io->u.bdev.offset_blocks, 2110 bdev_io->u.bdev.num_blocks); 2111 break; 2112 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2113 rc = bdev_nvme_write_zeroes(nbdev_io, 2114 bdev_io->u.bdev.offset_blocks, 2115 bdev_io->u.bdev.num_blocks); 2116 break; 2117 case SPDK_BDEV_IO_TYPE_RESET: 2118 nbdev_io->io_path = NULL; 2119 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2120 break; 2121 case SPDK_BDEV_IO_TYPE_FLUSH: 2122 rc = bdev_nvme_flush(nbdev_io, 2123 bdev_io->u.bdev.offset_blocks, 2124 bdev_io->u.bdev.num_blocks); 2125 break; 2126 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2127 rc = bdev_nvme_zone_appendv(nbdev_io, 2128 bdev_io->u.bdev.iovs, 2129 bdev_io->u.bdev.iovcnt, 2130 bdev_io->u.bdev.md_buf, 2131 bdev_io->u.bdev.num_blocks, 2132 bdev_io->u.bdev.offset_blocks, 2133 bdev->dif_check_flags); 2134 break; 2135 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2136 rc = bdev_nvme_get_zone_info(nbdev_io, 2137 bdev_io->u.zone_mgmt.zone_id, 2138 bdev_io->u.zone_mgmt.num_zones, 2139 bdev_io->u.zone_mgmt.buf); 2140 break; 2141 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2142 rc = bdev_nvme_zone_management(nbdev_io, 2143 bdev_io->u.zone_mgmt.zone_id, 2144 bdev_io->u.zone_mgmt.zone_action); 2145 break; 2146 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2147 nbdev_io->io_path = NULL; 2148 bdev_nvme_admin_passthru(nbdev_ch, 2149 nbdev_io, 2150 &bdev_io->u.nvme_passthru.cmd, 2151 bdev_io->u.nvme_passthru.buf, 2152 bdev_io->u.nvme_passthru.nbytes); 2153 break; 2154 case SPDK_BDEV_IO_TYPE_NVME_IO: 2155 rc = bdev_nvme_io_passthru(nbdev_io, 2156 &bdev_io->u.nvme_passthru.cmd, 2157 bdev_io->u.nvme_passthru.buf, 2158 bdev_io->u.nvme_passthru.nbytes); 2159 break; 2160 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2161 rc = bdev_nvme_io_passthru_md(nbdev_io, 2162 &bdev_io->u.nvme_passthru.cmd, 2163 bdev_io->u.nvme_passthru.buf, 2164 bdev_io->u.nvme_passthru.nbytes, 2165 bdev_io->u.nvme_passthru.md_buf, 2166 bdev_io->u.nvme_passthru.md_len); 2167 break; 2168 case SPDK_BDEV_IO_TYPE_ABORT: 2169 nbdev_io->io_path = NULL; 2170 
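		/* Translate the bdev_io being aborted into its nvme_bdev_io context;
		 * bdev_nvme_abort() then tries to match it against requests outstanding
		 * on this channel's I/O paths (or on the admin queue for admin commands).
		 */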
nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2171 bdev_nvme_abort(nbdev_ch, 2172 nbdev_io, 2173 nbdev_io_to_abort); 2174 break; 2175 default: 2176 rc = -EINVAL; 2177 break; 2178 } 2179 2180 exit: 2181 if (spdk_unlikely(rc != 0)) { 2182 bdev_nvme_io_complete(nbdev_io, rc); 2183 } 2184 } 2185 2186 static bool 2187 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 2188 { 2189 struct nvme_bdev *nbdev = ctx; 2190 struct nvme_ns *nvme_ns; 2191 struct spdk_nvme_ns *ns; 2192 struct spdk_nvme_ctrlr *ctrlr; 2193 const struct spdk_nvme_ctrlr_data *cdata; 2194 2195 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2196 assert(nvme_ns != NULL); 2197 ns = nvme_ns->ns; 2198 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2199 2200 switch (io_type) { 2201 case SPDK_BDEV_IO_TYPE_READ: 2202 case SPDK_BDEV_IO_TYPE_WRITE: 2203 case SPDK_BDEV_IO_TYPE_RESET: 2204 case SPDK_BDEV_IO_TYPE_FLUSH: 2205 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2206 case SPDK_BDEV_IO_TYPE_NVME_IO: 2207 case SPDK_BDEV_IO_TYPE_ABORT: 2208 return true; 2209 2210 case SPDK_BDEV_IO_TYPE_COMPARE: 2211 return spdk_nvme_ns_supports_compare(ns); 2212 2213 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2214 return spdk_nvme_ns_get_md_size(ns) ? true : false; 2215 2216 case SPDK_BDEV_IO_TYPE_UNMAP: 2217 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2218 return cdata->oncs.dsm; 2219 2220 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2221 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2222 return cdata->oncs.write_zeroes; 2223 2224 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2225 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2226 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2227 return true; 2228 } 2229 return false; 2230 2231 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2232 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2233 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2234 2235 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2236 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2237 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2238 2239 default: 2240 return false; 2241 } 2242 } 2243 2244 static int 2245 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 2246 { 2247 struct nvme_qpair *nvme_qpair; 2248 struct spdk_io_channel *pg_ch; 2249 int rc; 2250 2251 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 2252 if (!nvme_qpair) { 2253 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 2254 return -1; 2255 } 2256 2257 TAILQ_INIT(&nvme_qpair->io_path_list); 2258 2259 nvme_qpair->ctrlr = nvme_ctrlr; 2260 nvme_qpair->ctrlr_ch = ctrlr_ch; 2261 2262 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2263 if (!pg_ch) { 2264 free(nvme_qpair); 2265 return -1; 2266 } 2267 2268 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 2269 2270 #ifdef SPDK_CONFIG_VTUNE 2271 nvme_qpair->group->collect_spin_stat = true; 2272 #else 2273 nvme_qpair->group->collect_spin_stat = false; 2274 #endif 2275 2276 rc = bdev_nvme_create_qpair(nvme_qpair); 2277 if (rc != 0) { 2278 /* nvme_ctrlr can't create IO qpair if connection is down. If nvme_ctrlr is 2279 * being reset or scheduled to reconnect later, ignore this failure. 2280 * Then IO qpair will be created later when reconnect completes. 
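		 * (When the reconnect succeeds, bdev_nvme_reset_create_qpair() runs on
		 * every channel and recreates this qpair.)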
2281 * If the user submits IO requests in the meantime, they will be queued and 2282 * resubmitted later */ 2283 if (!nvme_ctrlr->resetting && !nvme_ctrlr->reconnect_is_delayed) { 2284 spdk_put_io_channel(pg_ch); 2285 free(nvme_qpair); 2286 return rc; 2287 } 2288 } 2289 2290 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2291 2292 ctrlr_ch->qpair = nvme_qpair; 2293 2294 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 2295 nvme_qpair->ctrlr->ref++; 2296 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 2297 2298 return 0; 2299 } 2300 2301 static int 2302 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2303 { 2304 struct nvme_ctrlr *nvme_ctrlr = io_device; 2305 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2306 2307 TAILQ_INIT(&ctrlr_ch->pending_resets); 2308 2309 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 2310 } 2311 2312 static void 2313 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 2314 { 2315 assert(nvme_qpair->group != NULL); 2316 2317 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2318 2319 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 2320 2321 nvme_ctrlr_release(nvme_qpair->ctrlr); 2322 2323 free(nvme_qpair); 2324 } 2325 2326 static void 2327 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2328 { 2329 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2330 struct nvme_qpair *nvme_qpair; 2331 2332 nvme_qpair = ctrlr_ch->qpair; 2333 assert(nvme_qpair != NULL); 2334 2335 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2336 2337 if (nvme_qpair->qpair != NULL) { 2338 if (ctrlr_ch->reset_iter == NULL) { 2339 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2340 } else { 2341 /* Skip current ctrlr_channel in a full reset sequence because 2342 * it is being deleted now. The qpair is already being disconnected. 2343 * We do not have to restart disconnecting it. 2344 */ 2345 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2346 } 2347 2348 /* We cannot release a reference to the poll group now. 2349 * The qpair may be disconnected asynchronously later. 2350 * We need to poll it until it is actually disconnected. 2351 * Just detach the qpair from the deleting ctrlr_channel. 
2352 */ 2353 nvme_qpair->ctrlr_ch = NULL; 2354 } else { 2355 assert(ctrlr_ch->reset_iter == NULL); 2356 2357 nvme_qpair_delete(nvme_qpair); 2358 } 2359 } 2360 2361 static void 2362 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2363 uint32_t iov_cnt, uint32_t seed, 2364 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2365 { 2366 struct nvme_poll_group *group = ctx; 2367 int rc; 2368 2369 assert(group->accel_channel != NULL); 2370 assert(cb_fn != NULL); 2371 2372 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2373 if (rc) { 2374 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2375 if (rc == -ENOMEM || rc == -EINVAL) { 2376 cb_fn(cb_arg, rc); 2377 } 2378 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2379 } 2380 } 2381 2382 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2383 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2384 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2385 }; 2386 2387 static int 2388 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2389 { 2390 struct nvme_poll_group *group = ctx_buf; 2391 2392 TAILQ_INIT(&group->qpair_list); 2393 2394 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2395 if (group->group == NULL) { 2396 return -1; 2397 } 2398 2399 group->accel_channel = spdk_accel_engine_get_io_channel(); 2400 if (!group->accel_channel) { 2401 spdk_nvme_poll_group_destroy(group->group); 2402 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2403 group); 2404 return -1; 2405 } 2406 2407 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2408 2409 if (group->poller == NULL) { 2410 spdk_put_io_channel(group->accel_channel); 2411 spdk_nvme_poll_group_destroy(group->group); 2412 return -1; 2413 } 2414 2415 return 0; 2416 } 2417 2418 static void 2419 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2420 { 2421 struct nvme_poll_group *group = ctx_buf; 2422 2423 assert(TAILQ_EMPTY(&group->qpair_list)); 2424 2425 if (group->accel_channel) { 2426 spdk_put_io_channel(group->accel_channel); 2427 } 2428 2429 spdk_poller_unregister(&group->poller); 2430 if (spdk_nvme_poll_group_destroy(group->group)) { 2431 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2432 assert(false); 2433 } 2434 } 2435 2436 static struct spdk_io_channel * 2437 bdev_nvme_get_io_channel(void *ctx) 2438 { 2439 struct nvme_bdev *nvme_bdev = ctx; 2440 2441 return spdk_get_io_channel(nvme_bdev); 2442 } 2443 2444 static void * 2445 bdev_nvme_get_module_ctx(void *ctx) 2446 { 2447 struct nvme_bdev *nvme_bdev = ctx; 2448 struct nvme_ns *nvme_ns; 2449 2450 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2451 return NULL; 2452 } 2453 2454 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2455 if (!nvme_ns) { 2456 return NULL; 2457 } 2458 2459 return nvme_ns->ns; 2460 } 2461 2462 static const char * 2463 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2464 { 2465 switch (ana_state) { 2466 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2467 return "optimized"; 2468 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2469 return "non_optimized"; 2470 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2471 return "inaccessible"; 2472 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2473 return "persistent_loss"; 2474 case SPDK_NVME_ANA_CHANGE_STATE: 2475 return "change"; 2476 default: 2477 return NULL; 2478 } 2479 } 
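/* Report the memory domains of the controller backing this bdev so that upper
 * layers can decide how to handle extended I/O buffers. The controller of the
 * first namespace in the list is queried as the representative path.
 */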
2480 2481 static int 2482 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 2483 { 2484 struct nvme_bdev *nbdev = ctx; 2485 struct nvme_ns *nvme_ns; 2486 2487 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2488 assert(nvme_ns != NULL); 2489 2490 return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size); 2491 } 2492 2493 static const char * 2494 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 2495 { 2496 if (nvme_ctrlr->destruct) { 2497 return "deleting"; 2498 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 2499 return "failed"; 2500 } else if (nvme_ctrlr->resetting) { 2501 return "resetting"; 2502 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 2503 return "reconnect_is_delayed"; 2504 } else { 2505 return "enabled"; 2506 } 2507 } 2508 2509 void 2510 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 2511 { 2512 struct spdk_nvme_transport_id *trid; 2513 const struct spdk_nvme_ctrlr_opts *opts; 2514 const struct spdk_nvme_ctrlr_data *cdata; 2515 2516 spdk_json_write_object_begin(w); 2517 2518 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 2519 2520 #ifdef SPDK_CONFIG_NVME_CUSE 2521 size_t cuse_name_size = 128; 2522 char cuse_name[cuse_name_size]; 2523 2524 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 2525 if (rc == 0) { 2526 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2527 } 2528 #endif 2529 trid = &nvme_ctrlr->active_path_id->trid; 2530 spdk_json_write_named_object_begin(w, "trid"); 2531 nvme_bdev_dump_trid_json(trid, w); 2532 spdk_json_write_object_end(w); 2533 2534 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 2535 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2536 2537 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 2538 spdk_json_write_named_object_begin(w, "host"); 2539 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 2540 spdk_json_write_named_string(w, "addr", opts->src_addr); 2541 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 2542 spdk_json_write_object_end(w); 2543 2544 spdk_json_write_object_end(w); 2545 } 2546 2547 static void 2548 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2549 struct nvme_ns *nvme_ns) 2550 { 2551 struct spdk_nvme_ns *ns; 2552 struct spdk_nvme_ctrlr *ctrlr; 2553 const struct spdk_nvme_ctrlr_data *cdata; 2554 const struct spdk_nvme_transport_id *trid; 2555 union spdk_nvme_vs_register vs; 2556 const struct spdk_nvme_ns_data *nsdata; 2557 char buf[128]; 2558 2559 ns = nvme_ns->ns; 2560 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2561 2562 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2563 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2564 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2565 2566 spdk_json_write_object_begin(w); 2567 2568 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2569 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2570 } 2571 2572 spdk_json_write_named_object_begin(w, "trid"); 2573 2574 nvme_bdev_dump_trid_json(trid, w); 2575 2576 spdk_json_write_object_end(w); 2577 2578 #ifdef SPDK_CONFIG_NVME_CUSE 2579 size_t cuse_name_size = 128; 2580 char cuse_name[cuse_name_size]; 2581 2582 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2583 cuse_name, &cuse_name_size); 2584 if (rc == 0) { 2585 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2586 } 2587 #endif 2588 2589 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2590 2591 spdk_json_write_named_uint16(w, 
"cntlid", cdata->cntlid); 2592 2593 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2594 2595 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2596 spdk_str_trim(buf); 2597 spdk_json_write_named_string(w, "model_number", buf); 2598 2599 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2600 spdk_str_trim(buf); 2601 spdk_json_write_named_string(w, "serial_number", buf); 2602 2603 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2604 spdk_str_trim(buf); 2605 spdk_json_write_named_string(w, "firmware_revision", buf); 2606 2607 if (cdata->subnqn[0] != '\0') { 2608 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2609 } 2610 2611 spdk_json_write_named_object_begin(w, "oacs"); 2612 2613 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2614 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2615 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2616 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2617 2618 spdk_json_write_object_end(w); 2619 2620 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 2621 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 2622 2623 spdk_json_write_object_end(w); 2624 2625 spdk_json_write_named_object_begin(w, "vs"); 2626 2627 spdk_json_write_name(w, "nvme_version"); 2628 if (vs.bits.ter) { 2629 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2630 } else { 2631 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2632 } 2633 2634 spdk_json_write_object_end(w); 2635 2636 nsdata = spdk_nvme_ns_get_data(ns); 2637 2638 spdk_json_write_named_object_begin(w, "ns_data"); 2639 2640 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 2641 2642 if (cdata->cmic.ana_reporting) { 2643 spdk_json_write_named_string(w, "ana_state", 2644 _nvme_ana_state_str(nvme_ns->ana_state)); 2645 } 2646 2647 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 2648 2649 spdk_json_write_object_end(w); 2650 2651 if (cdata->oacs.security) { 2652 spdk_json_write_named_object_begin(w, "security"); 2653 2654 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 2655 2656 spdk_json_write_object_end(w); 2657 } 2658 2659 spdk_json_write_object_end(w); 2660 } 2661 2662 static const char * 2663 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 2664 { 2665 switch (nbdev->mp_policy) { 2666 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 2667 return "active_passive"; 2668 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 2669 return "active_active"; 2670 default: 2671 assert(false); 2672 return "invalid"; 2673 } 2674 } 2675 2676 static int 2677 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 2678 { 2679 struct nvme_bdev *nvme_bdev = ctx; 2680 struct nvme_ns *nvme_ns; 2681 2682 pthread_mutex_lock(&nvme_bdev->mutex); 2683 spdk_json_write_named_array_begin(w, "nvme"); 2684 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 2685 nvme_namespace_info_json(w, nvme_ns); 2686 } 2687 spdk_json_write_array_end(w); 2688 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 2689 pthread_mutex_unlock(&nvme_bdev->mutex); 2690 2691 return 0; 2692 } 2693 2694 static void 2695 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2696 { 2697 /* No config per bdev needed */ 2698 } 2699 2700 static uint64_t 2701 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 2702 { 2703 struct nvme_bdev_channel *nbdev_ch = 
spdk_io_channel_get_ctx(ch); 2704 struct nvme_io_path *io_path; 2705 struct nvme_poll_group *group; 2706 uint64_t spin_time = 0; 2707 2708 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 2709 group = io_path->qpair->group; 2710 2711 if (!group || !group->collect_spin_stat) { 2712 continue; 2713 } 2714 2715 if (group->end_ticks != 0) { 2716 group->spin_ticks += (group->end_ticks - group->start_ticks); 2717 group->end_ticks = 0; 2718 } 2719 2720 spin_time += group->spin_ticks; 2721 group->start_ticks = 0; 2722 group->spin_ticks = 0; 2723 } 2724 2725 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 2726 } 2727 2728 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 2729 .destruct = bdev_nvme_destruct, 2730 .submit_request = bdev_nvme_submit_request, 2731 .io_type_supported = bdev_nvme_io_type_supported, 2732 .get_io_channel = bdev_nvme_get_io_channel, 2733 .dump_info_json = bdev_nvme_dump_info_json, 2734 .write_config_json = bdev_nvme_write_config_json, 2735 .get_spin_time = bdev_nvme_get_spin_time, 2736 .get_module_ctx = bdev_nvme_get_module_ctx, 2737 .get_memory_domains = bdev_nvme_get_memory_domains, 2738 }; 2739 2740 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 2741 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 2742 2743 static int 2744 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2745 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 2746 { 2747 struct spdk_nvme_ana_group_descriptor *copied_desc; 2748 uint8_t *orig_desc; 2749 uint32_t i, desc_size, copy_len; 2750 int rc = 0; 2751 2752 if (nvme_ctrlr->ana_log_page == NULL) { 2753 return -EINVAL; 2754 } 2755 2756 copied_desc = nvme_ctrlr->copied_ana_desc; 2757 2758 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 2759 copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 2760 2761 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 2762 memcpy(copied_desc, orig_desc, copy_len); 2763 2764 rc = cb_fn(copied_desc, cb_arg); 2765 if (rc != 0) { 2766 break; 2767 } 2768 2769 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 2770 copied_desc->num_of_nsid * sizeof(uint32_t); 2771 orig_desc += desc_size; 2772 copy_len -= desc_size; 2773 } 2774 2775 return rc; 2776 } 2777 2778 static int 2779 nvme_ns_ana_transition_timedout(void *ctx) 2780 { 2781 struct nvme_ns *nvme_ns = ctx; 2782 2783 spdk_poller_unregister(&nvme_ns->anatt_timer); 2784 nvme_ns->ana_transition_timedout = true; 2785 2786 return SPDK_POLLER_BUSY; 2787 } 2788 2789 static void 2790 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 2791 const struct spdk_nvme_ana_group_descriptor *desc) 2792 { 2793 const struct spdk_nvme_ctrlr_data *cdata; 2794 2795 nvme_ns->ana_group_id = desc->ana_group_id; 2796 nvme_ns->ana_state = desc->ana_state; 2797 nvme_ns->ana_state_updating = false; 2798 2799 switch (nvme_ns->ana_state) { 2800 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2801 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2802 nvme_ns->ana_transition_timedout = false; 2803 spdk_poller_unregister(&nvme_ns->anatt_timer); 2804 break; 2805 2806 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2807 case SPDK_NVME_ANA_CHANGE_STATE: 2808 if (nvme_ns->anatt_timer != NULL) { 2809 break; 2810 } 2811 2812 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 2813 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 2814 nvme_ns, 2815 cdata->anatt * SPDK_SEC_TO_USEC); 2816 break; 2817 default: 2818 break; 2819 } 2820 } 2821 2822 static int 2823 
nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 2824 { 2825 struct nvme_ns *nvme_ns = cb_arg; 2826 uint32_t i; 2827 2828 for (i = 0; i < desc->num_of_nsid; i++) { 2829 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 2830 continue; 2831 } 2832 2833 _nvme_ns_set_ana_state(nvme_ns, desc); 2834 return 1; 2835 } 2836 2837 return 0; 2838 } 2839 2840 static int 2841 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 2842 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 2843 uint32_t prchk_flags, void *ctx) 2844 { 2845 const struct spdk_uuid *uuid; 2846 const uint8_t *nguid; 2847 const struct spdk_nvme_ctrlr_data *cdata; 2848 const struct spdk_nvme_ns_data *nsdata; 2849 const struct spdk_nvme_ctrlr_opts *opts; 2850 enum spdk_nvme_csi csi; 2851 uint32_t atomic_bs, phys_bs, bs; 2852 2853 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2854 csi = spdk_nvme_ns_get_csi(ns); 2855 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 2856 2857 switch (csi) { 2858 case SPDK_NVME_CSI_NVM: 2859 disk->product_name = "NVMe disk"; 2860 break; 2861 case SPDK_NVME_CSI_ZNS: 2862 disk->product_name = "NVMe ZNS disk"; 2863 disk->zoned = true; 2864 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 2865 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 2866 spdk_nvme_ns_get_extended_sector_size(ns); 2867 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 2868 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 2869 break; 2870 default: 2871 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 2872 return -ENOTSUP; 2873 } 2874 2875 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 2876 if (!disk->name) { 2877 return -ENOMEM; 2878 } 2879 2880 disk->write_cache = 0; 2881 if (cdata->vwc.present) { 2882 /* Enable if the Volatile Write Cache exists */ 2883 disk->write_cache = 1; 2884 } 2885 if (cdata->oncs.write_zeroes) { 2886 disk->max_write_zeroes = UINT16_MAX + 1; 2887 } 2888 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 2889 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 2890 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 2891 /* NVMe driver will split one request into multiple requests 2892 * based on MDTS and stripe boundary, the bdev layer will use 2893 * max_segment_size and max_num_segments to split one big IO 2894 * into multiple requests, then small request can't run out 2895 * of NVMe internal requests data structure. 
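	 * For example, with io_queue_requests = 512 each I/O is limited to at most
	 * 256 segments (io_queue_requests / 2), which presumably leaves the other
	 * half of the driver's request objects for its own MDTS/stripe splitting.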
2896 */ 2897 if (opts && opts->io_queue_requests) { 2898 disk->max_num_segments = opts->io_queue_requests / 2; 2899 } 2900 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 2901 2902 nguid = spdk_nvme_ns_get_nguid(ns); 2903 if (!nguid) { 2904 uuid = spdk_nvme_ns_get_uuid(ns); 2905 if (uuid) { 2906 disk->uuid = *uuid; 2907 } 2908 } else { 2909 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 2910 } 2911 2912 nsdata = spdk_nvme_ns_get_data(ns); 2913 bs = spdk_nvme_ns_get_sector_size(ns); 2914 atomic_bs = bs; 2915 phys_bs = bs; 2916 if (nsdata->nabo == 0) { 2917 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 2918 atomic_bs = bs * (1 + nsdata->nawupf); 2919 } else { 2920 atomic_bs = bs * (1 + cdata->awupf); 2921 } 2922 } 2923 if (nsdata->nsfeat.optperf) { 2924 phys_bs = bs * (1 + nsdata->npwg); 2925 } 2926 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 2927 2928 disk->md_len = spdk_nvme_ns_get_md_size(ns); 2929 if (disk->md_len != 0) { 2930 disk->md_interleave = nsdata->flbas.extended; 2931 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 2932 if (disk->dif_type != SPDK_DIF_DISABLE) { 2933 disk->dif_is_head_of_md = nsdata->dps.md_start; 2934 disk->dif_check_flags = prchk_flags; 2935 } 2936 } 2937 2938 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 2939 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 2940 disk->acwu = 0; 2941 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 2942 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 2943 } else { 2944 disk->acwu = cdata->acwu + 1; /* 0-based */ 2945 } 2946 2947 disk->ctxt = ctx; 2948 disk->fn_table = &nvmelib_fn_table; 2949 disk->module = &nvme_if; 2950 2951 return 0; 2952 } 2953 2954 static int 2955 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 2956 { 2957 struct nvme_bdev *bdev; 2958 int rc; 2959 2960 bdev = calloc(1, sizeof(*bdev)); 2961 if (!bdev) { 2962 SPDK_ERRLOG("bdev calloc() failed\n"); 2963 return -ENOMEM; 2964 } 2965 2966 rc = pthread_mutex_init(&bdev->mutex, NULL); 2967 if (rc != 0) { 2968 free(bdev); 2969 return rc; 2970 } 2971 2972 bdev->ref = 1; 2973 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 2974 TAILQ_INIT(&bdev->nvme_ns_list); 2975 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 2976 bdev->opal = nvme_ctrlr->opal_dev != NULL; 2977 2978 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 2979 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 2980 if (rc != 0) { 2981 SPDK_ERRLOG("Failed to create NVMe disk\n"); 2982 pthread_mutex_destroy(&bdev->mutex); 2983 free(bdev); 2984 return rc; 2985 } 2986 2987 spdk_io_device_register(bdev, 2988 bdev_nvme_create_bdev_channel_cb, 2989 bdev_nvme_destroy_bdev_channel_cb, 2990 sizeof(struct nvme_bdev_channel), 2991 bdev->disk.name); 2992 2993 rc = spdk_bdev_register(&bdev->disk); 2994 if (rc != 0) { 2995 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 2996 spdk_io_device_unregister(bdev, NULL); 2997 pthread_mutex_destroy(&bdev->mutex); 2998 free(bdev->disk.name); 2999 free(bdev); 3000 return rc; 3001 } 3002 3003 nvme_ns->bdev = bdev; 3004 bdev->nsid = nvme_ns->id; 3005 3006 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 3007 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 3008 3009 return 0; 3010 } 3011 3012 static bool 3013 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 3014 { 3015 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 3016 const struct spdk_uuid *uuid1, *uuid2; 3017 3018 nsdata1 = spdk_nvme_ns_get_data(ns1); 3019 
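	/* Two namespaces are treated as the same namespace only if their NGUID and
	 * EUI-64 match, their UUIDs match (or are both absent), and they report the
	 * same command set identifier.
	 */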
nsdata2 = spdk_nvme_ns_get_data(ns2); 3020 uuid1 = spdk_nvme_ns_get_uuid(ns1); 3021 uuid2 = spdk_nvme_ns_get_uuid(ns2); 3022 3023 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 3024 nsdata1->eui64 == nsdata2->eui64 && 3025 ((uuid1 == NULL && uuid2 == NULL) || 3026 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 3027 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 3028 } 3029 3030 static bool 3031 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3032 struct spdk_nvme_ctrlr_opts *opts) 3033 { 3034 struct nvme_probe_skip_entry *entry; 3035 3036 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 3037 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3038 return false; 3039 } 3040 } 3041 3042 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 3043 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 3044 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 3045 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 3046 opts->disable_read_ana_log_page = true; 3047 3048 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 3049 3050 return true; 3051 } 3052 3053 static void 3054 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 3055 { 3056 struct nvme_ctrlr *nvme_ctrlr = ctx; 3057 3058 if (spdk_nvme_cpl_is_error(cpl)) { 3059 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 3060 cpl->status.sct); 3061 bdev_nvme_reset(nvme_ctrlr); 3062 } else if (cpl->cdw0 & 0x1) { 3063 SPDK_WARNLOG("Specified command could not be aborted.\n"); 3064 bdev_nvme_reset(nvme_ctrlr); 3065 } 3066 } 3067 3068 static void 3069 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 3070 struct spdk_nvme_qpair *qpair, uint16_t cid) 3071 { 3072 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3073 union spdk_nvme_csts_register csts; 3074 int rc; 3075 3076 assert(nvme_ctrlr->ctrlr == ctrlr); 3077 3078 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 3079 3080 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 3081 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 3082 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 3083 * completion recursively. 3084 */ 3085 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 3086 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 3087 if (csts.bits.cfs) { 3088 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 3089 bdev_nvme_reset(nvme_ctrlr); 3090 return; 3091 } 3092 } 3093 3094 switch (g_opts.action_on_timeout) { 3095 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 3096 if (qpair) { 3097 /* Don't send abort to ctrlr when ctrlr is not available. */ 3098 pthread_mutex_lock(&nvme_ctrlr->mutex); 3099 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 3100 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3101 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 3102 return; 3103 } 3104 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3105 3106 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 3107 nvme_abort_cpl, nvme_ctrlr); 3108 if (rc == 0) { 3109 return; 3110 } 3111 3112 SPDK_ERRLOG("Unable to send abort. 
Resetting, rc is %d.\n", rc); 3113 } 3114 3115 /* FALLTHROUGH */ 3116 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 3117 bdev_nvme_reset(nvme_ctrlr); 3118 break; 3119 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 3120 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 3121 break; 3122 default: 3123 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 3124 break; 3125 } 3126 } 3127 3128 static void 3129 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 3130 { 3131 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3132 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 3133 3134 if (rc == 0) { 3135 nvme_ns->probe_ctx = NULL; 3136 pthread_mutex_lock(&nvme_ctrlr->mutex); 3137 nvme_ctrlr->ref++; 3138 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3139 } else { 3140 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3141 free(nvme_ns); 3142 } 3143 3144 if (ctx) { 3145 ctx->populates_in_progress--; 3146 if (ctx->populates_in_progress == 0) { 3147 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3148 } 3149 } 3150 } 3151 3152 static void 3153 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 3154 { 3155 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3156 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3157 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3158 int rc; 3159 3160 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 3161 if (rc != 0) { 3162 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 3163 } 3164 3165 spdk_for_each_channel_continue(i, rc); 3166 } 3167 3168 static void 3169 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 3170 { 3171 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3172 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3173 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3174 struct nvme_io_path *io_path; 3175 3176 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 3177 if (io_path != NULL) { 3178 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 3179 } 3180 3181 spdk_for_each_channel_continue(i, 0); 3182 } 3183 3184 static void 3185 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 3186 { 3187 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3188 3189 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 3190 } 3191 3192 static void 3193 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 3194 { 3195 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3196 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 3197 3198 if (status == 0) { 3199 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 3200 } else { 3201 /* Delete the added io_paths and fail populating the namespace. 
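			 * bdev_nvme_add_io_path_failed() then reports the failure to
			 * nvme_ctrlr_populate_namespace_done() once the cleanup has run on
			 * every channel.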
*/ 3202 spdk_for_each_channel(bdev, 3203 bdev_nvme_delete_io_path, 3204 nvme_ns, 3205 bdev_nvme_add_io_path_failed); 3206 } 3207 } 3208 3209 static int 3210 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 3211 { 3212 struct nvme_ns *tmp_ns; 3213 const struct spdk_nvme_ns_data *nsdata; 3214 3215 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 3216 if (!nsdata->nmic.can_share) { 3217 SPDK_ERRLOG("Namespace cannot be shared.\n"); 3218 return -EINVAL; 3219 } 3220 3221 pthread_mutex_lock(&bdev->mutex); 3222 3223 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 3224 assert(tmp_ns != NULL); 3225 3226 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 3227 pthread_mutex_unlock(&bdev->mutex); 3228 SPDK_ERRLOG("Namespaces are not identical.\n"); 3229 return -EINVAL; 3230 } 3231 3232 bdev->ref++; 3233 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3234 nvme_ns->bdev = bdev; 3235 3236 pthread_mutex_unlock(&bdev->mutex); 3237 3238 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 3239 spdk_for_each_channel(bdev, 3240 bdev_nvme_add_io_path, 3241 nvme_ns, 3242 bdev_nvme_add_io_path_done); 3243 3244 return 0; 3245 } 3246 3247 static void 3248 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3249 { 3250 struct spdk_nvme_ns *ns; 3251 struct nvme_bdev *bdev; 3252 int rc = 0; 3253 3254 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 3255 if (!ns) { 3256 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 3257 rc = -EINVAL; 3258 goto done; 3259 } 3260 3261 nvme_ns->ns = ns; 3262 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3263 3264 if (nvme_ctrlr->ana_log_page != NULL) { 3265 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 3266 } 3267 3268 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 3269 if (bdev == NULL) { 3270 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 3271 } else { 3272 rc = nvme_bdev_add_ns(bdev, nvme_ns); 3273 if (rc == 0) { 3274 return; 3275 } 3276 } 3277 done: 3278 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 3279 } 3280 3281 static void 3282 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 3283 { 3284 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3285 3286 assert(nvme_ctrlr != NULL); 3287 3288 pthread_mutex_lock(&nvme_ctrlr->mutex); 3289 3290 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3291 3292 if (nvme_ns->bdev != NULL) { 3293 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3294 return; 3295 } 3296 3297 free(nvme_ns); 3298 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3299 3300 nvme_ctrlr_release(nvme_ctrlr); 3301 } 3302 3303 static void 3304 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 3305 { 3306 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3307 3308 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3309 } 3310 3311 static void 3312 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3313 { 3314 struct nvme_bdev *bdev; 3315 3316 spdk_poller_unregister(&nvme_ns->anatt_timer); 3317 3318 bdev = nvme_ns->bdev; 3319 if (bdev != NULL) { 3320 pthread_mutex_lock(&bdev->mutex); 3321 3322 assert(bdev->ref > 0); 3323 bdev->ref--; 3324 if (bdev->ref == 0) { 3325 pthread_mutex_unlock(&bdev->mutex); 3326 3327 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 3328 } else { 3329 /* spdk_bdev_unregister() is not called until the last nvme_ns is 3330 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 3331 * and clear nvme_ns->bdev here. 
3332 */ 3333 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 3334 nvme_ns->bdev = NULL; 3335 3336 pthread_mutex_unlock(&bdev->mutex); 3337 3338 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 3339 * we call depopulate_namespace_done() to avoid use-after-free. 3340 */ 3341 spdk_for_each_channel(bdev, 3342 bdev_nvme_delete_io_path, 3343 nvme_ns, 3344 bdev_nvme_delete_io_path_done); 3345 return; 3346 } 3347 } 3348 3349 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3350 } 3351 3352 static void 3353 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3354 struct nvme_async_probe_ctx *ctx) 3355 { 3356 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3357 struct nvme_ns *nvme_ns, *next; 3358 struct spdk_nvme_ns *ns; 3359 struct nvme_bdev *bdev; 3360 uint32_t nsid; 3361 int rc; 3362 uint64_t num_sectors; 3363 3364 if (ctx) { 3365 /* Initialize this count to 1 to handle the populate functions 3366 * calling nvme_ctrlr_populate_namespace_done() immediately. 3367 */ 3368 ctx->populates_in_progress = 1; 3369 } 3370 3371 /* First loop over our existing namespaces and see if they have been 3372 * removed. */ 3373 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3374 while (nvme_ns != NULL) { 3375 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3376 3377 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 3378 /* NS is still there but attributes may have changed */ 3379 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 3380 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 3381 bdev = nvme_ns->bdev; 3382 assert(bdev != NULL); 3383 if (bdev->disk.blockcnt != num_sectors) { 3384 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 3385 nvme_ns->id, 3386 bdev->disk.name, 3387 bdev->disk.blockcnt, 3388 num_sectors); 3389 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 3390 if (rc != 0) { 3391 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 3392 bdev->disk.name, rc); 3393 } 3394 } 3395 } else { 3396 /* Namespace was removed */ 3397 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3398 } 3399 3400 nvme_ns = next; 3401 } 3402 3403 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3404 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3405 while (nsid != 0) { 3406 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3407 3408 if (nvme_ns == NULL) { 3409 /* Found a new one */ 3410 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3411 if (nvme_ns == NULL) { 3412 SPDK_ERRLOG("Failed to allocate namespace\n"); 3413 /* This just fails to attach the namespace. It may work on a future attempt. */ 3414 continue; 3415 } 3416 3417 nvme_ns->id = nsid; 3418 nvme_ns->ctrlr = nvme_ctrlr; 3419 3420 nvme_ns->bdev = NULL; 3421 3422 if (ctx) { 3423 ctx->populates_in_progress++; 3424 } 3425 nvme_ns->probe_ctx = ctx; 3426 3427 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3428 3429 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3430 } 3431 3432 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3433 } 3434 3435 if (ctx) { 3436 /* Decrement this count now that the loop is over to account 3437 * for the one we started with. If the count is then 0, we 3438 * know any populate_namespace functions completed immediately, 3439 * so we'll kick the callback here. 
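		 * If some populate calls are still in flight, the last one to finish
		 * kicks the callback from nvme_ctrlr_populate_namespace_done() instead.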
3440 */ 3441 ctx->populates_in_progress--; 3442 if (ctx->populates_in_progress == 0) { 3443 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3444 } 3445 } 3446 3447 } 3448 3449 static void 3450 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 3451 { 3452 struct nvme_ns *nvme_ns, *tmp; 3453 3454 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 3455 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3456 } 3457 } 3458 3459 static int 3460 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 3461 void *cb_arg) 3462 { 3463 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3464 struct nvme_ns *nvme_ns; 3465 uint32_t i, nsid; 3466 3467 for (i = 0; i < desc->num_of_nsid; i++) { 3468 nsid = desc->nsid[i]; 3469 if (nsid == 0) { 3470 continue; 3471 } 3472 3473 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3474 3475 assert(nvme_ns != NULL); 3476 if (nvme_ns == NULL) { 3477 /* Target told us that an inactive namespace had an ANA change */ 3478 continue; 3479 } 3480 3481 _nvme_ns_set_ana_state(nvme_ns, desc); 3482 } 3483 3484 return 0; 3485 } 3486 3487 static void 3488 _nvme_ctrlr_read_ana_log_page_done(struct spdk_io_channel_iter *i, int status) 3489 { 3490 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 3491 3492 pthread_mutex_lock(&nvme_ctrlr->mutex); 3493 3494 assert(nvme_ctrlr->ana_log_page_updating == true); 3495 nvme_ctrlr->ana_log_page_updating = false; 3496 3497 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 3498 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3499 return; 3500 } 3501 3502 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3503 3504 nvme_ctrlr_unregister(nvme_ctrlr); 3505 } 3506 3507 static void 3508 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3509 { 3510 struct nvme_ns *nvme_ns; 3511 3512 spdk_free(nvme_ctrlr->ana_log_page); 3513 nvme_ctrlr->ana_log_page = NULL; 3514 3515 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3516 nvme_ns != NULL; 3517 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 3518 nvme_ns->ana_state_updating = false; 3519 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3520 } 3521 } 3522 3523 static void 3524 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 3525 { 3526 struct nvme_ctrlr *nvme_ctrlr = ctx; 3527 3528 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 3529 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 3530 nvme_ctrlr); 3531 } else { 3532 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 3533 } 3534 3535 bdev_nvme_clear_io_path_caches(nvme_ctrlr, _nvme_ctrlr_read_ana_log_page_done); 3536 } 3537 3538 static int 3539 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3540 { 3541 int rc; 3542 3543 if (nvme_ctrlr->ana_log_page == NULL) { 3544 return -EINVAL; 3545 } 3546 3547 pthread_mutex_lock(&nvme_ctrlr->mutex); 3548 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 3549 nvme_ctrlr->ana_log_page_updating) { 3550 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3551 return -EBUSY; 3552 } 3553 3554 nvme_ctrlr->ana_log_page_updating = true; 3555 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3556 3557 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 3558 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3559 SPDK_NVME_GLOBAL_NS_TAG, 3560 nvme_ctrlr->ana_log_page, 3561 nvme_ctrlr->ana_log_page_size, 0, 3562 nvme_ctrlr_read_ana_log_page_done, 3563 nvme_ctrlr); 3564 if (rc != 0) { 3565 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 3566 } 3567 3568 return rc; 3569 } 3570 3571 static 
void 3572 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 3573 { 3574 } 3575 3576 struct bdev_nvme_set_preferred_path_ctx { 3577 struct spdk_bdev_desc *desc; 3578 struct nvme_ns *nvme_ns; 3579 bdev_nvme_set_preferred_path_cb cb_fn; 3580 void *cb_arg; 3581 }; 3582 3583 static void 3584 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 3585 { 3586 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3587 3588 assert(ctx != NULL); 3589 assert(ctx->desc != NULL); 3590 assert(ctx->cb_fn != NULL); 3591 3592 spdk_bdev_close(ctx->desc); 3593 3594 ctx->cb_fn(ctx->cb_arg, status); 3595 3596 free(ctx); 3597 } 3598 3599 static void 3600 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 3601 { 3602 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3603 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3604 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3605 struct nvme_io_path *io_path, *prev; 3606 3607 prev = NULL; 3608 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3609 if (io_path->nvme_ns == ctx->nvme_ns) { 3610 break; 3611 } 3612 prev = io_path; 3613 } 3614 3615 if (io_path != NULL) { 3616 if (prev != NULL) { 3617 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 3618 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 3619 } 3620 3621 /* We can set io_path to nbdev_ch->current_io_path directly here. 3622 * However, it needs to be conditional. To simplify the code, 3623 * just clear nbdev_ch->current_io_path and let find_io_path() 3624 * fill it. 3625 * 3626 * Automatic failback may be disabled. Hence even if the io_path is 3627 * already at the head, clear nbdev_ch->current_io_path. 3628 */ 3629 nbdev_ch->current_io_path = NULL; 3630 } 3631 3632 spdk_for_each_channel_continue(i, 0); 3633 } 3634 3635 static struct nvme_ns * 3636 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 3637 { 3638 struct nvme_ns *nvme_ns, *prev; 3639 const struct spdk_nvme_ctrlr_data *cdata; 3640 3641 prev = NULL; 3642 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3643 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3644 3645 if (cdata->cntlid == cntlid) { 3646 break; 3647 } 3648 prev = nvme_ns; 3649 } 3650 3651 if (nvme_ns != NULL && prev != NULL) { 3652 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 3653 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 3654 } 3655 3656 return nvme_ns; 3657 } 3658 3659 /* This function supports only multipath mode. There is only a single I/O path 3660 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 3661 * head of the I/O path list for each NVMe bdev channel. 3662 * 3663 * NVMe bdev channel may be acquired after completing this function. move the 3664 * matched namespace to the head of the namespace list for the NVMe bdev too. 
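 * Channels created after this call build their I/O path lists from the bdev's
 * namespace list, so they should pick up the preferred path first as well.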
3665 */ 3666 void 3667 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 3668 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 3669 { 3670 struct bdev_nvme_set_preferred_path_ctx *ctx; 3671 struct spdk_bdev *bdev; 3672 struct nvme_bdev *nbdev; 3673 int rc = 0; 3674 3675 assert(cb_fn != NULL); 3676 3677 ctx = calloc(1, sizeof(*ctx)); 3678 if (ctx == NULL) { 3679 SPDK_ERRLOG("Failed to alloc context.\n"); 3680 rc = -ENOMEM; 3681 goto err_alloc; 3682 } 3683 3684 ctx->cb_fn = cb_fn; 3685 ctx->cb_arg = cb_arg; 3686 3687 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3688 if (rc != 0) { 3689 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 3690 goto err_open; 3691 } 3692 3693 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3694 3695 if (bdev->module != &nvme_if) { 3696 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 3697 rc = -ENODEV; 3698 goto err_bdev; 3699 } 3700 3701 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3702 3703 pthread_mutex_lock(&nbdev->mutex); 3704 3705 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 3706 if (ctx->nvme_ns == NULL) { 3707 pthread_mutex_unlock(&nbdev->mutex); 3708 3709 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 3710 rc = -ENODEV; 3711 goto err_bdev; 3712 } 3713 3714 pthread_mutex_unlock(&nbdev->mutex); 3715 3716 spdk_for_each_channel(nbdev, 3717 _bdev_nvme_set_preferred_path, 3718 ctx, 3719 bdev_nvme_set_preferred_path_done); 3720 return; 3721 3722 err_bdev: 3723 spdk_bdev_close(ctx->desc); 3724 err_open: 3725 free(ctx); 3726 err_alloc: 3727 cb_fn(cb_arg, rc); 3728 } 3729 3730 struct bdev_nvme_set_multipath_policy_ctx { 3731 struct spdk_bdev_desc *desc; 3732 bdev_nvme_set_multipath_policy_cb cb_fn; 3733 void *cb_arg; 3734 }; 3735 3736 static void 3737 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 3738 { 3739 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3740 3741 assert(ctx != NULL); 3742 assert(ctx->desc != NULL); 3743 assert(ctx->cb_fn != NULL); 3744 3745 spdk_bdev_close(ctx->desc); 3746 3747 ctx->cb_fn(ctx->cb_arg, status); 3748 3749 free(ctx); 3750 } 3751 3752 static void 3753 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 3754 { 3755 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3756 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3757 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 3758 3759 nbdev_ch->mp_policy = nbdev->mp_policy; 3760 nbdev_ch->current_io_path = NULL; 3761 3762 spdk_for_each_channel_continue(i, 0); 3763 } 3764 3765 void 3766 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 3767 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 3768 { 3769 struct bdev_nvme_set_multipath_policy_ctx *ctx; 3770 struct spdk_bdev *bdev; 3771 struct nvme_bdev *nbdev; 3772 int rc; 3773 3774 assert(cb_fn != NULL); 3775 3776 ctx = calloc(1, sizeof(*ctx)); 3777 if (ctx == NULL) { 3778 SPDK_ERRLOG("Failed to alloc context.\n"); 3779 rc = -ENOMEM; 3780 goto err_alloc; 3781 } 3782 3783 ctx->cb_fn = cb_fn; 3784 ctx->cb_arg = cb_arg; 3785 3786 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3787 if (rc != 0) { 3788 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 3789 rc = -ENODEV; 3790 goto err_open; 3791 } 3792 3793 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3794 if (bdev->module != &nvme_if) { 3795 SPDK_ERRLOG("bdev %s is not registered in 
this module.\n", name); 3796 rc = -ENODEV; 3797 goto err_module; 3798 } 3799 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3800 3801 pthread_mutex_lock(&nbdev->mutex); 3802 nbdev->mp_policy = policy; 3803 pthread_mutex_unlock(&nbdev->mutex); 3804 3805 spdk_for_each_channel(nbdev, 3806 _bdev_nvme_set_multipath_policy, 3807 ctx, 3808 bdev_nvme_set_multipath_policy_done); 3809 return; 3810 3811 err_module: 3812 spdk_bdev_close(ctx->desc); 3813 err_open: 3814 free(ctx); 3815 err_alloc: 3816 cb_fn(cb_arg, rc); 3817 } 3818 3819 static void 3820 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 3821 { 3822 struct nvme_ctrlr *nvme_ctrlr = arg; 3823 union spdk_nvme_async_event_completion event; 3824 3825 if (spdk_nvme_cpl_is_error(cpl)) { 3826 SPDK_WARNLOG("AER request execute failed"); 3827 return; 3828 } 3829 3830 event.raw = cpl->cdw0; 3831 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3832 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 3833 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 3834 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3835 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 3836 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 3837 } 3838 } 3839 3840 static void 3841 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 3842 { 3843 if (ctx->cb_fn) { 3844 ctx->cb_fn(ctx->cb_ctx, count, rc); 3845 } 3846 3847 ctx->namespaces_populated = true; 3848 if (ctx->probe_done) { 3849 /* The probe was already completed, so we need to free the context 3850 * here. This can happen for cases like OCSSD, where we need to 3851 * send additional commands to the SSD after attach. 3852 */ 3853 free(ctx); 3854 } 3855 } 3856 3857 static void 3858 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 3859 struct nvme_async_probe_ctx *ctx) 3860 { 3861 spdk_io_device_register(nvme_ctrlr, 3862 bdev_nvme_create_ctrlr_channel_cb, 3863 bdev_nvme_destroy_ctrlr_channel_cb, 3864 sizeof(struct nvme_ctrlr_channel), 3865 nvme_ctrlr->nbdev_ctrlr->name); 3866 3867 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 3868 } 3869 3870 static void 3871 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 3872 { 3873 struct nvme_ctrlr *nvme_ctrlr = _ctx; 3874 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 3875 3876 nvme_ctrlr->probe_ctx = NULL; 3877 3878 if (spdk_nvme_cpl_is_error(cpl)) { 3879 nvme_ctrlr_delete(nvme_ctrlr); 3880 3881 if (ctx != NULL) { 3882 populate_namespaces_cb(ctx, 0, -1); 3883 } 3884 return; 3885 } 3886 3887 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 3888 } 3889 3890 static int 3891 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3892 struct nvme_async_probe_ctx *ctx) 3893 { 3894 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3895 const struct spdk_nvme_ctrlr_data *cdata; 3896 uint32_t ana_log_page_size; 3897 3898 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3899 3900 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3901 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 3902 sizeof(uint32_t); 3903 3904 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 3905 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3906 if (nvme_ctrlr->ana_log_page == NULL) { 3907 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 3908 return -ENXIO; 3909 } 3910 3911 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 
	 * Hence copy each descriptor to a temporary area when parsing it.
	 *
	 * Allocate a buffer whose size is as large as ANA log page buffer because
	 * we do not know the size of a descriptor until actually reading it.
	 */
	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
	if (nvme_ctrlr->copied_ana_desc == NULL) {
		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
		return -ENOMEM;
	}

	nvme_ctrlr->ana_log_page_size = ana_log_page_size;

	nvme_ctrlr->probe_ctx = ctx;

	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
						SPDK_NVME_GLOBAL_NS_TAG,
						nvme_ctrlr->ana_log_page,
						nvme_ctrlr->ana_log_page_size, 0,
						nvme_ctrlr_init_ana_log_page_done,
						nvme_ctrlr);
}

/* hostnqn and subnqn were already verified before attaching a controller.
 * Hence check only the multipath capability and cntlid here.
 */
static bool
bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *tmp;
	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!cdata->cmic.multi_ctrlr) {
		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
		return false;
	}

	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);

		if (!tmp_cdata->cmic.multi_ctrlr) {
			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid);
			return false;
		}
		if (cdata->cntlid == tmp_cdata->cntlid) {
			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
			return false;
		}
	}

	return true;
}

static int
nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	int rc = 0;

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
			rc = -EINVAL;
			goto exit;
		}
	} else {
		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
		if (nbdev_ctrlr == NULL) {
			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
			rc = -ENOMEM;
			goto exit;
		}
		nbdev_ctrlr->name = strdup(name);
		if (nbdev_ctrlr->name == NULL) {
			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
			free(nbdev_ctrlr);
			rc = -ENOMEM;
			goto exit;
		}
		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
		TAILQ_INIT(&nbdev_ctrlr->bdevs);
		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
	}
	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
exit:
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return rc;
}

static int
nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
		  const char *name,
		  const struct spdk_nvme_transport_id *trid,
		  struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_path_id *path_id;
	const struct spdk_nvme_ctrlr_data *cdata;
	int rc;

	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
	if (nvme_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to allocate device struct\n");
		return -ENOMEM;
	}

	rc =
pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 4025 if (rc != 0) { 4026 free(nvme_ctrlr); 4027 return rc; 4028 } 4029 4030 TAILQ_INIT(&nvme_ctrlr->trids); 4031 4032 RB_INIT(&nvme_ctrlr->namespaces); 4033 4034 path_id = calloc(1, sizeof(*path_id)); 4035 if (path_id == NULL) { 4036 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 4037 rc = -ENOMEM; 4038 goto err; 4039 } 4040 4041 path_id->trid = *trid; 4042 if (ctx != NULL) { 4043 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 4044 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 4045 } 4046 nvme_ctrlr->active_path_id = path_id; 4047 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 4048 4049 nvme_ctrlr->thread = spdk_get_thread(); 4050 nvme_ctrlr->ctrlr = ctrlr; 4051 nvme_ctrlr->ref = 1; 4052 4053 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 4054 SPDK_ERRLOG("OCSSDs are not supported"); 4055 rc = -ENOTSUP; 4056 goto err; 4057 } 4058 4059 if (ctx != NULL) { 4060 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 4061 } else { 4062 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 4063 } 4064 4065 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 4066 g_opts.nvme_adminq_poll_period_us); 4067 4068 if (g_opts.timeout_us > 0) { 4069 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 4070 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 4071 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 4072 g_opts.timeout_us : g_opts.timeout_admin_us; 4073 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 4074 adm_timeout_us, timeout_cb, nvme_ctrlr); 4075 } 4076 4077 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 4078 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 4079 4080 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 4081 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 4082 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 4083 } 4084 4085 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 4086 if (rc != 0) { 4087 goto err; 4088 } 4089 4090 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4091 4092 if (cdata->cmic.ana_reporting) { 4093 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 4094 if (rc == 0) { 4095 return 0; 4096 } 4097 } else { 4098 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4099 return 0; 4100 } 4101 4102 err: 4103 nvme_ctrlr_delete(nvme_ctrlr); 4104 return rc; 4105 } 4106 4107 void 4108 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 4109 { 4110 opts->prchk_flags = 0; 4111 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 4112 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 4113 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 4114 } 4115 4116 static void 4117 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4118 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 4119 { 4120 char *name; 4121 4122 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 4123 if (!name) { 4124 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 4125 return; 4126 } 4127 4128 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 4129 4130 nvme_ctrlr_create(ctrlr, name, trid, NULL); 4131 4132 free(name); 4133 } 4134 4135 static void 4136 _nvme_ctrlr_destruct(void *ctx) 4137 { 4138 struct nvme_ctrlr *nvme_ctrlr = ctx; 4139 4140 
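	/* Tear down the namespaces (and the bdevs exposed on top of them) first,
	 * then drop the reference taken in nvme_ctrlr_create(); the controller
	 * itself is freed only once the last reference is released.
	 */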
nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 4141 nvme_ctrlr_release(nvme_ctrlr); 4142 } 4143 4144 static int 4145 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4146 { 4147 struct nvme_probe_skip_entry *entry; 4148 4149 pthread_mutex_lock(&nvme_ctrlr->mutex); 4150 4151 /* The controller's destruction was already started */ 4152 if (nvme_ctrlr->destruct) { 4153 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4154 return 0; 4155 } 4156 4157 if (!hotplug && 4158 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 4159 entry = calloc(1, sizeof(*entry)); 4160 if (!entry) { 4161 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4162 return -ENOMEM; 4163 } 4164 entry->trid = nvme_ctrlr->active_path_id->trid; 4165 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 4166 } 4167 4168 nvme_ctrlr->destruct = true; 4169 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4170 4171 _nvme_ctrlr_destruct(nvme_ctrlr); 4172 4173 return 0; 4174 } 4175 4176 static void 4177 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 4178 { 4179 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 4180 4181 _bdev_nvme_delete(nvme_ctrlr, true); 4182 } 4183 4184 static int 4185 bdev_nvme_hotplug_probe(void *arg) 4186 { 4187 if (g_hotplug_probe_ctx == NULL) { 4188 spdk_poller_unregister(&g_hotplug_probe_poller); 4189 return SPDK_POLLER_IDLE; 4190 } 4191 4192 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 4193 g_hotplug_probe_ctx = NULL; 4194 spdk_poller_unregister(&g_hotplug_probe_poller); 4195 } 4196 4197 return SPDK_POLLER_BUSY; 4198 } 4199 4200 static int 4201 bdev_nvme_hotplug(void *arg) 4202 { 4203 struct spdk_nvme_transport_id trid_pcie; 4204 4205 if (g_hotplug_probe_ctx) { 4206 return SPDK_POLLER_BUSY; 4207 } 4208 4209 memset(&trid_pcie, 0, sizeof(trid_pcie)); 4210 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 4211 4212 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 4213 hotplug_probe_cb, attach_cb, NULL); 4214 4215 if (g_hotplug_probe_ctx) { 4216 assert(g_hotplug_probe_poller == NULL); 4217 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 4218 } 4219 4220 return SPDK_POLLER_BUSY; 4221 } 4222 4223 void 4224 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 4225 { 4226 *opts = g_opts; 4227 } 4228 4229 static bool bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec, 4230 uint32_t reconnect_delay_sec, 4231 uint32_t fast_io_fail_timeout_sec); 4232 4233 static int 4234 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 4235 { 4236 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 4237 /* Can't set timeout_admin_us without also setting timeout_us */ 4238 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 4239 return -EINVAL; 4240 } 4241 4242 if (opts->bdev_retry_count < -1) { 4243 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 4244 return -EINVAL; 4245 } 4246 4247 if (!bdev_nvme_check_multipath_params(opts->ctrlr_loss_timeout_sec, 4248 opts->reconnect_delay_sec, 4249 opts->fast_io_fail_timeout_sec)) { 4250 return -EINVAL; 4251 } 4252 4253 return 0; 4254 } 4255 4256 int 4257 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 4258 { 4259 int ret = bdev_nvme_validate_opts(opts); 4260 if (ret) { 4261 SPDK_WARNLOG("Failed to set nvme opts.\n"); 4262 return ret; 4263 } 4264 4265 if (g_bdev_nvme_init_thread != NULL) { 4266 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4267 return -EPERM; 4268 } 4269 } 4270 4271 
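	/* At this point validation has passed and either the module is not
	 * initialized yet or no NVMe bdev controllers exist, so the new values can
	 * simply replace the current globals; they affect controllers attached
	 * after this call. Illustrative sketch of a caller (not taken from this
	 * file; in practice this path is usually reached via the
	 * bdev_nvme_set_options RPC):
	 *
	 *	struct spdk_bdev_nvme_opts opts;
	 *
	 *	bdev_nvme_get_opts(&opts);
	 *	opts.transport_retry_count = 7;
	 *	opts.bdev_retry_count = -1;	(retry at the bdev layer without limit)
	 *	bdev_nvme_set_opts(&opts);
	 */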
g_opts = *opts; 4272 4273 return 0; 4274 } 4275 4276 struct set_nvme_hotplug_ctx { 4277 uint64_t period_us; 4278 bool enabled; 4279 spdk_msg_fn fn; 4280 void *fn_ctx; 4281 }; 4282 4283 static void 4284 set_nvme_hotplug_period_cb(void *_ctx) 4285 { 4286 struct set_nvme_hotplug_ctx *ctx = _ctx; 4287 4288 spdk_poller_unregister(&g_hotplug_poller); 4289 if (ctx->enabled) { 4290 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 4291 } 4292 4293 g_nvme_hotplug_poll_period_us = ctx->period_us; 4294 g_nvme_hotplug_enabled = ctx->enabled; 4295 if (ctx->fn) { 4296 ctx->fn(ctx->fn_ctx); 4297 } 4298 4299 free(ctx); 4300 } 4301 4302 int 4303 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 4304 { 4305 struct set_nvme_hotplug_ctx *ctx; 4306 4307 if (enabled == true && !spdk_process_is_primary()) { 4308 return -EPERM; 4309 } 4310 4311 ctx = calloc(1, sizeof(*ctx)); 4312 if (ctx == NULL) { 4313 return -ENOMEM; 4314 } 4315 4316 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 4317 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 4318 ctx->enabled = enabled; 4319 ctx->fn = cb; 4320 ctx->fn_ctx = cb_ctx; 4321 4322 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 4323 return 0; 4324 } 4325 4326 static void 4327 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 4328 struct nvme_async_probe_ctx *ctx) 4329 { 4330 struct nvme_ns *nvme_ns; 4331 struct nvme_bdev *nvme_bdev; 4332 size_t j; 4333 4334 assert(nvme_ctrlr != NULL); 4335 4336 if (ctx->names == NULL) { 4337 populate_namespaces_cb(ctx, 0, 0); 4338 return; 4339 } 4340 4341 /* 4342 * Report the new bdevs that were created in this call. 4343 * There can be more than one bdev per NVMe controller. 4344 */ 4345 j = 0; 4346 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4347 while (nvme_ns != NULL) { 4348 nvme_bdev = nvme_ns->bdev; 4349 if (j < ctx->count) { 4350 ctx->names[j] = nvme_bdev->disk.name; 4351 j++; 4352 } else { 4353 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 4354 ctx->count); 4355 populate_namespaces_cb(ctx, 0, -ERANGE); 4356 return; 4357 } 4358 4359 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4360 } 4361 4362 populate_namespaces_cb(ctx, j, 0); 4363 } 4364 4365 static int 4366 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr, 4367 struct spdk_nvme_ctrlr *new_ctrlr, 4368 struct spdk_nvme_transport_id *trid) 4369 { 4370 struct nvme_path_id *tmp_trid; 4371 4372 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4373 SPDK_ERRLOG("PCIe failover is not supported.\n"); 4374 return -ENOTSUP; 4375 } 4376 4377 /* Currently we only support failover to the same transport type. */ 4378 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 4379 return -EINVAL; 4380 } 4381 4382 /* Currently we only support failover to the same NQN. */ 4383 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 4384 return -EINVAL; 4385 } 4386 4387 /* Skip all the other checks if we've already registered this path. 
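	 * A match from spdk_nvme_transport_id_compare() means the transport type,
	 * address family, address, service ID and subsystem NQN are all identical.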
*/ 4388 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4389 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 4390 return -EEXIST; 4391 } 4392 } 4393 4394 return 0; 4395 } 4396 4397 static int 4398 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4399 struct spdk_nvme_ctrlr *new_ctrlr) 4400 { 4401 struct nvme_ns *nvme_ns; 4402 struct spdk_nvme_ns *new_ns; 4403 4404 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4405 while (nvme_ns != NULL) { 4406 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 4407 assert(new_ns != NULL); 4408 4409 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 4410 return -EINVAL; 4411 } 4412 4413 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4414 } 4415 4416 return 0; 4417 } 4418 4419 static int 4420 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4421 struct spdk_nvme_transport_id *trid) 4422 { 4423 struct nvme_path_id *new_trid, *tmp_trid; 4424 4425 new_trid = calloc(1, sizeof(*new_trid)); 4426 if (new_trid == NULL) { 4427 return -ENOMEM; 4428 } 4429 new_trid->trid = *trid; 4430 new_trid->is_failed = false; 4431 4432 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4433 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 4434 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 4435 return 0; 4436 } 4437 } 4438 4439 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 4440 return 0; 4441 } 4442 4443 /* This is the case that a secondary path is added to an existing 4444 * nvme_ctrlr for failover. After checking if it can access the same 4445 * namespaces as the primary path, it is disconnected until failover occurs. 4446 */ 4447 static int 4448 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4449 struct spdk_nvme_ctrlr *new_ctrlr, 4450 struct spdk_nvme_transport_id *trid) 4451 { 4452 int rc; 4453 4454 assert(nvme_ctrlr != NULL); 4455 4456 pthread_mutex_lock(&nvme_ctrlr->mutex); 4457 4458 rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid); 4459 if (rc != 0) { 4460 goto exit; 4461 } 4462 4463 rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr); 4464 if (rc != 0) { 4465 goto exit; 4466 } 4467 4468 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 4469 4470 exit: 4471 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4472 4473 spdk_nvme_detach(new_ctrlr); 4474 4475 return rc; 4476 } 4477 4478 static void 4479 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4480 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 4481 { 4482 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4483 struct nvme_async_probe_ctx *ctx; 4484 int rc; 4485 4486 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4487 ctx->ctrlr_attached = true; 4488 4489 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 4490 if (rc != 0) { 4491 populate_namespaces_cb(ctx, 0, rc); 4492 } 4493 } 4494 4495 static void 4496 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4497 struct spdk_nvme_ctrlr *ctrlr, 4498 const struct spdk_nvme_ctrlr_opts *opts) 4499 { 4500 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4501 struct nvme_ctrlr *nvme_ctrlr; 4502 struct nvme_async_probe_ctx *ctx; 4503 int rc; 4504 4505 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4506 ctx->ctrlr_attached = true; 4507 4508 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 4509 if (nvme_ctrlr) { 4510 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 4511 } else { 4512 rc = -ENODEV; 
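		/* A failover path can only be attached to a controller that was
		 * created earlier under the same base name; if that controller is
		 * gone by the time this callback runs, report the path as not added.
		 */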
	}

	populate_namespaces_cb(ctx, 0, rc);
}

static int
bdev_nvme_async_poll(void *arg)
{
	struct nvme_async_probe_ctx *ctx = arg;
	int rc;

	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
	if (spdk_unlikely(rc != -EAGAIN)) {
		ctx->probe_done = true;
		spdk_poller_unregister(&ctx->poller);
		if (!ctx->ctrlr_attached) {
			/* The probe is done, but no controller was attached.
			 * That means we had a failure, so report -EIO back to
			 * the caller (usually the RPC). populate_namespaces_cb()
			 * will take care of freeing the nvme_async_probe_ctx.
			 */
			populate_namespaces_cb(ctx, 0, -EIO);
		} else if (ctx->namespaces_populated) {
			/* The namespaces for the attached controller were all
			 * populated and the response was already sent to the
			 * caller (usually the RPC). So free the context here.
			 */
			free(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
				 uint32_t reconnect_delay_sec,
				 uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

int
bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		 const char *base_name,
		 const char **names,
		 uint32_t count,
		 spdk_bdev_create_nvme_fn cb_fn,
		 void *cb_ctx,
		 struct spdk_nvme_ctrlr_opts *drv_opts,
		 struct nvme_ctrlr_opts *bdev_opts,
		 bool multipath)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
4605 */ 4606 if (nvme_ctrlr_get(trid) != NULL) { 4607 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 4608 return -EEXIST; 4609 } 4610 4611 if (bdev_opts != NULL && 4612 !bdev_nvme_check_multipath_params(bdev_opts->ctrlr_loss_timeout_sec, 4613 bdev_opts->reconnect_delay_sec, 4614 bdev_opts->fast_io_fail_timeout_sec)) { 4615 return -EINVAL; 4616 } 4617 4618 ctx = calloc(1, sizeof(*ctx)); 4619 if (!ctx) { 4620 return -ENOMEM; 4621 } 4622 ctx->base_name = base_name; 4623 ctx->names = names; 4624 ctx->count = count; 4625 ctx->cb_fn = cb_fn; 4626 ctx->cb_ctx = cb_ctx; 4627 ctx->trid = *trid; 4628 4629 if (bdev_opts) { 4630 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 4631 } else { 4632 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 4633 } 4634 4635 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4636 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 4637 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4638 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 4639 free(entry); 4640 break; 4641 } 4642 } 4643 } 4644 4645 if (drv_opts) { 4646 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 4647 } else { 4648 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 4649 } 4650 4651 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 4652 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 4653 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 4654 ctx->drv_opts.disable_read_ana_log_page = true; 4655 4656 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 4657 attach_cb = connect_attach_cb; 4658 } else { 4659 attach_cb = connect_set_failover_cb; 4660 } 4661 4662 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 4663 if (ctx->probe_ctx == NULL) { 4664 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 4665 free(ctx); 4666 return -ENODEV; 4667 } 4668 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 4669 4670 return 0; 4671 } 4672 4673 int 4674 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 4675 { 4676 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4677 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 4678 struct nvme_path_id *p, *t; 4679 int rc = -ENXIO; 4680 4681 if (name == NULL || path_id == NULL) { 4682 return -EINVAL; 4683 } 4684 4685 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4686 if (nbdev_ctrlr == NULL) { 4687 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 4688 return -ENODEV; 4689 } 4690 4691 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 4692 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 4693 if (path_id->trid.trtype != 0) { 4694 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 4695 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 4696 continue; 4697 } 4698 } else { 4699 if (path_id->trid.trtype != p->trid.trtype) { 4700 continue; 4701 } 4702 } 4703 } 4704 4705 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 4706 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 4707 continue; 4708 } 4709 } 4710 4711 if (path_id->trid.adrfam != 0) { 4712 if (path_id->trid.adrfam != p->trid.adrfam) { 4713 continue; 4714 } 4715 } 4716 4717 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 4718 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 4719 continue; 
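				/* Like the fields above, an all-zero trsvcid in the caller's
				 * path_id acts as a wildcard; only fields that were actually
				 * specified narrow down which path is removed.
				 */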
4720 } 4721 } 4722 4723 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 4724 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 4725 continue; 4726 } 4727 } 4728 4729 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 4730 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 4731 continue; 4732 } 4733 } 4734 4735 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 4736 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 4737 continue; 4738 } 4739 } 4740 4741 /* If we made it here, then this path is a match! Now we need to remove it. */ 4742 if (p == nvme_ctrlr->active_path_id) { 4743 /* This is the active path in use right now. The active path is always the first in the list. */ 4744 4745 if (!TAILQ_NEXT(p, link)) { 4746 /* The current path is the only path. */ 4747 rc = _bdev_nvme_delete(nvme_ctrlr, false); 4748 } else { 4749 /* There is an alternative path. */ 4750 rc = bdev_nvme_failover(nvme_ctrlr, true); 4751 } 4752 } else { 4753 /* We are not using the specified path. */ 4754 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 4755 free(p); 4756 rc = 0; 4757 } 4758 4759 if (rc < 0 && rc != -ENXIO) { 4760 return rc; 4761 } 4762 4763 4764 } 4765 } 4766 4767 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 4768 return rc; 4769 } 4770 4771 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 4772 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4773 4774 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 4775 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4776 4777 struct discovery_entry_ctx { 4778 char name[128]; 4779 struct spdk_nvme_transport_id trid; 4780 struct spdk_nvme_ctrlr_opts drv_opts; 4781 struct spdk_nvmf_discovery_log_page_entry entry; 4782 TAILQ_ENTRY(discovery_entry_ctx) tailq; 4783 struct discovery_ctx *ctx; 4784 }; 4785 4786 struct discovery_ctx { 4787 char *name; 4788 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 4789 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 4790 void *cb_ctx; 4791 struct spdk_nvme_probe_ctx *probe_ctx; 4792 struct spdk_nvme_detach_ctx *detach_ctx; 4793 struct spdk_nvme_ctrlr *ctrlr; 4794 struct spdk_nvme_transport_id trid; 4795 struct discovery_entry_ctx *entry_ctx_in_use; 4796 struct spdk_poller *poller; 4797 struct spdk_nvme_ctrlr_opts drv_opts; 4798 struct nvme_ctrlr_opts bdev_opts; 4799 struct spdk_nvmf_discovery_log_page *log_page; 4800 TAILQ_ENTRY(discovery_ctx) tailq; 4801 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 4802 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 4803 int rc; 4804 bool wait_for_attach; 4805 uint64_t timeout_ticks; 4806 /* Denotes that the discovery service is being started. We're waiting 4807 * for the initial connection to the discovery controller to be 4808 * established and attach discovered NVM ctrlrs. 4809 */ 4810 bool initializing; 4811 /* Denotes if a discovery is currently in progress for this context. 4812 * That includes connecting to newly discovered subsystems. Used to 4813 * ensure we do not start a new discovery until an existing one is 4814 * complete. 4815 */ 4816 bool in_progress; 4817 4818 /* Denotes if another discovery is needed after the one in progress 4819 * completes. Set when we receive an AER completion while a discovery 4820 * is already in progress. 
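	 * The deferred pass is kicked off by discovery_complete() as soon as the
	 * current one finishes.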
4821 */ 4822 bool pending; 4823 4824 /* Signal to the discovery context poller that it should stop the 4825 * discovery service, including detaching from the current discovery 4826 * controller. 4827 */ 4828 bool stop; 4829 4830 struct spdk_thread *calling_thread; 4831 uint32_t index; 4832 uint32_t attach_in_progress; 4833 char *hostnqn; 4834 }; 4835 4836 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 4837 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 4838 4839 static void get_discovery_log_page(struct discovery_ctx *ctx); 4840 4841 static void 4842 free_discovery_ctx(struct discovery_ctx *ctx) 4843 { 4844 free(ctx->log_page); 4845 free(ctx->hostnqn); 4846 free(ctx->name); 4847 free(ctx); 4848 } 4849 4850 static void 4851 discovery_complete(struct discovery_ctx *ctx) 4852 { 4853 ctx->initializing = false; 4854 ctx->in_progress = false; 4855 if (ctx->pending) { 4856 ctx->pending = false; 4857 get_discovery_log_page(ctx); 4858 } 4859 } 4860 4861 static void 4862 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 4863 struct spdk_nvmf_discovery_log_page_entry *entry) 4864 { 4865 char *space; 4866 4867 trid->trtype = entry->trtype; 4868 trid->adrfam = entry->adrfam; 4869 memcpy(trid->traddr, entry->traddr, sizeof(trid->traddr)); 4870 memcpy(trid->trsvcid, entry->trsvcid, sizeof(trid->trsvcid)); 4871 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 4872 4873 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 4874 * But the log page entries typically pad them with spaces, not zeroes. 4875 * So add a NULL terminator to each of these fields at the appropriate 4876 * location. 4877 */ 4878 space = strchr(trid->traddr, ' '); 4879 if (space) { 4880 *space = 0; 4881 } 4882 space = strchr(trid->trsvcid, ' '); 4883 if (space) { 4884 *space = 0; 4885 } 4886 space = strchr(trid->subnqn, ' '); 4887 if (space) { 4888 *space = 0; 4889 } 4890 } 4891 4892 static void 4893 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 4894 { 4895 ctx->stop = true; 4896 ctx->stop_cb_fn = cb_fn; 4897 ctx->cb_ctx = cb_ctx; 4898 4899 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 4900 struct discovery_entry_ctx *entry_ctx; 4901 struct nvme_path_id path = {}; 4902 4903 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 4904 path.trid = entry_ctx->trid; 4905 bdev_nvme_delete(entry_ctx->name, &path); 4906 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 4907 free(entry_ctx); 4908 } 4909 4910 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 4911 struct discovery_entry_ctx *entry_ctx; 4912 4913 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 4914 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 4915 free(entry_ctx); 4916 } 4917 4918 free(ctx->entry_ctx_in_use); 4919 ctx->entry_ctx_in_use = NULL; 4920 } 4921 4922 static void 4923 discovery_remove_controllers(struct discovery_ctx *ctx) 4924 { 4925 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 4926 struct discovery_entry_ctx *entry_ctx, *tmp; 4927 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 4928 struct spdk_nvme_transport_id old_trid; 4929 uint64_t numrec, i; 4930 bool found; 4931 4932 numrec = from_le64(&log_page->numrec); 4933 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 4934 found = false; 4935 old_entry = &entry_ctx->entry; 4936 build_trid_from_log_page_entry(&old_trid, old_entry); 4937 for (i = 0; i < numrec; i++) { 4938 new_entry = &log_page->entries[i]; 4939 if 
(!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 4940 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 4941 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 4942 found = true; 4943 break; 4944 } 4945 } 4946 if (!found) { 4947 struct nvme_path_id path = {}; 4948 4949 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 4950 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 4951 4952 path.trid = entry_ctx->trid; 4953 bdev_nvme_delete(entry_ctx->name, &path); 4954 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 4955 free(entry_ctx); 4956 } 4957 } 4958 free(log_page); 4959 ctx->log_page = NULL; 4960 discovery_complete(ctx); 4961 } 4962 4963 static void 4964 complete_discovery_start(struct discovery_ctx *ctx, int status) 4965 { 4966 ctx->timeout_ticks = 0; 4967 ctx->rc = status; 4968 if (ctx->start_cb_fn) { 4969 ctx->start_cb_fn(ctx->cb_ctx, status); 4970 ctx->start_cb_fn = NULL; 4971 ctx->cb_ctx = NULL; 4972 } 4973 } 4974 4975 static void 4976 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 4977 { 4978 struct discovery_entry_ctx *entry_ctx = cb_ctx; 4979 struct discovery_ctx *ctx = entry_ctx->ctx; 4980 4981 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 4982 ctx->attach_in_progress--; 4983 if (ctx->attach_in_progress == 0) { 4984 complete_discovery_start(ctx, ctx->rc); 4985 if (ctx->initializing && ctx->rc != 0) { 4986 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 4987 stop_discovery(ctx, NULL, ctx->cb_ctx); 4988 } else { 4989 discovery_remove_controllers(ctx); 4990 } 4991 } 4992 } 4993 4994 static struct discovery_entry_ctx * 4995 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 4996 { 4997 struct discovery_entry_ctx *new_ctx; 4998 4999 new_ctx = calloc(1, sizeof(*new_ctx)); 5000 if (new_ctx == NULL) { 5001 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5002 return NULL; 5003 } 5004 5005 new_ctx->ctx = ctx; 5006 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 5007 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5008 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5009 return new_ctx; 5010 } 5011 5012 static void 5013 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 5014 struct spdk_nvmf_discovery_log_page *log_page) 5015 { 5016 struct discovery_ctx *ctx = cb_arg; 5017 struct discovery_entry_ctx *entry_ctx, *tmp; 5018 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5019 uint64_t numrec, i; 5020 bool found; 5021 5022 if (rc || spdk_nvme_cpl_is_error(cpl)) { 5023 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5024 return; 5025 } 5026 5027 ctx->log_page = log_page; 5028 assert(ctx->attach_in_progress == 0); 5029 numrec = from_le64(&log_page->numrec); 5030 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 5031 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5032 free(entry_ctx); 5033 } 5034 for (i = 0; i < numrec; i++) { 5035 found = false; 5036 new_entry = &log_page->entries[i]; 5037 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 5038 struct discovery_entry_ctx *new_ctx; 5039 struct spdk_nvme_transport_id trid = {}; 5040 5041 build_trid_from_log_page_entry(&trid, new_entry); 5042 new_ctx = create_discovery_entry_ctx(ctx, &trid); 5043 if (new_ctx == NULL) { 5044 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5045 break; 5046 } 5047 5048 
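			/* This entry is a referral to another discovery subsystem. Keep it
			 * on discovery_entry_ctxs so the discovery connection can be moved
			 * to it later, but do not attach it as an NVM controller.
			 */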
TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 5049 continue; 5050 } 5051 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 5052 old_entry = &entry_ctx->entry; 5053 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 5054 found = true; 5055 break; 5056 } 5057 } 5058 if (!found) { 5059 struct discovery_entry_ctx *subnqn_ctx, *new_ctx; 5060 5061 TAILQ_FOREACH(subnqn_ctx, &ctx->nvm_entry_ctxs, tailq) { 5062 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 5063 sizeof(new_entry->subnqn))) { 5064 break; 5065 } 5066 } 5067 5068 new_ctx = calloc(1, sizeof(*new_ctx)); 5069 if (new_ctx == NULL) { 5070 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5071 break; 5072 } 5073 5074 new_ctx->ctx = ctx; 5075 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 5076 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 5077 if (subnqn_ctx) { 5078 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 5079 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 5080 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5081 new_ctx->name); 5082 } else { 5083 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 5084 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 5085 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5086 new_ctx->name); 5087 } 5088 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5089 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5090 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 5091 discovery_attach_controller_done, new_ctx, 5092 &new_ctx->drv_opts, &ctx->bdev_opts, true); 5093 if (rc == 0) { 5094 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 5095 ctx->attach_in_progress++; 5096 } else { 5097 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 5098 } 5099 } 5100 } 5101 5102 if (ctx->attach_in_progress == 0) { 5103 discovery_remove_controllers(ctx); 5104 } 5105 } 5106 5107 static void 5108 get_discovery_log_page(struct discovery_ctx *ctx) 5109 { 5110 int rc; 5111 5112 assert(ctx->in_progress == false); 5113 ctx->in_progress = true; 5114 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 5115 if (rc != 0) { 5116 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5117 } 5118 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 5119 } 5120 5121 static void 5122 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5123 { 5124 struct discovery_ctx *ctx = arg; 5125 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 5126 5127 if (spdk_nvme_cpl_is_error(cpl)) { 5128 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 5129 return; 5130 } 5131 5132 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 5133 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 5134 return; 5135 } 5136 5137 DISCOVERY_INFOLOG(ctx, "got aer\n"); 5138 if (ctx->in_progress) { 5139 ctx->pending = true; 5140 return; 5141 } 5142 5143 get_discovery_log_page(ctx); 5144 } 5145 5146 static void 5147 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5148 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5149 { 5150 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5151 struct discovery_ctx *ctx; 5152 5153 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 5154 5155 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 5156 ctx->probe_ctx = 
NULL; 5157 ctx->ctrlr = ctrlr; 5158 5159 if (ctx->rc != 0) { 5160 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 5161 ctx->rc); 5162 return; 5163 } 5164 5165 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 5166 } 5167 5168 static int 5169 discovery_poller(void *arg) 5170 { 5171 struct discovery_ctx *ctx = arg; 5172 struct spdk_nvme_transport_id *trid; 5173 int rc; 5174 5175 if (ctx->detach_ctx) { 5176 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 5177 if (rc != -EAGAIN) { 5178 ctx->detach_ctx = NULL; 5179 ctx->ctrlr = NULL; 5180 } 5181 } else if (ctx->stop) { 5182 if (ctx->ctrlr != NULL) { 5183 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5184 if (rc == 0) { 5185 return SPDK_POLLER_BUSY; 5186 } 5187 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5188 } 5189 spdk_poller_unregister(&ctx->poller); 5190 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5191 assert(ctx->start_cb_fn == NULL); 5192 if (ctx->stop_cb_fn != NULL) { 5193 ctx->stop_cb_fn(ctx->cb_ctx); 5194 } 5195 free_discovery_ctx(ctx); 5196 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 5197 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5198 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5199 assert(ctx->initializing); 5200 spdk_poller_unregister(&ctx->poller); 5201 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5202 complete_discovery_start(ctx, -ETIMEDOUT); 5203 stop_discovery(ctx, NULL, NULL); 5204 free_discovery_ctx(ctx); 5205 return SPDK_POLLER_BUSY; 5206 } 5207 5208 assert(ctx->entry_ctx_in_use == NULL); 5209 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5210 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5211 trid = &ctx->entry_ctx_in_use->trid; 5212 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 5213 if (ctx->probe_ctx) { 5214 spdk_poller_unregister(&ctx->poller); 5215 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 5216 } else { 5217 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 5218 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5219 ctx->entry_ctx_in_use = NULL; 5220 } 5221 } else if (ctx->probe_ctx) { 5222 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5223 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5224 complete_discovery_start(ctx, -ETIMEDOUT); 5225 return SPDK_POLLER_BUSY; 5226 } 5227 5228 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5229 if (rc != -EAGAIN) { 5230 if (ctx->rc != 0) { 5231 assert(ctx->initializing); 5232 stop_discovery(ctx, NULL, ctx->cb_ctx); 5233 } else { 5234 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 5235 ctx->rc = rc; 5236 if (rc == 0) { 5237 get_discovery_log_page(ctx); 5238 } 5239 } 5240 } 5241 } else { 5242 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5243 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 5244 complete_discovery_start(ctx, -ETIMEDOUT); 5245 /* We need to wait until all NVM ctrlrs are attached before we stop the 5246 * discovery service to make sure we don't detach a ctrlr that is still 5247 * being attached. 
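			 * In that case the actual stop is deferred to
			 * discovery_attach_controller_done(), which calls stop_discovery()
			 * once the last attach completes and sees the non-zero ctx->rc.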
5248 */ 5249 if (ctx->attach_in_progress == 0) { 5250 stop_discovery(ctx, NULL, ctx->cb_ctx); 5251 return SPDK_POLLER_BUSY; 5252 } 5253 } 5254 5255 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 5256 if (rc < 0) { 5257 spdk_poller_unregister(&ctx->poller); 5258 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5259 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5260 ctx->entry_ctx_in_use = NULL; 5261 5262 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5263 if (rc != 0) { 5264 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5265 ctx->ctrlr = NULL; 5266 } 5267 } 5268 } 5269 5270 return SPDK_POLLER_BUSY; 5271 } 5272 5273 static void 5274 start_discovery_poller(void *arg) 5275 { 5276 struct discovery_ctx *ctx = arg; 5277 5278 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 5279 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5280 } 5281 5282 int 5283 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 5284 const char *base_name, 5285 struct spdk_nvme_ctrlr_opts *drv_opts, 5286 struct nvme_ctrlr_opts *bdev_opts, 5287 uint64_t attach_timeout, 5288 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 5289 { 5290 struct discovery_ctx *ctx; 5291 struct discovery_entry_ctx *discovery_entry_ctx; 5292 5293 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 5294 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5295 if (strcmp(ctx->name, base_name) == 0) { 5296 return -EEXIST; 5297 } 5298 5299 if (ctx->entry_ctx_in_use != NULL) { 5300 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 5301 return -EEXIST; 5302 } 5303 } 5304 5305 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 5306 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 5307 return -EEXIST; 5308 } 5309 } 5310 } 5311 5312 ctx = calloc(1, sizeof(*ctx)); 5313 if (ctx == NULL) { 5314 return -ENOMEM; 5315 } 5316 5317 ctx->name = strdup(base_name); 5318 if (ctx->name == NULL) { 5319 free_discovery_ctx(ctx); 5320 return -ENOMEM; 5321 } 5322 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5323 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5324 ctx->bdev_opts.from_discovery_service = true; 5325 ctx->calling_thread = spdk_get_thread(); 5326 ctx->start_cb_fn = cb_fn; 5327 ctx->cb_ctx = cb_ctx; 5328 ctx->initializing = true; 5329 if (ctx->start_cb_fn) { 5330 /* We can use this when dumping json to denote if this RPC parameter 5331 * was specified or not. 
5332 */ 5333 ctx->wait_for_attach = true; 5334 } 5335 if (attach_timeout != 0) { 5336 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 5337 spdk_get_ticks_hz() / 1000ull; 5338 } 5339 TAILQ_INIT(&ctx->nvm_entry_ctxs); 5340 TAILQ_INIT(&ctx->discovery_entry_ctxs); 5341 memcpy(&ctx->trid, trid, sizeof(*trid)); 5342 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 5343 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 5344 if (ctx->hostnqn == NULL) { 5345 free_discovery_ctx(ctx); 5346 return -ENOMEM; 5347 } 5348 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 5349 if (discovery_entry_ctx == NULL) { 5350 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5351 free_discovery_ctx(ctx); 5352 return -ENOMEM; 5353 } 5354 5355 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 5356 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 5357 return 0; 5358 } 5359 5360 int 5361 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5362 { 5363 struct discovery_ctx *ctx; 5364 5365 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5366 if (strcmp(name, ctx->name) == 0) { 5367 if (ctx->stop) { 5368 return -EALREADY; 5369 } 5370 /* If we're still starting the discovery service and ->rc is non-zero, we're 5371 * going to stop it as soon as we can 5372 */ 5373 if (ctx->initializing && ctx->rc != 0) { 5374 return -EALREADY; 5375 } 5376 stop_discovery(ctx, cb_fn, cb_ctx); 5377 return 0; 5378 } 5379 } 5380 5381 return -ENOENT; 5382 } 5383 5384 static int 5385 bdev_nvme_library_init(void) 5386 { 5387 g_bdev_nvme_init_thread = spdk_get_thread(); 5388 5389 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 5390 bdev_nvme_destroy_poll_group_cb, 5391 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 5392 5393 return 0; 5394 } 5395 5396 static void 5397 bdev_nvme_fini_destruct_ctrlrs(void) 5398 { 5399 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5400 struct nvme_ctrlr *nvme_ctrlr; 5401 5402 pthread_mutex_lock(&g_bdev_nvme_mutex); 5403 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 5404 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5405 pthread_mutex_lock(&nvme_ctrlr->mutex); 5406 if (nvme_ctrlr->destruct) { 5407 /* This controller's destruction was already started 5408 * before the application started shutting down 5409 */ 5410 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5411 continue; 5412 } 5413 nvme_ctrlr->destruct = true; 5414 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5415 5416 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 5417 nvme_ctrlr); 5418 } 5419 } 5420 5421 g_bdev_nvme_module_finish = true; 5422 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5423 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5424 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 5425 spdk_bdev_module_fini_done(); 5426 return; 5427 } 5428 5429 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5430 } 5431 5432 static void 5433 check_discovery_fini(void *arg) 5434 { 5435 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5436 bdev_nvme_fini_destruct_ctrlrs(); 5437 } 5438 } 5439 5440 static void 5441 bdev_nvme_library_fini(void) 5442 { 5443 struct nvme_probe_skip_entry *entry, *entry_tmp; 5444 struct discovery_ctx *ctx; 5445 5446 spdk_poller_unregister(&g_hotplug_poller); 5447 free(g_hotplug_probe_ctx); 5448 g_hotplug_probe_ctx = NULL; 5449 5450 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 5451 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5452 
free(entry); 5453 } 5454 5455 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 5456 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5457 bdev_nvme_fini_destruct_ctrlrs(); 5458 } else { 5459 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5460 stop_discovery(ctx, check_discovery_fini, NULL); 5461 } 5462 } 5463 } 5464 5465 static void 5466 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 5467 { 5468 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5469 struct spdk_bdev *bdev = bdev_io->bdev; 5470 struct spdk_dif_ctx dif_ctx; 5471 struct spdk_dif_error err_blk = {}; 5472 int rc; 5473 5474 rc = spdk_dif_ctx_init(&dif_ctx, 5475 bdev->blocklen, bdev->md_len, bdev->md_interleave, 5476 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 5477 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 5478 if (rc != 0) { 5479 SPDK_ERRLOG("Initialization of DIF context failed\n"); 5480 return; 5481 } 5482 5483 if (bdev->md_interleave) { 5484 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5485 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5486 } else { 5487 struct iovec md_iov = { 5488 .iov_base = bdev_io->u.bdev.md_buf, 5489 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 5490 }; 5491 5492 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5493 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5494 } 5495 5496 if (rc != 0) { 5497 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 5498 err_blk.err_type, err_blk.err_offset); 5499 } else { 5500 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 5501 } 5502 } 5503 5504 static void 5505 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5506 { 5507 struct nvme_bdev_io *bio = ref; 5508 5509 if (spdk_nvme_cpl_is_success(cpl)) { 5510 /* Run PI verification for read data buffer. */ 5511 bdev_nvme_verify_pi_error(bio); 5512 } 5513 5514 /* Return original completion status */ 5515 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5516 } 5517 5518 static void 5519 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5520 { 5521 struct nvme_bdev_io *bio = ref; 5522 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5523 int ret; 5524 5525 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 5526 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 5527 cpl->status.sct, cpl->status.sc); 5528 5529 /* Save completion status to use after verifying PI error. */ 5530 bio->cpl = *cpl; 5531 5532 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 5533 /* Read without PI checking to verify PI error. */ 5534 ret = bdev_nvme_no_pi_readv(bio, 5535 bdev_io->u.bdev.iovs, 5536 bdev_io->u.bdev.iovcnt, 5537 bdev_io->u.bdev.md_buf, 5538 bdev_io->u.bdev.num_blocks, 5539 bdev_io->u.bdev.offset_blocks); 5540 if (ret == 0) { 5541 return; 5542 } 5543 } 5544 } 5545 5546 bdev_nvme_io_complete_nvme_status(bio, cpl); 5547 } 5548 5549 static void 5550 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5551 { 5552 struct nvme_bdev_io *bio = ref; 5553 5554 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5555 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 5556 cpl->status.sct, cpl->status.sc); 5557 /* Run PI verification for write data buffer if PI error is detected. 
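		 * The verification result is only logged; the original completion
		 * status (cpl) is still what gets reported back to the bdev layer.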
*/ 5558 bdev_nvme_verify_pi_error(bio); 5559 } 5560 5561 bdev_nvme_io_complete_nvme_status(bio, cpl); 5562 } 5563 5564 static void 5565 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5566 { 5567 struct nvme_bdev_io *bio = ref; 5568 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5569 5570 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 5571 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 5572 */ 5573 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 5574 5575 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5576 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 5577 cpl->status.sct, cpl->status.sc); 5578 /* Run PI verification for zone append data buffer if PI error is detected. */ 5579 bdev_nvme_verify_pi_error(bio); 5580 } 5581 5582 bdev_nvme_io_complete_nvme_status(bio, cpl); 5583 } 5584 5585 static void 5586 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5587 { 5588 struct nvme_bdev_io *bio = ref; 5589 5590 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5591 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 5592 cpl->status.sct, cpl->status.sc); 5593 /* Run PI verification for compare data buffer if PI error is detected. */ 5594 bdev_nvme_verify_pi_error(bio); 5595 } 5596 5597 bdev_nvme_io_complete_nvme_status(bio, cpl); 5598 } 5599 5600 static void 5601 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5602 { 5603 struct nvme_bdev_io *bio = ref; 5604 5605 /* Compare operation completion */ 5606 if (!bio->first_fused_completed) { 5607 /* Save compare result for write callback */ 5608 bio->cpl = *cpl; 5609 bio->first_fused_completed = true; 5610 return; 5611 } 5612 5613 /* Write operation completion */ 5614 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 5615 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 5616 * complete the IO with the compare operation's status. 
5617 */ 5618 if (!spdk_nvme_cpl_is_error(cpl)) { 5619 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 5620 } 5621 5622 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5623 } else { 5624 bdev_nvme_io_complete_nvme_status(bio, cpl); 5625 } 5626 } 5627 5628 static void 5629 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 5630 { 5631 struct nvme_bdev_io *bio = ref; 5632 5633 bdev_nvme_io_complete_nvme_status(bio, cpl); 5634 } 5635 5636 static int 5637 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 5638 { 5639 switch (desc->zs) { 5640 case SPDK_NVME_ZONE_STATE_EMPTY: 5641 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 5642 break; 5643 case SPDK_NVME_ZONE_STATE_IOPEN: 5644 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 5645 break; 5646 case SPDK_NVME_ZONE_STATE_EOPEN: 5647 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 5648 break; 5649 case SPDK_NVME_ZONE_STATE_CLOSED: 5650 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 5651 break; 5652 case SPDK_NVME_ZONE_STATE_RONLY: 5653 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 5654 break; 5655 case SPDK_NVME_ZONE_STATE_FULL: 5656 info->state = SPDK_BDEV_ZONE_STATE_FULL; 5657 break; 5658 case SPDK_NVME_ZONE_STATE_OFFLINE: 5659 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 5660 break; 5661 default: 5662 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 5663 return -EIO; 5664 } 5665 5666 info->zone_id = desc->zslba; 5667 info->write_pointer = desc->wp; 5668 info->capacity = desc->zcap; 5669 5670 return 0; 5671 } 5672 5673 static void 5674 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 5675 { 5676 struct nvme_bdev_io *bio = ref; 5677 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5678 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 5679 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 5680 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 5681 uint64_t max_zones_per_buf, i; 5682 uint32_t zone_report_bufsize; 5683 struct spdk_nvme_ns *ns; 5684 struct spdk_nvme_qpair *qpair; 5685 int ret; 5686 5687 if (spdk_nvme_cpl_is_error(cpl)) { 5688 goto out_complete_io_nvme_cpl; 5689 } 5690 5691 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 5692 ret = -ENXIO; 5693 goto out_complete_io_ret; 5694 } 5695 5696 ns = bio->io_path->nvme_ns->ns; 5697 qpair = bio->io_path->qpair->qpair; 5698 5699 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 5700 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 5701 sizeof(bio->zone_report_buf->descs[0]); 5702 5703 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 5704 ret = -EINVAL; 5705 goto out_complete_io_ret; 5706 } 5707 5708 if (!bio->zone_report_buf->nr_zones) { 5709 ret = -EINVAL; 5710 goto out_complete_io_ret; 5711 } 5712 5713 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 5714 ret = fill_zone_from_report(&info[bio->handled_zones], 5715 &bio->zone_report_buf->descs[i]); 5716 if (ret) { 5717 goto out_complete_io_ret; 5718 } 5719 bio->handled_zones++; 5720 } 5721 5722 if (bio->handled_zones < zones_to_copy) { 5723 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 5724 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 5725 5726 memset(bio->zone_report_buf, 0, zone_report_bufsize); 5727 ret = spdk_nvme_zns_report_zones(ns, qpair, 5728 bio->zone_report_buf, zone_report_bufsize, 5729 slba, SPDK_NVME_ZRA_LIST_ALL, true, 5730 bdev_nvme_get_zone_info_done, 
bio); 5731 if (!ret) { 5732 return; 5733 } else { 5734 goto out_complete_io_ret; 5735 } 5736 } 5737 5738 out_complete_io_nvme_cpl: 5739 free(bio->zone_report_buf); 5740 bio->zone_report_buf = NULL; 5741 bdev_nvme_io_complete_nvme_status(bio, cpl); 5742 return; 5743 5744 out_complete_io_ret: 5745 free(bio->zone_report_buf); 5746 bio->zone_report_buf = NULL; 5747 bdev_nvme_io_complete(bio, ret); 5748 } 5749 5750 static void 5751 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 5752 { 5753 struct nvme_bdev_io *bio = ref; 5754 5755 bdev_nvme_io_complete_nvme_status(bio, cpl); 5756 } 5757 5758 static void 5759 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 5760 { 5761 struct nvme_bdev_io *bio = ctx; 5762 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5763 const struct spdk_nvme_cpl *cpl = &bio->cpl; 5764 struct nvme_bdev_channel *nbdev_ch; 5765 struct nvme_ctrlr *nvme_ctrlr; 5766 const struct spdk_nvme_ctrlr_data *cdata; 5767 uint64_t delay_ms; 5768 5769 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 5770 5771 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 5772 goto complete; 5773 } 5774 5775 if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 && 5776 bio->retry_count >= g_opts.bdev_retry_count)) { 5777 goto complete; 5778 } 5779 5780 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 5781 nvme_ctrlr = bio->io_path->qpair->ctrlr; 5782 5783 if (spdk_nvme_cpl_is_path_error(cpl) || 5784 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 5785 !nvme_ctrlr_is_available(nvme_ctrlr)) { 5786 delay_ms = 0; 5787 } else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) { 5788 goto complete; 5789 } else { 5790 bio->retry_count++; 5791 5792 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 5793 5794 if (cpl->status.crd != 0) { 5795 delay_ms = cdata->crdt[cpl->status.crd] * 100; 5796 } else { 5797 delay_ms = 0; 5798 } 5799 } 5800 5801 if (any_ctrlr_may_become_available(nbdev_ch)) { 5802 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 5803 return; 5804 } 5805 5806 complete: 5807 bio->retry_count = 0; 5808 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 5809 } 5810 5811 static void 5812 bdev_nvme_abort_complete(void *ctx) 5813 { 5814 struct nvme_bdev_io *bio = ctx; 5815 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5816 5817 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 5818 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5819 } else { 5820 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5821 } 5822 } 5823 5824 static void 5825 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 5826 { 5827 struct nvme_bdev_io *bio = ref; 5828 5829 bio->cpl = *cpl; 5830 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 5831 } 5832 5833 static void 5834 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 5835 { 5836 struct nvme_bdev_io *bio = ref; 5837 5838 bio->cpl = *cpl; 5839 spdk_thread_send_msg(bio->orig_thread, 5840 bdev_nvme_admin_passthru_complete_nvme_status, bio); 5841 } 5842 5843 static void 5844 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 5845 { 5846 struct nvme_bdev_io *bio = ref; 5847 struct iovec *iov; 5848 5849 bio->iov_offset = sgl_offset; 5850 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 5851 iov = &bio->iovs[bio->iovpos]; 5852 if (bio->iov_offset < iov->iov_len) { 5853 break; 5854 } 5855 5856 bio->iov_offset -= iov->iov_len; 5857 } 5858 } 5859 5860 static int 
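/* SGL element callback used by the queued I/O submission paths: return the
 * current iovec segment, adjusted by any offset applied in the matching
 * reset_sgl callback, and advance the cursor for the next call.
 */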
static int
bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
{
	struct nvme_bdev_io *bio = ref;
	struct iovec *iov;

	assert(bio->iovpos < bio->iovcnt);

	iov = &bio->iovs[bio->iovpos];

	*address = iov->iov_base;
	*length = iov->iov_len;

	if (bio->iov_offset) {
		assert(bio->iov_offset <= iov->iov_len);
		*address += bio->iov_offset;
		*length -= bio->iov_offset;
	}

	bio->iov_offset += *length;
	if (bio->iov_offset == iov->iov_len) {
		bio->iovpos++;
		bio->iov_offset = 0;
	}

	return 0;
}

static void
bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
{
	struct nvme_bdev_io *bio = ref;
	struct iovec *iov;

	bio->fused_iov_offset = sgl_offset;
	for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
		iov = &bio->fused_iovs[bio->fused_iovpos];
		if (bio->fused_iov_offset < iov->iov_len) {
			break;
		}

		bio->fused_iov_offset -= iov->iov_len;
	}
}

static int
bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
{
	struct nvme_bdev_io *bio = ref;
	struct iovec *iov;

	assert(bio->fused_iovpos < bio->fused_iovcnt);

	iov = &bio->fused_iovs[bio->fused_iovpos];

	*address = iov->iov_base;
	*length = iov->iov_len;

	if (bio->fused_iov_offset) {
		assert(bio->fused_iov_offset <= iov->iov_len);
		*address += bio->fused_iov_offset;
		*length -= bio->fused_iov_offset;
	}

	bio->fused_iov_offset += *length;
	if (bio->fused_iov_offset == iov->iov_len) {
		bio->fused_iovpos++;
		bio->fused_iov_offset = 0;
	}

	return 0;
}

static int
bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		      void *md, uint64_t lba_count, uint64_t lba)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
					    bio->io_path->qpair->qpair,
					    lba, lba_count,
					    bdev_nvme_no_pi_readv_done, bio, 0,
					    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
					    md, 0, 0);

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
		struct spdk_bdev_ext_io_opts *ext_opts)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (ext_opts) {
		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
		bio->ext_opts.memory_domain = ext_opts->memory_domain;
		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
		bio->ext_opts.io_flags = flags;
		bio->ext_opts.metadata = md;

		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
						bdev_nvme_readv_done, bio,
						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						&bio->ext_opts);
	} else if (iovcnt == 1) {
		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
						   lba_count,
						   bdev_nvme_readv_done, bio,
						   flags,
						   0, 0);
	} else {
		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
						    bdev_nvme_readv_done, bio, flags,
						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						    md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		 void *md, uint64_t lba_count, uint64_t lba,
		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (ext_opts) {
		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
		bio->ext_opts.memory_domain = ext_opts->memory_domain;
		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
		bio->ext_opts.io_flags = flags;
		bio->ext_opts.metadata = md;

		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
						 bdev_nvme_writev_done, bio,
						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						 &bio->ext_opts);
	} else if (iovcnt == 1) {
		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
						    lba_count,
						    bdev_nvme_writev_done, bio,
						    flags,
						    0, 0);
	} else {
		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
						     bdev_nvme_writev_done, bio, flags,
						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						     md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		       void *md, uint64_t lba_count, uint64_t zslba,
		       uint32_t flags)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
		      lba_count, zslba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (iovcnt == 1) {
		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
						       lba_count,
						       bdev_nvme_zone_appendv_done, bio,
						       flags,
						       0, 0);
	} else {
		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
							bdev_nvme_zone_appendv_done, bio, flags,
							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
							md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
	}
	return rc;
}
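/* Submit an NVMe Compare command over the caller's SGL. The comparison is
 * performed by the controller; a miscompare is reported through the NVMe
 * completion status, while the return code only reflects submission.
 */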
static int
bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		   void *md, uint64_t lba_count, uint64_t lba,
		   uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
					       bio->io_path->qpair->qpair,
					       lba, lba_count,
					       bdev_nvme_comparev_done, bio, flags,
					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
					       md, 0, 0);

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
			      struct iovec *write_iov, int write_iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = cmp_iov;
	bio->iovcnt = cmp_iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;
	bio->fused_iovs = write_iov;
	bio->fused_iovcnt = write_iovcnt;
	bio->fused_iovpos = 0;
	bio->fused_iov_offset = 0;

	if (bdev_io->num_retries == 0) {
		bio->first_fused_submitted = false;
		bio->first_fused_completed = false;
	}

	if (!bio->first_fused_submitted) {
		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		memset(&bio->cpl, 0, sizeof(bio->cpl));

		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
						       bdev_nvme_comparev_and_writev_done, bio, flags,
						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
		if (rc == 0) {
			bio->first_fused_submitted = true;
			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		} else {
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
			}
			return rc;
		}
	}

	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;

	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
					     bdev_nvme_comparev_and_writev_done, bio, flags,
					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("write failed: rc = %d\n", rc);
		rc = 0;
	}

	return rc;
}
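/* Translate a bdev unmap into a single NVMe Dataset Management (deallocate)
 * command. The LBA range is split into ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each; requests that
 * would need more than SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges are
 * rejected rather than split across multiple commands. For example, a request
 * that needs three ranges produces two max-size ranges plus one final range
 * covering the remainder.
 */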
static int
bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64;
	uint16_t num_ranges;
	int rc;

	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
		return -EINVAL;
	}
	num_ranges = (uint16_t)num_ranges_u64;

	offset = offset_blocks;
	remaining = num_blocks;
	range = &dsm_ranges[0];

	/* Fill max-size ranges until the remaining blocks fit into one range */
	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
		range->attributes.raw = 0;
		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range->starting_lba = offset;

		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range++;
	}

	/* Final range describes the remaining blocks */
	range->attributes.raw = 0;
	range->length = remaining;
	range->starting_lba = offset;

	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
						 bio->io_path->qpair->qpair,
						 SPDK_NVME_DSM_ATTR_DEALLOCATE,
						 dsm_ranges, num_ranges,
						 bdev_nvme_queued_done, bio);

	return rc;
}

static int
bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	if (num_blocks > UINT16_MAX + 1) {
		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
		return -EINVAL;
	}

	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
					     bio->io_path->qpair->qpair,
					     offset_blocks, num_blocks,
					     bdev_nvme_queued_done, bio,
					     0);
}

static int
bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);

	if (zone_id % zone_size != 0) {
		return -EINVAL;
	}

	if (num_zones > total_zones || !num_zones) {
		return -EINVAL;
	}

	assert(!bio->zone_report_buf);
	bio->zone_report_buf = calloc(1, zone_report_bufsize);
	if (!bio->zone_report_buf) {
		return -ENOMEM;
	}

	bio->handled_zones = 0;

	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
					  bdev_nvme_get_zone_info_done, bio);
}

static int
bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
			  enum spdk_bdev_zone_action action)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;

	switch (action) {
	case SPDK_BDEV_ZONE_CLOSE:
		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_FINISH:
		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
						 bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OPEN:
		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
					       bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_RESET:
		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OFFLINE:
		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
						  bdev_nvme_zone_management_done, bio);
	default:
		return -EINVAL;
	}
}
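/* Admin passthrough is submitted to a controller rather than to a namespace
 * I/O qpair, so walk the channel's I/O path list and pick the first controller
 * that is currently available; if no submission succeeds, the request is
 * completed with the last error.
 */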
static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Choose the first available ctrlr. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* We should skip any unavailable nvme_ctrlr rather than checking
		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		bio->io_path = io_path;
		bio->orig_thread = spdk_get_thread();

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_passthru_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
						  (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}
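/* Abort handling: first try to cancel the target I/O while it is still sitting
 * on the channel's retry list; if it has already been submitted, issue NVMe
 * Abort commands along the channel's I/O paths, trying each I/O qpair first
 * and then the controller's admin queue.
 */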
static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *bdev_io_to_abort;
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	int rc = 0;

	bio->orig_thread = spdk_get_thread();

	/* Traverse the retry_io_list first. */
	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			spdk_bdev_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);

			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
			return;
		}
	}

	/* Even admin commands were submitted only to nvme_ctrlrs that are on some io_path,
	 * so traverse the io_path list not only for I/O commands but also for admin commands.
	 */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
		if (rc == -ENOENT) {
			/* If no command was found in the I/O qpair, the target command may be
			 * an admin command.
			 */
			rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);
		}

		if (rc != -ENOENT) {
			break;
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}
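/* JSON configuration dump helpers. Each helper emits one RPC invocation
 * ("bdev_nvme_set_options", "bdev_nvme_start_discovery",
 * "bdev_nvme_attach_controller", "bdev_nvme_set_hotplug") so that saving and
 * replaying the configuration reproduces the current state of the module.
 */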
static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		bdev_nvme_discovery_config_json(w, ctx);
	}

	/* Dump the hotplug configuration last to give all NVMe bdevs a chance to be
	 * constructed before the hotplug poller is enabled.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);

	spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path);

	spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path));

	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)