/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Tracks whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Tracks whether the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Number of zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
};
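/*
 * The initializers above are only built-in defaults. A running application typically
 * overrides them before attaching any controller (for example via the
 * bdev_nvme_set_options RPC), so the effective values may differ from the ones listed
 * here.
 */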
#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
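/*
 * The namespaces of a controller are kept in a red-black tree keyed by namespace ID
 * (nvme_ns_cmp() above), so nvme_ctrlr_get_ns() below is an O(log n) lookup and
 * nvme_ctrlr_get_first/next_active_ns() walk the tree in ascending nsid order.
 */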
struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}
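/*
 * Lookup helpers: the global g_nvme_bdev_ctrlrs list is protected by g_bdev_nvme_mutex.
 * nvme_ctrlr_get() above matches a controller by comparing the given transport ID
 * against each controller's currently active path.
 */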
struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}
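/*
 * Controller teardown: nvme_ctrlr_delete() below stops the admin-queue poller, then
 * detaches the controller asynchronously, polling spdk_nvme_detach_poll_async() every
 * 1 ms via nvme_detach_poller() until it stops returning -EAGAIN, and finally frees
 * the nvme_ctrlr in _nvme_ctrlr_delete().
 */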
static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}
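/*
 * I/O path management: each nvme_bdev_channel keeps one nvme_io_path per namespace of
 * the bdev. Adding a path takes an I/O channel of the owning nvme_ctrlr (which provides
 * the qpair used for submissions); deleting a path releases that channel again.
 */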
static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	nbdev_ch->current_io_path = NULL;

	return 0;
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;

	nbdev_ch->current_io_path = NULL;

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	free(io_path);
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}
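/*
 * Path eligibility checks: a path is usable only if its qpair is connected with no
 * failure reason, no reset is in progress on its ctrlr channel, the controller's admin
 * queue is healthy, and the namespace ANA state is OPTIMIZED or NON_OPTIMIZED and not
 * currently being updated.
 */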
static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_io_path_is_connected(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(io_path->qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) !=
	    SPDK_NVME_QPAIR_FAILURE_NONE) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_failed(struct nvme_io_path *io_path)
{
	struct nvme_ctrlr *nvme_ctrlr;

	nvme_ctrlr = io_path->qpair->ctrlr;

	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	return true;
}
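/*
 * Path selection: bdev_nvme_find_io_path() caches the chosen path per channel. With the
 * ACTIVE_PASSIVE policy the cached path is reused until it is invalidated; otherwise
 * bdev_nvme_find_next_io_path() round-robins through the paths, preferring an OPTIMIZED
 * ANA state and falling back to a NON_OPTIMIZED one.
 */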
/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	next_path = STAILQ_NEXT(prev_path, stailq);
	if (next_path != NULL) {
		return next_path;
	} else {
		return STAILQ_FIRST(&nbdev_ch->io_path_list);
	}
}

static struct nvme_io_path *
bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_io_path *prev)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, prev);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_connected(io_path) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	/* We come here only if there is no optimized path. Cache even non_optimized
	 * path for load balance across multiple non_optimized paths.
	 */
	nbdev_ch->current_io_path = non_optimized;
	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *non_optimized = NULL;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			nbdev_ch->current_io_path = io_path;
			return io_path;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (non_optimized == NULL) {
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_unlikely(nbdev_ch->current_io_path == NULL)) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	}

	if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) {
		return nbdev_ch->current_io_path;
	} else {
		return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path);
	}
}
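/*
 * Retry machinery: failed I/Os are parked on nbdev_ch->retry_io_list, ordered by their
 * retry_ticks deadline. bdev_nvme_retry_ios() resubmits the expired entries and re-arms
 * retry_io_poller for the next pending deadline, while bdev_nvme_queue_retry_io()
 * inserts new entries at the right position to keep the list sorted.
 */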
/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_io_path_is_connected(io_path) ||
		    !nvme_io_path_is_failed(io_path)) {
			return true;
		}
	}

	return false;
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_submit_request(ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}
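/*
 * Completion policy for I/O errors: commands that failed with DNR set, were aborted by
 * request, or have exhausted g_opts.bdev_retry_count complete immediately. Path and ANA
 * errors clear the cached I/O path and retry without delay (reading the ANA log page if
 * needed); other retryable errors honor the controller's Command Retry Delay, i.e.
 * delay_ms = cdata->crdt[cpl->status.crd] * 100, since the CRDT fields are in 100 ms units.
 */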
static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		goto complete;
	}

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	assert(bio->io_path != NULL);
	nvme_ctrlr = bio->io_path->qpair->ctrlr;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(bio->io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		nbdev_ch->current_io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				bio->io_path->nvme_ns->ana_state_updating = true;
			}
		}
		delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			delay_ms = 0;
		}
	}

	if (any_io_path_may_become_available(nbdev_ch)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

		nbdev_ch->current_io_path = NULL;

		if (any_io_path_may_become_available(nbdev_ch)) {
			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
			return;
		}

	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
1124 /* fallthrough */ 1125 default: 1126 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1127 break; 1128 } 1129 1130 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1131 } 1132 1133 static void 1134 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1135 { 1136 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1137 1138 pthread_mutex_lock(&nvme_ctrlr->mutex); 1139 1140 assert(nvme_ctrlr->io_path_cache_clearing == true); 1141 nvme_ctrlr->io_path_cache_clearing = false; 1142 1143 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1144 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1145 return; 1146 } 1147 1148 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1149 1150 nvme_ctrlr_unregister(nvme_ctrlr); 1151 } 1152 1153 static void 1154 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1155 { 1156 struct nvme_io_path *io_path; 1157 1158 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1159 io_path->nbdev_ch->current_io_path = NULL; 1160 } 1161 } 1162 1163 static void 1164 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1165 { 1166 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1167 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1168 1169 assert(ctrlr_ch->qpair != NULL); 1170 1171 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1172 1173 spdk_for_each_channel_continue(i, 0); 1174 } 1175 1176 static void 1177 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1178 { 1179 pthread_mutex_lock(&nvme_ctrlr->mutex); 1180 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1181 nvme_ctrlr->io_path_cache_clearing) { 1182 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1183 return; 1184 } 1185 1186 nvme_ctrlr->io_path_cache_clearing = true; 1187 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1188 1189 spdk_for_each_channel(nvme_ctrlr, 1190 bdev_nvme_clear_io_path_cache, 1191 NULL, 1192 bdev_nvme_clear_io_path_caches_done); 1193 } 1194 1195 static struct nvme_qpair * 1196 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1197 { 1198 struct nvme_qpair *nvme_qpair; 1199 1200 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1201 if (nvme_qpair->qpair == qpair) { 1202 break; 1203 } 1204 } 1205 1206 return nvme_qpair; 1207 } 1208 1209 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1210 1211 static void 1212 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1213 { 1214 struct nvme_poll_group *group = poll_group_ctx; 1215 struct nvme_qpair *nvme_qpair; 1216 struct nvme_ctrlr_channel *ctrlr_ch; 1217 1218 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1219 if (nvme_qpair == NULL) { 1220 return; 1221 } 1222 1223 if (nvme_qpair->qpair != NULL) { 1224 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1225 nvme_qpair->qpair = NULL; 1226 } 1227 1228 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1229 1230 ctrlr_ch = nvme_qpair->ctrlr_ch; 1231 1232 if (ctrlr_ch != NULL) { 1233 if (ctrlr_ch->reset_iter != NULL) { 1234 /* If we are already in a full reset sequence, we do not have 1235 * to restart it. Just move to the next ctrlr_channel. 1236 */ 1237 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1238 qpair); 1239 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 1240 ctrlr_ch->reset_iter = NULL; 1241 } else { 1242 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1243 SPDK_NOTICELOG("qpair %p was disconnected and freed. 
reset controller.\n", qpair); 1244 bdev_nvme_failover(nvme_qpair->ctrlr, false); 1245 } 1246 } else { 1247 /* In this case, ctrlr_channel is already deleted. */ 1248 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair); 1249 nvme_qpair_delete(nvme_qpair); 1250 } 1251 } 1252 1253 static void 1254 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1255 { 1256 struct nvme_qpair *nvme_qpair; 1257 1258 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1259 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1260 continue; 1261 } 1262 1263 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1264 SPDK_NVME_QPAIR_FAILURE_NONE) { 1265 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1266 } 1267 } 1268 } 1269 1270 static int 1271 bdev_nvme_poll(void *arg) 1272 { 1273 struct nvme_poll_group *group = arg; 1274 int64_t num_completions; 1275 1276 if (group->collect_spin_stat && group->start_ticks == 0) { 1277 group->start_ticks = spdk_get_ticks(); 1278 } 1279 1280 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1281 bdev_nvme_disconnected_qpair_cb); 1282 if (group->collect_spin_stat) { 1283 if (num_completions > 0) { 1284 if (group->end_ticks != 0) { 1285 group->spin_ticks += (group->end_ticks - group->start_ticks); 1286 group->end_ticks = 0; 1287 } 1288 group->start_ticks = 0; 1289 } else { 1290 group->end_ticks = spdk_get_ticks(); 1291 } 1292 } 1293 1294 if (spdk_unlikely(num_completions < 0)) { 1295 bdev_nvme_check_io_qpairs(group); 1296 } 1297 1298 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1299 } 1300 1301 static int bdev_nvme_poll_adminq(void *arg); 1302 1303 static void 1304 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1305 { 1306 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1307 1308 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1309 nvme_ctrlr, new_period_us); 1310 } 1311 1312 static int 1313 bdev_nvme_poll_adminq(void *arg) 1314 { 1315 int32_t rc; 1316 struct nvme_ctrlr *nvme_ctrlr = arg; 1317 nvme_ctrlr_disconnected_cb disconnected_cb; 1318 1319 assert(nvme_ctrlr != NULL); 1320 1321 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1322 if (rc < 0) { 1323 disconnected_cb = nvme_ctrlr->disconnected_cb; 1324 nvme_ctrlr->disconnected_cb = NULL; 1325 1326 if (rc == -ENXIO && disconnected_cb != NULL) { 1327 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1328 g_opts.nvme_adminq_poll_period_us); 1329 disconnected_cb(nvme_ctrlr); 1330 } else { 1331 bdev_nvme_failover(nvme_ctrlr, false); 1332 } 1333 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1334 SPDK_NVME_QPAIR_FAILURE_NONE) { 1335 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1336 } 1337 1338 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1339 } 1340 1341 static void 1342 _bdev_nvme_unregister_dev_cb(void *io_device) 1343 { 1344 struct nvme_bdev *nvme_disk = io_device; 1345 1346 free(nvme_disk->disk.name); 1347 free(nvme_disk); 1348 } 1349 1350 static int 1351 bdev_nvme_destruct(void *ctx) 1352 { 1353 struct nvme_bdev *nvme_disk = ctx; 1354 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1355 1356 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1357 1358 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1359 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1360 1361 nvme_ns->bdev = NULL; 1362 1363 assert(nvme_ns->id > 0); 1364 1365 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1366 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1367 1368 nvme_ctrlr_release(nvme_ns->ctrlr); 1369 free(nvme_ns); 1370 } else { 1371 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1372 } 1373 } 1374 1375 pthread_mutex_lock(&g_bdev_nvme_mutex); 1376 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1377 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1378 1379 spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb); 1380 1381 return 0; 1382 } 1383 1384 static int 1385 bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 1386 { 1387 bdev_nvme_io_complete(bio, 0); 1388 1389 return 0; 1390 } 1391 1392 static int 1393 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1394 { 1395 struct nvme_ctrlr *nvme_ctrlr; 1396 struct spdk_nvme_io_qpair_opts opts; 1397 struct spdk_nvme_qpair *qpair; 1398 int rc; 1399 1400 nvme_ctrlr = nvme_qpair->ctrlr; 1401 1402 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1403 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1404 opts.create_only = true; 1405 opts.async_mode = true; 1406 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1407 g_opts.io_queue_requests = opts.io_queue_requests; 1408 1409 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1410 if (qpair == NULL) { 1411 return -1; 1412 } 1413 1414 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1415 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1416 1417 assert(nvme_qpair->group != NULL); 1418 1419 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1420 if (rc != 0) { 1421 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1422 goto err; 1423 } 1424 1425 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1426 if (rc != 0) { 1427 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1428 goto err; 1429 } 1430 1431 nvme_qpair->qpair = qpair; 1432 1433 if (!g_opts.disable_auto_failback) { 1434 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1435 } 1436 1437 return 0; 1438 1439 err: 1440 spdk_nvme_ctrlr_free_io_qpair(qpair); 1441 1442 return rc; 1443 } 1444 1445 static void 1446 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1447 { 1448 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1449 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1450 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1451 struct spdk_bdev_io *bdev_io; 1452 1453 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1454 status = SPDK_BDEV_IO_STATUS_FAILED; 1455 } 1456 1457 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1458 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1459 
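		/* Resets that were queued while another reset was already in progress are
		 * drained here; each is completed with the result of the reset that just
		 * finished (a non-NULL iterator ctx indicates failure).
		 */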
TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1460 __bdev_nvme_io_complete(bdev_io, status, NULL); 1461 } 1462 1463 spdk_for_each_channel_continue(i, 0); 1464 } 1465 1466 static void 1467 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1468 { 1469 struct nvme_path_id *path_id, *next_path; 1470 int rc __attribute__((unused)); 1471 1472 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1473 assert(path_id); 1474 assert(path_id == nvme_ctrlr->active_path_id); 1475 next_path = TAILQ_NEXT(path_id, link); 1476 1477 path_id->is_failed = true; 1478 1479 if (next_path) { 1480 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1481 1482 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1483 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1484 1485 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1486 nvme_ctrlr->active_path_id = next_path; 1487 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1488 assert(rc == 0); 1489 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1490 if (!remove) { 1491 /** Shuffle the old trid to the end of the list and use the new one. 1492 * Allows for round robin through multiple connections. 1493 */ 1494 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1495 } else { 1496 free(path_id); 1497 } 1498 } 1499 } 1500 1501 static bool 1502 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1503 { 1504 int32_t elapsed; 1505 1506 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1507 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1508 return false; 1509 } 1510 1511 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1512 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1513 return true; 1514 } else { 1515 return false; 1516 } 1517 } 1518 1519 static bool 1520 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1521 { 1522 uint32_t elapsed; 1523 1524 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1525 return false; 1526 } 1527 1528 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1529 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1530 return true; 1531 } else { 1532 return false; 1533 } 1534 } 1535 1536 static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1537 1538 static void 1539 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1540 { 1541 int rc; 1542 1543 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1544 if (rc != 0) { 1545 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1546 * fail the reset sequence immediately. 1547 */ 1548 bdev_nvme_reset_complete(nvme_ctrlr, false); 1549 return; 1550 } 1551 1552 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1553 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1554 */ 1555 assert(nvme_ctrlr->disconnected_cb == NULL); 1556 nvme_ctrlr->disconnected_cb = cb_fn; 1557 1558 /* During disconnection, reduce the period to poll adminq more often. 
*/ 1559 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1560 } 1561 1562 enum bdev_nvme_op_after_reset { 1563 OP_NONE, 1564 OP_COMPLETE_PENDING_DESTRUCT, 1565 OP_DESTRUCT, 1566 OP_DELAYED_RECONNECT, 1567 }; 1568 1569 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1570 1571 static _bdev_nvme_op_after_reset 1572 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1573 { 1574 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1575 /* Complete pending destruct after reset completes. */ 1576 return OP_COMPLETE_PENDING_DESTRUCT; 1577 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1578 nvme_ctrlr->reset_start_tsc = 0; 1579 return OP_NONE; 1580 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1581 return OP_DESTRUCT; 1582 } else { 1583 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1584 nvme_ctrlr->fast_io_fail_timedout = true; 1585 } 1586 bdev_nvme_failover_trid(nvme_ctrlr, false); 1587 return OP_DELAYED_RECONNECT; 1588 } 1589 } 1590 1591 static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1592 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1593 1594 static int 1595 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1596 { 1597 struct nvme_ctrlr *nvme_ctrlr = ctx; 1598 1599 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1600 pthread_mutex_lock(&nvme_ctrlr->mutex); 1601 1602 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1603 1604 assert(nvme_ctrlr->reconnect_is_delayed == true); 1605 nvme_ctrlr->reconnect_is_delayed = false; 1606 1607 if (nvme_ctrlr->destruct) { 1608 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1609 return SPDK_POLLER_BUSY; 1610 } 1611 1612 assert(nvme_ctrlr->resetting == false); 1613 nvme_ctrlr->resetting = true; 1614 1615 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1616 1617 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1618 1619 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1620 return SPDK_POLLER_BUSY; 1621 } 1622 1623 static void 1624 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1625 { 1626 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1627 1628 assert(nvme_ctrlr->reconnect_is_delayed == false); 1629 nvme_ctrlr->reconnect_is_delayed = true; 1630 1631 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1632 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1633 nvme_ctrlr, 1634 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 1635 } 1636 1637 static void 1638 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status) 1639 { 1640 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1641 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 1642 struct nvme_path_id *path_id; 1643 bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn; 1644 void *reset_cb_arg = nvme_ctrlr->reset_cb_arg; 1645 enum bdev_nvme_op_after_reset op_after_reset; 1646 1647 assert(nvme_ctrlr->thread == spdk_get_thread()); 1648 1649 nvme_ctrlr->reset_cb_fn = NULL; 1650 nvme_ctrlr->reset_cb_arg = NULL; 1651 1652 if (!success) { 1653 SPDK_ERRLOG("Resetting controller failed.\n"); 1654 } else { 1655 SPDK_NOTICELOG("Resetting controller successful.\n"); 1656 } 1657 1658 pthread_mutex_lock(&nvme_ctrlr->mutex); 1659 nvme_ctrlr->resetting = false; 1660 1661 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1662 assert(path_id != NULL); 1663 assert(path_id == nvme_ctrlr->active_path_id); 1664 1665 path_id->is_failed = !success; 
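	/* While still holding the mutex, decide the follow-up action: complete a pending
	 * destruct, delete the controller because ctrlr_loss_timeout expired, or schedule
	 * a delayed reconnect attempt.
	 */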
1666 1667 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 1668 1669 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1670 1671 if (reset_cb_fn) { 1672 reset_cb_fn(reset_cb_arg, success); 1673 } 1674 1675 switch (op_after_reset) { 1676 case OP_COMPLETE_PENDING_DESTRUCT: 1677 nvme_ctrlr_unregister(nvme_ctrlr); 1678 break; 1679 case OP_DESTRUCT: 1680 _bdev_nvme_delete(nvme_ctrlr, false); 1681 break; 1682 case OP_DELAYED_RECONNECT: 1683 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 1684 break; 1685 default: 1686 break; 1687 } 1688 } 1689 1690 static void 1691 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 1692 { 1693 /* Make sure we clear any pending resets before returning. */ 1694 spdk_for_each_channel(nvme_ctrlr, 1695 bdev_nvme_complete_pending_resets, 1696 success ? NULL : (void *)0x1, 1697 _bdev_nvme_reset_complete); 1698 } 1699 1700 static void 1701 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 1702 { 1703 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1704 1705 bdev_nvme_reset_complete(nvme_ctrlr, false); 1706 } 1707 1708 static void 1709 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 1710 { 1711 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1712 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 1713 struct nvme_qpair *nvme_qpair; 1714 1715 nvme_qpair = ctrlr_ch->qpair; 1716 assert(nvme_qpair != NULL); 1717 1718 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1719 1720 if (nvme_qpair->qpair != NULL) { 1721 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 1722 1723 /* The current full reset sequence will move to the next 1724 * ctrlr_channel after the qpair is actually disconnected. 1725 */ 1726 assert(ctrlr_ch->reset_iter == NULL); 1727 ctrlr_ch->reset_iter = i; 1728 } else { 1729 spdk_for_each_channel_continue(i, 0); 1730 } 1731 } 1732 1733 static void 1734 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 1735 { 1736 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1737 1738 if (status == 0) { 1739 bdev_nvme_reset_complete(nvme_ctrlr, true); 1740 } else { 1741 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 1742 spdk_for_each_channel(nvme_ctrlr, 1743 bdev_nvme_reset_destroy_qpair, 1744 NULL, 1745 bdev_nvme_reset_create_qpairs_failed); 1746 } 1747 } 1748 1749 static void 1750 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 1751 { 1752 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1753 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1754 int rc; 1755 1756 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 1757 1758 spdk_for_each_channel_continue(i, rc); 1759 } 1760 1761 static int 1762 bdev_nvme_reconnect_ctrlr_poll(void *arg) 1763 { 1764 struct nvme_ctrlr *nvme_ctrlr = arg; 1765 int rc = -ETIMEDOUT; 1766 1767 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1768 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 1769 if (rc == -EAGAIN) { 1770 return SPDK_POLLER_BUSY; 1771 } 1772 } 1773 1774 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 1775 if (rc == 0) { 1776 /* Recreate all of the I/O queue pairs */ 1777 spdk_for_each_channel(nvme_ctrlr, 1778 bdev_nvme_reset_create_qpair, 1779 NULL, 1780 bdev_nvme_reset_create_qpairs_done); 1781 } else { 1782 bdev_nvme_reset_complete(nvme_ctrlr, false); 1783 } 1784 return SPDK_POLLER_BUSY; 1785 } 1786 1787 static void 1788 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 1789 { 1790 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 1791 1792 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 1793 assert(nvme_ctrlr->reset_detach_poller == NULL); 1794 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 1795 nvme_ctrlr, 0); 1796 } 1797 1798 static void 1799 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 1800 { 1801 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1802 1803 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 1804 assert(status == 0); 1805 1806 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 1807 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1808 } else { 1809 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 1810 } 1811 } 1812 1813 static void 1814 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 1815 { 1816 spdk_for_each_channel(nvme_ctrlr, 1817 bdev_nvme_reset_destroy_qpair, 1818 NULL, 1819 bdev_nvme_reset_ctrlr); 1820 } 1821 1822 static void 1823 _bdev_nvme_reset(void *ctx) 1824 { 1825 struct nvme_ctrlr *nvme_ctrlr = ctx; 1826 1827 assert(nvme_ctrlr->resetting == true); 1828 assert(nvme_ctrlr->thread == spdk_get_thread()); 1829 1830 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 1831 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 1832 } else { 1833 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 1834 } 1835 } 1836 1837 static int 1838 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) 1839 { 1840 pthread_mutex_lock(&nvme_ctrlr->mutex); 1841 if (nvme_ctrlr->destruct) { 1842 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1843 return -ENXIO; 1844 } 1845 1846 if (nvme_ctrlr->resetting) { 1847 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1848 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 1849 return -EBUSY; 1850 } 1851 1852 if (nvme_ctrlr->reconnect_is_delayed) { 1853 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1854 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 1855 return -EBUSY; 1856 } 1857 1858 nvme_ctrlr->resetting = true; 1859 1860 assert(nvme_ctrlr->reset_start_tsc == 0); 1861 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 1862 1863 
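	/* The reset sequence itself must run on the thread that owns this controller, so it
	 * is handed off below with spdk_thread_send_msg() rather than executed inline.
	 */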
pthread_mutex_unlock(&nvme_ctrlr->mutex); 1864 1865 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 1866 return 0; 1867 } 1868 1869 int 1870 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) 1871 { 1872 int rc; 1873 1874 rc = bdev_nvme_reset(nvme_ctrlr); 1875 if (rc == 0) { 1876 nvme_ctrlr->reset_cb_fn = cb_fn; 1877 nvme_ctrlr->reset_cb_arg = cb_arg; 1878 } 1879 return rc; 1880 } 1881 1882 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 1883 1884 static void 1885 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 1886 { 1887 enum spdk_bdev_io_status io_status; 1888 1889 if (bio->cpl.cdw0 == 0) { 1890 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1891 } else { 1892 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1893 } 1894 1895 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 1896 } 1897 1898 static void 1899 _bdev_nvme_reset_io_continue(void *ctx) 1900 { 1901 struct nvme_bdev_io *bio = ctx; 1902 struct nvme_io_path *prev_io_path, *next_io_path; 1903 int rc; 1904 1905 prev_io_path = bio->io_path; 1906 bio->io_path = NULL; 1907 1908 if (bio->cpl.cdw0 != 0) { 1909 goto complete; 1910 } 1911 1912 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 1913 if (next_io_path == NULL) { 1914 goto complete; 1915 } 1916 1917 rc = _bdev_nvme_reset_io(next_io_path, bio); 1918 if (rc == 0) { 1919 return; 1920 } 1921 1922 bio->cpl.cdw0 = 1; 1923 1924 complete: 1925 bdev_nvme_reset_io_complete(bio); 1926 } 1927 1928 static void 1929 bdev_nvme_reset_io_continue(void *cb_arg, bool success) 1930 { 1931 struct nvme_bdev_io *bio = cb_arg; 1932 1933 bio->cpl.cdw0 = !success; 1934 1935 spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); 1936 } 1937 1938 static int 1939 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 1940 { 1941 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1942 struct nvme_ctrlr_channel *ctrlr_ch; 1943 struct spdk_bdev_io *bdev_io; 1944 int rc; 1945 1946 rc = bdev_nvme_reset(nvme_ctrlr); 1947 if (rc == 0) { 1948 assert(bio->io_path == NULL); 1949 bio->io_path = io_path; 1950 1951 assert(nvme_ctrlr->reset_cb_fn == NULL); 1952 assert(nvme_ctrlr->reset_cb_arg == NULL); 1953 nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; 1954 nvme_ctrlr->reset_cb_arg = bio; 1955 } else if (rc == -EBUSY) { 1956 ctrlr_ch = io_path->qpair->ctrlr_ch; 1957 assert(ctrlr_ch != NULL); 1958 /* 1959 * Reset call is queued only if it is from the app framework. This is on purpose so that 1960 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 1961 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 1962 */ 1963 bdev_io = spdk_bdev_io_from_ctx(bio); 1964 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 1965 } else { 1966 return rc; 1967 } 1968 1969 return 0; 1970 } 1971 1972 static void 1973 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 1974 { 1975 struct nvme_io_path *io_path; 1976 int rc; 1977 1978 bio->cpl.cdw0 = 0; 1979 bio->orig_thread = spdk_get_thread(); 1980 1981 /* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now. 1982 * 1983 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially. 1984 * This will be done in the following patches. 
1985 */ 1986 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 1987 assert(io_path != NULL); 1988 1989 rc = _bdev_nvme_reset_io(io_path, bio); 1990 if (rc != 0) { 1991 bio->cpl.cdw0 = 1; 1992 bdev_nvme_reset_io_complete(bio); 1993 } 1994 } 1995 1996 static int 1997 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1998 { 1999 pthread_mutex_lock(&nvme_ctrlr->mutex); 2000 if (nvme_ctrlr->destruct) { 2001 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2002 /* Don't bother resetting if the controller is in the process of being destructed. */ 2003 return -ENXIO; 2004 } 2005 2006 if (nvme_ctrlr->resetting) { 2007 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2008 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2009 return -EBUSY; 2010 } 2011 2012 bdev_nvme_failover_trid(nvme_ctrlr, remove); 2013 2014 if (nvme_ctrlr->reconnect_is_delayed) { 2015 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2016 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2017 2018 /* We rely on the next reconnect for the failover. */ 2019 return 0; 2020 } 2021 2022 nvme_ctrlr->resetting = true; 2023 2024 assert(nvme_ctrlr->reset_start_tsc == 0); 2025 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2026 2027 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2028 2029 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 2030 return 0; 2031 } 2032 2033 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2034 uint64_t num_blocks); 2035 2036 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2037 uint64_t num_blocks); 2038 2039 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2040 uint64_t src_offset_blocks, 2041 uint64_t num_blocks); 2042 2043 static void 2044 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2045 bool success) 2046 { 2047 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2048 struct spdk_bdev *bdev = bdev_io->bdev; 2049 int ret; 2050 2051 if (!success) { 2052 ret = -EINVAL; 2053 goto exit; 2054 } 2055 2056 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2057 ret = -ENXIO; 2058 goto exit; 2059 } 2060 2061 ret = bdev_nvme_readv(bio, 2062 bdev_io->u.bdev.iovs, 2063 bdev_io->u.bdev.iovcnt, 2064 bdev_io->u.bdev.md_buf, 2065 bdev_io->u.bdev.num_blocks, 2066 bdev_io->u.bdev.offset_blocks, 2067 bdev->dif_check_flags, 2068 bdev_io->u.bdev.ext_opts); 2069 2070 exit: 2071 if (spdk_unlikely(ret != 0)) { 2072 bdev_nvme_io_complete(bio, ret); 2073 } 2074 } 2075 2076 static void 2077 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2078 { 2079 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2080 struct spdk_bdev *bdev = bdev_io->bdev; 2081 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2082 struct nvme_bdev_io *nbdev_io_to_abort; 2083 int rc = 0; 2084 2085 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 2086 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 2087 if (spdk_unlikely(!nbdev_io->io_path)) { 2088 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 2089 rc = -ENXIO; 2090 goto exit; 2091 } 2092 2093 /* Admin commands do not use the optimal I/O path. 2094 * Simply fall through even if it is not found. 
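	 * Requests that are not tied to an I/O path (reset, abort, NVMe admin passthrough)
	 * are dispatched with the whole channel instead.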
2095 */ 2096 } 2097 2098 switch (bdev_io->type) { 2099 case SPDK_BDEV_IO_TYPE_READ: 2100 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2101 rc = bdev_nvme_readv(nbdev_io, 2102 bdev_io->u.bdev.iovs, 2103 bdev_io->u.bdev.iovcnt, 2104 bdev_io->u.bdev.md_buf, 2105 bdev_io->u.bdev.num_blocks, 2106 bdev_io->u.bdev.offset_blocks, 2107 bdev->dif_check_flags, 2108 bdev_io->u.bdev.ext_opts); 2109 } else { 2110 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2111 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2112 rc = 0; 2113 } 2114 break; 2115 case SPDK_BDEV_IO_TYPE_WRITE: 2116 rc = bdev_nvme_writev(nbdev_io, 2117 bdev_io->u.bdev.iovs, 2118 bdev_io->u.bdev.iovcnt, 2119 bdev_io->u.bdev.md_buf, 2120 bdev_io->u.bdev.num_blocks, 2121 bdev_io->u.bdev.offset_blocks, 2122 bdev->dif_check_flags, 2123 bdev_io->u.bdev.ext_opts); 2124 break; 2125 case SPDK_BDEV_IO_TYPE_COMPARE: 2126 rc = bdev_nvme_comparev(nbdev_io, 2127 bdev_io->u.bdev.iovs, 2128 bdev_io->u.bdev.iovcnt, 2129 bdev_io->u.bdev.md_buf, 2130 bdev_io->u.bdev.num_blocks, 2131 bdev_io->u.bdev.offset_blocks, 2132 bdev->dif_check_flags); 2133 break; 2134 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2135 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2136 bdev_io->u.bdev.iovs, 2137 bdev_io->u.bdev.iovcnt, 2138 bdev_io->u.bdev.fused_iovs, 2139 bdev_io->u.bdev.fused_iovcnt, 2140 bdev_io->u.bdev.md_buf, 2141 bdev_io->u.bdev.num_blocks, 2142 bdev_io->u.bdev.offset_blocks, 2143 bdev->dif_check_flags); 2144 break; 2145 case SPDK_BDEV_IO_TYPE_UNMAP: 2146 rc = bdev_nvme_unmap(nbdev_io, 2147 bdev_io->u.bdev.offset_blocks, 2148 bdev_io->u.bdev.num_blocks); 2149 break; 2150 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2151 rc = bdev_nvme_write_zeroes(nbdev_io, 2152 bdev_io->u.bdev.offset_blocks, 2153 bdev_io->u.bdev.num_blocks); 2154 break; 2155 case SPDK_BDEV_IO_TYPE_RESET: 2156 nbdev_io->io_path = NULL; 2157 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2158 break; 2159 case SPDK_BDEV_IO_TYPE_FLUSH: 2160 rc = bdev_nvme_flush(nbdev_io, 2161 bdev_io->u.bdev.offset_blocks, 2162 bdev_io->u.bdev.num_blocks); 2163 break; 2164 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2165 rc = bdev_nvme_zone_appendv(nbdev_io, 2166 bdev_io->u.bdev.iovs, 2167 bdev_io->u.bdev.iovcnt, 2168 bdev_io->u.bdev.md_buf, 2169 bdev_io->u.bdev.num_blocks, 2170 bdev_io->u.bdev.offset_blocks, 2171 bdev->dif_check_flags); 2172 break; 2173 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2174 rc = bdev_nvme_get_zone_info(nbdev_io, 2175 bdev_io->u.zone_mgmt.zone_id, 2176 bdev_io->u.zone_mgmt.num_zones, 2177 bdev_io->u.zone_mgmt.buf); 2178 break; 2179 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2180 rc = bdev_nvme_zone_management(nbdev_io, 2181 bdev_io->u.zone_mgmt.zone_id, 2182 bdev_io->u.zone_mgmt.zone_action); 2183 break; 2184 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2185 nbdev_io->io_path = NULL; 2186 bdev_nvme_admin_passthru(nbdev_ch, 2187 nbdev_io, 2188 &bdev_io->u.nvme_passthru.cmd, 2189 bdev_io->u.nvme_passthru.buf, 2190 bdev_io->u.nvme_passthru.nbytes); 2191 break; 2192 case SPDK_BDEV_IO_TYPE_NVME_IO: 2193 rc = bdev_nvme_io_passthru(nbdev_io, 2194 &bdev_io->u.nvme_passthru.cmd, 2195 bdev_io->u.nvme_passthru.buf, 2196 bdev_io->u.nvme_passthru.nbytes); 2197 break; 2198 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2199 rc = bdev_nvme_io_passthru_md(nbdev_io, 2200 &bdev_io->u.nvme_passthru.cmd, 2201 bdev_io->u.nvme_passthru.buf, 2202 bdev_io->u.nvme_passthru.nbytes, 2203 bdev_io->u.nvme_passthru.md_buf, 2204 bdev_io->u.nvme_passthru.md_len); 2205 break; 2206 case SPDK_BDEV_IO_TYPE_ABORT: 2207 nbdev_io->io_path = NULL; 2208 
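		/* Aborts are dispatched per channel rather than per I/O path; the target I/O is
		 * identified by its driver context.
		 */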
nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2209 bdev_nvme_abort(nbdev_ch, 2210 nbdev_io, 2211 nbdev_io_to_abort); 2212 break; 2213 case SPDK_BDEV_IO_TYPE_COPY: 2214 rc = bdev_nvme_copy(nbdev_io, 2215 bdev_io->u.bdev.offset_blocks, 2216 bdev_io->u.bdev.copy.src_offset_blocks, 2217 bdev_io->u.bdev.num_blocks); 2218 break; 2219 default: 2220 rc = -EINVAL; 2221 break; 2222 } 2223 2224 exit: 2225 if (spdk_unlikely(rc != 0)) { 2226 bdev_nvme_io_complete(nbdev_io, rc); 2227 } 2228 } 2229 2230 static bool 2231 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 2232 { 2233 struct nvme_bdev *nbdev = ctx; 2234 struct nvme_ns *nvme_ns; 2235 struct spdk_nvme_ns *ns; 2236 struct spdk_nvme_ctrlr *ctrlr; 2237 const struct spdk_nvme_ctrlr_data *cdata; 2238 2239 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2240 assert(nvme_ns != NULL); 2241 ns = nvme_ns->ns; 2242 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2243 2244 switch (io_type) { 2245 case SPDK_BDEV_IO_TYPE_READ: 2246 case SPDK_BDEV_IO_TYPE_WRITE: 2247 case SPDK_BDEV_IO_TYPE_RESET: 2248 case SPDK_BDEV_IO_TYPE_FLUSH: 2249 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2250 case SPDK_BDEV_IO_TYPE_NVME_IO: 2251 case SPDK_BDEV_IO_TYPE_ABORT: 2252 return true; 2253 2254 case SPDK_BDEV_IO_TYPE_COMPARE: 2255 return spdk_nvme_ns_supports_compare(ns); 2256 2257 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2258 return spdk_nvme_ns_get_md_size(ns) ? true : false; 2259 2260 case SPDK_BDEV_IO_TYPE_UNMAP: 2261 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2262 return cdata->oncs.dsm; 2263 2264 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2265 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2266 return cdata->oncs.write_zeroes; 2267 2268 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2269 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2270 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2271 return true; 2272 } 2273 return false; 2274 2275 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2276 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2277 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2278 2279 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2280 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2281 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2282 2283 case SPDK_BDEV_IO_TYPE_COPY: 2284 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2285 return cdata->oncs.copy; 2286 2287 default: 2288 return false; 2289 } 2290 } 2291 2292 static int 2293 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 2294 { 2295 struct nvme_qpair *nvme_qpair; 2296 struct spdk_io_channel *pg_ch; 2297 int rc; 2298 2299 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 2300 if (!nvme_qpair) { 2301 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 2302 return -1; 2303 } 2304 2305 TAILQ_INIT(&nvme_qpair->io_path_list); 2306 2307 nvme_qpair->ctrlr = nvme_ctrlr; 2308 nvme_qpair->ctrlr_ch = ctrlr_ch; 2309 2310 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2311 if (!pg_ch) { 2312 free(nvme_qpair); 2313 return -1; 2314 } 2315 2316 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 2317 2318 #ifdef SPDK_CONFIG_VTUNE 2319 nvme_qpair->group->collect_spin_stat = true; 2320 #else 2321 nvme_qpair->group->collect_spin_stat = false; 2322 #endif 2323 2324 rc = bdev_nvme_create_qpair(nvme_qpair); 2325 if (rc != 0) { 2326 /* nvme_ctrlr can't create IO qpair if connection is down. 2327 * 2328 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 2329 * after reconnect_delay_sec seconds. 
If bdev_retry_count is non-zero, 2330 * submitted IO will be queued until IO qpair is successfully created. 2331 * 2332 * Hence, if both are satisfied, ignore the failure. 2333 */ 2334 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 2335 spdk_put_io_channel(pg_ch); 2336 free(nvme_qpair); 2337 return rc; 2338 } 2339 } 2340 2341 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2342 2343 ctrlr_ch->qpair = nvme_qpair; 2344 2345 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 2346 nvme_qpair->ctrlr->ref++; 2347 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 2348 2349 return 0; 2350 } 2351 2352 static int 2353 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2354 { 2355 struct nvme_ctrlr *nvme_ctrlr = io_device; 2356 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2357 2358 TAILQ_INIT(&ctrlr_ch->pending_resets); 2359 2360 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 2361 } 2362 2363 static void 2364 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 2365 { 2366 assert(nvme_qpair->group != NULL); 2367 2368 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2369 2370 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 2371 2372 nvme_ctrlr_release(nvme_qpair->ctrlr); 2373 2374 free(nvme_qpair); 2375 } 2376 2377 static void 2378 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2379 { 2380 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2381 struct nvme_qpair *nvme_qpair; 2382 2383 nvme_qpair = ctrlr_ch->qpair; 2384 assert(nvme_qpair != NULL); 2385 2386 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2387 2388 if (nvme_qpair->qpair != NULL) { 2389 if (ctrlr_ch->reset_iter == NULL) { 2390 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2391 } else { 2392 /* Skip current ctrlr_channel in a full reset sequence because 2393 * it is being deleted now. The qpair is already being disconnected. 2394 * We do not have to restart disconnecting it. 2395 */ 2396 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2397 } 2398 2399 /* We cannot release a reference to the poll group now. 2400 * The qpair may be disconnected asynchronously later. 2401 * We need to poll it until it is actually disconnected. 2402 * Just detach the qpair from the deleting ctrlr_channel. 
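		 * The detached nvme_qpair is freed later, once its disconnect actually completes.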
2403 */ 2404 nvme_qpair->ctrlr_ch = NULL; 2405 } else { 2406 assert(ctrlr_ch->reset_iter == NULL); 2407 2408 nvme_qpair_delete(nvme_qpair); 2409 } 2410 } 2411 2412 static void 2413 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2414 uint32_t iov_cnt, uint32_t seed, 2415 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2416 { 2417 struct nvme_poll_group *group = ctx; 2418 int rc; 2419 2420 assert(group->accel_channel != NULL); 2421 assert(cb_fn != NULL); 2422 2423 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2424 if (rc) { 2425 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2426 if (rc == -ENOMEM || rc == -EINVAL) { 2427 cb_fn(cb_arg, rc); 2428 } 2429 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2430 } 2431 } 2432 2433 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2434 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2435 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2436 }; 2437 2438 static int 2439 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2440 { 2441 struct nvme_poll_group *group = ctx_buf; 2442 2443 TAILQ_INIT(&group->qpair_list); 2444 2445 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2446 if (group->group == NULL) { 2447 return -1; 2448 } 2449 2450 group->accel_channel = spdk_accel_get_io_channel(); 2451 if (!group->accel_channel) { 2452 spdk_nvme_poll_group_destroy(group->group); 2453 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2454 group); 2455 return -1; 2456 } 2457 2458 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2459 2460 if (group->poller == NULL) { 2461 spdk_put_io_channel(group->accel_channel); 2462 spdk_nvme_poll_group_destroy(group->group); 2463 return -1; 2464 } 2465 2466 return 0; 2467 } 2468 2469 static void 2470 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2471 { 2472 struct nvme_poll_group *group = ctx_buf; 2473 2474 assert(TAILQ_EMPTY(&group->qpair_list)); 2475 2476 if (group->accel_channel) { 2477 spdk_put_io_channel(group->accel_channel); 2478 } 2479 2480 spdk_poller_unregister(&group->poller); 2481 if (spdk_nvme_poll_group_destroy(group->group)) { 2482 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2483 assert(false); 2484 } 2485 } 2486 2487 static struct spdk_io_channel * 2488 bdev_nvme_get_io_channel(void *ctx) 2489 { 2490 struct nvme_bdev *nvme_bdev = ctx; 2491 2492 return spdk_get_io_channel(nvme_bdev); 2493 } 2494 2495 static void * 2496 bdev_nvme_get_module_ctx(void *ctx) 2497 { 2498 struct nvme_bdev *nvme_bdev = ctx; 2499 struct nvme_ns *nvme_ns; 2500 2501 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2502 return NULL; 2503 } 2504 2505 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2506 if (!nvme_ns) { 2507 return NULL; 2508 } 2509 2510 return nvme_ns->ns; 2511 } 2512 2513 static const char * 2514 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2515 { 2516 switch (ana_state) { 2517 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2518 return "optimized"; 2519 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2520 return "non_optimized"; 2521 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2522 return "inaccessible"; 2523 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2524 return "persistent_loss"; 2525 case SPDK_NVME_ANA_CHANGE_STATE: 2526 return "change"; 2527 default: 2528 return NULL; 2529 } 2530 } 2531 
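/* Collect the memory domains of every controller backing this bdev into the caller's array.
 * Returns the total number of domains reported (which may exceed array_size) or a negative
 * errno on failure.
 */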
2532 static int 2533 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 2534 { 2535 struct spdk_memory_domain **_domains = NULL; 2536 struct nvme_bdev *nbdev = ctx; 2537 struct nvme_ns *nvme_ns; 2538 int i = 0, _array_size = array_size; 2539 int rc = 0; 2540 2541 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 2542 if (domains && array_size >= i) { 2543 _domains = &domains[i]; 2544 } else { 2545 _domains = NULL; 2546 } 2547 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 2548 if (rc > 0) { 2549 i += rc; 2550 if (_array_size >= rc) { 2551 _array_size -= rc; 2552 } else { 2553 _array_size = 0; 2554 } 2555 } else if (rc < 0) { 2556 return rc; 2557 } 2558 } 2559 2560 return i; 2561 } 2562 2563 static const char * 2564 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 2565 { 2566 if (nvme_ctrlr->destruct) { 2567 return "deleting"; 2568 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 2569 return "failed"; 2570 } else if (nvme_ctrlr->resetting) { 2571 return "resetting"; 2572 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 2573 return "reconnect_is_delayed"; 2574 } else { 2575 return "enabled"; 2576 } 2577 } 2578 2579 void 2580 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 2581 { 2582 struct spdk_nvme_transport_id *trid; 2583 const struct spdk_nvme_ctrlr_opts *opts; 2584 const struct spdk_nvme_ctrlr_data *cdata; 2585 2586 spdk_json_write_object_begin(w); 2587 2588 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 2589 2590 #ifdef SPDK_CONFIG_NVME_CUSE 2591 size_t cuse_name_size = 128; 2592 char cuse_name[cuse_name_size]; 2593 2594 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 2595 if (rc == 0) { 2596 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2597 } 2598 #endif 2599 trid = &nvme_ctrlr->active_path_id->trid; 2600 spdk_json_write_named_object_begin(w, "trid"); 2601 nvme_bdev_dump_trid_json(trid, w); 2602 spdk_json_write_object_end(w); 2603 2604 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 2605 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2606 2607 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 2608 spdk_json_write_named_object_begin(w, "host"); 2609 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 2610 spdk_json_write_named_string(w, "addr", opts->src_addr); 2611 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 2612 spdk_json_write_object_end(w); 2613 2614 spdk_json_write_object_end(w); 2615 } 2616 2617 static void 2618 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2619 struct nvme_ns *nvme_ns) 2620 { 2621 struct spdk_nvme_ns *ns; 2622 struct spdk_nvme_ctrlr *ctrlr; 2623 const struct spdk_nvme_ctrlr_data *cdata; 2624 const struct spdk_nvme_transport_id *trid; 2625 union spdk_nvme_vs_register vs; 2626 const struct spdk_nvme_ns_data *nsdata; 2627 char buf[128]; 2628 2629 ns = nvme_ns->ns; 2630 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2631 2632 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2633 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2634 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2635 2636 spdk_json_write_object_begin(w); 2637 2638 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2639 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2640 } 2641 2642 spdk_json_write_named_object_begin(w, "trid"); 2643 2644 nvme_bdev_dump_trid_json(trid, w); 2645 2646 spdk_json_write_object_end(w); 2647 2648 #ifdef 
SPDK_CONFIG_NVME_CUSE 2649 size_t cuse_name_size = 128; 2650 char cuse_name[cuse_name_size]; 2651 2652 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2653 cuse_name, &cuse_name_size); 2654 if (rc == 0) { 2655 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2656 } 2657 #endif 2658 2659 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2660 2661 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2662 2663 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2664 2665 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2666 spdk_str_trim(buf); 2667 spdk_json_write_named_string(w, "model_number", buf); 2668 2669 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2670 spdk_str_trim(buf); 2671 spdk_json_write_named_string(w, "serial_number", buf); 2672 2673 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2674 spdk_str_trim(buf); 2675 spdk_json_write_named_string(w, "firmware_revision", buf); 2676 2677 if (cdata->subnqn[0] != '\0') { 2678 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2679 } 2680 2681 spdk_json_write_named_object_begin(w, "oacs"); 2682 2683 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2684 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2685 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2686 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2687 2688 spdk_json_write_object_end(w); 2689 2690 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 2691 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 2692 2693 spdk_json_write_object_end(w); 2694 2695 spdk_json_write_named_object_begin(w, "vs"); 2696 2697 spdk_json_write_name(w, "nvme_version"); 2698 if (vs.bits.ter) { 2699 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2700 } else { 2701 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2702 } 2703 2704 spdk_json_write_object_end(w); 2705 2706 nsdata = spdk_nvme_ns_get_data(ns); 2707 2708 spdk_json_write_named_object_begin(w, "ns_data"); 2709 2710 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 2711 2712 if (cdata->cmic.ana_reporting) { 2713 spdk_json_write_named_string(w, "ana_state", 2714 _nvme_ana_state_str(nvme_ns->ana_state)); 2715 } 2716 2717 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 2718 2719 spdk_json_write_object_end(w); 2720 2721 if (cdata->oacs.security) { 2722 spdk_json_write_named_object_begin(w, "security"); 2723 2724 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 2725 2726 spdk_json_write_object_end(w); 2727 } 2728 2729 spdk_json_write_object_end(w); 2730 } 2731 2732 static const char * 2733 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 2734 { 2735 switch (nbdev->mp_policy) { 2736 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 2737 return "active_passive"; 2738 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 2739 return "active_active"; 2740 default: 2741 assert(false); 2742 return "invalid"; 2743 } 2744 } 2745 2746 static int 2747 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 2748 { 2749 struct nvme_bdev *nvme_bdev = ctx; 2750 struct nvme_ns *nvme_ns; 2751 2752 pthread_mutex_lock(&nvme_bdev->mutex); 2753 spdk_json_write_named_array_begin(w, "nvme"); 2754 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 2755 nvme_namespace_info_json(w, nvme_ns); 2756 } 2757 spdk_json_write_array_end(w); 2758 spdk_json_write_named_string(w, "mp_policy", 
nvme_bdev_get_mp_policy_str(nvme_bdev)); 2759 pthread_mutex_unlock(&nvme_bdev->mutex); 2760 2761 return 0; 2762 } 2763 2764 static void 2765 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2766 { 2767 /* No config per bdev needed */ 2768 } 2769 2770 static uint64_t 2771 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 2772 { 2773 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2774 struct nvme_io_path *io_path; 2775 struct nvme_poll_group *group; 2776 uint64_t spin_time = 0; 2777 2778 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 2779 group = io_path->qpair->group; 2780 2781 if (!group || !group->collect_spin_stat) { 2782 continue; 2783 } 2784 2785 if (group->end_ticks != 0) { 2786 group->spin_ticks += (group->end_ticks - group->start_ticks); 2787 group->end_ticks = 0; 2788 } 2789 2790 spin_time += group->spin_ticks; 2791 group->start_ticks = 0; 2792 group->spin_ticks = 0; 2793 } 2794 2795 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 2796 } 2797 2798 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 2799 .destruct = bdev_nvme_destruct, 2800 .submit_request = bdev_nvme_submit_request, 2801 .io_type_supported = bdev_nvme_io_type_supported, 2802 .get_io_channel = bdev_nvme_get_io_channel, 2803 .dump_info_json = bdev_nvme_dump_info_json, 2804 .write_config_json = bdev_nvme_write_config_json, 2805 .get_spin_time = bdev_nvme_get_spin_time, 2806 .get_module_ctx = bdev_nvme_get_module_ctx, 2807 .get_memory_domains = bdev_nvme_get_memory_domains, 2808 }; 2809 2810 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 2811 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 2812 2813 static int 2814 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2815 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 2816 { 2817 struct spdk_nvme_ana_group_descriptor *copied_desc; 2818 uint8_t *orig_desc; 2819 uint32_t i, desc_size, copy_len; 2820 int rc = 0; 2821 2822 if (nvme_ctrlr->ana_log_page == NULL) { 2823 return -EINVAL; 2824 } 2825 2826 copied_desc = nvme_ctrlr->copied_ana_desc; 2827 2828 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 2829 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 2830 2831 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 2832 memcpy(copied_desc, orig_desc, copy_len); 2833 2834 rc = cb_fn(copied_desc, cb_arg); 2835 if (rc != 0) { 2836 break; 2837 } 2838 2839 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 2840 copied_desc->num_of_nsid * sizeof(uint32_t); 2841 orig_desc += desc_size; 2842 copy_len -= desc_size; 2843 } 2844 2845 return rc; 2846 } 2847 2848 static int 2849 nvme_ns_ana_transition_timedout(void *ctx) 2850 { 2851 struct nvme_ns *nvme_ns = ctx; 2852 2853 spdk_poller_unregister(&nvme_ns->anatt_timer); 2854 nvme_ns->ana_transition_timedout = true; 2855 2856 return SPDK_POLLER_BUSY; 2857 } 2858 2859 static void 2860 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 2861 const struct spdk_nvme_ana_group_descriptor *desc) 2862 { 2863 const struct spdk_nvme_ctrlr_data *cdata; 2864 2865 nvme_ns->ana_group_id = desc->ana_group_id; 2866 nvme_ns->ana_state = desc->ana_state; 2867 nvme_ns->ana_state_updating = false; 2868 2869 switch (nvme_ns->ana_state) { 2870 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2871 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2872 nvme_ns->ana_transition_timedout = false; 2873 spdk_poller_unregister(&nvme_ns->anatt_timer); 2874 break; 2875 2876 case 
SPDK_NVME_ANA_INACCESSIBLE_STATE: 2877 case SPDK_NVME_ANA_CHANGE_STATE: 2878 if (nvme_ns->anatt_timer != NULL) { 2879 break; 2880 } 2881 2882 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 2883 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 2884 nvme_ns, 2885 cdata->anatt * SPDK_SEC_TO_USEC); 2886 break; 2887 default: 2888 break; 2889 } 2890 } 2891 2892 static int 2893 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 2894 { 2895 struct nvme_ns *nvme_ns = cb_arg; 2896 uint32_t i; 2897 2898 for (i = 0; i < desc->num_of_nsid; i++) { 2899 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 2900 continue; 2901 } 2902 2903 _nvme_ns_set_ana_state(nvme_ns, desc); 2904 return 1; 2905 } 2906 2907 return 0; 2908 } 2909 2910 static int 2911 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 2912 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 2913 uint32_t prchk_flags, void *ctx) 2914 { 2915 const struct spdk_uuid *uuid; 2916 const uint8_t *nguid; 2917 const struct spdk_nvme_ctrlr_data *cdata; 2918 const struct spdk_nvme_ns_data *nsdata; 2919 const struct spdk_nvme_ctrlr_opts *opts; 2920 enum spdk_nvme_csi csi; 2921 uint32_t atomic_bs, phys_bs, bs; 2922 2923 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2924 csi = spdk_nvme_ns_get_csi(ns); 2925 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 2926 2927 switch (csi) { 2928 case SPDK_NVME_CSI_NVM: 2929 disk->product_name = "NVMe disk"; 2930 break; 2931 case SPDK_NVME_CSI_ZNS: 2932 disk->product_name = "NVMe ZNS disk"; 2933 disk->zoned = true; 2934 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 2935 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 2936 spdk_nvme_ns_get_extended_sector_size(ns); 2937 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 2938 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 2939 break; 2940 default: 2941 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 2942 return -ENOTSUP; 2943 } 2944 2945 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 2946 if (!disk->name) { 2947 return -ENOMEM; 2948 } 2949 2950 disk->write_cache = 0; 2951 if (cdata->vwc.present) { 2952 /* Enable if the Volatile Write Cache exists */ 2953 disk->write_cache = 1; 2954 } 2955 if (cdata->oncs.write_zeroes) { 2956 disk->max_write_zeroes = UINT16_MAX + 1; 2957 } 2958 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 2959 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 2960 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 2961 /* NVMe driver will split one request into multiple requests 2962 * based on MDTS and stripe boundary, the bdev layer will use 2963 * max_segment_size and max_num_segments to split one big IO 2964 * into multiple requests, then small request can't run out 2965 * of NVMe internal requests data structure. 
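	 * Using io_queue_requests / 2 below is a conservative cap chosen for that reason.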
2966 */ 2967 if (opts && opts->io_queue_requests) { 2968 disk->max_num_segments = opts->io_queue_requests / 2; 2969 } 2970 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 2971 2972 nguid = spdk_nvme_ns_get_nguid(ns); 2973 if (!nguid) { 2974 uuid = spdk_nvme_ns_get_uuid(ns); 2975 if (uuid) { 2976 disk->uuid = *uuid; 2977 } 2978 } else { 2979 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 2980 } 2981 2982 nsdata = spdk_nvme_ns_get_data(ns); 2983 bs = spdk_nvme_ns_get_sector_size(ns); 2984 atomic_bs = bs; 2985 phys_bs = bs; 2986 if (nsdata->nabo == 0) { 2987 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 2988 atomic_bs = bs * (1 + nsdata->nawupf); 2989 } else { 2990 atomic_bs = bs * (1 + cdata->awupf); 2991 } 2992 } 2993 if (nsdata->nsfeat.optperf) { 2994 phys_bs = bs * (1 + nsdata->npwg); 2995 } 2996 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 2997 2998 disk->md_len = spdk_nvme_ns_get_md_size(ns); 2999 if (disk->md_len != 0) { 3000 disk->md_interleave = nsdata->flbas.extended; 3001 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 3002 if (disk->dif_type != SPDK_DIF_DISABLE) { 3003 disk->dif_is_head_of_md = nsdata->dps.md_start; 3004 disk->dif_check_flags = prchk_flags; 3005 } 3006 } 3007 3008 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 3009 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 3010 disk->acwu = 0; 3011 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 3012 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 3013 } else { 3014 disk->acwu = cdata->acwu + 1; /* 0-based */ 3015 } 3016 3017 if (cdata->oncs.copy) { 3018 /* For now bdev interface allows only single segment copy */ 3019 disk->max_copy = nsdata->mssrl; 3020 } 3021 3022 disk->ctxt = ctx; 3023 disk->fn_table = &nvmelib_fn_table; 3024 disk->module = &nvme_if; 3025 3026 return 0; 3027 } 3028 3029 static int 3030 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3031 { 3032 struct nvme_bdev *bdev; 3033 int rc; 3034 3035 bdev = calloc(1, sizeof(*bdev)); 3036 if (!bdev) { 3037 SPDK_ERRLOG("bdev calloc() failed\n"); 3038 return -ENOMEM; 3039 } 3040 3041 rc = pthread_mutex_init(&bdev->mutex, NULL); 3042 if (rc != 0) { 3043 free(bdev); 3044 return rc; 3045 } 3046 3047 bdev->ref = 1; 3048 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 3049 TAILQ_INIT(&bdev->nvme_ns_list); 3050 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3051 bdev->opal = nvme_ctrlr->opal_dev != NULL; 3052 3053 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 3054 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 3055 if (rc != 0) { 3056 SPDK_ERRLOG("Failed to create NVMe disk\n"); 3057 pthread_mutex_destroy(&bdev->mutex); 3058 free(bdev); 3059 return rc; 3060 } 3061 3062 spdk_io_device_register(bdev, 3063 bdev_nvme_create_bdev_channel_cb, 3064 bdev_nvme_destroy_bdev_channel_cb, 3065 sizeof(struct nvme_bdev_channel), 3066 bdev->disk.name); 3067 3068 rc = spdk_bdev_register(&bdev->disk); 3069 if (rc != 0) { 3070 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 3071 spdk_io_device_unregister(bdev, NULL); 3072 pthread_mutex_destroy(&bdev->mutex); 3073 free(bdev->disk.name); 3074 free(bdev); 3075 return rc; 3076 } 3077 3078 nvme_ns->bdev = bdev; 3079 bdev->nsid = nvme_ns->id; 3080 3081 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 3082 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 3083 3084 return 0; 3085 } 3086 3087 static bool 3088 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 3089 { 3090 const 
struct spdk_nvme_ns_data *nsdata1, *nsdata2; 3091 const struct spdk_uuid *uuid1, *uuid2; 3092 3093 nsdata1 = spdk_nvme_ns_get_data(ns1); 3094 nsdata2 = spdk_nvme_ns_get_data(ns2); 3095 uuid1 = spdk_nvme_ns_get_uuid(ns1); 3096 uuid2 = spdk_nvme_ns_get_uuid(ns2); 3097 3098 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 3099 nsdata1->eui64 == nsdata2->eui64 && 3100 ((uuid1 == NULL && uuid2 == NULL) || 3101 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 3102 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 3103 } 3104 3105 static bool 3106 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3107 struct spdk_nvme_ctrlr_opts *opts) 3108 { 3109 struct nvme_probe_skip_entry *entry; 3110 3111 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 3112 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3113 return false; 3114 } 3115 } 3116 3117 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 3118 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 3119 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 3120 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 3121 opts->disable_read_ana_log_page = true; 3122 3123 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 3124 3125 return true; 3126 } 3127 3128 static void 3129 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 3130 { 3131 struct nvme_ctrlr *nvme_ctrlr = ctx; 3132 3133 if (spdk_nvme_cpl_is_error(cpl)) { 3134 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 3135 cpl->status.sct); 3136 bdev_nvme_reset(nvme_ctrlr); 3137 } else if (cpl->cdw0 & 0x1) { 3138 SPDK_WARNLOG("Specified command could not be aborted.\n"); 3139 bdev_nvme_reset(nvme_ctrlr); 3140 } 3141 } 3142 3143 static void 3144 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 3145 struct spdk_nvme_qpair *qpair, uint16_t cid) 3146 { 3147 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3148 union spdk_nvme_csts_register csts; 3149 int rc; 3150 3151 assert(nvme_ctrlr->ctrlr == ctrlr); 3152 3153 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 3154 3155 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 3156 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 3157 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 3158 * completion recursively. 3159 */ 3160 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 3161 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 3162 if (csts.bits.cfs) { 3163 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 3164 bdev_nvme_reset(nvme_ctrlr); 3165 return; 3166 } 3167 } 3168 3169 switch (g_opts.action_on_timeout) { 3170 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 3171 if (qpair) { 3172 /* Don't send abort to ctrlr when ctrlr is not available. */ 3173 pthread_mutex_lock(&nvme_ctrlr->mutex); 3174 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 3175 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3176 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 3177 return; 3178 } 3179 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3180 3181 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 3182 nvme_abort_cpl, nvme_ctrlr); 3183 if (rc == 0) { 3184 return; 3185 } 3186 3187 SPDK_ERRLOG("Unable to send abort. 
Resetting, rc is %d.\n", rc); 3188 } 3189 3190 /* FALLTHROUGH */ 3191 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 3192 bdev_nvme_reset(nvme_ctrlr); 3193 break; 3194 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 3195 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 3196 break; 3197 default: 3198 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 3199 break; 3200 } 3201 } 3202 3203 static void 3204 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 3205 { 3206 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3207 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 3208 3209 if (rc == 0) { 3210 nvme_ns->probe_ctx = NULL; 3211 pthread_mutex_lock(&nvme_ctrlr->mutex); 3212 nvme_ctrlr->ref++; 3213 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3214 } else { 3215 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3216 free(nvme_ns); 3217 } 3218 3219 if (ctx) { 3220 ctx->populates_in_progress--; 3221 if (ctx->populates_in_progress == 0) { 3222 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3223 } 3224 } 3225 } 3226 3227 static void 3228 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 3229 { 3230 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3231 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3232 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3233 int rc; 3234 3235 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 3236 if (rc != 0) { 3237 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 3238 } 3239 3240 spdk_for_each_channel_continue(i, rc); 3241 } 3242 3243 static void 3244 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 3245 { 3246 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3247 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3248 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3249 struct nvme_io_path *io_path; 3250 3251 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 3252 if (io_path != NULL) { 3253 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 3254 } 3255 3256 spdk_for_each_channel_continue(i, 0); 3257 } 3258 3259 static void 3260 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 3261 { 3262 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3263 3264 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 3265 } 3266 3267 static void 3268 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 3269 { 3270 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3271 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 3272 3273 if (status == 0) { 3274 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 3275 } else { 3276 /* Delete the added io_paths and fail populating the namespace. 
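		 * bdev_nvme_add_io_path_failed() then reports the failure to
		 * nvme_ctrlr_populate_namespace_done() once the cleanup finishes.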
*/ 3277 spdk_for_each_channel(bdev, 3278 bdev_nvme_delete_io_path, 3279 nvme_ns, 3280 bdev_nvme_add_io_path_failed); 3281 } 3282 } 3283 3284 static int 3285 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 3286 { 3287 struct nvme_ns *tmp_ns; 3288 const struct spdk_nvme_ns_data *nsdata; 3289 3290 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 3291 if (!nsdata->nmic.can_share) { 3292 SPDK_ERRLOG("Namespace cannot be shared.\n"); 3293 return -EINVAL; 3294 } 3295 3296 pthread_mutex_lock(&bdev->mutex); 3297 3298 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 3299 assert(tmp_ns != NULL); 3300 3301 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 3302 pthread_mutex_unlock(&bdev->mutex); 3303 SPDK_ERRLOG("Namespaces are not identical.\n"); 3304 return -EINVAL; 3305 } 3306 3307 bdev->ref++; 3308 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3309 nvme_ns->bdev = bdev; 3310 3311 pthread_mutex_unlock(&bdev->mutex); 3312 3313 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 3314 spdk_for_each_channel(bdev, 3315 bdev_nvme_add_io_path, 3316 nvme_ns, 3317 bdev_nvme_add_io_path_done); 3318 3319 return 0; 3320 } 3321 3322 static void 3323 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3324 { 3325 struct spdk_nvme_ns *ns; 3326 struct nvme_bdev *bdev; 3327 int rc = 0; 3328 3329 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 3330 if (!ns) { 3331 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 3332 rc = -EINVAL; 3333 goto done; 3334 } 3335 3336 nvme_ns->ns = ns; 3337 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3338 3339 if (nvme_ctrlr->ana_log_page != NULL) { 3340 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 3341 } 3342 3343 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 3344 if (bdev == NULL) { 3345 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 3346 } else { 3347 rc = nvme_bdev_add_ns(bdev, nvme_ns); 3348 if (rc == 0) { 3349 return; 3350 } 3351 } 3352 done: 3353 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 3354 } 3355 3356 static void 3357 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 3358 { 3359 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3360 3361 assert(nvme_ctrlr != NULL); 3362 3363 pthread_mutex_lock(&nvme_ctrlr->mutex); 3364 3365 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3366 3367 if (nvme_ns->bdev != NULL) { 3368 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3369 return; 3370 } 3371 3372 free(nvme_ns); 3373 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3374 3375 nvme_ctrlr_release(nvme_ctrlr); 3376 } 3377 3378 static void 3379 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 3380 { 3381 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3382 3383 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3384 } 3385 3386 static void 3387 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3388 { 3389 struct nvme_bdev *bdev; 3390 3391 spdk_poller_unregister(&nvme_ns->anatt_timer); 3392 3393 bdev = nvme_ns->bdev; 3394 if (bdev != NULL) { 3395 pthread_mutex_lock(&bdev->mutex); 3396 3397 assert(bdev->ref > 0); 3398 bdev->ref--; 3399 if (bdev->ref == 0) { 3400 pthread_mutex_unlock(&bdev->mutex); 3401 3402 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 3403 } else { 3404 /* spdk_bdev_unregister() is not called until the last nvme_ns is 3405 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 3406 * and clear nvme_ns->bdev here. 
3407 */ 3408 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 3409 nvme_ns->bdev = NULL; 3410 3411 pthread_mutex_unlock(&bdev->mutex); 3412 3413 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 3414 * we call depopulate_namespace_done() to avoid use-after-free. 3415 */ 3416 spdk_for_each_channel(bdev, 3417 bdev_nvme_delete_io_path, 3418 nvme_ns, 3419 bdev_nvme_delete_io_path_done); 3420 return; 3421 } 3422 } 3423 3424 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3425 } 3426 3427 static void 3428 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3429 struct nvme_async_probe_ctx *ctx) 3430 { 3431 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3432 struct nvme_ns *nvme_ns, *next; 3433 struct spdk_nvme_ns *ns; 3434 struct nvme_bdev *bdev; 3435 uint32_t nsid; 3436 int rc; 3437 uint64_t num_sectors; 3438 3439 if (ctx) { 3440 /* Initialize this count to 1 to handle the populate functions 3441 * calling nvme_ctrlr_populate_namespace_done() immediately. 3442 */ 3443 ctx->populates_in_progress = 1; 3444 } 3445 3446 /* First loop over our existing namespaces and see if they have been 3447 * removed. */ 3448 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3449 while (nvme_ns != NULL) { 3450 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3451 3452 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 3453 /* NS is still there but attributes may have changed */ 3454 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 3455 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 3456 bdev = nvme_ns->bdev; 3457 assert(bdev != NULL); 3458 if (bdev->disk.blockcnt != num_sectors) { 3459 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 3460 nvme_ns->id, 3461 bdev->disk.name, 3462 bdev->disk.blockcnt, 3463 num_sectors); 3464 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 3465 if (rc != 0) { 3466 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 3467 bdev->disk.name, rc); 3468 } 3469 } 3470 } else { 3471 /* Namespace was removed */ 3472 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3473 } 3474 3475 nvme_ns = next; 3476 } 3477 3478 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3479 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3480 while (nsid != 0) { 3481 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3482 3483 if (nvme_ns == NULL) { 3484 /* Found a new one */ 3485 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3486 if (nvme_ns == NULL) { 3487 SPDK_ERRLOG("Failed to allocate namespace\n"); 3488 /* This just fails to attach the namespace. It may work on a future attempt. */ 3489 continue; 3490 } 3491 3492 nvme_ns->id = nsid; 3493 nvme_ns->ctrlr = nvme_ctrlr; 3494 3495 nvme_ns->bdev = NULL; 3496 3497 if (ctx) { 3498 ctx->populates_in_progress++; 3499 } 3500 nvme_ns->probe_ctx = ctx; 3501 3502 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3503 3504 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3505 } 3506 3507 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3508 } 3509 3510 if (ctx) { 3511 /* Decrement this count now that the loop is over to account 3512 * for the one we started with. If the count is then 0, we 3513 * know any populate_namespace functions completed immediately, 3514 * so we'll kick the callback here. 
3515 */ 3516 ctx->populates_in_progress--; 3517 if (ctx->populates_in_progress == 0) { 3518 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3519 } 3520 } 3521 3522 } 3523 3524 static void 3525 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 3526 { 3527 struct nvme_ns *nvme_ns, *tmp; 3528 3529 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 3530 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3531 } 3532 } 3533 3534 static uint32_t 3535 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 3536 { 3537 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3538 const struct spdk_nvme_ctrlr_data *cdata; 3539 uint32_t nsid, ns_count = 0; 3540 3541 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3542 3543 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3544 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 3545 ns_count++; 3546 } 3547 3548 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3549 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 3550 sizeof(uint32_t); 3551 } 3552 3553 static int 3554 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 3555 void *cb_arg) 3556 { 3557 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3558 struct nvme_ns *nvme_ns; 3559 uint32_t i, nsid; 3560 3561 for (i = 0; i < desc->num_of_nsid; i++) { 3562 nsid = desc->nsid[i]; 3563 if (nsid == 0) { 3564 continue; 3565 } 3566 3567 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3568 3569 assert(nvme_ns != NULL); 3570 if (nvme_ns == NULL) { 3571 /* Target told us that an inactive namespace had an ANA change */ 3572 continue; 3573 } 3574 3575 _nvme_ns_set_ana_state(nvme_ns, desc); 3576 } 3577 3578 return 0; 3579 } 3580 3581 static void 3582 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3583 { 3584 struct nvme_ns *nvme_ns; 3585 3586 spdk_free(nvme_ctrlr->ana_log_page); 3587 nvme_ctrlr->ana_log_page = NULL; 3588 3589 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3590 nvme_ns != NULL; 3591 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 3592 nvme_ns->ana_state_updating = false; 3593 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3594 } 3595 } 3596 3597 static void 3598 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 3599 { 3600 struct nvme_ctrlr *nvme_ctrlr = ctx; 3601 3602 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 3603 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 3604 nvme_ctrlr); 3605 } else { 3606 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 3607 } 3608 3609 pthread_mutex_lock(&nvme_ctrlr->mutex); 3610 3611 assert(nvme_ctrlr->ana_log_page_updating == true); 3612 nvme_ctrlr->ana_log_page_updating = false; 3613 3614 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 3615 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3616 3617 nvme_ctrlr_unregister(nvme_ctrlr); 3618 } else { 3619 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3620 3621 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 3622 } 3623 } 3624 3625 static int 3626 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3627 { 3628 uint32_t ana_log_page_size; 3629 int rc; 3630 3631 if (nvme_ctrlr->ana_log_page == NULL) { 3632 return -EINVAL; 3633 } 3634 3635 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 3636 3637 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 3638 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 3639 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 3640 
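		/* Refuse to read more than was allocated when the controller was created. */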
return -EINVAL; 3641 } 3642 3643 pthread_mutex_lock(&nvme_ctrlr->mutex); 3644 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 3645 nvme_ctrlr->ana_log_page_updating) { 3646 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3647 return -EBUSY; 3648 } 3649 3650 nvme_ctrlr->ana_log_page_updating = true; 3651 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3652 3653 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 3654 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3655 SPDK_NVME_GLOBAL_NS_TAG, 3656 nvme_ctrlr->ana_log_page, 3657 ana_log_page_size, 0, 3658 nvme_ctrlr_read_ana_log_page_done, 3659 nvme_ctrlr); 3660 if (rc != 0) { 3661 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 3662 } 3663 3664 return rc; 3665 } 3666 3667 static void 3668 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 3669 { 3670 } 3671 3672 struct bdev_nvme_set_preferred_path_ctx { 3673 struct spdk_bdev_desc *desc; 3674 struct nvme_ns *nvme_ns; 3675 bdev_nvme_set_preferred_path_cb cb_fn; 3676 void *cb_arg; 3677 }; 3678 3679 static void 3680 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 3681 { 3682 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3683 3684 assert(ctx != NULL); 3685 assert(ctx->desc != NULL); 3686 assert(ctx->cb_fn != NULL); 3687 3688 spdk_bdev_close(ctx->desc); 3689 3690 ctx->cb_fn(ctx->cb_arg, status); 3691 3692 free(ctx); 3693 } 3694 3695 static void 3696 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 3697 { 3698 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3699 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3700 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3701 struct nvme_io_path *io_path, *prev; 3702 3703 prev = NULL; 3704 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3705 if (io_path->nvme_ns == ctx->nvme_ns) { 3706 break; 3707 } 3708 prev = io_path; 3709 } 3710 3711 if (io_path != NULL) { 3712 if (prev != NULL) { 3713 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 3714 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 3715 } 3716 3717 /* We can set io_path to nbdev_ch->current_io_path directly here. 3718 * However, it needs to be conditional. To simplify the code, 3719 * just clear nbdev_ch->current_io_path and let find_io_path() 3720 * fill it. 3721 * 3722 * Automatic failback may be disabled. Hence even if the io_path is 3723 * already at the head, clear nbdev_ch->current_io_path. 3724 */ 3725 nbdev_ch->current_io_path = NULL; 3726 } 3727 3728 spdk_for_each_channel_continue(i, 0); 3729 } 3730 3731 static struct nvme_ns * 3732 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 3733 { 3734 struct nvme_ns *nvme_ns, *prev; 3735 const struct spdk_nvme_ctrlr_data *cdata; 3736 3737 prev = NULL; 3738 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3739 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3740 3741 if (cdata->cntlid == cntlid) { 3742 break; 3743 } 3744 prev = nvme_ns; 3745 } 3746 3747 if (nvme_ns != NULL && prev != NULL) { 3748 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 3749 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 3750 } 3751 3752 return nvme_ns; 3753 } 3754 3755 /* This function supports only multipath mode. There is only a single I/O path 3756 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 3757 * head of the I/O path list for each NVMe bdev channel. 
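 * Moving it to the head makes it the first candidate bdev_nvme_find_io_path() considers.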
3758 * 3759 * NVMe bdev channel may be acquired after completing this function. move the 3760 * matched namespace to the head of the namespace list for the NVMe bdev too. 3761 */ 3762 void 3763 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 3764 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 3765 { 3766 struct bdev_nvme_set_preferred_path_ctx *ctx; 3767 struct spdk_bdev *bdev; 3768 struct nvme_bdev *nbdev; 3769 int rc = 0; 3770 3771 assert(cb_fn != NULL); 3772 3773 ctx = calloc(1, sizeof(*ctx)); 3774 if (ctx == NULL) { 3775 SPDK_ERRLOG("Failed to alloc context.\n"); 3776 rc = -ENOMEM; 3777 goto err_alloc; 3778 } 3779 3780 ctx->cb_fn = cb_fn; 3781 ctx->cb_arg = cb_arg; 3782 3783 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3784 if (rc != 0) { 3785 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 3786 goto err_open; 3787 } 3788 3789 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3790 3791 if (bdev->module != &nvme_if) { 3792 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 3793 rc = -ENODEV; 3794 goto err_bdev; 3795 } 3796 3797 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3798 3799 pthread_mutex_lock(&nbdev->mutex); 3800 3801 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 3802 if (ctx->nvme_ns == NULL) { 3803 pthread_mutex_unlock(&nbdev->mutex); 3804 3805 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 3806 rc = -ENODEV; 3807 goto err_bdev; 3808 } 3809 3810 pthread_mutex_unlock(&nbdev->mutex); 3811 3812 spdk_for_each_channel(nbdev, 3813 _bdev_nvme_set_preferred_path, 3814 ctx, 3815 bdev_nvme_set_preferred_path_done); 3816 return; 3817 3818 err_bdev: 3819 spdk_bdev_close(ctx->desc); 3820 err_open: 3821 free(ctx); 3822 err_alloc: 3823 cb_fn(cb_arg, rc); 3824 } 3825 3826 struct bdev_nvme_set_multipath_policy_ctx { 3827 struct spdk_bdev_desc *desc; 3828 bdev_nvme_set_multipath_policy_cb cb_fn; 3829 void *cb_arg; 3830 }; 3831 3832 static void 3833 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 3834 { 3835 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3836 3837 assert(ctx != NULL); 3838 assert(ctx->desc != NULL); 3839 assert(ctx->cb_fn != NULL); 3840 3841 spdk_bdev_close(ctx->desc); 3842 3843 ctx->cb_fn(ctx->cb_arg, status); 3844 3845 free(ctx); 3846 } 3847 3848 static void 3849 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 3850 { 3851 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3852 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3853 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 3854 3855 nbdev_ch->mp_policy = nbdev->mp_policy; 3856 nbdev_ch->current_io_path = NULL; 3857 3858 spdk_for_each_channel_continue(i, 0); 3859 } 3860 3861 void 3862 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 3863 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 3864 { 3865 struct bdev_nvme_set_multipath_policy_ctx *ctx; 3866 struct spdk_bdev *bdev; 3867 struct nvme_bdev *nbdev; 3868 int rc; 3869 3870 assert(cb_fn != NULL); 3871 3872 ctx = calloc(1, sizeof(*ctx)); 3873 if (ctx == NULL) { 3874 SPDK_ERRLOG("Failed to alloc context.\n"); 3875 rc = -ENOMEM; 3876 goto err_alloc; 3877 } 3878 3879 ctx->cb_fn = cb_fn; 3880 ctx->cb_arg = cb_arg; 3881 3882 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3883 if (rc != 0) { 3884 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 3885 rc = 
-ENODEV; 3886 goto err_open; 3887 } 3888 3889 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3890 if (bdev->module != &nvme_if) { 3891 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 3892 rc = -ENODEV; 3893 goto err_module; 3894 } 3895 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3896 3897 pthread_mutex_lock(&nbdev->mutex); 3898 nbdev->mp_policy = policy; 3899 pthread_mutex_unlock(&nbdev->mutex); 3900 3901 spdk_for_each_channel(nbdev, 3902 _bdev_nvme_set_multipath_policy, 3903 ctx, 3904 bdev_nvme_set_multipath_policy_done); 3905 return; 3906 3907 err_module: 3908 spdk_bdev_close(ctx->desc); 3909 err_open: 3910 free(ctx); 3911 err_alloc: 3912 cb_fn(cb_arg, rc); 3913 } 3914 3915 static void 3916 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 3917 { 3918 struct nvme_ctrlr *nvme_ctrlr = arg; 3919 union spdk_nvme_async_event_completion event; 3920 3921 if (spdk_nvme_cpl_is_error(cpl)) { 3922 SPDK_WARNLOG("AER request execute failed\n"); 3923 return; 3924 } 3925 3926 event.raw = cpl->cdw0; 3927 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3928 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 3929 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 3930 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3931 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 3932 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 3933 } 3934 } 3935 3936 static void 3937 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 3938 { 3939 if (ctx->cb_fn) { 3940 ctx->cb_fn(ctx->cb_ctx, count, rc); 3941 } 3942 3943 ctx->namespaces_populated = true; 3944 if (ctx->probe_done) { 3945 /* The probe was already completed, so we need to free the context 3946 * here. This can happen for cases like OCSSD, where we need to 3947 * send additional commands to the SSD after attach. 3948 */ 3949 free(ctx); 3950 } 3951 } 3952 3953 static void 3954 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 3955 struct nvme_async_probe_ctx *ctx) 3956 { 3957 spdk_io_device_register(nvme_ctrlr, 3958 bdev_nvme_create_ctrlr_channel_cb, 3959 bdev_nvme_destroy_ctrlr_channel_cb, 3960 sizeof(struct nvme_ctrlr_channel), 3961 nvme_ctrlr->nbdev_ctrlr->name); 3962 3963 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 3964 } 3965 3966 static void 3967 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 3968 { 3969 struct nvme_ctrlr *nvme_ctrlr = _ctx; 3970 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 3971 3972 nvme_ctrlr->probe_ctx = NULL; 3973 3974 if (spdk_nvme_cpl_is_error(cpl)) { 3975 nvme_ctrlr_delete(nvme_ctrlr); 3976 3977 if (ctx != NULL) { 3978 populate_namespaces_cb(ctx, 0, -1); 3979 } 3980 return; 3981 } 3982 3983 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 3984 } 3985 3986 static int 3987 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3988 struct nvme_async_probe_ctx *ctx) 3989 { 3990 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3991 const struct spdk_nvme_ctrlr_data *cdata; 3992 uint32_t ana_log_page_size; 3993 3994 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3995 3996 /* Set buffer size enough to include maximum number of allowed namespaces. 
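 * As a worked example (purely illustrative, actual struct sizes come from the
 * SPDK headers): a controller reporting nanagrpid == 32 and mnan == 1024 gets
 * a buffer of sizeof(struct spdk_nvme_ana_page) plus 32 ANA group descriptors
 * plus 1024 32-bit NSID entries, i.e. room for every allowed ANA group and
 * for one NSID per allowed namespace.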
*/ 3997 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3998 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 3999 sizeof(uint32_t); 4000 4001 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 4002 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4003 if (nvme_ctrlr->ana_log_page == NULL) { 4004 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 4005 return -ENXIO; 4006 } 4007 4008 /* Each descriptor in an ANA log page is not guaranteed to be 8-byte aligned. 4009 * Hence copy each descriptor to a temporary area when parsing it. 4010 * 4011 * Allocate a buffer whose size is as large as the ANA log page buffer because 4012 * we do not know the size of a descriptor until actually reading it. 4013 */ 4014 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 4015 if (nvme_ctrlr->copied_ana_desc == NULL) { 4016 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 4017 return -ENOMEM; 4018 } 4019 4020 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 4021 4022 nvme_ctrlr->probe_ctx = ctx; 4023 4024 /* Then, set the read size only to include the currently active namespaces. */ 4025 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4026 4027 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4028 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4029 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4030 return -EINVAL; 4031 } 4032 4033 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 4034 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4035 SPDK_NVME_GLOBAL_NS_TAG, 4036 nvme_ctrlr->ana_log_page, 4037 ana_log_page_size, 0, 4038 nvme_ctrlr_init_ana_log_page_done, 4039 nvme_ctrlr); 4040 } 4041 4042 /* hostnqn and subnqn were already verified before attaching a controller. 4043 * Hence check only the multipath capability and cntlid here.
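 *
 * Concretely, the checks below require that the newly attached controller
 * reports CMIC.multi_ctrlr, that every controller already grouped under this
 * name reports it too, and that no two controllers in the group share a
 * CNTLID.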
4044 */ 4045 static bool 4046 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 4047 { 4048 struct nvme_ctrlr *tmp; 4049 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 4050 4051 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4052 4053 if (!cdata->cmic.multi_ctrlr) { 4054 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 4055 return false; 4056 } 4057 4058 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 4059 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 4060 4061 if (!tmp_cdata->cmic.multi_ctrlr) { 4062 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid); 4063 return false; 4064 } 4065 if (cdata->cntlid == tmp_cdata->cntlid) { 4066 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 4067 return false; 4068 } 4069 } 4070 4071 return true; 4072 } 4073 4074 static int 4075 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 4076 { 4077 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4078 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4079 int rc = 0; 4080 4081 pthread_mutex_lock(&g_bdev_nvme_mutex); 4082 4083 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4084 if (nbdev_ctrlr != NULL) { 4085 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 4086 rc = -EINVAL; 4087 goto exit; 4088 } 4089 } else { 4090 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 4091 if (nbdev_ctrlr == NULL) { 4092 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 4093 rc = -ENOMEM; 4094 goto exit; 4095 } 4096 nbdev_ctrlr->name = strdup(name); 4097 if (nbdev_ctrlr->name == NULL) { 4098 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 4099 free(nbdev_ctrlr); rc = -ENOMEM; 4100 goto exit; 4101 } 4102 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 4103 TAILQ_INIT(&nbdev_ctrlr->bdevs); 4104 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 4105 } 4106 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 4107 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 4108 exit: 4109 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4110 return rc; 4111 } 4112 4113 static int 4114 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 4115 const char *name, 4116 const struct spdk_nvme_transport_id *trid, 4117 struct nvme_async_probe_ctx *ctx) 4118 { 4119 struct nvme_ctrlr *nvme_ctrlr; 4120 struct nvme_path_id *path_id; 4121 const struct spdk_nvme_ctrlr_data *cdata; 4122 int rc; 4123 4124 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 4125 if (nvme_ctrlr == NULL) { 4126 SPDK_ERRLOG("Failed to allocate device struct\n"); 4127 return -ENOMEM; 4128 } 4129 4130 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 4131 if (rc != 0) { 4132 free(nvme_ctrlr); 4133 return rc; 4134 } 4135 4136 TAILQ_INIT(&nvme_ctrlr->trids); 4137 4138 RB_INIT(&nvme_ctrlr->namespaces); 4139 4140 path_id = calloc(1, sizeof(*path_id)); 4141 if (path_id == NULL) { 4142 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 4143 rc = -ENOMEM; 4144 goto err; 4145 } 4146 4147 path_id->trid = *trid; 4148 if (ctx != NULL) { 4149 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 4150 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 4151 } 4152 nvme_ctrlr->active_path_id = path_id; 4153 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 4154 4155 nvme_ctrlr->thread = spdk_get_thread(); 4156 nvme_ctrlr->ctrlr = ctrlr; 4157 nvme_ctrlr->ref = 1; 4158 4159 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 4160 SPDK_ERRLOG("OCSSDs are not supported\n"); 4161 rc = -ENOTSUP; 4162 goto err; 4163 } 4164 4165
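/* Per-controller options come from the async probe context when the attach was
 * requested through bdev_nvme_create(); hotplugged controllers arrive with
 * ctx == NULL and fall back to the defaults derived from g_opts.
 */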
if (ctx != NULL) { 4166 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 4167 } else { 4168 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 4169 } 4170 4171 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 4172 g_opts.nvme_adminq_poll_period_us); 4173 4174 if (g_opts.timeout_us > 0) { 4175 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 4176 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 4177 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 4178 g_opts.timeout_us : g_opts.timeout_admin_us; 4179 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 4180 adm_timeout_us, timeout_cb, nvme_ctrlr); 4181 } 4182 4183 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 4184 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 4185 4186 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 4187 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 4188 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 4189 } 4190 4191 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 4192 if (rc != 0) { 4193 goto err; 4194 } 4195 4196 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4197 4198 if (cdata->cmic.ana_reporting) { 4199 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 4200 if (rc == 0) { 4201 return 0; 4202 } 4203 } else { 4204 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4205 return 0; 4206 } 4207 4208 err: 4209 nvme_ctrlr_delete(nvme_ctrlr); 4210 return rc; 4211 } 4212 4213 void 4214 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 4215 { 4216 opts->prchk_flags = 0; 4217 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 4218 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 4219 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 4220 } 4221 4222 static void 4223 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4224 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 4225 { 4226 char *name; 4227 4228 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 4229 if (!name) { 4230 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 4231 return; 4232 } 4233 4234 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 4235 4236 nvme_ctrlr_create(ctrlr, name, trid, NULL); 4237 4238 free(name); 4239 } 4240 4241 static void 4242 _nvme_ctrlr_destruct(void *ctx) 4243 { 4244 struct nvme_ctrlr *nvme_ctrlr = ctx; 4245 4246 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 4247 nvme_ctrlr_release(nvme_ctrlr); 4248 } 4249 4250 static int 4251 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4252 { 4253 struct nvme_probe_skip_entry *entry; 4254 4255 pthread_mutex_lock(&nvme_ctrlr->mutex); 4256 4257 /* The controller's destruction was already started */ 4258 if (nvme_ctrlr->destruct) { 4259 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4260 return 0; 4261 } 4262 4263 if (!hotplug && 4264 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 4265 entry = calloc(1, sizeof(*entry)); 4266 if (!entry) { 4267 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4268 return -ENOMEM; 4269 } 4270 entry->trid = nvme_ctrlr->active_path_id->trid; 4271 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 4272 } 4273 4274 nvme_ctrlr->destruct = true; 4275 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4276 4277 _nvme_ctrlr_destruct(nvme_ctrlr); 4278 4279 return 0; 4280 } 4281 4282 static void 4283 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr 
*ctrlr) 4284 { 4285 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 4286 4287 _bdev_nvme_delete(nvme_ctrlr, true); 4288 } 4289 4290 static int 4291 bdev_nvme_hotplug_probe(void *arg) 4292 { 4293 if (g_hotplug_probe_ctx == NULL) { 4294 spdk_poller_unregister(&g_hotplug_probe_poller); 4295 return SPDK_POLLER_IDLE; 4296 } 4297 4298 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 4299 g_hotplug_probe_ctx = NULL; 4300 spdk_poller_unregister(&g_hotplug_probe_poller); 4301 } 4302 4303 return SPDK_POLLER_BUSY; 4304 } 4305 4306 static int 4307 bdev_nvme_hotplug(void *arg) 4308 { 4309 struct spdk_nvme_transport_id trid_pcie; 4310 4311 if (g_hotplug_probe_ctx) { 4312 return SPDK_POLLER_BUSY; 4313 } 4314 4315 memset(&trid_pcie, 0, sizeof(trid_pcie)); 4316 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 4317 4318 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 4319 hotplug_probe_cb, attach_cb, NULL); 4320 4321 if (g_hotplug_probe_ctx) { 4322 assert(g_hotplug_probe_poller == NULL); 4323 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 4324 } 4325 4326 return SPDK_POLLER_BUSY; 4327 } 4328 4329 void 4330 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 4331 { 4332 *opts = g_opts; 4333 } 4334 4335 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4336 uint32_t reconnect_delay_sec, 4337 uint32_t fast_io_fail_timeout_sec); 4338 4339 static int 4340 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 4341 { 4342 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 4343 /* Can't set timeout_admin_us without also setting timeout_us */ 4344 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 4345 return -EINVAL; 4346 } 4347 4348 if (opts->bdev_retry_count < -1) { 4349 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 4350 return -EINVAL; 4351 } 4352 4353 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 4354 opts->reconnect_delay_sec, 4355 opts->fast_io_fail_timeout_sec)) { 4356 return -EINVAL; 4357 } 4358 4359 return 0; 4360 } 4361 4362 int 4363 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 4364 { 4365 int ret = bdev_nvme_validate_opts(opts); 4366 if (ret) { 4367 SPDK_WARNLOG("Failed to set nvme opts.\n"); 4368 return ret; 4369 } 4370 4371 if (g_bdev_nvme_init_thread != NULL) { 4372 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4373 return -EPERM; 4374 } 4375 } 4376 4377 g_opts = *opts; 4378 4379 return 0; 4380 } 4381 4382 struct set_nvme_hotplug_ctx { 4383 uint64_t period_us; 4384 bool enabled; 4385 spdk_msg_fn fn; 4386 void *fn_ctx; 4387 }; 4388 4389 static void 4390 set_nvme_hotplug_period_cb(void *_ctx) 4391 { 4392 struct set_nvme_hotplug_ctx *ctx = _ctx; 4393 4394 spdk_poller_unregister(&g_hotplug_poller); 4395 if (ctx->enabled) { 4396 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 4397 } 4398 4399 g_nvme_hotplug_poll_period_us = ctx->period_us; 4400 g_nvme_hotplug_enabled = ctx->enabled; 4401 if (ctx->fn) { 4402 ctx->fn(ctx->fn_ctx); 4403 } 4404 4405 free(ctx); 4406 } 4407 4408 int 4409 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 4410 { 4411 struct set_nvme_hotplug_ctx *ctx; 4412 4413 if (enabled == true && !spdk_process_is_primary()) { 4414 return -EPERM; 4415 } 4416 4417 ctx = calloc(1, sizeof(*ctx)); 4418 if (ctx == NULL) { 4419 return -ENOMEM; 4420 } 4421 4422 period_us = 
period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 4423 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 4424 ctx->enabled = enabled; 4425 ctx->fn = cb; 4426 ctx->fn_ctx = cb_ctx; 4427 4428 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 4429 return 0; 4430 } 4431 4432 static void 4433 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 4434 struct nvme_async_probe_ctx *ctx) 4435 { 4436 struct nvme_ns *nvme_ns; 4437 struct nvme_bdev *nvme_bdev; 4438 size_t j; 4439 4440 assert(nvme_ctrlr != NULL); 4441 4442 if (ctx->names == NULL) { 4443 populate_namespaces_cb(ctx, 0, 0); 4444 return; 4445 } 4446 4447 /* 4448 * Report the new bdevs that were created in this call. 4449 * There can be more than one bdev per NVMe controller. 4450 */ 4451 j = 0; 4452 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4453 while (nvme_ns != NULL) { 4454 nvme_bdev = nvme_ns->bdev; 4455 if (j < ctx->count) { 4456 ctx->names[j] = nvme_bdev->disk.name; 4457 j++; 4458 } else { 4459 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n", 4460 ctx->count); 4461 populate_namespaces_cb(ctx, 0, -ERANGE); 4462 return; 4463 } 4464 4465 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4466 } 4467 4468 populate_namespaces_cb(ctx, j, 0); 4469 } 4470 4471 static int 4472 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4473 struct spdk_nvme_ctrlr *new_ctrlr, 4474 struct spdk_nvme_transport_id *trid) 4475 { 4476 struct nvme_path_id *tmp_trid; 4477 4478 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4479 SPDK_ERRLOG("PCIe failover is not supported.\n"); 4480 return -ENOTSUP; 4481 } 4482 4483 /* Currently we only support failover to the same transport type. */ 4484 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 4485 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 4486 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 4487 spdk_nvme_transport_id_trtype_str(trid->trtype)); 4488 return -EINVAL; 4489 } 4490 4491 4492 /* Currently we only support failover to the same NQN. */ 4493 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 4494 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 4495 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 4496 return -EINVAL; 4497 } 4498 4499 /* Skip all the other checks if we've already registered this path.
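 * A path counts as registered when spdk_nvme_transport_id_compare() reports a
 * match against any trid already linked to this nvme_ctrlr, in which case the
 * caller gets -EEXIST instead of a second copy of the path.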
*/ 4500 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4501 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 4502 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 4503 trid->subnqn); 4504 return -EEXIST; 4505 } 4506 } 4507 4508 return 0; 4509 } 4510 4511 static int 4512 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 4513 struct spdk_nvme_ctrlr *new_ctrlr) 4514 { 4515 struct nvme_ns *nvme_ns; 4516 struct spdk_nvme_ns *new_ns; 4517 4518 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4519 while (nvme_ns != NULL) { 4520 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 4521 assert(new_ns != NULL); 4522 4523 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 4524 return -EINVAL; 4525 } 4526 4527 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4528 } 4529 4530 return 0; 4531 } 4532 4533 static int 4534 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4535 struct spdk_nvme_transport_id *trid) 4536 { 4537 struct nvme_path_id *new_trid, *tmp_trid; 4538 4539 new_trid = calloc(1, sizeof(*new_trid)); 4540 if (new_trid == NULL) { 4541 return -ENOMEM; 4542 } 4543 new_trid->trid = *trid; 4544 new_trid->is_failed = false; 4545 4546 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4547 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 4548 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 4549 return 0; 4550 } 4551 } 4552 4553 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 4554 return 0; 4555 } 4556 4557 /* This is the case that a secondary path is added to an existing 4558 * nvme_ctrlr for failover. After checking if it can access the same 4559 * namespaces as the primary path, it is disconnected until failover occurs. 4560 */ 4561 static int 4562 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4563 struct spdk_nvme_ctrlr *new_ctrlr, 4564 struct spdk_nvme_transport_id *trid) 4565 { 4566 int rc; 4567 4568 assert(nvme_ctrlr != NULL); 4569 4570 pthread_mutex_lock(&nvme_ctrlr->mutex); 4571 4572 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 4573 if (rc != 0) { 4574 goto exit; 4575 } 4576 4577 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 4578 if (rc != 0) { 4579 goto exit; 4580 } 4581 4582 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 4583 4584 exit: 4585 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4586 4587 spdk_nvme_detach(new_ctrlr); 4588 4589 return rc; 4590 } 4591 4592 static void 4593 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4594 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 4595 { 4596 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4597 struct nvme_async_probe_ctx *ctx; 4598 int rc; 4599 4600 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4601 ctx->ctrlr_attached = true; 4602 4603 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 4604 if (rc != 0) { 4605 populate_namespaces_cb(ctx, 0, rc); 4606 } 4607 } 4608 4609 static void 4610 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4611 struct spdk_nvme_ctrlr *ctrlr, 4612 const struct spdk_nvme_ctrlr_opts *opts) 4613 { 4614 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4615 struct nvme_ctrlr *nvme_ctrlr; 4616 struct nvme_async_probe_ctx *ctx; 4617 int rc; 4618 4619 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4620 ctx->ctrlr_attached = true; 4621 4622 nvme_ctrlr = 
nvme_ctrlr_get_by_name(ctx->base_name); 4623 if (nvme_ctrlr) { 4624 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 4625 } else { 4626 rc = -ENODEV; 4627 } 4628 4629 populate_namespaces_cb(ctx, 0, rc); 4630 } 4631 4632 static int 4633 bdev_nvme_async_poll(void *arg) 4634 { 4635 struct nvme_async_probe_ctx *ctx = arg; 4636 int rc; 4637 4638 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 4639 if (spdk_unlikely(rc != -EAGAIN)) { 4640 ctx->probe_done = true; 4641 spdk_poller_unregister(&ctx->poller); 4642 if (!ctx->ctrlr_attached) { 4643 /* The probe is done, but no controller was attached. 4644 * That means we had a failure, so report -EIO back to 4645 * the caller (usually the RPC). populate_namespaces_cb() 4646 * will take care of freeing the nvme_async_probe_ctx. 4647 */ 4648 populate_namespaces_cb(ctx, 0, -EIO); 4649 } else if (ctx->namespaces_populated) { 4650 /* The namespaces for the attached controller were all 4651 * populated and the response was already sent to the 4652 * caller (usually the RPC). So free the context here. 4653 */ 4654 free(ctx); 4655 } 4656 } 4657 4658 return SPDK_POLLER_BUSY; 4659 } 4660 4661 static bool 4662 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4663 uint32_t reconnect_delay_sec, 4664 uint32_t fast_io_fail_timeout_sec) 4665 { 4666 if (ctrlr_loss_timeout_sec < -1) { 4667 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 4668 return false; 4669 } else if (ctrlr_loss_timeout_sec == -1) { 4670 if (reconnect_delay_sec == 0) { 4671 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 4672 return false; 4673 } else if (fast_io_fail_timeout_sec != 0 && 4674 fast_io_fail_timeout_sec < reconnect_delay_sec) { 4675 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 4676 return false; 4677 } 4678 } else if (ctrlr_loss_timeout_sec != 0) { 4679 if (reconnect_delay_sec == 0) { 4680 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 4681 return false; 4682 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 4683 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 4684 return false; 4685 } else if (fast_io_fail_timeout_sec != 0) { 4686 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 4687 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 4688 return false; 4689 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 4690 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 4691 return false; 4692 } 4693 } 4694 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 4695 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 4696 return false; 4697 } 4698 4699 return true; 4700 } 4701 4702 int 4703 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 4704 const char *base_name, 4705 const char **names, 4706 uint32_t count, 4707 spdk_bdev_create_nvme_fn cb_fn, 4708 void *cb_ctx, 4709 struct spdk_nvme_ctrlr_opts *drv_opts, 4710 struct nvme_ctrlr_opts *bdev_opts, 4711 bool multipath) 4712 { 4713 struct nvme_probe_skip_entry *entry, *tmp; 4714 struct nvme_async_probe_ctx *ctx; 4715 spdk_nvme_attach_cb attach_cb; 4716 4717 /* TODO expand this check to include both the host and target TRIDs. 4718 * Only if both are the same should we fail.
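 * In other words, connecting to the same subsystem through a different host
 * interface (a different host TRID) should become an additional path instead
 * of being rejected with -EEXIST.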
4719 */ 4720 if (nvme_ctrlr_get(trid) != NULL) { 4721 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 4722 return -EEXIST; 4723 } 4724 4725 if (bdev_opts != NULL && 4726 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 4727 bdev_opts->reconnect_delay_sec, 4728 bdev_opts->fast_io_fail_timeout_sec)) { 4729 return -EINVAL; 4730 } 4731 4732 ctx = calloc(1, sizeof(*ctx)); 4733 if (!ctx) { 4734 return -ENOMEM; 4735 } 4736 ctx->base_name = base_name; 4737 ctx->names = names; 4738 ctx->count = count; 4739 ctx->cb_fn = cb_fn; 4740 ctx->cb_ctx = cb_ctx; 4741 ctx->trid = *trid; 4742 4743 if (bdev_opts) { 4744 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 4745 } else { 4746 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 4747 } 4748 4749 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4750 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 4751 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4752 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 4753 free(entry); 4754 break; 4755 } 4756 } 4757 } 4758 4759 if (drv_opts) { 4760 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 4761 } else { 4762 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 4763 } 4764 4765 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 4766 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 4767 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 4768 ctx->drv_opts.disable_read_ana_log_page = true; 4769 4770 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 4771 attach_cb = connect_attach_cb; 4772 } else { 4773 attach_cb = connect_set_failover_cb; 4774 } 4775 4776 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 4777 if (ctx->probe_ctx == NULL) { 4778 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 4779 free(ctx); 4780 return -ENODEV; 4781 } 4782 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 4783 4784 return 0; 4785 } 4786 4787 int 4788 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 4789 { 4790 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4791 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 4792 struct nvme_path_id *p, *t; 4793 int rc = -ENXIO; 4794 4795 if (name == NULL || path_id == NULL) { 4796 return -EINVAL; 4797 } 4798 4799 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4800 if (nbdev_ctrlr == NULL) { 4801 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 4802 return -ENODEV; 4803 } 4804 4805 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 4806 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 4807 if (path_id->trid.trtype != 0) { 4808 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 4809 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 4810 continue; 4811 } 4812 } else { 4813 if (path_id->trid.trtype != p->trid.trtype) { 4814 continue; 4815 } 4816 } 4817 } 4818 4819 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 4820 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 4821 continue; 4822 } 4823 } 4824 4825 if (path_id->trid.adrfam != 0) { 4826 if (path_id->trid.adrfam != p->trid.adrfam) { 4827 continue; 4828 } 4829 } 4830 4831 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 4832 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 4833 
continue; 4834 } 4835 } 4836 4837 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 4838 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 4839 continue; 4840 } 4841 } 4842 4843 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 4844 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 4845 continue; 4846 } 4847 } 4848 4849 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 4850 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 4851 continue; 4852 } 4853 } 4854 4855 /* If we made it here, then this path is a match! Now we need to remove it. */ 4856 if (p == nvme_ctrlr->active_path_id) { 4857 /* This is the active path in use right now. The active path is always the first in the list. */ 4858 4859 if (!TAILQ_NEXT(p, link)) { 4860 /* The current path is the only path. */ 4861 rc = _bdev_nvme_delete(nvme_ctrlr, false); 4862 } else { 4863 /* There is an alternative path. */ 4864 rc = bdev_nvme_failover(nvme_ctrlr, true); 4865 } 4866 } else { 4867 /* We are not using the specified path. */ 4868 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 4869 free(p); 4870 rc = 0; 4871 } 4872 4873 if (rc < 0 && rc != -ENXIO) { 4874 return rc; 4875 } 4876 4877 4878 } 4879 } 4880 4881 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 4882 return rc; 4883 } 4884 4885 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 4886 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4887 4888 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 4889 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4890 4891 struct discovery_entry_ctx { 4892 char name[128]; 4893 struct spdk_nvme_transport_id trid; 4894 struct spdk_nvme_ctrlr_opts drv_opts; 4895 struct spdk_nvmf_discovery_log_page_entry entry; 4896 TAILQ_ENTRY(discovery_entry_ctx) tailq; 4897 struct discovery_ctx *ctx; 4898 }; 4899 4900 struct discovery_ctx { 4901 char *name; 4902 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 4903 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 4904 void *cb_ctx; 4905 struct spdk_nvme_probe_ctx *probe_ctx; 4906 struct spdk_nvme_detach_ctx *detach_ctx; 4907 struct spdk_nvme_ctrlr *ctrlr; 4908 struct spdk_nvme_transport_id trid; 4909 struct discovery_entry_ctx *entry_ctx_in_use; 4910 struct spdk_poller *poller; 4911 struct spdk_nvme_ctrlr_opts drv_opts; 4912 struct nvme_ctrlr_opts bdev_opts; 4913 struct spdk_nvmf_discovery_log_page *log_page; 4914 TAILQ_ENTRY(discovery_ctx) tailq; 4915 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 4916 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 4917 int rc; 4918 bool wait_for_attach; 4919 uint64_t timeout_ticks; 4920 /* Denotes that the discovery service is being started. We're waiting 4921 * for the initial connection to the discovery controller to be 4922 * established and attach discovered NVM ctrlrs. 4923 */ 4924 bool initializing; 4925 /* Denotes if a discovery is currently in progress for this context. 4926 * That includes connecting to newly discovered subsystems. Used to 4927 * ensure we do not start a new discovery until an existing one is 4928 * complete. 4929 */ 4930 bool in_progress; 4931 4932 /* Denotes if another discovery is needed after the one in progress 4933 * completes. Set when we receive an AER completion while a discovery 4934 * is already in progress. 
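 * When the pass that is currently running finishes, discovery_complete()
 * notices this flag and immediately fetches a fresh discovery log page.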
4935 */ 4936 bool pending; 4937 4938 /* Signal to the discovery context poller that it should stop the 4939 * discovery service, including detaching from the current discovery 4940 * controller. 4941 */ 4942 bool stop; 4943 4944 struct spdk_thread *calling_thread; 4945 uint32_t index; 4946 uint32_t attach_in_progress; 4947 char *hostnqn; 4948 }; 4949 4950 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 4951 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 4952 4953 static void get_discovery_log_page(struct discovery_ctx *ctx); 4954 4955 static void 4956 free_discovery_ctx(struct discovery_ctx *ctx) 4957 { 4958 free(ctx->log_page); 4959 free(ctx->hostnqn); 4960 free(ctx->name); 4961 free(ctx); 4962 } 4963 4964 static void 4965 discovery_complete(struct discovery_ctx *ctx) 4966 { 4967 ctx->initializing = false; 4968 ctx->in_progress = false; 4969 if (ctx->pending) { 4970 ctx->pending = false; 4971 get_discovery_log_page(ctx); 4972 } 4973 } 4974 4975 static void 4976 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 4977 struct spdk_nvmf_discovery_log_page_entry *entry) 4978 { 4979 char *space; 4980 4981 trid->trtype = entry->trtype; 4982 trid->adrfam = entry->adrfam; 4983 memcpy(trid->traddr, entry->traddr, sizeof(trid->traddr)); 4984 memcpy(trid->trsvcid, entry->trsvcid, sizeof(trid->trsvcid)); 4985 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 4986 4987 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 4988 * But the log page entries typically pad them with spaces, not zeroes. 4989 * So add a NULL terminator to each of these fields at the appropriate 4990 * location. 4991 */ 4992 space = strchr(trid->traddr, ' '); 4993 if (space) { 4994 *space = 0; 4995 } 4996 space = strchr(trid->trsvcid, ' '); 4997 if (space) { 4998 *space = 0; 4999 } 5000 space = strchr(trid->subnqn, ' '); 5001 if (space) { 5002 *space = 0; 5003 } 5004 } 5005 5006 static void 5007 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5008 { 5009 ctx->stop = true; 5010 ctx->stop_cb_fn = cb_fn; 5011 ctx->cb_ctx = cb_ctx; 5012 5013 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 5014 struct discovery_entry_ctx *entry_ctx; 5015 struct nvme_path_id path = {}; 5016 5017 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 5018 path.trid = entry_ctx->trid; 5019 bdev_nvme_delete(entry_ctx->name, &path); 5020 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5021 free(entry_ctx); 5022 } 5023 5024 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 5025 struct discovery_entry_ctx *entry_ctx; 5026 5027 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5028 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5029 free(entry_ctx); 5030 } 5031 5032 free(ctx->entry_ctx_in_use); 5033 ctx->entry_ctx_in_use = NULL; 5034 } 5035 5036 static void 5037 discovery_remove_controllers(struct discovery_ctx *ctx) 5038 { 5039 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 5040 struct discovery_entry_ctx *entry_ctx, *tmp; 5041 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5042 struct spdk_nvme_transport_id old_trid; 5043 uint64_t numrec, i; 5044 bool found; 5045 5046 numrec = from_le64(&log_page->numrec); 5047 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 5048 found = false; 5049 old_entry = &entry_ctx->entry; 5050 build_trid_from_log_page_entry(&old_trid, old_entry); 5051 for (i = 0; i < numrec; i++) { 5052 new_entry = &log_page->entries[i]; 5053 if 
(!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 5054 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 5055 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5056 found = true; 5057 break; 5058 } 5059 } 5060 if (!found) { 5061 struct nvme_path_id path = {}; 5062 5063 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 5064 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5065 5066 path.trid = entry_ctx->trid; 5067 bdev_nvme_delete(entry_ctx->name, &path); 5068 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5069 free(entry_ctx); 5070 } 5071 } 5072 free(log_page); 5073 ctx->log_page = NULL; 5074 discovery_complete(ctx); 5075 } 5076 5077 static void 5078 complete_discovery_start(struct discovery_ctx *ctx, int status) 5079 { 5080 ctx->timeout_ticks = 0; 5081 ctx->rc = status; 5082 if (ctx->start_cb_fn) { 5083 ctx->start_cb_fn(ctx->cb_ctx, status); 5084 ctx->start_cb_fn = NULL; 5085 ctx->cb_ctx = NULL; 5086 } 5087 } 5088 5089 static void 5090 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 5091 { 5092 struct discovery_entry_ctx *entry_ctx = cb_ctx; 5093 struct discovery_ctx *ctx = entry_ctx->ctx; 5094 5095 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 5096 ctx->attach_in_progress--; 5097 if (ctx->attach_in_progress == 0) { 5098 complete_discovery_start(ctx, ctx->rc); 5099 if (ctx->initializing && ctx->rc != 0) { 5100 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 5101 stop_discovery(ctx, NULL, ctx->cb_ctx); 5102 } else { 5103 discovery_remove_controllers(ctx); 5104 } 5105 } 5106 } 5107 5108 static struct discovery_entry_ctx * 5109 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 5110 { 5111 struct discovery_entry_ctx *new_ctx; 5112 5113 new_ctx = calloc(1, sizeof(*new_ctx)); 5114 if (new_ctx == NULL) { 5115 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5116 return NULL; 5117 } 5118 5119 new_ctx->ctx = ctx; 5120 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 5121 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5122 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5123 return new_ctx; 5124 } 5125 5126 static void 5127 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 5128 struct spdk_nvmf_discovery_log_page *log_page) 5129 { 5130 struct discovery_ctx *ctx = cb_arg; 5131 struct discovery_entry_ctx *entry_ctx, *tmp; 5132 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5133 uint64_t numrec, i; 5134 bool found; 5135 5136 if (rc || spdk_nvme_cpl_is_error(cpl)) { 5137 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5138 return; 5139 } 5140 5141 ctx->log_page = log_page; 5142 assert(ctx->attach_in_progress == 0); 5143 numrec = from_le64(&log_page->numrec); 5144 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 5145 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5146 free(entry_ctx); 5147 } 5148 for (i = 0; i < numrec; i++) { 5149 found = false; 5150 new_entry = &log_page->entries[i]; 5151 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 5152 struct discovery_entry_ctx *new_ctx; 5153 struct spdk_nvme_transport_id trid = {}; 5154 5155 build_trid_from_log_page_entry(&trid, new_entry); 5156 new_ctx = create_discovery_entry_ctx(ctx, &trid); 5157 if (new_ctx == NULL) { 5158 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5159 break; 5160 } 5161 5162 
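/* This entry is a referral to another discovery subsystem. Keep it on the
 * discovery_entry_ctxs list so the poller can fall back to it if the current
 * discovery controller disappears; no NVM bdevs are created for it.
 */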
TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 5163 continue; 5164 } 5165 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 5166 old_entry = &entry_ctx->entry; 5167 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 5168 found = true; 5169 break; 5170 } 5171 } 5172 if (!found) { 5173 struct discovery_entry_ctx *subnqn_ctx, *new_ctx; 5174 5175 TAILQ_FOREACH(subnqn_ctx, &ctx->nvm_entry_ctxs, tailq) { 5176 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 5177 sizeof(new_entry->subnqn))) { 5178 break; 5179 } 5180 } 5181 5182 new_ctx = calloc(1, sizeof(*new_ctx)); 5183 if (new_ctx == NULL) { 5184 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5185 break; 5186 } 5187 5188 new_ctx->ctx = ctx; 5189 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 5190 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 5191 if (subnqn_ctx) { 5192 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 5193 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 5194 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5195 new_ctx->name); 5196 } else { 5197 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 5198 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 5199 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5200 new_ctx->name); 5201 } 5202 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5203 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5204 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 5205 discovery_attach_controller_done, new_ctx, 5206 &new_ctx->drv_opts, &ctx->bdev_opts, true); 5207 if (rc == 0) { 5208 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 5209 ctx->attach_in_progress++; 5210 } else { 5211 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 5212 } 5213 } 5214 } 5215 5216 if (ctx->attach_in_progress == 0) { 5217 discovery_remove_controllers(ctx); 5218 } 5219 } 5220 5221 static void 5222 get_discovery_log_page(struct discovery_ctx *ctx) 5223 { 5224 int rc; 5225 5226 assert(ctx->in_progress == false); 5227 ctx->in_progress = true; 5228 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 5229 if (rc != 0) { 5230 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5231 } 5232 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 5233 } 5234 5235 static void 5236 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5237 { 5238 struct discovery_ctx *ctx = arg; 5239 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 5240 5241 if (spdk_nvme_cpl_is_error(cpl)) { 5242 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 5243 return; 5244 } 5245 5246 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 5247 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 5248 return; 5249 } 5250 5251 DISCOVERY_INFOLOG(ctx, "got aer\n"); 5252 if (ctx->in_progress) { 5253 ctx->pending = true; 5254 return; 5255 } 5256 5257 get_discovery_log_page(ctx); 5258 } 5259 5260 static void 5261 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5262 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5263 { 5264 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5265 struct discovery_ctx *ctx; 5266 5267 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 5268 5269 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 5270 ctx->probe_ctx = 
NULL; 5271 ctx->ctrlr = ctrlr; 5272 5273 if (ctx->rc != 0) { 5274 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 5275 ctx->rc); 5276 return; 5277 } 5278 5279 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 5280 } 5281 5282 static int 5283 discovery_poller(void *arg) 5284 { 5285 struct discovery_ctx *ctx = arg; 5286 struct spdk_nvme_transport_id *trid; 5287 int rc; 5288 5289 if (ctx->detach_ctx) { 5290 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 5291 if (rc != -EAGAIN) { 5292 ctx->detach_ctx = NULL; 5293 ctx->ctrlr = NULL; 5294 } 5295 } else if (ctx->stop) { 5296 if (ctx->ctrlr != NULL) { 5297 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5298 if (rc == 0) { 5299 return SPDK_POLLER_BUSY; 5300 } 5301 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5302 } 5303 spdk_poller_unregister(&ctx->poller); 5304 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5305 assert(ctx->start_cb_fn == NULL); 5306 if (ctx->stop_cb_fn != NULL) { 5307 ctx->stop_cb_fn(ctx->cb_ctx); 5308 } 5309 free_discovery_ctx(ctx); 5310 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 5311 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5312 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5313 assert(ctx->initializing); 5314 spdk_poller_unregister(&ctx->poller); 5315 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5316 complete_discovery_start(ctx, -ETIMEDOUT); 5317 stop_discovery(ctx, NULL, NULL); 5318 free_discovery_ctx(ctx); 5319 return SPDK_POLLER_BUSY; 5320 } 5321 5322 assert(ctx->entry_ctx_in_use == NULL); 5323 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5324 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5325 trid = &ctx->entry_ctx_in_use->trid; 5326 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 5327 if (ctx->probe_ctx) { 5328 spdk_poller_unregister(&ctx->poller); 5329 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 5330 } else { 5331 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 5332 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5333 ctx->entry_ctx_in_use = NULL; 5334 } 5335 } else if (ctx->probe_ctx) { 5336 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5337 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5338 complete_discovery_start(ctx, -ETIMEDOUT); 5339 return SPDK_POLLER_BUSY; 5340 } 5341 5342 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5343 if (rc != -EAGAIN) { 5344 if (ctx->rc != 0) { 5345 assert(ctx->initializing); 5346 stop_discovery(ctx, NULL, ctx->cb_ctx); 5347 } else { 5348 assert(rc == 0); 5349 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 5350 ctx->rc = rc; 5351 get_discovery_log_page(ctx); 5352 } 5353 } 5354 } else { 5355 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5356 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 5357 complete_discovery_start(ctx, -ETIMEDOUT); 5358 /* We need to wait until all NVM ctrlrs are attached before we stop the 5359 * discovery service to make sure we don't detach a ctrlr that is still 5360 * being attached. 
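 * The timeout is therefore only reported to the caller here; the actual stop
 * is deferred until attach_in_progress drops to zero.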
5361 */ 5362 if (ctx->attach_in_progress == 0) { 5363 stop_discovery(ctx, NULL, ctx->cb_ctx); 5364 return SPDK_POLLER_BUSY; 5365 } 5366 } 5367 5368 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 5369 if (rc < 0) { 5370 spdk_poller_unregister(&ctx->poller); 5371 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5372 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5373 ctx->entry_ctx_in_use = NULL; 5374 5375 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5376 if (rc != 0) { 5377 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5378 ctx->ctrlr = NULL; 5379 } 5380 } 5381 } 5382 5383 return SPDK_POLLER_BUSY; 5384 } 5385 5386 static void 5387 start_discovery_poller(void *arg) 5388 { 5389 struct discovery_ctx *ctx = arg; 5390 5391 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 5392 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5393 } 5394 5395 int 5396 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 5397 const char *base_name, 5398 struct spdk_nvme_ctrlr_opts *drv_opts, 5399 struct nvme_ctrlr_opts *bdev_opts, 5400 uint64_t attach_timeout, 5401 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 5402 { 5403 struct discovery_ctx *ctx; 5404 struct discovery_entry_ctx *discovery_entry_ctx; 5405 5406 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 5407 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5408 if (strcmp(ctx->name, base_name) == 0) { 5409 return -EEXIST; 5410 } 5411 5412 if (ctx->entry_ctx_in_use != NULL) { 5413 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 5414 return -EEXIST; 5415 } 5416 } 5417 5418 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 5419 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 5420 return -EEXIST; 5421 } 5422 } 5423 } 5424 5425 ctx = calloc(1, sizeof(*ctx)); 5426 if (ctx == NULL) { 5427 return -ENOMEM; 5428 } 5429 5430 ctx->name = strdup(base_name); 5431 if (ctx->name == NULL) { 5432 free_discovery_ctx(ctx); 5433 return -ENOMEM; 5434 } 5435 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5436 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5437 ctx->bdev_opts.from_discovery_service = true; 5438 ctx->calling_thread = spdk_get_thread(); 5439 ctx->start_cb_fn = cb_fn; 5440 ctx->cb_ctx = cb_ctx; 5441 ctx->initializing = true; 5442 if (ctx->start_cb_fn) { 5443 /* We can use this when dumping json to denote if this RPC parameter 5444 * was specified or not. 
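 * (In effect, wait_for_attach just records that a completion callback was
 * supplied, so a saved configuration can reproduce that choice later.)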
5445 */ 5446 ctx->wait_for_attach = true; 5447 } 5448 if (attach_timeout != 0) { 5449 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 5450 spdk_get_ticks_hz() / 1000ull; 5451 } 5452 TAILQ_INIT(&ctx->nvm_entry_ctxs); 5453 TAILQ_INIT(&ctx->discovery_entry_ctxs); 5454 memcpy(&ctx->trid, trid, sizeof(*trid)); 5455 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 5456 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 5457 if (ctx->hostnqn == NULL) { 5458 free_discovery_ctx(ctx); 5459 return -ENOMEM; 5460 } 5461 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 5462 if (discovery_entry_ctx == NULL) { 5463 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5464 free_discovery_ctx(ctx); 5465 return -ENOMEM; 5466 } 5467 5468 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 5469 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 5470 return 0; 5471 } 5472 5473 int 5474 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5475 { 5476 struct discovery_ctx *ctx; 5477 5478 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5479 if (strcmp(name, ctx->name) == 0) { 5480 if (ctx->stop) { 5481 return -EALREADY; 5482 } 5483 /* If we're still starting the discovery service and ->rc is non-zero, we're 5484 * going to stop it as soon as we can 5485 */ 5486 if (ctx->initializing && ctx->rc != 0) { 5487 return -EALREADY; 5488 } 5489 stop_discovery(ctx, cb_fn, cb_ctx); 5490 return 0; 5491 } 5492 } 5493 5494 return -ENOENT; 5495 } 5496 5497 static int 5498 bdev_nvme_library_init(void) 5499 { 5500 g_bdev_nvme_init_thread = spdk_get_thread(); 5501 5502 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 5503 bdev_nvme_destroy_poll_group_cb, 5504 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 5505 5506 return 0; 5507 } 5508 5509 static void 5510 bdev_nvme_fini_destruct_ctrlrs(void) 5511 { 5512 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5513 struct nvme_ctrlr *nvme_ctrlr; 5514 5515 pthread_mutex_lock(&g_bdev_nvme_mutex); 5516 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 5517 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5518 pthread_mutex_lock(&nvme_ctrlr->mutex); 5519 if (nvme_ctrlr->destruct) { 5520 /* This controller's destruction was already started 5521 * before the application started shutting down 5522 */ 5523 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5524 continue; 5525 } 5526 nvme_ctrlr->destruct = true; 5527 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5528 5529 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 5530 nvme_ctrlr); 5531 } 5532 } 5533 5534 g_bdev_nvme_module_finish = true; 5535 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5536 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5537 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 5538 spdk_bdev_module_fini_done(); 5539 return; 5540 } 5541 5542 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5543 } 5544 5545 static void 5546 check_discovery_fini(void *arg) 5547 { 5548 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5549 bdev_nvme_fini_destruct_ctrlrs(); 5550 } 5551 } 5552 5553 static void 5554 bdev_nvme_library_fini(void) 5555 { 5556 struct nvme_probe_skip_entry *entry, *entry_tmp; 5557 struct discovery_ctx *ctx; 5558 5559 spdk_poller_unregister(&g_hotplug_poller); 5560 free(g_hotplug_probe_ctx); 5561 g_hotplug_probe_ctx = NULL; 5562 5563 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 5564 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5565 
free(entry); 5566 } 5567 5568 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 5569 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5570 bdev_nvme_fini_destruct_ctrlrs(); 5571 } else { 5572 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5573 stop_discovery(ctx, check_discovery_fini, NULL); 5574 } 5575 } 5576 } 5577 5578 static void 5579 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 5580 { 5581 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5582 struct spdk_bdev *bdev = bdev_io->bdev; 5583 struct spdk_dif_ctx dif_ctx; 5584 struct spdk_dif_error err_blk = {}; 5585 int rc; 5586 5587 rc = spdk_dif_ctx_init(&dif_ctx, 5588 bdev->blocklen, bdev->md_len, bdev->md_interleave, 5589 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 5590 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 5591 if (rc != 0) { 5592 SPDK_ERRLOG("Initialization of DIF context failed\n"); 5593 return; 5594 } 5595 5596 if (bdev->md_interleave) { 5597 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5598 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5599 } else { 5600 struct iovec md_iov = { 5601 .iov_base = bdev_io->u.bdev.md_buf, 5602 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 5603 }; 5604 5605 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5606 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5607 } 5608 5609 if (rc != 0) { 5610 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 5611 err_blk.err_type, err_blk.err_offset); 5612 } else { 5613 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 5614 } 5615 } 5616 5617 static void 5618 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5619 { 5620 struct nvme_bdev_io *bio = ref; 5621 5622 if (spdk_nvme_cpl_is_success(cpl)) { 5623 /* Run PI verification for read data buffer. */ 5624 bdev_nvme_verify_pi_error(bio); 5625 } 5626 5627 /* Return original completion status */ 5628 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5629 } 5630 5631 static void 5632 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5633 { 5634 struct nvme_bdev_io *bio = ref; 5635 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5636 int ret; 5637 5638 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 5639 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 5640 cpl->status.sct, cpl->status.sc); 5641 5642 /* Save completion status to use after verifying PI error. */ 5643 bio->cpl = *cpl; 5644 5645 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 5646 /* Read without PI checking to verify PI error. */ 5647 ret = bdev_nvme_no_pi_readv(bio, 5648 bdev_io->u.bdev.iovs, 5649 bdev_io->u.bdev.iovcnt, 5650 bdev_io->u.bdev.md_buf, 5651 bdev_io->u.bdev.num_blocks, 5652 bdev_io->u.bdev.offset_blocks); 5653 if (ret == 0) { 5654 return; 5655 } 5656 } 5657 } 5658 5659 bdev_nvme_io_complete_nvme_status(bio, cpl); 5660 } 5661 5662 static void 5663 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5664 { 5665 struct nvme_bdev_io *bio = ref; 5666 5667 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5668 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 5669 cpl->status.sct, cpl->status.sc); 5670 /* Run PI verification for write data buffer if PI error is detected. 
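 * The verification only logs what it finds; the original NVMe completion
 * status is still what gets reported back to the bdev layer.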
*/ 5671 bdev_nvme_verify_pi_error(bio); 5672 } 5673 5674 bdev_nvme_io_complete_nvme_status(bio, cpl); 5675 } 5676 5677 static void 5678 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5679 { 5680 struct nvme_bdev_io *bio = ref; 5681 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5682 5683 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 5684 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 5685 */ 5686 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 5687 5688 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5689 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 5690 cpl->status.sct, cpl->status.sc); 5691 /* Run PI verification for zone append data buffer if PI error is detected. */ 5692 bdev_nvme_verify_pi_error(bio); 5693 } 5694 5695 bdev_nvme_io_complete_nvme_status(bio, cpl); 5696 } 5697 5698 static void 5699 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5700 { 5701 struct nvme_bdev_io *bio = ref; 5702 5703 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5704 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 5705 cpl->status.sct, cpl->status.sc); 5706 /* Run PI verification for compare data buffer if PI error is detected. */ 5707 bdev_nvme_verify_pi_error(bio); 5708 } 5709 5710 bdev_nvme_io_complete_nvme_status(bio, cpl); 5711 } 5712 5713 static void 5714 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5715 { 5716 struct nvme_bdev_io *bio = ref; 5717 5718 /* Compare operation completion */ 5719 if (!bio->first_fused_completed) { 5720 /* Save compare result for write callback */ 5721 bio->cpl = *cpl; 5722 bio->first_fused_completed = true; 5723 return; 5724 } 5725 5726 /* Write operation completion */ 5727 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 5728 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 5729 * complete the IO with the compare operation's status. 
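 * (Per the fused-command rules, the controller aborts the write when the
 * compare fails, so a successful write completion here is unexpected and is
 * only logged.)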
5730 */ 5731 if (!spdk_nvme_cpl_is_error(cpl)) { 5732 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 5733 } 5734 5735 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5736 } else { 5737 bdev_nvme_io_complete_nvme_status(bio, cpl); 5738 } 5739 } 5740 5741 static void 5742 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 5743 { 5744 struct nvme_bdev_io *bio = ref; 5745 5746 bdev_nvme_io_complete_nvme_status(bio, cpl); 5747 } 5748 5749 static int 5750 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 5751 { 5752 switch (desc->zt) { 5753 case SPDK_NVME_ZONE_TYPE_SEQWR: 5754 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 5755 break; 5756 default: 5757 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 5758 return -EIO; 5759 } 5760 5761 switch (desc->zs) { 5762 case SPDK_NVME_ZONE_STATE_EMPTY: 5763 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 5764 break; 5765 case SPDK_NVME_ZONE_STATE_IOPEN: 5766 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 5767 break; 5768 case SPDK_NVME_ZONE_STATE_EOPEN: 5769 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 5770 break; 5771 case SPDK_NVME_ZONE_STATE_CLOSED: 5772 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 5773 break; 5774 case SPDK_NVME_ZONE_STATE_RONLY: 5775 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 5776 break; 5777 case SPDK_NVME_ZONE_STATE_FULL: 5778 info->state = SPDK_BDEV_ZONE_STATE_FULL; 5779 break; 5780 case SPDK_NVME_ZONE_STATE_OFFLINE: 5781 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 5782 break; 5783 default: 5784 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 5785 return -EIO; 5786 } 5787 5788 info->zone_id = desc->zslba; 5789 info->write_pointer = desc->wp; 5790 info->capacity = desc->zcap; 5791 5792 return 0; 5793 } 5794 5795 static void 5796 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 5797 { 5798 struct nvme_bdev_io *bio = ref; 5799 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5800 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 5801 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 5802 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 5803 uint64_t max_zones_per_buf, i; 5804 uint32_t zone_report_bufsize; 5805 struct spdk_nvme_ns *ns; 5806 struct spdk_nvme_qpair *qpair; 5807 int ret; 5808 5809 if (spdk_nvme_cpl_is_error(cpl)) { 5810 goto out_complete_io_nvme_cpl; 5811 } 5812 5813 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 5814 ret = -ENXIO; 5815 goto out_complete_io_ret; 5816 } 5817 5818 ns = bio->io_path->nvme_ns->ns; 5819 qpair = bio->io_path->qpair->qpair; 5820 5821 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 5822 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 5823 sizeof(bio->zone_report_buf->descs[0]); 5824 5825 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 5826 ret = -EINVAL; 5827 goto out_complete_io_ret; 5828 } 5829 5830 if (!bio->zone_report_buf->nr_zones) { 5831 ret = -EINVAL; 5832 goto out_complete_io_ret; 5833 } 5834 5835 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 5836 ret = fill_zone_from_report(&info[bio->handled_zones], 5837 &bio->zone_report_buf->descs[i]); 5838 if (ret) { 5839 goto out_complete_io_ret; 5840 } 5841 bio->handled_zones++; 5842 } 5843 5844 if (bio->handled_zones < zones_to_copy) { 5845 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 5846 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 5847 
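 /* Not every requested zone fit into a single report buffer; clear the buffer and issue another Report Zones command starting at the next unhandled zone. */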
5848 memset(bio->zone_report_buf, 0, zone_report_bufsize); 5849 ret = spdk_nvme_zns_report_zones(ns, qpair, 5850 bio->zone_report_buf, zone_report_bufsize, 5851 slba, SPDK_NVME_ZRA_LIST_ALL, true, 5852 bdev_nvme_get_zone_info_done, bio); 5853 if (!ret) { 5854 return; 5855 } else { 5856 goto out_complete_io_ret; 5857 } 5858 } 5859 5860 out_complete_io_nvme_cpl: 5861 free(bio->zone_report_buf); 5862 bio->zone_report_buf = NULL; 5863 bdev_nvme_io_complete_nvme_status(bio, cpl); 5864 return; 5865 5866 out_complete_io_ret: 5867 free(bio->zone_report_buf); 5868 bio->zone_report_buf = NULL; 5869 bdev_nvme_io_complete(bio, ret); 5870 } 5871 5872 static void 5873 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 5874 { 5875 struct nvme_bdev_io *bio = ref; 5876 5877 bdev_nvme_io_complete_nvme_status(bio, cpl); 5878 } 5879 5880 static void 5881 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 5882 { 5883 struct nvme_bdev_io *bio = ctx; 5884 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5885 const struct spdk_nvme_cpl *cpl = &bio->cpl; 5886 5887 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 5888 5889 __bdev_nvme_io_complete(bdev_io, 0, cpl); 5890 } 5891 5892 static void 5893 bdev_nvme_abort_complete(void *ctx) 5894 { 5895 struct nvme_bdev_io *bio = ctx; 5896 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5897 5898 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 5899 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 5900 } else { 5901 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 5902 } 5903 } 5904 5905 static void 5906 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 5907 { 5908 struct nvme_bdev_io *bio = ref; 5909 5910 bio->cpl = *cpl; 5911 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 5912 } 5913 5914 static void 5915 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 5916 { 5917 struct nvme_bdev_io *bio = ref; 5918 5919 bio->cpl = *cpl; 5920 spdk_thread_send_msg(bio->orig_thread, 5921 bdev_nvme_admin_passthru_complete_nvme_status, bio); 5922 } 5923 5924 static void 5925 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 5926 { 5927 struct nvme_bdev_io *bio = ref; 5928 struct iovec *iov; 5929 5930 bio->iov_offset = sgl_offset; 5931 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 5932 iov = &bio->iovs[bio->iovpos]; 5933 if (bio->iov_offset < iov->iov_len) { 5934 break; 5935 } 5936 5937 bio->iov_offset -= iov->iov_len; 5938 } 5939 } 5940 5941 static int 5942 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 5943 { 5944 struct nvme_bdev_io *bio = ref; 5945 struct iovec *iov; 5946 5947 assert(bio->iovpos < bio->iovcnt); 5948 5949 iov = &bio->iovs[bio->iovpos]; 5950 5951 *address = iov->iov_base; 5952 *length = iov->iov_len; 5953 5954 if (bio->iov_offset) { 5955 assert(bio->iov_offset <= iov->iov_len); 5956 *address += bio->iov_offset; 5957 *length -= bio->iov_offset; 5958 } 5959 5960 bio->iov_offset += *length; 5961 if (bio->iov_offset == iov->iov_len) { 5962 bio->iovpos++; 5963 bio->iov_offset = 0; 5964 } 5965 5966 return 0; 5967 } 5968 5969 static void 5970 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 5971 { 5972 struct nvme_bdev_io *bio = ref; 5973 struct iovec *iov; 5974 5975 bio->fused_iov_offset = sgl_offset; 5976 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 5977 iov = &bio->fused_iovs[bio->fused_iovpos]; 5978 if 
(bio->fused_iov_offset < iov->iov_len) { 5979 break; 5980 } 5981 5982 bio->fused_iov_offset -= iov->iov_len; 5983 } 5984 } 5985 5986 static int 5987 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 5988 { 5989 struct nvme_bdev_io *bio = ref; 5990 struct iovec *iov; 5991 5992 assert(bio->fused_iovpos < bio->fused_iovcnt); 5993 5994 iov = &bio->fused_iovs[bio->fused_iovpos]; 5995 5996 *address = iov->iov_base; 5997 *length = iov->iov_len; 5998 5999 if (bio->fused_iov_offset) { 6000 assert(bio->fused_iov_offset <= iov->iov_len); 6001 *address += bio->fused_iov_offset; 6002 *length -= bio->fused_iov_offset; 6003 } 6004 6005 bio->fused_iov_offset += *length; 6006 if (bio->fused_iov_offset == iov->iov_len) { 6007 bio->fused_iovpos++; 6008 bio->fused_iov_offset = 0; 6009 } 6010 6011 return 0; 6012 } 6013 6014 static int 6015 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6016 void *md, uint64_t lba_count, uint64_t lba) 6017 { 6018 int rc; 6019 6020 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 6021 lba_count, lba); 6022 6023 bio->iovs = iov; 6024 bio->iovcnt = iovcnt; 6025 bio->iovpos = 0; 6026 bio->iov_offset = 0; 6027 6028 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 6029 bio->io_path->qpair->qpair, 6030 lba, lba_count, 6031 bdev_nvme_no_pi_readv_done, bio, 0, 6032 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6033 md, 0, 0); 6034 6035 if (rc != 0 && rc != -ENOMEM) { 6036 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 6037 } 6038 return rc; 6039 } 6040 6041 static int 6042 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6043 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 6044 struct spdk_bdev_ext_io_opts *ext_opts) 6045 { 6046 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6047 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6048 int rc; 6049 6050 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6051 lba_count, lba); 6052 6053 bio->iovs = iov; 6054 bio->iovcnt = iovcnt; 6055 bio->iovpos = 0; 6056 bio->iov_offset = 0; 6057 6058 if (ext_opts) { 6059 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6060 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6061 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6062 bio->ext_opts.io_flags = flags; 6063 bio->ext_opts.metadata = md; 6064 6065 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 6066 bdev_nvme_readv_done, bio, 6067 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6068 &bio->ext_opts); 6069 } else if (iovcnt == 1) { 6070 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 6071 lba_count, 6072 bdev_nvme_readv_done, bio, 6073 flags, 6074 0, 0); 6075 } else { 6076 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 6077 bdev_nvme_readv_done, bio, flags, 6078 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6079 md, 0, 0); 6080 } 6081 6082 if (rc != 0 && rc != -ENOMEM) { 6083 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 6084 } 6085 return rc; 6086 } 6087 6088 static int 6089 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6090 void *md, uint64_t lba_count, uint64_t lba, 6091 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts) 6092 { 6093 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6094 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6095 int rc; 6096 6097 SPDK_DEBUGLOG(bdev_nvme, "write %" 
PRIu64 " blocks with offset %#" PRIx64 "\n", 6098 lba_count, lba); 6099 6100 bio->iovs = iov; 6101 bio->iovcnt = iovcnt; 6102 bio->iovpos = 0; 6103 bio->iov_offset = 0; 6104 6105 if (ext_opts) { 6106 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6107 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6108 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6109 bio->ext_opts.io_flags = flags; 6110 bio->ext_opts.metadata = md; 6111 6112 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 6113 bdev_nvme_writev_done, bio, 6114 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6115 &bio->ext_opts); 6116 } else if (iovcnt == 1) { 6117 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 6118 lba_count, 6119 bdev_nvme_writev_done, bio, 6120 flags, 6121 0, 0); 6122 } else { 6123 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6124 bdev_nvme_writev_done, bio, flags, 6125 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6126 md, 0, 0); 6127 } 6128 6129 if (rc != 0 && rc != -ENOMEM) { 6130 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 6131 } 6132 return rc; 6133 } 6134 6135 static int 6136 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6137 void *md, uint64_t lba_count, uint64_t zslba, 6138 uint32_t flags) 6139 { 6140 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6141 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6142 int rc; 6143 6144 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 6145 lba_count, zslba); 6146 6147 bio->iovs = iov; 6148 bio->iovcnt = iovcnt; 6149 bio->iovpos = 0; 6150 bio->iov_offset = 0; 6151 6152 if (iovcnt == 1) { 6153 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 6154 lba_count, 6155 bdev_nvme_zone_appendv_done, bio, 6156 flags, 6157 0, 0); 6158 } else { 6159 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 6160 bdev_nvme_zone_appendv_done, bio, flags, 6161 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6162 md, 0, 0); 6163 } 6164 6165 if (rc != 0 && rc != -ENOMEM) { 6166 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 6167 } 6168 return rc; 6169 } 6170 6171 static int 6172 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6173 void *md, uint64_t lba_count, uint64_t lba, 6174 uint32_t flags) 6175 { 6176 int rc; 6177 6178 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6179 lba_count, lba); 6180 6181 bio->iovs = iov; 6182 bio->iovcnt = iovcnt; 6183 bio->iovpos = 0; 6184 bio->iov_offset = 0; 6185 6186 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 6187 bio->io_path->qpair->qpair, 6188 lba, lba_count, 6189 bdev_nvme_comparev_done, bio, flags, 6190 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6191 md, 0, 0); 6192 6193 if (rc != 0 && rc != -ENOMEM) { 6194 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 6195 } 6196 return rc; 6197 } 6198 6199 static int 6200 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 6201 struct iovec *write_iov, int write_iovcnt, 6202 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 6203 { 6204 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6205 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6206 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6207 int rc; 6208 6209 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" 
PRIx64 "\n", 6210 lba_count, lba); 6211 6212 bio->iovs = cmp_iov; 6213 bio->iovcnt = cmp_iovcnt; 6214 bio->iovpos = 0; 6215 bio->iov_offset = 0; 6216 bio->fused_iovs = write_iov; 6217 bio->fused_iovcnt = write_iovcnt; 6218 bio->fused_iovpos = 0; 6219 bio->fused_iov_offset = 0; 6220 6221 if (bdev_io->num_retries == 0) { 6222 bio->first_fused_submitted = false; 6223 bio->first_fused_completed = false; 6224 } 6225 6226 if (!bio->first_fused_submitted) { 6227 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6228 memset(&bio->cpl, 0, sizeof(bio->cpl)); 6229 6230 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 6231 bdev_nvme_comparev_and_writev_done, bio, flags, 6232 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 6233 if (rc == 0) { 6234 bio->first_fused_submitted = true; 6235 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6236 } else { 6237 if (rc != -ENOMEM) { 6238 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 6239 } 6240 return rc; 6241 } 6242 } 6243 6244 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 6245 6246 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6247 bdev_nvme_comparev_and_writev_done, bio, flags, 6248 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 6249 if (rc != 0 && rc != -ENOMEM) { 6250 SPDK_ERRLOG("write failed: rc = %d\n", rc); 6251 rc = 0; 6252 } 6253 6254 return rc; 6255 } 6256 6257 static int 6258 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6259 { 6260 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 6261 struct spdk_nvme_dsm_range *range; 6262 uint64_t offset, remaining; 6263 uint64_t num_ranges_u64; 6264 uint16_t num_ranges; 6265 int rc; 6266 6267 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 6268 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6269 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 6270 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 6271 return -EINVAL; 6272 } 6273 num_ranges = (uint16_t)num_ranges_u64; 6274 6275 offset = offset_blocks; 6276 remaining = num_blocks; 6277 range = &dsm_ranges[0]; 6278 6279 /* Fill max-size ranges until the remaining blocks fit into one range */ 6280 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 6281 range->attributes.raw = 0; 6282 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6283 range->starting_lba = offset; 6284 6285 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6286 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6287 range++; 6288 } 6289 6290 /* Final range describes the remaining blocks */ 6291 range->attributes.raw = 0; 6292 range->length = remaining; 6293 range->starting_lba = offset; 6294 6295 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 6296 bio->io_path->qpair->qpair, 6297 SPDK_NVME_DSM_ATTR_DEALLOCATE, 6298 dsm_ranges, num_ranges, 6299 bdev_nvme_queued_done, bio); 6300 6301 return rc; 6302 } 6303 6304 static int 6305 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6306 { 6307 if (num_blocks > UINT16_MAX + 1) { 6308 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 6309 return -EINVAL; 6310 } 6311 6312 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 6313 bio->io_path->qpair->qpair, 6314 offset_blocks, num_blocks, 6315 bdev_nvme_queued_done, bio, 6316 0); 6317 } 6318 6319 static int 6320 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t 
zone_id, uint32_t num_zones, 6321 struct spdk_bdev_zone_info *info) 6322 { 6323 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6324 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6325 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6326 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6327 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 6328 6329 if (zone_id % zone_size != 0) { 6330 return -EINVAL; 6331 } 6332 6333 if (num_zones > total_zones || !num_zones) { 6334 return -EINVAL; 6335 } 6336 6337 assert(!bio->zone_report_buf); 6338 bio->zone_report_buf = calloc(1, zone_report_bufsize); 6339 if (!bio->zone_report_buf) { 6340 return -ENOMEM; 6341 } 6342 6343 bio->handled_zones = 0; 6344 6345 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 6346 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 6347 bdev_nvme_get_zone_info_done, bio); 6348 } 6349 6350 static int 6351 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 6352 enum spdk_bdev_zone_action action) 6353 { 6354 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6355 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6356 6357 switch (action) { 6358 case SPDK_BDEV_ZONE_CLOSE: 6359 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 6360 bdev_nvme_zone_management_done, bio); 6361 case SPDK_BDEV_ZONE_FINISH: 6362 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 6363 bdev_nvme_zone_management_done, bio); 6364 case SPDK_BDEV_ZONE_OPEN: 6365 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 6366 bdev_nvme_zone_management_done, bio); 6367 case SPDK_BDEV_ZONE_RESET: 6368 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 6369 bdev_nvme_zone_management_done, bio); 6370 case SPDK_BDEV_ZONE_OFFLINE: 6371 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 6372 bdev_nvme_zone_management_done, bio); 6373 default: 6374 return -EINVAL; 6375 } 6376 } 6377 6378 static void 6379 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6380 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 6381 { 6382 struct nvme_io_path *io_path; 6383 struct nvme_ctrlr *nvme_ctrlr; 6384 uint32_t max_xfer_size; 6385 int rc = -ENXIO; 6386 6387 /* Choose the first ctrlr which is not failed. */ 6388 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6389 nvme_ctrlr = io_path->qpair->ctrlr; 6390 6391 /* We should skip any unavailable nvme_ctrlr rather than checking 6392 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
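 * Checking availability up front lets the loop simply move on to the next io_path instead of submitting to a controller that cannot serve the command.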
6393 */ 6394 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 6395 continue; 6396 } 6397 6398 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 6399 6400 if (nbytes > max_xfer_size) { 6401 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6402 rc = -EINVAL; 6403 goto err; 6404 } 6405 6406 bio->io_path = io_path; 6407 bio->orig_thread = spdk_get_thread(); 6408 6409 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 6410 bdev_nvme_admin_passthru_done, bio); 6411 if (rc == 0) { 6412 return; 6413 } 6414 } 6415 6416 err: 6417 bdev_nvme_admin_passthru_complete(bio, rc); 6418 } 6419 6420 static int 6421 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6422 void *buf, size_t nbytes) 6423 { 6424 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6425 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6426 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6427 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6428 6429 if (nbytes > max_xfer_size) { 6430 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6431 return -EINVAL; 6432 } 6433 6434 /* 6435 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6436 * so fill it out automatically. 6437 */ 6438 cmd->nsid = spdk_nvme_ns_get_id(ns); 6439 6440 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 6441 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 6442 } 6443 6444 static int 6445 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6446 void *buf, size_t nbytes, void *md_buf, size_t md_len) 6447 { 6448 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6449 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6450 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 6451 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6452 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6453 6454 if (nbytes > max_xfer_size) { 6455 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6456 return -EINVAL; 6457 } 6458 6459 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 6460 SPDK_ERRLOG("invalid meta data buffer size\n"); 6461 return -EINVAL; 6462 } 6463 6464 /* 6465 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6466 * so fill it out automatically. 6467 */ 6468 cmd->nsid = spdk_nvme_ns_get_id(ns); 6469 6470 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 6471 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 6472 } 6473 6474 static void 6475 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6476 struct nvme_bdev_io *bio_to_abort) 6477 { 6478 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6479 struct spdk_bdev_io *bdev_io_to_abort; 6480 struct nvme_io_path *io_path; 6481 struct nvme_ctrlr *nvme_ctrlr; 6482 int rc = 0; 6483 6484 bio->orig_thread = spdk_get_thread(); 6485 6486 /* Traverse the retry_io_list first. 
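 * An I/O that is still queued for retry has not been submitted to the controller yet, so it can be completed as aborted here without issuing an NVMe Abort command.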
*/ 6487 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 6488 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 6489 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 6490 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 6491 6492 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 6493 return; 6494 } 6495 } 6496 6497 /* Even admin commands are submitted only to nvme_ctrlrs that are on some 6498 * io_path, so traverse the io_path list for admin commands as well as 6499 * I/O commands. 6500 */ 6501 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6502 nvme_ctrlr = io_path->qpair->ctrlr; 6503 6504 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 6505 io_path->qpair->qpair, 6506 bio_to_abort, 6507 bdev_nvme_abort_done, bio); 6508 if (rc == -ENOENT) { 6509 /* If no command was found in the I/O qpair, the target command may be 6510 * an admin command. 6511 */ 6512 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 6513 NULL, 6514 bio_to_abort, 6515 bdev_nvme_abort_done, bio); 6516 } 6517 6518 if (rc != -ENOENT) { 6519 break; 6520 } 6521 } 6522 6523 if (rc != 0) { 6524 /* If no command was found or an error occurred, complete the abort 6525 * request with failure. 6526 */ 6527 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 6528 } 6529 } 6530 6531 static int 6532 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 6533 uint64_t num_blocks) 6534 { 6535 struct spdk_nvme_scc_source_range range = { 6536 .slba = src_offset_blocks, 6537 .nlb = num_blocks - 1 6538 }; 6539 6540 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 6541 bio->io_path->qpair->qpair, 6542 &range, 1, dst_offset_blocks, 6543 bdev_nvme_queued_done, bio); 6544 } 6545 6546 static void 6547 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 6548 { 6549 const char *action; 6550 6551 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 6552 action = "reset"; 6553 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 6554 action = "abort"; 6555 } else { 6556 action = "none"; 6557 } 6558 6559 spdk_json_write_object_begin(w); 6560 6561 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 6562 6563 spdk_json_write_named_object_begin(w, "params"); 6564 spdk_json_write_named_string(w, "action_on_timeout", action); 6565 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 6566 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 6567 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 6568 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 6569 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 6570 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 6571 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 6572 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 6573 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 6574 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 6575 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 6576 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 6577
spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 6578 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 6579 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 6580 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 6581 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 6582 spdk_json_write_object_end(w); 6583 6584 spdk_json_write_object_end(w); 6585 } 6586 6587 static void 6588 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 6589 { 6590 struct spdk_nvme_transport_id trid; 6591 6592 spdk_json_write_object_begin(w); 6593 6594 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 6595 6596 spdk_json_write_named_object_begin(w, "params"); 6597 spdk_json_write_named_string(w, "name", ctx->name); 6598 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 6599 6600 trid = ctx->trid; 6601 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 6602 nvme_bdev_dump_trid_json(&trid, w); 6603 6604 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 6605 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 6606 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 6607 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 6608 ctx->bdev_opts.fast_io_fail_timeout_sec); 6609 spdk_json_write_object_end(w); 6610 6611 spdk_json_write_object_end(w); 6612 } 6613 6614 static void 6615 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 6616 struct nvme_ctrlr *nvme_ctrlr) 6617 { 6618 struct spdk_nvme_transport_id *trid; 6619 6620 if (nvme_ctrlr->opts.from_discovery_service) { 6621 /* Do not emit an RPC for this - it will be implicitly 6622 * covered by a separate bdev_nvme_start_discovery RPC. 
6623 */ 6624 return; 6625 } 6626 6627 trid = &nvme_ctrlr->active_path_id->trid; 6628 6629 spdk_json_write_object_begin(w); 6630 6631 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 6632 6633 spdk_json_write_named_object_begin(w, "params"); 6634 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 6635 nvme_bdev_dump_trid_json(trid, w); 6636 spdk_json_write_named_bool(w, "prchk_reftag", 6637 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 6638 spdk_json_write_named_bool(w, "prchk_guard", 6639 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 6640 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 6641 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 6642 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 6643 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 6644 6645 spdk_json_write_object_end(w); 6646 6647 spdk_json_write_object_end(w); 6648 } 6649 6650 static void 6651 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 6652 { 6653 spdk_json_write_object_begin(w); 6654 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 6655 6656 spdk_json_write_named_object_begin(w, "params"); 6657 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 6658 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 6659 spdk_json_write_object_end(w); 6660 6661 spdk_json_write_object_end(w); 6662 } 6663 6664 static int 6665 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 6666 { 6667 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6668 struct nvme_ctrlr *nvme_ctrlr; 6669 struct discovery_ctx *ctx; 6670 6671 bdev_nvme_opts_config_json(w); 6672 6673 pthread_mutex_lock(&g_bdev_nvme_mutex); 6674 6675 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6676 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6677 nvme_ctrlr_config_json(w, nvme_ctrlr); 6678 } 6679 } 6680 6681 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6682 bdev_nvme_discovery_config_json(w, ctx); 6683 } 6684 6685 /* Dump this last so that all NVMe bdevs get a chance to be constructed 6686 * before the hotplug poller is enabled.
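 * When the saved configuration is replayed, RPCs are issued in the order written here, so the attach RPCs run before bdev_nvme_set_hotplug re-enables the poller.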
6687 */ 6688 bdev_nvme_hotplug_config_json(w); 6689 6690 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6691 return 0; 6692 } 6693 6694 struct spdk_nvme_ctrlr * 6695 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 6696 { 6697 struct nvme_bdev *nbdev; 6698 struct nvme_ns *nvme_ns; 6699 6700 if (!bdev || bdev->module != &nvme_if) { 6701 return NULL; 6702 } 6703 6704 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 6705 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 6706 assert(nvme_ns != NULL); 6707 6708 return nvme_ns->ctrlr->ctrlr; 6709 } 6710 6711 void 6712 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 6713 { 6714 struct nvme_ns *nvme_ns = io_path->nvme_ns; 6715 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 6716 const struct spdk_nvme_ctrlr_data *cdata; 6717 const struct spdk_nvme_transport_id *trid; 6718 const char *adrfam_str; 6719 6720 spdk_json_write_object_begin(w); 6721 6722 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 6723 6724 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 6725 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 6726 6727 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 6728 spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path); 6729 spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path)); 6730 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 6731 6732 spdk_json_write_named_object_begin(w, "transport"); 6733 spdk_json_write_named_string(w, "trtype", trid->trstring); 6734 spdk_json_write_named_string(w, "traddr", trid->traddr); 6735 if (trid->trsvcid[0] != '\0') { 6736 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 6737 } 6738 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 6739 if (adrfam_str) { 6740 spdk_json_write_named_string(w, "adrfam", adrfam_str); 6741 } 6742 spdk_json_write_object_end(w); 6743 6744 spdk_json_write_object_end(w); 6745 } 6746 6747 void 6748 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 6749 { 6750 struct discovery_ctx *ctx; 6751 struct discovery_entry_ctx *entry_ctx; 6752 6753 spdk_json_write_array_begin(w); 6754 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6755 spdk_json_write_object_begin(w); 6756 spdk_json_write_named_string(w, "name", ctx->name); 6757 6758 spdk_json_write_named_object_begin(w, "trid"); 6759 nvme_bdev_dump_trid_json(&ctx->trid, w); 6760 spdk_json_write_object_end(w); 6761 6762 spdk_json_write_named_array_begin(w, "referrals"); 6763 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6764 spdk_json_write_object_begin(w); 6765 spdk_json_write_named_object_begin(w, "trid"); 6766 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 6767 spdk_json_write_object_end(w); 6768 spdk_json_write_object_end(w); 6769 } 6770 spdk_json_write_array_end(w); 6771 6772 spdk_json_write_object_end(w); 6773 } 6774 spdk_json_write_array_end(w); 6775 } 6776 6777 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 6778 6779 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 6780 { 6781 struct spdk_trace_tpoint_opts opts[] = { 6782 { 6783 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 6784 OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 6785 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 6786 }, 6787 { 6788 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 6789 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 6790 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 6791 } 6792 }; 6793 6794 6795 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 6796 
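 /* Register the descriptions defined above, then relate the transport-level NVMe tracepoints to the same object so a single bdev I/O can be followed across layers. */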
spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 6797 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 6798 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 6799 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 6800 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 6801 } 6802
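/* The helper below is an illustrative sketch, not part of the module and not compiled
 * (hence the #if 0 guard): it shows one way a caller that only has a bdev name could
 * reach the underlying NVMe controller through bdev_nvme_get_ctrlr() defined above.
 * The bdev name "Nvme0n1" and the function name are hypothetical examples.
 */
#if 0
static void
example_print_ctrlr_max_xfer_size(void)
{
	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Nvme0n1");
	struct spdk_nvme_ctrlr *ctrlr;

	if (bdev == NULL) {
		return;
	}

	/* bdev_nvme_get_ctrlr() returns NULL if the bdev does not belong to this module. */
	ctrlr = bdev_nvme_get_ctrlr(bdev);
	if (ctrlr == NULL) {
		return;
	}

	SPDK_NOTICELOG("max transfer size: %" PRIu32 " bytes\n",
		       spdk_nvme_ctrlr_get_max_xfer_size(ctrlr));
}
#endif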