/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer for the second command of a fused operation. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Tracks whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Tracks whether the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Number of zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried.
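	 * Reset to zero once the I/O finally completes (see bdev_nvme_io_complete()).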
	 */
	int32_t retry_count;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr,
			      struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		free(ns);
	}

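	/* Free every failover path entry (trid) registered for this controller. */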
	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
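	 * The device may thus remain attached in the driver, but all bdev-level
	 * resources are still released.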
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	nbdev_ch->current_io_path = NULL;

	return 0;
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;

	nbdev_ch->current_io_path = NULL;

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	free(io_path);
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_io_path_is_connected(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(io_path->qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) !=
	    SPDK_NVME_QPAIR_FAILURE_NONE) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_failed(struct nvme_io_path *io_path)
{
	struct nvme_ctrlr *nvme_ctrlr;

	nvme_ctrlr = io_path->qpair->ctrlr;

	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	next_path = STAILQ_NEXT(prev_path, stailq);
	if (next_path != NULL) {
		return next_path;
	} else {
		return STAILQ_FIRST(&nbdev_ch->io_path_list);
	}
}

static struct nvme_io_path *
bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_io_path *prev)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, prev);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_connected(io_path) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	/* We come here only if there is no optimized path. Cache even non_optimized
	 * path for load balance across multiple non_optimized paths.
	 */
	nbdev_ch->current_io_path = non_optimized;
	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *non_optimized = NULL;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
			/* The device is currently resetting.
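			 * or the qpair is otherwise disconnected, so skip this path
			 * and try the next one.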
			 */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			nbdev_ch->current_io_path = io_path;
			return io_path;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (non_optimized == NULL) {
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_unlikely(nbdev_ch->current_io_path == NULL)) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	}

	if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) {
		return nbdev_ch->current_io_path;
	} else {
		return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_io_path_is_connected(io_path) ||
		    !nvme_io_path_is_failed(io_path)) {
			return true;
		}
	}

	return false;
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_submit_request(ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

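		/* The retry list is kept sorted by retry_ticks. Scanning from the tail,
		 * insert after the first I/O that expires no later than this one.
		 */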
		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		goto complete;
	}

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	assert(bio->io_path != NULL);
	nvme_ctrlr = bio->io_path->qpair->ctrlr;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(bio->io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		nbdev_ch->current_io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				bio->io_path->nvme_ns->ana_state_updating = true;
			}
		}
		delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			delay_ms = 0;
		}
	}

	if (any_io_path_may_become_available(nbdev_ch)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

		nbdev_ch->current_io_path = NULL;

		if (any_io_path_may_become_available(nbdev_ch)) {
			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
			return;
		}

	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
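		/* Unlike bdev_nvme_io_complete(), admin passthrough is not retried here;
		 * a missing path is treated as a plain failure.
		 */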
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		io_path->nbdev_ch->current_io_path = NULL;
	}
}

static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* If we are already in a full reset sequence, we do not have
			 * to restart it. Just move to the next ctrlr_channel.
			 */
			SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
				      qpair);
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* qpair was disconnected unexpectedly. Reset controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover(nvme_qpair->ctrlr, false);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (rc == -ENXIO && disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover(nvme_ctrlr, false);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
_bdev_nvme_unregister_dev_cb(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	free(nvme_disk->disk.name);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);

	return 0;
}

static int
bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
{
	bdev_nvme_io_complete(bio, 0);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
	struct spdk_bdev_io *bdev_io;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
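		/* Complete each queued reset with the status chosen above instead of
		 * starting a new reset for it.
		 */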
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, status, NULL);
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	path_id->is_failed = true;

	if (next_path) {
		assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
			       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
		nvme_ctrlr->active_path_id = next_path;
		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
		} else {
			free(path_id);
		}
	}
}

static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* During disconnection, reduce the period to poll adminq more often.
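	 * bdev_nvme_poll_adminq() restores the configured period once the controller
	 * is actually disconnected.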
	 */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;

static _bdev_nvme_op_after_reset
bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		/* Complete pending destruct after reset completes. */
		return OP_COMPLETE_PENDING_DESTRUCT;
	} else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		nvme_ctrlr->reset_start_tsc = 0;
		return OP_NONE;
	} else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
		return OP_DESTRUCT;
	} else {
		if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
			nvme_ctrlr->fast_io_fail_timedout = true;
		}
		bdev_nvme_failover_trid(nvme_ctrlr, false);
		return OP_DELAYED_RECONNECT;
	}
}

static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);

static int
bdev_nvme_reconnect_delay_timer_expired(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&nvme_ctrlr->mutex);

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	assert(nvme_ctrlr->reconnect_is_delayed == true);
	nvme_ctrlr->reconnect_is_delayed = false;

	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return SPDK_POLLER_BUSY;
	}

	assert(nvme_ctrlr->resetting == false);
	nvme_ctrlr->resetting = true;

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);

	bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
{
	spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);

	assert(nvme_ctrlr->reconnect_is_delayed == false);
	nvme_ctrlr->reconnect_is_delayed = true;

	assert(nvme_ctrlr->reconnect_delay_timer == NULL);
	nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
					    nvme_ctrlr,
					    nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC);
}

static void
_bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
	struct nvme_path_id *path_id;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
	enum bdev_nvme_op_after_reset op_after_reset;

	assert(nvme_ctrlr->thread == spdk_get_thread());

	nvme_ctrlr->reset_cb_fn = NULL;
	nvme_ctrlr->reset_cb_arg = NULL;

	if (!success) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id != NULL);
	assert(path_id == nvme_ctrlr->active_path_id);

	path_id->is_failed = !success;
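
	/* Decide the follow-up action (pending destruct, delayed reconnect, or none)
	 * while the mutex is still held.
	 */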
	op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
		reset_cb_fn(reset_cb_arg, success);
	}

	switch (op_after_reset) {
	case OP_COMPLETE_PENDING_DESTRUCT:
		nvme_ctrlr_unregister(nvme_ctrlr);
		break;
	case OP_DESTRUCT:
		_bdev_nvme_delete(nvme_ctrlr, false);
		break;
	case OP_DELAYED_RECONNECT:
		nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
		break;
	default:
		break;
	}
}

static void
bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_complete_pending_resets,
			      success ? NULL : (void *)0x1,
			      _bdev_nvme_reset_complete);
}

static void
bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	bdev_nvme_reset_complete(nvme_ctrlr, false);
}

static void
bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_qpair *nvme_qpair;

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair);

		/* The current full reset sequence will move to the next
		 * ctrlr_channel after the qpair is actually disconnected.
		 */
		assert(ctrlr_ch->reset_iter == NULL);
		ctrlr_ch->reset_iter = i;
	} else {
		spdk_for_each_channel_continue(i, 0);
	}
}

static void
bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	if (status == 0) {
		bdev_nvme_reset_complete(nvme_ctrlr, true);
	} else {
		/* Delete the added qpairs and quiesce ctrlr to make the states clean.
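		 * bdev_nvme_reset_create_qpairs_failed() then completes the whole reset
		 * as failed.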
		 */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_destroy_qpair,
				      NULL,
				      bdev_nvme_reset_create_qpairs_failed);
	}
}

static void
bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, rc);
}

static int
bdev_nvme_reconnect_ctrlr_poll(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc = -ETIMEDOUT;

	if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
		rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
		if (rc == -EAGAIN) {
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	if (rc == 0) {
		/* Recreate all of the I/O queue pairs */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_create_qpair,
				      NULL,
				      bdev_nvme_reset_create_qpairs_done);
	} else {
		bdev_nvme_reset_complete(nvme_ctrlr, false);
	}
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
{
	spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);

	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name);
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
					  nvme_ctrlr, 0);
}

static void
bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name);
	assert(status == 0);

	if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
		bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
	} else {
		nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
	}
}

static void
bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
{
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_reset_destroy_qpair,
			      NULL,
			      bdev_nvme_reset_ctrlr);
}

static void
_bdev_nvme_reset(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	assert(nvme_ctrlr->resetting == true);
	assert(nvme_ctrlr->thread == spdk_get_thread());

	if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
		nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs);
	} else {
		bdev_nvme_reset_destroy_qpairs(nvme_ctrlr);
	}
}

static int
bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return -ENXIO;
	}

	if (nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EBUSY;
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Reconnect is already scheduled.\n");
		return -EBUSY;
	}

	nvme_ctrlr->resetting = true;

	assert(nvme_ctrlr->reset_start_tsc == 0);
	nvme_ctrlr->reset_start_tsc = spdk_get_ticks();

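	/* Unlock before sending the reset message to the controller's origin thread. */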
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
	return 0;
}

int
bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		nvme_ctrlr->reset_cb_fn = cb_fn;
		nvme_ctrlr->reset_cb_arg = cb_arg;
	}
	return rc;
}

static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);

static void
bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
{
	enum spdk_bdev_io_status io_status;

	if (bio->cpl.cdw0 == 0) {
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL);
}

static void
_bdev_nvme_reset_io_continue(void *ctx)
{
	struct nvme_bdev_io *bio = ctx;
	struct nvme_io_path *prev_io_path, *next_io_path;
	int rc;

	prev_io_path = bio->io_path;
	bio->io_path = NULL;

	if (bio->cpl.cdw0 != 0) {
		goto complete;
	}

	next_io_path = STAILQ_NEXT(prev_io_path, stailq);
	if (next_io_path == NULL) {
		goto complete;
	}

	rc = _bdev_nvme_reset_io(next_io_path, bio);
	if (rc == 0) {
		return;
	}

	bio->cpl.cdw0 = 1;

complete:
	bdev_nvme_reset_io_complete(bio);
}

static void
bdev_nvme_reset_io_continue(void *cb_arg, bool success)
{
	struct nvme_bdev_io *bio = cb_arg;

	bio->cpl.cdw0 = !success;

	spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio);
}

static int
_bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
{
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct spdk_bdev_io *bdev_io;
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		assert(bio->io_path == NULL);
		bio->io_path = io_path;

		assert(nvme_ctrlr->reset_cb_fn == NULL);
		assert(nvme_ctrlr->reset_cb_arg == NULL);
		nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue;
		nvme_ctrlr->reset_cb_arg = bio;
	} else if (rc == -EBUSY) {
		ctrlr_ch = io_path->qpair->ctrlr_ch;
		assert(ctrlr_ch != NULL);
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		bdev_io = spdk_bdev_io_from_ctx(bio);
		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}

static void
bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
{
	struct nvme_io_path *io_path;
	int rc;

	bio->cpl.cdw0 = 0;
	bio->orig_thread = spdk_get_thread();

	/* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now.
	 *
	 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially.
	 * This will be done in the following patches.
1985 */ 1986 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 1987 assert(io_path != NULL); 1988 1989 rc = _bdev_nvme_reset_io(io_path, bio); 1990 if (rc != 0) { 1991 bio->cpl.cdw0 = 1; 1992 bdev_nvme_reset_io_complete(bio); 1993 } 1994 } 1995 1996 static int 1997 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1998 { 1999 pthread_mutex_lock(&nvme_ctrlr->mutex); 2000 if (nvme_ctrlr->destruct) { 2001 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2002 /* Don't bother resetting if the controller is in the process of being destructed. */ 2003 return -ENXIO; 2004 } 2005 2006 if (nvme_ctrlr->resetting) { 2007 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2008 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2009 return -EBUSY; 2010 } 2011 2012 bdev_nvme_failover_trid(nvme_ctrlr, remove); 2013 2014 if (nvme_ctrlr->reconnect_is_delayed) { 2015 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2016 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2017 2018 /* We rely on the next reconnect for the failover. */ 2019 return 0; 2020 } 2021 2022 nvme_ctrlr->resetting = true; 2023 2024 assert(nvme_ctrlr->reset_start_tsc == 0); 2025 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2026 2027 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2028 2029 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 2030 return 0; 2031 } 2032 2033 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2034 uint64_t num_blocks); 2035 2036 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2037 uint64_t num_blocks); 2038 2039 static void 2040 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2041 bool success) 2042 { 2043 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2044 struct spdk_bdev *bdev = bdev_io->bdev; 2045 int ret; 2046 2047 if (!success) { 2048 ret = -EINVAL; 2049 goto exit; 2050 } 2051 2052 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2053 ret = -ENXIO; 2054 goto exit; 2055 } 2056 2057 ret = bdev_nvme_readv(bio, 2058 bdev_io->u.bdev.iovs, 2059 bdev_io->u.bdev.iovcnt, 2060 bdev_io->u.bdev.md_buf, 2061 bdev_io->u.bdev.num_blocks, 2062 bdev_io->u.bdev.offset_blocks, 2063 bdev->dif_check_flags, 2064 bdev_io->u.bdev.ext_opts); 2065 2066 exit: 2067 if (spdk_unlikely(ret != 0)) { 2068 bdev_nvme_io_complete(bio, ret); 2069 } 2070 } 2071 2072 static void 2073 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2074 { 2075 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2076 struct spdk_bdev *bdev = bdev_io->bdev; 2077 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2078 struct nvme_bdev_io *nbdev_io_to_abort; 2079 int rc = 0; 2080 2081 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 2082 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 2083 if (spdk_unlikely(!nbdev_io->io_path)) { 2084 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 2085 rc = -ENXIO; 2086 goto exit; 2087 } 2088 2089 /* Admin commands do not use the optimal I/O path. 2090 * Simply fall through even if it is not found. 
2091 */ 2092 } 2093 2094 switch (bdev_io->type) { 2095 case SPDK_BDEV_IO_TYPE_READ: 2096 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2097 rc = bdev_nvme_readv(nbdev_io, 2098 bdev_io->u.bdev.iovs, 2099 bdev_io->u.bdev.iovcnt, 2100 bdev_io->u.bdev.md_buf, 2101 bdev_io->u.bdev.num_blocks, 2102 bdev_io->u.bdev.offset_blocks, 2103 bdev->dif_check_flags, 2104 bdev_io->u.bdev.ext_opts); 2105 } else { 2106 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2107 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2108 rc = 0; 2109 } 2110 break; 2111 case SPDK_BDEV_IO_TYPE_WRITE: 2112 rc = bdev_nvme_writev(nbdev_io, 2113 bdev_io->u.bdev.iovs, 2114 bdev_io->u.bdev.iovcnt, 2115 bdev_io->u.bdev.md_buf, 2116 bdev_io->u.bdev.num_blocks, 2117 bdev_io->u.bdev.offset_blocks, 2118 bdev->dif_check_flags, 2119 bdev_io->u.bdev.ext_opts); 2120 break; 2121 case SPDK_BDEV_IO_TYPE_COMPARE: 2122 rc = bdev_nvme_comparev(nbdev_io, 2123 bdev_io->u.bdev.iovs, 2124 bdev_io->u.bdev.iovcnt, 2125 bdev_io->u.bdev.md_buf, 2126 bdev_io->u.bdev.num_blocks, 2127 bdev_io->u.bdev.offset_blocks, 2128 bdev->dif_check_flags); 2129 break; 2130 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2131 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2132 bdev_io->u.bdev.iovs, 2133 bdev_io->u.bdev.iovcnt, 2134 bdev_io->u.bdev.fused_iovs, 2135 bdev_io->u.bdev.fused_iovcnt, 2136 bdev_io->u.bdev.md_buf, 2137 bdev_io->u.bdev.num_blocks, 2138 bdev_io->u.bdev.offset_blocks, 2139 bdev->dif_check_flags); 2140 break; 2141 case SPDK_BDEV_IO_TYPE_UNMAP: 2142 rc = bdev_nvme_unmap(nbdev_io, 2143 bdev_io->u.bdev.offset_blocks, 2144 bdev_io->u.bdev.num_blocks); 2145 break; 2146 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2147 rc = bdev_nvme_write_zeroes(nbdev_io, 2148 bdev_io->u.bdev.offset_blocks, 2149 bdev_io->u.bdev.num_blocks); 2150 break; 2151 case SPDK_BDEV_IO_TYPE_RESET: 2152 nbdev_io->io_path = NULL; 2153 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2154 break; 2155 case SPDK_BDEV_IO_TYPE_FLUSH: 2156 rc = bdev_nvme_flush(nbdev_io, 2157 bdev_io->u.bdev.offset_blocks, 2158 bdev_io->u.bdev.num_blocks); 2159 break; 2160 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2161 rc = bdev_nvme_zone_appendv(nbdev_io, 2162 bdev_io->u.bdev.iovs, 2163 bdev_io->u.bdev.iovcnt, 2164 bdev_io->u.bdev.md_buf, 2165 bdev_io->u.bdev.num_blocks, 2166 bdev_io->u.bdev.offset_blocks, 2167 bdev->dif_check_flags); 2168 break; 2169 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2170 rc = bdev_nvme_get_zone_info(nbdev_io, 2171 bdev_io->u.zone_mgmt.zone_id, 2172 bdev_io->u.zone_mgmt.num_zones, 2173 bdev_io->u.zone_mgmt.buf); 2174 break; 2175 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2176 rc = bdev_nvme_zone_management(nbdev_io, 2177 bdev_io->u.zone_mgmt.zone_id, 2178 bdev_io->u.zone_mgmt.zone_action); 2179 break; 2180 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2181 nbdev_io->io_path = NULL; 2182 bdev_nvme_admin_passthru(nbdev_ch, 2183 nbdev_io, 2184 &bdev_io->u.nvme_passthru.cmd, 2185 bdev_io->u.nvme_passthru.buf, 2186 bdev_io->u.nvme_passthru.nbytes); 2187 break; 2188 case SPDK_BDEV_IO_TYPE_NVME_IO: 2189 rc = bdev_nvme_io_passthru(nbdev_io, 2190 &bdev_io->u.nvme_passthru.cmd, 2191 bdev_io->u.nvme_passthru.buf, 2192 bdev_io->u.nvme_passthru.nbytes); 2193 break; 2194 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2195 rc = bdev_nvme_io_passthru_md(nbdev_io, 2196 &bdev_io->u.nvme_passthru.cmd, 2197 bdev_io->u.nvme_passthru.buf, 2198 bdev_io->u.nvme_passthru.nbytes, 2199 bdev_io->u.nvme_passthru.md_buf, 2200 bdev_io->u.nvme_passthru.md_len); 2201 break; 2202 case SPDK_BDEV_IO_TYPE_ABORT: 2203 nbdev_io->io_path = NULL; 2204 
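		/* The target of the abort is identified by the nvme_bdev_io embedded in
		 * the spdk_bdev_io to be aborted; recover it from driver_ctx below
		 * before handing both to bdev_nvme_abort().
		 */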
nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2205 bdev_nvme_abort(nbdev_ch, 2206 nbdev_io, 2207 nbdev_io_to_abort); 2208 break; 2209 default: 2210 rc = -EINVAL; 2211 break; 2212 } 2213 2214 exit: 2215 if (spdk_unlikely(rc != 0)) { 2216 bdev_nvme_io_complete(nbdev_io, rc); 2217 } 2218 } 2219 2220 static bool 2221 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 2222 { 2223 struct nvme_bdev *nbdev = ctx; 2224 struct nvme_ns *nvme_ns; 2225 struct spdk_nvme_ns *ns; 2226 struct spdk_nvme_ctrlr *ctrlr; 2227 const struct spdk_nvme_ctrlr_data *cdata; 2228 2229 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2230 assert(nvme_ns != NULL); 2231 ns = nvme_ns->ns; 2232 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2233 2234 switch (io_type) { 2235 case SPDK_BDEV_IO_TYPE_READ: 2236 case SPDK_BDEV_IO_TYPE_WRITE: 2237 case SPDK_BDEV_IO_TYPE_RESET: 2238 case SPDK_BDEV_IO_TYPE_FLUSH: 2239 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2240 case SPDK_BDEV_IO_TYPE_NVME_IO: 2241 case SPDK_BDEV_IO_TYPE_ABORT: 2242 return true; 2243 2244 case SPDK_BDEV_IO_TYPE_COMPARE: 2245 return spdk_nvme_ns_supports_compare(ns); 2246 2247 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2248 return spdk_nvme_ns_get_md_size(ns) ? true : false; 2249 2250 case SPDK_BDEV_IO_TYPE_UNMAP: 2251 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2252 return cdata->oncs.dsm; 2253 2254 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2255 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2256 return cdata->oncs.write_zeroes; 2257 2258 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2259 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2260 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2261 return true; 2262 } 2263 return false; 2264 2265 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2266 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2267 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2268 2269 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2270 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2271 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2272 2273 default: 2274 return false; 2275 } 2276 } 2277 2278 static int 2279 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 2280 { 2281 struct nvme_qpair *nvme_qpair; 2282 struct spdk_io_channel *pg_ch; 2283 int rc; 2284 2285 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 2286 if (!nvme_qpair) { 2287 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 2288 return -1; 2289 } 2290 2291 TAILQ_INIT(&nvme_qpair->io_path_list); 2292 2293 nvme_qpair->ctrlr = nvme_ctrlr; 2294 nvme_qpair->ctrlr_ch = ctrlr_ch; 2295 2296 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2297 if (!pg_ch) { 2298 free(nvme_qpair); 2299 return -1; 2300 } 2301 2302 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 2303 2304 #ifdef SPDK_CONFIG_VTUNE 2305 nvme_qpair->group->collect_spin_stat = true; 2306 #else 2307 nvme_qpair->group->collect_spin_stat = false; 2308 #endif 2309 2310 rc = bdev_nvme_create_qpair(nvme_qpair); 2311 if (rc != 0) { 2312 /* nvme_ctrlr can't create IO qpair if connection is down. 2313 * 2314 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 2315 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 2316 * submitted IO will be queued until IO qpair is successfully created. 2317 * 2318 * Hence, if both are satisfied, ignore the failure. 
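		 * (That is, the failure is ignored only when reconnect_delay_sec != 0
		 * and g_opts.bdev_retry_count != 0; the check below returns the error
		 * in every other case.)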
2319 */ 2320 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 2321 spdk_put_io_channel(pg_ch); 2322 free(nvme_qpair); 2323 return rc; 2324 } 2325 } 2326 2327 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2328 2329 ctrlr_ch->qpair = nvme_qpair; 2330 2331 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 2332 nvme_qpair->ctrlr->ref++; 2333 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 2334 2335 return 0; 2336 } 2337 2338 static int 2339 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2340 { 2341 struct nvme_ctrlr *nvme_ctrlr = io_device; 2342 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2343 2344 TAILQ_INIT(&ctrlr_ch->pending_resets); 2345 2346 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 2347 } 2348 2349 static void 2350 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 2351 { 2352 assert(nvme_qpair->group != NULL); 2353 2354 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2355 2356 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 2357 2358 nvme_ctrlr_release(nvme_qpair->ctrlr); 2359 2360 free(nvme_qpair); 2361 } 2362 2363 static void 2364 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2365 { 2366 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2367 struct nvme_qpair *nvme_qpair; 2368 2369 nvme_qpair = ctrlr_ch->qpair; 2370 assert(nvme_qpair != NULL); 2371 2372 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2373 2374 if (nvme_qpair->qpair != NULL) { 2375 if (ctrlr_ch->reset_iter == NULL) { 2376 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2377 } else { 2378 /* Skip current ctrlr_channel in a full reset sequence because 2379 * it is being deleted now. The qpair is already being disconnected. 2380 * We do not have to restart disconnecting it. 2381 */ 2382 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2383 } 2384 2385 /* We cannot release a reference to the poll group now. 2386 * The qpair may be disconnected asynchronously later. 2387 * We need to poll it until it is actually disconnected. 2388 * Just detach the qpair from the deleting ctrlr_channel. 
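		 * The detached nvme_qpair is expected to be freed later, via
		 * nvme_qpair_delete(), once the poll group observes that the
		 * disconnect has completed (not in this callback).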
2389 */ 2390 nvme_qpair->ctrlr_ch = NULL; 2391 } else { 2392 assert(ctrlr_ch->reset_iter == NULL); 2393 2394 nvme_qpair_delete(nvme_qpair); 2395 } 2396 } 2397 2398 static void 2399 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2400 uint32_t iov_cnt, uint32_t seed, 2401 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2402 { 2403 struct nvme_poll_group *group = ctx; 2404 int rc; 2405 2406 assert(group->accel_channel != NULL); 2407 assert(cb_fn != NULL); 2408 2409 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2410 if (rc) { 2411 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2412 if (rc == -ENOMEM || rc == -EINVAL) { 2413 cb_fn(cb_arg, rc); 2414 } 2415 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2416 } 2417 } 2418 2419 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2420 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2421 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2422 }; 2423 2424 static int 2425 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2426 { 2427 struct nvme_poll_group *group = ctx_buf; 2428 2429 TAILQ_INIT(&group->qpair_list); 2430 2431 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2432 if (group->group == NULL) { 2433 return -1; 2434 } 2435 2436 group->accel_channel = spdk_accel_get_io_channel(); 2437 if (!group->accel_channel) { 2438 spdk_nvme_poll_group_destroy(group->group); 2439 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2440 group); 2441 return -1; 2442 } 2443 2444 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2445 2446 if (group->poller == NULL) { 2447 spdk_put_io_channel(group->accel_channel); 2448 spdk_nvme_poll_group_destroy(group->group); 2449 return -1; 2450 } 2451 2452 return 0; 2453 } 2454 2455 static void 2456 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2457 { 2458 struct nvme_poll_group *group = ctx_buf; 2459 2460 assert(TAILQ_EMPTY(&group->qpair_list)); 2461 2462 if (group->accel_channel) { 2463 spdk_put_io_channel(group->accel_channel); 2464 } 2465 2466 spdk_poller_unregister(&group->poller); 2467 if (spdk_nvme_poll_group_destroy(group->group)) { 2468 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2469 assert(false); 2470 } 2471 } 2472 2473 static struct spdk_io_channel * 2474 bdev_nvme_get_io_channel(void *ctx) 2475 { 2476 struct nvme_bdev *nvme_bdev = ctx; 2477 2478 return spdk_get_io_channel(nvme_bdev); 2479 } 2480 2481 static void * 2482 bdev_nvme_get_module_ctx(void *ctx) 2483 { 2484 struct nvme_bdev *nvme_bdev = ctx; 2485 struct nvme_ns *nvme_ns; 2486 2487 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2488 return NULL; 2489 } 2490 2491 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2492 if (!nvme_ns) { 2493 return NULL; 2494 } 2495 2496 return nvme_ns->ns; 2497 } 2498 2499 static const char * 2500 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2501 { 2502 switch (ana_state) { 2503 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2504 return "optimized"; 2505 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2506 return "non_optimized"; 2507 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2508 return "inaccessible"; 2509 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2510 return "persistent_loss"; 2511 case SPDK_NVME_ANA_CHANGE_STATE: 2512 return "change"; 2513 default: 2514 return NULL; 2515 } 2516 } 2517 
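/* Illustrative only: the strings returned by _nvme_ana_state_str() are exactly
 * what nvme_namespace_info_json() reports in the "ana_state" field, e.g.
 *
 *   _nvme_ana_state_str(SPDK_NVME_ANA_OPTIMIZED_STATE);    // "optimized"
 *   _nvme_ana_state_str(SPDK_NVME_ANA_INACCESSIBLE_STATE); // "inaccessible"
 *
 * Unrecognized values map to NULL, so callers must tolerate that.
 */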
2518 static int 2519 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 2520 { 2521 struct nvme_bdev *nbdev = ctx; 2522 struct nvme_ns *nvme_ns; 2523 2524 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2525 assert(nvme_ns != NULL); 2526 2527 return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size); 2528 } 2529 2530 static const char * 2531 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 2532 { 2533 if (nvme_ctrlr->destruct) { 2534 return "deleting"; 2535 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 2536 return "failed"; 2537 } else if (nvme_ctrlr->resetting) { 2538 return "resetting"; 2539 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 2540 return "reconnect_is_delayed"; 2541 } else { 2542 return "enabled"; 2543 } 2544 } 2545 2546 void 2547 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 2548 { 2549 struct spdk_nvme_transport_id *trid; 2550 const struct spdk_nvme_ctrlr_opts *opts; 2551 const struct spdk_nvme_ctrlr_data *cdata; 2552 2553 spdk_json_write_object_begin(w); 2554 2555 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 2556 2557 #ifdef SPDK_CONFIG_NVME_CUSE 2558 size_t cuse_name_size = 128; 2559 char cuse_name[cuse_name_size]; 2560 2561 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 2562 if (rc == 0) { 2563 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2564 } 2565 #endif 2566 trid = &nvme_ctrlr->active_path_id->trid; 2567 spdk_json_write_named_object_begin(w, "trid"); 2568 nvme_bdev_dump_trid_json(trid, w); 2569 spdk_json_write_object_end(w); 2570 2571 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 2572 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2573 2574 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 2575 spdk_json_write_named_object_begin(w, "host"); 2576 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 2577 spdk_json_write_named_string(w, "addr", opts->src_addr); 2578 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 2579 spdk_json_write_object_end(w); 2580 2581 spdk_json_write_object_end(w); 2582 } 2583 2584 static void 2585 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2586 struct nvme_ns *nvme_ns) 2587 { 2588 struct spdk_nvme_ns *ns; 2589 struct spdk_nvme_ctrlr *ctrlr; 2590 const struct spdk_nvme_ctrlr_data *cdata; 2591 const struct spdk_nvme_transport_id *trid; 2592 union spdk_nvme_vs_register vs; 2593 const struct spdk_nvme_ns_data *nsdata; 2594 char buf[128]; 2595 2596 ns = nvme_ns->ns; 2597 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2598 2599 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2600 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2601 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2602 2603 spdk_json_write_object_begin(w); 2604 2605 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2606 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2607 } 2608 2609 spdk_json_write_named_object_begin(w, "trid"); 2610 2611 nvme_bdev_dump_trid_json(trid, w); 2612 2613 spdk_json_write_object_end(w); 2614 2615 #ifdef SPDK_CONFIG_NVME_CUSE 2616 size_t cuse_name_size = 128; 2617 char cuse_name[cuse_name_size]; 2618 2619 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2620 cuse_name, &cuse_name_size); 2621 if (rc == 0) { 2622 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2623 } 2624 #endif 2625 2626 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2627 2628 spdk_json_write_named_uint16(w, 
"cntlid", cdata->cntlid); 2629 2630 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2631 2632 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2633 spdk_str_trim(buf); 2634 spdk_json_write_named_string(w, "model_number", buf); 2635 2636 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2637 spdk_str_trim(buf); 2638 spdk_json_write_named_string(w, "serial_number", buf); 2639 2640 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2641 spdk_str_trim(buf); 2642 spdk_json_write_named_string(w, "firmware_revision", buf); 2643 2644 if (cdata->subnqn[0] != '\0') { 2645 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2646 } 2647 2648 spdk_json_write_named_object_begin(w, "oacs"); 2649 2650 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2651 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2652 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2653 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2654 2655 spdk_json_write_object_end(w); 2656 2657 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 2658 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 2659 2660 spdk_json_write_object_end(w); 2661 2662 spdk_json_write_named_object_begin(w, "vs"); 2663 2664 spdk_json_write_name(w, "nvme_version"); 2665 if (vs.bits.ter) { 2666 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2667 } else { 2668 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2669 } 2670 2671 spdk_json_write_object_end(w); 2672 2673 nsdata = spdk_nvme_ns_get_data(ns); 2674 2675 spdk_json_write_named_object_begin(w, "ns_data"); 2676 2677 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 2678 2679 if (cdata->cmic.ana_reporting) { 2680 spdk_json_write_named_string(w, "ana_state", 2681 _nvme_ana_state_str(nvme_ns->ana_state)); 2682 } 2683 2684 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 2685 2686 spdk_json_write_object_end(w); 2687 2688 if (cdata->oacs.security) { 2689 spdk_json_write_named_object_begin(w, "security"); 2690 2691 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 2692 2693 spdk_json_write_object_end(w); 2694 } 2695 2696 spdk_json_write_object_end(w); 2697 } 2698 2699 static const char * 2700 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 2701 { 2702 switch (nbdev->mp_policy) { 2703 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 2704 return "active_passive"; 2705 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 2706 return "active_active"; 2707 default: 2708 assert(false); 2709 return "invalid"; 2710 } 2711 } 2712 2713 static int 2714 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 2715 { 2716 struct nvme_bdev *nvme_bdev = ctx; 2717 struct nvme_ns *nvme_ns; 2718 2719 pthread_mutex_lock(&nvme_bdev->mutex); 2720 spdk_json_write_named_array_begin(w, "nvme"); 2721 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 2722 nvme_namespace_info_json(w, nvme_ns); 2723 } 2724 spdk_json_write_array_end(w); 2725 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 2726 pthread_mutex_unlock(&nvme_bdev->mutex); 2727 2728 return 0; 2729 } 2730 2731 static void 2732 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2733 { 2734 /* No config per bdev needed */ 2735 } 2736 2737 static uint64_t 2738 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 2739 { 2740 struct nvme_bdev_channel *nbdev_ch = 
spdk_io_channel_get_ctx(ch); 2741 struct nvme_io_path *io_path; 2742 struct nvme_poll_group *group; 2743 uint64_t spin_time = 0; 2744 2745 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 2746 group = io_path->qpair->group; 2747 2748 if (!group || !group->collect_spin_stat) { 2749 continue; 2750 } 2751 2752 if (group->end_ticks != 0) { 2753 group->spin_ticks += (group->end_ticks - group->start_ticks); 2754 group->end_ticks = 0; 2755 } 2756 2757 spin_time += group->spin_ticks; 2758 group->start_ticks = 0; 2759 group->spin_ticks = 0; 2760 } 2761 2762 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 2763 } 2764 2765 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 2766 .destruct = bdev_nvme_destruct, 2767 .submit_request = bdev_nvme_submit_request, 2768 .io_type_supported = bdev_nvme_io_type_supported, 2769 .get_io_channel = bdev_nvme_get_io_channel, 2770 .dump_info_json = bdev_nvme_dump_info_json, 2771 .write_config_json = bdev_nvme_write_config_json, 2772 .get_spin_time = bdev_nvme_get_spin_time, 2773 .get_module_ctx = bdev_nvme_get_module_ctx, 2774 .get_memory_domains = bdev_nvme_get_memory_domains, 2775 }; 2776 2777 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 2778 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 2779 2780 static int 2781 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2782 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 2783 { 2784 struct spdk_nvme_ana_group_descriptor *copied_desc; 2785 uint8_t *orig_desc; 2786 uint32_t i, desc_size, copy_len; 2787 int rc = 0; 2788 2789 if (nvme_ctrlr->ana_log_page == NULL) { 2790 return -EINVAL; 2791 } 2792 2793 copied_desc = nvme_ctrlr->copied_ana_desc; 2794 2795 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 2796 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 2797 2798 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 2799 memcpy(copied_desc, orig_desc, copy_len); 2800 2801 rc = cb_fn(copied_desc, cb_arg); 2802 if (rc != 0) { 2803 break; 2804 } 2805 2806 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 2807 copied_desc->num_of_nsid * sizeof(uint32_t); 2808 orig_desc += desc_size; 2809 copy_len -= desc_size; 2810 } 2811 2812 return rc; 2813 } 2814 2815 static int 2816 nvme_ns_ana_transition_timedout(void *ctx) 2817 { 2818 struct nvme_ns *nvme_ns = ctx; 2819 2820 spdk_poller_unregister(&nvme_ns->anatt_timer); 2821 nvme_ns->ana_transition_timedout = true; 2822 2823 return SPDK_POLLER_BUSY; 2824 } 2825 2826 static void 2827 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 2828 const struct spdk_nvme_ana_group_descriptor *desc) 2829 { 2830 const struct spdk_nvme_ctrlr_data *cdata; 2831 2832 nvme_ns->ana_group_id = desc->ana_group_id; 2833 nvme_ns->ana_state = desc->ana_state; 2834 nvme_ns->ana_state_updating = false; 2835 2836 switch (nvme_ns->ana_state) { 2837 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2838 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2839 nvme_ns->ana_transition_timedout = false; 2840 spdk_poller_unregister(&nvme_ns->anatt_timer); 2841 break; 2842 2843 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2844 case SPDK_NVME_ANA_CHANGE_STATE: 2845 if (nvme_ns->anatt_timer != NULL) { 2846 break; 2847 } 2848 2849 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 2850 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 2851 nvme_ns, 2852 cdata->anatt * SPDK_SEC_TO_USEC); 2853 break; 2854 default: 2855 break; 2856 } 2857 } 2858 2859 static int 2860 
nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 2861 { 2862 struct nvme_ns *nvme_ns = cb_arg; 2863 uint32_t i; 2864 2865 for (i = 0; i < desc->num_of_nsid; i++) { 2866 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 2867 continue; 2868 } 2869 2870 _nvme_ns_set_ana_state(nvme_ns, desc); 2871 return 1; 2872 } 2873 2874 return 0; 2875 } 2876 2877 static int 2878 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 2879 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 2880 uint32_t prchk_flags, void *ctx) 2881 { 2882 const struct spdk_uuid *uuid; 2883 const uint8_t *nguid; 2884 const struct spdk_nvme_ctrlr_data *cdata; 2885 const struct spdk_nvme_ns_data *nsdata; 2886 const struct spdk_nvme_ctrlr_opts *opts; 2887 enum spdk_nvme_csi csi; 2888 uint32_t atomic_bs, phys_bs, bs; 2889 2890 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2891 csi = spdk_nvme_ns_get_csi(ns); 2892 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 2893 2894 switch (csi) { 2895 case SPDK_NVME_CSI_NVM: 2896 disk->product_name = "NVMe disk"; 2897 break; 2898 case SPDK_NVME_CSI_ZNS: 2899 disk->product_name = "NVMe ZNS disk"; 2900 disk->zoned = true; 2901 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 2902 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 2903 spdk_nvme_ns_get_extended_sector_size(ns); 2904 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 2905 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 2906 break; 2907 default: 2908 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 2909 return -ENOTSUP; 2910 } 2911 2912 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 2913 if (!disk->name) { 2914 return -ENOMEM; 2915 } 2916 2917 disk->write_cache = 0; 2918 if (cdata->vwc.present) { 2919 /* Enable if the Volatile Write Cache exists */ 2920 disk->write_cache = 1; 2921 } 2922 if (cdata->oncs.write_zeroes) { 2923 disk->max_write_zeroes = UINT16_MAX + 1; 2924 } 2925 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 2926 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 2927 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 2928 /* NVMe driver will split one request into multiple requests 2929 * based on MDTS and stripe boundary, the bdev layer will use 2930 * max_segment_size and max_num_segments to split one big IO 2931 * into multiple requests, then small request can't run out 2932 * of NVMe internal requests data structure. 
2933 */ 2934 if (opts && opts->io_queue_requests) { 2935 disk->max_num_segments = opts->io_queue_requests / 2; 2936 } 2937 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 2938 2939 nguid = spdk_nvme_ns_get_nguid(ns); 2940 if (!nguid) { 2941 uuid = spdk_nvme_ns_get_uuid(ns); 2942 if (uuid) { 2943 disk->uuid = *uuid; 2944 } 2945 } else { 2946 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 2947 } 2948 2949 nsdata = spdk_nvme_ns_get_data(ns); 2950 bs = spdk_nvme_ns_get_sector_size(ns); 2951 atomic_bs = bs; 2952 phys_bs = bs; 2953 if (nsdata->nabo == 0) { 2954 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 2955 atomic_bs = bs * (1 + nsdata->nawupf); 2956 } else { 2957 atomic_bs = bs * (1 + cdata->awupf); 2958 } 2959 } 2960 if (nsdata->nsfeat.optperf) { 2961 phys_bs = bs * (1 + nsdata->npwg); 2962 } 2963 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 2964 2965 disk->md_len = spdk_nvme_ns_get_md_size(ns); 2966 if (disk->md_len != 0) { 2967 disk->md_interleave = nsdata->flbas.extended; 2968 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 2969 if (disk->dif_type != SPDK_DIF_DISABLE) { 2970 disk->dif_is_head_of_md = nsdata->dps.md_start; 2971 disk->dif_check_flags = prchk_flags; 2972 } 2973 } 2974 2975 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 2976 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 2977 disk->acwu = 0; 2978 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 2979 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 2980 } else { 2981 disk->acwu = cdata->acwu + 1; /* 0-based */ 2982 } 2983 2984 disk->ctxt = ctx; 2985 disk->fn_table = &nvmelib_fn_table; 2986 disk->module = &nvme_if; 2987 2988 return 0; 2989 } 2990 2991 static int 2992 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 2993 { 2994 struct nvme_bdev *bdev; 2995 int rc; 2996 2997 bdev = calloc(1, sizeof(*bdev)); 2998 if (!bdev) { 2999 SPDK_ERRLOG("bdev calloc() failed\n"); 3000 return -ENOMEM; 3001 } 3002 3003 rc = pthread_mutex_init(&bdev->mutex, NULL); 3004 if (rc != 0) { 3005 free(bdev); 3006 return rc; 3007 } 3008 3009 bdev->ref = 1; 3010 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 3011 TAILQ_INIT(&bdev->nvme_ns_list); 3012 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3013 bdev->opal = nvme_ctrlr->opal_dev != NULL; 3014 3015 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 3016 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 3017 if (rc != 0) { 3018 SPDK_ERRLOG("Failed to create NVMe disk\n"); 3019 pthread_mutex_destroy(&bdev->mutex); 3020 free(bdev); 3021 return rc; 3022 } 3023 3024 spdk_io_device_register(bdev, 3025 bdev_nvme_create_bdev_channel_cb, 3026 bdev_nvme_destroy_bdev_channel_cb, 3027 sizeof(struct nvme_bdev_channel), 3028 bdev->disk.name); 3029 3030 rc = spdk_bdev_register(&bdev->disk); 3031 if (rc != 0) { 3032 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 3033 spdk_io_device_unregister(bdev, NULL); 3034 pthread_mutex_destroy(&bdev->mutex); 3035 free(bdev->disk.name); 3036 free(bdev); 3037 return rc; 3038 } 3039 3040 nvme_ns->bdev = bdev; 3041 bdev->nsid = nvme_ns->id; 3042 3043 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 3044 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 3045 3046 return 0; 3047 } 3048 3049 static bool 3050 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 3051 { 3052 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 3053 const struct spdk_uuid *uuid1, *uuid2; 3054 3055 nsdata1 = spdk_nvme_ns_get_data(ns1); 3056 
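	/* Two namespaces are treated as the same namespace only if their NGUID,
	 * EUI64, and UUID identifiers all match and they use the same command set
	 * (CSI), which is exactly what the expression below checks.
	 */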
nsdata2 = spdk_nvme_ns_get_data(ns2); 3057 uuid1 = spdk_nvme_ns_get_uuid(ns1); 3058 uuid2 = spdk_nvme_ns_get_uuid(ns2); 3059 3060 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 3061 nsdata1->eui64 == nsdata2->eui64 && 3062 ((uuid1 == NULL && uuid2 == NULL) || 3063 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 3064 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 3065 } 3066 3067 static bool 3068 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3069 struct spdk_nvme_ctrlr_opts *opts) 3070 { 3071 struct nvme_probe_skip_entry *entry; 3072 3073 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 3074 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3075 return false; 3076 } 3077 } 3078 3079 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 3080 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 3081 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 3082 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 3083 opts->disable_read_ana_log_page = true; 3084 3085 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 3086 3087 return true; 3088 } 3089 3090 static void 3091 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 3092 { 3093 struct nvme_ctrlr *nvme_ctrlr = ctx; 3094 3095 if (spdk_nvme_cpl_is_error(cpl)) { 3096 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 3097 cpl->status.sct); 3098 bdev_nvme_reset(nvme_ctrlr); 3099 } else if (cpl->cdw0 & 0x1) { 3100 SPDK_WARNLOG("Specified command could not be aborted.\n"); 3101 bdev_nvme_reset(nvme_ctrlr); 3102 } 3103 } 3104 3105 static void 3106 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 3107 struct spdk_nvme_qpair *qpair, uint16_t cid) 3108 { 3109 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3110 union spdk_nvme_csts_register csts; 3111 int rc; 3112 3113 assert(nvme_ctrlr->ctrlr == ctrlr); 3114 3115 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 3116 3117 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 3118 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 3119 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 3120 * completion recursively. 3121 */ 3122 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 3123 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 3124 if (csts.bits.cfs) { 3125 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 3126 bdev_nvme_reset(nvme_ctrlr); 3127 return; 3128 } 3129 } 3130 3131 switch (g_opts.action_on_timeout) { 3132 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 3133 if (qpair) { 3134 /* Don't send abort to ctrlr when ctrlr is not available. */ 3135 pthread_mutex_lock(&nvme_ctrlr->mutex); 3136 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 3137 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3138 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 3139 return; 3140 } 3141 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3142 3143 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 3144 nvme_abort_cpl, nvme_ctrlr); 3145 if (rc == 0) { 3146 return; 3147 } 3148 3149 SPDK_ERRLOG("Unable to send abort. 
Resetting, rc is %d.\n", rc); 3150 } 3151 3152 /* FALLTHROUGH */ 3153 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 3154 bdev_nvme_reset(nvme_ctrlr); 3155 break; 3156 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 3157 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 3158 break; 3159 default: 3160 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 3161 break; 3162 } 3163 } 3164 3165 static void 3166 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 3167 { 3168 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3169 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 3170 3171 if (rc == 0) { 3172 nvme_ns->probe_ctx = NULL; 3173 pthread_mutex_lock(&nvme_ctrlr->mutex); 3174 nvme_ctrlr->ref++; 3175 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3176 } else { 3177 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3178 free(nvme_ns); 3179 } 3180 3181 if (ctx) { 3182 ctx->populates_in_progress--; 3183 if (ctx->populates_in_progress == 0) { 3184 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3185 } 3186 } 3187 } 3188 3189 static void 3190 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 3191 { 3192 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3193 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3194 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3195 int rc; 3196 3197 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 3198 if (rc != 0) { 3199 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 3200 } 3201 3202 spdk_for_each_channel_continue(i, rc); 3203 } 3204 3205 static void 3206 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 3207 { 3208 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3209 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3210 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3211 struct nvme_io_path *io_path; 3212 3213 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 3214 if (io_path != NULL) { 3215 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 3216 } 3217 3218 spdk_for_each_channel_continue(i, 0); 3219 } 3220 3221 static void 3222 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 3223 { 3224 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3225 3226 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 3227 } 3228 3229 static void 3230 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 3231 { 3232 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3233 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 3234 3235 if (status == 0) { 3236 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 3237 } else { 3238 /* Delete the added io_paths and fail populating the namespace. 
*/ 3239 spdk_for_each_channel(bdev, 3240 bdev_nvme_delete_io_path, 3241 nvme_ns, 3242 bdev_nvme_add_io_path_failed); 3243 } 3244 } 3245 3246 static int 3247 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 3248 { 3249 struct nvme_ns *tmp_ns; 3250 const struct spdk_nvme_ns_data *nsdata; 3251 3252 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 3253 if (!nsdata->nmic.can_share) { 3254 SPDK_ERRLOG("Namespace cannot be shared.\n"); 3255 return -EINVAL; 3256 } 3257 3258 pthread_mutex_lock(&bdev->mutex); 3259 3260 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 3261 assert(tmp_ns != NULL); 3262 3263 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 3264 pthread_mutex_unlock(&bdev->mutex); 3265 SPDK_ERRLOG("Namespaces are not identical.\n"); 3266 return -EINVAL; 3267 } 3268 3269 bdev->ref++; 3270 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3271 nvme_ns->bdev = bdev; 3272 3273 pthread_mutex_unlock(&bdev->mutex); 3274 3275 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 3276 spdk_for_each_channel(bdev, 3277 bdev_nvme_add_io_path, 3278 nvme_ns, 3279 bdev_nvme_add_io_path_done); 3280 3281 return 0; 3282 } 3283 3284 static void 3285 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3286 { 3287 struct spdk_nvme_ns *ns; 3288 struct nvme_bdev *bdev; 3289 int rc = 0; 3290 3291 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 3292 if (!ns) { 3293 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 3294 rc = -EINVAL; 3295 goto done; 3296 } 3297 3298 nvme_ns->ns = ns; 3299 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3300 3301 if (nvme_ctrlr->ana_log_page != NULL) { 3302 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 3303 } 3304 3305 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 3306 if (bdev == NULL) { 3307 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 3308 } else { 3309 rc = nvme_bdev_add_ns(bdev, nvme_ns); 3310 if (rc == 0) { 3311 return; 3312 } 3313 } 3314 done: 3315 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 3316 } 3317 3318 static void 3319 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 3320 { 3321 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3322 3323 assert(nvme_ctrlr != NULL); 3324 3325 pthread_mutex_lock(&nvme_ctrlr->mutex); 3326 3327 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3328 3329 if (nvme_ns->bdev != NULL) { 3330 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3331 return; 3332 } 3333 3334 free(nvme_ns); 3335 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3336 3337 nvme_ctrlr_release(nvme_ctrlr); 3338 } 3339 3340 static void 3341 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 3342 { 3343 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3344 3345 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3346 } 3347 3348 static void 3349 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3350 { 3351 struct nvme_bdev *bdev; 3352 3353 spdk_poller_unregister(&nvme_ns->anatt_timer); 3354 3355 bdev = nvme_ns->bdev; 3356 if (bdev != NULL) { 3357 pthread_mutex_lock(&bdev->mutex); 3358 3359 assert(bdev->ref > 0); 3360 bdev->ref--; 3361 if (bdev->ref == 0) { 3362 pthread_mutex_unlock(&bdev->mutex); 3363 3364 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 3365 } else { 3366 /* spdk_bdev_unregister() is not called until the last nvme_ns is 3367 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 3368 * and clear nvme_ns->bdev here. 
3369 */ 3370 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 3371 nvme_ns->bdev = NULL; 3372 3373 pthread_mutex_unlock(&bdev->mutex); 3374 3375 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 3376 * we call depopulate_namespace_done() to avoid use-after-free. 3377 */ 3378 spdk_for_each_channel(bdev, 3379 bdev_nvme_delete_io_path, 3380 nvme_ns, 3381 bdev_nvme_delete_io_path_done); 3382 return; 3383 } 3384 } 3385 3386 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3387 } 3388 3389 static void 3390 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3391 struct nvme_async_probe_ctx *ctx) 3392 { 3393 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3394 struct nvme_ns *nvme_ns, *next; 3395 struct spdk_nvme_ns *ns; 3396 struct nvme_bdev *bdev; 3397 uint32_t nsid; 3398 int rc; 3399 uint64_t num_sectors; 3400 3401 if (ctx) { 3402 /* Initialize this count to 1 to handle the populate functions 3403 * calling nvme_ctrlr_populate_namespace_done() immediately. 3404 */ 3405 ctx->populates_in_progress = 1; 3406 } 3407 3408 /* First loop over our existing namespaces and see if they have been 3409 * removed. */ 3410 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3411 while (nvme_ns != NULL) { 3412 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3413 3414 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 3415 /* NS is still there but attributes may have changed */ 3416 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 3417 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 3418 bdev = nvme_ns->bdev; 3419 assert(bdev != NULL); 3420 if (bdev->disk.blockcnt != num_sectors) { 3421 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 3422 nvme_ns->id, 3423 bdev->disk.name, 3424 bdev->disk.blockcnt, 3425 num_sectors); 3426 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 3427 if (rc != 0) { 3428 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 3429 bdev->disk.name, rc); 3430 } 3431 } 3432 } else { 3433 /* Namespace was removed */ 3434 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3435 } 3436 3437 nvme_ns = next; 3438 } 3439 3440 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3441 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3442 while (nsid != 0) { 3443 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3444 3445 if (nvme_ns == NULL) { 3446 /* Found a new one */ 3447 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3448 if (nvme_ns == NULL) { 3449 SPDK_ERRLOG("Failed to allocate namespace\n"); 3450 /* This just fails to attach the namespace. It may work on a future attempt. */ 3451 continue; 3452 } 3453 3454 nvme_ns->id = nsid; 3455 nvme_ns->ctrlr = nvme_ctrlr; 3456 3457 nvme_ns->bdev = NULL; 3458 3459 if (ctx) { 3460 ctx->populates_in_progress++; 3461 } 3462 nvme_ns->probe_ctx = ctx; 3463 3464 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3465 3466 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3467 } 3468 3469 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3470 } 3471 3472 if (ctx) { 3473 /* Decrement this count now that the loop is over to account 3474 * for the one we started with. If the count is then 0, we 3475 * know any populate_namespace functions completed immediately, 3476 * so we'll kick the callback here. 
3477 */ 3478 ctx->populates_in_progress--; 3479 if (ctx->populates_in_progress == 0) { 3480 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3481 } 3482 } 3483 3484 } 3485 3486 static void 3487 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 3488 { 3489 struct nvme_ns *nvme_ns, *tmp; 3490 3491 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 3492 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3493 } 3494 } 3495 3496 static uint32_t 3497 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 3498 { 3499 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3500 const struct spdk_nvme_ctrlr_data *cdata; 3501 uint32_t nsid, ns_count = 0; 3502 3503 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3504 3505 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3506 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 3507 ns_count++; 3508 } 3509 3510 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3511 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 3512 sizeof(uint32_t); 3513 } 3514 3515 static int 3516 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 3517 void *cb_arg) 3518 { 3519 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3520 struct nvme_ns *nvme_ns; 3521 uint32_t i, nsid; 3522 3523 for (i = 0; i < desc->num_of_nsid; i++) { 3524 nsid = desc->nsid[i]; 3525 if (nsid == 0) { 3526 continue; 3527 } 3528 3529 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3530 3531 assert(nvme_ns != NULL); 3532 if (nvme_ns == NULL) { 3533 /* Target told us that an inactive namespace had an ANA change */ 3534 continue; 3535 } 3536 3537 _nvme_ns_set_ana_state(nvme_ns, desc); 3538 } 3539 3540 return 0; 3541 } 3542 3543 static void 3544 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3545 { 3546 struct nvme_ns *nvme_ns; 3547 3548 spdk_free(nvme_ctrlr->ana_log_page); 3549 nvme_ctrlr->ana_log_page = NULL; 3550 3551 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3552 nvme_ns != NULL; 3553 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 3554 nvme_ns->ana_state_updating = false; 3555 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3556 } 3557 } 3558 3559 static void 3560 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 3561 { 3562 struct nvme_ctrlr *nvme_ctrlr = ctx; 3563 3564 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 3565 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 3566 nvme_ctrlr); 3567 } else { 3568 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 3569 } 3570 3571 pthread_mutex_lock(&nvme_ctrlr->mutex); 3572 3573 assert(nvme_ctrlr->ana_log_page_updating == true); 3574 nvme_ctrlr->ana_log_page_updating = false; 3575 3576 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 3577 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3578 3579 nvme_ctrlr_unregister(nvme_ctrlr); 3580 } else { 3581 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3582 3583 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 3584 } 3585 } 3586 3587 static int 3588 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3589 { 3590 uint32_t ana_log_page_size; 3591 int rc; 3592 3593 if (nvme_ctrlr->ana_log_page == NULL) { 3594 return -EINVAL; 3595 } 3596 3597 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 3598 3599 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 3600 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 3601 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 3602 
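		/* The read buffer was sized once, in nvme_ctrlr_init_ana_log_page();
		 * reading a larger log page here would overrun it, so fail the update
		 * instead of truncating the read.
		 */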
return -EINVAL; 3603 } 3604 3605 pthread_mutex_lock(&nvme_ctrlr->mutex); 3606 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 3607 nvme_ctrlr->ana_log_page_updating) { 3608 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3609 return -EBUSY; 3610 } 3611 3612 nvme_ctrlr->ana_log_page_updating = true; 3613 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3614 3615 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 3616 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3617 SPDK_NVME_GLOBAL_NS_TAG, 3618 nvme_ctrlr->ana_log_page, 3619 ana_log_page_size, 0, 3620 nvme_ctrlr_read_ana_log_page_done, 3621 nvme_ctrlr); 3622 if (rc != 0) { 3623 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 3624 } 3625 3626 return rc; 3627 } 3628 3629 static void 3630 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 3631 { 3632 } 3633 3634 struct bdev_nvme_set_preferred_path_ctx { 3635 struct spdk_bdev_desc *desc; 3636 struct nvme_ns *nvme_ns; 3637 bdev_nvme_set_preferred_path_cb cb_fn; 3638 void *cb_arg; 3639 }; 3640 3641 static void 3642 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 3643 { 3644 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3645 3646 assert(ctx != NULL); 3647 assert(ctx->desc != NULL); 3648 assert(ctx->cb_fn != NULL); 3649 3650 spdk_bdev_close(ctx->desc); 3651 3652 ctx->cb_fn(ctx->cb_arg, status); 3653 3654 free(ctx); 3655 } 3656 3657 static void 3658 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 3659 { 3660 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3661 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3662 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3663 struct nvme_io_path *io_path, *prev; 3664 3665 prev = NULL; 3666 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3667 if (io_path->nvme_ns == ctx->nvme_ns) { 3668 break; 3669 } 3670 prev = io_path; 3671 } 3672 3673 if (io_path != NULL) { 3674 if (prev != NULL) { 3675 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 3676 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 3677 } 3678 3679 /* We can set io_path to nbdev_ch->current_io_path directly here. 3680 * However, it needs to be conditional. To simplify the code, 3681 * just clear nbdev_ch->current_io_path and let find_io_path() 3682 * fill it. 3683 * 3684 * Automatic failback may be disabled. Hence even if the io_path is 3685 * already at the head, clear nbdev_ch->current_io_path. 3686 */ 3687 nbdev_ch->current_io_path = NULL; 3688 } 3689 3690 spdk_for_each_channel_continue(i, 0); 3691 } 3692 3693 static struct nvme_ns * 3694 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 3695 { 3696 struct nvme_ns *nvme_ns, *prev; 3697 const struct spdk_nvme_ctrlr_data *cdata; 3698 3699 prev = NULL; 3700 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3701 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3702 3703 if (cdata->cntlid == cntlid) { 3704 break; 3705 } 3706 prev = nvme_ns; 3707 } 3708 3709 if (nvme_ns != NULL && prev != NULL) { 3710 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 3711 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 3712 } 3713 3714 return nvme_ns; 3715 } 3716 3717 /* This function supports only multipath mode. There is only a single I/O path 3718 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 3719 * head of the I/O path list for each NVMe bdev channel. 
3720 * 3721 * NVMe bdev channel may be acquired after completing this function. move the 3722 * matched namespace to the head of the namespace list for the NVMe bdev too. 3723 */ 3724 void 3725 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 3726 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 3727 { 3728 struct bdev_nvme_set_preferred_path_ctx *ctx; 3729 struct spdk_bdev *bdev; 3730 struct nvme_bdev *nbdev; 3731 int rc = 0; 3732 3733 assert(cb_fn != NULL); 3734 3735 ctx = calloc(1, sizeof(*ctx)); 3736 if (ctx == NULL) { 3737 SPDK_ERRLOG("Failed to alloc context.\n"); 3738 rc = -ENOMEM; 3739 goto err_alloc; 3740 } 3741 3742 ctx->cb_fn = cb_fn; 3743 ctx->cb_arg = cb_arg; 3744 3745 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3746 if (rc != 0) { 3747 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 3748 goto err_open; 3749 } 3750 3751 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3752 3753 if (bdev->module != &nvme_if) { 3754 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 3755 rc = -ENODEV; 3756 goto err_bdev; 3757 } 3758 3759 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3760 3761 pthread_mutex_lock(&nbdev->mutex); 3762 3763 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 3764 if (ctx->nvme_ns == NULL) { 3765 pthread_mutex_unlock(&nbdev->mutex); 3766 3767 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 3768 rc = -ENODEV; 3769 goto err_bdev; 3770 } 3771 3772 pthread_mutex_unlock(&nbdev->mutex); 3773 3774 spdk_for_each_channel(nbdev, 3775 _bdev_nvme_set_preferred_path, 3776 ctx, 3777 bdev_nvme_set_preferred_path_done); 3778 return; 3779 3780 err_bdev: 3781 spdk_bdev_close(ctx->desc); 3782 err_open: 3783 free(ctx); 3784 err_alloc: 3785 cb_fn(cb_arg, rc); 3786 } 3787 3788 struct bdev_nvme_set_multipath_policy_ctx { 3789 struct spdk_bdev_desc *desc; 3790 bdev_nvme_set_multipath_policy_cb cb_fn; 3791 void *cb_arg; 3792 }; 3793 3794 static void 3795 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 3796 { 3797 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3798 3799 assert(ctx != NULL); 3800 assert(ctx->desc != NULL); 3801 assert(ctx->cb_fn != NULL); 3802 3803 spdk_bdev_close(ctx->desc); 3804 3805 ctx->cb_fn(ctx->cb_arg, status); 3806 3807 free(ctx); 3808 } 3809 3810 static void 3811 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 3812 { 3813 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3814 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3815 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 3816 3817 nbdev_ch->mp_policy = nbdev->mp_policy; 3818 nbdev_ch->current_io_path = NULL; 3819 3820 spdk_for_each_channel_continue(i, 0); 3821 } 3822 3823 void 3824 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 3825 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 3826 { 3827 struct bdev_nvme_set_multipath_policy_ctx *ctx; 3828 struct spdk_bdev *bdev; 3829 struct nvme_bdev *nbdev; 3830 int rc; 3831 3832 assert(cb_fn != NULL); 3833 3834 ctx = calloc(1, sizeof(*ctx)); 3835 if (ctx == NULL) { 3836 SPDK_ERRLOG("Failed to alloc context.\n"); 3837 rc = -ENOMEM; 3838 goto err_alloc; 3839 } 3840 3841 ctx->cb_fn = cb_fn; 3842 ctx->cb_arg = cb_arg; 3843 3844 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3845 if (rc != 0) { 3846 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 3847 rc = 
-ENODEV; 3848 goto err_open; 3849 } 3850 3851 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3852 if (bdev->module != &nvme_if) { 3853 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 3854 rc = -ENODEV; 3855 goto err_module; 3856 } 3857 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3858 3859 pthread_mutex_lock(&nbdev->mutex); 3860 nbdev->mp_policy = policy; 3861 pthread_mutex_unlock(&nbdev->mutex); 3862 3863 spdk_for_each_channel(nbdev, 3864 _bdev_nvme_set_multipath_policy, 3865 ctx, 3866 bdev_nvme_set_multipath_policy_done); 3867 return; 3868 3869 err_module: 3870 spdk_bdev_close(ctx->desc); 3871 err_open: 3872 free(ctx); 3873 err_alloc: 3874 cb_fn(cb_arg, rc); 3875 } 3876 3877 static void 3878 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 3879 { 3880 struct nvme_ctrlr *nvme_ctrlr = arg; 3881 union spdk_nvme_async_event_completion event; 3882 3883 if (spdk_nvme_cpl_is_error(cpl)) { 3884 SPDK_WARNLOG("AER request execute failed\n"); 3885 return; 3886 } 3887 3888 event.raw = cpl->cdw0; 3889 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3890 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 3891 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 3892 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3893 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 3894 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 3895 } 3896 } 3897 3898 static void 3899 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 3900 { 3901 if (ctx->cb_fn) { 3902 ctx->cb_fn(ctx->cb_ctx, count, rc); 3903 } 3904 3905 ctx->namespaces_populated = true; 3906 if (ctx->probe_done) { 3907 /* The probe was already completed, so we need to free the context 3908 * here. This can happen for cases like OCSSD, where we need to 3909 * send additional commands to the SSD after attach. 3910 */ 3911 free(ctx); 3912 } 3913 } 3914 3915 static void 3916 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 3917 struct nvme_async_probe_ctx *ctx) 3918 { 3919 spdk_io_device_register(nvme_ctrlr, 3920 bdev_nvme_create_ctrlr_channel_cb, 3921 bdev_nvme_destroy_ctrlr_channel_cb, 3922 sizeof(struct nvme_ctrlr_channel), 3923 nvme_ctrlr->nbdev_ctrlr->name); 3924 3925 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 3926 } 3927 3928 static void 3929 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 3930 { 3931 struct nvme_ctrlr *nvme_ctrlr = _ctx; 3932 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 3933 3934 nvme_ctrlr->probe_ctx = NULL; 3935 3936 if (spdk_nvme_cpl_is_error(cpl)) { 3937 nvme_ctrlr_delete(nvme_ctrlr); 3938 3939 if (ctx != NULL) { 3940 populate_namespaces_cb(ctx, 0, -1); 3941 } 3942 return; 3943 } 3944 3945 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 3946 } 3947 3948 static int 3949 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3950 struct nvme_async_probe_ctx *ctx) 3951 { 3952 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3953 const struct spdk_nvme_ctrlr_data *cdata; 3954 uint32_t ana_log_page_size; 3955 3956 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3957 3958 /* Set buffer size enough to include maximum number of allowed namespaces. 
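	 *
	 * As a worked example (assuming the spec-defined 16-byte ANA log page
	 * header, 32-byte group descriptors, and 4-byte NSID entries), a
	 * controller reporting nanagrpid = 32 and mnan = 1024 would get
	 * 16 + 32 * 32 + 1024 * 4 = 5136 bytes from the computation below.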
*/ 3959 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3960 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 3961 sizeof(uint32_t); 3962 3963 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 3964 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3965 if (nvme_ctrlr->ana_log_page == NULL) { 3966 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 3967 return -ENXIO; 3968 } 3969 3970 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 3971 * Hence copy each descriptor to a temporary area when parsing it. 3972 * 3973 * Allocate a buffer whose size is as large as ANA log page buffer because 3974 * we do not know the size of a descriptor until actually reading it. 3975 */ 3976 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 3977 if (nvme_ctrlr->copied_ana_desc == NULL) { 3978 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 3979 return -ENOMEM; 3980 } 3981 3982 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 3983 3984 nvme_ctrlr->probe_ctx = ctx; 3985 3986 /* Then, set the read size only to include the current active namespaces. */ 3987 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 3988 3989 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 3990 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 3991 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 3992 return -EINVAL; 3993 } 3994 3995 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 3996 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3997 SPDK_NVME_GLOBAL_NS_TAG, 3998 nvme_ctrlr->ana_log_page, 3999 ana_log_page_size, 0, 4000 nvme_ctrlr_init_ana_log_page_done, 4001 nvme_ctrlr); 4002 } 4003 4004 /* hostnqn and subnqn were already verified before attaching a controller. 4005 * Hence check only the multipath capability and cntlid here. 
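 * A duplicated cntlid would mean the same controller is being attached twice, so that
 * case is rejected as well.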
 */
static bool
bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *tmp;
	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!cdata->cmic.multi_ctrlr) {
		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
		return false;
	}

	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);

		if (!tmp_cdata->cmic.multi_ctrlr) {
			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
			return false;
		}
		if (cdata->cntlid == tmp_cdata->cntlid) {
			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
			return false;
		}
	}

	return true;
}

static int
nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	int rc = 0;

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
			rc = -EINVAL;
			goto exit;
		}
	} else {
		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
		if (nbdev_ctrlr == NULL) {
			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
			rc = -ENOMEM;
			goto exit;
		}
		nbdev_ctrlr->name = strdup(name);
		if (nbdev_ctrlr->name == NULL) {
			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
			free(nbdev_ctrlr);
			rc = -ENOMEM;
			goto exit;
		}
		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
		TAILQ_INIT(&nbdev_ctrlr->bdevs);
		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
	}
	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
exit:
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return rc;
}

static int
nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
		  const char *name,
		  const struct spdk_nvme_transport_id *trid,
		  struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_path_id *path_id;
	const struct spdk_nvme_ctrlr_data *cdata;
	int rc;

	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
	if (nvme_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to allocate device struct\n");
		return -ENOMEM;
	}

	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
	if (rc != 0) {
		free(nvme_ctrlr);
		return rc;
	}

	TAILQ_INIT(&nvme_ctrlr->trids);

	RB_INIT(&nvme_ctrlr->namespaces);

	path_id = calloc(1, sizeof(*path_id));
	if (path_id == NULL) {
		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
		rc = -ENOMEM;
		goto err;
	}

	path_id->trid = *trid;
	if (ctx != NULL) {
		memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
		memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
	}
	nvme_ctrlr->active_path_id = path_id;
	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);

	nvme_ctrlr->thread = spdk_get_thread();
	nvme_ctrlr->ctrlr = ctrlr;
	nvme_ctrlr->ref = 1;

	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
		SPDK_ERRLOG("OCSSDs are not supported\n");
		rc = -ENOTSUP;
		goto err;
	}

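	/* Use the controller options supplied with the probe context when one is given;
	 * otherwise fall back to the module-level defaults derived from g_opts.
	 */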
if (ctx != NULL) { 4128 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 4129 } else { 4130 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 4131 } 4132 4133 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 4134 g_opts.nvme_adminq_poll_period_us); 4135 4136 if (g_opts.timeout_us > 0) { 4137 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 4138 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 4139 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 4140 g_opts.timeout_us : g_opts.timeout_admin_us; 4141 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 4142 adm_timeout_us, timeout_cb, nvme_ctrlr); 4143 } 4144 4145 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 4146 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 4147 4148 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 4149 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 4150 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 4151 } 4152 4153 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 4154 if (rc != 0) { 4155 goto err; 4156 } 4157 4158 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4159 4160 if (cdata->cmic.ana_reporting) { 4161 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 4162 if (rc == 0) { 4163 return 0; 4164 } 4165 } else { 4166 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4167 return 0; 4168 } 4169 4170 err: 4171 nvme_ctrlr_delete(nvme_ctrlr); 4172 return rc; 4173 } 4174 4175 void 4176 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 4177 { 4178 opts->prchk_flags = 0; 4179 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 4180 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 4181 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 4182 } 4183 4184 static void 4185 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4186 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 4187 { 4188 char *name; 4189 4190 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 4191 if (!name) { 4192 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 4193 return; 4194 } 4195 4196 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 4197 4198 nvme_ctrlr_create(ctrlr, name, trid, NULL); 4199 4200 free(name); 4201 } 4202 4203 static void 4204 _nvme_ctrlr_destruct(void *ctx) 4205 { 4206 struct nvme_ctrlr *nvme_ctrlr = ctx; 4207 4208 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 4209 nvme_ctrlr_release(nvme_ctrlr); 4210 } 4211 4212 static int 4213 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4214 { 4215 struct nvme_probe_skip_entry *entry; 4216 4217 pthread_mutex_lock(&nvme_ctrlr->mutex); 4218 4219 /* The controller's destruction was already started */ 4220 if (nvme_ctrlr->destruct) { 4221 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4222 return 0; 4223 } 4224 4225 if (!hotplug && 4226 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 4227 entry = calloc(1, sizeof(*entry)); 4228 if (!entry) { 4229 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4230 return -ENOMEM; 4231 } 4232 entry->trid = nvme_ctrlr->active_path_id->trid; 4233 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 4234 } 4235 4236 nvme_ctrlr->destruct = true; 4237 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4238 4239 _nvme_ctrlr_destruct(nvme_ctrlr); 4240 4241 return 0; 4242 } 4243 4244 static void 4245 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr 
*ctrlr) 4246 { 4247 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 4248 4249 _bdev_nvme_delete(nvme_ctrlr, true); 4250 } 4251 4252 static int 4253 bdev_nvme_hotplug_probe(void *arg) 4254 { 4255 if (g_hotplug_probe_ctx == NULL) { 4256 spdk_poller_unregister(&g_hotplug_probe_poller); 4257 return SPDK_POLLER_IDLE; 4258 } 4259 4260 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 4261 g_hotplug_probe_ctx = NULL; 4262 spdk_poller_unregister(&g_hotplug_probe_poller); 4263 } 4264 4265 return SPDK_POLLER_BUSY; 4266 } 4267 4268 static int 4269 bdev_nvme_hotplug(void *arg) 4270 { 4271 struct spdk_nvme_transport_id trid_pcie; 4272 4273 if (g_hotplug_probe_ctx) { 4274 return SPDK_POLLER_BUSY; 4275 } 4276 4277 memset(&trid_pcie, 0, sizeof(trid_pcie)); 4278 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 4279 4280 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 4281 hotplug_probe_cb, attach_cb, NULL); 4282 4283 if (g_hotplug_probe_ctx) { 4284 assert(g_hotplug_probe_poller == NULL); 4285 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 4286 } 4287 4288 return SPDK_POLLER_BUSY; 4289 } 4290 4291 void 4292 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 4293 { 4294 *opts = g_opts; 4295 } 4296 4297 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4298 uint32_t reconnect_delay_sec, 4299 uint32_t fast_io_fail_timeout_sec); 4300 4301 static int 4302 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 4303 { 4304 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 4305 /* Can't set timeout_admin_us without also setting timeout_us */ 4306 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 4307 return -EINVAL; 4308 } 4309 4310 if (opts->bdev_retry_count < -1) { 4311 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 4312 return -EINVAL; 4313 } 4314 4315 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 4316 opts->reconnect_delay_sec, 4317 opts->fast_io_fail_timeout_sec)) { 4318 return -EINVAL; 4319 } 4320 4321 return 0; 4322 } 4323 4324 int 4325 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 4326 { 4327 int ret = bdev_nvme_validate_opts(opts); 4328 if (ret) { 4329 SPDK_WARNLOG("Failed to set nvme opts.\n"); 4330 return ret; 4331 } 4332 4333 if (g_bdev_nvme_init_thread != NULL) { 4334 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4335 return -EPERM; 4336 } 4337 } 4338 4339 g_opts = *opts; 4340 4341 return 0; 4342 } 4343 4344 struct set_nvme_hotplug_ctx { 4345 uint64_t period_us; 4346 bool enabled; 4347 spdk_msg_fn fn; 4348 void *fn_ctx; 4349 }; 4350 4351 static void 4352 set_nvme_hotplug_period_cb(void *_ctx) 4353 { 4354 struct set_nvme_hotplug_ctx *ctx = _ctx; 4355 4356 spdk_poller_unregister(&g_hotplug_poller); 4357 if (ctx->enabled) { 4358 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 4359 } 4360 4361 g_nvme_hotplug_poll_period_us = ctx->period_us; 4362 g_nvme_hotplug_enabled = ctx->enabled; 4363 if (ctx->fn) { 4364 ctx->fn(ctx->fn_ctx); 4365 } 4366 4367 free(ctx); 4368 } 4369 4370 int 4371 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 4372 { 4373 struct set_nvme_hotplug_ctx *ctx; 4374 4375 if (enabled == true && !spdk_process_is_primary()) { 4376 return -EPERM; 4377 } 4378 4379 ctx = calloc(1, sizeof(*ctx)); 4380 if (ctx == NULL) { 4381 return -ENOMEM; 4382 } 4383 4384 period_us = 
period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 4385 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 4386 ctx->enabled = enabled; 4387 ctx->fn = cb; 4388 ctx->fn_ctx = cb_ctx; 4389 4390 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 4391 return 0; 4392 } 4393 4394 static void 4395 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 4396 struct nvme_async_probe_ctx *ctx) 4397 { 4398 struct nvme_ns *nvme_ns; 4399 struct nvme_bdev *nvme_bdev; 4400 size_t j; 4401 4402 assert(nvme_ctrlr != NULL); 4403 4404 if (ctx->names == NULL) { 4405 populate_namespaces_cb(ctx, 0, 0); 4406 return; 4407 } 4408 4409 /* 4410 * Report the new bdevs that were created in this call. 4411 * There can be more than one bdev per NVMe controller. 4412 */ 4413 j = 0; 4414 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4415 while (nvme_ns != NULL) { 4416 nvme_bdev = nvme_ns->bdev; 4417 if (j < ctx->count) { 4418 ctx->names[j] = nvme_bdev->disk.name; 4419 j++; 4420 } else { 4421 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 4422 ctx->count); 4423 populate_namespaces_cb(ctx, 0, -ERANGE); 4424 return; 4425 } 4426 4427 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4428 } 4429 4430 populate_namespaces_cb(ctx, j, 0); 4431 } 4432 4433 static int 4434 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4435 struct spdk_nvme_ctrlr *new_ctrlr, 4436 struct spdk_nvme_transport_id *trid) 4437 { 4438 struct nvme_path_id *tmp_trid; 4439 4440 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4441 SPDK_ERRLOG("PCIe failover is not supported.\n"); 4442 return -ENOTSUP; 4443 } 4444 4445 /* Currently we only support failover to the same transport type. */ 4446 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 4447 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 4448 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 4449 spdk_nvme_transport_id_trtype_str(trid->trtype)); 4450 return -EINVAL; 4451 } 4452 4453 4454 /* Currently we only support failover to the same NQN. */ 4455 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 4456 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 4457 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 4458 return -EINVAL; 4459 } 4460 4461 /* Skip all the other checks if we've already registered this path. 
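	 * spdk_nvme_transport_id_compare() returns 0 on an exact match, which is reported
	 * back to the caller as -EEXIST.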
*/ 4462 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4463 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 4464 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 4465 trid->subnqn); 4466 return -EEXIST; 4467 } 4468 } 4469 4470 return 0; 4471 } 4472 4473 static int 4474 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 4475 struct spdk_nvme_ctrlr *new_ctrlr) 4476 { 4477 struct nvme_ns *nvme_ns; 4478 struct spdk_nvme_ns *new_ns; 4479 4480 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4481 while (nvme_ns != NULL) { 4482 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 4483 assert(new_ns != NULL); 4484 4485 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 4486 return -EINVAL; 4487 } 4488 4489 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4490 } 4491 4492 return 0; 4493 } 4494 4495 static int 4496 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4497 struct spdk_nvme_transport_id *trid) 4498 { 4499 struct nvme_path_id *new_trid, *tmp_trid; 4500 4501 new_trid = calloc(1, sizeof(*new_trid)); 4502 if (new_trid == NULL) { 4503 return -ENOMEM; 4504 } 4505 new_trid->trid = *trid; 4506 new_trid->is_failed = false; 4507 4508 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4509 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 4510 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 4511 return 0; 4512 } 4513 } 4514 4515 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 4516 return 0; 4517 } 4518 4519 /* This is the case that a secondary path is added to an existing 4520 * nvme_ctrlr for failover. After checking if it can access the same 4521 * namespaces as the primary path, it is disconnected until failover occurs. 4522 */ 4523 static int 4524 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4525 struct spdk_nvme_ctrlr *new_ctrlr, 4526 struct spdk_nvme_transport_id *trid) 4527 { 4528 int rc; 4529 4530 assert(nvme_ctrlr != NULL); 4531 4532 pthread_mutex_lock(&nvme_ctrlr->mutex); 4533 4534 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 4535 if (rc != 0) { 4536 goto exit; 4537 } 4538 4539 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 4540 if (rc != 0) { 4541 goto exit; 4542 } 4543 4544 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 4545 4546 exit: 4547 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4548 4549 spdk_nvme_detach(new_ctrlr); 4550 4551 return rc; 4552 } 4553 4554 static void 4555 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4556 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 4557 { 4558 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4559 struct nvme_async_probe_ctx *ctx; 4560 int rc; 4561 4562 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4563 ctx->ctrlr_attached = true; 4564 4565 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 4566 if (rc != 0) { 4567 populate_namespaces_cb(ctx, 0, rc); 4568 } 4569 } 4570 4571 static void 4572 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4573 struct spdk_nvme_ctrlr *ctrlr, 4574 const struct spdk_nvme_ctrlr_opts *opts) 4575 { 4576 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4577 struct nvme_ctrlr *nvme_ctrlr; 4578 struct nvme_async_probe_ctx *ctx; 4579 int rc; 4580 4581 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4582 ctx->ctrlr_attached = true; 4583 4584 nvme_ctrlr = 
nvme_ctrlr_get_by_name(ctx->base_name); 4585 if (nvme_ctrlr) { 4586 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 4587 } else { 4588 rc = -ENODEV; 4589 } 4590 4591 populate_namespaces_cb(ctx, 0, rc); 4592 } 4593 4594 static int 4595 bdev_nvme_async_poll(void *arg) 4596 { 4597 struct nvme_async_probe_ctx *ctx = arg; 4598 int rc; 4599 4600 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 4601 if (spdk_unlikely(rc != -EAGAIN)) { 4602 ctx->probe_done = true; 4603 spdk_poller_unregister(&ctx->poller); 4604 if (!ctx->ctrlr_attached) { 4605 /* The probe is done, but no controller was attached. 4606 * That means we had a failure, so report -EIO back to 4607 * the caller (usually the RPC). populate_namespaces_cb() 4608 * will take care of freeing the nvme_async_probe_ctx. 4609 */ 4610 populate_namespaces_cb(ctx, 0, -EIO); 4611 } else if (ctx->namespaces_populated) { 4612 /* The namespaces for the attached controller were all 4613 * populated and the response was already sent to the 4614 * caller (usually the RPC). So free the context here. 4615 */ 4616 free(ctx); 4617 } 4618 } 4619 4620 return SPDK_POLLER_BUSY; 4621 } 4622 4623 static bool 4624 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4625 uint32_t reconnect_delay_sec, 4626 uint32_t fast_io_fail_timeout_sec) 4627 { 4628 if (ctrlr_loss_timeout_sec < -1) { 4629 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 4630 return false; 4631 } else if (ctrlr_loss_timeout_sec == -1) { 4632 if (reconnect_delay_sec == 0) { 4633 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 4634 return false; 4635 } else if (fast_io_fail_timeout_sec != 0 && 4636 fast_io_fail_timeout_sec < reconnect_delay_sec) { 4637 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 4638 return false; 4639 } 4640 } else if (ctrlr_loss_timeout_sec != 0) { 4641 if (reconnect_delay_sec == 0) { 4642 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 4643 return false; 4644 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 4645 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 4646 return false; 4647 } else if (fast_io_fail_timeout_sec != 0) { 4648 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 4649 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 4650 return false; 4651 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 4652 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 4653 return false; 4654 } 4655 } 4656 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 4657 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 4658 return false; 4659 } 4660 4661 return true; 4662 } 4663 4664 int 4665 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 4666 const char *base_name, 4667 const char **names, 4668 uint32_t count, 4669 spdk_bdev_create_nvme_fn cb_fn, 4670 void *cb_ctx, 4671 struct spdk_nvme_ctrlr_opts *drv_opts, 4672 struct nvme_ctrlr_opts *bdev_opts, 4673 bool multipath) 4674 { 4675 struct nvme_probe_skip_entry *entry, *tmp; 4676 struct nvme_async_probe_ctx *ctx; 4677 spdk_nvme_attach_cb attach_cb; 4678 4679 /* TODO expand this check to include both the host and target TRIDs. 4680 * Only if both are the same should we fail. 
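	 * For now the lookup below matches on the target trid alone.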
4681 */ 4682 if (nvme_ctrlr_get(trid) != NULL) { 4683 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 4684 return -EEXIST; 4685 } 4686 4687 if (bdev_opts != NULL && 4688 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 4689 bdev_opts->reconnect_delay_sec, 4690 bdev_opts->fast_io_fail_timeout_sec)) { 4691 return -EINVAL; 4692 } 4693 4694 ctx = calloc(1, sizeof(*ctx)); 4695 if (!ctx) { 4696 return -ENOMEM; 4697 } 4698 ctx->base_name = base_name; 4699 ctx->names = names; 4700 ctx->count = count; 4701 ctx->cb_fn = cb_fn; 4702 ctx->cb_ctx = cb_ctx; 4703 ctx->trid = *trid; 4704 4705 if (bdev_opts) { 4706 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 4707 } else { 4708 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 4709 } 4710 4711 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4712 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 4713 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4714 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 4715 free(entry); 4716 break; 4717 } 4718 } 4719 } 4720 4721 if (drv_opts) { 4722 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 4723 } else { 4724 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 4725 } 4726 4727 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 4728 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 4729 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 4730 ctx->drv_opts.disable_read_ana_log_page = true; 4731 4732 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 4733 attach_cb = connect_attach_cb; 4734 } else { 4735 attach_cb = connect_set_failover_cb; 4736 } 4737 4738 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 4739 if (ctx->probe_ctx == NULL) { 4740 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 4741 free(ctx); 4742 return -ENODEV; 4743 } 4744 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 4745 4746 return 0; 4747 } 4748 4749 int 4750 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 4751 { 4752 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4753 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 4754 struct nvme_path_id *p, *t; 4755 int rc = -ENXIO; 4756 4757 if (name == NULL || path_id == NULL) { 4758 return -EINVAL; 4759 } 4760 4761 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4762 if (nbdev_ctrlr == NULL) { 4763 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 4764 return -ENODEV; 4765 } 4766 4767 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 4768 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 4769 if (path_id->trid.trtype != 0) { 4770 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 4771 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 4772 continue; 4773 } 4774 } else { 4775 if (path_id->trid.trtype != p->trid.trtype) { 4776 continue; 4777 } 4778 } 4779 } 4780 4781 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 4782 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 4783 continue; 4784 } 4785 } 4786 4787 if (path_id->trid.adrfam != 0) { 4788 if (path_id->trid.adrfam != p->trid.adrfam) { 4789 continue; 4790 } 4791 } 4792 4793 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 4794 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 4795 
continue; 4796 } 4797 } 4798 4799 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 4800 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 4801 continue; 4802 } 4803 } 4804 4805 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 4806 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 4807 continue; 4808 } 4809 } 4810 4811 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 4812 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 4813 continue; 4814 } 4815 } 4816 4817 /* If we made it here, then this path is a match! Now we need to remove it. */ 4818 if (p == nvme_ctrlr->active_path_id) { 4819 /* This is the active path in use right now. The active path is always the first in the list. */ 4820 4821 if (!TAILQ_NEXT(p, link)) { 4822 /* The current path is the only path. */ 4823 rc = _bdev_nvme_delete(nvme_ctrlr, false); 4824 } else { 4825 /* There is an alternative path. */ 4826 rc = bdev_nvme_failover(nvme_ctrlr, true); 4827 } 4828 } else { 4829 /* We are not using the specified path. */ 4830 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 4831 free(p); 4832 rc = 0; 4833 } 4834 4835 if (rc < 0 && rc != -ENXIO) { 4836 return rc; 4837 } 4838 4839 4840 } 4841 } 4842 4843 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 4844 return rc; 4845 } 4846 4847 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 4848 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4849 4850 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 4851 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4852 4853 struct discovery_entry_ctx { 4854 char name[128]; 4855 struct spdk_nvme_transport_id trid; 4856 struct spdk_nvme_ctrlr_opts drv_opts; 4857 struct spdk_nvmf_discovery_log_page_entry entry; 4858 TAILQ_ENTRY(discovery_entry_ctx) tailq; 4859 struct discovery_ctx *ctx; 4860 }; 4861 4862 struct discovery_ctx { 4863 char *name; 4864 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 4865 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 4866 void *cb_ctx; 4867 struct spdk_nvme_probe_ctx *probe_ctx; 4868 struct spdk_nvme_detach_ctx *detach_ctx; 4869 struct spdk_nvme_ctrlr *ctrlr; 4870 struct spdk_nvme_transport_id trid; 4871 struct discovery_entry_ctx *entry_ctx_in_use; 4872 struct spdk_poller *poller; 4873 struct spdk_nvme_ctrlr_opts drv_opts; 4874 struct nvme_ctrlr_opts bdev_opts; 4875 struct spdk_nvmf_discovery_log_page *log_page; 4876 TAILQ_ENTRY(discovery_ctx) tailq; 4877 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 4878 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 4879 int rc; 4880 bool wait_for_attach; 4881 uint64_t timeout_ticks; 4882 /* Denotes that the discovery service is being started. We're waiting 4883 * for the initial connection to the discovery controller to be 4884 * established and attach discovered NVM ctrlrs. 4885 */ 4886 bool initializing; 4887 /* Denotes if a discovery is currently in progress for this context. 4888 * That includes connecting to newly discovered subsystems. Used to 4889 * ensure we do not start a new discovery until an existing one is 4890 * complete. 4891 */ 4892 bool in_progress; 4893 4894 /* Denotes if another discovery is needed after the one in progress 4895 * completes. Set when we receive an AER completion while a discovery 4896 * is already in progress. 
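	 * The AER handler sets this flag instead of issuing a second, concurrent log page
	 * read.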
4897 */ 4898 bool pending; 4899 4900 /* Signal to the discovery context poller that it should stop the 4901 * discovery service, including detaching from the current discovery 4902 * controller. 4903 */ 4904 bool stop; 4905 4906 struct spdk_thread *calling_thread; 4907 uint32_t index; 4908 uint32_t attach_in_progress; 4909 char *hostnqn; 4910 }; 4911 4912 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 4913 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 4914 4915 static void get_discovery_log_page(struct discovery_ctx *ctx); 4916 4917 static void 4918 free_discovery_ctx(struct discovery_ctx *ctx) 4919 { 4920 free(ctx->log_page); 4921 free(ctx->hostnqn); 4922 free(ctx->name); 4923 free(ctx); 4924 } 4925 4926 static void 4927 discovery_complete(struct discovery_ctx *ctx) 4928 { 4929 ctx->initializing = false; 4930 ctx->in_progress = false; 4931 if (ctx->pending) { 4932 ctx->pending = false; 4933 get_discovery_log_page(ctx); 4934 } 4935 } 4936 4937 static void 4938 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 4939 struct spdk_nvmf_discovery_log_page_entry *entry) 4940 { 4941 char *space; 4942 4943 trid->trtype = entry->trtype; 4944 trid->adrfam = entry->adrfam; 4945 memcpy(trid->traddr, entry->traddr, sizeof(trid->traddr)); 4946 memcpy(trid->trsvcid, entry->trsvcid, sizeof(trid->trsvcid)); 4947 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 4948 4949 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 4950 * But the log page entries typically pad them with spaces, not zeroes. 4951 * So add a NULL terminator to each of these fields at the appropriate 4952 * location. 4953 */ 4954 space = strchr(trid->traddr, ' '); 4955 if (space) { 4956 *space = 0; 4957 } 4958 space = strchr(trid->trsvcid, ' '); 4959 if (space) { 4960 *space = 0; 4961 } 4962 space = strchr(trid->subnqn, ' '); 4963 if (space) { 4964 *space = 0; 4965 } 4966 } 4967 4968 static void 4969 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 4970 { 4971 ctx->stop = true; 4972 ctx->stop_cb_fn = cb_fn; 4973 ctx->cb_ctx = cb_ctx; 4974 4975 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 4976 struct discovery_entry_ctx *entry_ctx; 4977 struct nvme_path_id path = {}; 4978 4979 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 4980 path.trid = entry_ctx->trid; 4981 bdev_nvme_delete(entry_ctx->name, &path); 4982 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 4983 free(entry_ctx); 4984 } 4985 4986 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 4987 struct discovery_entry_ctx *entry_ctx; 4988 4989 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 4990 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 4991 free(entry_ctx); 4992 } 4993 4994 free(ctx->entry_ctx_in_use); 4995 ctx->entry_ctx_in_use = NULL; 4996 } 4997 4998 static void 4999 discovery_remove_controllers(struct discovery_ctx *ctx) 5000 { 5001 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 5002 struct discovery_entry_ctx *entry_ctx, *tmp; 5003 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5004 struct spdk_nvme_transport_id old_trid; 5005 uint64_t numrec, i; 5006 bool found; 5007 5008 numrec = from_le64(&log_page->numrec); 5009 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 5010 found = false; 5011 old_entry = &entry_ctx->entry; 5012 build_trid_from_log_page_entry(&old_trid, old_entry); 5013 for (i = 0; i < numrec; i++) { 5014 new_entry = &log_page->entries[i]; 5015 if 
(!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 5016 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 5017 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5018 found = true; 5019 break; 5020 } 5021 } 5022 if (!found) { 5023 struct nvme_path_id path = {}; 5024 5025 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 5026 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5027 5028 path.trid = entry_ctx->trid; 5029 bdev_nvme_delete(entry_ctx->name, &path); 5030 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5031 free(entry_ctx); 5032 } 5033 } 5034 free(log_page); 5035 ctx->log_page = NULL; 5036 discovery_complete(ctx); 5037 } 5038 5039 static void 5040 complete_discovery_start(struct discovery_ctx *ctx, int status) 5041 { 5042 ctx->timeout_ticks = 0; 5043 ctx->rc = status; 5044 if (ctx->start_cb_fn) { 5045 ctx->start_cb_fn(ctx->cb_ctx, status); 5046 ctx->start_cb_fn = NULL; 5047 ctx->cb_ctx = NULL; 5048 } 5049 } 5050 5051 static void 5052 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 5053 { 5054 struct discovery_entry_ctx *entry_ctx = cb_ctx; 5055 struct discovery_ctx *ctx = entry_ctx->ctx; 5056 5057 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 5058 ctx->attach_in_progress--; 5059 if (ctx->attach_in_progress == 0) { 5060 complete_discovery_start(ctx, ctx->rc); 5061 if (ctx->initializing && ctx->rc != 0) { 5062 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 5063 stop_discovery(ctx, NULL, ctx->cb_ctx); 5064 } else { 5065 discovery_remove_controllers(ctx); 5066 } 5067 } 5068 } 5069 5070 static struct discovery_entry_ctx * 5071 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 5072 { 5073 struct discovery_entry_ctx *new_ctx; 5074 5075 new_ctx = calloc(1, sizeof(*new_ctx)); 5076 if (new_ctx == NULL) { 5077 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5078 return NULL; 5079 } 5080 5081 new_ctx->ctx = ctx; 5082 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 5083 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5084 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5085 return new_ctx; 5086 } 5087 5088 static void 5089 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 5090 struct spdk_nvmf_discovery_log_page *log_page) 5091 { 5092 struct discovery_ctx *ctx = cb_arg; 5093 struct discovery_entry_ctx *entry_ctx, *tmp; 5094 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5095 uint64_t numrec, i; 5096 bool found; 5097 5098 if (rc || spdk_nvme_cpl_is_error(cpl)) { 5099 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5100 return; 5101 } 5102 5103 ctx->log_page = log_page; 5104 assert(ctx->attach_in_progress == 0); 5105 numrec = from_le64(&log_page->numrec); 5106 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 5107 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5108 free(entry_ctx); 5109 } 5110 for (i = 0; i < numrec; i++) { 5111 found = false; 5112 new_entry = &log_page->entries[i]; 5113 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 5114 struct discovery_entry_ctx *new_ctx; 5115 struct spdk_nvme_transport_id trid = {}; 5116 5117 build_trid_from_log_page_entry(&trid, new_entry); 5118 new_ctx = create_discovery_entry_ctx(ctx, &trid); 5119 if (new_ctx == NULL) { 5120 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5121 break; 5122 } 5123 5124 
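			/* This entry refers to another discovery subsystem. Track it as a
			 * candidate discovery connection instead of attaching it as an NVM
			 * controller.
			 */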
TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 5125 continue; 5126 } 5127 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 5128 old_entry = &entry_ctx->entry; 5129 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 5130 found = true; 5131 break; 5132 } 5133 } 5134 if (!found) { 5135 struct discovery_entry_ctx *subnqn_ctx, *new_ctx; 5136 5137 TAILQ_FOREACH(subnqn_ctx, &ctx->nvm_entry_ctxs, tailq) { 5138 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 5139 sizeof(new_entry->subnqn))) { 5140 break; 5141 } 5142 } 5143 5144 new_ctx = calloc(1, sizeof(*new_ctx)); 5145 if (new_ctx == NULL) { 5146 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5147 break; 5148 } 5149 5150 new_ctx->ctx = ctx; 5151 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 5152 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 5153 if (subnqn_ctx) { 5154 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 5155 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 5156 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5157 new_ctx->name); 5158 } else { 5159 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 5160 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 5161 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5162 new_ctx->name); 5163 } 5164 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5165 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5166 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 5167 discovery_attach_controller_done, new_ctx, 5168 &new_ctx->drv_opts, &ctx->bdev_opts, true); 5169 if (rc == 0) { 5170 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 5171 ctx->attach_in_progress++; 5172 } else { 5173 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 5174 } 5175 } 5176 } 5177 5178 if (ctx->attach_in_progress == 0) { 5179 discovery_remove_controllers(ctx); 5180 } 5181 } 5182 5183 static void 5184 get_discovery_log_page(struct discovery_ctx *ctx) 5185 { 5186 int rc; 5187 5188 assert(ctx->in_progress == false); 5189 ctx->in_progress = true; 5190 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 5191 if (rc != 0) { 5192 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5193 } 5194 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 5195 } 5196 5197 static void 5198 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5199 { 5200 struct discovery_ctx *ctx = arg; 5201 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 5202 5203 if (spdk_nvme_cpl_is_error(cpl)) { 5204 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 5205 return; 5206 } 5207 5208 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 5209 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 5210 return; 5211 } 5212 5213 DISCOVERY_INFOLOG(ctx, "got aer\n"); 5214 if (ctx->in_progress) { 5215 ctx->pending = true; 5216 return; 5217 } 5218 5219 get_discovery_log_page(ctx); 5220 } 5221 5222 static void 5223 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5224 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5225 { 5226 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5227 struct discovery_ctx *ctx; 5228 5229 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 5230 5231 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 5232 ctx->probe_ctx = 
NULL; 5233 ctx->ctrlr = ctrlr; 5234 5235 if (ctx->rc != 0) { 5236 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 5237 ctx->rc); 5238 return; 5239 } 5240 5241 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 5242 } 5243 5244 static int 5245 discovery_poller(void *arg) 5246 { 5247 struct discovery_ctx *ctx = arg; 5248 struct spdk_nvme_transport_id *trid; 5249 int rc; 5250 5251 if (ctx->detach_ctx) { 5252 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 5253 if (rc != -EAGAIN) { 5254 ctx->detach_ctx = NULL; 5255 ctx->ctrlr = NULL; 5256 } 5257 } else if (ctx->stop) { 5258 if (ctx->ctrlr != NULL) { 5259 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5260 if (rc == 0) { 5261 return SPDK_POLLER_BUSY; 5262 } 5263 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5264 } 5265 spdk_poller_unregister(&ctx->poller); 5266 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5267 assert(ctx->start_cb_fn == NULL); 5268 if (ctx->stop_cb_fn != NULL) { 5269 ctx->stop_cb_fn(ctx->cb_ctx); 5270 } 5271 free_discovery_ctx(ctx); 5272 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 5273 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5274 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5275 assert(ctx->initializing); 5276 spdk_poller_unregister(&ctx->poller); 5277 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5278 complete_discovery_start(ctx, -ETIMEDOUT); 5279 stop_discovery(ctx, NULL, NULL); 5280 free_discovery_ctx(ctx); 5281 return SPDK_POLLER_BUSY; 5282 } 5283 5284 assert(ctx->entry_ctx_in_use == NULL); 5285 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5286 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5287 trid = &ctx->entry_ctx_in_use->trid; 5288 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 5289 if (ctx->probe_ctx) { 5290 spdk_poller_unregister(&ctx->poller); 5291 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 5292 } else { 5293 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 5294 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5295 ctx->entry_ctx_in_use = NULL; 5296 } 5297 } else if (ctx->probe_ctx) { 5298 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5299 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5300 complete_discovery_start(ctx, -ETIMEDOUT); 5301 return SPDK_POLLER_BUSY; 5302 } 5303 5304 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5305 if (rc != -EAGAIN) { 5306 if (ctx->rc != 0) { 5307 assert(ctx->initializing); 5308 stop_discovery(ctx, NULL, ctx->cb_ctx); 5309 } else { 5310 assert(rc == 0); 5311 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 5312 ctx->rc = rc; 5313 get_discovery_log_page(ctx); 5314 } 5315 } 5316 } else { 5317 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5318 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 5319 complete_discovery_start(ctx, -ETIMEDOUT); 5320 /* We need to wait until all NVM ctrlrs are attached before we stop the 5321 * discovery service to make sure we don't detach a ctrlr that is still 5322 * being attached. 
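			 * The start failure has already been reported via complete_discovery_start();
			 * here we only decide whether it is safe to tear the service down yet.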
5323 */ 5324 if (ctx->attach_in_progress == 0) { 5325 stop_discovery(ctx, NULL, ctx->cb_ctx); 5326 return SPDK_POLLER_BUSY; 5327 } 5328 } 5329 5330 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 5331 if (rc < 0) { 5332 spdk_poller_unregister(&ctx->poller); 5333 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5334 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5335 ctx->entry_ctx_in_use = NULL; 5336 5337 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5338 if (rc != 0) { 5339 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5340 ctx->ctrlr = NULL; 5341 } 5342 } 5343 } 5344 5345 return SPDK_POLLER_BUSY; 5346 } 5347 5348 static void 5349 start_discovery_poller(void *arg) 5350 { 5351 struct discovery_ctx *ctx = arg; 5352 5353 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 5354 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5355 } 5356 5357 int 5358 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 5359 const char *base_name, 5360 struct spdk_nvme_ctrlr_opts *drv_opts, 5361 struct nvme_ctrlr_opts *bdev_opts, 5362 uint64_t attach_timeout, 5363 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 5364 { 5365 struct discovery_ctx *ctx; 5366 struct discovery_entry_ctx *discovery_entry_ctx; 5367 5368 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 5369 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5370 if (strcmp(ctx->name, base_name) == 0) { 5371 return -EEXIST; 5372 } 5373 5374 if (ctx->entry_ctx_in_use != NULL) { 5375 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 5376 return -EEXIST; 5377 } 5378 } 5379 5380 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 5381 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 5382 return -EEXIST; 5383 } 5384 } 5385 } 5386 5387 ctx = calloc(1, sizeof(*ctx)); 5388 if (ctx == NULL) { 5389 return -ENOMEM; 5390 } 5391 5392 ctx->name = strdup(base_name); 5393 if (ctx->name == NULL) { 5394 free_discovery_ctx(ctx); 5395 return -ENOMEM; 5396 } 5397 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5398 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5399 ctx->bdev_opts.from_discovery_service = true; 5400 ctx->calling_thread = spdk_get_thread(); 5401 ctx->start_cb_fn = cb_fn; 5402 ctx->cb_ctx = cb_ctx; 5403 ctx->initializing = true; 5404 if (ctx->start_cb_fn) { 5405 /* We can use this when dumping json to denote if this RPC parameter 5406 * was specified or not. 
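		 * wait_for_attach is therefore set only when the caller supplied a completion
		 * callback.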
5407 */ 5408 ctx->wait_for_attach = true; 5409 } 5410 if (attach_timeout != 0) { 5411 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 5412 spdk_get_ticks_hz() / 1000ull; 5413 } 5414 TAILQ_INIT(&ctx->nvm_entry_ctxs); 5415 TAILQ_INIT(&ctx->discovery_entry_ctxs); 5416 memcpy(&ctx->trid, trid, sizeof(*trid)); 5417 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 5418 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 5419 if (ctx->hostnqn == NULL) { 5420 free_discovery_ctx(ctx); 5421 return -ENOMEM; 5422 } 5423 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 5424 if (discovery_entry_ctx == NULL) { 5425 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5426 free_discovery_ctx(ctx); 5427 return -ENOMEM; 5428 } 5429 5430 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 5431 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 5432 return 0; 5433 } 5434 5435 int 5436 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5437 { 5438 struct discovery_ctx *ctx; 5439 5440 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5441 if (strcmp(name, ctx->name) == 0) { 5442 if (ctx->stop) { 5443 return -EALREADY; 5444 } 5445 /* If we're still starting the discovery service and ->rc is non-zero, we're 5446 * going to stop it as soon as we can 5447 */ 5448 if (ctx->initializing && ctx->rc != 0) { 5449 return -EALREADY; 5450 } 5451 stop_discovery(ctx, cb_fn, cb_ctx); 5452 return 0; 5453 } 5454 } 5455 5456 return -ENOENT; 5457 } 5458 5459 static int 5460 bdev_nvme_library_init(void) 5461 { 5462 g_bdev_nvme_init_thread = spdk_get_thread(); 5463 5464 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 5465 bdev_nvme_destroy_poll_group_cb, 5466 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 5467 5468 return 0; 5469 } 5470 5471 static void 5472 bdev_nvme_fini_destruct_ctrlrs(void) 5473 { 5474 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5475 struct nvme_ctrlr *nvme_ctrlr; 5476 5477 pthread_mutex_lock(&g_bdev_nvme_mutex); 5478 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 5479 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5480 pthread_mutex_lock(&nvme_ctrlr->mutex); 5481 if (nvme_ctrlr->destruct) { 5482 /* This controller's destruction was already started 5483 * before the application started shutting down 5484 */ 5485 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5486 continue; 5487 } 5488 nvme_ctrlr->destruct = true; 5489 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5490 5491 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 5492 nvme_ctrlr); 5493 } 5494 } 5495 5496 g_bdev_nvme_module_finish = true; 5497 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5498 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5499 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 5500 spdk_bdev_module_fini_done(); 5501 return; 5502 } 5503 5504 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5505 } 5506 5507 static void 5508 check_discovery_fini(void *arg) 5509 { 5510 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5511 bdev_nvme_fini_destruct_ctrlrs(); 5512 } 5513 } 5514 5515 static void 5516 bdev_nvme_library_fini(void) 5517 { 5518 struct nvme_probe_skip_entry *entry, *entry_tmp; 5519 struct discovery_ctx *ctx; 5520 5521 spdk_poller_unregister(&g_hotplug_poller); 5522 free(g_hotplug_probe_ctx); 5523 g_hotplug_probe_ctx = NULL; 5524 5525 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 5526 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5527 
free(entry); 5528 } 5529 5530 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 5531 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5532 bdev_nvme_fini_destruct_ctrlrs(); 5533 } else { 5534 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5535 stop_discovery(ctx, check_discovery_fini, NULL); 5536 } 5537 } 5538 } 5539 5540 static void 5541 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 5542 { 5543 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5544 struct spdk_bdev *bdev = bdev_io->bdev; 5545 struct spdk_dif_ctx dif_ctx; 5546 struct spdk_dif_error err_blk = {}; 5547 int rc; 5548 5549 rc = spdk_dif_ctx_init(&dif_ctx, 5550 bdev->blocklen, bdev->md_len, bdev->md_interleave, 5551 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 5552 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 5553 if (rc != 0) { 5554 SPDK_ERRLOG("Initialization of DIF context failed\n"); 5555 return; 5556 } 5557 5558 if (bdev->md_interleave) { 5559 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5560 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5561 } else { 5562 struct iovec md_iov = { 5563 .iov_base = bdev_io->u.bdev.md_buf, 5564 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 5565 }; 5566 5567 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5568 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5569 } 5570 5571 if (rc != 0) { 5572 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 5573 err_blk.err_type, err_blk.err_offset); 5574 } else { 5575 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 5576 } 5577 } 5578 5579 static void 5580 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5581 { 5582 struct nvme_bdev_io *bio = ref; 5583 5584 if (spdk_nvme_cpl_is_success(cpl)) { 5585 /* Run PI verification for read data buffer. */ 5586 bdev_nvme_verify_pi_error(bio); 5587 } 5588 5589 /* Return original completion status */ 5590 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5591 } 5592 5593 static void 5594 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5595 { 5596 struct nvme_bdev_io *bio = ref; 5597 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5598 int ret; 5599 5600 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 5601 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 5602 cpl->status.sct, cpl->status.sc); 5603 5604 /* Save completion status to use after verifying PI error. */ 5605 bio->cpl = *cpl; 5606 5607 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 5608 /* Read without PI checking to verify PI error. */ 5609 ret = bdev_nvme_no_pi_readv(bio, 5610 bdev_io->u.bdev.iovs, 5611 bdev_io->u.bdev.iovcnt, 5612 bdev_io->u.bdev.md_buf, 5613 bdev_io->u.bdev.num_blocks, 5614 bdev_io->u.bdev.offset_blocks); 5615 if (ret == 0) { 5616 return; 5617 } 5618 } 5619 } 5620 5621 bdev_nvme_io_complete_nvme_status(bio, cpl); 5622 } 5623 5624 static void 5625 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5626 { 5627 struct nvme_bdev_io *bio = ref; 5628 5629 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5630 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 5631 cpl->status.sct, cpl->status.sc); 5632 /* Run PI verification for write data buffer if PI error is detected. 
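		 * The verification only logs diagnostic details; the original NVMe status is
		 * still returned to the bdev layer below.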
*/ 5633 bdev_nvme_verify_pi_error(bio); 5634 } 5635 5636 bdev_nvme_io_complete_nvme_status(bio, cpl); 5637 } 5638 5639 static void 5640 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5641 { 5642 struct nvme_bdev_io *bio = ref; 5643 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5644 5645 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 5646 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 5647 */ 5648 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 5649 5650 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5651 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 5652 cpl->status.sct, cpl->status.sc); 5653 /* Run PI verification for zone append data buffer if PI error is detected. */ 5654 bdev_nvme_verify_pi_error(bio); 5655 } 5656 5657 bdev_nvme_io_complete_nvme_status(bio, cpl); 5658 } 5659 5660 static void 5661 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5662 { 5663 struct nvme_bdev_io *bio = ref; 5664 5665 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5666 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 5667 cpl->status.sct, cpl->status.sc); 5668 /* Run PI verification for compare data buffer if PI error is detected. */ 5669 bdev_nvme_verify_pi_error(bio); 5670 } 5671 5672 bdev_nvme_io_complete_nvme_status(bio, cpl); 5673 } 5674 5675 static void 5676 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5677 { 5678 struct nvme_bdev_io *bio = ref; 5679 5680 /* Compare operation completion */ 5681 if (!bio->first_fused_completed) { 5682 /* Save compare result for write callback */ 5683 bio->cpl = *cpl; 5684 bio->first_fused_completed = true; 5685 return; 5686 } 5687 5688 /* Write operation completion */ 5689 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 5690 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 5691 * complete the IO with the compare operation's status. 
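		 * A write that succeeds after its fused compare failed is unexpected, so it
		 * is logged below.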
5692 */ 5693 if (!spdk_nvme_cpl_is_error(cpl)) { 5694 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 5695 } 5696 5697 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5698 } else { 5699 bdev_nvme_io_complete_nvme_status(bio, cpl); 5700 } 5701 } 5702 5703 static void 5704 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 5705 { 5706 struct nvme_bdev_io *bio = ref; 5707 5708 bdev_nvme_io_complete_nvme_status(bio, cpl); 5709 } 5710 5711 static int 5712 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 5713 { 5714 switch (desc->zt) { 5715 case SPDK_NVME_ZONE_TYPE_SEQWR: 5716 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 5717 break; 5718 default: 5719 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 5720 return -EIO; 5721 } 5722 5723 switch (desc->zs) { 5724 case SPDK_NVME_ZONE_STATE_EMPTY: 5725 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 5726 break; 5727 case SPDK_NVME_ZONE_STATE_IOPEN: 5728 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 5729 break; 5730 case SPDK_NVME_ZONE_STATE_EOPEN: 5731 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 5732 break; 5733 case SPDK_NVME_ZONE_STATE_CLOSED: 5734 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 5735 break; 5736 case SPDK_NVME_ZONE_STATE_RONLY: 5737 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 5738 break; 5739 case SPDK_NVME_ZONE_STATE_FULL: 5740 info->state = SPDK_BDEV_ZONE_STATE_FULL; 5741 break; 5742 case SPDK_NVME_ZONE_STATE_OFFLINE: 5743 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 5744 break; 5745 default: 5746 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 5747 return -EIO; 5748 } 5749 5750 info->zone_id = desc->zslba; 5751 info->write_pointer = desc->wp; 5752 info->capacity = desc->zcap; 5753 5754 return 0; 5755 } 5756 5757 static void 5758 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 5759 { 5760 struct nvme_bdev_io *bio = ref; 5761 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5762 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 5763 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 5764 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 5765 uint64_t max_zones_per_buf, i; 5766 uint32_t zone_report_bufsize; 5767 struct spdk_nvme_ns *ns; 5768 struct spdk_nvme_qpair *qpair; 5769 int ret; 5770 5771 if (spdk_nvme_cpl_is_error(cpl)) { 5772 goto out_complete_io_nvme_cpl; 5773 } 5774 5775 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 5776 ret = -ENXIO; 5777 goto out_complete_io_ret; 5778 } 5779 5780 ns = bio->io_path->nvme_ns->ns; 5781 qpair = bio->io_path->qpair->qpair; 5782 5783 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 5784 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 5785 sizeof(bio->zone_report_buf->descs[0]); 5786 5787 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 5788 ret = -EINVAL; 5789 goto out_complete_io_ret; 5790 } 5791 5792 if (!bio->zone_report_buf->nr_zones) { 5793 ret = -EINVAL; 5794 goto out_complete_io_ret; 5795 } 5796 5797 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 5798 ret = fill_zone_from_report(&info[bio->handled_zones], 5799 &bio->zone_report_buf->descs[i]); 5800 if (ret) { 5801 goto out_complete_io_ret; 5802 } 5803 bio->handled_zones++; 5804 } 5805 5806 if (bio->handled_zones < zones_to_copy) { 5807 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 5808 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 5809 
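		/* Not all requested zones fit in the previous report; issue another report
		 * starting at the next unhandled zone.
		 */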
5810 memset(bio->zone_report_buf, 0, zone_report_bufsize); 5811 ret = spdk_nvme_zns_report_zones(ns, qpair, 5812 bio->zone_report_buf, zone_report_bufsize, 5813 slba, SPDK_NVME_ZRA_LIST_ALL, true, 5814 bdev_nvme_get_zone_info_done, bio); 5815 if (!ret) { 5816 return; 5817 } else { 5818 goto out_complete_io_ret; 5819 } 5820 } 5821 5822 out_complete_io_nvme_cpl: 5823 free(bio->zone_report_buf); 5824 bio->zone_report_buf = NULL; 5825 bdev_nvme_io_complete_nvme_status(bio, cpl); 5826 return; 5827 5828 out_complete_io_ret: 5829 free(bio->zone_report_buf); 5830 bio->zone_report_buf = NULL; 5831 bdev_nvme_io_complete(bio, ret); 5832 } 5833 5834 static void 5835 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 5836 { 5837 struct nvme_bdev_io *bio = ref; 5838 5839 bdev_nvme_io_complete_nvme_status(bio, cpl); 5840 } 5841 5842 static void 5843 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 5844 { 5845 struct nvme_bdev_io *bio = ctx; 5846 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5847 const struct spdk_nvme_cpl *cpl = &bio->cpl; 5848 5849 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 5850 5851 __bdev_nvme_io_complete(bdev_io, 0, cpl); 5852 } 5853 5854 static void 5855 bdev_nvme_abort_complete(void *ctx) 5856 { 5857 struct nvme_bdev_io *bio = ctx; 5858 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5859 5860 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 5861 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 5862 } else { 5863 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 5864 } 5865 } 5866 5867 static void 5868 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 5869 { 5870 struct nvme_bdev_io *bio = ref; 5871 5872 bio->cpl = *cpl; 5873 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 5874 } 5875 5876 static void 5877 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 5878 { 5879 struct nvme_bdev_io *bio = ref; 5880 5881 bio->cpl = *cpl; 5882 spdk_thread_send_msg(bio->orig_thread, 5883 bdev_nvme_admin_passthru_complete_nvme_status, bio); 5884 } 5885 5886 static void 5887 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 5888 { 5889 struct nvme_bdev_io *bio = ref; 5890 struct iovec *iov; 5891 5892 bio->iov_offset = sgl_offset; 5893 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 5894 iov = &bio->iovs[bio->iovpos]; 5895 if (bio->iov_offset < iov->iov_len) { 5896 break; 5897 } 5898 5899 bio->iov_offset -= iov->iov_len; 5900 } 5901 } 5902 5903 static int 5904 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 5905 { 5906 struct nvme_bdev_io *bio = ref; 5907 struct iovec *iov; 5908 5909 assert(bio->iovpos < bio->iovcnt); 5910 5911 iov = &bio->iovs[bio->iovpos]; 5912 5913 *address = iov->iov_base; 5914 *length = iov->iov_len; 5915 5916 if (bio->iov_offset) { 5917 assert(bio->iov_offset <= iov->iov_len); 5918 *address += bio->iov_offset; 5919 *length -= bio->iov_offset; 5920 } 5921 5922 bio->iov_offset += *length; 5923 if (bio->iov_offset == iov->iov_len) { 5924 bio->iovpos++; 5925 bio->iov_offset = 0; 5926 } 5927 5928 return 0; 5929 } 5930 5931 static void 5932 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 5933 { 5934 struct nvme_bdev_io *bio = ref; 5935 struct iovec *iov; 5936 5937 bio->fused_iov_offset = sgl_offset; 5938 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 5939 iov = &bio->fused_iovs[bio->fused_iovpos]; 5940 if 
(bio->fused_iov_offset < iov->iov_len) { 5941 break; 5942 } 5943 5944 bio->fused_iov_offset -= iov->iov_len; 5945 } 5946 } 5947 5948 static int 5949 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 5950 { 5951 struct nvme_bdev_io *bio = ref; 5952 struct iovec *iov; 5953 5954 assert(bio->fused_iovpos < bio->fused_iovcnt); 5955 5956 iov = &bio->fused_iovs[bio->fused_iovpos]; 5957 5958 *address = iov->iov_base; 5959 *length = iov->iov_len; 5960 5961 if (bio->fused_iov_offset) { 5962 assert(bio->fused_iov_offset <= iov->iov_len); 5963 *address += bio->fused_iov_offset; 5964 *length -= bio->fused_iov_offset; 5965 } 5966 5967 bio->fused_iov_offset += *length; 5968 if (bio->fused_iov_offset == iov->iov_len) { 5969 bio->fused_iovpos++; 5970 bio->fused_iov_offset = 0; 5971 } 5972 5973 return 0; 5974 } 5975 5976 static int 5977 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5978 void *md, uint64_t lba_count, uint64_t lba) 5979 { 5980 int rc; 5981 5982 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 5983 lba_count, lba); 5984 5985 bio->iovs = iov; 5986 bio->iovcnt = iovcnt; 5987 bio->iovpos = 0; 5988 bio->iov_offset = 0; 5989 5990 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 5991 bio->io_path->qpair->qpair, 5992 lba, lba_count, 5993 bdev_nvme_no_pi_readv_done, bio, 0, 5994 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5995 md, 0, 0); 5996 5997 if (rc != 0 && rc != -ENOMEM) { 5998 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 5999 } 6000 return rc; 6001 } 6002 6003 static int 6004 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6005 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 6006 struct spdk_bdev_ext_io_opts *ext_opts) 6007 { 6008 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6009 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6010 int rc; 6011 6012 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6013 lba_count, lba); 6014 6015 bio->iovs = iov; 6016 bio->iovcnt = iovcnt; 6017 bio->iovpos = 0; 6018 bio->iov_offset = 0; 6019 6020 if (ext_opts) { 6021 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6022 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6023 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6024 bio->ext_opts.io_flags = flags; 6025 bio->ext_opts.metadata = md; 6026 6027 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 6028 bdev_nvme_readv_done, bio, 6029 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6030 &bio->ext_opts); 6031 } else if (iovcnt == 1) { 6032 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 6033 lba_count, 6034 bdev_nvme_readv_done, bio, 6035 flags, 6036 0, 0); 6037 } else { 6038 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 6039 bdev_nvme_readv_done, bio, flags, 6040 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6041 md, 0, 0); 6042 } 6043 6044 if (rc != 0 && rc != -ENOMEM) { 6045 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 6046 } 6047 return rc; 6048 } 6049 6050 static int 6051 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6052 void *md, uint64_t lba_count, uint64_t lba, 6053 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts) 6054 { 6055 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6056 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6057 int rc; 6058 6059 SPDK_DEBUGLOG(bdev_nvme, "write %" 
PRIu64 " blocks with offset %#" PRIx64 "\n", 6060 lba_count, lba); 6061 6062 bio->iovs = iov; 6063 bio->iovcnt = iovcnt; 6064 bio->iovpos = 0; 6065 bio->iov_offset = 0; 6066 6067 if (ext_opts) { 6068 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6069 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6070 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6071 bio->ext_opts.io_flags = flags; 6072 bio->ext_opts.metadata = md; 6073 6074 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 6075 bdev_nvme_writev_done, bio, 6076 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6077 &bio->ext_opts); 6078 } else if (iovcnt == 1) { 6079 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 6080 lba_count, 6081 bdev_nvme_writev_done, bio, 6082 flags, 6083 0, 0); 6084 } else { 6085 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6086 bdev_nvme_writev_done, bio, flags, 6087 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6088 md, 0, 0); 6089 } 6090 6091 if (rc != 0 && rc != -ENOMEM) { 6092 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 6093 } 6094 return rc; 6095 } 6096 6097 static int 6098 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6099 void *md, uint64_t lba_count, uint64_t zslba, 6100 uint32_t flags) 6101 { 6102 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6103 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6104 int rc; 6105 6106 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 6107 lba_count, zslba); 6108 6109 bio->iovs = iov; 6110 bio->iovcnt = iovcnt; 6111 bio->iovpos = 0; 6112 bio->iov_offset = 0; 6113 6114 if (iovcnt == 1) { 6115 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 6116 lba_count, 6117 bdev_nvme_zone_appendv_done, bio, 6118 flags, 6119 0, 0); 6120 } else { 6121 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 6122 bdev_nvme_zone_appendv_done, bio, flags, 6123 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6124 md, 0, 0); 6125 } 6126 6127 if (rc != 0 && rc != -ENOMEM) { 6128 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 6129 } 6130 return rc; 6131 } 6132 6133 static int 6134 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6135 void *md, uint64_t lba_count, uint64_t lba, 6136 uint32_t flags) 6137 { 6138 int rc; 6139 6140 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6141 lba_count, lba); 6142 6143 bio->iovs = iov; 6144 bio->iovcnt = iovcnt; 6145 bio->iovpos = 0; 6146 bio->iov_offset = 0; 6147 6148 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 6149 bio->io_path->qpair->qpair, 6150 lba, lba_count, 6151 bdev_nvme_comparev_done, bio, flags, 6152 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6153 md, 0, 0); 6154 6155 if (rc != 0 && rc != -ENOMEM) { 6156 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 6157 } 6158 return rc; 6159 } 6160 6161 static int 6162 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 6163 struct iovec *write_iov, int write_iovcnt, 6164 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 6165 { 6166 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6167 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6168 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6169 int rc; 6170 6171 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" 
PRIx64 "\n", 6172 lba_count, lba); 6173 6174 bio->iovs = cmp_iov; 6175 bio->iovcnt = cmp_iovcnt; 6176 bio->iovpos = 0; 6177 bio->iov_offset = 0; 6178 bio->fused_iovs = write_iov; 6179 bio->fused_iovcnt = write_iovcnt; 6180 bio->fused_iovpos = 0; 6181 bio->fused_iov_offset = 0; 6182 6183 if (bdev_io->num_retries == 0) { 6184 bio->first_fused_submitted = false; 6185 bio->first_fused_completed = false; 6186 } 6187 6188 if (!bio->first_fused_submitted) { 6189 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6190 memset(&bio->cpl, 0, sizeof(bio->cpl)); 6191 6192 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 6193 bdev_nvme_comparev_and_writev_done, bio, flags, 6194 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 6195 if (rc == 0) { 6196 bio->first_fused_submitted = true; 6197 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6198 } else { 6199 if (rc != -ENOMEM) { 6200 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 6201 } 6202 return rc; 6203 } 6204 } 6205 6206 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 6207 6208 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6209 bdev_nvme_comparev_and_writev_done, bio, flags, 6210 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 6211 if (rc != 0 && rc != -ENOMEM) { 6212 SPDK_ERRLOG("write failed: rc = %d\n", rc); 6213 rc = 0; 6214 } 6215 6216 return rc; 6217 } 6218 6219 static int 6220 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6221 { 6222 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 6223 struct spdk_nvme_dsm_range *range; 6224 uint64_t offset, remaining; 6225 uint64_t num_ranges_u64; 6226 uint16_t num_ranges; 6227 int rc; 6228 6229 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 6230 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6231 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 6232 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 6233 return -EINVAL; 6234 } 6235 num_ranges = (uint16_t)num_ranges_u64; 6236 6237 offset = offset_blocks; 6238 remaining = num_blocks; 6239 range = &dsm_ranges[0]; 6240 6241 /* Fill max-size ranges until the remaining blocks fit into one range */ 6242 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 6243 range->attributes.raw = 0; 6244 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6245 range->starting_lba = offset; 6246 6247 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6248 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6249 range++; 6250 } 6251 6252 /* Final range describes the remaining blocks */ 6253 range->attributes.raw = 0; 6254 range->length = remaining; 6255 range->starting_lba = offset; 6256 6257 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 6258 bio->io_path->qpair->qpair, 6259 SPDK_NVME_DSM_ATTR_DEALLOCATE, 6260 dsm_ranges, num_ranges, 6261 bdev_nvme_queued_done, bio); 6262 6263 return rc; 6264 } 6265 6266 static int 6267 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6268 { 6269 if (num_blocks > UINT16_MAX + 1) { 6270 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 6271 return -EINVAL; 6272 } 6273 6274 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 6275 bio->io_path->qpair->qpair, 6276 offset_blocks, num_blocks, 6277 bdev_nvme_queued_done, bio, 6278 0); 6279 } 6280 6281 static int 6282 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t 
zone_id, uint32_t num_zones, 6283 struct spdk_bdev_zone_info *info) 6284 { 6285 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6286 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6287 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6288 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6289 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 6290 6291 if (zone_id % zone_size != 0) { 6292 return -EINVAL; 6293 } 6294 6295 if (num_zones > total_zones || !num_zones) { 6296 return -EINVAL; 6297 } 6298 6299 assert(!bio->zone_report_buf); 6300 bio->zone_report_buf = calloc(1, zone_report_bufsize); 6301 if (!bio->zone_report_buf) { 6302 return -ENOMEM; 6303 } 6304 6305 bio->handled_zones = 0; 6306 6307 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 6308 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 6309 bdev_nvme_get_zone_info_done, bio); 6310 } 6311 6312 static int 6313 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 6314 enum spdk_bdev_zone_action action) 6315 { 6316 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6317 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6318 6319 switch (action) { 6320 case SPDK_BDEV_ZONE_CLOSE: 6321 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 6322 bdev_nvme_zone_management_done, bio); 6323 case SPDK_BDEV_ZONE_FINISH: 6324 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 6325 bdev_nvme_zone_management_done, bio); 6326 case SPDK_BDEV_ZONE_OPEN: 6327 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 6328 bdev_nvme_zone_management_done, bio); 6329 case SPDK_BDEV_ZONE_RESET: 6330 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 6331 bdev_nvme_zone_management_done, bio); 6332 case SPDK_BDEV_ZONE_OFFLINE: 6333 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 6334 bdev_nvme_zone_management_done, bio); 6335 default: 6336 return -EINVAL; 6337 } 6338 } 6339 6340 static void 6341 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6342 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 6343 { 6344 struct nvme_io_path *io_path; 6345 struct nvme_ctrlr *nvme_ctrlr; 6346 uint32_t max_xfer_size; 6347 int rc = -ENXIO; 6348 6349 /* Choose the first ctrlr which is not failed. */ 6350 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6351 nvme_ctrlr = io_path->qpair->ctrlr; 6352 6353 /* We should skip any unavailable nvme_ctrlr rather than checking 6354 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
6355 */ 6356 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 6357 continue; 6358 } 6359 6360 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 6361 6362 if (nbytes > max_xfer_size) { 6363 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6364 rc = -EINVAL; 6365 goto err; 6366 } 6367 6368 bio->io_path = io_path; 6369 bio->orig_thread = spdk_get_thread(); 6370 6371 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 6372 bdev_nvme_admin_passthru_done, bio); 6373 if (rc == 0) { 6374 return; 6375 } 6376 } 6377 6378 err: 6379 bdev_nvme_admin_passthru_complete(bio, rc); 6380 } 6381 6382 static int 6383 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6384 void *buf, size_t nbytes) 6385 { 6386 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6387 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6388 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6389 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6390 6391 if (nbytes > max_xfer_size) { 6392 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6393 return -EINVAL; 6394 } 6395 6396 /* 6397 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6398 * so fill it out automatically. 6399 */ 6400 cmd->nsid = spdk_nvme_ns_get_id(ns); 6401 6402 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 6403 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 6404 } 6405 6406 static int 6407 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6408 void *buf, size_t nbytes, void *md_buf, size_t md_len) 6409 { 6410 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6411 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6412 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 6413 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6414 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6415 6416 if (nbytes > max_xfer_size) { 6417 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6418 return -EINVAL; 6419 } 6420 6421 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 6422 SPDK_ERRLOG("invalid meta data buffer size\n"); 6423 return -EINVAL; 6424 } 6425 6426 /* 6427 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6428 * so fill it out automatically. 6429 */ 6430 cmd->nsid = spdk_nvme_ns_get_id(ns); 6431 6432 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 6433 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 6434 } 6435 6436 static void 6437 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6438 struct nvme_bdev_io *bio_to_abort) 6439 { 6440 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6441 struct spdk_bdev_io *bdev_io_to_abort; 6442 struct nvme_io_path *io_path; 6443 struct nvme_ctrlr *nvme_ctrlr; 6444 int rc = 0; 6445 6446 bio->orig_thread = spdk_get_thread(); 6447 6448 /* Traverse the retry_io_list first. 
*/ 6449 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 6450 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 6451 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 6452 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 6453 6454 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 6455 return; 6456 } 6457 } 6458 6459 /* Even admin commands, they were submitted to only nvme_ctrlrs which were 6460 * on any io_path. So traverse the io_path list for not only I/O commands 6461 * but also admin commands. 6462 */ 6463 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6464 nvme_ctrlr = io_path->qpair->ctrlr; 6465 6466 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 6467 io_path->qpair->qpair, 6468 bio_to_abort, 6469 bdev_nvme_abort_done, bio); 6470 if (rc == -ENOENT) { 6471 /* If no command was found in I/O qpair, the target command may be 6472 * admin command. 6473 */ 6474 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 6475 NULL, 6476 bio_to_abort, 6477 bdev_nvme_abort_done, bio); 6478 } 6479 6480 if (rc != -ENOENT) { 6481 break; 6482 } 6483 } 6484 6485 if (rc != 0) { 6486 /* If no command was found or there was any error, complete the abort 6487 * request with failure. 6488 */ 6489 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 6490 } 6491 } 6492 6493 static void 6494 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 6495 { 6496 const char *action; 6497 6498 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 6499 action = "reset"; 6500 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 6501 action = "abort"; 6502 } else { 6503 action = "none"; 6504 } 6505 6506 spdk_json_write_object_begin(w); 6507 6508 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 6509 6510 spdk_json_write_named_object_begin(w, "params"); 6511 spdk_json_write_named_string(w, "action_on_timeout", action); 6512 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 6513 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 6514 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 6515 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 6516 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 6517 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 6518 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 6519 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 6520 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 6521 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 6522 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 6523 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 6524 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 6525 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 6526 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 6527 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 6528 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 
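	/* This list is what a config dump replays through bdev_nvme_set_options, so it should
	 * stay in sync with the fields of struct spdk_bdev_nvme_opts. For example, if the
	 * disable_auto_failback option is expected to round-trip through save/load like the
	 * fields above, it would presumably be emitted here as well (assumption, not emitted
	 * in this function as written):
	 *
	 *   spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback);
	 */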
6529 spdk_json_write_object_end(w); 6530 6531 spdk_json_write_object_end(w); 6532 } 6533 6534 static void 6535 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 6536 { 6537 struct spdk_nvme_transport_id trid; 6538 6539 spdk_json_write_object_begin(w); 6540 6541 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 6542 6543 spdk_json_write_named_object_begin(w, "params"); 6544 spdk_json_write_named_string(w, "name", ctx->name); 6545 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 6546 6547 trid = ctx->trid; 6548 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 6549 nvme_bdev_dump_trid_json(&trid, w); 6550 6551 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 6552 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 6553 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 6554 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 6555 ctx->bdev_opts.fast_io_fail_timeout_sec); 6556 spdk_json_write_object_end(w); 6557 6558 spdk_json_write_object_end(w); 6559 } 6560 6561 static void 6562 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 6563 struct nvme_ctrlr *nvme_ctrlr) 6564 { 6565 struct spdk_nvme_transport_id *trid; 6566 6567 if (nvme_ctrlr->opts.from_discovery_service) { 6568 /* Do not emit an RPC for this - it will be implicitly 6569 * covered by a separate bdev_nvme_start_discovery RPC. 6570 */ 6571 return; 6572 } 6573 6574 trid = &nvme_ctrlr->active_path_id->trid; 6575 6576 spdk_json_write_object_begin(w); 6577 6578 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 6579 6580 spdk_json_write_named_object_begin(w, "params"); 6581 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 6582 nvme_bdev_dump_trid_json(trid, w); 6583 spdk_json_write_named_bool(w, "prchk_reftag", 6584 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 6585 spdk_json_write_named_bool(w, "prchk_guard", 6586 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 6587 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 6588 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 6589 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 6590 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 6591 6592 spdk_json_write_object_end(w); 6593 6594 spdk_json_write_object_end(w); 6595 } 6596 6597 static void 6598 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 6599 { 6600 spdk_json_write_object_begin(w); 6601 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 6602 6603 spdk_json_write_named_object_begin(w, "params"); 6604 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 6605 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 6606 spdk_json_write_object_end(w); 6607 6608 spdk_json_write_object_end(w); 6609 } 6610 6611 static int 6612 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 6613 { 6614 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6615 struct nvme_ctrlr *nvme_ctrlr; 6616 struct discovery_ctx *ctx; 6617 6618 bdev_nvme_opts_config_json(w); 6619 6620 pthread_mutex_lock(&g_bdev_nvme_mutex); 6621 6622 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6623 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6624 nvme_ctrlr_config_json(w, nvme_ctrlr); 6625 } 6626 } 6627 6628 TAILQ_FOREACH(ctx, 
&g_discovery_ctxs, tailq) { 6629 bdev_nvme_discovery_config_json(w, ctx); 6630 } 6631 6632 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 6633 * before enabling hotplug poller. 6634 */ 6635 bdev_nvme_hotplug_config_json(w); 6636 6637 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6638 return 0; 6639 } 6640 6641 struct spdk_nvme_ctrlr * 6642 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 6643 { 6644 struct nvme_bdev *nbdev; 6645 struct nvme_ns *nvme_ns; 6646 6647 if (!bdev || bdev->module != &nvme_if) { 6648 return NULL; 6649 } 6650 6651 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 6652 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 6653 assert(nvme_ns != NULL); 6654 6655 return nvme_ns->ctrlr->ctrlr; 6656 } 6657 6658 void 6659 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 6660 { 6661 struct nvme_ns *nvme_ns = io_path->nvme_ns; 6662 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 6663 const struct spdk_nvme_ctrlr_data *cdata; 6664 const struct spdk_nvme_transport_id *trid; 6665 const char *adrfam_str; 6666 6667 spdk_json_write_object_begin(w); 6668 6669 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 6670 6671 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 6672 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 6673 6674 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 6675 spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path); 6676 spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path)); 6677 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 6678 6679 spdk_json_write_named_object_begin(w, "transport"); 6680 spdk_json_write_named_string(w, "trtype", trid->trstring); 6681 spdk_json_write_named_string(w, "traddr", trid->traddr); 6682 if (trid->trsvcid[0] != '\0') { 6683 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 6684 } 6685 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 6686 if (adrfam_str) { 6687 spdk_json_write_named_string(w, "adrfam", adrfam_str); 6688 } 6689 spdk_json_write_object_end(w); 6690 6691 spdk_json_write_object_end(w); 6692 } 6693 6694 void 6695 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 6696 { 6697 struct discovery_ctx *ctx; 6698 struct discovery_entry_ctx *entry_ctx; 6699 6700 spdk_json_write_array_begin(w); 6701 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6702 spdk_json_write_object_begin(w); 6703 spdk_json_write_named_string(w, "name", ctx->name); 6704 6705 spdk_json_write_named_object_begin(w, "trid"); 6706 nvme_bdev_dump_trid_json(&ctx->trid, w); 6707 spdk_json_write_object_end(w); 6708 6709 spdk_json_write_named_array_begin(w, "referrals"); 6710 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6711 spdk_json_write_object_begin(w); 6712 spdk_json_write_named_object_begin(w, "trid"); 6713 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 6714 spdk_json_write_object_end(w); 6715 spdk_json_write_object_end(w); 6716 } 6717 spdk_json_write_array_end(w); 6718 6719 spdk_json_write_object_end(w); 6720 } 6721 spdk_json_write_array_end(w); 6722 } 6723 6724 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 6725 6726 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 6727 { 6728 struct spdk_trace_tpoint_opts opts[] = { 6729 { 6730 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 6731 OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 6732 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 6733 }, 6734 { 6735 "BDEV_NVME_IO_DONE", 
TRACE_BDEV_NVME_IO_DONE, 6736 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 6737 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 6738 } 6739 }; 6740 6741 6742 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 6743 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 6744 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 6745 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 6746 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 6747 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 6748 } 6749
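/*
 * Illustrative sketch (not generated output): the objects emitted by bdev_nvme_config_json()
 * above take roughly this shape, one JSON object per RPC to replay on config load. All names,
 * addresses, and values below are hypothetical placeholders.
 *
 *   { "method": "bdev_nvme_set_options",
 *     "params": { "action_on_timeout": "none", "timeout_us": 0, ...,
 *                 "fast_io_fail_timeout_sec": 0 } }
 *
 *   { "method": "bdev_nvme_attach_controller",
 *     "params": { "name": "nvme0", "trtype": "PCIe", "traddr": "0000:00:04.0",
 *                 "prchk_reftag": false, "prchk_guard": false,
 *                 "ctrlr_loss_timeout_sec": 0, "reconnect_delay_sec": 0,
 *                 "fast_io_fail_timeout_sec": 0 } }
 *
 *   { "method": "bdev_nvme_start_discovery",
 *     "params": { "name": "nvme-disc", "hostnqn": "nqn.2014-08.org.nvmexpress:uuid:...",
 *                 "trtype": "TCP", "adrfam": "IPv4", "traddr": "10.0.0.1", "trsvcid": "8009",
 *                 "wait_for_attach": false, "ctrlr_loss_timeout_sec": 0,
 *                 "reconnect_delay_sec": 0, "fast_io_fail_timeout_sec": 0 } }
 *
 *   { "method": "bdev_nvme_set_hotplug",
 *     "params": { "period_us": 100000, "enable": false } }
 *
 * The set_hotplug object is emitted last so that all explicitly attached controllers are
 * restored before the hotplug poller is re-enabled, as noted in bdev_nvme_config_json().
 */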