1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/likely.h" 18 #include "spdk/nvme.h" 19 #include "spdk/nvme_ocssd.h" 20 #include "spdk/nvme_zns.h" 21 #include "spdk/opal.h" 22 #include "spdk/thread.h" 23 #include "spdk/trace.h" 24 #include "spdk/string.h" 25 #include "spdk/util.h" 26 27 #include "spdk/bdev_module.h" 28 #include "spdk/log.h" 29 30 #include "spdk_internal/usdt.h" 31 #include "spdk_internal/trace_defs.h" 32 33 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 34 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 35 36 #define NSID_STR_LEN 10 37 38 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 39 40 struct nvme_bdev_io { 41 /** array of iovecs to transfer. */ 42 struct iovec *iovs; 43 44 /** Number of iovecs in iovs array. */ 45 int iovcnt; 46 47 /** Current iovec position. */ 48 int iovpos; 49 50 /** Offset in current iovec. */ 51 uint32_t iov_offset; 52 53 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 54 * being reset in a reset I/O. 55 */ 56 struct nvme_io_path *io_path; 57 58 /** array of iovecs to transfer. */ 59 struct iovec *fused_iovs; 60 61 /** Number of iovecs in iovs array. */ 62 int fused_iovcnt; 63 64 /** Current iovec position. */ 65 int fused_iovpos; 66 67 /** Offset in current iovec. */ 68 uint32_t fused_iov_offset; 69 70 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 71 struct spdk_nvme_cpl cpl; 72 73 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 74 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 75 76 /** Originating thread */ 77 struct spdk_thread *orig_thread; 78 79 /** Keeps track if first of fused commands was submitted */ 80 bool first_fused_submitted; 81 82 /** Keeps track if first of fused commands was completed */ 83 bool first_fused_completed; 84 85 /** Temporary pointer to zone report buffer */ 86 struct spdk_nvme_zns_zone_report *zone_report_buf; 87 88 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 89 uint64_t handled_zones; 90 91 /** Expiration value in ticks to retry the current I/O. */ 92 uint64_t retry_ticks; 93 94 /* How many times the current I/O was retried. 
*/ 95 int32_t retry_count; 96 }; 97 98 struct nvme_probe_skip_entry { 99 struct spdk_nvme_transport_id trid; 100 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 101 }; 102 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 103 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 104 g_skipped_nvme_ctrlrs); 105 106 static struct spdk_bdev_nvme_opts g_opts = { 107 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 108 .timeout_us = 0, 109 .timeout_admin_us = 0, 110 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 111 .transport_retry_count = 4, 112 .arbitration_burst = 0, 113 .low_priority_weight = 0, 114 .medium_priority_weight = 0, 115 .high_priority_weight = 0, 116 .nvme_adminq_poll_period_us = 10000ULL, 117 .nvme_ioq_poll_period_us = 0, 118 .io_queue_requests = 0, 119 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 120 .bdev_retry_count = 3, 121 .transport_ack_timeout = 0, 122 .ctrlr_loss_timeout_sec = 0, 123 .reconnect_delay_sec = 0, 124 .fast_io_fail_timeout_sec = 0, 125 .disable_auto_failback = false, 126 .generate_uuids = false, 127 .transport_tos = 0, 128 .nvme_error_stat = false, 129 }; 130 131 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 132 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 133 134 static int g_hot_insert_nvme_controller_index = 0; 135 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 136 static bool g_nvme_hotplug_enabled = false; 137 struct spdk_thread *g_bdev_nvme_init_thread; 138 static struct spdk_poller *g_hotplug_poller; 139 static struct spdk_poller *g_hotplug_probe_poller; 140 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 141 142 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 143 struct nvme_async_probe_ctx *ctx); 144 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 145 struct nvme_async_probe_ctx *ctx); 146 static int bdev_nvme_library_init(void); 147 static void bdev_nvme_library_fini(void); 148 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 149 struct spdk_bdev_io *bdev_io); 150 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 151 struct spdk_bdev_io *bdev_io); 152 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 153 void *md, uint64_t lba_count, uint64_t lba, 154 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); 155 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 156 void *md, uint64_t lba_count, uint64_t lba); 157 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 158 void *md, uint64_t lba_count, uint64_t lba, 159 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); 160 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 161 void *md, uint64_t lba_count, 162 uint64_t zslba, uint32_t flags); 163 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 164 void *md, uint64_t lba_count, uint64_t lba, 165 uint32_t flags); 166 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 167 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 168 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 169 uint32_t flags); 170 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 171 uint32_t num_zones, struct spdk_bdev_zone_info *info); 172 static int 
bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 173 enum spdk_bdev_zone_action action); 174 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 175 struct nvme_bdev_io *bio, 176 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 177 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 178 void *buf, size_t nbytes); 179 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 180 void *buf, size_t nbytes, void *md_buf, size_t md_len); 181 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 182 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 183 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 184 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr); 185 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove); 186 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 187 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 188 189 static int 190 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 191 { 192 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 193 } 194 195 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 196 197 struct spdk_nvme_qpair * 198 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 199 { 200 struct nvme_ctrlr_channel *ctrlr_ch; 201 202 assert(ctrlr_io_ch != NULL); 203 204 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 205 206 return ctrlr_ch->qpair->qpair; 207 } 208 209 static int 210 bdev_nvme_get_ctx_size(void) 211 { 212 return sizeof(struct nvme_bdev_io); 213 } 214 215 static struct spdk_bdev_module nvme_if = { 216 .name = "nvme", 217 .async_fini = true, 218 .module_init = bdev_nvme_library_init, 219 .module_fini = bdev_nvme_library_fini, 220 .config_json = bdev_nvme_config_json, 221 .get_ctx_size = bdev_nvme_get_ctx_size, 222 223 }; 224 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 225 226 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 227 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 228 bool g_bdev_nvme_module_finish; 229 230 struct nvme_bdev_ctrlr * 231 nvme_bdev_ctrlr_get_by_name(const char *name) 232 { 233 struct nvme_bdev_ctrlr *nbdev_ctrlr; 234 235 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 236 if (strcmp(name, nbdev_ctrlr->name) == 0) { 237 break; 238 } 239 } 240 241 return nbdev_ctrlr; 242 } 243 244 static struct nvme_ctrlr * 245 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 246 const struct spdk_nvme_transport_id *trid) 247 { 248 struct nvme_ctrlr *nvme_ctrlr; 249 250 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 251 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 252 break; 253 } 254 } 255 256 return nvme_ctrlr; 257 } 258 259 static struct nvme_bdev * 260 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 261 { 262 struct nvme_bdev *bdev; 263 264 pthread_mutex_lock(&g_bdev_nvme_mutex); 265 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 266 if (bdev->nsid == nsid) { 267 break; 268 } 269 } 270 pthread_mutex_unlock(&g_bdev_nvme_mutex); 271 272 return bdev; 273 } 274 275 struct nvme_ns * 276 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 277 { 278 struct nvme_ns ns; 279 280 assert(nsid > 0); 281 282 ns.id = nsid; 283 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 284 } 285 286 struct nvme_ns * 287 
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 288 { 289 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 290 } 291 292 struct nvme_ns * 293 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 294 { 295 if (ns == NULL) { 296 return NULL; 297 } 298 299 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 300 } 301 302 static struct nvme_ctrlr * 303 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 304 { 305 struct nvme_bdev_ctrlr *nbdev_ctrlr; 306 struct nvme_ctrlr *nvme_ctrlr = NULL; 307 308 pthread_mutex_lock(&g_bdev_nvme_mutex); 309 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 310 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 311 if (nvme_ctrlr != NULL) { 312 break; 313 } 314 } 315 pthread_mutex_unlock(&g_bdev_nvme_mutex); 316 317 return nvme_ctrlr; 318 } 319 320 struct nvme_ctrlr * 321 nvme_ctrlr_get_by_name(const char *name) 322 { 323 struct nvme_bdev_ctrlr *nbdev_ctrlr; 324 struct nvme_ctrlr *nvme_ctrlr = NULL; 325 326 if (name == NULL) { 327 return NULL; 328 } 329 330 pthread_mutex_lock(&g_bdev_nvme_mutex); 331 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 332 if (nbdev_ctrlr != NULL) { 333 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 334 } 335 pthread_mutex_unlock(&g_bdev_nvme_mutex); 336 337 return nvme_ctrlr; 338 } 339 340 void 341 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 342 { 343 struct nvme_bdev_ctrlr *nbdev_ctrlr; 344 345 pthread_mutex_lock(&g_bdev_nvme_mutex); 346 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 347 fn(nbdev_ctrlr, ctx); 348 } 349 pthread_mutex_unlock(&g_bdev_nvme_mutex); 350 } 351 352 void 353 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 354 { 355 const char *trtype_str; 356 const char *adrfam_str; 357 358 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 359 if (trtype_str) { 360 spdk_json_write_named_string(w, "trtype", trtype_str); 361 } 362 363 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 364 if (adrfam_str) { 365 spdk_json_write_named_string(w, "adrfam", adrfam_str); 366 } 367 368 if (trid->traddr[0] != '\0') { 369 spdk_json_write_named_string(w, "traddr", trid->traddr); 370 } 371 372 if (trid->trsvcid[0] != '\0') { 373 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 374 } 375 376 if (trid->subnqn[0] != '\0') { 377 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 378 } 379 } 380 381 static void 382 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 383 struct nvme_ctrlr *nvme_ctrlr) 384 { 385 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 386 pthread_mutex_lock(&g_bdev_nvme_mutex); 387 388 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 389 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 390 pthread_mutex_unlock(&g_bdev_nvme_mutex); 391 392 return; 393 } 394 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 395 396 pthread_mutex_unlock(&g_bdev_nvme_mutex); 397 398 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 399 400 free(nbdev_ctrlr->name); 401 free(nbdev_ctrlr); 402 } 403 404 static void 405 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 406 { 407 struct nvme_path_id *path_id, *tmp_path; 408 struct nvme_ns *ns, *tmp_ns; 409 410 free(nvme_ctrlr->copied_ana_desc); 411 spdk_free(nvme_ctrlr->ana_log_page); 412 413 if (nvme_ctrlr->opal_dev) { 414 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 415 nvme_ctrlr->opal_dev = NULL; 416 } 417 418 if (nvme_ctrlr->nbdev_ctrlr) { 419 
nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 420 } 421 422 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 423 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 424 free(ns); 425 } 426 427 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 428 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 429 free(path_id); 430 } 431 432 pthread_mutex_destroy(&nvme_ctrlr->mutex); 433 434 free(nvme_ctrlr); 435 436 pthread_mutex_lock(&g_bdev_nvme_mutex); 437 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 438 pthread_mutex_unlock(&g_bdev_nvme_mutex); 439 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 440 spdk_bdev_module_fini_done(); 441 return; 442 } 443 pthread_mutex_unlock(&g_bdev_nvme_mutex); 444 } 445 446 static int 447 nvme_detach_poller(void *arg) 448 { 449 struct nvme_ctrlr *nvme_ctrlr = arg; 450 int rc; 451 452 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 453 if (rc != -EAGAIN) { 454 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 455 _nvme_ctrlr_delete(nvme_ctrlr); 456 } 457 458 return SPDK_POLLER_BUSY; 459 } 460 461 static void 462 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 463 { 464 int rc; 465 466 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 467 468 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 469 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 470 471 /* If we got here, the reset/detach poller cannot be active */ 472 assert(nvme_ctrlr->reset_detach_poller == NULL); 473 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 474 nvme_ctrlr, 1000); 475 if (nvme_ctrlr->reset_detach_poller == NULL) { 476 SPDK_ERRLOG("Failed to register detach poller\n"); 477 goto error; 478 } 479 480 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 481 if (rc != 0) { 482 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 483 goto error; 484 } 485 486 return; 487 error: 488 /* We don't have a good way to handle errors here, so just do what we can and delete the 489 * controller without detaching the underlying NVMe device. 
490 */ 491 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 492 _nvme_ctrlr_delete(nvme_ctrlr); 493 } 494 495 static void 496 nvme_ctrlr_unregister_cb(void *io_device) 497 { 498 struct nvme_ctrlr *nvme_ctrlr = io_device; 499 500 nvme_ctrlr_delete(nvme_ctrlr); 501 } 502 503 static void 504 nvme_ctrlr_unregister(void *ctx) 505 { 506 struct nvme_ctrlr *nvme_ctrlr = ctx; 507 508 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 509 } 510 511 static bool 512 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 513 { 514 if (!nvme_ctrlr->destruct) { 515 return false; 516 } 517 518 if (nvme_ctrlr->ref > 0) { 519 return false; 520 } 521 522 if (nvme_ctrlr->resetting) { 523 return false; 524 } 525 526 if (nvme_ctrlr->ana_log_page_updating) { 527 return false; 528 } 529 530 if (nvme_ctrlr->io_path_cache_clearing) { 531 return false; 532 } 533 534 return true; 535 } 536 537 static void 538 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 539 { 540 pthread_mutex_lock(&nvme_ctrlr->mutex); 541 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 542 543 assert(nvme_ctrlr->ref > 0); 544 nvme_ctrlr->ref--; 545 546 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 547 pthread_mutex_unlock(&nvme_ctrlr->mutex); 548 return; 549 } 550 551 pthread_mutex_unlock(&nvme_ctrlr->mutex); 552 553 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 554 } 555 556 static struct nvme_io_path * 557 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 558 { 559 struct nvme_io_path *io_path; 560 561 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 562 if (io_path->nvme_ns == nvme_ns) { 563 break; 564 } 565 } 566 567 return io_path; 568 } 569 570 static int 571 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 572 { 573 struct nvme_io_path *io_path; 574 struct spdk_io_channel *ch; 575 struct nvme_ctrlr_channel *ctrlr_ch; 576 struct nvme_qpair *nvme_qpair; 577 578 io_path = calloc(1, sizeof(*io_path)); 579 if (io_path == NULL) { 580 SPDK_ERRLOG("Failed to alloc io_path.\n"); 581 return -ENOMEM; 582 } 583 584 io_path->nvme_ns = nvme_ns; 585 586 ch = spdk_get_io_channel(nvme_ns->ctrlr); 587 if (ch == NULL) { 588 free(io_path); 589 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 590 return -ENOMEM; 591 } 592 593 ctrlr_ch = spdk_io_channel_get_ctx(ch); 594 595 nvme_qpair = ctrlr_ch->qpair; 596 assert(nvme_qpair != NULL); 597 598 io_path->qpair = nvme_qpair; 599 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 600 601 io_path->nbdev_ch = nbdev_ch; 602 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 603 604 nbdev_ch->current_io_path = NULL; 605 606 return 0; 607 } 608 609 static void 610 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 611 { 612 struct spdk_io_channel *ch; 613 struct nvme_qpair *nvme_qpair; 614 struct nvme_ctrlr_channel *ctrlr_ch; 615 616 nbdev_ch->current_io_path = NULL; 617 618 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 619 620 nvme_qpair = io_path->qpair; 621 assert(nvme_qpair != NULL); 622 623 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 624 625 ctrlr_ch = nvme_qpair->ctrlr_ch; 626 assert(ctrlr_ch != NULL); 627 628 ch = spdk_io_channel_from_ctx(ctrlr_ch); 629 spdk_put_io_channel(ch); 630 631 free(io_path); 632 } 633 634 static void 635 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 636 { 637 struct nvme_io_path 
*io_path, *tmp_io_path; 638 639 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 640 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 641 } 642 } 643 644 static int 645 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 646 { 647 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 648 struct nvme_bdev *nbdev = io_device; 649 struct nvme_ns *nvme_ns; 650 int rc; 651 652 STAILQ_INIT(&nbdev_ch->io_path_list); 653 TAILQ_INIT(&nbdev_ch->retry_io_list); 654 655 pthread_mutex_lock(&nbdev->mutex); 656 657 nbdev_ch->mp_policy = nbdev->mp_policy; 658 659 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 660 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 661 if (rc != 0) { 662 pthread_mutex_unlock(&nbdev->mutex); 663 664 _bdev_nvme_delete_io_paths(nbdev_ch); 665 return rc; 666 } 667 } 668 pthread_mutex_unlock(&nbdev->mutex); 669 670 return 0; 671 } 672 673 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 674 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 675 */ 676 static inline void 677 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 678 const struct spdk_nvme_cpl *cpl) 679 { 680 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 681 (uintptr_t)bdev_io); 682 if (cpl) { 683 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 684 } else { 685 spdk_bdev_io_complete(bdev_io, status); 686 } 687 } 688 689 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 690 691 static void 692 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 693 { 694 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 695 696 bdev_nvme_abort_retry_ios(nbdev_ch); 697 _bdev_nvme_delete_io_paths(nbdev_ch); 698 } 699 700 static inline bool 701 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 702 { 703 switch (io_type) { 704 case SPDK_BDEV_IO_TYPE_RESET: 705 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 706 case SPDK_BDEV_IO_TYPE_ABORT: 707 return true; 708 default: 709 break; 710 } 711 712 return false; 713 } 714 715 static inline bool 716 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 717 { 718 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 719 return false; 720 } 721 722 switch (nvme_ns->ana_state) { 723 case SPDK_NVME_ANA_OPTIMIZED_STATE: 724 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 725 return true; 726 default: 727 break; 728 } 729 730 return false; 731 } 732 733 static inline bool 734 nvme_io_path_is_connected(struct nvme_io_path *io_path) 735 { 736 if (spdk_unlikely(io_path->qpair->qpair == NULL)) { 737 return false; 738 } 739 740 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) != 741 SPDK_NVME_QPAIR_FAILURE_NONE)) { 742 return false; 743 } 744 745 if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) { 746 return false; 747 } 748 749 if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) != 750 SPDK_NVME_QPAIR_FAILURE_NONE) { 751 return false; 752 } 753 754 return true; 755 } 756 757 static inline bool 758 nvme_io_path_is_available(struct nvme_io_path *io_path) 759 { 760 if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { 761 return false; 762 } 763 764 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 765 return false; 766 } 767 768 return true; 769 } 770 771 static inline bool 772 nvme_io_path_is_failed(struct nvme_io_path *io_path) 773 { 774 struct nvme_ctrlr *nvme_ctrlr; 775 776 nvme_ctrlr = io_path->qpair->ctrlr; 777 778 
if (nvme_ctrlr->destruct) { 779 return true; 780 } 781 782 if (nvme_ctrlr->fast_io_fail_timedout) { 783 return true; 784 } 785 786 if (nvme_ctrlr->resetting) { 787 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 788 return false; 789 } else { 790 return true; 791 } 792 } 793 794 if (nvme_ctrlr->reconnect_is_delayed) { 795 return false; 796 } 797 798 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 799 return true; 800 } else { 801 return false; 802 } 803 } 804 805 static bool 806 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 807 { 808 if (nvme_ctrlr->destruct) { 809 return false; 810 } 811 812 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 813 return false; 814 } 815 816 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 817 return false; 818 } 819 820 return true; 821 } 822 823 /* Simulate circular linked list. */ 824 static inline struct nvme_io_path * 825 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 826 { 827 struct nvme_io_path *next_path; 828 829 if (prev_path != NULL) { 830 next_path = STAILQ_NEXT(prev_path, stailq); 831 if (next_path != NULL) { 832 return next_path; 833 } 834 } 835 836 return STAILQ_FIRST(&nbdev_ch->io_path_list); 837 } 838 839 static struct nvme_io_path * 840 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 841 { 842 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 843 844 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 845 846 io_path = start; 847 do { 848 if (spdk_likely(nvme_io_path_is_connected(io_path) && 849 !io_path->nvme_ns->ana_state_updating)) { 850 switch (io_path->nvme_ns->ana_state) { 851 case SPDK_NVME_ANA_OPTIMIZED_STATE: 852 nbdev_ch->current_io_path = io_path; 853 return io_path; 854 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 855 if (non_optimized == NULL) { 856 non_optimized = io_path; 857 } 858 break; 859 default: 860 break; 861 } 862 } 863 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 864 } while (io_path != start); 865 866 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 867 /* We come here only if there is no optimized path. Cache even non_optimized 868 * path for load balance across multiple non_optimized paths. 869 */ 870 nbdev_ch->current_io_path = non_optimized; 871 } 872 873 return non_optimized; 874 } 875 876 static inline struct nvme_io_path * 877 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 878 { 879 if (spdk_likely(nbdev_ch->current_io_path != NULL && 880 nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) { 881 return nbdev_ch->current_io_path; 882 } 883 884 return _bdev_nvme_find_io_path(nbdev_ch); 885 } 886 887 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 888 * or false otherwise. 889 * 890 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 891 * is likely to be non-accessible now but may become accessible. 892 * 893 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 894 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 895 * when starting to reset it but it is set to failed when the reset failed. Hence, if 896 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
897 */ 898 static bool 899 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 900 { 901 struct nvme_io_path *io_path; 902 903 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 904 if (io_path->nvme_ns->ana_transition_timedout) { 905 continue; 906 } 907 908 if (nvme_io_path_is_connected(io_path) || 909 !nvme_io_path_is_failed(io_path)) { 910 return true; 911 } 912 } 913 914 return false; 915 } 916 917 static void 918 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 919 { 920 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 921 struct spdk_io_channel *ch; 922 923 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 924 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 925 } else { 926 ch = spdk_io_channel_from_ctx(nbdev_ch); 927 bdev_nvme_submit_request(ch, bdev_io); 928 } 929 } 930 931 static int 932 bdev_nvme_retry_ios(void *arg) 933 { 934 struct nvme_bdev_channel *nbdev_ch = arg; 935 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 936 struct nvme_bdev_io *bio; 937 uint64_t now, delay_us; 938 939 now = spdk_get_ticks(); 940 941 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 942 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 943 if (bio->retry_ticks > now) { 944 break; 945 } 946 947 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 948 949 bdev_nvme_retry_io(nbdev_ch, bdev_io); 950 } 951 952 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 953 954 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 955 if (bdev_io != NULL) { 956 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 957 958 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 959 960 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 961 delay_us); 962 } 963 964 return SPDK_POLLER_BUSY; 965 } 966 967 static void 968 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 969 struct nvme_bdev_io *bio, uint64_t delay_ms) 970 { 971 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 972 struct spdk_bdev_io *tmp_bdev_io; 973 struct nvme_bdev_io *tmp_bio; 974 975 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 976 977 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 978 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 979 980 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 981 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 982 module_link); 983 return; 984 } 985 } 986 987 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 988 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 989 990 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 991 992 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 993 delay_ms * 1000ULL); 994 } 995 996 static void 997 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 998 { 999 struct spdk_bdev_io *bdev_io, *tmp_io; 1000 1001 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1002 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1003 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1004 } 1005 1006 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1007 } 1008 1009 static int 1010 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1011 struct nvme_bdev_io *bio_to_abort) 1012 { 1013 struct spdk_bdev_io *bdev_io_to_abort; 1014 1015 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1016 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1017 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1018 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1019 return 0; 1020 } 1021 } 1022 1023 return -ENOENT; 1024 } 1025 1026 static void 1027 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1028 { 1029 struct nvme_bdev *nbdev; 1030 uint16_t sct, sc; 1031 1032 assert(spdk_nvme_cpl_is_error(cpl)); 1033 1034 nbdev = bdev_io->bdev->ctxt; 1035 1036 if (nbdev->err_stat == NULL) { 1037 return; 1038 } 1039 1040 sct = cpl->status.sct; 1041 sc = cpl->status.sc; 1042 1043 pthread_mutex_lock(&nbdev->mutex); 1044 1045 nbdev->err_stat->status_type[sct]++; 1046 switch (sct) { 1047 case SPDK_NVME_SCT_GENERIC: 1048 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1049 case SPDK_NVME_SCT_MEDIA_ERROR: 1050 case SPDK_NVME_SCT_PATH: 1051 nbdev->err_stat->status[sct][sc]++; 1052 break; 1053 default: 1054 break; 1055 } 1056 1057 pthread_mutex_unlock(&nbdev->mutex); 1058 } 1059 1060 static inline void 1061 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1062 const struct spdk_nvme_cpl *cpl) 1063 { 1064 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1065 struct nvme_bdev_channel *nbdev_ch; 1066 struct nvme_io_path *io_path; 1067 struct nvme_ctrlr *nvme_ctrlr; 1068 const struct spdk_nvme_ctrlr_data *cdata; 1069 uint64_t delay_ms; 1070 1071 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1072 1073 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1074 goto complete; 1075 } 1076 1077 /* Update error counts before deciding if retry is needed. 1078 * Hence, error counts may be more than the number of I/O errors. 
1079 */ 1080 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1081 1082 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1083 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1084 goto complete; 1085 } 1086 1087 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1088 1089 assert(bio->io_path != NULL); 1090 io_path = bio->io_path; 1091 1092 nvme_ctrlr = io_path->qpair->ctrlr; 1093 1094 if (spdk_nvme_cpl_is_path_error(cpl) || 1095 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1096 !nvme_io_path_is_available(io_path) || 1097 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1098 nbdev_ch->current_io_path = NULL; 1099 bio->io_path = NULL; 1100 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1101 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1102 io_path->nvme_ns->ana_state_updating = true; 1103 } 1104 } 1105 if (!any_io_path_may_become_available(nbdev_ch)) { 1106 goto complete; 1107 } 1108 delay_ms = 0; 1109 } else { 1110 bio->retry_count++; 1111 1112 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1113 1114 if (cpl->status.crd != 0) { 1115 delay_ms = cdata->crdt[cpl->status.crd] * 100; 1116 } else { 1117 delay_ms = 0; 1118 } 1119 } 1120 1121 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1122 return; 1123 1124 complete: 1125 bio->retry_count = 0; 1126 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1127 } 1128 1129 static inline void 1130 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1131 { 1132 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1133 struct nvme_bdev_channel *nbdev_ch; 1134 enum spdk_bdev_io_status io_status; 1135 1136 switch (rc) { 1137 case 0: 1138 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1139 break; 1140 case -ENOMEM: 1141 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1142 break; 1143 case -ENXIO: 1144 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1145 1146 nbdev_ch->current_io_path = NULL; 1147 bio->io_path = NULL; 1148 1149 if (any_io_path_may_become_available(nbdev_ch)) { 1150 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1151 return; 1152 } 1153 1154 /* fallthrough */ 1155 default: 1156 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1157 break; 1158 } 1159 1160 bio->retry_count = 0; 1161 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1162 } 1163 1164 static inline void 1165 bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc) 1166 { 1167 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1168 enum spdk_bdev_io_status io_status; 1169 1170 switch (rc) { 1171 case 0: 1172 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1173 break; 1174 case -ENOMEM: 1175 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1176 break; 1177 case -ENXIO: 1178 /* fallthrough */ 1179 default: 1180 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1181 break; 1182 } 1183 1184 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1185 } 1186 1187 static void 1188 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1189 { 1190 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1191 1192 pthread_mutex_lock(&nvme_ctrlr->mutex); 1193 1194 assert(nvme_ctrlr->io_path_cache_clearing == true); 1195 nvme_ctrlr->io_path_cache_clearing = false; 1196 1197 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1198 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1199 return; 1200 } 1201 1202 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1203 1204 nvme_ctrlr_unregister(nvme_ctrlr); 1205 } 1206 1207 static void 1208 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 
1209 { 1210 struct nvme_io_path *io_path; 1211 1212 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1213 io_path->nbdev_ch->current_io_path = NULL; 1214 } 1215 } 1216 1217 static void 1218 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1219 { 1220 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1221 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1222 1223 assert(ctrlr_ch->qpair != NULL); 1224 1225 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1226 1227 spdk_for_each_channel_continue(i, 0); 1228 } 1229 1230 static void 1231 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1232 { 1233 pthread_mutex_lock(&nvme_ctrlr->mutex); 1234 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1235 nvme_ctrlr->io_path_cache_clearing) { 1236 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1237 return; 1238 } 1239 1240 nvme_ctrlr->io_path_cache_clearing = true; 1241 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1242 1243 spdk_for_each_channel(nvme_ctrlr, 1244 bdev_nvme_clear_io_path_cache, 1245 NULL, 1246 bdev_nvme_clear_io_path_caches_done); 1247 } 1248 1249 static struct nvme_qpair * 1250 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1251 { 1252 struct nvme_qpair *nvme_qpair; 1253 1254 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1255 if (nvme_qpair->qpair == qpair) { 1256 break; 1257 } 1258 } 1259 1260 return nvme_qpair; 1261 } 1262 1263 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1264 1265 static void 1266 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1267 { 1268 struct nvme_poll_group *group = poll_group_ctx; 1269 struct nvme_qpair *nvme_qpair; 1270 struct nvme_ctrlr_channel *ctrlr_ch; 1271 1272 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1273 if (nvme_qpair == NULL) { 1274 return; 1275 } 1276 1277 if (nvme_qpair->qpair != NULL) { 1278 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1279 nvme_qpair->qpair = NULL; 1280 } 1281 1282 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1283 1284 ctrlr_ch = nvme_qpair->ctrlr_ch; 1285 1286 if (ctrlr_ch != NULL) { 1287 if (ctrlr_ch->reset_iter != NULL) { 1288 /* If we are already in a full reset sequence, we do not have 1289 * to restart it. Just move to the next ctrlr_channel. 1290 */ 1291 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1292 qpair); 1293 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 1294 ctrlr_ch->reset_iter = NULL; 1295 } else { 1296 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1297 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1298 bdev_nvme_failover(nvme_qpair->ctrlr, false); 1299 } 1300 } else { 1301 /* In this case, ctrlr_channel is already deleted. */ 1302 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1303 nvme_qpair_delete(nvme_qpair); 1304 } 1305 } 1306 1307 static void 1308 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1309 { 1310 struct nvme_qpair *nvme_qpair; 1311 1312 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1313 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1314 continue; 1315 } 1316 1317 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1318 SPDK_NVME_QPAIR_FAILURE_NONE) { 1319 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1320 } 1321 } 1322 } 1323 1324 static int 1325 bdev_nvme_poll(void *arg) 1326 { 1327 struct nvme_poll_group *group = arg; 1328 int64_t num_completions; 1329 1330 if (group->collect_spin_stat && group->start_ticks == 0) { 1331 group->start_ticks = spdk_get_ticks(); 1332 } 1333 1334 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1335 bdev_nvme_disconnected_qpair_cb); 1336 if (group->collect_spin_stat) { 1337 if (num_completions > 0) { 1338 if (group->end_ticks != 0) { 1339 group->spin_ticks += (group->end_ticks - group->start_ticks); 1340 group->end_ticks = 0; 1341 } 1342 group->start_ticks = 0; 1343 } else { 1344 group->end_ticks = spdk_get_ticks(); 1345 } 1346 } 1347 1348 if (spdk_unlikely(num_completions < 0)) { 1349 bdev_nvme_check_io_qpairs(group); 1350 } 1351 1352 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1353 } 1354 1355 static int bdev_nvme_poll_adminq(void *arg); 1356 1357 static void 1358 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1359 { 1360 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1361 1362 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1363 nvme_ctrlr, new_period_us); 1364 } 1365 1366 static int 1367 bdev_nvme_poll_adminq(void *arg) 1368 { 1369 int32_t rc; 1370 struct nvme_ctrlr *nvme_ctrlr = arg; 1371 nvme_ctrlr_disconnected_cb disconnected_cb; 1372 1373 assert(nvme_ctrlr != NULL); 1374 1375 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1376 if (rc < 0) { 1377 disconnected_cb = nvme_ctrlr->disconnected_cb; 1378 nvme_ctrlr->disconnected_cb = NULL; 1379 1380 if (rc == -ENXIO && disconnected_cb != NULL) { 1381 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1382 g_opts.nvme_adminq_poll_period_us); 1383 disconnected_cb(nvme_ctrlr); 1384 } else { 1385 bdev_nvme_failover(nvme_ctrlr, false); 1386 } 1387 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1388 SPDK_NVME_QPAIR_FAILURE_NONE) { 1389 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1390 } 1391 1392 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1393 } 1394 1395 static void 1396 _bdev_nvme_unregister_dev_cb(void *io_device) 1397 { 1398 struct nvme_bdev *nvme_disk = io_device; 1399 1400 free(nvme_disk->disk.name); 1401 free(nvme_disk->err_stat); 1402 free(nvme_disk); 1403 } 1404 1405 static int 1406 bdev_nvme_destruct(void *ctx) 1407 { 1408 struct nvme_bdev *nvme_disk = ctx; 1409 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1410 1411 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1412 1413 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1414 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1415 1416 nvme_ns->bdev = NULL; 1417 1418 assert(nvme_ns->id > 0); 1419 1420 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1421 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1422 1423 nvme_ctrlr_release(nvme_ns->ctrlr); 1424 free(nvme_ns); 1425 } else { 1426 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1427 } 1428 } 1429 1430 pthread_mutex_lock(&g_bdev_nvme_mutex); 1431 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1432 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1433 1434 spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb); 1435 1436 return 0; 1437 } 1438 1439 static int 1440 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1441 { 1442 struct nvme_ctrlr *nvme_ctrlr; 1443 struct spdk_nvme_io_qpair_opts opts; 1444 struct spdk_nvme_qpair *qpair; 1445 int rc; 1446 1447 nvme_ctrlr = nvme_qpair->ctrlr; 1448 1449 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1450 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1451 opts.create_only = true; 1452 opts.async_mode = true; 1453 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1454 g_opts.io_queue_requests = opts.io_queue_requests; 1455 1456 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1457 if (qpair == NULL) { 1458 return -1; 1459 } 1460 1461 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1462 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1463 1464 assert(nvme_qpair->group != NULL); 1465 1466 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1467 if (rc != 0) { 1468 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1469 goto err; 1470 } 1471 1472 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1473 if (rc != 0) { 1474 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1475 goto err; 1476 } 1477 1478 nvme_qpair->qpair = qpair; 1479 1480 if (!g_opts.disable_auto_failback) { 1481 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1482 } 1483 1484 return 0; 1485 1486 err: 1487 spdk_nvme_ctrlr_free_io_qpair(qpair); 1488 1489 return rc; 1490 } 1491 1492 static void 1493 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1494 { 1495 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1496 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1497 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1498 struct spdk_bdev_io *bdev_io; 1499 1500 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1501 status = SPDK_BDEV_IO_STATUS_FAILED; 1502 } 1503 1504 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1505 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1506 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1507 __bdev_nvme_io_complete(bdev_io, status, NULL); 1508 } 1509 1510 spdk_for_each_channel_continue(i, 0); 1511 } 
1512 1513 static void 1514 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1515 { 1516 struct nvme_path_id *path_id, *next_path; 1517 int rc __attribute__((unused)); 1518 1519 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1520 assert(path_id); 1521 assert(path_id == nvme_ctrlr->active_path_id); 1522 next_path = TAILQ_NEXT(path_id, link); 1523 1524 path_id->is_failed = true; 1525 1526 if (next_path) { 1527 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1528 1529 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1530 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1531 1532 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1533 nvme_ctrlr->active_path_id = next_path; 1534 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1535 assert(rc == 0); 1536 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1537 if (!remove) { 1538 /** Shuffle the old trid to the end of the list and use the new one. 1539 * Allows for round robin through multiple connections. 1540 */ 1541 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1542 } else { 1543 free(path_id); 1544 } 1545 } 1546 } 1547 1548 static bool 1549 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1550 { 1551 int32_t elapsed; 1552 1553 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1554 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1555 return false; 1556 } 1557 1558 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1559 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1560 return true; 1561 } else { 1562 return false; 1563 } 1564 } 1565 1566 static bool 1567 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1568 { 1569 uint32_t elapsed; 1570 1571 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1572 return false; 1573 } 1574 1575 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1576 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1577 return true; 1578 } else { 1579 return false; 1580 } 1581 } 1582 1583 static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1584 1585 static void 1586 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1587 { 1588 int rc; 1589 1590 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1591 if (rc != 0) { 1592 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1593 * fail the reset sequence immediately. 1594 */ 1595 bdev_nvme_reset_complete(nvme_ctrlr, false); 1596 return; 1597 } 1598 1599 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1600 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1601 */ 1602 assert(nvme_ctrlr->disconnected_cb == NULL); 1603 nvme_ctrlr->disconnected_cb = cb_fn; 1604 1605 /* During disconnection, reduce the period to poll adminq more often. */ 1606 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1607 } 1608 1609 enum bdev_nvme_op_after_reset { 1610 OP_NONE, 1611 OP_COMPLETE_PENDING_DESTRUCT, 1612 OP_DESTRUCT, 1613 OP_DELAYED_RECONNECT, 1614 }; 1615 1616 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1617 1618 static _bdev_nvme_op_after_reset 1619 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1620 { 1621 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1622 /* Complete pending destruct after reset completes. 
*/ 1623 return OP_COMPLETE_PENDING_DESTRUCT; 1624 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1625 nvme_ctrlr->reset_start_tsc = 0; 1626 return OP_NONE; 1627 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1628 return OP_DESTRUCT; 1629 } else { 1630 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1631 nvme_ctrlr->fast_io_fail_timedout = true; 1632 } 1633 bdev_nvme_failover_trid(nvme_ctrlr, false); 1634 return OP_DELAYED_RECONNECT; 1635 } 1636 } 1637 1638 static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1639 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1640 1641 static int 1642 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1643 { 1644 struct nvme_ctrlr *nvme_ctrlr = ctx; 1645 1646 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1647 pthread_mutex_lock(&nvme_ctrlr->mutex); 1648 1649 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1650 1651 assert(nvme_ctrlr->reconnect_is_delayed == true); 1652 nvme_ctrlr->reconnect_is_delayed = false; 1653 1654 if (nvme_ctrlr->destruct) { 1655 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1656 return SPDK_POLLER_BUSY; 1657 } 1658 1659 assert(nvme_ctrlr->resetting == false); 1660 nvme_ctrlr->resetting = true; 1661 1662 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1663 1664 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1665 1666 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1667 return SPDK_POLLER_BUSY; 1668 } 1669 1670 static void 1671 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1672 { 1673 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1674 1675 assert(nvme_ctrlr->reconnect_is_delayed == false); 1676 nvme_ctrlr->reconnect_is_delayed = true; 1677 1678 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1679 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1680 nvme_ctrlr, 1681 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 1682 } 1683 1684 static void 1685 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status) 1686 { 1687 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1688 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 1689 struct nvme_path_id *path_id; 1690 bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn; 1691 void *reset_cb_arg = nvme_ctrlr->reset_cb_arg; 1692 enum bdev_nvme_op_after_reset op_after_reset; 1693 1694 assert(nvme_ctrlr->thread == spdk_get_thread()); 1695 1696 nvme_ctrlr->reset_cb_fn = NULL; 1697 nvme_ctrlr->reset_cb_arg = NULL; 1698 1699 if (!success) { 1700 SPDK_ERRLOG("Resetting controller failed.\n"); 1701 } else { 1702 SPDK_NOTICELOG("Resetting controller successful.\n"); 1703 } 1704 1705 pthread_mutex_lock(&nvme_ctrlr->mutex); 1706 nvme_ctrlr->resetting = false; 1707 1708 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1709 assert(path_id != NULL); 1710 assert(path_id == nvme_ctrlr->active_path_id); 1711 1712 path_id->is_failed = !success; 1713 1714 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 1715 1716 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1717 1718 if (reset_cb_fn) { 1719 reset_cb_fn(reset_cb_arg, success); 1720 } 1721 1722 switch (op_after_reset) { 1723 case OP_COMPLETE_PENDING_DESTRUCT: 1724 nvme_ctrlr_unregister(nvme_ctrlr); 1725 break; 1726 case OP_DESTRUCT: 1727 _bdev_nvme_delete(nvme_ctrlr, false); 1728 break; 1729 case OP_DELAYED_RECONNECT: 1730 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 1731 
break; 1732 default: 1733 break; 1734 } 1735 } 1736 1737 static void 1738 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 1739 { 1740 /* Make sure we clear any pending resets before returning. */ 1741 spdk_for_each_channel(nvme_ctrlr, 1742 bdev_nvme_complete_pending_resets, 1743 success ? NULL : (void *)0x1, 1744 _bdev_nvme_reset_complete); 1745 } 1746 1747 static void 1748 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 1749 { 1750 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1751 1752 bdev_nvme_reset_complete(nvme_ctrlr, false); 1753 } 1754 1755 static void 1756 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 1757 { 1758 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1759 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 1760 struct nvme_qpair *nvme_qpair; 1761 1762 nvme_qpair = ctrlr_ch->qpair; 1763 assert(nvme_qpair != NULL); 1764 1765 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1766 1767 if (nvme_qpair->qpair != NULL) { 1768 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 1769 1770 /* The current full reset sequence will move to the next 1771 * ctrlr_channel after the qpair is actually disconnected. 1772 */ 1773 assert(ctrlr_ch->reset_iter == NULL); 1774 ctrlr_ch->reset_iter = i; 1775 } else { 1776 spdk_for_each_channel_continue(i, 0); 1777 } 1778 } 1779 1780 static void 1781 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 1782 { 1783 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1784 1785 if (status == 0) { 1786 bdev_nvme_reset_complete(nvme_ctrlr, true); 1787 } else { 1788 /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ 1789 spdk_for_each_channel(nvme_ctrlr, 1790 bdev_nvme_reset_destroy_qpair, 1791 NULL, 1792 bdev_nvme_reset_create_qpairs_failed); 1793 } 1794 } 1795 1796 static void 1797 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 1798 { 1799 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1800 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1801 int rc; 1802 1803 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 1804 1805 spdk_for_each_channel_continue(i, rc); 1806 } 1807 1808 static int 1809 bdev_nvme_reconnect_ctrlr_poll(void *arg) 1810 { 1811 struct nvme_ctrlr *nvme_ctrlr = arg; 1812 int rc = -ETIMEDOUT; 1813 1814 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1815 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 1816 if (rc == -EAGAIN) { 1817 return SPDK_POLLER_BUSY; 1818 } 1819 } 1820 1821 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 1822 if (rc == 0) { 1823 /* Recreate all of the I/O queue pairs */ 1824 spdk_for_each_channel(nvme_ctrlr, 1825 bdev_nvme_reset_create_qpair, 1826 NULL, 1827 bdev_nvme_reset_create_qpairs_done); 1828 } else { 1829 bdev_nvme_reset_complete(nvme_ctrlr, false); 1830 } 1831 return SPDK_POLLER_BUSY; 1832 } 1833 1834 static void 1835 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 1836 { 1837 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 1838 1839 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 1840 assert(nvme_ctrlr->reset_detach_poller == NULL); 1841 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 1842 nvme_ctrlr, 0); 1843 } 1844 1845 static void 1846 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 1847 { 1848 struct nvme_ctrlr *nvme_ctrlr = 
spdk_io_channel_iter_get_io_device(i); 1849 1850 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 1851 assert(status == 0); 1852 1853 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 1854 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1855 } else { 1856 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 1857 } 1858 } 1859 1860 static void 1861 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 1862 { 1863 spdk_for_each_channel(nvme_ctrlr, 1864 bdev_nvme_reset_destroy_qpair, 1865 NULL, 1866 bdev_nvme_reset_ctrlr); 1867 } 1868 1869 static void 1870 _bdev_nvme_reset(void *ctx) 1871 { 1872 struct nvme_ctrlr *nvme_ctrlr = ctx; 1873 1874 assert(nvme_ctrlr->resetting == true); 1875 assert(nvme_ctrlr->thread == spdk_get_thread()); 1876 1877 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 1878 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 1879 } else { 1880 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 1881 } 1882 } 1883 1884 static int 1885 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) 1886 { 1887 pthread_mutex_lock(&nvme_ctrlr->mutex); 1888 if (nvme_ctrlr->destruct) { 1889 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1890 return -ENXIO; 1891 } 1892 1893 if (nvme_ctrlr->resetting) { 1894 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1895 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 1896 return -EBUSY; 1897 } 1898 1899 if (nvme_ctrlr->reconnect_is_delayed) { 1900 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1901 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 1902 return -EBUSY; 1903 } 1904 1905 nvme_ctrlr->resetting = true; 1906 1907 assert(nvme_ctrlr->reset_start_tsc == 0); 1908 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 1909 1910 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1911 1912 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 1913 return 0; 1914 } 1915 1916 int 1917 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) 1918 { 1919 int rc; 1920 1921 rc = bdev_nvme_reset(nvme_ctrlr); 1922 if (rc == 0) { 1923 nvme_ctrlr->reset_cb_fn = cb_fn; 1924 nvme_ctrlr->reset_cb_arg = cb_arg; 1925 } 1926 return rc; 1927 } 1928 1929 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 1930 1931 static void 1932 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 1933 { 1934 enum spdk_bdev_io_status io_status; 1935 1936 if (bio->cpl.cdw0 == 0) { 1937 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1938 } else { 1939 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1940 } 1941 1942 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 1943 } 1944 1945 static void 1946 _bdev_nvme_reset_io_continue(void *ctx) 1947 { 1948 struct nvme_bdev_io *bio = ctx; 1949 struct nvme_io_path *prev_io_path, *next_io_path; 1950 int rc; 1951 1952 prev_io_path = bio->io_path; 1953 bio->io_path = NULL; 1954 1955 if (bio->cpl.cdw0 != 0) { 1956 goto complete; 1957 } 1958 1959 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 1960 if (next_io_path == NULL) { 1961 goto complete; 1962 } 1963 1964 rc = _bdev_nvme_reset_io(next_io_path, bio); 1965 if (rc == 0) { 1966 return; 1967 } 1968 1969 bio->cpl.cdw0 = 1; 1970 1971 complete: 1972 bdev_nvme_reset_io_complete(bio); 1973 } 1974 1975 static void 1976 bdev_nvme_reset_io_continue(void *cb_arg, bool success) 1977 { 1978 struct nvme_bdev_io *bio = cb_arg; 1979 1980 bio->cpl.cdw0 = !success; 1981 1982 spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); 1983 } 1984 1985 static int 1986 
_bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 1987 { 1988 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1989 struct nvme_ctrlr_channel *ctrlr_ch; 1990 struct spdk_bdev_io *bdev_io; 1991 int rc; 1992 1993 rc = bdev_nvme_reset(nvme_ctrlr); 1994 if (rc == 0) { 1995 assert(bio->io_path == NULL); 1996 bio->io_path = io_path; 1997 1998 assert(nvme_ctrlr->reset_cb_fn == NULL); 1999 assert(nvme_ctrlr->reset_cb_arg == NULL); 2000 nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; 2001 nvme_ctrlr->reset_cb_arg = bio; 2002 } else if (rc == -EBUSY) { 2003 ctrlr_ch = io_path->qpair->ctrlr_ch; 2004 assert(ctrlr_ch != NULL); 2005 /* 2006 * Reset call is queued only if it is from the app framework. This is on purpose so that 2007 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2008 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2009 */ 2010 bdev_io = spdk_bdev_io_from_ctx(bio); 2011 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2012 } else { 2013 return rc; 2014 } 2015 2016 return 0; 2017 } 2018 2019 static void 2020 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2021 { 2022 struct nvme_io_path *io_path; 2023 int rc; 2024 2025 bio->cpl.cdw0 = 0; 2026 bio->orig_thread = spdk_get_thread(); 2027 2028 /* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now. 2029 * 2030 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially. 2031 * This will be done in the following patches. 2032 */ 2033 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2034 assert(io_path != NULL); 2035 2036 rc = _bdev_nvme_reset_io(io_path, bio); 2037 if (rc != 0) { 2038 bio->cpl.cdw0 = 1; 2039 bdev_nvme_reset_io_complete(bio); 2040 } 2041 } 2042 2043 static int 2044 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2045 { 2046 pthread_mutex_lock(&nvme_ctrlr->mutex); 2047 if (nvme_ctrlr->destruct) { 2048 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2049 /* Don't bother resetting if the controller is in the process of being destructed. */ 2050 return -ENXIO; 2051 } 2052 2053 if (nvme_ctrlr->resetting) { 2054 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2055 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2056 return -EBUSY; 2057 } 2058 2059 bdev_nvme_failover_trid(nvme_ctrlr, remove); 2060 2061 if (nvme_ctrlr->reconnect_is_delayed) { 2062 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2063 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2064 2065 /* We rely on the next reconnect for the failover. 
*/ 2066 return 0; 2067 } 2068 2069 nvme_ctrlr->resetting = true; 2070 2071 assert(nvme_ctrlr->reset_start_tsc == 0); 2072 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2073 2074 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2075 2076 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 2077 return 0; 2078 } 2079 2080 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2081 uint64_t num_blocks); 2082 2083 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2084 uint64_t num_blocks); 2085 2086 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2087 uint64_t src_offset_blocks, 2088 uint64_t num_blocks); 2089 2090 static void 2091 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2092 bool success) 2093 { 2094 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2095 struct spdk_bdev *bdev = bdev_io->bdev; 2096 int ret; 2097 2098 if (!success) { 2099 ret = -EINVAL; 2100 goto exit; 2101 } 2102 2103 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2104 ret = -ENXIO; 2105 goto exit; 2106 } 2107 2108 ret = bdev_nvme_readv(bio, 2109 bdev_io->u.bdev.iovs, 2110 bdev_io->u.bdev.iovcnt, 2111 bdev_io->u.bdev.md_buf, 2112 bdev_io->u.bdev.num_blocks, 2113 bdev_io->u.bdev.offset_blocks, 2114 bdev->dif_check_flags, 2115 bdev_io->u.bdev.ext_opts); 2116 2117 exit: 2118 if (spdk_unlikely(ret != 0)) { 2119 bdev_nvme_io_complete(bio, ret); 2120 } 2121 } 2122 2123 static inline void 2124 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2125 { 2126 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2127 struct spdk_bdev *bdev = bdev_io->bdev; 2128 struct nvme_bdev_io *nbdev_io_to_abort; 2129 int rc = 0; 2130 2131 switch (bdev_io->type) { 2132 case SPDK_BDEV_IO_TYPE_READ: 2133 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2134 rc = bdev_nvme_readv(nbdev_io, 2135 bdev_io->u.bdev.iovs, 2136 bdev_io->u.bdev.iovcnt, 2137 bdev_io->u.bdev.md_buf, 2138 bdev_io->u.bdev.num_blocks, 2139 bdev_io->u.bdev.offset_blocks, 2140 bdev->dif_check_flags, 2141 bdev_io->u.bdev.ext_opts); 2142 } else { 2143 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2144 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2145 rc = 0; 2146 } 2147 break; 2148 case SPDK_BDEV_IO_TYPE_WRITE: 2149 rc = bdev_nvme_writev(nbdev_io, 2150 bdev_io->u.bdev.iovs, 2151 bdev_io->u.bdev.iovcnt, 2152 bdev_io->u.bdev.md_buf, 2153 bdev_io->u.bdev.num_blocks, 2154 bdev_io->u.bdev.offset_blocks, 2155 bdev->dif_check_flags, 2156 bdev_io->u.bdev.ext_opts); 2157 break; 2158 case SPDK_BDEV_IO_TYPE_COMPARE: 2159 rc = bdev_nvme_comparev(nbdev_io, 2160 bdev_io->u.bdev.iovs, 2161 bdev_io->u.bdev.iovcnt, 2162 bdev_io->u.bdev.md_buf, 2163 bdev_io->u.bdev.num_blocks, 2164 bdev_io->u.bdev.offset_blocks, 2165 bdev->dif_check_flags); 2166 break; 2167 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2168 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2169 bdev_io->u.bdev.iovs, 2170 bdev_io->u.bdev.iovcnt, 2171 bdev_io->u.bdev.fused_iovs, 2172 bdev_io->u.bdev.fused_iovcnt, 2173 bdev_io->u.bdev.md_buf, 2174 bdev_io->u.bdev.num_blocks, 2175 bdev_io->u.bdev.offset_blocks, 2176 bdev->dif_check_flags); 2177 break; 2178 case SPDK_BDEV_IO_TYPE_UNMAP: 2179 rc = bdev_nvme_unmap(nbdev_io, 2180 bdev_io->u.bdev.offset_blocks, 2181 bdev_io->u.bdev.num_blocks); 2182 break; 2183 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2184 rc = bdev_nvme_write_zeroes(nbdev_io, 2185 
bdev_io->u.bdev.offset_blocks, 2186 bdev_io->u.bdev.num_blocks); 2187 break; 2188 case SPDK_BDEV_IO_TYPE_RESET: 2189 nbdev_io->io_path = NULL; 2190 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2191 break; 2192 case SPDK_BDEV_IO_TYPE_FLUSH: 2193 bdev_nvme_io_complete(nbdev_io, 0); 2194 break; 2195 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2196 rc = bdev_nvme_zone_appendv(nbdev_io, 2197 bdev_io->u.bdev.iovs, 2198 bdev_io->u.bdev.iovcnt, 2199 bdev_io->u.bdev.md_buf, 2200 bdev_io->u.bdev.num_blocks, 2201 bdev_io->u.bdev.offset_blocks, 2202 bdev->dif_check_flags); 2203 break; 2204 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2205 rc = bdev_nvme_get_zone_info(nbdev_io, 2206 bdev_io->u.zone_mgmt.zone_id, 2207 bdev_io->u.zone_mgmt.num_zones, 2208 bdev_io->u.zone_mgmt.buf); 2209 break; 2210 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2211 rc = bdev_nvme_zone_management(nbdev_io, 2212 bdev_io->u.zone_mgmt.zone_id, 2213 bdev_io->u.zone_mgmt.zone_action); 2214 break; 2215 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2216 nbdev_io->io_path = NULL; 2217 bdev_nvme_admin_passthru(nbdev_ch, 2218 nbdev_io, 2219 &bdev_io->u.nvme_passthru.cmd, 2220 bdev_io->u.nvme_passthru.buf, 2221 bdev_io->u.nvme_passthru.nbytes); 2222 break; 2223 case SPDK_BDEV_IO_TYPE_NVME_IO: 2224 rc = bdev_nvme_io_passthru(nbdev_io, 2225 &bdev_io->u.nvme_passthru.cmd, 2226 bdev_io->u.nvme_passthru.buf, 2227 bdev_io->u.nvme_passthru.nbytes); 2228 break; 2229 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2230 rc = bdev_nvme_io_passthru_md(nbdev_io, 2231 &bdev_io->u.nvme_passthru.cmd, 2232 bdev_io->u.nvme_passthru.buf, 2233 bdev_io->u.nvme_passthru.nbytes, 2234 bdev_io->u.nvme_passthru.md_buf, 2235 bdev_io->u.nvme_passthru.md_len); 2236 break; 2237 case SPDK_BDEV_IO_TYPE_ABORT: 2238 nbdev_io->io_path = NULL; 2239 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2240 bdev_nvme_abort(nbdev_ch, 2241 nbdev_io, 2242 nbdev_io_to_abort); 2243 break; 2244 case SPDK_BDEV_IO_TYPE_COPY: 2245 rc = bdev_nvme_copy(nbdev_io, 2246 bdev_io->u.bdev.offset_blocks, 2247 bdev_io->u.bdev.copy.src_offset_blocks, 2248 bdev_io->u.bdev.num_blocks); 2249 break; 2250 default: 2251 rc = -EINVAL; 2252 break; 2253 } 2254 2255 if (spdk_unlikely(rc != 0)) { 2256 bdev_nvme_io_complete(nbdev_io, rc); 2257 } 2258 } 2259 2260 static void 2261 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2262 { 2263 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2264 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2265 2266 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 2267 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 2268 if (spdk_unlikely(!nbdev_io->io_path)) { 2269 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 2270 bdev_nvme_io_complete(nbdev_io, -ENXIO); 2271 return; 2272 } 2273 2274 /* Admin commands do not use the optimal I/O path. 2275 * Simply fall through even if it is not found. 
2276 */ 2277 } 2278 2279 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 2280 } 2281 2282 static bool 2283 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 2284 { 2285 struct nvme_bdev *nbdev = ctx; 2286 struct nvme_ns *nvme_ns; 2287 struct spdk_nvme_ns *ns; 2288 struct spdk_nvme_ctrlr *ctrlr; 2289 const struct spdk_nvme_ctrlr_data *cdata; 2290 2291 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2292 assert(nvme_ns != NULL); 2293 ns = nvme_ns->ns; 2294 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2295 2296 switch (io_type) { 2297 case SPDK_BDEV_IO_TYPE_READ: 2298 case SPDK_BDEV_IO_TYPE_WRITE: 2299 case SPDK_BDEV_IO_TYPE_RESET: 2300 case SPDK_BDEV_IO_TYPE_FLUSH: 2301 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2302 case SPDK_BDEV_IO_TYPE_NVME_IO: 2303 case SPDK_BDEV_IO_TYPE_ABORT: 2304 return true; 2305 2306 case SPDK_BDEV_IO_TYPE_COMPARE: 2307 return spdk_nvme_ns_supports_compare(ns); 2308 2309 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2310 return spdk_nvme_ns_get_md_size(ns) ? true : false; 2311 2312 case SPDK_BDEV_IO_TYPE_UNMAP: 2313 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2314 return cdata->oncs.dsm; 2315 2316 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2317 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2318 return cdata->oncs.write_zeroes; 2319 2320 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2321 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2322 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2323 return true; 2324 } 2325 return false; 2326 2327 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2328 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2329 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2330 2331 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2332 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2333 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2334 2335 case SPDK_BDEV_IO_TYPE_COPY: 2336 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2337 return cdata->oncs.copy; 2338 2339 default: 2340 return false; 2341 } 2342 } 2343 2344 static int 2345 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 2346 { 2347 struct nvme_qpair *nvme_qpair; 2348 struct spdk_io_channel *pg_ch; 2349 int rc; 2350 2351 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 2352 if (!nvme_qpair) { 2353 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 2354 return -1; 2355 } 2356 2357 TAILQ_INIT(&nvme_qpair->io_path_list); 2358 2359 nvme_qpair->ctrlr = nvme_ctrlr; 2360 nvme_qpair->ctrlr_ch = ctrlr_ch; 2361 2362 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2363 if (!pg_ch) { 2364 free(nvme_qpair); 2365 return -1; 2366 } 2367 2368 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 2369 2370 #ifdef SPDK_CONFIG_VTUNE 2371 nvme_qpair->group->collect_spin_stat = true; 2372 #else 2373 nvme_qpair->group->collect_spin_stat = false; 2374 #endif 2375 2376 rc = bdev_nvme_create_qpair(nvme_qpair); 2377 if (rc != 0) { 2378 /* nvme_ctrlr can't create IO qpair if connection is down. 2379 * 2380 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 2381 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 2382 * submitted IO will be queued until IO qpair is successfully created. 2383 * 2384 * Hence, if both are satisfied, ignore the failure. 
2385 */ 2386 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 2387 spdk_put_io_channel(pg_ch); 2388 free(nvme_qpair); 2389 return rc; 2390 } 2391 } 2392 2393 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2394 2395 ctrlr_ch->qpair = nvme_qpair; 2396 2397 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 2398 nvme_qpair->ctrlr->ref++; 2399 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 2400 2401 return 0; 2402 } 2403 2404 static int 2405 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2406 { 2407 struct nvme_ctrlr *nvme_ctrlr = io_device; 2408 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2409 2410 TAILQ_INIT(&ctrlr_ch->pending_resets); 2411 2412 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 2413 } 2414 2415 static void 2416 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 2417 { 2418 assert(nvme_qpair->group != NULL); 2419 2420 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2421 2422 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 2423 2424 nvme_ctrlr_release(nvme_qpair->ctrlr); 2425 2426 free(nvme_qpair); 2427 } 2428 2429 static void 2430 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2431 { 2432 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2433 struct nvme_qpair *nvme_qpair; 2434 2435 nvme_qpair = ctrlr_ch->qpair; 2436 assert(nvme_qpair != NULL); 2437 2438 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2439 2440 if (nvme_qpair->qpair != NULL) { 2441 if (ctrlr_ch->reset_iter == NULL) { 2442 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2443 } else { 2444 /* Skip current ctrlr_channel in a full reset sequence because 2445 * it is being deleted now. The qpair is already being disconnected. 2446 * We do not have to restart disconnecting it. 2447 */ 2448 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2449 } 2450 2451 /* We cannot release a reference to the poll group now. 2452 * The qpair may be disconnected asynchronously later. 2453 * We need to poll it until it is actually disconnected. 2454 * Just detach the qpair from the deleting ctrlr_channel. 
2455 */ 2456 nvme_qpair->ctrlr_ch = NULL; 2457 } else { 2458 assert(ctrlr_ch->reset_iter == NULL); 2459 2460 nvme_qpair_delete(nvme_qpair); 2461 } 2462 } 2463 2464 static void 2465 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2466 uint32_t iov_cnt, uint32_t seed, 2467 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2468 { 2469 struct nvme_poll_group *group = ctx; 2470 int rc; 2471 2472 assert(group->accel_channel != NULL); 2473 assert(cb_fn != NULL); 2474 2475 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2476 if (rc) { 2477 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2478 if (rc == -ENOMEM || rc == -EINVAL) { 2479 cb_fn(cb_arg, rc); 2480 } 2481 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2482 } 2483 } 2484 2485 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2486 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2487 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2488 }; 2489 2490 static int 2491 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2492 { 2493 struct nvme_poll_group *group = ctx_buf; 2494 2495 TAILQ_INIT(&group->qpair_list); 2496 2497 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2498 if (group->group == NULL) { 2499 return -1; 2500 } 2501 2502 group->accel_channel = spdk_accel_get_io_channel(); 2503 if (!group->accel_channel) { 2504 spdk_nvme_poll_group_destroy(group->group); 2505 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2506 group); 2507 return -1; 2508 } 2509 2510 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2511 2512 if (group->poller == NULL) { 2513 spdk_put_io_channel(group->accel_channel); 2514 spdk_nvme_poll_group_destroy(group->group); 2515 return -1; 2516 } 2517 2518 return 0; 2519 } 2520 2521 static void 2522 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2523 { 2524 struct nvme_poll_group *group = ctx_buf; 2525 2526 assert(TAILQ_EMPTY(&group->qpair_list)); 2527 2528 if (group->accel_channel) { 2529 spdk_put_io_channel(group->accel_channel); 2530 } 2531 2532 spdk_poller_unregister(&group->poller); 2533 if (spdk_nvme_poll_group_destroy(group->group)) { 2534 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2535 assert(false); 2536 } 2537 } 2538 2539 static struct spdk_io_channel * 2540 bdev_nvme_get_io_channel(void *ctx) 2541 { 2542 struct nvme_bdev *nvme_bdev = ctx; 2543 2544 return spdk_get_io_channel(nvme_bdev); 2545 } 2546 2547 static void * 2548 bdev_nvme_get_module_ctx(void *ctx) 2549 { 2550 struct nvme_bdev *nvme_bdev = ctx; 2551 struct nvme_ns *nvme_ns; 2552 2553 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2554 return NULL; 2555 } 2556 2557 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2558 if (!nvme_ns) { 2559 return NULL; 2560 } 2561 2562 return nvme_ns->ns; 2563 } 2564 2565 static const char * 2566 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2567 { 2568 switch (ana_state) { 2569 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2570 return "optimized"; 2571 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2572 return "non_optimized"; 2573 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2574 return "inaccessible"; 2575 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2576 return "persistent_loss"; 2577 case SPDK_NVME_ANA_CHANGE_STATE: 2578 return "change"; 2579 default: 2580 return NULL; 2581 } 2582 } 2583 
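/*
 * bdev_nvme_get_memory_domains() below walks every namespace attached to the
 * nvme_bdev and appends the memory domains reported by each underlying
 * controller into the caller-provided array, tracking the remaining capacity
 * as it goes. The return value is the total number of domains found, which
 * may exceed array_size; callers are expected to retry with a larger array.
 *
 * A minimal caller sketch via the generic bdev API (illustrative only; the
 * "bdev" pointer and the error handling are assumptions, not part of this
 * module):
 *
 *     int cnt = spdk_bdev_get_memory_domains(bdev, NULL, 0);
 *     struct spdk_memory_domain **doms = calloc(cnt, sizeof(*doms));
 *     if (doms != NULL && cnt > 0) {
 *         cnt = spdk_bdev_get_memory_domains(bdev, doms, cnt);
 *     }
 */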
2584 static int 2585 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 2586 { 2587 struct spdk_memory_domain **_domains = NULL; 2588 struct nvme_bdev *nbdev = ctx; 2589 struct nvme_ns *nvme_ns; 2590 int i = 0, _array_size = array_size; 2591 int rc = 0; 2592 2593 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 2594 if (domains && array_size >= i) { 2595 _domains = &domains[i]; 2596 } else { 2597 _domains = NULL; 2598 } 2599 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 2600 if (rc > 0) { 2601 i += rc; 2602 if (_array_size >= rc) { 2603 _array_size -= rc; 2604 } else { 2605 _array_size = 0; 2606 } 2607 } else if (rc < 0) { 2608 return rc; 2609 } 2610 } 2611 2612 return i; 2613 } 2614 2615 static const char * 2616 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 2617 { 2618 if (nvme_ctrlr->destruct) { 2619 return "deleting"; 2620 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 2621 return "failed"; 2622 } else if (nvme_ctrlr->resetting) { 2623 return "resetting"; 2624 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 2625 return "reconnect_is_delayed"; 2626 } else { 2627 return "enabled"; 2628 } 2629 } 2630 2631 void 2632 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 2633 { 2634 struct spdk_nvme_transport_id *trid; 2635 const struct spdk_nvme_ctrlr_opts *opts; 2636 const struct spdk_nvme_ctrlr_data *cdata; 2637 2638 spdk_json_write_object_begin(w); 2639 2640 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 2641 2642 #ifdef SPDK_CONFIG_NVME_CUSE 2643 size_t cuse_name_size = 128; 2644 char cuse_name[cuse_name_size]; 2645 2646 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 2647 if (rc == 0) { 2648 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2649 } 2650 #endif 2651 trid = &nvme_ctrlr->active_path_id->trid; 2652 spdk_json_write_named_object_begin(w, "trid"); 2653 nvme_bdev_dump_trid_json(trid, w); 2654 spdk_json_write_object_end(w); 2655 2656 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 2657 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2658 2659 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 2660 spdk_json_write_named_object_begin(w, "host"); 2661 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 2662 spdk_json_write_named_string(w, "addr", opts->src_addr); 2663 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 2664 spdk_json_write_object_end(w); 2665 2666 spdk_json_write_object_end(w); 2667 } 2668 2669 static void 2670 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2671 struct nvme_ns *nvme_ns) 2672 { 2673 struct spdk_nvme_ns *ns; 2674 struct spdk_nvme_ctrlr *ctrlr; 2675 const struct spdk_nvme_ctrlr_data *cdata; 2676 const struct spdk_nvme_transport_id *trid; 2677 union spdk_nvme_vs_register vs; 2678 const struct spdk_nvme_ns_data *nsdata; 2679 char buf[128]; 2680 2681 ns = nvme_ns->ns; 2682 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2683 2684 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2685 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2686 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2687 2688 spdk_json_write_object_begin(w); 2689 2690 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2691 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2692 } 2693 2694 spdk_json_write_named_object_begin(w, "trid"); 2695 2696 nvme_bdev_dump_trid_json(trid, w); 2697 2698 spdk_json_write_object_end(w); 2699 2700 #ifdef 
SPDK_CONFIG_NVME_CUSE 2701 size_t cuse_name_size = 128; 2702 char cuse_name[cuse_name_size]; 2703 2704 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2705 cuse_name, &cuse_name_size); 2706 if (rc == 0) { 2707 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2708 } 2709 #endif 2710 2711 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2712 2713 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2714 2715 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2716 2717 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2718 spdk_str_trim(buf); 2719 spdk_json_write_named_string(w, "model_number", buf); 2720 2721 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2722 spdk_str_trim(buf); 2723 spdk_json_write_named_string(w, "serial_number", buf); 2724 2725 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2726 spdk_str_trim(buf); 2727 spdk_json_write_named_string(w, "firmware_revision", buf); 2728 2729 if (cdata->subnqn[0] != '\0') { 2730 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2731 } 2732 2733 spdk_json_write_named_object_begin(w, "oacs"); 2734 2735 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2736 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2737 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2738 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2739 2740 spdk_json_write_object_end(w); 2741 2742 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 2743 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 2744 2745 spdk_json_write_object_end(w); 2746 2747 spdk_json_write_named_object_begin(w, "vs"); 2748 2749 spdk_json_write_name(w, "nvme_version"); 2750 if (vs.bits.ter) { 2751 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2752 } else { 2753 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2754 } 2755 2756 spdk_json_write_object_end(w); 2757 2758 nsdata = spdk_nvme_ns_get_data(ns); 2759 2760 spdk_json_write_named_object_begin(w, "ns_data"); 2761 2762 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 2763 2764 if (cdata->cmic.ana_reporting) { 2765 spdk_json_write_named_string(w, "ana_state", 2766 _nvme_ana_state_str(nvme_ns->ana_state)); 2767 } 2768 2769 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 2770 2771 spdk_json_write_object_end(w); 2772 2773 if (cdata->oacs.security) { 2774 spdk_json_write_named_object_begin(w, "security"); 2775 2776 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 2777 2778 spdk_json_write_object_end(w); 2779 } 2780 2781 spdk_json_write_object_end(w); 2782 } 2783 2784 static const char * 2785 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 2786 { 2787 switch (nbdev->mp_policy) { 2788 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 2789 return "active_passive"; 2790 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 2791 return "active_active"; 2792 default: 2793 assert(false); 2794 return "invalid"; 2795 } 2796 } 2797 2798 static int 2799 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 2800 { 2801 struct nvme_bdev *nvme_bdev = ctx; 2802 struct nvme_ns *nvme_ns; 2803 2804 pthread_mutex_lock(&nvme_bdev->mutex); 2805 spdk_json_write_named_array_begin(w, "nvme"); 2806 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 2807 nvme_namespace_info_json(w, nvme_ns); 2808 } 2809 spdk_json_write_array_end(w); 2810 spdk_json_write_named_string(w, "mp_policy", 
nvme_bdev_get_mp_policy_str(nvme_bdev)); 2811 pthread_mutex_unlock(&nvme_bdev->mutex); 2812 2813 return 0; 2814 } 2815 2816 static void 2817 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2818 { 2819 /* No config per bdev needed */ 2820 } 2821 2822 static uint64_t 2823 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 2824 { 2825 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2826 struct nvme_io_path *io_path; 2827 struct nvme_poll_group *group; 2828 uint64_t spin_time = 0; 2829 2830 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 2831 group = io_path->qpair->group; 2832 2833 if (!group || !group->collect_spin_stat) { 2834 continue; 2835 } 2836 2837 if (group->end_ticks != 0) { 2838 group->spin_ticks += (group->end_ticks - group->start_ticks); 2839 group->end_ticks = 0; 2840 } 2841 2842 spin_time += group->spin_ticks; 2843 group->start_ticks = 0; 2844 group->spin_ticks = 0; 2845 } 2846 2847 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 2848 } 2849 2850 static void 2851 bdev_nvme_reset_device_stat(void *ctx) 2852 { 2853 struct nvme_bdev *nbdev = ctx; 2854 2855 if (nbdev->err_stat != NULL) { 2856 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 2857 } 2858 } 2859 2860 /* JSON string should be lowercases and underscore delimited string. */ 2861 static void 2862 bdev_nvme_format_nvme_status(char *dst, const char *src) 2863 { 2864 char tmp[256]; 2865 2866 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 2867 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 2868 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 2869 spdk_strlwr(dst); 2870 } 2871 2872 static void 2873 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 2874 { 2875 struct nvme_bdev *nbdev = ctx; 2876 struct spdk_nvme_status status = {}; 2877 uint16_t sct, sc; 2878 char status_json[256]; 2879 const char *status_str; 2880 2881 if (nbdev->err_stat == NULL) { 2882 return; 2883 } 2884 2885 spdk_json_write_named_object_begin(w, "nvme_error"); 2886 2887 spdk_json_write_named_object_begin(w, "status_type"); 2888 for (sct = 0; sct < 8; sct++) { 2889 if (nbdev->err_stat->status_type[sct] == 0) { 2890 continue; 2891 } 2892 status.sct = sct; 2893 2894 status_str = spdk_nvme_cpl_get_status_type_string(&status); 2895 assert(status_str != NULL); 2896 bdev_nvme_format_nvme_status(status_json, status_str); 2897 2898 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 2899 } 2900 spdk_json_write_object_end(w); 2901 2902 spdk_json_write_named_object_begin(w, "status_code"); 2903 for (sct = 0; sct < 4; sct++) { 2904 status.sct = sct; 2905 for (sc = 0; sc < 256; sc++) { 2906 if (nbdev->err_stat->status[sct][sc] == 0) { 2907 continue; 2908 } 2909 status.sc = sc; 2910 2911 status_str = spdk_nvme_cpl_get_status_string(&status); 2912 assert(status_str != NULL); 2913 bdev_nvme_format_nvme_status(status_json, status_str); 2914 2915 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 2916 } 2917 } 2918 spdk_json_write_object_end(w); 2919 2920 spdk_json_write_object_end(w); 2921 } 2922 2923 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 2924 .destruct = bdev_nvme_destruct, 2925 .submit_request = bdev_nvme_submit_request, 2926 .io_type_supported = bdev_nvme_io_type_supported, 2927 .get_io_channel = bdev_nvme_get_io_channel, 2928 .dump_info_json = bdev_nvme_dump_info_json, 2929 .write_config_json = bdev_nvme_write_config_json, 2930 .get_spin_time = bdev_nvme_get_spin_time, 2931 
.get_module_ctx = bdev_nvme_get_module_ctx, 2932 .get_memory_domains = bdev_nvme_get_memory_domains, 2933 .reset_device_stat = bdev_nvme_reset_device_stat, 2934 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 2935 }; 2936 2937 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 2938 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 2939 2940 static int 2941 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2942 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 2943 { 2944 struct spdk_nvme_ana_group_descriptor *copied_desc; 2945 uint8_t *orig_desc; 2946 uint32_t i, desc_size, copy_len; 2947 int rc = 0; 2948 2949 if (nvme_ctrlr->ana_log_page == NULL) { 2950 return -EINVAL; 2951 } 2952 2953 copied_desc = nvme_ctrlr->copied_ana_desc; 2954 2955 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 2956 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 2957 2958 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 2959 memcpy(copied_desc, orig_desc, copy_len); 2960 2961 rc = cb_fn(copied_desc, cb_arg); 2962 if (rc != 0) { 2963 break; 2964 } 2965 2966 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 2967 copied_desc->num_of_nsid * sizeof(uint32_t); 2968 orig_desc += desc_size; 2969 copy_len -= desc_size; 2970 } 2971 2972 return rc; 2973 } 2974 2975 static int 2976 nvme_ns_ana_transition_timedout(void *ctx) 2977 { 2978 struct nvme_ns *nvme_ns = ctx; 2979 2980 spdk_poller_unregister(&nvme_ns->anatt_timer); 2981 nvme_ns->ana_transition_timedout = true; 2982 2983 return SPDK_POLLER_BUSY; 2984 } 2985 2986 static void 2987 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 2988 const struct spdk_nvme_ana_group_descriptor *desc) 2989 { 2990 const struct spdk_nvme_ctrlr_data *cdata; 2991 2992 nvme_ns->ana_group_id = desc->ana_group_id; 2993 nvme_ns->ana_state = desc->ana_state; 2994 nvme_ns->ana_state_updating = false; 2995 2996 switch (nvme_ns->ana_state) { 2997 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2998 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2999 nvme_ns->ana_transition_timedout = false; 3000 spdk_poller_unregister(&nvme_ns->anatt_timer); 3001 break; 3002 3003 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3004 case SPDK_NVME_ANA_CHANGE_STATE: 3005 if (nvme_ns->anatt_timer != NULL) { 3006 break; 3007 } 3008 3009 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3010 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3011 nvme_ns, 3012 cdata->anatt * SPDK_SEC_TO_USEC); 3013 break; 3014 default: 3015 break; 3016 } 3017 } 3018 3019 static int 3020 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3021 { 3022 struct nvme_ns *nvme_ns = cb_arg; 3023 uint32_t i; 3024 3025 for (i = 0; i < desc->num_of_nsid; i++) { 3026 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3027 continue; 3028 } 3029 3030 _nvme_ns_set_ana_state(nvme_ns, desc); 3031 return 1; 3032 } 3033 3034 return 0; 3035 } 3036 3037 static void 3038 merge_nsid_sn_strings(const char *sn, char *nsid, int8_t *out) 3039 { 3040 int i = 0, j = 0; 3041 int sn_len = strlen(sn), nsid_len = strlen(nsid); 3042 3043 for (i = 0; i < nsid_len; i++) { 3044 out[i] = nsid[i]; 3045 } 3046 3047 /* Since last few characters are more likely to be unique, 3048 * even among the devices from the same manufacturer, 3049 * we use serial number in reverse. We also skip the 3050 * terminating character of serial number string. 
*/ 3051 for (j = sn_len - 1; j >= 0; j--) { 3052 if (i == SPDK_UUID_STRING_LEN - 1) { 3053 break; 3054 } 3055 3056 /* There may be a lot of spaces in serial number string 3057 * and they will generate equally large number of the 3058 * same character, so just skip them. */ 3059 if (sn[j] == ' ') { 3060 continue; 3061 } 3062 3063 out[i] = sn[j]; 3064 i++; 3065 } 3066 } 3067 3068 /* Dictionary of characters for UUID generation. */ 3069 static char dict[17] = "0123456789abcdef"; 3070 3071 static struct spdk_uuid 3072 nvme_generate_uuid(const char *sn, uint32_t nsid) 3073 { 3074 struct spdk_uuid new_uuid; 3075 char buf[SPDK_UUID_STRING_LEN] = {'\0'}, merged_str[SPDK_UUID_STRING_LEN] = {'\0'}; 3076 char nsid_str[NSID_STR_LEN] = {'\0'}, tmp; 3077 uint64_t i = 0, j = 0, rem, dict_size = strlen(dict); 3078 int rc; 3079 3080 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3081 3082 snprintf(nsid_str, NSID_STR_LEN, "%" PRIu32, nsid); 3083 3084 merge_nsid_sn_strings(sn, nsid_str, merged_str); 3085 3086 while (i < SPDK_UUID_STRING_LEN) { 3087 /* If 'j' is equal to indexes, where '-' should be placed, 3088 * insert this character and continue the loop without 3089 * increasing 'i'. */ 3090 if ((j == 8 || j == 13 || j == 18 || j == 23)) { 3091 buf[j] = '-'; 3092 j++; 3093 3094 /* Break, if we ran out of characters in 3095 * serial number and namespace ID string. */ 3096 if (j == strlen(merged_str)) { 3097 break; 3098 } 3099 continue; 3100 } 3101 3102 /* Change character in shuffled string to lower case. */ 3103 tmp = tolower(merged_str[i]); 3104 3105 if (isxdigit(tmp)) { 3106 /* If character can be represented by a hex 3107 * value as is, copy it to the result buffer. */ 3108 buf[j] = tmp; 3109 } else { 3110 /* Otherwise get its code and divide it 3111 * by the number of elements in dictionary. 3112 * The remainder will be the index of dictionary 3113 * character to replace tmp value with. */ 3114 rem = tmp % dict_size; 3115 buf[j] = dict[rem]; 3116 } 3117 3118 i++; 3119 j++; 3120 3121 /* Break, if we ran out of characters in 3122 * serial number and namespace ID string. */ 3123 if (j == strlen(merged_str)) { 3124 break; 3125 } 3126 } 3127 3128 /* If there are not enough values to fill UUID, 3129 * the rest is taken from dictionary characters. 
*/ 3130 i = 0; 3131 while (j < SPDK_UUID_STRING_LEN - 1) { 3132 if ((j == 8 || j == 13 || j == 18 || j == 23)) { 3133 buf[j] = '-'; 3134 j++; 3135 continue; 3136 } 3137 buf[j] = dict[i % dict_size]; 3138 i++; 3139 j++; 3140 } 3141 3142 rc = spdk_uuid_parse(&new_uuid, buf); 3143 if (rc != 0) { 3144 SPDK_ERRLOG("Unexpected spdk_uuid_parse failure on %s.\n", buf); 3145 assert(false); 3146 } 3147 3148 return new_uuid; 3149 } 3150 3151 static int 3152 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3153 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3154 uint32_t prchk_flags, void *ctx) 3155 { 3156 const struct spdk_uuid *uuid; 3157 const uint8_t *nguid; 3158 const struct spdk_nvme_ctrlr_data *cdata; 3159 const struct spdk_nvme_ns_data *nsdata; 3160 const struct spdk_nvme_ctrlr_opts *opts; 3161 enum spdk_nvme_csi csi; 3162 uint32_t atomic_bs, phys_bs, bs; 3163 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3164 3165 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3166 csi = spdk_nvme_ns_get_csi(ns); 3167 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3168 3169 switch (csi) { 3170 case SPDK_NVME_CSI_NVM: 3171 disk->product_name = "NVMe disk"; 3172 break; 3173 case SPDK_NVME_CSI_ZNS: 3174 disk->product_name = "NVMe ZNS disk"; 3175 disk->zoned = true; 3176 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3177 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3178 spdk_nvme_ns_get_extended_sector_size(ns); 3179 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 3180 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 3181 break; 3182 default: 3183 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 3184 return -ENOTSUP; 3185 } 3186 3187 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 3188 if (!disk->name) { 3189 return -ENOMEM; 3190 } 3191 3192 disk->write_cache = 0; 3193 if (cdata->vwc.present) { 3194 /* Enable if the Volatile Write Cache exists */ 3195 disk->write_cache = 1; 3196 } 3197 if (cdata->oncs.write_zeroes) { 3198 disk->max_write_zeroes = UINT16_MAX + 1; 3199 } 3200 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 3201 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 3202 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 3203 /* NVMe driver will split one request into multiple requests 3204 * based on MDTS and stripe boundary, the bdev layer will use 3205 * max_segment_size and max_num_segments to split one big IO 3206 * into multiple requests, then small request can't run out 3207 * of NVMe internal requests data structure. 
3208 */ 3209 if (opts && opts->io_queue_requests) { 3210 disk->max_num_segments = opts->io_queue_requests / 2; 3211 } 3212 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 3213 3214 nguid = spdk_nvme_ns_get_nguid(ns); 3215 if (!nguid) { 3216 uuid = spdk_nvme_ns_get_uuid(ns); 3217 if (uuid) { 3218 disk->uuid = *uuid; 3219 } else if (g_opts.generate_uuids) { 3220 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 3221 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 3222 } 3223 } else { 3224 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 3225 } 3226 3227 nsdata = spdk_nvme_ns_get_data(ns); 3228 bs = spdk_nvme_ns_get_sector_size(ns); 3229 atomic_bs = bs; 3230 phys_bs = bs; 3231 if (nsdata->nabo == 0) { 3232 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 3233 atomic_bs = bs * (1 + nsdata->nawupf); 3234 } else { 3235 atomic_bs = bs * (1 + cdata->awupf); 3236 } 3237 } 3238 if (nsdata->nsfeat.optperf) { 3239 phys_bs = bs * (1 + nsdata->npwg); 3240 } 3241 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 3242 3243 disk->md_len = spdk_nvme_ns_get_md_size(ns); 3244 if (disk->md_len != 0) { 3245 disk->md_interleave = nsdata->flbas.extended; 3246 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 3247 if (disk->dif_type != SPDK_DIF_DISABLE) { 3248 disk->dif_is_head_of_md = nsdata->dps.md_start; 3249 disk->dif_check_flags = prchk_flags; 3250 } 3251 } 3252 3253 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 3254 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 3255 disk->acwu = 0; 3256 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 3257 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 3258 } else { 3259 disk->acwu = cdata->acwu + 1; /* 0-based */ 3260 } 3261 3262 if (cdata->oncs.copy) { 3263 /* For now bdev interface allows only single segment copy */ 3264 disk->max_copy = nsdata->mssrl; 3265 } 3266 3267 disk->ctxt = ctx; 3268 disk->fn_table = &nvmelib_fn_table; 3269 disk->module = &nvme_if; 3270 3271 return 0; 3272 } 3273 3274 static int 3275 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3276 { 3277 struct nvme_bdev *bdev; 3278 int rc; 3279 3280 bdev = calloc(1, sizeof(*bdev)); 3281 if (!bdev) { 3282 SPDK_ERRLOG("bdev calloc() failed\n"); 3283 return -ENOMEM; 3284 } 3285 3286 if (g_opts.nvme_error_stat) { 3287 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 3288 if (!bdev->err_stat) { 3289 SPDK_ERRLOG("err_stat calloc() failed\n"); 3290 free(bdev); 3291 return -ENOMEM; 3292 } 3293 } 3294 3295 rc = pthread_mutex_init(&bdev->mutex, NULL); 3296 if (rc != 0) { 3297 free(bdev->err_stat); 3298 free(bdev); 3299 return rc; 3300 } 3301 3302 bdev->ref = 1; 3303 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 3304 TAILQ_INIT(&bdev->nvme_ns_list); 3305 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3306 bdev->opal = nvme_ctrlr->opal_dev != NULL; 3307 3308 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 3309 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 3310 if (rc != 0) { 3311 SPDK_ERRLOG("Failed to create NVMe disk\n"); 3312 pthread_mutex_destroy(&bdev->mutex); 3313 free(bdev->err_stat); 3314 free(bdev); 3315 return rc; 3316 } 3317 3318 spdk_io_device_register(bdev, 3319 bdev_nvme_create_bdev_channel_cb, 3320 bdev_nvme_destroy_bdev_channel_cb, 3321 sizeof(struct nvme_bdev_channel), 3322 bdev->disk.name); 3323 3324 rc = spdk_bdev_register(&bdev->disk); 3325 if (rc != 0) { 3326 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 3327 
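/* Unwind in the reverse order of setup: drop the io_device registered just
 * above, destroy the bdev mutex, and free the disk name, the error
 * statistics, and the nvme_bdev itself. */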
spdk_io_device_unregister(bdev, NULL); 3328 pthread_mutex_destroy(&bdev->mutex); 3329 free(bdev->disk.name); 3330 free(bdev->err_stat); 3331 free(bdev); 3332 return rc; 3333 } 3334 3335 nvme_ns->bdev = bdev; 3336 bdev->nsid = nvme_ns->id; 3337 3338 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 3339 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 3340 3341 return 0; 3342 } 3343 3344 static bool 3345 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 3346 { 3347 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 3348 const struct spdk_uuid *uuid1, *uuid2; 3349 3350 nsdata1 = spdk_nvme_ns_get_data(ns1); 3351 nsdata2 = spdk_nvme_ns_get_data(ns2); 3352 uuid1 = spdk_nvme_ns_get_uuid(ns1); 3353 uuid2 = spdk_nvme_ns_get_uuid(ns2); 3354 3355 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 3356 nsdata1->eui64 == nsdata2->eui64 && 3357 ((uuid1 == NULL && uuid2 == NULL) || 3358 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 3359 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 3360 } 3361 3362 static bool 3363 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3364 struct spdk_nvme_ctrlr_opts *opts) 3365 { 3366 struct nvme_probe_skip_entry *entry; 3367 3368 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 3369 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3370 return false; 3371 } 3372 } 3373 3374 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 3375 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 3376 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 3377 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 3378 opts->disable_read_ana_log_page = true; 3379 3380 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 3381 3382 return true; 3383 } 3384 3385 static void 3386 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 3387 { 3388 struct nvme_ctrlr *nvme_ctrlr = ctx; 3389 3390 if (spdk_nvme_cpl_is_error(cpl)) { 3391 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 3392 cpl->status.sct); 3393 bdev_nvme_reset(nvme_ctrlr); 3394 } else if (cpl->cdw0 & 0x1) { 3395 SPDK_WARNLOG("Specified command could not be aborted.\n"); 3396 bdev_nvme_reset(nvme_ctrlr); 3397 } 3398 } 3399 3400 static void 3401 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 3402 struct spdk_nvme_qpair *qpair, uint16_t cid) 3403 { 3404 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3405 union spdk_nvme_csts_register csts; 3406 int rc; 3407 3408 assert(nvme_ctrlr->ctrlr == ctrlr); 3409 3410 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 3411 3412 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 3413 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 3414 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 3415 * completion recursively. 3416 */ 3417 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 3418 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 3419 if (csts.bits.cfs) { 3420 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 3421 bdev_nvme_reset(nvme_ctrlr); 3422 return; 3423 } 3424 } 3425 3426 switch (g_opts.action_on_timeout) { 3427 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 3428 if (qpair) { 3429 /* Don't send abort to ctrlr when ctrlr is not available. 
*/ 3430 pthread_mutex_lock(&nvme_ctrlr->mutex); 3431 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 3432 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3433 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 3434 return; 3435 } 3436 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3437 3438 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 3439 nvme_abort_cpl, nvme_ctrlr); 3440 if (rc == 0) { 3441 return; 3442 } 3443 3444 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 3445 } 3446 3447 /* FALLTHROUGH */ 3448 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 3449 bdev_nvme_reset(nvme_ctrlr); 3450 break; 3451 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 3452 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 3453 break; 3454 default: 3455 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 3456 break; 3457 } 3458 } 3459 3460 static void 3461 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 3462 { 3463 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3464 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 3465 3466 if (rc == 0) { 3467 nvme_ns->probe_ctx = NULL; 3468 pthread_mutex_lock(&nvme_ctrlr->mutex); 3469 nvme_ctrlr->ref++; 3470 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3471 } else { 3472 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3473 free(nvme_ns); 3474 } 3475 3476 if (ctx) { 3477 ctx->populates_in_progress--; 3478 if (ctx->populates_in_progress == 0) { 3479 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3480 } 3481 } 3482 } 3483 3484 static void 3485 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 3486 { 3487 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3488 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3489 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3490 int rc; 3491 3492 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 3493 if (rc != 0) { 3494 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 3495 } 3496 3497 spdk_for_each_channel_continue(i, rc); 3498 } 3499 3500 static void 3501 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 3502 { 3503 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3504 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3505 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3506 struct nvme_io_path *io_path; 3507 3508 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 3509 if (io_path != NULL) { 3510 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 3511 } 3512 3513 spdk_for_each_channel_continue(i, 0); 3514 } 3515 3516 static void 3517 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 3518 { 3519 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3520 3521 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 3522 } 3523 3524 static void 3525 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 3526 { 3527 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3528 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 3529 3530 if (status == 0) { 3531 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 3532 } else { 3533 /* Delete the added io_paths and fail populating the namespace. 
*/ 3534 spdk_for_each_channel(bdev, 3535 bdev_nvme_delete_io_path, 3536 nvme_ns, 3537 bdev_nvme_add_io_path_failed); 3538 } 3539 } 3540 3541 static int 3542 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 3543 { 3544 struct nvme_ns *tmp_ns; 3545 const struct spdk_nvme_ns_data *nsdata; 3546 3547 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 3548 if (!nsdata->nmic.can_share) { 3549 SPDK_ERRLOG("Namespace cannot be shared.\n"); 3550 return -EINVAL; 3551 } 3552 3553 pthread_mutex_lock(&bdev->mutex); 3554 3555 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 3556 assert(tmp_ns != NULL); 3557 3558 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 3559 pthread_mutex_unlock(&bdev->mutex); 3560 SPDK_ERRLOG("Namespaces are not identical.\n"); 3561 return -EINVAL; 3562 } 3563 3564 bdev->ref++; 3565 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3566 nvme_ns->bdev = bdev; 3567 3568 pthread_mutex_unlock(&bdev->mutex); 3569 3570 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 3571 spdk_for_each_channel(bdev, 3572 bdev_nvme_add_io_path, 3573 nvme_ns, 3574 bdev_nvme_add_io_path_done); 3575 3576 return 0; 3577 } 3578 3579 static void 3580 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3581 { 3582 struct spdk_nvme_ns *ns; 3583 struct nvme_bdev *bdev; 3584 int rc = 0; 3585 3586 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 3587 if (!ns) { 3588 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 3589 rc = -EINVAL; 3590 goto done; 3591 } 3592 3593 nvme_ns->ns = ns; 3594 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3595 3596 if (nvme_ctrlr->ana_log_page != NULL) { 3597 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 3598 } 3599 3600 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 3601 if (bdev == NULL) { 3602 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 3603 } else { 3604 rc = nvme_bdev_add_ns(bdev, nvme_ns); 3605 if (rc == 0) { 3606 return; 3607 } 3608 } 3609 done: 3610 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 3611 } 3612 3613 static void 3614 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 3615 { 3616 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3617 3618 assert(nvme_ctrlr != NULL); 3619 3620 pthread_mutex_lock(&nvme_ctrlr->mutex); 3621 3622 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3623 3624 if (nvme_ns->bdev != NULL) { 3625 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3626 return; 3627 } 3628 3629 free(nvme_ns); 3630 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3631 3632 nvme_ctrlr_release(nvme_ctrlr); 3633 } 3634 3635 static void 3636 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 3637 { 3638 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3639 3640 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3641 } 3642 3643 static void 3644 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3645 { 3646 struct nvme_bdev *bdev; 3647 3648 spdk_poller_unregister(&nvme_ns->anatt_timer); 3649 3650 bdev = nvme_ns->bdev; 3651 if (bdev != NULL) { 3652 pthread_mutex_lock(&bdev->mutex); 3653 3654 assert(bdev->ref > 0); 3655 bdev->ref--; 3656 if (bdev->ref == 0) { 3657 pthread_mutex_unlock(&bdev->mutex); 3658 3659 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 3660 } else { 3661 /* spdk_bdev_unregister() is not called until the last nvme_ns is 3662 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 3663 * and clear nvme_ns->bdev here. 
3664 */ 3665 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 3666 nvme_ns->bdev = NULL; 3667 3668 pthread_mutex_unlock(&bdev->mutex); 3669 3670 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 3671 * we call depopulate_namespace_done() to avoid use-after-free. 3672 */ 3673 spdk_for_each_channel(bdev, 3674 bdev_nvme_delete_io_path, 3675 nvme_ns, 3676 bdev_nvme_delete_io_path_done); 3677 return; 3678 } 3679 } 3680 3681 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3682 } 3683 3684 static void 3685 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3686 struct nvme_async_probe_ctx *ctx) 3687 { 3688 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3689 struct nvme_ns *nvme_ns, *next; 3690 struct spdk_nvme_ns *ns; 3691 struct nvme_bdev *bdev; 3692 uint32_t nsid; 3693 int rc; 3694 uint64_t num_sectors; 3695 3696 if (ctx) { 3697 /* Initialize this count to 1 to handle the populate functions 3698 * calling nvme_ctrlr_populate_namespace_done() immediately. 3699 */ 3700 ctx->populates_in_progress = 1; 3701 } 3702 3703 /* First loop over our existing namespaces and see if they have been 3704 * removed. */ 3705 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3706 while (nvme_ns != NULL) { 3707 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3708 3709 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 3710 /* NS is still there but attributes may have changed */ 3711 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 3712 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 3713 bdev = nvme_ns->bdev; 3714 assert(bdev != NULL); 3715 if (bdev->disk.blockcnt != num_sectors) { 3716 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 3717 nvme_ns->id, 3718 bdev->disk.name, 3719 bdev->disk.blockcnt, 3720 num_sectors); 3721 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 3722 if (rc != 0) { 3723 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 3724 bdev->disk.name, rc); 3725 } 3726 } 3727 } else { 3728 /* Namespace was removed */ 3729 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3730 } 3731 3732 nvme_ns = next; 3733 } 3734 3735 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3736 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3737 while (nsid != 0) { 3738 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3739 3740 if (nvme_ns == NULL) { 3741 /* Found a new one */ 3742 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3743 if (nvme_ns == NULL) { 3744 SPDK_ERRLOG("Failed to allocate namespace\n"); 3745 /* This just fails to attach the namespace. It may work on a future attempt. */ 3746 continue; 3747 } 3748 3749 nvme_ns->id = nsid; 3750 nvme_ns->ctrlr = nvme_ctrlr; 3751 3752 nvme_ns->bdev = NULL; 3753 3754 if (ctx) { 3755 ctx->populates_in_progress++; 3756 } 3757 nvme_ns->probe_ctx = ctx; 3758 3759 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3760 3761 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3762 } 3763 3764 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3765 } 3766 3767 if (ctx) { 3768 /* Decrement this count now that the loop is over to account 3769 * for the one we started with. If the count is then 0, we 3770 * know any populate_namespace functions completed immediately, 3771 * so we'll kick the callback here. 
3772 */ 3773 ctx->populates_in_progress--; 3774 if (ctx->populates_in_progress == 0) { 3775 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3776 } 3777 } 3778 3779 } 3780 3781 static void 3782 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 3783 { 3784 struct nvme_ns *nvme_ns, *tmp; 3785 3786 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 3787 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3788 } 3789 } 3790 3791 static uint32_t 3792 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 3793 { 3794 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3795 const struct spdk_nvme_ctrlr_data *cdata; 3796 uint32_t nsid, ns_count = 0; 3797 3798 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3799 3800 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3801 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 3802 ns_count++; 3803 } 3804 3805 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3806 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 3807 sizeof(uint32_t); 3808 } 3809 3810 static int 3811 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 3812 void *cb_arg) 3813 { 3814 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3815 struct nvme_ns *nvme_ns; 3816 uint32_t i, nsid; 3817 3818 for (i = 0; i < desc->num_of_nsid; i++) { 3819 nsid = desc->nsid[i]; 3820 if (nsid == 0) { 3821 continue; 3822 } 3823 3824 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3825 3826 assert(nvme_ns != NULL); 3827 if (nvme_ns == NULL) { 3828 /* Target told us that an inactive namespace had an ANA change */ 3829 continue; 3830 } 3831 3832 _nvme_ns_set_ana_state(nvme_ns, desc); 3833 } 3834 3835 return 0; 3836 } 3837 3838 static void 3839 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3840 { 3841 struct nvme_ns *nvme_ns; 3842 3843 spdk_free(nvme_ctrlr->ana_log_page); 3844 nvme_ctrlr->ana_log_page = NULL; 3845 3846 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3847 nvme_ns != NULL; 3848 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 3849 nvme_ns->ana_state_updating = false; 3850 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3851 } 3852 } 3853 3854 static void 3855 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 3856 { 3857 struct nvme_ctrlr *nvme_ctrlr = ctx; 3858 3859 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 3860 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 3861 nvme_ctrlr); 3862 } else { 3863 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 3864 } 3865 3866 pthread_mutex_lock(&nvme_ctrlr->mutex); 3867 3868 assert(nvme_ctrlr->ana_log_page_updating == true); 3869 nvme_ctrlr->ana_log_page_updating = false; 3870 3871 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 3872 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3873 3874 nvme_ctrlr_unregister(nvme_ctrlr); 3875 } else { 3876 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3877 3878 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 3879 } 3880 } 3881 3882 static int 3883 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3884 { 3885 uint32_t ana_log_page_size; 3886 int rc; 3887 3888 if (nvme_ctrlr->ana_log_page == NULL) { 3889 return -EINVAL; 3890 } 3891 3892 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 3893 3894 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 3895 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 3896 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 3897 
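/* The ana_log_page buffer was sized for max_ana_log_page_size when the
 * controller was attached, so a larger log page cannot be read into it
 * without overflowing; fail the read instead of truncating it. */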
return -EINVAL; 3898 } 3899 3900 pthread_mutex_lock(&nvme_ctrlr->mutex); 3901 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 3902 nvme_ctrlr->ana_log_page_updating) { 3903 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3904 return -EBUSY; 3905 } 3906 3907 nvme_ctrlr->ana_log_page_updating = true; 3908 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3909 3910 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 3911 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3912 SPDK_NVME_GLOBAL_NS_TAG, 3913 nvme_ctrlr->ana_log_page, 3914 ana_log_page_size, 0, 3915 nvme_ctrlr_read_ana_log_page_done, 3916 nvme_ctrlr); 3917 if (rc != 0) { 3918 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 3919 } 3920 3921 return rc; 3922 } 3923 3924 static void 3925 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 3926 { 3927 } 3928 3929 struct bdev_nvme_set_preferred_path_ctx { 3930 struct spdk_bdev_desc *desc; 3931 struct nvme_ns *nvme_ns; 3932 bdev_nvme_set_preferred_path_cb cb_fn; 3933 void *cb_arg; 3934 }; 3935 3936 static void 3937 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 3938 { 3939 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3940 3941 assert(ctx != NULL); 3942 assert(ctx->desc != NULL); 3943 assert(ctx->cb_fn != NULL); 3944 3945 spdk_bdev_close(ctx->desc); 3946 3947 ctx->cb_fn(ctx->cb_arg, status); 3948 3949 free(ctx); 3950 } 3951 3952 static void 3953 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 3954 { 3955 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3956 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3957 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3958 struct nvme_io_path *io_path, *prev; 3959 3960 prev = NULL; 3961 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3962 if (io_path->nvme_ns == ctx->nvme_ns) { 3963 break; 3964 } 3965 prev = io_path; 3966 } 3967 3968 if (io_path != NULL) { 3969 if (prev != NULL) { 3970 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 3971 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 3972 } 3973 3974 /* We can set io_path to nbdev_ch->current_io_path directly here. 3975 * However, it needs to be conditional. To simplify the code, 3976 * just clear nbdev_ch->current_io_path and let find_io_path() 3977 * fill it. 3978 * 3979 * Automatic failback may be disabled. Hence even if the io_path is 3980 * already at the head, clear nbdev_ch->current_io_path. 3981 */ 3982 nbdev_ch->current_io_path = NULL; 3983 } 3984 3985 spdk_for_each_channel_continue(i, 0); 3986 } 3987 3988 static struct nvme_ns * 3989 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 3990 { 3991 struct nvme_ns *nvme_ns, *prev; 3992 const struct spdk_nvme_ctrlr_data *cdata; 3993 3994 prev = NULL; 3995 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3996 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3997 3998 if (cdata->cntlid == cntlid) { 3999 break; 4000 } 4001 prev = nvme_ns; 4002 } 4003 4004 if (nvme_ns != NULL && prev != NULL) { 4005 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4006 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4007 } 4008 4009 return nvme_ns; 4010 } 4011 4012 /* This function supports only multipath mode. There is only a single I/O path 4013 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4014 * head of the I/O path list for each NVMe bdev channel. 
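 * The preferred path is then picked first by find_io_path() on each channel.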
4015 * 4016 * NVMe bdev channel may be acquired after completing this function. move the 4017 * matched namespace to the head of the namespace list for the NVMe bdev too. 4018 */ 4019 void 4020 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4021 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4022 { 4023 struct bdev_nvme_set_preferred_path_ctx *ctx; 4024 struct spdk_bdev *bdev; 4025 struct nvme_bdev *nbdev; 4026 int rc = 0; 4027 4028 assert(cb_fn != NULL); 4029 4030 ctx = calloc(1, sizeof(*ctx)); 4031 if (ctx == NULL) { 4032 SPDK_ERRLOG("Failed to alloc context.\n"); 4033 rc = -ENOMEM; 4034 goto err_alloc; 4035 } 4036 4037 ctx->cb_fn = cb_fn; 4038 ctx->cb_arg = cb_arg; 4039 4040 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4041 if (rc != 0) { 4042 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4043 goto err_open; 4044 } 4045 4046 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4047 4048 if (bdev->module != &nvme_if) { 4049 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4050 rc = -ENODEV; 4051 goto err_bdev; 4052 } 4053 4054 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4055 4056 pthread_mutex_lock(&nbdev->mutex); 4057 4058 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4059 if (ctx->nvme_ns == NULL) { 4060 pthread_mutex_unlock(&nbdev->mutex); 4061 4062 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4063 rc = -ENODEV; 4064 goto err_bdev; 4065 } 4066 4067 pthread_mutex_unlock(&nbdev->mutex); 4068 4069 spdk_for_each_channel(nbdev, 4070 _bdev_nvme_set_preferred_path, 4071 ctx, 4072 bdev_nvme_set_preferred_path_done); 4073 return; 4074 4075 err_bdev: 4076 spdk_bdev_close(ctx->desc); 4077 err_open: 4078 free(ctx); 4079 err_alloc: 4080 cb_fn(cb_arg, rc); 4081 } 4082 4083 struct bdev_nvme_set_multipath_policy_ctx { 4084 struct spdk_bdev_desc *desc; 4085 bdev_nvme_set_multipath_policy_cb cb_fn; 4086 void *cb_arg; 4087 }; 4088 4089 static void 4090 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4091 { 4092 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4093 4094 assert(ctx != NULL); 4095 assert(ctx->desc != NULL); 4096 assert(ctx->cb_fn != NULL); 4097 4098 spdk_bdev_close(ctx->desc); 4099 4100 ctx->cb_fn(ctx->cb_arg, status); 4101 4102 free(ctx); 4103 } 4104 4105 static void 4106 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4107 { 4108 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4109 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4110 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4111 4112 nbdev_ch->mp_policy = nbdev->mp_policy; 4113 nbdev_ch->current_io_path = NULL; 4114 4115 spdk_for_each_channel_continue(i, 0); 4116 } 4117 4118 void 4119 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4120 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4121 { 4122 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4123 struct spdk_bdev *bdev; 4124 struct nvme_bdev *nbdev; 4125 int rc; 4126 4127 assert(cb_fn != NULL); 4128 4129 ctx = calloc(1, sizeof(*ctx)); 4130 if (ctx == NULL) { 4131 SPDK_ERRLOG("Failed to alloc context.\n"); 4132 rc = -ENOMEM; 4133 goto err_alloc; 4134 } 4135 4136 ctx->cb_fn = cb_fn; 4137 ctx->cb_arg = cb_arg; 4138 4139 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4140 if (rc != 0) { 4141 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4142 rc = 
-ENODEV; 4143 goto err_open; 4144 } 4145 4146 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4147 if (bdev->module != &nvme_if) { 4148 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4149 rc = -ENODEV; 4150 goto err_module; 4151 } 4152 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4153 4154 pthread_mutex_lock(&nbdev->mutex); 4155 nbdev->mp_policy = policy; 4156 pthread_mutex_unlock(&nbdev->mutex); 4157 4158 spdk_for_each_channel(nbdev, 4159 _bdev_nvme_set_multipath_policy, 4160 ctx, 4161 bdev_nvme_set_multipath_policy_done); 4162 return; 4163 4164 err_module: 4165 spdk_bdev_close(ctx->desc); 4166 err_open: 4167 free(ctx); 4168 err_alloc: 4169 cb_fn(cb_arg, rc); 4170 } 4171 4172 static void 4173 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 4174 { 4175 struct nvme_ctrlr *nvme_ctrlr = arg; 4176 union spdk_nvme_async_event_completion event; 4177 4178 if (spdk_nvme_cpl_is_error(cpl)) { 4179 SPDK_WARNLOG("AER request execute failed\n"); 4180 return; 4181 } 4182 4183 event.raw = cpl->cdw0; 4184 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4185 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 4186 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 4187 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4188 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 4189 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 4190 } 4191 } 4192 4193 static void 4194 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 4195 { 4196 if (ctx->cb_fn) { 4197 ctx->cb_fn(ctx->cb_ctx, count, rc); 4198 } 4199 4200 ctx->namespaces_populated = true; 4201 if (ctx->probe_done) { 4202 /* The probe was already completed, so we need to free the context 4203 * here. This can happen for cases like OCSSD, where we need to 4204 * send additional commands to the SSD after attach. 4205 */ 4206 free(ctx); 4207 } 4208 } 4209 4210 static void 4211 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 4212 struct nvme_async_probe_ctx *ctx) 4213 { 4214 spdk_io_device_register(nvme_ctrlr, 4215 bdev_nvme_create_ctrlr_channel_cb, 4216 bdev_nvme_destroy_ctrlr_channel_cb, 4217 sizeof(struct nvme_ctrlr_channel), 4218 nvme_ctrlr->nbdev_ctrlr->name); 4219 4220 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 4221 } 4222 4223 static void 4224 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 4225 { 4226 struct nvme_ctrlr *nvme_ctrlr = _ctx; 4227 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 4228 4229 nvme_ctrlr->probe_ctx = NULL; 4230 4231 if (spdk_nvme_cpl_is_error(cpl)) { 4232 nvme_ctrlr_delete(nvme_ctrlr); 4233 4234 if (ctx != NULL) { 4235 populate_namespaces_cb(ctx, 0, -1); 4236 } 4237 return; 4238 } 4239 4240 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4241 } 4242 4243 static int 4244 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4245 struct nvme_async_probe_ctx *ctx) 4246 { 4247 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4248 const struct spdk_nvme_ctrlr_data *cdata; 4249 uint32_t ana_log_page_size; 4250 4251 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4252 4253 /* Set buffer size enough to include maximum number of allowed namespaces. 
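	 * The worst case is the ANA log page header plus one descriptor per ANA group (NANAGRPID)
	 * plus one NSID entry for every namespace the controller may expose (MNAN).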
 */
	ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
			    sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan *
			    sizeof(uint32_t);

	nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
						SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	if (nvme_ctrlr->ana_log_page == NULL) {
		SPDK_ERRLOG("could not allocate ANA log page buffer\n");
		return -ENXIO;
	}

	/* Each descriptor in an ANA log page is not ensured to be 8-byte aligned.
	 * Hence copy each descriptor to a temporary area when parsing it.
	 *
	 * Allocate a buffer whose size is as large as the ANA log page buffer because
	 * we do not know the size of a descriptor until actually reading it.
	 */
	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
	if (nvme_ctrlr->copied_ana_desc == NULL) {
		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
		return -ENOMEM;
	}

	nvme_ctrlr->max_ana_log_page_size = ana_log_page_size;

	nvme_ctrlr->probe_ctx = ctx;

	/* Then, set the read size only to include the current active namespaces. */
	ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);

	if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
		SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
			    ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
		return -EINVAL;
	}

	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
						SPDK_NVME_GLOBAL_NS_TAG,
						nvme_ctrlr->ana_log_page,
						ana_log_page_size, 0,
						nvme_ctrlr_init_ana_log_page_done,
						nvme_ctrlr);
}

/* hostnqn and subnqn were already verified before attaching a controller.
 * Hence check only the multipath capability and cntlid here.
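 * A controller that reports CMIC.multi_ctrlr == 0, or whose CNTLID duplicates one already
 * in the nvme_bdev_ctrlr, cannot be added to the group.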
 */
static bool
bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *tmp;
	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!cdata->cmic.multi_ctrlr) {
		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
		return false;
	}

	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);

		if (!tmp_cdata->cmic.multi_ctrlr) {
			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
			return false;
		}
		if (cdata->cntlid == tmp_cdata->cntlid) {
			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
			return false;
		}
	}

	return true;
}

static int
nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	int rc = 0;

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
			rc = -EINVAL;
			goto exit;
		}
	} else {
		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
		if (nbdev_ctrlr == NULL) {
			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
			rc = -ENOMEM;
			goto exit;
		}
		nbdev_ctrlr->name = strdup(name);
		if (nbdev_ctrlr->name == NULL) {
			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
			free(nbdev_ctrlr);
			rc = -ENOMEM;
			goto exit;
		}
		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
		TAILQ_INIT(&nbdev_ctrlr->bdevs);
		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
	}
	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
exit:
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return rc;
}

static int
nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
		  const char *name,
		  const struct spdk_nvme_transport_id *trid,
		  struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_path_id *path_id;
	const struct spdk_nvme_ctrlr_data *cdata;
	int rc;

	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
	if (nvme_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to allocate device struct\n");
		return -ENOMEM;
	}

	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
	if (rc != 0) {
		free(nvme_ctrlr);
		return rc;
	}

	TAILQ_INIT(&nvme_ctrlr->trids);

	RB_INIT(&nvme_ctrlr->namespaces);

	path_id = calloc(1, sizeof(*path_id));
	if (path_id == NULL) {
		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
		rc = -ENOMEM;
		goto err;
	}

	path_id->trid = *trid;
	if (ctx != NULL) {
		memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
		memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
	}
	nvme_ctrlr->active_path_id = path_id;
	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);

	nvme_ctrlr->thread = spdk_get_thread();
	nvme_ctrlr->ctrlr = ctrlr;
	nvme_ctrlr->ref = 1;

	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
		SPDK_ERRLOG("OCSSDs are not supported\n");
		rc = -ENOTSUP;
		goto err;
	}

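	/* Per-controller options come from the probe context when one was supplied;
	 * otherwise fall back to the module-level defaults.
	 */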
if (ctx != NULL) { 4423 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 4424 } else { 4425 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 4426 } 4427 4428 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 4429 g_opts.nvme_adminq_poll_period_us); 4430 4431 if (g_opts.timeout_us > 0) { 4432 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 4433 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 4434 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 4435 g_opts.timeout_us : g_opts.timeout_admin_us; 4436 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 4437 adm_timeout_us, timeout_cb, nvme_ctrlr); 4438 } 4439 4440 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 4441 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 4442 4443 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 4444 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 4445 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 4446 } 4447 4448 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 4449 if (rc != 0) { 4450 goto err; 4451 } 4452 4453 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4454 4455 if (cdata->cmic.ana_reporting) { 4456 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 4457 if (rc == 0) { 4458 return 0; 4459 } 4460 } else { 4461 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4462 return 0; 4463 } 4464 4465 err: 4466 nvme_ctrlr_delete(nvme_ctrlr); 4467 return rc; 4468 } 4469 4470 void 4471 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 4472 { 4473 opts->prchk_flags = 0; 4474 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 4475 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 4476 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 4477 } 4478 4479 static void 4480 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4481 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 4482 { 4483 char *name; 4484 4485 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 4486 if (!name) { 4487 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 4488 return; 4489 } 4490 4491 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 4492 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 4493 } else { 4494 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 4495 } 4496 4497 free(name); 4498 } 4499 4500 static void 4501 _nvme_ctrlr_destruct(void *ctx) 4502 { 4503 struct nvme_ctrlr *nvme_ctrlr = ctx; 4504 4505 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 4506 nvme_ctrlr_release(nvme_ctrlr); 4507 } 4508 4509 static int 4510 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4511 { 4512 struct nvme_probe_skip_entry *entry; 4513 4514 pthread_mutex_lock(&nvme_ctrlr->mutex); 4515 4516 /* The controller's destruction was already started */ 4517 if (nvme_ctrlr->destruct) { 4518 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4519 return 0; 4520 } 4521 4522 if (!hotplug && 4523 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 4524 entry = calloc(1, sizeof(*entry)); 4525 if (!entry) { 4526 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4527 return -ENOMEM; 4528 } 4529 entry->trid = nvme_ctrlr->active_path_id->trid; 4530 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 4531 } 4532 4533 nvme_ctrlr->destruct = true; 4534 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4535 4536 _nvme_ctrlr_destruct(nvme_ctrlr); 
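	/* Namespace depopulation and the final controller deletion complete asynchronously
	 * once the last reference to the nvme_ctrlr is released.
	 */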
4537 4538 return 0; 4539 } 4540 4541 static void 4542 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 4543 { 4544 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 4545 4546 _bdev_nvme_delete(nvme_ctrlr, true); 4547 } 4548 4549 static int 4550 bdev_nvme_hotplug_probe(void *arg) 4551 { 4552 if (g_hotplug_probe_ctx == NULL) { 4553 spdk_poller_unregister(&g_hotplug_probe_poller); 4554 return SPDK_POLLER_IDLE; 4555 } 4556 4557 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 4558 g_hotplug_probe_ctx = NULL; 4559 spdk_poller_unregister(&g_hotplug_probe_poller); 4560 } 4561 4562 return SPDK_POLLER_BUSY; 4563 } 4564 4565 static int 4566 bdev_nvme_hotplug(void *arg) 4567 { 4568 struct spdk_nvme_transport_id trid_pcie; 4569 4570 if (g_hotplug_probe_ctx) { 4571 return SPDK_POLLER_BUSY; 4572 } 4573 4574 memset(&trid_pcie, 0, sizeof(trid_pcie)); 4575 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 4576 4577 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 4578 hotplug_probe_cb, attach_cb, NULL); 4579 4580 if (g_hotplug_probe_ctx) { 4581 assert(g_hotplug_probe_poller == NULL); 4582 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 4583 } 4584 4585 return SPDK_POLLER_BUSY; 4586 } 4587 4588 void 4589 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 4590 { 4591 *opts = g_opts; 4592 } 4593 4594 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4595 uint32_t reconnect_delay_sec, 4596 uint32_t fast_io_fail_timeout_sec); 4597 4598 static int 4599 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 4600 { 4601 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 4602 /* Can't set timeout_admin_us without also setting timeout_us */ 4603 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 4604 return -EINVAL; 4605 } 4606 4607 if (opts->bdev_retry_count < -1) { 4608 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 4609 return -EINVAL; 4610 } 4611 4612 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 4613 opts->reconnect_delay_sec, 4614 opts->fast_io_fail_timeout_sec)) { 4615 return -EINVAL; 4616 } 4617 4618 return 0; 4619 } 4620 4621 int 4622 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 4623 { 4624 int ret; 4625 4626 ret = bdev_nvme_validate_opts(opts); 4627 if (ret) { 4628 SPDK_WARNLOG("Failed to set nvme opts.\n"); 4629 return ret; 4630 } 4631 4632 if (g_bdev_nvme_init_thread != NULL) { 4633 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4634 return -EPERM; 4635 } 4636 } 4637 4638 if (opts->rdma_srq_size != 0) { 4639 struct spdk_nvme_transport_opts drv_opts; 4640 4641 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 4642 drv_opts.rdma_srq_size = opts->rdma_srq_size; 4643 4644 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 4645 if (ret) { 4646 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 4647 return ret; 4648 } 4649 } 4650 4651 g_opts = *opts; 4652 4653 return 0; 4654 } 4655 4656 struct set_nvme_hotplug_ctx { 4657 uint64_t period_us; 4658 bool enabled; 4659 spdk_msg_fn fn; 4660 void *fn_ctx; 4661 }; 4662 4663 static void 4664 set_nvme_hotplug_period_cb(void *_ctx) 4665 { 4666 struct set_nvme_hotplug_ctx *ctx = _ctx; 4667 4668 spdk_poller_unregister(&g_hotplug_poller); 4669 if (ctx->enabled) { 4670 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 4671 } 4672 4673 g_nvme_hotplug_poll_period_us = 
ctx->period_us; 4674 g_nvme_hotplug_enabled = ctx->enabled; 4675 if (ctx->fn) { 4676 ctx->fn(ctx->fn_ctx); 4677 } 4678 4679 free(ctx); 4680 } 4681 4682 int 4683 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 4684 { 4685 struct set_nvme_hotplug_ctx *ctx; 4686 4687 if (enabled == true && !spdk_process_is_primary()) { 4688 return -EPERM; 4689 } 4690 4691 ctx = calloc(1, sizeof(*ctx)); 4692 if (ctx == NULL) { 4693 return -ENOMEM; 4694 } 4695 4696 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 4697 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 4698 ctx->enabled = enabled; 4699 ctx->fn = cb; 4700 ctx->fn_ctx = cb_ctx; 4701 4702 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 4703 return 0; 4704 } 4705 4706 static void 4707 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 4708 struct nvme_async_probe_ctx *ctx) 4709 { 4710 struct nvme_ns *nvme_ns; 4711 struct nvme_bdev *nvme_bdev; 4712 size_t j; 4713 4714 assert(nvme_ctrlr != NULL); 4715 4716 if (ctx->names == NULL) { 4717 populate_namespaces_cb(ctx, 0, 0); 4718 return; 4719 } 4720 4721 /* 4722 * Report the new bdevs that were created in this call. 4723 * There can be more than one bdev per NVMe controller. 4724 */ 4725 j = 0; 4726 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4727 while (nvme_ns != NULL) { 4728 nvme_bdev = nvme_ns->bdev; 4729 if (j < ctx->count) { 4730 ctx->names[j] = nvme_bdev->disk.name; 4731 j++; 4732 } else { 4733 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 4734 ctx->count); 4735 populate_namespaces_cb(ctx, 0, -ERANGE); 4736 return; 4737 } 4738 4739 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4740 } 4741 4742 populate_namespaces_cb(ctx, j, 0); 4743 } 4744 4745 static int 4746 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4747 struct spdk_nvme_ctrlr *new_ctrlr, 4748 struct spdk_nvme_transport_id *trid) 4749 { 4750 struct nvme_path_id *tmp_trid; 4751 4752 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4753 SPDK_ERRLOG("PCIe failover is not supported.\n"); 4754 return -ENOTSUP; 4755 } 4756 4757 /* Currently we only support failover to the same transport type. */ 4758 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 4759 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 4760 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 4761 spdk_nvme_transport_id_trtype_str(trid->trtype)); 4762 return -EINVAL; 4763 } 4764 4765 4766 /* Currently we only support failover to the same NQN. */ 4767 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 4768 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 4769 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 4770 return -EINVAL; 4771 } 4772 4773 /* Skip all the other checks if we've already registered this path. 
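	 * spdk_nvme_transport_id_compare() only treats two paths as the same when trtype,
	 * adrfam, traddr, trsvcid, and subnqn all match.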
*/ 4774 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4775 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 4776 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 4777 trid->subnqn); 4778 return -EEXIST; 4779 } 4780 } 4781 4782 return 0; 4783 } 4784 4785 static int 4786 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 4787 struct spdk_nvme_ctrlr *new_ctrlr) 4788 { 4789 struct nvme_ns *nvme_ns; 4790 struct spdk_nvme_ns *new_ns; 4791 4792 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4793 while (nvme_ns != NULL) { 4794 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 4795 assert(new_ns != NULL); 4796 4797 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 4798 return -EINVAL; 4799 } 4800 4801 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4802 } 4803 4804 return 0; 4805 } 4806 4807 static int 4808 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4809 struct spdk_nvme_transport_id *trid) 4810 { 4811 struct nvme_path_id *new_trid, *tmp_trid; 4812 4813 new_trid = calloc(1, sizeof(*new_trid)); 4814 if (new_trid == NULL) { 4815 return -ENOMEM; 4816 } 4817 new_trid->trid = *trid; 4818 new_trid->is_failed = false; 4819 4820 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4821 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 4822 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 4823 return 0; 4824 } 4825 } 4826 4827 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 4828 return 0; 4829 } 4830 4831 /* This is the case that a secondary path is added to an existing 4832 * nvme_ctrlr for failover. After checking if it can access the same 4833 * namespaces as the primary path, it is disconnected until failover occurs. 4834 */ 4835 static int 4836 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4837 struct spdk_nvme_ctrlr *new_ctrlr, 4838 struct spdk_nvme_transport_id *trid) 4839 { 4840 int rc; 4841 4842 assert(nvme_ctrlr != NULL); 4843 4844 pthread_mutex_lock(&nvme_ctrlr->mutex); 4845 4846 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 4847 if (rc != 0) { 4848 goto exit; 4849 } 4850 4851 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 4852 if (rc != 0) { 4853 goto exit; 4854 } 4855 4856 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 4857 4858 exit: 4859 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4860 4861 spdk_nvme_detach(new_ctrlr); 4862 4863 return rc; 4864 } 4865 4866 static void 4867 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4868 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 4869 { 4870 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4871 struct nvme_async_probe_ctx *ctx; 4872 int rc; 4873 4874 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4875 ctx->ctrlr_attached = true; 4876 4877 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 4878 if (rc != 0) { 4879 populate_namespaces_cb(ctx, 0, rc); 4880 } 4881 } 4882 4883 static void 4884 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4885 struct spdk_nvme_ctrlr *ctrlr, 4886 const struct spdk_nvme_ctrlr_opts *opts) 4887 { 4888 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4889 struct nvme_ctrlr *nvme_ctrlr; 4890 struct nvme_async_probe_ctx *ctx; 4891 int rc; 4892 4893 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4894 ctx->ctrlr_attached = true; 4895 4896 nvme_ctrlr = 
nvme_ctrlr_get_by_name(ctx->base_name); 4897 if (nvme_ctrlr) { 4898 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 4899 } else { 4900 rc = -ENODEV; 4901 } 4902 4903 populate_namespaces_cb(ctx, 0, rc); 4904 } 4905 4906 static int 4907 bdev_nvme_async_poll(void *arg) 4908 { 4909 struct nvme_async_probe_ctx *ctx = arg; 4910 int rc; 4911 4912 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 4913 if (spdk_unlikely(rc != -EAGAIN)) { 4914 ctx->probe_done = true; 4915 spdk_poller_unregister(&ctx->poller); 4916 if (!ctx->ctrlr_attached) { 4917 /* The probe is done, but no controller was attached. 4918 * That means we had a failure, so report -EIO back to 4919 * the caller (usually the RPC). populate_namespaces_cb() 4920 * will take care of freeing the nvme_async_probe_ctx. 4921 */ 4922 populate_namespaces_cb(ctx, 0, -EIO); 4923 } else if (ctx->namespaces_populated) { 4924 /* The namespaces for the attached controller were all 4925 * populated and the response was already sent to the 4926 * caller (usually the RPC). So free the context here. 4927 */ 4928 free(ctx); 4929 } 4930 } 4931 4932 return SPDK_POLLER_BUSY; 4933 } 4934 4935 static bool 4936 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4937 uint32_t reconnect_delay_sec, 4938 uint32_t fast_io_fail_timeout_sec) 4939 { 4940 if (ctrlr_loss_timeout_sec < -1) { 4941 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 4942 return false; 4943 } else if (ctrlr_loss_timeout_sec == -1) { 4944 if (reconnect_delay_sec == 0) { 4945 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 4946 return false; 4947 } else if (fast_io_fail_timeout_sec != 0 && 4948 fast_io_fail_timeout_sec < reconnect_delay_sec) { 4949 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 4950 return false; 4951 } 4952 } else if (ctrlr_loss_timeout_sec != 0) { 4953 if (reconnect_delay_sec == 0) { 4954 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 4955 return false; 4956 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 4957 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 4958 return false; 4959 } else if (fast_io_fail_timeout_sec != 0) { 4960 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 4961 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 4962 return false; 4963 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 4964 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 4965 return false; 4966 } 4967 } 4968 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 4969 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 4970 return false; 4971 } 4972 4973 return true; 4974 } 4975 4976 int 4977 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 4978 const char *base_name, 4979 const char **names, 4980 uint32_t count, 4981 spdk_bdev_create_nvme_fn cb_fn, 4982 void *cb_ctx, 4983 struct spdk_nvme_ctrlr_opts *drv_opts, 4984 struct nvme_ctrlr_opts *bdev_opts, 4985 bool multipath) 4986 { 4987 struct nvme_probe_skip_entry *entry, *tmp; 4988 struct nvme_async_probe_ctx *ctx; 4989 spdk_nvme_attach_cb attach_cb; 4990 4991 /* TODO expand this check to include both the host and target TRIDs. 4992 * Only if both are the same should we fail. 
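	 * Today nvme_ctrlr_get() matches on the target TRID alone, so a duplicate target
	 * address is rejected regardless of the host TRID.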
4993 */ 4994 if (nvme_ctrlr_get(trid) != NULL) { 4995 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 4996 return -EEXIST; 4997 } 4998 4999 if (bdev_opts != NULL && 5000 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5001 bdev_opts->reconnect_delay_sec, 5002 bdev_opts->fast_io_fail_timeout_sec)) { 5003 return -EINVAL; 5004 } 5005 5006 ctx = calloc(1, sizeof(*ctx)); 5007 if (!ctx) { 5008 return -ENOMEM; 5009 } 5010 ctx->base_name = base_name; 5011 ctx->names = names; 5012 ctx->count = count; 5013 ctx->cb_fn = cb_fn; 5014 ctx->cb_ctx = cb_ctx; 5015 ctx->trid = *trid; 5016 5017 if (bdev_opts) { 5018 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5019 } else { 5020 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5021 } 5022 5023 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5024 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5025 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5026 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5027 free(entry); 5028 break; 5029 } 5030 } 5031 } 5032 5033 if (drv_opts) { 5034 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5035 } else { 5036 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5037 } 5038 5039 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5040 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5041 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5042 ctx->drv_opts.disable_read_ana_log_page = true; 5043 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5044 5045 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5046 attach_cb = connect_attach_cb; 5047 } else { 5048 attach_cb = connect_set_failover_cb; 5049 } 5050 5051 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5052 if (ctx->probe_ctx == NULL) { 5053 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5054 free(ctx); 5055 return -ENODEV; 5056 } 5057 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5058 5059 return 0; 5060 } 5061 5062 int 5063 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 5064 { 5065 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5066 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 5067 struct nvme_path_id *p, *t; 5068 int rc = -ENXIO; 5069 5070 if (name == NULL || path_id == NULL) { 5071 return -EINVAL; 5072 } 5073 5074 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5075 if (nbdev_ctrlr == NULL) { 5076 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 5077 return -ENODEV; 5078 } 5079 5080 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 5081 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 5082 if (path_id->trid.trtype != 0) { 5083 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5084 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5085 continue; 5086 } 5087 } else { 5088 if (path_id->trid.trtype != p->trid.trtype) { 5089 continue; 5090 } 5091 } 5092 } 5093 5094 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 5095 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 5096 continue; 5097 } 5098 } 5099 5100 if (path_id->trid.adrfam != 0) { 5101 if (path_id->trid.adrfam != p->trid.adrfam) { 5102 continue; 5103 } 5104 } 5105 5106 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 5107 if 
(strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 5108 continue; 5109 } 5110 } 5111 5112 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 5113 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 5114 continue; 5115 } 5116 } 5117 5118 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 5119 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 5120 continue; 5121 } 5122 } 5123 5124 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 5125 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 5126 continue; 5127 } 5128 } 5129 5130 /* If we made it here, then this path is a match! Now we need to remove it. */ 5131 if (p == nvme_ctrlr->active_path_id) { 5132 /* This is the active path in use right now. The active path is always the first in the list. */ 5133 5134 if (!TAILQ_NEXT(p, link)) { 5135 /* The current path is the only path. */ 5136 rc = _bdev_nvme_delete(nvme_ctrlr, false); 5137 } else { 5138 /* There is an alternative path. */ 5139 rc = bdev_nvme_failover(nvme_ctrlr, true); 5140 } 5141 } else { 5142 /* We are not using the specified path. */ 5143 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 5144 free(p); 5145 rc = 0; 5146 } 5147 5148 if (rc < 0 && rc != -ENXIO) { 5149 return rc; 5150 } 5151 5152 5153 } 5154 } 5155 5156 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 5157 return rc; 5158 } 5159 5160 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 5161 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 5162 5163 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 5164 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 5165 5166 struct discovery_entry_ctx { 5167 char name[128]; 5168 struct spdk_nvme_transport_id trid; 5169 struct spdk_nvme_ctrlr_opts drv_opts; 5170 struct spdk_nvmf_discovery_log_page_entry entry; 5171 TAILQ_ENTRY(discovery_entry_ctx) tailq; 5172 struct discovery_ctx *ctx; 5173 }; 5174 5175 struct discovery_ctx { 5176 char *name; 5177 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 5178 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 5179 void *cb_ctx; 5180 struct spdk_nvme_probe_ctx *probe_ctx; 5181 struct spdk_nvme_detach_ctx *detach_ctx; 5182 struct spdk_nvme_ctrlr *ctrlr; 5183 struct spdk_nvme_transport_id trid; 5184 struct discovery_entry_ctx *entry_ctx_in_use; 5185 struct spdk_poller *poller; 5186 struct spdk_nvme_ctrlr_opts drv_opts; 5187 struct nvme_ctrlr_opts bdev_opts; 5188 struct spdk_nvmf_discovery_log_page *log_page; 5189 TAILQ_ENTRY(discovery_ctx) tailq; 5190 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 5191 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 5192 int rc; 5193 bool wait_for_attach; 5194 uint64_t timeout_ticks; 5195 /* Denotes that the discovery service is being started. We're waiting 5196 * for the initial connection to the discovery controller to be 5197 * established and attach discovered NVM ctrlrs. 5198 */ 5199 bool initializing; 5200 /* Denotes if a discovery is currently in progress for this context. 5201 * That includes connecting to newly discovered subsystems. Used to 5202 * ensure we do not start a new discovery until an existing one is 5203 * complete. 5204 */ 5205 bool in_progress; 5206 5207 /* Denotes if another discovery is needed after the one in progress 5208 * completes. Set when we receive an AER completion while a discovery 5209 * is already in progress. 
5210 */ 5211 bool pending; 5212 5213 /* Signal to the discovery context poller that it should stop the 5214 * discovery service, including detaching from the current discovery 5215 * controller. 5216 */ 5217 bool stop; 5218 5219 struct spdk_thread *calling_thread; 5220 uint32_t index; 5221 uint32_t attach_in_progress; 5222 char *hostnqn; 5223 5224 /* Denotes if the discovery service was started by the mdns discovery. 5225 */ 5226 bool from_mdns_discovery_service; 5227 }; 5228 5229 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 5230 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 5231 5232 static void get_discovery_log_page(struct discovery_ctx *ctx); 5233 5234 static void 5235 free_discovery_ctx(struct discovery_ctx *ctx) 5236 { 5237 free(ctx->log_page); 5238 free(ctx->hostnqn); 5239 free(ctx->name); 5240 free(ctx); 5241 } 5242 5243 static void 5244 discovery_complete(struct discovery_ctx *ctx) 5245 { 5246 ctx->initializing = false; 5247 ctx->in_progress = false; 5248 if (ctx->pending) { 5249 ctx->pending = false; 5250 get_discovery_log_page(ctx); 5251 } 5252 } 5253 5254 static void 5255 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 5256 struct spdk_nvmf_discovery_log_page_entry *entry) 5257 { 5258 char *space; 5259 5260 trid->trtype = entry->trtype; 5261 trid->adrfam = entry->adrfam; 5262 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 5263 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 5264 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 5265 5266 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 5267 * But the log page entries typically pad them with spaces, not zeroes. 5268 * So add a NULL terminator to each of these fields at the appropriate 5269 * location. 
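	 * The first space found in each field marks the end of the copied value.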
5270 */ 5271 space = strchr(trid->traddr, ' '); 5272 if (space) { 5273 *space = 0; 5274 } 5275 space = strchr(trid->trsvcid, ' '); 5276 if (space) { 5277 *space = 0; 5278 } 5279 space = strchr(trid->subnqn, ' '); 5280 if (space) { 5281 *space = 0; 5282 } 5283 } 5284 5285 static void 5286 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5287 { 5288 ctx->stop = true; 5289 ctx->stop_cb_fn = cb_fn; 5290 ctx->cb_ctx = cb_ctx; 5291 5292 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 5293 struct discovery_entry_ctx *entry_ctx; 5294 struct nvme_path_id path = {}; 5295 5296 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 5297 path.trid = entry_ctx->trid; 5298 bdev_nvme_delete(entry_ctx->name, &path); 5299 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5300 free(entry_ctx); 5301 } 5302 5303 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 5304 struct discovery_entry_ctx *entry_ctx; 5305 5306 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5307 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5308 free(entry_ctx); 5309 } 5310 5311 free(ctx->entry_ctx_in_use); 5312 ctx->entry_ctx_in_use = NULL; 5313 } 5314 5315 static void 5316 discovery_remove_controllers(struct discovery_ctx *ctx) 5317 { 5318 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 5319 struct discovery_entry_ctx *entry_ctx, *tmp; 5320 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5321 struct spdk_nvme_transport_id old_trid; 5322 uint64_t numrec, i; 5323 bool found; 5324 5325 numrec = from_le64(&log_page->numrec); 5326 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 5327 found = false; 5328 old_entry = &entry_ctx->entry; 5329 build_trid_from_log_page_entry(&old_trid, old_entry); 5330 for (i = 0; i < numrec; i++) { 5331 new_entry = &log_page->entries[i]; 5332 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 5333 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 5334 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5335 found = true; 5336 break; 5337 } 5338 } 5339 if (!found) { 5340 struct nvme_path_id path = {}; 5341 5342 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 5343 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5344 5345 path.trid = entry_ctx->trid; 5346 bdev_nvme_delete(entry_ctx->name, &path); 5347 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5348 free(entry_ctx); 5349 } 5350 } 5351 free(log_page); 5352 ctx->log_page = NULL; 5353 discovery_complete(ctx); 5354 } 5355 5356 static void 5357 complete_discovery_start(struct discovery_ctx *ctx, int status) 5358 { 5359 ctx->timeout_ticks = 0; 5360 ctx->rc = status; 5361 if (ctx->start_cb_fn) { 5362 ctx->start_cb_fn(ctx->cb_ctx, status); 5363 ctx->start_cb_fn = NULL; 5364 ctx->cb_ctx = NULL; 5365 } 5366 } 5367 5368 static void 5369 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 5370 { 5371 struct discovery_entry_ctx *entry_ctx = cb_ctx; 5372 struct discovery_ctx *ctx = entry_ctx->ctx; 5373 5374 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 5375 ctx->attach_in_progress--; 5376 if (ctx->attach_in_progress == 0) { 5377 complete_discovery_start(ctx, ctx->rc); 5378 if (ctx->initializing && ctx->rc != 0) { 5379 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 5380 stop_discovery(ctx, NULL, ctx->cb_ctx); 5381 } else { 5382 discovery_remove_controllers(ctx); 5383 } 5384 } 5385 } 5386 5387 static struct discovery_entry_ctx * 5388 create_discovery_entry_ctx(struct discovery_ctx 
*ctx, struct spdk_nvme_transport_id *trid) 5389 { 5390 struct discovery_entry_ctx *new_ctx; 5391 5392 new_ctx = calloc(1, sizeof(*new_ctx)); 5393 if (new_ctx == NULL) { 5394 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5395 return NULL; 5396 } 5397 5398 new_ctx->ctx = ctx; 5399 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 5400 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5401 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5402 return new_ctx; 5403 } 5404 5405 static void 5406 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 5407 struct spdk_nvmf_discovery_log_page *log_page) 5408 { 5409 struct discovery_ctx *ctx = cb_arg; 5410 struct discovery_entry_ctx *entry_ctx, *tmp; 5411 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5412 uint64_t numrec, i; 5413 bool found; 5414 5415 if (rc || spdk_nvme_cpl_is_error(cpl)) { 5416 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5417 return; 5418 } 5419 5420 ctx->log_page = log_page; 5421 assert(ctx->attach_in_progress == 0); 5422 numrec = from_le64(&log_page->numrec); 5423 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 5424 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5425 free(entry_ctx); 5426 } 5427 for (i = 0; i < numrec; i++) { 5428 found = false; 5429 new_entry = &log_page->entries[i]; 5430 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 5431 struct discovery_entry_ctx *new_ctx; 5432 struct spdk_nvme_transport_id trid = {}; 5433 5434 build_trid_from_log_page_entry(&trid, new_entry); 5435 new_ctx = create_discovery_entry_ctx(ctx, &trid); 5436 if (new_ctx == NULL) { 5437 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5438 break; 5439 } 5440 5441 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 5442 continue; 5443 } 5444 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 5445 old_entry = &entry_ctx->entry; 5446 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 5447 found = true; 5448 break; 5449 } 5450 } 5451 if (!found) { 5452 struct discovery_entry_ctx *subnqn_ctx, *new_ctx; 5453 5454 TAILQ_FOREACH(subnqn_ctx, &ctx->nvm_entry_ctxs, tailq) { 5455 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 5456 sizeof(new_entry->subnqn))) { 5457 break; 5458 } 5459 } 5460 5461 new_ctx = calloc(1, sizeof(*new_ctx)); 5462 if (new_ctx == NULL) { 5463 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5464 break; 5465 } 5466 5467 new_ctx->ctx = ctx; 5468 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 5469 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 5470 if (subnqn_ctx) { 5471 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 5472 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 5473 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5474 new_ctx->name); 5475 } else { 5476 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 5477 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 5478 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5479 new_ctx->name); 5480 } 5481 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5482 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5483 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 5484 discovery_attach_controller_done, new_ctx, 5485 
&new_ctx->drv_opts, &ctx->bdev_opts, true); 5486 if (rc == 0) { 5487 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 5488 ctx->attach_in_progress++; 5489 } else { 5490 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 5491 } 5492 } 5493 } 5494 5495 if (ctx->attach_in_progress == 0) { 5496 discovery_remove_controllers(ctx); 5497 } 5498 } 5499 5500 static void 5501 get_discovery_log_page(struct discovery_ctx *ctx) 5502 { 5503 int rc; 5504 5505 assert(ctx->in_progress == false); 5506 ctx->in_progress = true; 5507 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 5508 if (rc != 0) { 5509 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5510 } 5511 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 5512 } 5513 5514 static void 5515 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5516 { 5517 struct discovery_ctx *ctx = arg; 5518 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 5519 5520 if (spdk_nvme_cpl_is_error(cpl)) { 5521 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 5522 return; 5523 } 5524 5525 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 5526 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 5527 return; 5528 } 5529 5530 DISCOVERY_INFOLOG(ctx, "got aer\n"); 5531 if (ctx->in_progress) { 5532 ctx->pending = true; 5533 return; 5534 } 5535 5536 get_discovery_log_page(ctx); 5537 } 5538 5539 static void 5540 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5541 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5542 { 5543 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5544 struct discovery_ctx *ctx; 5545 5546 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 5547 5548 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 5549 ctx->probe_ctx = NULL; 5550 ctx->ctrlr = ctrlr; 5551 5552 if (ctx->rc != 0) { 5553 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 5554 ctx->rc); 5555 return; 5556 } 5557 5558 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 5559 } 5560 5561 static int 5562 discovery_poller(void *arg) 5563 { 5564 struct discovery_ctx *ctx = arg; 5565 struct spdk_nvme_transport_id *trid; 5566 int rc; 5567 5568 if (ctx->detach_ctx) { 5569 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 5570 if (rc != -EAGAIN) { 5571 ctx->detach_ctx = NULL; 5572 ctx->ctrlr = NULL; 5573 } 5574 } else if (ctx->stop) { 5575 if (ctx->ctrlr != NULL) { 5576 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5577 if (rc == 0) { 5578 return SPDK_POLLER_BUSY; 5579 } 5580 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5581 } 5582 spdk_poller_unregister(&ctx->poller); 5583 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5584 assert(ctx->start_cb_fn == NULL); 5585 if (ctx->stop_cb_fn != NULL) { 5586 ctx->stop_cb_fn(ctx->cb_ctx); 5587 } 5588 free_discovery_ctx(ctx); 5589 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 5590 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5591 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5592 assert(ctx->initializing); 5593 spdk_poller_unregister(&ctx->poller); 5594 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5595 complete_discovery_start(ctx, -ETIMEDOUT); 5596 stop_discovery(ctx, NULL, NULL); 5597 free_discovery_ctx(ctx); 5598 return SPDK_POLLER_BUSY; 5599 } 5600 5601 assert(ctx->entry_ctx_in_use == NULL); 5602 ctx->entry_ctx_in_use = 
TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5603 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5604 trid = &ctx->entry_ctx_in_use->trid; 5605 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 5606 if (ctx->probe_ctx) { 5607 spdk_poller_unregister(&ctx->poller); 5608 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 5609 } else { 5610 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 5611 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5612 ctx->entry_ctx_in_use = NULL; 5613 } 5614 } else if (ctx->probe_ctx) { 5615 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5616 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5617 complete_discovery_start(ctx, -ETIMEDOUT); 5618 return SPDK_POLLER_BUSY; 5619 } 5620 5621 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5622 if (rc != -EAGAIN) { 5623 if (ctx->rc != 0) { 5624 assert(ctx->initializing); 5625 stop_discovery(ctx, NULL, ctx->cb_ctx); 5626 } else { 5627 assert(rc == 0); 5628 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 5629 ctx->rc = rc; 5630 get_discovery_log_page(ctx); 5631 } 5632 } 5633 } else { 5634 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5635 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 5636 complete_discovery_start(ctx, -ETIMEDOUT); 5637 /* We need to wait until all NVM ctrlrs are attached before we stop the 5638 * discovery service to make sure we don't detach a ctrlr that is still 5639 * being attached. 5640 */ 5641 if (ctx->attach_in_progress == 0) { 5642 stop_discovery(ctx, NULL, ctx->cb_ctx); 5643 return SPDK_POLLER_BUSY; 5644 } 5645 } 5646 5647 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 5648 if (rc < 0) { 5649 spdk_poller_unregister(&ctx->poller); 5650 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5651 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5652 ctx->entry_ctx_in_use = NULL; 5653 5654 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5655 if (rc != 0) { 5656 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5657 ctx->ctrlr = NULL; 5658 } 5659 } 5660 } 5661 5662 return SPDK_POLLER_BUSY; 5663 } 5664 5665 static void 5666 start_discovery_poller(void *arg) 5667 { 5668 struct discovery_ctx *ctx = arg; 5669 5670 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 5671 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5672 } 5673 5674 int 5675 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 5676 const char *base_name, 5677 struct spdk_nvme_ctrlr_opts *drv_opts, 5678 struct nvme_ctrlr_opts *bdev_opts, 5679 uint64_t attach_timeout, 5680 bool from_mdns, 5681 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 5682 { 5683 struct discovery_ctx *ctx; 5684 struct discovery_entry_ctx *discovery_entry_ctx; 5685 5686 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 5687 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5688 if (strcmp(ctx->name, base_name) == 0) { 5689 return -EEXIST; 5690 } 5691 5692 if (ctx->entry_ctx_in_use != NULL) { 5693 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 5694 return -EEXIST; 5695 } 5696 } 5697 5698 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 5699 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 5700 return -EEXIST; 5701 } 5702 } 5703 } 5704 5705 ctx = 
calloc(1, sizeof(*ctx)); 5706 if (ctx == NULL) { 5707 return -ENOMEM; 5708 } 5709 5710 ctx->name = strdup(base_name); 5711 if (ctx->name == NULL) { 5712 free_discovery_ctx(ctx); 5713 return -ENOMEM; 5714 } 5715 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5716 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5717 ctx->from_mdns_discovery_service = from_mdns; 5718 ctx->bdev_opts.from_discovery_service = true; 5719 ctx->calling_thread = spdk_get_thread(); 5720 ctx->start_cb_fn = cb_fn; 5721 ctx->cb_ctx = cb_ctx; 5722 ctx->initializing = true; 5723 if (ctx->start_cb_fn) { 5724 /* We can use this when dumping json to denote if this RPC parameter 5725 * was specified or not. 5726 */ 5727 ctx->wait_for_attach = true; 5728 } 5729 if (attach_timeout != 0) { 5730 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 5731 spdk_get_ticks_hz() / 1000ull; 5732 } 5733 TAILQ_INIT(&ctx->nvm_entry_ctxs); 5734 TAILQ_INIT(&ctx->discovery_entry_ctxs); 5735 memcpy(&ctx->trid, trid, sizeof(*trid)); 5736 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 5737 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 5738 if (ctx->hostnqn == NULL) { 5739 free_discovery_ctx(ctx); 5740 return -ENOMEM; 5741 } 5742 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 5743 if (discovery_entry_ctx == NULL) { 5744 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5745 free_discovery_ctx(ctx); 5746 return -ENOMEM; 5747 } 5748 5749 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 5750 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 5751 return 0; 5752 } 5753 5754 int 5755 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5756 { 5757 struct discovery_ctx *ctx; 5758 5759 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5760 if (strcmp(name, ctx->name) == 0) { 5761 if (ctx->stop) { 5762 return -EALREADY; 5763 } 5764 /* If we're still starting the discovery service and ->rc is non-zero, we're 5765 * going to stop it as soon as we can 5766 */ 5767 if (ctx->initializing && ctx->rc != 0) { 5768 return -EALREADY; 5769 } 5770 stop_discovery(ctx, cb_fn, cb_ctx); 5771 return 0; 5772 } 5773 } 5774 5775 return -ENOENT; 5776 } 5777 5778 static int 5779 bdev_nvme_library_init(void) 5780 { 5781 g_bdev_nvme_init_thread = spdk_get_thread(); 5782 5783 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 5784 bdev_nvme_destroy_poll_group_cb, 5785 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 5786 5787 return 0; 5788 } 5789 5790 static void 5791 bdev_nvme_fini_destruct_ctrlrs(void) 5792 { 5793 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5794 struct nvme_ctrlr *nvme_ctrlr; 5795 5796 pthread_mutex_lock(&g_bdev_nvme_mutex); 5797 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 5798 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5799 pthread_mutex_lock(&nvme_ctrlr->mutex); 5800 if (nvme_ctrlr->destruct) { 5801 /* This controller's destruction was already started 5802 * before the application started shutting down 5803 */ 5804 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5805 continue; 5806 } 5807 nvme_ctrlr->destruct = true; 5808 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5809 5810 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 5811 nvme_ctrlr); 5812 } 5813 } 5814 5815 g_bdev_nvme_module_finish = true; 5816 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5817 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5818 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 
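		/* No controllers remain, so module shutdown can finish immediately. */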
5819 spdk_bdev_module_fini_done(); 5820 return; 5821 } 5822 5823 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5824 } 5825 5826 static void 5827 check_discovery_fini(void *arg) 5828 { 5829 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5830 bdev_nvme_fini_destruct_ctrlrs(); 5831 } 5832 } 5833 5834 static void 5835 bdev_nvme_library_fini(void) 5836 { 5837 struct nvme_probe_skip_entry *entry, *entry_tmp; 5838 struct discovery_ctx *ctx; 5839 5840 spdk_poller_unregister(&g_hotplug_poller); 5841 free(g_hotplug_probe_ctx); 5842 g_hotplug_probe_ctx = NULL; 5843 5844 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 5845 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5846 free(entry); 5847 } 5848 5849 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 5850 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5851 bdev_nvme_fini_destruct_ctrlrs(); 5852 } else { 5853 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5854 stop_discovery(ctx, check_discovery_fini, NULL); 5855 } 5856 } 5857 } 5858 5859 static void 5860 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 5861 { 5862 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5863 struct spdk_bdev *bdev = bdev_io->bdev; 5864 struct spdk_dif_ctx dif_ctx; 5865 struct spdk_dif_error err_blk = {}; 5866 int rc; 5867 5868 rc = spdk_dif_ctx_init(&dif_ctx, 5869 bdev->blocklen, bdev->md_len, bdev->md_interleave, 5870 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 5871 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 5872 if (rc != 0) { 5873 SPDK_ERRLOG("Initialization of DIF context failed\n"); 5874 return; 5875 } 5876 5877 if (bdev->md_interleave) { 5878 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5879 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5880 } else { 5881 struct iovec md_iov = { 5882 .iov_base = bdev_io->u.bdev.md_buf, 5883 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 5884 }; 5885 5886 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5887 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5888 } 5889 5890 if (rc != 0) { 5891 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 5892 err_blk.err_type, err_blk.err_offset); 5893 } else { 5894 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 5895 } 5896 } 5897 5898 static void 5899 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5900 { 5901 struct nvme_bdev_io *bio = ref; 5902 5903 if (spdk_nvme_cpl_is_success(cpl)) { 5904 /* Run PI verification for read data buffer. */ 5905 bdev_nvme_verify_pi_error(bio); 5906 } 5907 5908 /* Return original completion status */ 5909 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5910 } 5911 5912 static void 5913 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5914 { 5915 struct nvme_bdev_io *bio = ref; 5916 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5917 int ret; 5918 5919 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 5920 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 5921 cpl->status.sct, cpl->status.sc); 5922 5923 /* Save completion status to use after verifying PI error. */ 5924 bio->cpl = *cpl; 5925 5926 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 5927 /* Read without PI checking to verify PI error. 
*/ 5928 ret = bdev_nvme_no_pi_readv(bio, 5929 bdev_io->u.bdev.iovs, 5930 bdev_io->u.bdev.iovcnt, 5931 bdev_io->u.bdev.md_buf, 5932 bdev_io->u.bdev.num_blocks, 5933 bdev_io->u.bdev.offset_blocks); 5934 if (ret == 0) { 5935 return; 5936 } 5937 } 5938 } 5939 5940 bdev_nvme_io_complete_nvme_status(bio, cpl); 5941 } 5942 5943 static void 5944 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5945 { 5946 struct nvme_bdev_io *bio = ref; 5947 5948 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5949 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 5950 cpl->status.sct, cpl->status.sc); 5951 /* Run PI verification for write data buffer if PI error is detected. */ 5952 bdev_nvme_verify_pi_error(bio); 5953 } 5954 5955 bdev_nvme_io_complete_nvme_status(bio, cpl); 5956 } 5957 5958 static void 5959 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5960 { 5961 struct nvme_bdev_io *bio = ref; 5962 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5963 5964 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 5965 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 5966 */ 5967 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 5968 5969 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5970 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 5971 cpl->status.sct, cpl->status.sc); 5972 /* Run PI verification for zone append data buffer if PI error is detected. */ 5973 bdev_nvme_verify_pi_error(bio); 5974 } 5975 5976 bdev_nvme_io_complete_nvme_status(bio, cpl); 5977 } 5978 5979 static void 5980 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5981 { 5982 struct nvme_bdev_io *bio = ref; 5983 5984 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5985 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 5986 cpl->status.sct, cpl->status.sc); 5987 /* Run PI verification for compare data buffer if PI error is detected. */ 5988 bdev_nvme_verify_pi_error(bio); 5989 } 5990 5991 bdev_nvme_io_complete_nvme_status(bio, cpl); 5992 } 5993 5994 static void 5995 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5996 { 5997 struct nvme_bdev_io *bio = ref; 5998 5999 /* Compare operation completion */ 6000 if (!bio->first_fused_completed) { 6001 /* Save compare result for write callback */ 6002 bio->cpl = *cpl; 6003 bio->first_fused_completed = true; 6004 return; 6005 } 6006 6007 /* Write operation completion */ 6008 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 6009 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 6010 * complete the IO with the compare operation's status. 
6011 */ 6012 if (!spdk_nvme_cpl_is_error(cpl)) { 6013 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 6014 } 6015 6016 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6017 } else { 6018 bdev_nvme_io_complete_nvme_status(bio, cpl); 6019 } 6020 } 6021 6022 static void 6023 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 6024 { 6025 struct nvme_bdev_io *bio = ref; 6026 6027 bdev_nvme_io_complete_nvme_status(bio, cpl); 6028 } 6029 6030 static int 6031 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 6032 { 6033 switch (desc->zt) { 6034 case SPDK_NVME_ZONE_TYPE_SEQWR: 6035 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 6036 break; 6037 default: 6038 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 6039 return -EIO; 6040 } 6041 6042 switch (desc->zs) { 6043 case SPDK_NVME_ZONE_STATE_EMPTY: 6044 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 6045 break; 6046 case SPDK_NVME_ZONE_STATE_IOPEN: 6047 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 6048 break; 6049 case SPDK_NVME_ZONE_STATE_EOPEN: 6050 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 6051 break; 6052 case SPDK_NVME_ZONE_STATE_CLOSED: 6053 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 6054 break; 6055 case SPDK_NVME_ZONE_STATE_RONLY: 6056 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 6057 break; 6058 case SPDK_NVME_ZONE_STATE_FULL: 6059 info->state = SPDK_BDEV_ZONE_STATE_FULL; 6060 break; 6061 case SPDK_NVME_ZONE_STATE_OFFLINE: 6062 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 6063 break; 6064 default: 6065 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 6066 return -EIO; 6067 } 6068 6069 info->zone_id = desc->zslba; 6070 info->write_pointer = desc->wp; 6071 info->capacity = desc->zcap; 6072 6073 return 0; 6074 } 6075 6076 static void 6077 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 6078 { 6079 struct nvme_bdev_io *bio = ref; 6080 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6081 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 6082 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 6083 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 6084 uint64_t max_zones_per_buf, i; 6085 uint32_t zone_report_bufsize; 6086 struct spdk_nvme_ns *ns; 6087 struct spdk_nvme_qpair *qpair; 6088 int ret; 6089 6090 if (spdk_nvme_cpl_is_error(cpl)) { 6091 goto out_complete_io_nvme_cpl; 6092 } 6093 6094 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 6095 ret = -ENXIO; 6096 goto out_complete_io_ret; 6097 } 6098 6099 ns = bio->io_path->nvme_ns->ns; 6100 qpair = bio->io_path->qpair->qpair; 6101 6102 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6103 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 6104 sizeof(bio->zone_report_buf->descs[0]); 6105 6106 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 6107 ret = -EINVAL; 6108 goto out_complete_io_ret; 6109 } 6110 6111 if (!bio->zone_report_buf->nr_zones) { 6112 ret = -EINVAL; 6113 goto out_complete_io_ret; 6114 } 6115 6116 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 6117 ret = fill_zone_from_report(&info[bio->handled_zones], 6118 &bio->zone_report_buf->descs[i]); 6119 if (ret) { 6120 goto out_complete_io_ret; 6121 } 6122 bio->handled_zones++; 6123 } 6124 6125 if (bio->handled_zones < zones_to_copy) { 6126 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6127 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 6128 
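		/* Not all requested zones fit in a single report buffer. Reuse the
		 * buffer and issue another Report Zones command starting at the next
		 * unhandled zone; its completion re-enters this callback until
		 * zones_to_copy zones have been filled in.
		 */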
6129 memset(bio->zone_report_buf, 0, zone_report_bufsize); 6130 ret = spdk_nvme_zns_report_zones(ns, qpair, 6131 bio->zone_report_buf, zone_report_bufsize, 6132 slba, SPDK_NVME_ZRA_LIST_ALL, true, 6133 bdev_nvme_get_zone_info_done, bio); 6134 if (!ret) { 6135 return; 6136 } else { 6137 goto out_complete_io_ret; 6138 } 6139 } 6140 6141 out_complete_io_nvme_cpl: 6142 free(bio->zone_report_buf); 6143 bio->zone_report_buf = NULL; 6144 bdev_nvme_io_complete_nvme_status(bio, cpl); 6145 return; 6146 6147 out_complete_io_ret: 6148 free(bio->zone_report_buf); 6149 bio->zone_report_buf = NULL; 6150 bdev_nvme_io_complete(bio, ret); 6151 } 6152 6153 static void 6154 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 6155 { 6156 struct nvme_bdev_io *bio = ref; 6157 6158 bdev_nvme_io_complete_nvme_status(bio, cpl); 6159 } 6160 6161 static void 6162 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 6163 { 6164 struct nvme_bdev_io *bio = ctx; 6165 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6166 const struct spdk_nvme_cpl *cpl = &bio->cpl; 6167 6168 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 6169 6170 __bdev_nvme_io_complete(bdev_io, 0, cpl); 6171 } 6172 6173 static void 6174 bdev_nvme_abort_complete(void *ctx) 6175 { 6176 struct nvme_bdev_io *bio = ctx; 6177 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6178 6179 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 6180 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 6181 } else { 6182 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 6183 } 6184 } 6185 6186 static void 6187 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 6188 { 6189 struct nvme_bdev_io *bio = ref; 6190 6191 bio->cpl = *cpl; 6192 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 6193 } 6194 6195 static void 6196 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 6197 { 6198 struct nvme_bdev_io *bio = ref; 6199 6200 bio->cpl = *cpl; 6201 spdk_thread_send_msg(bio->orig_thread, 6202 bdev_nvme_admin_passthru_complete_nvme_status, bio); 6203 } 6204 6205 static void 6206 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 6207 { 6208 struct nvme_bdev_io *bio = ref; 6209 struct iovec *iov; 6210 6211 bio->iov_offset = sgl_offset; 6212 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 6213 iov = &bio->iovs[bio->iovpos]; 6214 if (bio->iov_offset < iov->iov_len) { 6215 break; 6216 } 6217 6218 bio->iov_offset -= iov->iov_len; 6219 } 6220 } 6221 6222 static int 6223 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 6224 { 6225 struct nvme_bdev_io *bio = ref; 6226 struct iovec *iov; 6227 6228 assert(bio->iovpos < bio->iovcnt); 6229 6230 iov = &bio->iovs[bio->iovpos]; 6231 6232 *address = iov->iov_base; 6233 *length = iov->iov_len; 6234 6235 if (bio->iov_offset) { 6236 assert(bio->iov_offset <= iov->iov_len); 6237 *address += bio->iov_offset; 6238 *length -= bio->iov_offset; 6239 } 6240 6241 bio->iov_offset += *length; 6242 if (bio->iov_offset == iov->iov_len) { 6243 bio->iovpos++; 6244 bio->iov_offset = 0; 6245 } 6246 6247 return 0; 6248 } 6249 6250 static void 6251 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 6252 { 6253 struct nvme_bdev_io *bio = ref; 6254 struct iovec *iov; 6255 6256 bio->fused_iov_offset = sgl_offset; 6257 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 6258 iov = &bio->fused_iovs[bio->fused_iovpos]; 6259 if 
(bio->fused_iov_offset < iov->iov_len) { 6260 break; 6261 } 6262 6263 bio->fused_iov_offset -= iov->iov_len; 6264 } 6265 } 6266 6267 static int 6268 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 6269 { 6270 struct nvme_bdev_io *bio = ref; 6271 struct iovec *iov; 6272 6273 assert(bio->fused_iovpos < bio->fused_iovcnt); 6274 6275 iov = &bio->fused_iovs[bio->fused_iovpos]; 6276 6277 *address = iov->iov_base; 6278 *length = iov->iov_len; 6279 6280 if (bio->fused_iov_offset) { 6281 assert(bio->fused_iov_offset <= iov->iov_len); 6282 *address += bio->fused_iov_offset; 6283 *length -= bio->fused_iov_offset; 6284 } 6285 6286 bio->fused_iov_offset += *length; 6287 if (bio->fused_iov_offset == iov->iov_len) { 6288 bio->fused_iovpos++; 6289 bio->fused_iov_offset = 0; 6290 } 6291 6292 return 0; 6293 } 6294 6295 static int 6296 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6297 void *md, uint64_t lba_count, uint64_t lba) 6298 { 6299 int rc; 6300 6301 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 6302 lba_count, lba); 6303 6304 bio->iovs = iov; 6305 bio->iovcnt = iovcnt; 6306 bio->iovpos = 0; 6307 bio->iov_offset = 0; 6308 6309 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 6310 bio->io_path->qpair->qpair, 6311 lba, lba_count, 6312 bdev_nvme_no_pi_readv_done, bio, 0, 6313 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6314 md, 0, 0); 6315 6316 if (rc != 0 && rc != -ENOMEM) { 6317 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 6318 } 6319 return rc; 6320 } 6321 6322 static int 6323 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6324 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 6325 struct spdk_bdev_ext_io_opts *ext_opts) 6326 { 6327 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6328 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6329 int rc; 6330 6331 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6332 lba_count, lba); 6333 6334 bio->iovs = iov; 6335 bio->iovcnt = iovcnt; 6336 bio->iovpos = 0; 6337 bio->iov_offset = 0; 6338 6339 if (ext_opts) { 6340 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6341 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6342 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6343 bio->ext_opts.io_flags = flags; 6344 bio->ext_opts.metadata = md; 6345 6346 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 6347 bdev_nvme_readv_done, bio, 6348 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6349 &bio->ext_opts); 6350 } else if (iovcnt == 1) { 6351 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 6352 lba_count, 6353 bdev_nvme_readv_done, bio, 6354 flags, 6355 0, 0); 6356 } else { 6357 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 6358 bdev_nvme_readv_done, bio, flags, 6359 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6360 md, 0, 0); 6361 } 6362 6363 if (rc != 0 && rc != -ENOMEM) { 6364 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 6365 } 6366 return rc; 6367 } 6368 6369 static int 6370 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6371 void *md, uint64_t lba_count, uint64_t lba, 6372 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts) 6373 { 6374 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6375 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6376 int rc; 6377 6378 SPDK_DEBUGLOG(bdev_nvme, "write %" 
PRIu64 " blocks with offset %#" PRIx64 "\n", 6379 lba_count, lba); 6380 6381 bio->iovs = iov; 6382 bio->iovcnt = iovcnt; 6383 bio->iovpos = 0; 6384 bio->iov_offset = 0; 6385 6386 if (ext_opts) { 6387 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6388 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6389 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6390 bio->ext_opts.io_flags = flags; 6391 bio->ext_opts.metadata = md; 6392 6393 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 6394 bdev_nvme_writev_done, bio, 6395 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6396 &bio->ext_opts); 6397 } else if (iovcnt == 1) { 6398 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 6399 lba_count, 6400 bdev_nvme_writev_done, bio, 6401 flags, 6402 0, 0); 6403 } else { 6404 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6405 bdev_nvme_writev_done, bio, flags, 6406 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6407 md, 0, 0); 6408 } 6409 6410 if (rc != 0 && rc != -ENOMEM) { 6411 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 6412 } 6413 return rc; 6414 } 6415 6416 static int 6417 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6418 void *md, uint64_t lba_count, uint64_t zslba, 6419 uint32_t flags) 6420 { 6421 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6422 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6423 int rc; 6424 6425 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 6426 lba_count, zslba); 6427 6428 bio->iovs = iov; 6429 bio->iovcnt = iovcnt; 6430 bio->iovpos = 0; 6431 bio->iov_offset = 0; 6432 6433 if (iovcnt == 1) { 6434 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 6435 lba_count, 6436 bdev_nvme_zone_appendv_done, bio, 6437 flags, 6438 0, 0); 6439 } else { 6440 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 6441 bdev_nvme_zone_appendv_done, bio, flags, 6442 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6443 md, 0, 0); 6444 } 6445 6446 if (rc != 0 && rc != -ENOMEM) { 6447 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 6448 } 6449 return rc; 6450 } 6451 6452 static int 6453 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6454 void *md, uint64_t lba_count, uint64_t lba, 6455 uint32_t flags) 6456 { 6457 int rc; 6458 6459 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6460 lba_count, lba); 6461 6462 bio->iovs = iov; 6463 bio->iovcnt = iovcnt; 6464 bio->iovpos = 0; 6465 bio->iov_offset = 0; 6466 6467 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 6468 bio->io_path->qpair->qpair, 6469 lba, lba_count, 6470 bdev_nvme_comparev_done, bio, flags, 6471 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6472 md, 0, 0); 6473 6474 if (rc != 0 && rc != -ENOMEM) { 6475 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 6476 } 6477 return rc; 6478 } 6479 6480 static int 6481 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 6482 struct iovec *write_iov, int write_iovcnt, 6483 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 6484 { 6485 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6486 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6487 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6488 int rc; 6489 6490 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" 
PRIx64 "\n", 6491 lba_count, lba); 6492 6493 bio->iovs = cmp_iov; 6494 bio->iovcnt = cmp_iovcnt; 6495 bio->iovpos = 0; 6496 bio->iov_offset = 0; 6497 bio->fused_iovs = write_iov; 6498 bio->fused_iovcnt = write_iovcnt; 6499 bio->fused_iovpos = 0; 6500 bio->fused_iov_offset = 0; 6501 6502 if (bdev_io->num_retries == 0) { 6503 bio->first_fused_submitted = false; 6504 bio->first_fused_completed = false; 6505 } 6506 6507 if (!bio->first_fused_submitted) { 6508 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6509 memset(&bio->cpl, 0, sizeof(bio->cpl)); 6510 6511 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 6512 bdev_nvme_comparev_and_writev_done, bio, flags, 6513 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 6514 if (rc == 0) { 6515 bio->first_fused_submitted = true; 6516 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6517 } else { 6518 if (rc != -ENOMEM) { 6519 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 6520 } 6521 return rc; 6522 } 6523 } 6524 6525 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 6526 6527 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6528 bdev_nvme_comparev_and_writev_done, bio, flags, 6529 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 6530 if (rc != 0 && rc != -ENOMEM) { 6531 SPDK_ERRLOG("write failed: rc = %d\n", rc); 6532 rc = 0; 6533 } 6534 6535 return rc; 6536 } 6537 6538 static int 6539 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6540 { 6541 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 6542 struct spdk_nvme_dsm_range *range; 6543 uint64_t offset, remaining; 6544 uint64_t num_ranges_u64; 6545 uint16_t num_ranges; 6546 int rc; 6547 6548 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 6549 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6550 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 6551 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 6552 return -EINVAL; 6553 } 6554 num_ranges = (uint16_t)num_ranges_u64; 6555 6556 offset = offset_blocks; 6557 remaining = num_blocks; 6558 range = &dsm_ranges[0]; 6559 6560 /* Fill max-size ranges until the remaining blocks fit into one range */ 6561 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 6562 range->attributes.raw = 0; 6563 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6564 range->starting_lba = offset; 6565 6566 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6567 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6568 range++; 6569 } 6570 6571 /* Final range describes the remaining blocks */ 6572 range->attributes.raw = 0; 6573 range->length = remaining; 6574 range->starting_lba = offset; 6575 6576 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 6577 bio->io_path->qpair->qpair, 6578 SPDK_NVME_DSM_ATTR_DEALLOCATE, 6579 dsm_ranges, num_ranges, 6580 bdev_nvme_queued_done, bio); 6581 6582 return rc; 6583 } 6584 6585 static int 6586 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6587 { 6588 if (num_blocks > UINT16_MAX + 1) { 6589 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 6590 return -EINVAL; 6591 } 6592 6593 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 6594 bio->io_path->qpair->qpair, 6595 offset_blocks, num_blocks, 6596 bdev_nvme_queued_done, bio, 6597 0); 6598 } 6599 6600 static int 6601 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t 
zone_id, uint32_t num_zones, 6602 struct spdk_bdev_zone_info *info) 6603 { 6604 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6605 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6606 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6607 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6608 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 6609 6610 if (zone_id % zone_size != 0) { 6611 return -EINVAL; 6612 } 6613 6614 if (num_zones > total_zones || !num_zones) { 6615 return -EINVAL; 6616 } 6617 6618 assert(!bio->zone_report_buf); 6619 bio->zone_report_buf = calloc(1, zone_report_bufsize); 6620 if (!bio->zone_report_buf) { 6621 return -ENOMEM; 6622 } 6623 6624 bio->handled_zones = 0; 6625 6626 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 6627 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 6628 bdev_nvme_get_zone_info_done, bio); 6629 } 6630 6631 static int 6632 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 6633 enum spdk_bdev_zone_action action) 6634 { 6635 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6636 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6637 6638 switch (action) { 6639 case SPDK_BDEV_ZONE_CLOSE: 6640 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 6641 bdev_nvme_zone_management_done, bio); 6642 case SPDK_BDEV_ZONE_FINISH: 6643 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 6644 bdev_nvme_zone_management_done, bio); 6645 case SPDK_BDEV_ZONE_OPEN: 6646 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 6647 bdev_nvme_zone_management_done, bio); 6648 case SPDK_BDEV_ZONE_RESET: 6649 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 6650 bdev_nvme_zone_management_done, bio); 6651 case SPDK_BDEV_ZONE_OFFLINE: 6652 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 6653 bdev_nvme_zone_management_done, bio); 6654 default: 6655 return -EINVAL; 6656 } 6657 } 6658 6659 static void 6660 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6661 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 6662 { 6663 struct nvme_io_path *io_path; 6664 struct nvme_ctrlr *nvme_ctrlr; 6665 uint32_t max_xfer_size; 6666 int rc = -ENXIO; 6667 6668 /* Choose the first ctrlr which is not failed. */ 6669 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6670 nvme_ctrlr = io_path->qpair->ctrlr; 6671 6672 /* We should skip any unavailable nvme_ctrlr rather than checking 6673 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
6674 */ 6675 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 6676 continue; 6677 } 6678 6679 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 6680 6681 if (nbytes > max_xfer_size) { 6682 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6683 rc = -EINVAL; 6684 goto err; 6685 } 6686 6687 bio->io_path = io_path; 6688 bio->orig_thread = spdk_get_thread(); 6689 6690 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 6691 bdev_nvme_admin_passthru_done, bio); 6692 if (rc == 0) { 6693 return; 6694 } 6695 } 6696 6697 err: 6698 bdev_nvme_admin_passthru_complete(bio, rc); 6699 } 6700 6701 static int 6702 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6703 void *buf, size_t nbytes) 6704 { 6705 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6706 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6707 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6708 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6709 6710 if (nbytes > max_xfer_size) { 6711 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6712 return -EINVAL; 6713 } 6714 6715 /* 6716 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6717 * so fill it out automatically. 6718 */ 6719 cmd->nsid = spdk_nvme_ns_get_id(ns); 6720 6721 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 6722 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 6723 } 6724 6725 static int 6726 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6727 void *buf, size_t nbytes, void *md_buf, size_t md_len) 6728 { 6729 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6730 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6731 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 6732 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6733 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6734 6735 if (nbytes > max_xfer_size) { 6736 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6737 return -EINVAL; 6738 } 6739 6740 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 6741 SPDK_ERRLOG("invalid meta data buffer size\n"); 6742 return -EINVAL; 6743 } 6744 6745 /* 6746 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6747 * so fill it out automatically. 6748 */ 6749 cmd->nsid = spdk_nvme_ns_get_id(ns); 6750 6751 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 6752 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 6753 } 6754 6755 static void 6756 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6757 struct nvme_bdev_io *bio_to_abort) 6758 { 6759 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6760 struct nvme_io_path *io_path; 6761 struct nvme_ctrlr *nvme_ctrlr; 6762 int rc = 0; 6763 6764 bio->orig_thread = spdk_get_thread(); 6765 6766 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 6767 if (rc == 0) { 6768 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 6769 return; 6770 } 6771 6772 rc = 0; 6773 6774 /* Even admin commands, they were submitted to only nvme_ctrlrs which were 6775 * on any io_path. So traverse the io_path list for not only I/O commands 6776 * but also admin commands. 
6777 */ 6778 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6779 nvme_ctrlr = io_path->qpair->ctrlr; 6780 6781 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 6782 io_path->qpair->qpair, 6783 bio_to_abort, 6784 bdev_nvme_abort_done, bio); 6785 if (rc == -ENOENT) { 6786 /* If no command was found in I/O qpair, the target command may be 6787 * admin command. 6788 */ 6789 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 6790 NULL, 6791 bio_to_abort, 6792 bdev_nvme_abort_done, bio); 6793 } 6794 6795 if (rc != -ENOENT) { 6796 break; 6797 } 6798 } 6799 6800 if (rc != 0) { 6801 /* If no command was found or there was any error, complete the abort 6802 * request with failure. 6803 */ 6804 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 6805 } 6806 } 6807 6808 static int 6809 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 6810 uint64_t num_blocks) 6811 { 6812 struct spdk_nvme_scc_source_range range = { 6813 .slba = src_offset_blocks, 6814 .nlb = num_blocks - 1 6815 }; 6816 6817 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 6818 bio->io_path->qpair->qpair, 6819 &range, 1, dst_offset_blocks, 6820 bdev_nvme_queued_done, bio); 6821 } 6822 6823 static void 6824 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 6825 { 6826 const char *action; 6827 6828 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 6829 action = "reset"; 6830 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 6831 action = "abort"; 6832 } else { 6833 action = "none"; 6834 } 6835 6836 spdk_json_write_object_begin(w); 6837 6838 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 6839 6840 spdk_json_write_named_object_begin(w, "params"); 6841 spdk_json_write_named_string(w, "action_on_timeout", action); 6842 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 6843 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 6844 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 6845 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 6846 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 6847 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 6848 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 6849 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 6850 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 6851 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 6852 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 6853 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 6854 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 6855 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 6856 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 6857 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 6858 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 6859 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 6860 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 6861 
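	/* The emitted object has roughly this shape (values are illustrative only):
	 *
	 *   {
	 *     "method": "bdev_nvme_set_options",
	 *     "params": {
	 *       "action_on_timeout": "none",
	 *       "timeout_us": 0,
	 *       "transport_retry_count": 4,
	 *       ...
	 *     }
	 *   }
	 */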
spdk_json_write_object_end(w); 6862 6863 spdk_json_write_object_end(w); 6864 } 6865 6866 static void 6867 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 6868 { 6869 struct spdk_nvme_transport_id trid; 6870 6871 spdk_json_write_object_begin(w); 6872 6873 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 6874 6875 spdk_json_write_named_object_begin(w, "params"); 6876 spdk_json_write_named_string(w, "name", ctx->name); 6877 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 6878 6879 trid = ctx->trid; 6880 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 6881 nvme_bdev_dump_trid_json(&trid, w); 6882 6883 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 6884 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 6885 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 6886 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 6887 ctx->bdev_opts.fast_io_fail_timeout_sec); 6888 spdk_json_write_object_end(w); 6889 6890 spdk_json_write_object_end(w); 6891 } 6892 6893 static void 6894 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 6895 struct nvme_ctrlr *nvme_ctrlr) 6896 { 6897 struct spdk_nvme_transport_id *trid; 6898 6899 if (nvme_ctrlr->opts.from_discovery_service) { 6900 /* Do not emit an RPC for this - it will be implicitly 6901 * covered by a separate bdev_nvme_start_discovery or 6902 * bdev_nvme_start_mdns_discovery RPC. 6903 */ 6904 return; 6905 } 6906 6907 trid = &nvme_ctrlr->active_path_id->trid; 6908 6909 spdk_json_write_object_begin(w); 6910 6911 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 6912 6913 spdk_json_write_named_object_begin(w, "params"); 6914 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 6915 nvme_bdev_dump_trid_json(trid, w); 6916 spdk_json_write_named_bool(w, "prchk_reftag", 6917 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 6918 spdk_json_write_named_bool(w, "prchk_guard", 6919 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 6920 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 6921 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 6922 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 6923 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 6924 6925 spdk_json_write_object_end(w); 6926 6927 spdk_json_write_object_end(w); 6928 } 6929 6930 static void 6931 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 6932 { 6933 spdk_json_write_object_begin(w); 6934 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 6935 6936 spdk_json_write_named_object_begin(w, "params"); 6937 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 6938 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 6939 spdk_json_write_object_end(w); 6940 6941 spdk_json_write_object_end(w); 6942 } 6943 6944 static int 6945 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 6946 { 6947 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6948 struct nvme_ctrlr *nvme_ctrlr; 6949 struct discovery_ctx *ctx; 6950 6951 bdev_nvme_opts_config_json(w); 6952 6953 pthread_mutex_lock(&g_bdev_nvme_mutex); 6954 6955 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6956 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6957 nvme_ctrlr_config_json(w, nvme_ctrlr); 6958 } 6959 } 6960 
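	/* Discovery services started via bdev_nvme_start_discovery are replayed
	 * individually below; mDNS-based services are covered by the single
	 * bdev_nvme_mdns_discovery_config_json() call that follows.
	 */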
6961 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6962 if (!ctx->from_mdns_discovery_service) { 6963 bdev_nvme_discovery_config_json(w, ctx); 6964 } 6965 } 6966 6967 bdev_nvme_mdns_discovery_config_json(w); 6968 6969 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 6970 * before enabling hotplug poller. 6971 */ 6972 bdev_nvme_hotplug_config_json(w); 6973 6974 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6975 return 0; 6976 } 6977 6978 struct spdk_nvme_ctrlr * 6979 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 6980 { 6981 struct nvme_bdev *nbdev; 6982 struct nvme_ns *nvme_ns; 6983 6984 if (!bdev || bdev->module != &nvme_if) { 6985 return NULL; 6986 } 6987 6988 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 6989 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 6990 assert(nvme_ns != NULL); 6991 6992 return nvme_ns->ctrlr->ctrlr; 6993 } 6994 6995 void 6996 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 6997 { 6998 struct nvme_ns *nvme_ns = io_path->nvme_ns; 6999 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 7000 const struct spdk_nvme_ctrlr_data *cdata; 7001 const struct spdk_nvme_transport_id *trid; 7002 const char *adrfam_str; 7003 7004 spdk_json_write_object_begin(w); 7005 7006 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 7007 7008 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 7009 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 7010 7011 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 7012 spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path); 7013 spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path)); 7014 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 7015 7016 spdk_json_write_named_object_begin(w, "transport"); 7017 spdk_json_write_named_string(w, "trtype", trid->trstring); 7018 spdk_json_write_named_string(w, "traddr", trid->traddr); 7019 if (trid->trsvcid[0] != '\0') { 7020 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 7021 } 7022 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 7023 if (adrfam_str) { 7024 spdk_json_write_named_string(w, "adrfam", adrfam_str); 7025 } 7026 spdk_json_write_object_end(w); 7027 7028 spdk_json_write_object_end(w); 7029 } 7030 7031 void 7032 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 7033 { 7034 struct discovery_ctx *ctx; 7035 struct discovery_entry_ctx *entry_ctx; 7036 7037 spdk_json_write_array_begin(w); 7038 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7039 spdk_json_write_object_begin(w); 7040 spdk_json_write_named_string(w, "name", ctx->name); 7041 7042 spdk_json_write_named_object_begin(w, "trid"); 7043 nvme_bdev_dump_trid_json(&ctx->trid, w); 7044 spdk_json_write_object_end(w); 7045 7046 spdk_json_write_named_array_begin(w, "referrals"); 7047 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7048 spdk_json_write_object_begin(w); 7049 spdk_json_write_named_object_begin(w, "trid"); 7050 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 7051 spdk_json_write_object_end(w); 7052 spdk_json_write_object_end(w); 7053 } 7054 spdk_json_write_array_end(w); 7055 7056 spdk_json_write_object_end(w); 7057 } 7058 spdk_json_write_array_end(w); 7059 } 7060 7061 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 7062 7063 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 7064 { 7065 struct spdk_trace_tpoint_opts opts[] = { 7066 { 7067 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 7068 
OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 7069 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7070 }, 7071 { 7072 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 7073 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 7074 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7075 } 7076 }; 7077 7078 7079 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 7080 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 7081 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 7082 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 7083 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 7084 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 7085 } 7086
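/* The registration above exposes each NVMe bdev I/O as trace object 'N' and links
 * the bdev-level IO_START/IO_DONE tracepoints to the transport-level (PCIe/TCP)
 * submit and complete tracepoints, so a trace viewer can follow a single I/O
 * across both layers. As a hypothetical post-processing sketch (rec, io_start_tsc
 * and report_latency are illustrative names, not SPDK APIs), a consumer might
 * pair the two tracepoints by object id:
 *
 *   if (rec.tpoint == TRACE_BDEV_NVME_IO_START)
 *       io_start_tsc[rec.object_id] = rec.tsc;
 *   else if (rec.tpoint == TRACE_BDEV_NVME_IO_DONE)
 *       report_latency(rec.tsc - io_start_tsc[rec.object_id]);
 */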