1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/likely.h" 18 #include "spdk/nvme.h" 19 #include "spdk/nvme_ocssd.h" 20 #include "spdk/nvme_zns.h" 21 #include "spdk/opal.h" 22 #include "spdk/thread.h" 23 #include "spdk/trace.h" 24 #include "spdk/string.h" 25 #include "spdk/util.h" 26 #include "spdk/uuid.h" 27 28 #include "spdk/bdev_module.h" 29 #include "spdk/log.h" 30 31 #include "spdk_internal/usdt.h" 32 #include "spdk_internal/trace_defs.h" 33 34 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 35 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 36 37 #define NSID_STR_LEN 10 38 39 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 40 41 struct nvme_bdev_io { 42 /** array of iovecs to transfer. */ 43 struct iovec *iovs; 44 45 /** Number of iovecs in iovs array. */ 46 int iovcnt; 47 48 /** Current iovec position. */ 49 int iovpos; 50 51 /** Offset in current iovec. */ 52 uint32_t iov_offset; 53 54 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 55 * being reset in a reset I/O. 56 */ 57 struct nvme_io_path *io_path; 58 59 /** array of iovecs to transfer. */ 60 struct iovec *fused_iovs; 61 62 /** Number of iovecs in iovs array. */ 63 int fused_iovcnt; 64 65 /** Current iovec position. */ 66 int fused_iovpos; 67 68 /** Offset in current iovec. */ 69 uint32_t fused_iov_offset; 70 71 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 72 struct spdk_nvme_cpl cpl; 73 74 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 75 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 76 77 /** Originating thread */ 78 struct spdk_thread *orig_thread; 79 80 /** Keeps track if first of fused commands was submitted */ 81 bool first_fused_submitted; 82 83 /** Keeps track if first of fused commands was completed */ 84 bool first_fused_completed; 85 86 /** Temporary pointer to zone report buffer */ 87 struct spdk_nvme_zns_zone_report *zone_report_buf; 88 89 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 90 uint64_t handled_zones; 91 92 /** Expiration value in ticks to retry the current I/O. */ 93 uint64_t retry_ticks; 94 95 /* How many times the current I/O was retried. */ 96 int32_t retry_count; 97 98 /* Current tsc at submit time. 
*/ 99 uint64_t submit_tsc; 100 }; 101 102 struct nvme_probe_skip_entry { 103 struct spdk_nvme_transport_id trid; 104 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 105 }; 106 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 107 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 108 g_skipped_nvme_ctrlrs); 109 110 static struct spdk_bdev_nvme_opts g_opts = { 111 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 112 .timeout_us = 0, 113 .timeout_admin_us = 0, 114 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 115 .transport_retry_count = 4, 116 .arbitration_burst = 0, 117 .low_priority_weight = 0, 118 .medium_priority_weight = 0, 119 .high_priority_weight = 0, 120 .nvme_adminq_poll_period_us = 10000ULL, 121 .nvme_ioq_poll_period_us = 0, 122 .io_queue_requests = 0, 123 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 124 .bdev_retry_count = 3, 125 .transport_ack_timeout = 0, 126 .ctrlr_loss_timeout_sec = 0, 127 .reconnect_delay_sec = 0, 128 .fast_io_fail_timeout_sec = 0, 129 .disable_auto_failback = false, 130 .generate_uuids = false, 131 .transport_tos = 0, 132 .nvme_error_stat = false, 133 .io_path_stat = false, 134 }; 135 136 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 137 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 138 139 static int g_hot_insert_nvme_controller_index = 0; 140 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 141 static bool g_nvme_hotplug_enabled = false; 142 struct spdk_thread *g_bdev_nvme_init_thread; 143 static struct spdk_poller *g_hotplug_poller; 144 static struct spdk_poller *g_hotplug_probe_poller; 145 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 146 147 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 148 struct nvme_async_probe_ctx *ctx); 149 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 150 struct nvme_async_probe_ctx *ctx); 151 static int bdev_nvme_library_init(void); 152 static void bdev_nvme_library_fini(void); 153 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 154 struct spdk_bdev_io *bdev_io); 155 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 156 struct spdk_bdev_io *bdev_io); 157 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 158 void *md, uint64_t lba_count, uint64_t lba, 159 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx); 160 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 161 void *md, uint64_t lba_count, uint64_t lba); 162 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 163 void *md, uint64_t lba_count, uint64_t lba, 164 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx); 165 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 166 void *md, uint64_t lba_count, 167 uint64_t zslba, uint32_t flags); 168 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 169 void *md, uint64_t lba_count, uint64_t lba, 170 uint32_t flags); 171 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 172 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 173 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 174 uint32_t flags); 175 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 176 uint32_t num_zones, struct 
spdk_bdev_zone_info *info); 177 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 178 enum spdk_bdev_zone_action action); 179 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 180 struct nvme_bdev_io *bio, 181 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 182 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 183 void *buf, size_t nbytes); 184 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 185 void *buf, size_t nbytes, void *md_buf, size_t md_len); 186 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 187 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 188 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 189 static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr); 190 static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove); 191 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 192 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 193 194 static struct nvme_ns *nvme_ns_alloc(void); 195 static void nvme_ns_free(struct nvme_ns *ns); 196 197 static int 198 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 199 { 200 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 201 } 202 203 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 204 205 struct spdk_nvme_qpair * 206 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 207 { 208 struct nvme_ctrlr_channel *ctrlr_ch; 209 210 assert(ctrlr_io_ch != NULL); 211 212 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 213 214 return ctrlr_ch->qpair->qpair; 215 } 216 217 static int 218 bdev_nvme_get_ctx_size(void) 219 { 220 return sizeof(struct nvme_bdev_io); 221 } 222 223 static struct spdk_bdev_module nvme_if = { 224 .name = "nvme", 225 .async_fini = true, 226 .module_init = bdev_nvme_library_init, 227 .module_fini = bdev_nvme_library_fini, 228 .config_json = bdev_nvme_config_json, 229 .get_ctx_size = bdev_nvme_get_ctx_size, 230 231 }; 232 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 233 234 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 235 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 236 bool g_bdev_nvme_module_finish; 237 238 struct nvme_bdev_ctrlr * 239 nvme_bdev_ctrlr_get_by_name(const char *name) 240 { 241 struct nvme_bdev_ctrlr *nbdev_ctrlr; 242 243 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 244 if (strcmp(name, nbdev_ctrlr->name) == 0) { 245 break; 246 } 247 } 248 249 return nbdev_ctrlr; 250 } 251 252 static struct nvme_ctrlr * 253 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 254 const struct spdk_nvme_transport_id *trid) 255 { 256 struct nvme_ctrlr *nvme_ctrlr; 257 258 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 259 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 260 break; 261 } 262 } 263 264 return nvme_ctrlr; 265 } 266 267 static struct nvme_bdev * 268 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 269 { 270 struct nvme_bdev *bdev; 271 272 pthread_mutex_lock(&g_bdev_nvme_mutex); 273 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 274 if (bdev->nsid == nsid) { 275 break; 276 } 277 } 278 pthread_mutex_unlock(&g_bdev_nvme_mutex); 279 280 return bdev; 281 } 282 283 struct nvme_ns * 284 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 285 { 286 struct nvme_ns ns; 287 288 
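/* Look up the namespace by ID with a stack-allocated key: nvme_ns_cmp() only
 * compares the id field, so populating ns.id is enough for the RB_FIND() call
 * below.
 */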
assert(nsid > 0); 289 290 ns.id = nsid; 291 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 292 } 293 294 struct nvme_ns * 295 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 296 { 297 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 298 } 299 300 struct nvme_ns * 301 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 302 { 303 if (ns == NULL) { 304 return NULL; 305 } 306 307 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 308 } 309 310 static struct nvme_ctrlr * 311 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 312 { 313 struct nvme_bdev_ctrlr *nbdev_ctrlr; 314 struct nvme_ctrlr *nvme_ctrlr = NULL; 315 316 pthread_mutex_lock(&g_bdev_nvme_mutex); 317 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 318 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 319 if (nvme_ctrlr != NULL) { 320 break; 321 } 322 } 323 pthread_mutex_unlock(&g_bdev_nvme_mutex); 324 325 return nvme_ctrlr; 326 } 327 328 struct nvme_ctrlr * 329 nvme_ctrlr_get_by_name(const char *name) 330 { 331 struct nvme_bdev_ctrlr *nbdev_ctrlr; 332 struct nvme_ctrlr *nvme_ctrlr = NULL; 333 334 if (name == NULL) { 335 return NULL; 336 } 337 338 pthread_mutex_lock(&g_bdev_nvme_mutex); 339 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 340 if (nbdev_ctrlr != NULL) { 341 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 342 } 343 pthread_mutex_unlock(&g_bdev_nvme_mutex); 344 345 return nvme_ctrlr; 346 } 347 348 void 349 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 350 { 351 struct nvme_bdev_ctrlr *nbdev_ctrlr; 352 353 pthread_mutex_lock(&g_bdev_nvme_mutex); 354 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 355 fn(nbdev_ctrlr, ctx); 356 } 357 pthread_mutex_unlock(&g_bdev_nvme_mutex); 358 } 359 360 void 361 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 362 { 363 const char *trtype_str; 364 const char *adrfam_str; 365 366 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 367 if (trtype_str) { 368 spdk_json_write_named_string(w, "trtype", trtype_str); 369 } 370 371 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 372 if (adrfam_str) { 373 spdk_json_write_named_string(w, "adrfam", adrfam_str); 374 } 375 376 if (trid->traddr[0] != '\0') { 377 spdk_json_write_named_string(w, "traddr", trid->traddr); 378 } 379 380 if (trid->trsvcid[0] != '\0') { 381 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 382 } 383 384 if (trid->subnqn[0] != '\0') { 385 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 386 } 387 } 388 389 static void 390 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 391 struct nvme_ctrlr *nvme_ctrlr) 392 { 393 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 394 pthread_mutex_lock(&g_bdev_nvme_mutex); 395 396 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 397 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 398 pthread_mutex_unlock(&g_bdev_nvme_mutex); 399 400 return; 401 } 402 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 403 404 pthread_mutex_unlock(&g_bdev_nvme_mutex); 405 406 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 407 408 free(nbdev_ctrlr->name); 409 free(nbdev_ctrlr); 410 } 411 412 static void 413 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 414 { 415 struct nvme_path_id *path_id, *tmp_path; 416 struct nvme_ns *ns, *tmp_ns; 417 418 free(nvme_ctrlr->copied_ana_desc); 419 spdk_free(nvme_ctrlr->ana_log_page); 420 421 if (nvme_ctrlr->opal_dev) { 422 
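/* Tear down the Opal security device handle along with the controller and
 * clear the pointer once it has been destructed.
 */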
spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 423 nvme_ctrlr->opal_dev = NULL; 424 } 425 426 if (nvme_ctrlr->nbdev_ctrlr) { 427 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 428 } 429 430 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 431 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 432 nvme_ns_free(ns); 433 } 434 435 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 436 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 437 free(path_id); 438 } 439 440 pthread_mutex_destroy(&nvme_ctrlr->mutex); 441 442 free(nvme_ctrlr); 443 444 pthread_mutex_lock(&g_bdev_nvme_mutex); 445 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 446 pthread_mutex_unlock(&g_bdev_nvme_mutex); 447 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 448 spdk_bdev_module_fini_done(); 449 return; 450 } 451 pthread_mutex_unlock(&g_bdev_nvme_mutex); 452 } 453 454 static int 455 nvme_detach_poller(void *arg) 456 { 457 struct nvme_ctrlr *nvme_ctrlr = arg; 458 int rc; 459 460 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 461 if (rc != -EAGAIN) { 462 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 463 _nvme_ctrlr_delete(nvme_ctrlr); 464 } 465 466 return SPDK_POLLER_BUSY; 467 } 468 469 static void 470 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 471 { 472 int rc; 473 474 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 475 476 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 477 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 478 479 /* If we got here, the reset/detach poller cannot be active */ 480 assert(nvme_ctrlr->reset_detach_poller == NULL); 481 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 482 nvme_ctrlr, 1000); 483 if (nvme_ctrlr->reset_detach_poller == NULL) { 484 SPDK_ERRLOG("Failed to register detach poller\n"); 485 goto error; 486 } 487 488 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 489 if (rc != 0) { 490 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 491 goto error; 492 } 493 494 return; 495 error: 496 /* We don't have a good way to handle errors here, so just do what we can and delete the 497 * controller without detaching the underlying NVMe device. 
498 */ 499 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 500 _nvme_ctrlr_delete(nvme_ctrlr); 501 } 502 503 static void 504 nvme_ctrlr_unregister_cb(void *io_device) 505 { 506 struct nvme_ctrlr *nvme_ctrlr = io_device; 507 508 nvme_ctrlr_delete(nvme_ctrlr); 509 } 510 511 static void 512 nvme_ctrlr_unregister(void *ctx) 513 { 514 struct nvme_ctrlr *nvme_ctrlr = ctx; 515 516 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 517 } 518 519 static bool 520 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 521 { 522 if (!nvme_ctrlr->destruct) { 523 return false; 524 } 525 526 if (nvme_ctrlr->ref > 0) { 527 return false; 528 } 529 530 if (nvme_ctrlr->resetting) { 531 return false; 532 } 533 534 if (nvme_ctrlr->ana_log_page_updating) { 535 return false; 536 } 537 538 if (nvme_ctrlr->io_path_cache_clearing) { 539 return false; 540 } 541 542 return true; 543 } 544 545 static void 546 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 547 { 548 pthread_mutex_lock(&nvme_ctrlr->mutex); 549 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 550 551 assert(nvme_ctrlr->ref > 0); 552 nvme_ctrlr->ref--; 553 554 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 555 pthread_mutex_unlock(&nvme_ctrlr->mutex); 556 return; 557 } 558 559 pthread_mutex_unlock(&nvme_ctrlr->mutex); 560 561 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 562 } 563 564 static void 565 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 566 { 567 nbdev_ch->current_io_path = NULL; 568 nbdev_ch->rr_counter = 0; 569 } 570 571 static struct nvme_io_path * 572 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 573 { 574 struct nvme_io_path *io_path; 575 576 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 577 if (io_path->nvme_ns == nvme_ns) { 578 break; 579 } 580 } 581 582 return io_path; 583 } 584 585 static struct nvme_io_path * 586 nvme_io_path_alloc(void) 587 { 588 struct nvme_io_path *io_path; 589 590 io_path = calloc(1, sizeof(*io_path)); 591 if (io_path == NULL) { 592 SPDK_ERRLOG("Failed to alloc io_path.\n"); 593 return NULL; 594 } 595 596 if (g_opts.io_path_stat) { 597 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 598 if (io_path->stat == NULL) { 599 free(io_path); 600 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 601 return NULL; 602 } 603 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 604 } 605 606 return io_path; 607 } 608 609 static void 610 nvme_io_path_free(struct nvme_io_path *io_path) 611 { 612 free(io_path->stat); 613 free(io_path); 614 } 615 616 static int 617 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 618 { 619 struct nvme_io_path *io_path; 620 struct spdk_io_channel *ch; 621 struct nvme_ctrlr_channel *ctrlr_ch; 622 struct nvme_qpair *nvme_qpair; 623 624 io_path = nvme_io_path_alloc(); 625 if (io_path == NULL) { 626 return -ENOMEM; 627 } 628 629 io_path->nvme_ns = nvme_ns; 630 631 ch = spdk_get_io_channel(nvme_ns->ctrlr); 632 if (ch == NULL) { 633 nvme_io_path_free(io_path); 634 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 635 return -ENOMEM; 636 } 637 638 ctrlr_ch = spdk_io_channel_get_ctx(ch); 639 640 nvme_qpair = ctrlr_ch->qpair; 641 assert(nvme_qpair != NULL); 642 643 io_path->qpair = nvme_qpair; 644 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 645 646 io_path->nbdev_ch = nbdev_ch; 647 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 648 649 
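/* Invalidate the channel's cached current I/O path so the next submission
 * re-runs path selection and can consider the path that was just added.
 */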
bdev_nvme_clear_current_io_path(nbdev_ch); 650 651 return 0; 652 } 653 654 static void 655 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 656 { 657 struct spdk_io_channel *ch; 658 struct nvme_qpair *nvme_qpair; 659 struct nvme_ctrlr_channel *ctrlr_ch; 660 struct nvme_bdev *nbdev; 661 662 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 663 664 /* Add the statistics to nvme_ns before this path is destroyed. */ 665 pthread_mutex_lock(&nbdev->mutex); 666 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 667 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 668 } 669 pthread_mutex_unlock(&nbdev->mutex); 670 671 bdev_nvme_clear_current_io_path(nbdev_ch); 672 673 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 674 io_path->nbdev_ch = NULL; 675 676 nvme_qpair = io_path->qpair; 677 assert(nvme_qpair != NULL); 678 679 ctrlr_ch = nvme_qpair->ctrlr_ch; 680 assert(ctrlr_ch != NULL); 681 682 ch = spdk_io_channel_from_ctx(ctrlr_ch); 683 spdk_put_io_channel(ch); 684 685 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 686 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 687 * io_path here but free the io_path when the associated qpair is freed. It is ensured 688 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 689 */ 690 } 691 692 static void 693 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 694 { 695 struct nvme_io_path *io_path, *tmp_io_path; 696 697 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 698 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 699 } 700 } 701 702 static int 703 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 704 { 705 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 706 struct nvme_bdev *nbdev = io_device; 707 struct nvme_ns *nvme_ns; 708 int rc; 709 710 STAILQ_INIT(&nbdev_ch->io_path_list); 711 TAILQ_INIT(&nbdev_ch->retry_io_list); 712 713 pthread_mutex_lock(&nbdev->mutex); 714 715 nbdev_ch->mp_policy = nbdev->mp_policy; 716 nbdev_ch->mp_selector = nbdev->mp_selector; 717 nbdev_ch->rr_min_io = nbdev->rr_min_io; 718 719 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 720 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 721 if (rc != 0) { 722 pthread_mutex_unlock(&nbdev->mutex); 723 724 _bdev_nvme_delete_io_paths(nbdev_ch); 725 return rc; 726 } 727 } 728 pthread_mutex_unlock(&nbdev->mutex); 729 730 return 0; 731 } 732 733 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 734 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
735 */ 736 static inline void 737 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 738 const struct spdk_nvme_cpl *cpl) 739 { 740 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 741 (uintptr_t)bdev_io); 742 if (cpl) { 743 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 744 } else { 745 spdk_bdev_io_complete(bdev_io, status); 746 } 747 } 748 749 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 750 751 static void 752 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 753 { 754 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 755 756 bdev_nvme_abort_retry_ios(nbdev_ch); 757 _bdev_nvme_delete_io_paths(nbdev_ch); 758 } 759 760 static inline bool 761 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 762 { 763 switch (io_type) { 764 case SPDK_BDEV_IO_TYPE_RESET: 765 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 766 case SPDK_BDEV_IO_TYPE_ABORT: 767 return true; 768 default: 769 break; 770 } 771 772 return false; 773 } 774 775 static inline bool 776 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 777 { 778 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 779 return false; 780 } 781 782 switch (nvme_ns->ana_state) { 783 case SPDK_NVME_ANA_OPTIMIZED_STATE: 784 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 785 return true; 786 default: 787 break; 788 } 789 790 return false; 791 } 792 793 static inline bool 794 nvme_io_path_is_connected(struct nvme_io_path *io_path) 795 { 796 if (spdk_unlikely(io_path->qpair->qpair == NULL)) { 797 return false; 798 } 799 800 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) != 801 SPDK_NVME_QPAIR_FAILURE_NONE)) { 802 return false; 803 } 804 805 if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) { 806 return false; 807 } 808 809 if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) != 810 SPDK_NVME_QPAIR_FAILURE_NONE) { 811 return false; 812 } 813 814 return true; 815 } 816 817 static inline bool 818 nvme_io_path_is_available(struct nvme_io_path *io_path) 819 { 820 if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { 821 return false; 822 } 823 824 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 825 return false; 826 } 827 828 return true; 829 } 830 831 static inline bool 832 nvme_io_path_is_failed(struct nvme_io_path *io_path) 833 { 834 struct nvme_ctrlr *nvme_ctrlr; 835 836 nvme_ctrlr = io_path->qpair->ctrlr; 837 838 if (nvme_ctrlr->destruct) { 839 return true; 840 } 841 842 if (nvme_ctrlr->fast_io_fail_timedout) { 843 return true; 844 } 845 846 if (nvme_ctrlr->resetting) { 847 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 848 return false; 849 } else { 850 return true; 851 } 852 } 853 854 if (nvme_ctrlr->reconnect_is_delayed) { 855 return false; 856 } 857 858 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 859 return true; 860 } else { 861 return false; 862 } 863 } 864 865 static bool 866 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 867 { 868 if (nvme_ctrlr->destruct) { 869 return false; 870 } 871 872 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 873 return false; 874 } 875 876 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 877 return false; 878 } 879 880 return true; 881 } 882 883 /* Simulate circular linked list. 
*/ 884 static inline struct nvme_io_path * 885 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 886 { 887 struct nvme_io_path *next_path; 888 889 if (prev_path != NULL) { 890 next_path = STAILQ_NEXT(prev_path, stailq); 891 if (next_path != NULL) { 892 return next_path; 893 } 894 } 895 896 return STAILQ_FIRST(&nbdev_ch->io_path_list); 897 } 898 899 static struct nvme_io_path * 900 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 901 { 902 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 903 904 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 905 906 io_path = start; 907 do { 908 if (spdk_likely(nvme_io_path_is_connected(io_path) && 909 !io_path->nvme_ns->ana_state_updating)) { 910 switch (io_path->nvme_ns->ana_state) { 911 case SPDK_NVME_ANA_OPTIMIZED_STATE: 912 nbdev_ch->current_io_path = io_path; 913 return io_path; 914 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 915 if (non_optimized == NULL) { 916 non_optimized = io_path; 917 } 918 break; 919 default: 920 break; 921 } 922 } 923 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 924 } while (io_path != start); 925 926 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 927 /* We come here only if there is no optimized path. Cache even non_optimized 928 * path for load balance across multiple non_optimized paths. 929 */ 930 nbdev_ch->current_io_path = non_optimized; 931 } 932 933 return non_optimized; 934 } 935 936 static struct nvme_io_path * 937 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 938 { 939 struct nvme_io_path *io_path; 940 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 941 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 942 uint32_t num_outstanding_reqs; 943 944 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 945 if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { 946 /* The device is currently resetting. 
*/ 947 continue; 948 } 949 950 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 951 continue; 952 } 953 954 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 955 switch (io_path->nvme_ns->ana_state) { 956 case SPDK_NVME_ANA_OPTIMIZED_STATE: 957 if (num_outstanding_reqs < opt_min_qd) { 958 opt_min_qd = num_outstanding_reqs; 959 optimized = io_path; 960 } 961 break; 962 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 963 if (num_outstanding_reqs < non_opt_min_qd) { 964 non_opt_min_qd = num_outstanding_reqs; 965 non_optimized = io_path; 966 } 967 break; 968 default: 969 break; 970 } 971 } 972 973 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 974 if (optimized != NULL) { 975 return optimized; 976 } 977 978 return non_optimized; 979 } 980 981 static inline struct nvme_io_path * 982 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 983 { 984 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 985 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 986 return nbdev_ch->current_io_path; 987 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 988 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 989 return nbdev_ch->current_io_path; 990 } 991 nbdev_ch->rr_counter = 0; 992 } 993 } 994 995 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 996 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 997 return _bdev_nvme_find_io_path(nbdev_ch); 998 } else { 999 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1000 } 1001 } 1002 1003 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1004 * or false otherwise. 1005 * 1006 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1007 * is likely to be non-accessible now but may become accessible. 1008 * 1009 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1010 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1011 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1012 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
1013 */ 1014 static bool 1015 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1016 { 1017 struct nvme_io_path *io_path; 1018 1019 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1020 if (io_path->nvme_ns->ana_transition_timedout) { 1021 continue; 1022 } 1023 1024 if (nvme_io_path_is_connected(io_path) || 1025 !nvme_io_path_is_failed(io_path)) { 1026 return true; 1027 } 1028 } 1029 1030 return false; 1031 } 1032 1033 static void 1034 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1035 { 1036 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1037 struct spdk_io_channel *ch; 1038 1039 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1040 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1041 } else { 1042 ch = spdk_io_channel_from_ctx(nbdev_ch); 1043 bdev_nvme_submit_request(ch, bdev_io); 1044 } 1045 } 1046 1047 static int 1048 bdev_nvme_retry_ios(void *arg) 1049 { 1050 struct nvme_bdev_channel *nbdev_ch = arg; 1051 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 1052 struct nvme_bdev_io *bio; 1053 uint64_t now, delay_us; 1054 1055 now = spdk_get_ticks(); 1056 1057 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 1058 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1059 if (bio->retry_ticks > now) { 1060 break; 1061 } 1062 1063 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1064 1065 bdev_nvme_retry_io(nbdev_ch, bdev_io); 1066 } 1067 1068 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1069 1070 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1071 if (bdev_io != NULL) { 1072 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1073 1074 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1075 1076 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1077 delay_us); 1078 } 1079 1080 return SPDK_POLLER_BUSY; 1081 } 1082 1083 static void 1084 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1085 struct nvme_bdev_io *bio, uint64_t delay_ms) 1086 { 1087 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1088 struct spdk_bdev_io *tmp_bdev_io; 1089 struct nvme_bdev_io *tmp_bio; 1090 1091 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1092 1093 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 1094 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 1095 1096 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1097 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 1098 module_link); 1099 return; 1100 } 1101 } 1102 1103 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 1104 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 1105 1106 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1107 1108 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1109 delay_ms * 1000ULL); 1110 } 1111 1112 static void 1113 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1114 { 1115 struct spdk_bdev_io *bdev_io, *tmp_io; 1116 1117 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1118 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1119 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1120 } 1121 1122 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1123 } 1124 1125 static int 1126 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1127 struct nvme_bdev_io *bio_to_abort) 1128 { 1129 struct spdk_bdev_io *bdev_io_to_abort; 1130 1131 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1132 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1133 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1134 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1135 return 0; 1136 } 1137 } 1138 1139 return -ENOENT; 1140 } 1141 1142 static void 1143 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1144 { 1145 struct nvme_bdev *nbdev; 1146 uint16_t sct, sc; 1147 1148 assert(spdk_nvme_cpl_is_error(cpl)); 1149 1150 nbdev = bdev_io->bdev->ctxt; 1151 1152 if (nbdev->err_stat == NULL) { 1153 return; 1154 } 1155 1156 sct = cpl->status.sct; 1157 sc = cpl->status.sc; 1158 1159 pthread_mutex_lock(&nbdev->mutex); 1160 1161 nbdev->err_stat->status_type[sct]++; 1162 switch (sct) { 1163 case SPDK_NVME_SCT_GENERIC: 1164 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1165 case SPDK_NVME_SCT_MEDIA_ERROR: 1166 case SPDK_NVME_SCT_PATH: 1167 nbdev->err_stat->status[sct][sc]++; 1168 break; 1169 default: 1170 break; 1171 } 1172 1173 pthread_mutex_unlock(&nbdev->mutex); 1174 } 1175 1176 static inline void 1177 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1178 { 1179 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1180 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1181 uint32_t blocklen = bdev_io->bdev->blocklen; 1182 struct spdk_bdev_io_stat *stat; 1183 uint64_t tsc_diff; 1184 1185 if (bio->io_path->stat == NULL) { 1186 return; 1187 } 1188 1189 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1190 stat = bio->io_path->stat; 1191 1192 switch (bdev_io->type) { 1193 case SPDK_BDEV_IO_TYPE_READ: 1194 stat->bytes_read += num_blocks * blocklen; 1195 stat->num_read_ops++; 1196 stat->read_latency_ticks += tsc_diff; 1197 if (stat->max_read_latency_ticks < tsc_diff) { 1198 stat->max_read_latency_ticks = tsc_diff; 1199 } 1200 if (stat->min_read_latency_ticks > tsc_diff) { 1201 stat->min_read_latency_ticks = tsc_diff; 1202 } 1203 break; 1204 case SPDK_BDEV_IO_TYPE_WRITE: 1205 stat->bytes_written += num_blocks * blocklen; 1206 stat->num_write_ops++; 1207 stat->write_latency_ticks += tsc_diff; 1208 if (stat->max_write_latency_ticks < tsc_diff) { 1209 stat->max_write_latency_ticks = tsc_diff; 1210 } 1211 if (stat->min_write_latency_ticks > tsc_diff) { 1212 stat->min_write_latency_ticks = tsc_diff; 1213 } 1214 break; 1215 case SPDK_BDEV_IO_TYPE_UNMAP: 1216 stat->bytes_unmapped += num_blocks * blocklen; 1217 stat->num_unmap_ops++; 1218 stat->unmap_latency_ticks += tsc_diff; 1219 if (stat->max_unmap_latency_ticks < tsc_diff) { 1220 
stat->max_unmap_latency_ticks = tsc_diff; 1221 } 1222 if (stat->min_unmap_latency_ticks > tsc_diff) { 1223 stat->min_unmap_latency_ticks = tsc_diff; 1224 } 1225 break; 1226 case SPDK_BDEV_IO_TYPE_ZCOPY: 1227 /* Track the data in the start phase only */ 1228 if (!bdev_io->u.bdev.zcopy.start) { 1229 break; 1230 } 1231 if (bdev_io->u.bdev.zcopy.populate) { 1232 stat->bytes_read += num_blocks * blocklen; 1233 stat->num_read_ops++; 1234 stat->read_latency_ticks += tsc_diff; 1235 if (stat->max_read_latency_ticks < tsc_diff) { 1236 stat->max_read_latency_ticks = tsc_diff; 1237 } 1238 if (stat->min_read_latency_ticks > tsc_diff) { 1239 stat->min_read_latency_ticks = tsc_diff; 1240 } 1241 } else { 1242 stat->bytes_written += num_blocks * blocklen; 1243 stat->num_write_ops++; 1244 stat->write_latency_ticks += tsc_diff; 1245 if (stat->max_write_latency_ticks < tsc_diff) { 1246 stat->max_write_latency_ticks = tsc_diff; 1247 } 1248 if (stat->min_write_latency_ticks > tsc_diff) { 1249 stat->min_write_latency_ticks = tsc_diff; 1250 } 1251 } 1252 break; 1253 case SPDK_BDEV_IO_TYPE_COPY: 1254 stat->bytes_copied += num_blocks * blocklen; 1255 stat->num_copy_ops++; 1256 stat->copy_latency_ticks += tsc_diff; 1257 if (stat->max_copy_latency_ticks < tsc_diff) { 1258 stat->max_copy_latency_ticks = tsc_diff; 1259 } 1260 if (stat->min_copy_latency_ticks > tsc_diff) { 1261 stat->min_copy_latency_ticks = tsc_diff; 1262 } 1263 break; 1264 default: 1265 break; 1266 } 1267 } 1268 1269 static inline void 1270 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1271 const struct spdk_nvme_cpl *cpl) 1272 { 1273 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1274 struct nvme_bdev_channel *nbdev_ch; 1275 struct nvme_io_path *io_path; 1276 struct nvme_ctrlr *nvme_ctrlr; 1277 const struct spdk_nvme_ctrlr_data *cdata; 1278 uint64_t delay_ms; 1279 1280 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1281 1282 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1283 bdev_nvme_update_io_path_stat(bio); 1284 goto complete; 1285 } 1286 1287 /* Update error counts before deciding if retry is needed. 1288 * Hence, error counts may be more than the number of I/O errors. 
1289 */ 1290 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1291 1292 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1293 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1294 goto complete; 1295 } 1296 1297 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1298 1299 assert(bio->io_path != NULL); 1300 io_path = bio->io_path; 1301 1302 nvme_ctrlr = io_path->qpair->ctrlr; 1303 1304 if (spdk_nvme_cpl_is_path_error(cpl) || 1305 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1306 !nvme_io_path_is_available(io_path) || 1307 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1308 bdev_nvme_clear_current_io_path(nbdev_ch); 1309 bio->io_path = NULL; 1310 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1311 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1312 io_path->nvme_ns->ana_state_updating = true; 1313 } 1314 } 1315 if (!any_io_path_may_become_available(nbdev_ch)) { 1316 goto complete; 1317 } 1318 delay_ms = 0; 1319 } else { 1320 bio->retry_count++; 1321 1322 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1323 1324 if (cpl->status.crd != 0) { 1325 delay_ms = cdata->crdt[cpl->status.crd] * 100; 1326 } else { 1327 delay_ms = 0; 1328 } 1329 } 1330 1331 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1332 return; 1333 1334 complete: 1335 bio->retry_count = 0; 1336 bio->submit_tsc = 0; 1337 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1338 } 1339 1340 static inline void 1341 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1342 { 1343 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1344 struct nvme_bdev_channel *nbdev_ch; 1345 enum spdk_bdev_io_status io_status; 1346 1347 switch (rc) { 1348 case 0: 1349 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1350 break; 1351 case -ENOMEM: 1352 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1353 break; 1354 case -ENXIO: 1355 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1356 1357 bdev_nvme_clear_current_io_path(nbdev_ch); 1358 bio->io_path = NULL; 1359 1360 if (any_io_path_may_become_available(nbdev_ch)) { 1361 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1362 return; 1363 } 1364 1365 /* fallthrough */ 1366 default: 1367 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1368 break; 1369 } 1370 1371 bio->retry_count = 0; 1372 bio->submit_tsc = 0; 1373 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1374 } 1375 1376 static inline void 1377 bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc) 1378 { 1379 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1380 enum spdk_bdev_io_status io_status; 1381 1382 switch (rc) { 1383 case 0: 1384 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1385 break; 1386 case -ENOMEM: 1387 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1388 break; 1389 case -ENXIO: 1390 /* fallthrough */ 1391 default: 1392 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1393 break; 1394 } 1395 1396 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1397 } 1398 1399 static void 1400 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1401 { 1402 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1403 1404 pthread_mutex_lock(&nvme_ctrlr->mutex); 1405 1406 assert(nvme_ctrlr->io_path_cache_clearing == true); 1407 nvme_ctrlr->io_path_cache_clearing = false; 1408 1409 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1410 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1411 return; 1412 } 1413 1414 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1415 1416 nvme_ctrlr_unregister(nvme_ctrlr); 1417 } 1418 1419 static void 
1420 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1421 { 1422 struct nvme_io_path *io_path; 1423 1424 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1425 if (io_path->nbdev_ch == NULL) { 1426 continue; 1427 } 1428 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1429 } 1430 } 1431 1432 static void 1433 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1434 { 1435 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1436 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1437 1438 assert(ctrlr_ch->qpair != NULL); 1439 1440 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1441 1442 spdk_for_each_channel_continue(i, 0); 1443 } 1444 1445 static void 1446 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1447 { 1448 pthread_mutex_lock(&nvme_ctrlr->mutex); 1449 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1450 nvme_ctrlr->io_path_cache_clearing) { 1451 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1452 return; 1453 } 1454 1455 nvme_ctrlr->io_path_cache_clearing = true; 1456 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1457 1458 spdk_for_each_channel(nvme_ctrlr, 1459 bdev_nvme_clear_io_path_cache, 1460 NULL, 1461 bdev_nvme_clear_io_path_caches_done); 1462 } 1463 1464 static struct nvme_qpair * 1465 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1466 { 1467 struct nvme_qpair *nvme_qpair; 1468 1469 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1470 if (nvme_qpair->qpair == qpair) { 1471 break; 1472 } 1473 } 1474 1475 return nvme_qpair; 1476 } 1477 1478 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1479 1480 static void 1481 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1482 { 1483 struct nvme_poll_group *group = poll_group_ctx; 1484 struct nvme_qpair *nvme_qpair; 1485 struct nvme_ctrlr_channel *ctrlr_ch; 1486 1487 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1488 if (nvme_qpair == NULL) { 1489 return; 1490 } 1491 1492 if (nvme_qpair->qpair != NULL) { 1493 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1494 nvme_qpair->qpair = NULL; 1495 } 1496 1497 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1498 1499 ctrlr_ch = nvme_qpair->ctrlr_ch; 1500 1501 if (ctrlr_ch != NULL) { 1502 if (ctrlr_ch->reset_iter != NULL) { 1503 /* If we are already in a full reset sequence, we do not have 1504 * to restart it. Just move to the next ctrlr_channel. 1505 */ 1506 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1507 qpair); 1508 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 1509 ctrlr_ch->reset_iter = NULL; 1510 } else { 1511 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1512 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1513 bdev_nvme_failover(nvme_qpair->ctrlr, false); 1514 } 1515 } else { 1516 /* In this case, ctrlr_channel is already deleted. */ 1517 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1518 nvme_qpair_delete(nvme_qpair); 1519 } 1520 } 1521 1522 static void 1523 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1524 { 1525 struct nvme_qpair *nvme_qpair; 1526 1527 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1528 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1529 continue; 1530 } 1531 1532 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1533 SPDK_NVME_QPAIR_FAILURE_NONE) { 1534 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1535 } 1536 } 1537 } 1538 1539 static int 1540 bdev_nvme_poll(void *arg) 1541 { 1542 struct nvme_poll_group *group = arg; 1543 int64_t num_completions; 1544 1545 if (group->collect_spin_stat && group->start_ticks == 0) { 1546 group->start_ticks = spdk_get_ticks(); 1547 } 1548 1549 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1550 bdev_nvme_disconnected_qpair_cb); 1551 if (group->collect_spin_stat) { 1552 if (num_completions > 0) { 1553 if (group->end_ticks != 0) { 1554 group->spin_ticks += (group->end_ticks - group->start_ticks); 1555 group->end_ticks = 0; 1556 } 1557 group->start_ticks = 0; 1558 } else { 1559 group->end_ticks = spdk_get_ticks(); 1560 } 1561 } 1562 1563 if (spdk_unlikely(num_completions < 0)) { 1564 bdev_nvme_check_io_qpairs(group); 1565 } 1566 1567 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1568 } 1569 1570 static int bdev_nvme_poll_adminq(void *arg); 1571 1572 static void 1573 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1574 { 1575 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1576 1577 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1578 nvme_ctrlr, new_period_us); 1579 } 1580 1581 static int 1582 bdev_nvme_poll_adminq(void *arg) 1583 { 1584 int32_t rc; 1585 struct nvme_ctrlr *nvme_ctrlr = arg; 1586 nvme_ctrlr_disconnected_cb disconnected_cb; 1587 1588 assert(nvme_ctrlr != NULL); 1589 1590 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1591 if (rc < 0) { 1592 disconnected_cb = nvme_ctrlr->disconnected_cb; 1593 nvme_ctrlr->disconnected_cb = NULL; 1594 1595 if (rc == -ENXIO && disconnected_cb != NULL) { 1596 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1597 g_opts.nvme_adminq_poll_period_us); 1598 disconnected_cb(nvme_ctrlr); 1599 } else { 1600 bdev_nvme_failover(nvme_ctrlr, false); 1601 } 1602 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1603 SPDK_NVME_QPAIR_FAILURE_NONE) { 1604 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1605 } 1606 1607 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1608 } 1609 1610 static void 1611 _bdev_nvme_unregister_dev_cb(void *io_device) 1612 { 1613 struct nvme_bdev *nvme_disk = io_device; 1614 1615 free(nvme_disk->disk.name); 1616 free(nvme_disk->err_stat); 1617 free(nvme_disk); 1618 } 1619 1620 static int 1621 bdev_nvme_destruct(void *ctx) 1622 { 1623 struct nvme_bdev *nvme_disk = ctx; 1624 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1625 1626 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1627 1628 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1629 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1630 1631 nvme_ns->bdev = NULL; 1632 1633 assert(nvme_ns->id > 0); 1634 1635 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1636 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1637 1638 nvme_ctrlr_release(nvme_ns->ctrlr); 1639 nvme_ns_free(nvme_ns); 1640 } else { 1641 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1642 } 1643 } 1644 1645 pthread_mutex_lock(&g_bdev_nvme_mutex); 1646 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1647 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1648 1649 spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb); 1650 1651 return 0; 1652 } 1653 1654 static int 1655 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1656 { 1657 struct nvme_ctrlr *nvme_ctrlr; 1658 struct spdk_nvme_io_qpair_opts opts; 1659 struct spdk_nvme_qpair *qpair; 1660 int rc; 1661 1662 nvme_ctrlr = nvme_qpair->ctrlr; 1663 1664 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1665 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1666 opts.create_only = true; 1667 opts.async_mode = true; 1668 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1669 g_opts.io_queue_requests = opts.io_queue_requests; 1670 1671 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1672 if (qpair == NULL) { 1673 return -1; 1674 } 1675 1676 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1677 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1678 1679 assert(nvme_qpair->group != NULL); 1680 1681 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1682 if (rc != 0) { 1683 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1684 goto err; 1685 } 1686 1687 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1688 if (rc != 0) { 1689 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1690 goto err; 1691 } 1692 1693 nvme_qpair->qpair = qpair; 1694 1695 if (!g_opts.disable_auto_failback) { 1696 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1697 } 1698 1699 return 0; 1700 1701 err: 1702 spdk_nvme_ctrlr_free_io_qpair(qpair); 1703 1704 return rc; 1705 } 1706 1707 static void 1708 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1709 { 1710 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1711 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1712 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1713 struct spdk_bdev_io *bdev_io; 1714 1715 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1716 status = SPDK_BDEV_IO_STATUS_FAILED; 1717 } 1718 1719 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1720 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1721 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1722 __bdev_nvme_io_complete(bdev_io, status, NULL); 1723 } 1724 1725 spdk_for_each_channel_continue(i, 0); 
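/* Every reset bdev_io queued on this ctrlr channel has now been completed with
 * the status chosen above; the for_each_channel walk continues to the next
 * channel so it can be drained the same way.
 */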
1726 } 1727 1728 static void 1729 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove) 1730 { 1731 struct nvme_path_id *path_id, *next_path; 1732 int rc __attribute__((unused)); 1733 1734 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1735 assert(path_id); 1736 assert(path_id == nvme_ctrlr->active_path_id); 1737 next_path = TAILQ_NEXT(path_id, link); 1738 1739 path_id->is_failed = true; 1740 1741 if (next_path) { 1742 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1743 1744 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1745 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1746 1747 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1748 nvme_ctrlr->active_path_id = next_path; 1749 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1750 assert(rc == 0); 1751 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1752 if (!remove) { 1753 /** Shuffle the old trid to the end of the list and use the new one. 1754 * Allows for round robin through multiple connections. 1755 */ 1756 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1757 } else { 1758 free(path_id); 1759 } 1760 } 1761 } 1762 1763 static bool 1764 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1765 { 1766 int32_t elapsed; 1767 1768 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1769 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1770 return false; 1771 } 1772 1773 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1774 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1775 return true; 1776 } else { 1777 return false; 1778 } 1779 } 1780 1781 static bool 1782 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1783 { 1784 uint32_t elapsed; 1785 1786 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1787 return false; 1788 } 1789 1790 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1791 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1792 return true; 1793 } else { 1794 return false; 1795 } 1796 } 1797 1798 static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1799 1800 static void 1801 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1802 { 1803 int rc; 1804 1805 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1806 if (rc != 0) { 1807 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1808 * fail the reset sequence immediately. 1809 */ 1810 bdev_nvme_reset_complete(nvme_ctrlr, false); 1811 return; 1812 } 1813 1814 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1815 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1816 */ 1817 assert(nvme_ctrlr->disconnected_cb == NULL); 1818 nvme_ctrlr->disconnected_cb = cb_fn; 1819 1820 /* During disconnection, reduce the period to poll adminq more often. */ 1821 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1822 } 1823 1824 enum bdev_nvme_op_after_reset { 1825 OP_NONE, 1826 OP_COMPLETE_PENDING_DESTRUCT, 1827 OP_DESTRUCT, 1828 OP_DELAYED_RECONNECT, 1829 }; 1830 1831 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1832 1833 static _bdev_nvme_op_after_reset 1834 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1835 { 1836 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1837 /* Complete pending destruct after reset completes. 
*/ 1838 return OP_COMPLETE_PENDING_DESTRUCT; 1839 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1840 nvme_ctrlr->reset_start_tsc = 0; 1841 return OP_NONE; 1842 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1843 return OP_DESTRUCT; 1844 } else { 1845 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1846 nvme_ctrlr->fast_io_fail_timedout = true; 1847 } 1848 bdev_nvme_failover_trid(nvme_ctrlr, false); 1849 return OP_DELAYED_RECONNECT; 1850 } 1851 } 1852 1853 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1854 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1855 1856 static int 1857 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1858 { 1859 struct nvme_ctrlr *nvme_ctrlr = ctx; 1860 1861 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1862 pthread_mutex_lock(&nvme_ctrlr->mutex); 1863 1864 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1865 1866 assert(nvme_ctrlr->reconnect_is_delayed == true); 1867 nvme_ctrlr->reconnect_is_delayed = false; 1868 1869 if (nvme_ctrlr->destruct) { 1870 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1871 return SPDK_POLLER_BUSY; 1872 } 1873 1874 assert(nvme_ctrlr->resetting == false); 1875 nvme_ctrlr->resetting = true; 1876 1877 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1878 1879 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1880 1881 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1882 return SPDK_POLLER_BUSY; 1883 } 1884 1885 static void 1886 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1887 { 1888 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1889 1890 assert(nvme_ctrlr->reconnect_is_delayed == false); 1891 nvme_ctrlr->reconnect_is_delayed = true; 1892 1893 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1894 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1895 nvme_ctrlr, 1896 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 1897 } 1898 1899 static void 1900 _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status) 1901 { 1902 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1903 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 1904 struct nvme_path_id *path_id; 1905 bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn; 1906 void *reset_cb_arg = nvme_ctrlr->reset_cb_arg; 1907 enum bdev_nvme_op_after_reset op_after_reset; 1908 1909 assert(nvme_ctrlr->thread == spdk_get_thread()); 1910 1911 nvme_ctrlr->reset_cb_fn = NULL; 1912 nvme_ctrlr->reset_cb_arg = NULL; 1913 1914 if (!success) { 1915 SPDK_ERRLOG("Resetting controller failed.\n"); 1916 } else { 1917 SPDK_NOTICELOG("Resetting controller successful.\n"); 1918 } 1919 1920 pthread_mutex_lock(&nvme_ctrlr->mutex); 1921 nvme_ctrlr->resetting = false; 1922 1923 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1924 assert(path_id != NULL); 1925 assert(path_id == nvme_ctrlr->active_path_id); 1926 1927 path_id->is_failed = !success; 1928 1929 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 1930 1931 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1932 1933 if (reset_cb_fn) { 1934 reset_cb_fn(reset_cb_arg, success); 1935 } 1936 1937 switch (op_after_reset) { 1938 case OP_COMPLETE_PENDING_DESTRUCT: 1939 nvme_ctrlr_unregister(nvme_ctrlr); 1940 break; 1941 case OP_DESTRUCT: 1942 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 1943 break; 1944 case OP_DELAYED_RECONNECT: 1945 nvme_ctrlr_disconnect(nvme_ctrlr, 
bdev_nvme_start_reconnect_delay_timer); 1946 break; 1947 default: 1948 break; 1949 } 1950 } 1951 1952 static void 1953 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 1954 { 1955 /* Make sure we clear any pending resets before returning. */ 1956 spdk_for_each_channel(nvme_ctrlr, 1957 bdev_nvme_complete_pending_resets, 1958 success ? NULL : (void *)0x1, 1959 _bdev_nvme_reset_complete); 1960 } 1961 1962 static void 1963 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 1964 { 1965 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1966 1967 bdev_nvme_reset_complete(nvme_ctrlr, false); 1968 } 1969 1970 static void 1971 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 1972 { 1973 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1974 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 1975 struct nvme_qpair *nvme_qpair; 1976 1977 nvme_qpair = ctrlr_ch->qpair; 1978 assert(nvme_qpair != NULL); 1979 1980 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1981 1982 if (nvme_qpair->qpair != NULL) { 1983 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 1984 1985 /* The current full reset sequence will move to the next 1986 * ctrlr_channel after the qpair is actually disconnected. 1987 */ 1988 assert(ctrlr_ch->reset_iter == NULL); 1989 ctrlr_ch->reset_iter = i; 1990 } else { 1991 spdk_for_each_channel_continue(i, 0); 1992 } 1993 } 1994 1995 static void 1996 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 1997 { 1998 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1999 2000 if (status == 0) { 2001 bdev_nvme_reset_complete(nvme_ctrlr, true); 2002 } else { 2003 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 2004 spdk_for_each_channel(nvme_ctrlr, 2005 bdev_nvme_reset_destroy_qpair, 2006 NULL, 2007 bdev_nvme_reset_create_qpairs_failed); 2008 } 2009 } 2010 2011 static void 2012 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2013 { 2014 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2015 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2016 int rc; 2017 2018 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2019 2020 spdk_for_each_channel_continue(i, rc); 2021 } 2022 2023 static int 2024 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2025 { 2026 struct nvme_ctrlr *nvme_ctrlr = arg; 2027 int rc = -ETIMEDOUT; 2028 2029 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2030 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2031 if (rc == -EAGAIN) { 2032 return SPDK_POLLER_BUSY; 2033 } 2034 } 2035 2036 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2037 if (rc == 0) { 2038 /* Recreate all of the I/O queue pairs */ 2039 spdk_for_each_channel(nvme_ctrlr, 2040 bdev_nvme_reset_create_qpair, 2041 NULL, 2042 bdev_nvme_reset_create_qpairs_done); 2043 } else { 2044 bdev_nvme_reset_complete(nvme_ctrlr, false); 2045 } 2046 return SPDK_POLLER_BUSY; 2047 } 2048 2049 static void 2050 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2051 { 2052 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2053 2054 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2055 assert(nvme_ctrlr->reset_detach_poller == NULL); 2056 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2057 nvme_ctrlr, 0); 2058 } 2059 2060 static void 2061 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 2062 { 2063 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2064 2065 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2066 assert(status == 0); 2067 2068 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2069 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2070 } else { 2071 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2072 } 2073 } 2074 2075 static void 2076 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2077 { 2078 spdk_for_each_channel(nvme_ctrlr, 2079 bdev_nvme_reset_destroy_qpair, 2080 NULL, 2081 bdev_nvme_reset_ctrlr); 2082 } 2083 2084 static void 2085 _bdev_nvme_reset(void *ctx) 2086 { 2087 struct nvme_ctrlr *nvme_ctrlr = ctx; 2088 2089 assert(nvme_ctrlr->resetting == true); 2090 assert(nvme_ctrlr->thread == spdk_get_thread()); 2091 2092 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2093 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2094 } else { 2095 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2096 } 2097 } 2098 2099 static int 2100 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) 2101 { 2102 pthread_mutex_lock(&nvme_ctrlr->mutex); 2103 if (nvme_ctrlr->destruct) { 2104 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2105 return -ENXIO; 2106 } 2107 2108 if (nvme_ctrlr->resetting) { 2109 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2110 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2111 return -EBUSY; 2112 } 2113 2114 if (nvme_ctrlr->reconnect_is_delayed) { 2115 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2116 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2117 return -EBUSY; 2118 } 2119 2120 nvme_ctrlr->resetting = true; 2121 2122 assert(nvme_ctrlr->reset_start_tsc == 0); 2123 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2124 2125 
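	/* Note (descriptive comment, not in the original): resetting and reset_start_tsc are
	 * updated while the mutex is held so that concurrent reset or failover attempts see a
	 * consistent state. The actual teardown and reconnect work is deferred to the
	 * controller's thread by the message sent below.
	 */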
pthread_mutex_unlock(&nvme_ctrlr->mutex); 2126 2127 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 2128 return 0; 2129 } 2130 2131 int 2132 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) 2133 { 2134 int rc; 2135 2136 rc = bdev_nvme_reset(nvme_ctrlr); 2137 if (rc == 0) { 2138 nvme_ctrlr->reset_cb_fn = cb_fn; 2139 nvme_ctrlr->reset_cb_arg = cb_arg; 2140 } 2141 return rc; 2142 } 2143 2144 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2145 2146 static void 2147 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2148 { 2149 enum spdk_bdev_io_status io_status; 2150 2151 if (bio->cpl.cdw0 == 0) { 2152 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2153 } else { 2154 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2155 } 2156 2157 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2158 } 2159 2160 static void 2161 _bdev_nvme_reset_io_continue(void *ctx) 2162 { 2163 struct nvme_bdev_io *bio = ctx; 2164 struct nvme_io_path *prev_io_path, *next_io_path; 2165 int rc; 2166 2167 prev_io_path = bio->io_path; 2168 bio->io_path = NULL; 2169 2170 if (bio->cpl.cdw0 != 0) { 2171 goto complete; 2172 } 2173 2174 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2175 if (next_io_path == NULL) { 2176 goto complete; 2177 } 2178 2179 rc = _bdev_nvme_reset_io(next_io_path, bio); 2180 if (rc == 0) { 2181 return; 2182 } 2183 2184 bio->cpl.cdw0 = 1; 2185 2186 complete: 2187 bdev_nvme_reset_io_complete(bio); 2188 } 2189 2190 static void 2191 bdev_nvme_reset_io_continue(void *cb_arg, bool success) 2192 { 2193 struct nvme_bdev_io *bio = cb_arg; 2194 2195 bio->cpl.cdw0 = !success; 2196 2197 spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); 2198 } 2199 2200 static int 2201 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2202 { 2203 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 2204 struct nvme_ctrlr_channel *ctrlr_ch; 2205 struct spdk_bdev_io *bdev_io; 2206 int rc; 2207 2208 rc = bdev_nvme_reset(nvme_ctrlr); 2209 if (rc == 0) { 2210 assert(bio->io_path == NULL); 2211 bio->io_path = io_path; 2212 2213 assert(nvme_ctrlr->reset_cb_fn == NULL); 2214 assert(nvme_ctrlr->reset_cb_arg == NULL); 2215 nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; 2216 nvme_ctrlr->reset_cb_arg = bio; 2217 } else if (rc == -EBUSY) { 2218 ctrlr_ch = io_path->qpair->ctrlr_ch; 2219 assert(ctrlr_ch != NULL); 2220 /* 2221 * Reset call is queued only if it is from the app framework. This is on purpose so that 2222 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2223 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2224 */ 2225 bdev_io = spdk_bdev_io_from_ctx(bio); 2226 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2227 rc = 0; 2228 } 2229 2230 return rc; 2231 } 2232 2233 static void 2234 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2235 { 2236 struct nvme_io_path *io_path; 2237 int rc; 2238 2239 bio->cpl.cdw0 = 0; 2240 bio->orig_thread = spdk_get_thread(); 2241 2242 /* Reset all nvme_ctrlrs of a bdev controller sequentially. 
*/ 2243 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2244 assert(io_path != NULL); 2245 2246 rc = _bdev_nvme_reset_io(io_path, bio); 2247 if (rc != 0) { 2248 bio->cpl.cdw0 = 1; 2249 bdev_nvme_reset_io_complete(bio); 2250 } 2251 } 2252 2253 static int 2254 bdev_nvme_failover_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2255 { 2256 if (nvme_ctrlr->destruct) { 2257 /* Don't bother resetting if the controller is in the process of being destructed. */ 2258 return -ENXIO; 2259 } 2260 2261 if (nvme_ctrlr->resetting) { 2262 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2263 return -EBUSY; 2264 } 2265 2266 bdev_nvme_failover_trid(nvme_ctrlr, remove); 2267 2268 if (nvme_ctrlr->reconnect_is_delayed) { 2269 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2270 2271 /* We rely on the next reconnect for the failover. */ 2272 return -EALREADY; 2273 } 2274 2275 nvme_ctrlr->resetting = true; 2276 2277 assert(nvme_ctrlr->reset_start_tsc == 0); 2278 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2279 2280 return 0; 2281 } 2282 2283 static int 2284 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2285 { 2286 int rc; 2287 2288 pthread_mutex_lock(&nvme_ctrlr->mutex); 2289 rc = bdev_nvme_failover_unsafe(nvme_ctrlr, remove); 2290 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2291 2292 if (rc == 0) { 2293 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 2294 } else if (rc == -EALREADY) { 2295 rc = 0; 2296 } 2297 2298 return rc; 2299 } 2300 2301 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2302 uint64_t num_blocks); 2303 2304 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2305 uint64_t num_blocks); 2306 2307 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2308 uint64_t src_offset_blocks, 2309 uint64_t num_blocks); 2310 2311 static void 2312 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2313 bool success) 2314 { 2315 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2316 struct spdk_bdev *bdev = bdev_io->bdev; 2317 int ret; 2318 2319 if (!success) { 2320 ret = -EINVAL; 2321 goto exit; 2322 } 2323 2324 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2325 ret = -ENXIO; 2326 goto exit; 2327 } 2328 2329 ret = bdev_nvme_readv(bio, 2330 bdev_io->u.bdev.iovs, 2331 bdev_io->u.bdev.iovcnt, 2332 bdev_io->u.bdev.md_buf, 2333 bdev_io->u.bdev.num_blocks, 2334 bdev_io->u.bdev.offset_blocks, 2335 bdev->dif_check_flags, 2336 bdev_io->u.bdev.memory_domain, 2337 bdev_io->u.bdev.memory_domain_ctx); 2338 2339 exit: 2340 if (spdk_unlikely(ret != 0)) { 2341 bdev_nvme_io_complete(bio, ret); 2342 } 2343 } 2344 2345 static inline void 2346 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2347 { 2348 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2349 struct spdk_bdev *bdev = bdev_io->bdev; 2350 struct nvme_bdev_io *nbdev_io_to_abort; 2351 int rc = 0; 2352 2353 switch (bdev_io->type) { 2354 case SPDK_BDEV_IO_TYPE_READ: 2355 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2356 rc = bdev_nvme_readv(nbdev_io, 2357 bdev_io->u.bdev.iovs, 2358 bdev_io->u.bdev.iovcnt, 2359 bdev_io->u.bdev.md_buf, 2360 bdev_io->u.bdev.num_blocks, 2361 bdev_io->u.bdev.offset_blocks, 2362 bdev->dif_check_flags, 2363 bdev_io->u.bdev.memory_domain, 2364 bdev_io->u.bdev.memory_domain_ctx); 2365 } else { 2366 spdk_bdev_io_get_buf(bdev_io, 
bdev_nvme_get_buf_cb, 2367 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2368 rc = 0; 2369 } 2370 break; 2371 case SPDK_BDEV_IO_TYPE_WRITE: 2372 rc = bdev_nvme_writev(nbdev_io, 2373 bdev_io->u.bdev.iovs, 2374 bdev_io->u.bdev.iovcnt, 2375 bdev_io->u.bdev.md_buf, 2376 bdev_io->u.bdev.num_blocks, 2377 bdev_io->u.bdev.offset_blocks, 2378 bdev->dif_check_flags, 2379 bdev_io->u.bdev.memory_domain, 2380 bdev_io->u.bdev.memory_domain_ctx); 2381 break; 2382 case SPDK_BDEV_IO_TYPE_COMPARE: 2383 rc = bdev_nvme_comparev(nbdev_io, 2384 bdev_io->u.bdev.iovs, 2385 bdev_io->u.bdev.iovcnt, 2386 bdev_io->u.bdev.md_buf, 2387 bdev_io->u.bdev.num_blocks, 2388 bdev_io->u.bdev.offset_blocks, 2389 bdev->dif_check_flags); 2390 break; 2391 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2392 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2393 bdev_io->u.bdev.iovs, 2394 bdev_io->u.bdev.iovcnt, 2395 bdev_io->u.bdev.fused_iovs, 2396 bdev_io->u.bdev.fused_iovcnt, 2397 bdev_io->u.bdev.md_buf, 2398 bdev_io->u.bdev.num_blocks, 2399 bdev_io->u.bdev.offset_blocks, 2400 bdev->dif_check_flags); 2401 break; 2402 case SPDK_BDEV_IO_TYPE_UNMAP: 2403 rc = bdev_nvme_unmap(nbdev_io, 2404 bdev_io->u.bdev.offset_blocks, 2405 bdev_io->u.bdev.num_blocks); 2406 break; 2407 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2408 rc = bdev_nvme_write_zeroes(nbdev_io, 2409 bdev_io->u.bdev.offset_blocks, 2410 bdev_io->u.bdev.num_blocks); 2411 break; 2412 case SPDK_BDEV_IO_TYPE_RESET: 2413 nbdev_io->io_path = NULL; 2414 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2415 break; 2416 case SPDK_BDEV_IO_TYPE_FLUSH: 2417 bdev_nvme_io_complete(nbdev_io, 0); 2418 break; 2419 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2420 rc = bdev_nvme_zone_appendv(nbdev_io, 2421 bdev_io->u.bdev.iovs, 2422 bdev_io->u.bdev.iovcnt, 2423 bdev_io->u.bdev.md_buf, 2424 bdev_io->u.bdev.num_blocks, 2425 bdev_io->u.bdev.offset_blocks, 2426 bdev->dif_check_flags); 2427 break; 2428 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2429 rc = bdev_nvme_get_zone_info(nbdev_io, 2430 bdev_io->u.zone_mgmt.zone_id, 2431 bdev_io->u.zone_mgmt.num_zones, 2432 bdev_io->u.zone_mgmt.buf); 2433 break; 2434 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2435 rc = bdev_nvme_zone_management(nbdev_io, 2436 bdev_io->u.zone_mgmt.zone_id, 2437 bdev_io->u.zone_mgmt.zone_action); 2438 break; 2439 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2440 nbdev_io->io_path = NULL; 2441 bdev_nvme_admin_passthru(nbdev_ch, 2442 nbdev_io, 2443 &bdev_io->u.nvme_passthru.cmd, 2444 bdev_io->u.nvme_passthru.buf, 2445 bdev_io->u.nvme_passthru.nbytes); 2446 break; 2447 case SPDK_BDEV_IO_TYPE_NVME_IO: 2448 rc = bdev_nvme_io_passthru(nbdev_io, 2449 &bdev_io->u.nvme_passthru.cmd, 2450 bdev_io->u.nvme_passthru.buf, 2451 bdev_io->u.nvme_passthru.nbytes); 2452 break; 2453 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2454 rc = bdev_nvme_io_passthru_md(nbdev_io, 2455 &bdev_io->u.nvme_passthru.cmd, 2456 bdev_io->u.nvme_passthru.buf, 2457 bdev_io->u.nvme_passthru.nbytes, 2458 bdev_io->u.nvme_passthru.md_buf, 2459 bdev_io->u.nvme_passthru.md_len); 2460 break; 2461 case SPDK_BDEV_IO_TYPE_ABORT: 2462 nbdev_io->io_path = NULL; 2463 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2464 bdev_nvme_abort(nbdev_ch, 2465 nbdev_io, 2466 nbdev_io_to_abort); 2467 break; 2468 case SPDK_BDEV_IO_TYPE_COPY: 2469 rc = bdev_nvme_copy(nbdev_io, 2470 bdev_io->u.bdev.offset_blocks, 2471 bdev_io->u.bdev.copy.src_offset_blocks, 2472 bdev_io->u.bdev.num_blocks); 2473 break; 2474 default: 2475 rc = -EINVAL; 2476 break; 2477 } 2478 2479 if (spdk_unlikely(rc != 0)) { 2480 
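		/* Note (descriptive comment, not in the original): any synchronous submission
		 * failure, including an unsupported I/O type, is reported back to the bdev layer
		 * through the common completion path using the negative errno collected above.
		 */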
bdev_nvme_io_complete(nbdev_io, rc); 2481 } 2482 } 2483 2484 static void 2485 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2486 { 2487 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2488 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2489 2490 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 2491 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 2492 } else { 2493 /* There are cases where submit_tsc != 0, i.e. retry I/O. 2494 * We need to update submit_tsc here. 2495 */ 2496 nbdev_io->submit_tsc = spdk_get_ticks(); 2497 } 2498 2499 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 2500 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 2501 if (spdk_unlikely(!nbdev_io->io_path)) { 2502 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 2503 bdev_nvme_io_complete(nbdev_io, -ENXIO); 2504 return; 2505 } 2506 2507 /* Admin commands do not use the optimal I/O path. 2508 * Simply fall through even if it is not found. 2509 */ 2510 } 2511 2512 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 2513 } 2514 2515 static bool 2516 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 2517 { 2518 struct nvme_bdev *nbdev = ctx; 2519 struct nvme_ns *nvme_ns; 2520 struct spdk_nvme_ns *ns; 2521 struct spdk_nvme_ctrlr *ctrlr; 2522 const struct spdk_nvme_ctrlr_data *cdata; 2523 2524 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2525 assert(nvme_ns != NULL); 2526 ns = nvme_ns->ns; 2527 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2528 2529 switch (io_type) { 2530 case SPDK_BDEV_IO_TYPE_READ: 2531 case SPDK_BDEV_IO_TYPE_WRITE: 2532 case SPDK_BDEV_IO_TYPE_RESET: 2533 case SPDK_BDEV_IO_TYPE_FLUSH: 2534 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2535 case SPDK_BDEV_IO_TYPE_NVME_IO: 2536 case SPDK_BDEV_IO_TYPE_ABORT: 2537 return true; 2538 2539 case SPDK_BDEV_IO_TYPE_COMPARE: 2540 return spdk_nvme_ns_supports_compare(ns); 2541 2542 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2543 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 2544 2545 case SPDK_BDEV_IO_TYPE_UNMAP: 2546 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2547 return cdata->oncs.dsm; 2548 2549 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2550 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2551 return cdata->oncs.write_zeroes; 2552 2553 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2554 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2555 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2556 return true; 2557 } 2558 return false; 2559 2560 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2561 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2562 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2563 2564 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2565 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2566 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2567 2568 case SPDK_BDEV_IO_TYPE_COPY: 2569 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2570 return cdata->oncs.copy; 2571 2572 default: 2573 return false; 2574 } 2575 } 2576 2577 static int 2578 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 2579 { 2580 struct nvme_qpair *nvme_qpair; 2581 struct spdk_io_channel *pg_ch; 2582 int rc; 2583 2584 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 2585 if (!nvme_qpair) { 2586 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 2587 return -1; 2588 } 2589 2590 TAILQ_INIT(&nvme_qpair->io_path_list); 2591 2592 nvme_qpair->ctrlr = nvme_ctrlr; 2593 nvme_qpair->ctrlr_ch = ctrlr_ch; 2594 2595 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2596 if (!pg_ch) { 2597 free(nvme_qpair); 2598 return -1; 2599 } 2600 2601 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 2602 2603 #ifdef SPDK_CONFIG_VTUNE 2604 nvme_qpair->group->collect_spin_stat = true; 2605 #else 2606 nvme_qpair->group->collect_spin_stat = false; 2607 #endif 2608 2609 rc = bdev_nvme_create_qpair(nvme_qpair); 2610 if (rc != 0) { 2611 /* nvme_ctrlr can't create IO qpair if connection is down. 2612 * 2613 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 2614 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 2615 * submitted IO will be queued until IO qpair is successfully created. 2616 * 2617 * Hence, if both are satisfied, ignore the failure. 
2618 */ 2619 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 2620 spdk_put_io_channel(pg_ch); 2621 free(nvme_qpair); 2622 return rc; 2623 } 2624 } 2625 2626 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2627 2628 ctrlr_ch->qpair = nvme_qpair; 2629 2630 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 2631 nvme_qpair->ctrlr->ref++; 2632 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 2633 2634 return 0; 2635 } 2636 2637 static int 2638 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2639 { 2640 struct nvme_ctrlr *nvme_ctrlr = io_device; 2641 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2642 2643 TAILQ_INIT(&ctrlr_ch->pending_resets); 2644 2645 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 2646 } 2647 2648 static void 2649 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 2650 { 2651 struct nvme_io_path *io_path, *next; 2652 2653 assert(nvme_qpair->group != NULL); 2654 2655 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 2656 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 2657 nvme_io_path_free(io_path); 2658 } 2659 2660 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2661 2662 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 2663 2664 nvme_ctrlr_release(nvme_qpair->ctrlr); 2665 2666 free(nvme_qpair); 2667 } 2668 2669 static void 2670 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2671 { 2672 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2673 struct nvme_qpair *nvme_qpair; 2674 2675 nvme_qpair = ctrlr_ch->qpair; 2676 assert(nvme_qpair != NULL); 2677 2678 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2679 2680 if (nvme_qpair->qpair != NULL) { 2681 if (ctrlr_ch->reset_iter == NULL) { 2682 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2683 } else { 2684 /* Skip current ctrlr_channel in a full reset sequence because 2685 * it is being deleted now. The qpair is already being disconnected. 2686 * We do not have to restart disconnecting it. 2687 */ 2688 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2689 } 2690 2691 /* We cannot release a reference to the poll group now. 2692 * The qpair may be disconnected asynchronously later. 2693 * We need to poll it until it is actually disconnected. 2694 * Just detach the qpair from the deleting ctrlr_channel. 
2695 */ 2696 nvme_qpair->ctrlr_ch = NULL; 2697 } else { 2698 assert(ctrlr_ch->reset_iter == NULL); 2699 2700 nvme_qpair_delete(nvme_qpair); 2701 } 2702 } 2703 2704 static void 2705 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2706 uint32_t iov_cnt, uint32_t seed, 2707 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2708 { 2709 struct nvme_poll_group *group = ctx; 2710 int rc; 2711 2712 assert(cb_fn != NULL); 2713 2714 if (spdk_unlikely(!group->accel_channel)) { 2715 group->accel_channel = spdk_accel_get_io_channel(); 2716 if (!group->accel_channel) { 2717 cb_fn(cb_arg, -ENOMEM); 2718 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2719 group); 2720 return; 2721 } 2722 } 2723 2724 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2725 if (rc) { 2726 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2727 if (rc == -ENOMEM || rc == -EINVAL) { 2728 cb_fn(cb_arg, rc); 2729 } 2730 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2731 } 2732 } 2733 2734 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2735 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2736 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2737 }; 2738 2739 static int 2740 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2741 { 2742 struct nvme_poll_group *group = ctx_buf; 2743 2744 TAILQ_INIT(&group->qpair_list); 2745 2746 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2747 if (group->group == NULL) { 2748 return -1; 2749 } 2750 2751 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2752 2753 if (group->poller == NULL) { 2754 spdk_nvme_poll_group_destroy(group->group); 2755 return -1; 2756 } 2757 2758 return 0; 2759 } 2760 2761 static void 2762 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2763 { 2764 struct nvme_poll_group *group = ctx_buf; 2765 2766 assert(TAILQ_EMPTY(&group->qpair_list)); 2767 2768 if (group->accel_channel) { 2769 spdk_put_io_channel(group->accel_channel); 2770 } 2771 2772 spdk_poller_unregister(&group->poller); 2773 if (spdk_nvme_poll_group_destroy(group->group)) { 2774 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2775 assert(false); 2776 } 2777 } 2778 2779 static struct spdk_io_channel * 2780 bdev_nvme_get_io_channel(void *ctx) 2781 { 2782 struct nvme_bdev *nvme_bdev = ctx; 2783 2784 return spdk_get_io_channel(nvme_bdev); 2785 } 2786 2787 static void * 2788 bdev_nvme_get_module_ctx(void *ctx) 2789 { 2790 struct nvme_bdev *nvme_bdev = ctx; 2791 struct nvme_ns *nvme_ns; 2792 2793 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2794 return NULL; 2795 } 2796 2797 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2798 if (!nvme_ns) { 2799 return NULL; 2800 } 2801 2802 return nvme_ns->ns; 2803 } 2804 2805 static const char * 2806 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2807 { 2808 switch (ana_state) { 2809 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2810 return "optimized"; 2811 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2812 return "non_optimized"; 2813 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2814 return "inaccessible"; 2815 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2816 return "persistent_loss"; 2817 case SPDK_NVME_ANA_CHANGE_STATE: 2818 return "change"; 2819 default: 2820 return NULL; 2821 } 2822 } 2823 2824 static int 2825 bdev_nvme_get_memory_domains(void 
*ctx, struct spdk_memory_domain **domains, int array_size) 2826 { 2827 struct spdk_memory_domain **_domains = NULL; 2828 struct nvme_bdev *nbdev = ctx; 2829 struct nvme_ns *nvme_ns; 2830 int i = 0, _array_size = array_size; 2831 int rc = 0; 2832 2833 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 2834 if (domains && array_size >= i) { 2835 _domains = &domains[i]; 2836 } else { 2837 _domains = NULL; 2838 } 2839 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 2840 if (rc > 0) { 2841 i += rc; 2842 if (_array_size >= rc) { 2843 _array_size -= rc; 2844 } else { 2845 _array_size = 0; 2846 } 2847 } else if (rc < 0) { 2848 return rc; 2849 } 2850 } 2851 2852 return i; 2853 } 2854 2855 static const char * 2856 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 2857 { 2858 if (nvme_ctrlr->destruct) { 2859 return "deleting"; 2860 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 2861 return "failed"; 2862 } else if (nvme_ctrlr->resetting) { 2863 return "resetting"; 2864 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 2865 return "reconnect_is_delayed"; 2866 } else { 2867 return "enabled"; 2868 } 2869 } 2870 2871 void 2872 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 2873 { 2874 struct spdk_nvme_transport_id *trid; 2875 const struct spdk_nvme_ctrlr_opts *opts; 2876 const struct spdk_nvme_ctrlr_data *cdata; 2877 2878 spdk_json_write_object_begin(w); 2879 2880 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 2881 2882 #ifdef SPDK_CONFIG_NVME_CUSE 2883 size_t cuse_name_size = 128; 2884 char cuse_name[cuse_name_size]; 2885 2886 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 2887 if (rc == 0) { 2888 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2889 } 2890 #endif 2891 trid = &nvme_ctrlr->active_path_id->trid; 2892 spdk_json_write_named_object_begin(w, "trid"); 2893 nvme_bdev_dump_trid_json(trid, w); 2894 spdk_json_write_object_end(w); 2895 2896 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 2897 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2898 2899 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 2900 spdk_json_write_named_object_begin(w, "host"); 2901 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 2902 spdk_json_write_named_string(w, "addr", opts->src_addr); 2903 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 2904 spdk_json_write_object_end(w); 2905 2906 spdk_json_write_object_end(w); 2907 } 2908 2909 static void 2910 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2911 struct nvme_ns *nvme_ns) 2912 { 2913 struct spdk_nvme_ns *ns; 2914 struct spdk_nvme_ctrlr *ctrlr; 2915 const struct spdk_nvme_ctrlr_data *cdata; 2916 const struct spdk_nvme_transport_id *trid; 2917 union spdk_nvme_vs_register vs; 2918 const struct spdk_nvme_ns_data *nsdata; 2919 char buf[128]; 2920 2921 ns = nvme_ns->ns; 2922 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2923 2924 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2925 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2926 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2927 2928 spdk_json_write_object_begin(w); 2929 2930 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2931 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2932 } 2933 2934 spdk_json_write_named_object_begin(w, "trid"); 2935 2936 nvme_bdev_dump_trid_json(trid, w); 2937 2938 spdk_json_write_object_end(w); 2939 2940 #ifdef SPDK_CONFIG_NVME_CUSE 2941 size_t cuse_name_size = 128; 2942 char 
cuse_name[cuse_name_size]; 2943 2944 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2945 cuse_name, &cuse_name_size); 2946 if (rc == 0) { 2947 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2948 } 2949 #endif 2950 2951 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2952 2953 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2954 2955 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2956 2957 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2958 spdk_str_trim(buf); 2959 spdk_json_write_named_string(w, "model_number", buf); 2960 2961 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2962 spdk_str_trim(buf); 2963 spdk_json_write_named_string(w, "serial_number", buf); 2964 2965 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2966 spdk_str_trim(buf); 2967 spdk_json_write_named_string(w, "firmware_revision", buf); 2968 2969 if (cdata->subnqn[0] != '\0') { 2970 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2971 } 2972 2973 spdk_json_write_named_object_begin(w, "oacs"); 2974 2975 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2976 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2977 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2978 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2979 2980 spdk_json_write_object_end(w); 2981 2982 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 2983 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 2984 2985 spdk_json_write_object_end(w); 2986 2987 spdk_json_write_named_object_begin(w, "vs"); 2988 2989 spdk_json_write_name(w, "nvme_version"); 2990 if (vs.bits.ter) { 2991 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2992 } else { 2993 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2994 } 2995 2996 spdk_json_write_object_end(w); 2997 2998 nsdata = spdk_nvme_ns_get_data(ns); 2999 3000 spdk_json_write_named_object_begin(w, "ns_data"); 3001 3002 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3003 3004 if (cdata->cmic.ana_reporting) { 3005 spdk_json_write_named_string(w, "ana_state", 3006 _nvme_ana_state_str(nvme_ns->ana_state)); 3007 } 3008 3009 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3010 3011 spdk_json_write_object_end(w); 3012 3013 if (cdata->oacs.security) { 3014 spdk_json_write_named_object_begin(w, "security"); 3015 3016 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3017 3018 spdk_json_write_object_end(w); 3019 } 3020 3021 spdk_json_write_object_end(w); 3022 } 3023 3024 static const char * 3025 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3026 { 3027 switch (nbdev->mp_policy) { 3028 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3029 return "active_passive"; 3030 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3031 return "active_active"; 3032 default: 3033 assert(false); 3034 return "invalid"; 3035 } 3036 } 3037 3038 static int 3039 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3040 { 3041 struct nvme_bdev *nvme_bdev = ctx; 3042 struct nvme_ns *nvme_ns; 3043 3044 pthread_mutex_lock(&nvme_bdev->mutex); 3045 spdk_json_write_named_array_begin(w, "nvme"); 3046 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3047 nvme_namespace_info_json(w, nvme_ns); 3048 } 3049 spdk_json_write_array_end(w); 3050 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3051 
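	/* Note (descriptive comment, not in the original): the bdev mutex is held across the
	 * dump above so that the namespace list and the multipath policy are reported as one
	 * consistent snapshot.
	 */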
pthread_mutex_unlock(&nvme_bdev->mutex); 3052 3053 return 0; 3054 } 3055 3056 static void 3057 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3058 { 3059 /* No config per bdev needed */ 3060 } 3061 3062 static uint64_t 3063 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3064 { 3065 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3066 struct nvme_io_path *io_path; 3067 struct nvme_poll_group *group; 3068 uint64_t spin_time = 0; 3069 3070 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3071 group = io_path->qpair->group; 3072 3073 if (!group || !group->collect_spin_stat) { 3074 continue; 3075 } 3076 3077 if (group->end_ticks != 0) { 3078 group->spin_ticks += (group->end_ticks - group->start_ticks); 3079 group->end_ticks = 0; 3080 } 3081 3082 spin_time += group->spin_ticks; 3083 group->start_ticks = 0; 3084 group->spin_ticks = 0; 3085 } 3086 3087 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3088 } 3089 3090 static void 3091 bdev_nvme_reset_device_stat(void *ctx) 3092 { 3093 struct nvme_bdev *nbdev = ctx; 3094 3095 if (nbdev->err_stat != NULL) { 3096 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3097 } 3098 } 3099 3100 /* JSON string should be lowercases and underscore delimited string. */ 3101 static void 3102 bdev_nvme_format_nvme_status(char *dst, const char *src) 3103 { 3104 char tmp[256]; 3105 3106 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3107 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3108 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3109 spdk_strlwr(dst); 3110 } 3111 3112 static void 3113 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3114 { 3115 struct nvme_bdev *nbdev = ctx; 3116 struct spdk_nvme_status status = {}; 3117 uint16_t sct, sc; 3118 char status_json[256]; 3119 const char *status_str; 3120 3121 if (nbdev->err_stat == NULL) { 3122 return; 3123 } 3124 3125 spdk_json_write_named_object_begin(w, "nvme_error"); 3126 3127 spdk_json_write_named_object_begin(w, "status_type"); 3128 for (sct = 0; sct < 8; sct++) { 3129 if (nbdev->err_stat->status_type[sct] == 0) { 3130 continue; 3131 } 3132 status.sct = sct; 3133 3134 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3135 assert(status_str != NULL); 3136 bdev_nvme_format_nvme_status(status_json, status_str); 3137 3138 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3139 } 3140 spdk_json_write_object_end(w); 3141 3142 spdk_json_write_named_object_begin(w, "status_code"); 3143 for (sct = 0; sct < 4; sct++) { 3144 status.sct = sct; 3145 for (sc = 0; sc < 256; sc++) { 3146 if (nbdev->err_stat->status[sct][sc] == 0) { 3147 continue; 3148 } 3149 status.sc = sc; 3150 3151 status_str = spdk_nvme_cpl_get_status_string(&status); 3152 assert(status_str != NULL); 3153 bdev_nvme_format_nvme_status(status_json, status_str); 3154 3155 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3156 } 3157 } 3158 spdk_json_write_object_end(w); 3159 3160 spdk_json_write_object_end(w); 3161 } 3162 3163 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3164 .destruct = bdev_nvme_destruct, 3165 .submit_request = bdev_nvme_submit_request, 3166 .io_type_supported = bdev_nvme_io_type_supported, 3167 .get_io_channel = bdev_nvme_get_io_channel, 3168 .dump_info_json = bdev_nvme_dump_info_json, 3169 .write_config_json = bdev_nvme_write_config_json, 3170 .get_spin_time = bdev_nvme_get_spin_time, 3171 .get_module_ctx = bdev_nvme_get_module_ctx, 3172 
.get_memory_domains = bdev_nvme_get_memory_domains, 3173 .reset_device_stat = bdev_nvme_reset_device_stat, 3174 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3175 }; 3176 3177 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3178 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3179 3180 static int 3181 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3182 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3183 { 3184 struct spdk_nvme_ana_group_descriptor *copied_desc; 3185 uint8_t *orig_desc; 3186 uint32_t i, desc_size, copy_len; 3187 int rc = 0; 3188 3189 if (nvme_ctrlr->ana_log_page == NULL) { 3190 return -EINVAL; 3191 } 3192 3193 copied_desc = nvme_ctrlr->copied_ana_desc; 3194 3195 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3196 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3197 3198 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3199 memcpy(copied_desc, orig_desc, copy_len); 3200 3201 rc = cb_fn(copied_desc, cb_arg); 3202 if (rc != 0) { 3203 break; 3204 } 3205 3206 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3207 copied_desc->num_of_nsid * sizeof(uint32_t); 3208 orig_desc += desc_size; 3209 copy_len -= desc_size; 3210 } 3211 3212 return rc; 3213 } 3214 3215 static int 3216 nvme_ns_ana_transition_timedout(void *ctx) 3217 { 3218 struct nvme_ns *nvme_ns = ctx; 3219 3220 spdk_poller_unregister(&nvme_ns->anatt_timer); 3221 nvme_ns->ana_transition_timedout = true; 3222 3223 return SPDK_POLLER_BUSY; 3224 } 3225 3226 static void 3227 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3228 const struct spdk_nvme_ana_group_descriptor *desc) 3229 { 3230 const struct spdk_nvme_ctrlr_data *cdata; 3231 3232 nvme_ns->ana_group_id = desc->ana_group_id; 3233 nvme_ns->ana_state = desc->ana_state; 3234 nvme_ns->ana_state_updating = false; 3235 3236 switch (nvme_ns->ana_state) { 3237 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3238 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3239 nvme_ns->ana_transition_timedout = false; 3240 spdk_poller_unregister(&nvme_ns->anatt_timer); 3241 break; 3242 3243 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3244 case SPDK_NVME_ANA_CHANGE_STATE: 3245 if (nvme_ns->anatt_timer != NULL) { 3246 break; 3247 } 3248 3249 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3250 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3251 nvme_ns, 3252 cdata->anatt * SPDK_SEC_TO_USEC); 3253 break; 3254 default: 3255 break; 3256 } 3257 } 3258 3259 static int 3260 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3261 { 3262 struct nvme_ns *nvme_ns = cb_arg; 3263 uint32_t i; 3264 3265 for (i = 0; i < desc->num_of_nsid; i++) { 3266 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3267 continue; 3268 } 3269 3270 _nvme_ns_set_ana_state(nvme_ns, desc); 3271 return 1; 3272 } 3273 3274 return 0; 3275 } 3276 3277 static struct spdk_uuid 3278 nvme_generate_uuid(const char *sn, uint32_t nsid) 3279 { 3280 struct spdk_uuid new_uuid, namespace_uuid; 3281 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3282 /* This namespace UUID was generated using uuid_generate() method. 
*/ 3283 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3284 int size; 3285 3286 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3287 3288 memset(&new_uuid, 0, sizeof(new_uuid)); 3289 memset(&namespace_uuid, 0, sizeof(namespace_uuid)); 3290 3291 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3292 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3293 3294 spdk_uuid_parse(&namespace_uuid, namespace_str); 3295 3296 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3297 3298 return new_uuid; 3299 } 3300 3301 static int 3302 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3303 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3304 uint32_t prchk_flags, void *ctx) 3305 { 3306 const struct spdk_uuid *uuid; 3307 const uint8_t *nguid; 3308 const struct spdk_nvme_ctrlr_data *cdata; 3309 const struct spdk_nvme_ns_data *nsdata; 3310 const struct spdk_nvme_ctrlr_opts *opts; 3311 enum spdk_nvme_csi csi; 3312 uint32_t atomic_bs, phys_bs, bs; 3313 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3314 3315 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3316 csi = spdk_nvme_ns_get_csi(ns); 3317 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3318 3319 switch (csi) { 3320 case SPDK_NVME_CSI_NVM: 3321 disk->product_name = "NVMe disk"; 3322 break; 3323 case SPDK_NVME_CSI_ZNS: 3324 disk->product_name = "NVMe ZNS disk"; 3325 disk->zoned = true; 3326 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3327 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3328 spdk_nvme_ns_get_extended_sector_size(ns); 3329 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 3330 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 3331 break; 3332 default: 3333 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 3334 return -ENOTSUP; 3335 } 3336 3337 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 3338 if (!disk->name) { 3339 return -ENOMEM; 3340 } 3341 3342 disk->write_cache = 0; 3343 if (cdata->vwc.present) { 3344 /* Enable if the Volatile Write Cache exists */ 3345 disk->write_cache = 1; 3346 } 3347 if (cdata->oncs.write_zeroes) { 3348 disk->max_write_zeroes = UINT16_MAX + 1; 3349 } 3350 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 3351 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 3352 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 3353 /* NVMe driver will split one request into multiple requests 3354 * based on MDTS and stripe boundary, the bdev layer will use 3355 * max_segment_size and max_num_segments to split one big IO 3356 * into multiple requests, then small request can't run out 3357 * of NVMe internal requests data structure. 
3358 */ 3359 if (opts && opts->io_queue_requests) { 3360 disk->max_num_segments = opts->io_queue_requests / 2; 3361 } 3362 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 3363 3364 nguid = spdk_nvme_ns_get_nguid(ns); 3365 if (!nguid) { 3366 uuid = spdk_nvme_ns_get_uuid(ns); 3367 if (uuid) { 3368 disk->uuid = *uuid; 3369 } else if (g_opts.generate_uuids) { 3370 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 3371 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 3372 } 3373 } else { 3374 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 3375 } 3376 3377 nsdata = spdk_nvme_ns_get_data(ns); 3378 bs = spdk_nvme_ns_get_sector_size(ns); 3379 atomic_bs = bs; 3380 phys_bs = bs; 3381 if (nsdata->nabo == 0) { 3382 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 3383 atomic_bs = bs * (1 + nsdata->nawupf); 3384 } else { 3385 atomic_bs = bs * (1 + cdata->awupf); 3386 } 3387 } 3388 if (nsdata->nsfeat.optperf) { 3389 phys_bs = bs * (1 + nsdata->npwg); 3390 } 3391 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 3392 3393 disk->md_len = spdk_nvme_ns_get_md_size(ns); 3394 if (disk->md_len != 0) { 3395 disk->md_interleave = nsdata->flbas.extended; 3396 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 3397 if (disk->dif_type != SPDK_DIF_DISABLE) { 3398 disk->dif_is_head_of_md = nsdata->dps.md_start; 3399 disk->dif_check_flags = prchk_flags; 3400 } 3401 } 3402 3403 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 3404 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 3405 disk->acwu = 0; 3406 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 3407 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 3408 } else { 3409 disk->acwu = cdata->acwu + 1; /* 0-based */ 3410 } 3411 3412 if (cdata->oncs.copy) { 3413 /* For now bdev interface allows only single segment copy */ 3414 disk->max_copy = nsdata->mssrl; 3415 } 3416 3417 disk->ctxt = ctx; 3418 disk->fn_table = &nvmelib_fn_table; 3419 disk->module = &nvme_if; 3420 3421 return 0; 3422 } 3423 3424 static int 3425 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3426 { 3427 struct nvme_bdev *bdev; 3428 int rc; 3429 3430 bdev = calloc(1, sizeof(*bdev)); 3431 if (!bdev) { 3432 SPDK_ERRLOG("bdev calloc() failed\n"); 3433 return -ENOMEM; 3434 } 3435 3436 if (g_opts.nvme_error_stat) { 3437 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 3438 if (!bdev->err_stat) { 3439 SPDK_ERRLOG("err_stat calloc() failed\n"); 3440 free(bdev); 3441 return -ENOMEM; 3442 } 3443 } 3444 3445 rc = pthread_mutex_init(&bdev->mutex, NULL); 3446 if (rc != 0) { 3447 free(bdev->err_stat); 3448 free(bdev); 3449 return rc; 3450 } 3451 3452 bdev->ref = 1; 3453 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 3454 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 3455 bdev->rr_min_io = UINT32_MAX; 3456 TAILQ_INIT(&bdev->nvme_ns_list); 3457 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3458 bdev->opal = nvme_ctrlr->opal_dev != NULL; 3459 3460 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 3461 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 3462 if (rc != 0) { 3463 SPDK_ERRLOG("Failed to create NVMe disk\n"); 3464 pthread_mutex_destroy(&bdev->mutex); 3465 free(bdev->err_stat); 3466 free(bdev); 3467 return rc; 3468 } 3469 3470 spdk_io_device_register(bdev, 3471 bdev_nvme_create_bdev_channel_cb, 3472 bdev_nvme_destroy_bdev_channel_cb, 3473 sizeof(struct nvme_bdev_channel), 3474 bdev->disk.name); 3475 3476 rc = 
spdk_bdev_register(&bdev->disk); 3477 if (rc != 0) { 3478 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 3479 spdk_io_device_unregister(bdev, NULL); 3480 pthread_mutex_destroy(&bdev->mutex); 3481 free(bdev->disk.name); 3482 free(bdev->err_stat); 3483 free(bdev); 3484 return rc; 3485 } 3486 3487 nvme_ns->bdev = bdev; 3488 bdev->nsid = nvme_ns->id; 3489 3490 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 3491 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 3492 3493 return 0; 3494 } 3495 3496 static bool 3497 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 3498 { 3499 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 3500 const struct spdk_uuid *uuid1, *uuid2; 3501 3502 nsdata1 = spdk_nvme_ns_get_data(ns1); 3503 nsdata2 = spdk_nvme_ns_get_data(ns2); 3504 uuid1 = spdk_nvme_ns_get_uuid(ns1); 3505 uuid2 = spdk_nvme_ns_get_uuid(ns2); 3506 3507 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 3508 nsdata1->eui64 == nsdata2->eui64 && 3509 ((uuid1 == NULL && uuid2 == NULL) || 3510 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 3511 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 3512 } 3513 3514 static bool 3515 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3516 struct spdk_nvme_ctrlr_opts *opts) 3517 { 3518 struct nvme_probe_skip_entry *entry; 3519 3520 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 3521 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3522 return false; 3523 } 3524 } 3525 3526 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 3527 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 3528 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 3529 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 3530 opts->disable_read_ana_log_page = true; 3531 3532 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 3533 3534 return true; 3535 } 3536 3537 static void 3538 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 3539 { 3540 struct nvme_ctrlr *nvme_ctrlr = ctx; 3541 3542 if (spdk_nvme_cpl_is_error(cpl)) { 3543 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 3544 cpl->status.sct); 3545 bdev_nvme_reset(nvme_ctrlr); 3546 } else if (cpl->cdw0 & 0x1) { 3547 SPDK_WARNLOG("Specified command could not be aborted.\n"); 3548 bdev_nvme_reset(nvme_ctrlr); 3549 } 3550 } 3551 3552 static void 3553 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 3554 struct spdk_nvme_qpair *qpair, uint16_t cid) 3555 { 3556 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3557 union spdk_nvme_csts_register csts; 3558 int rc; 3559 3560 assert(nvme_ctrlr->ctrlr == ctrlr); 3561 3562 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 3563 3564 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 3565 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 3566 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 3567 * completion recursively. 
3568 */ 3569 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 3570 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 3571 if (csts.bits.cfs) { 3572 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 3573 bdev_nvme_reset(nvme_ctrlr); 3574 return; 3575 } 3576 } 3577 3578 switch (g_opts.action_on_timeout) { 3579 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 3580 if (qpair) { 3581 /* Don't send abort to ctrlr when ctrlr is not available. */ 3582 pthread_mutex_lock(&nvme_ctrlr->mutex); 3583 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 3584 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3585 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 3586 return; 3587 } 3588 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3589 3590 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 3591 nvme_abort_cpl, nvme_ctrlr); 3592 if (rc == 0) { 3593 return; 3594 } 3595 3596 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 3597 } 3598 3599 /* FALLTHROUGH */ 3600 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 3601 bdev_nvme_reset(nvme_ctrlr); 3602 break; 3603 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 3604 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 3605 break; 3606 default: 3607 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 3608 break; 3609 } 3610 } 3611 3612 static struct nvme_ns * 3613 nvme_ns_alloc(void) 3614 { 3615 struct nvme_ns *nvme_ns; 3616 3617 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3618 if (nvme_ns == NULL) { 3619 return NULL; 3620 } 3621 3622 if (g_opts.io_path_stat) { 3623 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 3624 if (nvme_ns->stat == NULL) { 3625 free(nvme_ns); 3626 return NULL; 3627 } 3628 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 3629 } 3630 3631 return nvme_ns; 3632 } 3633 3634 static void 3635 nvme_ns_free(struct nvme_ns *nvme_ns) 3636 { 3637 free(nvme_ns->stat); 3638 free(nvme_ns); 3639 } 3640 3641 static void 3642 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 3643 { 3644 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3645 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 3646 3647 if (rc == 0) { 3648 nvme_ns->probe_ctx = NULL; 3649 pthread_mutex_lock(&nvme_ctrlr->mutex); 3650 nvme_ctrlr->ref++; 3651 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3652 } else { 3653 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3654 nvme_ns_free(nvme_ns); 3655 } 3656 3657 if (ctx) { 3658 ctx->populates_in_progress--; 3659 if (ctx->populates_in_progress == 0) { 3660 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3661 } 3662 } 3663 } 3664 3665 static void 3666 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 3667 { 3668 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3669 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3670 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3671 int rc; 3672 3673 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 3674 if (rc != 0) { 3675 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 3676 } 3677 3678 spdk_for_each_channel_continue(i, rc); 3679 } 3680 3681 static void 3682 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 3683 { 3684 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3685 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3686 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3687 struct nvme_io_path *io_path; 3688 3689 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 
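	/* Note (descriptive comment, not in the original): the nvme_ns may not have an io_path
	 * on this channel, e.g. if adding it failed earlier, so delete it only when the lookup
	 * succeeds.
	 */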
3690 if (io_path != NULL) { 3691 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 3692 } 3693 3694 spdk_for_each_channel_continue(i, 0); 3695 } 3696 3697 static void 3698 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 3699 { 3700 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3701 3702 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 3703 } 3704 3705 static void 3706 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 3707 { 3708 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3709 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 3710 3711 if (status == 0) { 3712 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 3713 } else { 3714 /* Delete the added io_paths and fail populating the namespace. */ 3715 spdk_for_each_channel(bdev, 3716 bdev_nvme_delete_io_path, 3717 nvme_ns, 3718 bdev_nvme_add_io_path_failed); 3719 } 3720 } 3721 3722 static int 3723 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 3724 { 3725 struct nvme_ns *tmp_ns; 3726 const struct spdk_nvme_ns_data *nsdata; 3727 3728 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 3729 if (!nsdata->nmic.can_share) { 3730 SPDK_ERRLOG("Namespace cannot be shared.\n"); 3731 return -EINVAL; 3732 } 3733 3734 pthread_mutex_lock(&bdev->mutex); 3735 3736 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 3737 assert(tmp_ns != NULL); 3738 3739 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 3740 pthread_mutex_unlock(&bdev->mutex); 3741 SPDK_ERRLOG("Namespaces are not identical.\n"); 3742 return -EINVAL; 3743 } 3744 3745 bdev->ref++; 3746 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3747 nvme_ns->bdev = bdev; 3748 3749 pthread_mutex_unlock(&bdev->mutex); 3750 3751 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
*/ 3752 spdk_for_each_channel(bdev, 3753 bdev_nvme_add_io_path, 3754 nvme_ns, 3755 bdev_nvme_add_io_path_done); 3756 3757 return 0; 3758 } 3759 3760 static void 3761 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3762 { 3763 struct spdk_nvme_ns *ns; 3764 struct nvme_bdev *bdev; 3765 int rc = 0; 3766 3767 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 3768 if (!ns) { 3769 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 3770 rc = -EINVAL; 3771 goto done; 3772 } 3773 3774 nvme_ns->ns = ns; 3775 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3776 3777 if (nvme_ctrlr->ana_log_page != NULL) { 3778 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 3779 } 3780 3781 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 3782 if (bdev == NULL) { 3783 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 3784 } else { 3785 rc = nvme_bdev_add_ns(bdev, nvme_ns); 3786 if (rc == 0) { 3787 return; 3788 } 3789 } 3790 done: 3791 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 3792 } 3793 3794 static void 3795 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 3796 { 3797 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3798 3799 assert(nvme_ctrlr != NULL); 3800 3801 pthread_mutex_lock(&nvme_ctrlr->mutex); 3802 3803 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3804 3805 if (nvme_ns->bdev != NULL) { 3806 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3807 return; 3808 } 3809 3810 nvme_ns_free(nvme_ns); 3811 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3812 3813 nvme_ctrlr_release(nvme_ctrlr); 3814 } 3815 3816 static void 3817 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 3818 { 3819 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3820 3821 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3822 } 3823 3824 static void 3825 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3826 { 3827 struct nvme_bdev *bdev; 3828 3829 spdk_poller_unregister(&nvme_ns->anatt_timer); 3830 3831 bdev = nvme_ns->bdev; 3832 if (bdev != NULL) { 3833 pthread_mutex_lock(&bdev->mutex); 3834 3835 assert(bdev->ref > 0); 3836 bdev->ref--; 3837 if (bdev->ref == 0) { 3838 pthread_mutex_unlock(&bdev->mutex); 3839 3840 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 3841 } else { 3842 /* spdk_bdev_unregister() is not called until the last nvme_ns is 3843 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 3844 * and clear nvme_ns->bdev here. 3845 */ 3846 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 3847 nvme_ns->bdev = NULL; 3848 3849 pthread_mutex_unlock(&bdev->mutex); 3850 3851 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 3852 * we call depopulate_namespace_done() to avoid use-after-free. 3853 */ 3854 spdk_for_each_channel(bdev, 3855 bdev_nvme_delete_io_path, 3856 nvme_ns, 3857 bdev_nvme_delete_io_path_done); 3858 return; 3859 } 3860 } 3861 3862 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3863 } 3864 3865 static void 3866 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3867 struct nvme_async_probe_ctx *ctx) 3868 { 3869 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3870 struct nvme_ns *nvme_ns, *next; 3871 struct spdk_nvme_ns *ns; 3872 struct nvme_bdev *bdev; 3873 uint32_t nsid; 3874 int rc; 3875 uint64_t num_sectors; 3876 3877 if (ctx) { 3878 /* Initialize this count to 1 to handle the populate functions 3879 * calling nvme_ctrlr_populate_namespace_done() immediately. 
3880 */ 3881 ctx->populates_in_progress = 1; 3882 } 3883 3884 /* First loop over our existing namespaces and see if they have been 3885 * removed. */ 3886 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3887 while (nvme_ns != NULL) { 3888 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3889 3890 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 3891 /* NS is still there but attributes may have changed */ 3892 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 3893 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 3894 bdev = nvme_ns->bdev; 3895 assert(bdev != NULL); 3896 if (bdev->disk.blockcnt != num_sectors) { 3897 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 3898 nvme_ns->id, 3899 bdev->disk.name, 3900 bdev->disk.blockcnt, 3901 num_sectors); 3902 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 3903 if (rc != 0) { 3904 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 3905 bdev->disk.name, rc); 3906 } 3907 } 3908 } else { 3909 /* Namespace was removed */ 3910 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3911 } 3912 3913 nvme_ns = next; 3914 } 3915 3916 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3917 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3918 while (nsid != 0) { 3919 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3920 3921 if (nvme_ns == NULL) { 3922 /* Found a new one */ 3923 nvme_ns = nvme_ns_alloc(); 3924 if (nvme_ns == NULL) { 3925 SPDK_ERRLOG("Failed to allocate namespace\n"); 3926 /* This just fails to attach the namespace. It may work on a future attempt. Advance to the next active namespace so that a persistent allocation failure cannot leave this loop spinning on the same NSID. */ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3927 continue; 3928 } 3929 3930 nvme_ns->id = nsid; 3931 nvme_ns->ctrlr = nvme_ctrlr; 3932 3933 nvme_ns->bdev = NULL; 3934 3935 if (ctx) { 3936 ctx->populates_in_progress++; 3937 } 3938 nvme_ns->probe_ctx = ctx; 3939 3940 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3941 3942 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3943 } 3944 3945 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3946 } 3947 3948 if (ctx) { 3949 /* Decrement this count now that the loop is over to account 3950 * for the one we started with. If the count is then 0, we 3951 * know any populate_namespace functions completed immediately, 3952 * so we'll kick the callback here.
3953 */ 3954 ctx->populates_in_progress--; 3955 if (ctx->populates_in_progress == 0) { 3956 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3957 } 3958 } 3959 3960 } 3961 3962 static void 3963 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 3964 { 3965 struct nvme_ns *nvme_ns, *tmp; 3966 3967 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 3968 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3969 } 3970 } 3971 3972 static uint32_t 3973 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 3974 { 3975 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3976 const struct spdk_nvme_ctrlr_data *cdata; 3977 uint32_t nsid, ns_count = 0; 3978 3979 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3980 3981 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3982 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 3983 ns_count++; 3984 } 3985 3986 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3987 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 3988 sizeof(uint32_t); 3989 } 3990 3991 static int 3992 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 3993 void *cb_arg) 3994 { 3995 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3996 struct nvme_ns *nvme_ns; 3997 uint32_t i, nsid; 3998 3999 for (i = 0; i < desc->num_of_nsid; i++) { 4000 nsid = desc->nsid[i]; 4001 if (nsid == 0) { 4002 continue; 4003 } 4004 4005 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4006 4007 assert(nvme_ns != NULL); 4008 if (nvme_ns == NULL) { 4009 /* Target told us that an inactive namespace had an ANA change */ 4010 continue; 4011 } 4012 4013 _nvme_ns_set_ana_state(nvme_ns, desc); 4014 } 4015 4016 return 0; 4017 } 4018 4019 static void 4020 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4021 { 4022 struct nvme_ns *nvme_ns; 4023 4024 spdk_free(nvme_ctrlr->ana_log_page); 4025 nvme_ctrlr->ana_log_page = NULL; 4026 4027 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4028 nvme_ns != NULL; 4029 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4030 nvme_ns->ana_state_updating = false; 4031 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4032 } 4033 } 4034 4035 static void 4036 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4037 { 4038 struct nvme_ctrlr *nvme_ctrlr = ctx; 4039 4040 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4041 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4042 nvme_ctrlr); 4043 } else { 4044 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4045 } 4046 4047 pthread_mutex_lock(&nvme_ctrlr->mutex); 4048 4049 assert(nvme_ctrlr->ana_log_page_updating == true); 4050 nvme_ctrlr->ana_log_page_updating = false; 4051 4052 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4053 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4054 4055 nvme_ctrlr_unregister(nvme_ctrlr); 4056 } else { 4057 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4058 4059 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4060 } 4061 } 4062 4063 static int 4064 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4065 { 4066 uint32_t ana_log_page_size; 4067 int rc; 4068 4069 if (nvme_ctrlr->ana_log_page == NULL) { 4070 return -EINVAL; 4071 } 4072 4073 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4074 4075 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4076 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4077 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4078 
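/* Note: the ANA log page buffer was sized in nvme_ctrlr_init_ana_log_page() for the
 * maximum number of namespaces the controller allows (cdata->mnan), so the count of
 * currently active namespaces should never push the computed size past
 * max_ana_log_page_size on a well-behaved controller. If it does, refuse the read
 * rather than overflow the preallocated buffer.
 */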
return -EINVAL; 4079 } 4080 4081 pthread_mutex_lock(&nvme_ctrlr->mutex); 4082 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4083 nvme_ctrlr->ana_log_page_updating) { 4084 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4085 return -EBUSY; 4086 } 4087 4088 nvme_ctrlr->ana_log_page_updating = true; 4089 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4090 4091 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4092 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4093 SPDK_NVME_GLOBAL_NS_TAG, 4094 nvme_ctrlr->ana_log_page, 4095 ana_log_page_size, 0, 4096 nvme_ctrlr_read_ana_log_page_done, 4097 nvme_ctrlr); 4098 if (rc != 0) { 4099 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4100 } 4101 4102 return rc; 4103 } 4104 4105 static void 4106 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4107 { 4108 } 4109 4110 struct bdev_nvme_set_preferred_path_ctx { 4111 struct spdk_bdev_desc *desc; 4112 struct nvme_ns *nvme_ns; 4113 bdev_nvme_set_preferred_path_cb cb_fn; 4114 void *cb_arg; 4115 }; 4116 4117 static void 4118 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4119 { 4120 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4121 4122 assert(ctx != NULL); 4123 assert(ctx->desc != NULL); 4124 assert(ctx->cb_fn != NULL); 4125 4126 spdk_bdev_close(ctx->desc); 4127 4128 ctx->cb_fn(ctx->cb_arg, status); 4129 4130 free(ctx); 4131 } 4132 4133 static void 4134 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4135 { 4136 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4137 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4138 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4139 struct nvme_io_path *io_path, *prev; 4140 4141 prev = NULL; 4142 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4143 if (io_path->nvme_ns == ctx->nvme_ns) { 4144 break; 4145 } 4146 prev = io_path; 4147 } 4148 4149 if (io_path != NULL) { 4150 if (prev != NULL) { 4151 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4152 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4153 } 4154 4155 /* We can set io_path to nbdev_ch->current_io_path directly here. 4156 * However, it needs to be conditional. To simplify the code, 4157 * just clear nbdev_ch->current_io_path and let find_io_path() 4158 * fill it. 4159 * 4160 * Automatic failback may be disabled. Hence even if the io_path is 4161 * already at the head, clear nbdev_ch->current_io_path. 4162 */ 4163 bdev_nvme_clear_current_io_path(nbdev_ch); 4164 } 4165 4166 spdk_for_each_channel_continue(i, 0); 4167 } 4168 4169 static struct nvme_ns * 4170 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4171 { 4172 struct nvme_ns *nvme_ns, *prev; 4173 const struct spdk_nvme_ctrlr_data *cdata; 4174 4175 prev = NULL; 4176 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4177 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4178 4179 if (cdata->cntlid == cntlid) { 4180 break; 4181 } 4182 prev = nvme_ns; 4183 } 4184 4185 if (nvme_ns != NULL && prev != NULL) { 4186 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4187 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4188 } 4189 4190 return nvme_ns; 4191 } 4192 4193 /* This function supports only multipath mode. There is only a single I/O path 4194 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4195 * head of the I/O path list for each NVMe bdev channel. 
4196 * 4197 * NVMe bdev channel may be acquired after completing this function. move the 4198 * matched namespace to the head of the namespace list for the NVMe bdev too. 4199 */ 4200 void 4201 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4202 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4203 { 4204 struct bdev_nvme_set_preferred_path_ctx *ctx; 4205 struct spdk_bdev *bdev; 4206 struct nvme_bdev *nbdev; 4207 int rc = 0; 4208 4209 assert(cb_fn != NULL); 4210 4211 ctx = calloc(1, sizeof(*ctx)); 4212 if (ctx == NULL) { 4213 SPDK_ERRLOG("Failed to alloc context.\n"); 4214 rc = -ENOMEM; 4215 goto err_alloc; 4216 } 4217 4218 ctx->cb_fn = cb_fn; 4219 ctx->cb_arg = cb_arg; 4220 4221 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4222 if (rc != 0) { 4223 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4224 goto err_open; 4225 } 4226 4227 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4228 4229 if (bdev->module != &nvme_if) { 4230 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4231 rc = -ENODEV; 4232 goto err_bdev; 4233 } 4234 4235 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4236 4237 pthread_mutex_lock(&nbdev->mutex); 4238 4239 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4240 if (ctx->nvme_ns == NULL) { 4241 pthread_mutex_unlock(&nbdev->mutex); 4242 4243 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4244 rc = -ENODEV; 4245 goto err_bdev; 4246 } 4247 4248 pthread_mutex_unlock(&nbdev->mutex); 4249 4250 spdk_for_each_channel(nbdev, 4251 _bdev_nvme_set_preferred_path, 4252 ctx, 4253 bdev_nvme_set_preferred_path_done); 4254 return; 4255 4256 err_bdev: 4257 spdk_bdev_close(ctx->desc); 4258 err_open: 4259 free(ctx); 4260 err_alloc: 4261 cb_fn(cb_arg, rc); 4262 } 4263 4264 struct bdev_nvme_set_multipath_policy_ctx { 4265 struct spdk_bdev_desc *desc; 4266 bdev_nvme_set_multipath_policy_cb cb_fn; 4267 void *cb_arg; 4268 }; 4269 4270 static void 4271 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4272 { 4273 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4274 4275 assert(ctx != NULL); 4276 assert(ctx->desc != NULL); 4277 assert(ctx->cb_fn != NULL); 4278 4279 spdk_bdev_close(ctx->desc); 4280 4281 ctx->cb_fn(ctx->cb_arg, status); 4282 4283 free(ctx); 4284 } 4285 4286 static void 4287 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4288 { 4289 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4290 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4291 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4292 4293 nbdev_ch->mp_policy = nbdev->mp_policy; 4294 nbdev_ch->mp_selector = nbdev->mp_selector; 4295 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4296 bdev_nvme_clear_current_io_path(nbdev_ch); 4297 4298 spdk_for_each_channel_continue(i, 0); 4299 } 4300 4301 void 4302 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4303 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 4304 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4305 { 4306 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4307 struct spdk_bdev *bdev; 4308 struct nvme_bdev *nbdev; 4309 int rc; 4310 4311 assert(cb_fn != NULL); 4312 4313 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4314 if (rr_min_io == UINT32_MAX) { 4315 rr_min_io = 1; 4316 } else if (rr_min_io == 0) { 4317 rc = -EINVAL; 
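/* rr_min_io is only meaningful when the active_active policy is combined with the
 * round_robin selector: UINT32_MAX selects the default of 1, zero is rejected, and
 * for any other policy/selector combination the caller is expected to leave it at
 * UINT32_MAX.
 */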
4318 goto exit; 4319 } 4320 } else if (rr_min_io != UINT32_MAX) { 4321 rc = -EINVAL; 4322 goto exit; 4323 } 4324 4325 ctx = calloc(1, sizeof(*ctx)); 4326 if (ctx == NULL) { 4327 SPDK_ERRLOG("Failed to alloc context.\n"); 4328 rc = -ENOMEM; 4329 goto exit; 4330 } 4331 4332 ctx->cb_fn = cb_fn; 4333 ctx->cb_arg = cb_arg; 4334 4335 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4336 if (rc != 0) { 4337 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4338 rc = -ENODEV; 4339 goto err_open; 4340 } 4341 4342 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4343 if (bdev->module != &nvme_if) { 4344 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4345 rc = -ENODEV; 4346 goto err_module; 4347 } 4348 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4349 4350 pthread_mutex_lock(&nbdev->mutex); 4351 nbdev->mp_policy = policy; 4352 nbdev->mp_selector = selector; 4353 nbdev->rr_min_io = rr_min_io; 4354 pthread_mutex_unlock(&nbdev->mutex); 4355 4356 spdk_for_each_channel(nbdev, 4357 _bdev_nvme_set_multipath_policy, 4358 ctx, 4359 bdev_nvme_set_multipath_policy_done); 4360 return; 4361 4362 err_module: 4363 spdk_bdev_close(ctx->desc); 4364 err_open: 4365 free(ctx); 4366 exit: 4367 cb_fn(cb_arg, rc); 4368 } 4369 4370 static void 4371 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 4372 { 4373 struct nvme_ctrlr *nvme_ctrlr = arg; 4374 union spdk_nvme_async_event_completion event; 4375 4376 if (spdk_nvme_cpl_is_error(cpl)) { 4377 SPDK_WARNLOG("AER request execute failed\n"); 4378 return; 4379 } 4380 4381 event.raw = cpl->cdw0; 4382 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4383 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 4384 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 4385 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4386 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 4387 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 4388 } 4389 } 4390 4391 static void 4392 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 4393 { 4394 if (ctx->cb_fn) { 4395 ctx->cb_fn(ctx->cb_ctx, count, rc); 4396 } 4397 4398 ctx->namespaces_populated = true; 4399 if (ctx->probe_done) { 4400 /* The probe was already completed, so we need to free the context 4401 * here. This can happen for cases like OCSSD, where we need to 4402 * send additional commands to the SSD after attach. 
4403 */ 4404 free(ctx); 4405 } 4406 } 4407 4408 static void 4409 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 4410 struct nvme_async_probe_ctx *ctx) 4411 { 4412 spdk_io_device_register(nvme_ctrlr, 4413 bdev_nvme_create_ctrlr_channel_cb, 4414 bdev_nvme_destroy_ctrlr_channel_cb, 4415 sizeof(struct nvme_ctrlr_channel), 4416 nvme_ctrlr->nbdev_ctrlr->name); 4417 4418 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 4419 } 4420 4421 static void 4422 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 4423 { 4424 struct nvme_ctrlr *nvme_ctrlr = _ctx; 4425 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 4426 4427 nvme_ctrlr->probe_ctx = NULL; 4428 4429 if (spdk_nvme_cpl_is_error(cpl)) { 4430 nvme_ctrlr_delete(nvme_ctrlr); 4431 4432 if (ctx != NULL) { 4433 populate_namespaces_cb(ctx, 0, -1); 4434 } 4435 return; 4436 } 4437 4438 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4439 } 4440 4441 static int 4442 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4443 struct nvme_async_probe_ctx *ctx) 4444 { 4445 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4446 const struct spdk_nvme_ctrlr_data *cdata; 4447 uint32_t ana_log_page_size; 4448 4449 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4450 4451 /* Set buffer size enough to include maximum number of allowed namespaces. */ 4452 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4453 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 4454 sizeof(uint32_t); 4455 4456 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 4457 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4458 if (nvme_ctrlr->ana_log_page == NULL) { 4459 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 4460 return -ENXIO; 4461 } 4462 4463 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 4464 * Hence copy each descriptor to a temporary area when parsing it. 4465 * 4466 * Allocate a buffer whose size is as large as ANA log page buffer because 4467 * we do not know the size of a descriptor until actually reading it. 4468 */ 4469 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 4470 if (nvme_ctrlr->copied_ana_desc == NULL) { 4471 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 4472 return -ENOMEM; 4473 } 4474 4475 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 4476 4477 nvme_ctrlr->probe_ctx = ctx; 4478 4479 /* Then, set the read size only to include the current active namespaces. */ 4480 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4481 4482 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4483 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4484 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4485 return -EINVAL; 4486 } 4487 4488 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 4489 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4490 SPDK_NVME_GLOBAL_NS_TAG, 4491 nvme_ctrlr->ana_log_page, 4492 ana_log_page_size, 0, 4493 nvme_ctrlr_init_ana_log_page_done, 4494 nvme_ctrlr); 4495 } 4496 4497 /* hostnqn and subnqn were already verified before attaching a controller. 4498 * Hence check only the multipath capability and cntlid here. 
*/ 4500 static bool 4501 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 4502 { 4503 struct nvme_ctrlr *tmp; 4504 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 4505 4506 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4507 4508 if (!cdata->cmic.multi_ctrlr) { 4509 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 4510 return false; 4511 } 4512 4513 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 4514 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 4515 4516 if (!tmp_cdata->cmic.multi_ctrlr) { 4517 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid); 4518 return false; 4519 } 4520 if (cdata->cntlid == tmp_cdata->cntlid) { 4521 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 4522 return false; 4523 } 4524 } 4525 4526 return true; 4527 } 4528 4529 static int 4530 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 4531 { 4532 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4533 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4534 int rc = 0; 4535 4536 pthread_mutex_lock(&g_bdev_nvme_mutex); 4537 4538 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4539 if (nbdev_ctrlr != NULL) { 4540 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 4541 rc = -EINVAL; 4542 goto exit; 4543 } 4544 } else { 4545 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 4546 if (nbdev_ctrlr == NULL) { 4547 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 4548 rc = -ENOMEM; 4549 goto exit; 4550 } 4551 nbdev_ctrlr->name = strdup(name); 4552 if (nbdev_ctrlr->name == NULL) { 4553 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 4554 free(nbdev_ctrlr); rc = -ENOMEM; 4555 goto exit; 4556 } 4557 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 4558 TAILQ_INIT(&nbdev_ctrlr->bdevs); 4559 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 4560 } 4561 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 4562 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 4563 exit: 4564 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4565 return rc; 4566 } 4567 4568 static int 4569 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 4570 const char *name, 4571 const struct spdk_nvme_transport_id *trid, 4572 struct nvme_async_probe_ctx *ctx) 4573 { 4574 struct nvme_ctrlr *nvme_ctrlr; 4575 struct nvme_path_id *path_id; 4576 const struct spdk_nvme_ctrlr_data *cdata; 4577 int rc; 4578 4579 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 4580 if (nvme_ctrlr == NULL) { 4581 SPDK_ERRLOG("Failed to allocate device struct\n"); 4582 return -ENOMEM; 4583 } 4584 4585 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 4586 if (rc != 0) { 4587 free(nvme_ctrlr); 4588 return rc; 4589 } 4590 4591 TAILQ_INIT(&nvme_ctrlr->trids); 4592 4593 RB_INIT(&nvme_ctrlr->namespaces); 4594 4595 path_id = calloc(1, sizeof(*path_id)); 4596 if (path_id == NULL) { 4597 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 4598 rc = -ENOMEM; 4599 goto err; 4600 } 4601 4602 path_id->trid = *trid; 4603 if (ctx != NULL) { 4604 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 4605 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 4606 } 4607 nvme_ctrlr->active_path_id = path_id; 4608 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 4609 4610 nvme_ctrlr->thread = spdk_get_thread(); 4611 nvme_ctrlr->ctrlr = ctrlr; 4612 nvme_ctrlr->ref = 1; 4613 4614 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 4615 SPDK_ERRLOG("OCSSDs are not supported\n"); 4616 rc = -ENOTSUP; 4617 goto err; 4618 } 4619 4620
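/* ctx is the async probe context set up by bdev_nvme_create(). Hot-inserted
 * controllers are attached from attach_cb() with ctx == NULL, so they fall back to
 * the module-level defaults instead of per-attach bdev options.
 */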
if (ctx != NULL) { 4621 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 4622 } else { 4623 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 4624 } 4625 4626 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 4627 g_opts.nvme_adminq_poll_period_us); 4628 4629 if (g_opts.timeout_us > 0) { 4630 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 4631 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 4632 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 4633 g_opts.timeout_us : g_opts.timeout_admin_us; 4634 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 4635 adm_timeout_us, timeout_cb, nvme_ctrlr); 4636 } 4637 4638 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 4639 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 4640 4641 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 4642 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 4643 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 4644 } 4645 4646 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 4647 if (rc != 0) { 4648 goto err; 4649 } 4650 4651 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4652 4653 if (cdata->cmic.ana_reporting) { 4654 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 4655 if (rc == 0) { 4656 return 0; 4657 } 4658 } else { 4659 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4660 return 0; 4661 } 4662 4663 err: 4664 nvme_ctrlr_delete(nvme_ctrlr); 4665 return rc; 4666 } 4667 4668 void 4669 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 4670 { 4671 opts->prchk_flags = 0; 4672 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 4673 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 4674 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 4675 } 4676 4677 static void 4678 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4679 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 4680 { 4681 char *name; 4682 4683 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 4684 if (!name) { 4685 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 4686 return; 4687 } 4688 4689 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 4690 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 4691 } else { 4692 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 4693 } 4694 4695 free(name); 4696 } 4697 4698 static void 4699 _nvme_ctrlr_destruct(void *ctx) 4700 { 4701 struct nvme_ctrlr *nvme_ctrlr = ctx; 4702 4703 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 4704 nvme_ctrlr_release(nvme_ctrlr); 4705 } 4706 4707 static int 4708 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4709 { 4710 struct nvme_probe_skip_entry *entry; 4711 4712 /* The controller's destruction was already started */ 4713 if (nvme_ctrlr->destruct) { 4714 return -EALREADY; 4715 } 4716 4717 if (!hotplug && 4718 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 4719 entry = calloc(1, sizeof(*entry)); 4720 if (!entry) { 4721 return -ENOMEM; 4722 } 4723 entry->trid = nvme_ctrlr->active_path_id->trid; 4724 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 4725 } 4726 4727 nvme_ctrlr->destruct = true; 4728 return 0; 4729 } 4730 4731 static int 4732 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4733 { 4734 int rc; 4735 4736 pthread_mutex_lock(&nvme_ctrlr->mutex); 4737 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 4738 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4739 4740 if (rc == 0) { 4741 _nvme_ctrlr_destruct(nvme_ctrlr); 4742 } else if (rc == -EALREADY) { 4743 rc = 0; 4744 } 4745 4746 return rc; 4747 } 4748 4749 static void 4750 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 4751 { 4752 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 4753 4754 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 4755 } 4756 4757 static int 4758 bdev_nvme_hotplug_probe(void *arg) 4759 { 4760 if (g_hotplug_probe_ctx == NULL) { 4761 spdk_poller_unregister(&g_hotplug_probe_poller); 4762 return SPDK_POLLER_IDLE; 4763 } 4764 4765 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 4766 g_hotplug_probe_ctx = NULL; 4767 spdk_poller_unregister(&g_hotplug_probe_poller); 4768 } 4769 4770 return SPDK_POLLER_BUSY; 4771 } 4772 4773 static int 4774 bdev_nvme_hotplug(void *arg) 4775 { 4776 struct spdk_nvme_transport_id trid_pcie; 4777 4778 if (g_hotplug_probe_ctx) { 4779 return SPDK_POLLER_BUSY; 4780 } 4781 4782 memset(&trid_pcie, 0, sizeof(trid_pcie)); 4783 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 4784 4785 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 4786 hotplug_probe_cb, attach_cb, NULL); 4787 4788 if (g_hotplug_probe_ctx) { 4789 assert(g_hotplug_probe_poller == NULL); 4790 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 4791 } 4792 4793 return SPDK_POLLER_BUSY; 4794 } 4795 4796 void 4797 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 4798 { 4799 *opts = g_opts; 4800 } 4801 4802 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4803 uint32_t reconnect_delay_sec, 4804 uint32_t fast_io_fail_timeout_sec); 4805 4806 static int 4807 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 4808 { 4809 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 4810 /* Can't set timeout_admin_us without also setting timeout_us */ 4811 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 4812 return -EINVAL; 4813 } 4814 4815 if (opts->bdev_retry_count < -1) { 4816 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 4817 return -EINVAL; 4818 } 4819 4820 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 4821 opts->reconnect_delay_sec, 4822 opts->fast_io_fail_timeout_sec)) { 4823 return -EINVAL; 4824 } 4825 4826 return 0; 4827 } 4828 4829 int 4830 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 4831 { 4832 int ret; 4833 4834 ret = bdev_nvme_validate_opts(opts); 4835 if (ret) { 4836 SPDK_WARNLOG("Failed to set nvme opts.\n"); 4837 return ret; 4838 } 4839 4840 if (g_bdev_nvme_init_thread != NULL) { 4841 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4842 return -EPERM; 4843 } 4844 } 4845 4846 if (opts->rdma_srq_size != 0) { 4847 struct spdk_nvme_transport_opts drv_opts; 4848 4849 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 4850 drv_opts.rdma_srq_size = opts->rdma_srq_size; 4851 4852 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 4853 if (ret) { 4854 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 4855 return ret; 4856 } 4857 } 4858 4859 g_opts = *opts; 4860 4861 return 0; 4862 } 4863 4864 struct set_nvme_hotplug_ctx { 4865 uint64_t period_us; 4866 bool enabled; 4867 spdk_msg_fn fn; 4868 void *fn_ctx; 4869 }; 4870 4871 static void 4872 set_nvme_hotplug_period_cb(void *_ctx) 4873 { 4874 struct set_nvme_hotplug_ctx *ctx 
= _ctx; 4875 4876 spdk_poller_unregister(&g_hotplug_poller); 4877 if (ctx->enabled) { 4878 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 4879 } 4880 4881 g_nvme_hotplug_poll_period_us = ctx->period_us; 4882 g_nvme_hotplug_enabled = ctx->enabled; 4883 if (ctx->fn) { 4884 ctx->fn(ctx->fn_ctx); 4885 } 4886 4887 free(ctx); 4888 } 4889 4890 int 4891 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 4892 { 4893 struct set_nvme_hotplug_ctx *ctx; 4894 4895 if (enabled == true && !spdk_process_is_primary()) { 4896 return -EPERM; 4897 } 4898 4899 ctx = calloc(1, sizeof(*ctx)); 4900 if (ctx == NULL) { 4901 return -ENOMEM; 4902 } 4903 4904 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 4905 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 4906 ctx->enabled = enabled; 4907 ctx->fn = cb; 4908 ctx->fn_ctx = cb_ctx; 4909 4910 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 4911 return 0; 4912 } 4913 4914 static void 4915 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 4916 struct nvme_async_probe_ctx *ctx) 4917 { 4918 struct nvme_ns *nvme_ns; 4919 struct nvme_bdev *nvme_bdev; 4920 size_t j; 4921 4922 assert(nvme_ctrlr != NULL); 4923 4924 if (ctx->names == NULL) { 4925 populate_namespaces_cb(ctx, 0, 0); 4926 return; 4927 } 4928 4929 /* 4930 * Report the new bdevs that were created in this call. 4931 * There can be more than one bdev per NVMe controller. 4932 */ 4933 j = 0; 4934 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4935 while (nvme_ns != NULL) { 4936 nvme_bdev = nvme_ns->bdev; 4937 if (j < ctx->count) { 4938 ctx->names[j] = nvme_bdev->disk.name; 4939 j++; 4940 } else { 4941 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 4942 ctx->count); 4943 populate_namespaces_cb(ctx, 0, -ERANGE); 4944 return; 4945 } 4946 4947 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4948 } 4949 4950 populate_namespaces_cb(ctx, j, 0); 4951 } 4952 4953 static int 4954 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4955 struct spdk_nvme_ctrlr *new_ctrlr, 4956 struct spdk_nvme_transport_id *trid) 4957 { 4958 struct nvme_path_id *tmp_trid; 4959 4960 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4961 SPDK_ERRLOG("PCIe failover is not supported.\n"); 4962 return -ENOTSUP; 4963 } 4964 4965 /* Currently we only support failover to the same transport type. */ 4966 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 4967 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 4968 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 4969 spdk_nvme_transport_id_trtype_str(trid->trtype)); 4970 return -EINVAL; 4971 } 4972 4973 4974 /* Currently we only support failover to the same NQN. */ 4975 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 4976 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 4977 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 4978 return -EINVAL; 4979 } 4980 4981 /* Skip all the other checks if we've already registered this path. 
*/ 4982 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4983 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 4984 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 4985 trid->subnqn); 4986 return -EEXIST; 4987 } 4988 } 4989 4990 return 0; 4991 } 4992 4993 static int 4994 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 4995 struct spdk_nvme_ctrlr *new_ctrlr) 4996 { 4997 struct nvme_ns *nvme_ns; 4998 struct spdk_nvme_ns *new_ns; 4999 5000 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5001 while (nvme_ns != NULL) { 5002 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5003 assert(new_ns != NULL); 5004 5005 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5006 return -EINVAL; 5007 } 5008 5009 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5010 } 5011 5012 return 0; 5013 } 5014 5015 static int 5016 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5017 struct spdk_nvme_transport_id *trid) 5018 { 5019 struct nvme_path_id *new_trid, *tmp_trid; 5020 5021 new_trid = calloc(1, sizeof(*new_trid)); 5022 if (new_trid == NULL) { 5023 return -ENOMEM; 5024 } 5025 new_trid->trid = *trid; 5026 new_trid->is_failed = false; 5027 5028 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5029 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 5030 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5031 return 0; 5032 } 5033 } 5034 5035 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5036 return 0; 5037 } 5038 5039 /* This is the case that a secondary path is added to an existing 5040 * nvme_ctrlr for failover. After checking if it can access the same 5041 * namespaces as the primary path, it is disconnected until failover occurs. 5042 */ 5043 static int 5044 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5045 struct spdk_nvme_ctrlr *new_ctrlr, 5046 struct spdk_nvme_transport_id *trid) 5047 { 5048 int rc; 5049 5050 assert(nvme_ctrlr != NULL); 5051 5052 pthread_mutex_lock(&nvme_ctrlr->mutex); 5053 5054 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5055 if (rc != 0) { 5056 goto exit; 5057 } 5058 5059 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5060 if (rc != 0) { 5061 goto exit; 5062 } 5063 5064 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5065 5066 exit: 5067 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5068 5069 spdk_nvme_detach(new_ctrlr); 5070 5071 return rc; 5072 } 5073 5074 static void 5075 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5076 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5077 { 5078 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5079 struct nvme_async_probe_ctx *ctx; 5080 int rc; 5081 5082 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5083 ctx->ctrlr_attached = true; 5084 5085 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5086 if (rc != 0) { 5087 populate_namespaces_cb(ctx, 0, rc); 5088 } 5089 } 5090 5091 static void 5092 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5093 struct spdk_nvme_ctrlr *ctrlr, 5094 const struct spdk_nvme_ctrlr_opts *opts) 5095 { 5096 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5097 struct nvme_ctrlr *nvme_ctrlr; 5098 struct nvme_async_probe_ctx *ctx; 5099 int rc; 5100 5101 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5102 ctx->ctrlr_attached = true; 5103 5104 nvme_ctrlr = 
nvme_ctrlr_get_by_name(ctx->base_name); 5105 if (nvme_ctrlr) { 5106 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5107 } else { 5108 rc = -ENODEV; 5109 } 5110 5111 populate_namespaces_cb(ctx, 0, rc); 5112 } 5113 5114 static int 5115 bdev_nvme_async_poll(void *arg) 5116 { 5117 struct nvme_async_probe_ctx *ctx = arg; 5118 int rc; 5119 5120 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5121 if (spdk_unlikely(rc != -EAGAIN)) { 5122 ctx->probe_done = true; 5123 spdk_poller_unregister(&ctx->poller); 5124 if (!ctx->ctrlr_attached) { 5125 /* The probe is done, but no controller was attached. 5126 * That means we had a failure, so report -EIO back to 5127 * the caller (usually the RPC). populate_namespaces_cb() 5128 * will take care of freeing the nvme_async_probe_ctx. 5129 */ 5130 populate_namespaces_cb(ctx, 0, -EIO); 5131 } else if (ctx->namespaces_populated) { 5132 /* The namespaces for the attached controller were all 5133 * populated and the response was already sent to the 5134 * caller (usually the RPC). So free the context here. 5135 */ 5136 free(ctx); 5137 } 5138 } 5139 5140 return SPDK_POLLER_BUSY; 5141 } 5142 5143 static bool 5144 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5145 uint32_t reconnect_delay_sec, 5146 uint32_t fast_io_fail_timeout_sec) 5147 { 5148 if (ctrlr_loss_timeout_sec < -1) { 5149 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5150 return false; 5151 } else if (ctrlr_loss_timeout_sec == -1) { 5152 if (reconnect_delay_sec == 0) { 5153 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5154 return false; 5155 } else if (fast_io_fail_timeout_sec != 0 && 5156 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5157 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5158 return false; 5159 } 5160 } else if (ctrlr_loss_timeout_sec != 0) { 5161 if (reconnect_delay_sec == 0) { 5162 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5163 return false; 5164 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5165 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5166 return false; 5167 } else if (fast_io_fail_timeout_sec != 0) { 5168 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5169 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5170 return false; 5171 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5172 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5173 return false; 5174 } 5175 } 5176 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5177 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5178 return false; 5179 } 5180 5181 return true; 5182 } 5183 5184 int 5185 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5186 const char *base_name, 5187 const char **names, 5188 uint32_t count, 5189 spdk_bdev_create_nvme_fn cb_fn, 5190 void *cb_ctx, 5191 struct spdk_nvme_ctrlr_opts *drv_opts, 5192 struct nvme_ctrlr_opts *bdev_opts, 5193 bool multipath) 5194 { 5195 struct nvme_probe_skip_entry *entry, *tmp; 5196 struct nvme_async_probe_ctx *ctx; 5197 spdk_nvme_attach_cb attach_cb; 5198 5199 /* TODO expand this check to include both the host and target TRIDs. 5200 * Only if both are the same should we fail.
5201 */ 5202 if (nvme_ctrlr_get(trid) != NULL) { 5203 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5204 return -EEXIST; 5205 } 5206 5207 if (bdev_opts != NULL && 5208 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5209 bdev_opts->reconnect_delay_sec, 5210 bdev_opts->fast_io_fail_timeout_sec)) { 5211 return -EINVAL; 5212 } 5213 5214 ctx = calloc(1, sizeof(*ctx)); 5215 if (!ctx) { 5216 return -ENOMEM; 5217 } 5218 ctx->base_name = base_name; 5219 ctx->names = names; 5220 ctx->count = count; 5221 ctx->cb_fn = cb_fn; 5222 ctx->cb_ctx = cb_ctx; 5223 ctx->trid = *trid; 5224 5225 if (bdev_opts) { 5226 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5227 } else { 5228 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5229 } 5230 5231 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5232 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5233 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5234 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5235 free(entry); 5236 break; 5237 } 5238 } 5239 } 5240 5241 if (drv_opts) { 5242 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5243 } else { 5244 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5245 } 5246 5247 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5248 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5249 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5250 ctx->drv_opts.disable_read_ana_log_page = true; 5251 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5252 5253 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5254 attach_cb = connect_attach_cb; 5255 } else { 5256 attach_cb = connect_set_failover_cb; 5257 } 5258 5259 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5260 if (ctx->probe_ctx == NULL) { 5261 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5262 free(ctx); 5263 return -ENODEV; 5264 } 5265 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5266 5267 return 0; 5268 } 5269 5270 static bool 5271 nvme_path_should_delete(struct nvme_path_id *p, const struct nvme_path_id *path_id) 5272 { 5273 if (path_id->trid.trtype != 0) { 5274 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5275 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5276 return false; 5277 } 5278 } else { 5279 if (path_id->trid.trtype != p->trid.trtype) { 5280 return false; 5281 } 5282 } 5283 } 5284 5285 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 5286 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 5287 return false; 5288 } 5289 } 5290 5291 if (path_id->trid.adrfam != 0) { 5292 if (path_id->trid.adrfam != p->trid.adrfam) { 5293 return false; 5294 } 5295 } 5296 5297 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 5298 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 5299 return false; 5300 } 5301 } 5302 5303 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 5304 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 5305 return false; 5306 } 5307 } 5308 5309 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 5310 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 5311 return false; 5312 } 5313 } 5314 5315 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, 
sizeof(path_id->hostid.hostsvcid))) { 5316 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 5317 return false; 5318 } 5319 } 5320 5321 return true; 5322 } 5323 5324 static int 5325 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 5326 { 5327 struct nvme_path_id *p, *t; 5328 spdk_msg_fn msg_fn; 5329 int rc = -ENXIO; 5330 5331 pthread_mutex_lock(&nvme_ctrlr->mutex); 5332 5333 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 5334 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 5335 break; 5336 } 5337 5338 if (!nvme_path_should_delete(p, path_id)) { 5339 continue; 5340 } 5341 5342 /* We are not using the specified path. */ 5343 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 5344 free(p); 5345 rc = 0; 5346 } 5347 5348 if (p == NULL || !nvme_path_should_delete(p, path_id)) { 5349 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5350 return rc; 5351 } 5352 5353 /* If we made it here, then this path is a match! Now we need to remove it. */ 5354 5355 /* This is the active path in use right now. The active path is always the first in the list. */ 5356 assert(p == nvme_ctrlr->active_path_id); 5357 5358 if (!TAILQ_NEXT(p, link)) { 5359 /* The current path is the only path. */ 5360 msg_fn = _nvme_ctrlr_destruct; 5361 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 5362 } else { 5363 /* There is an alternative path. */ 5364 msg_fn = _bdev_nvme_reset; 5365 rc = bdev_nvme_failover_unsafe(nvme_ctrlr, true); 5366 } 5367 5368 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5369 5370 if (rc == 0) { 5371 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 5372 } else if (rc == -EALREADY) { 5373 rc = 0; 5374 } 5375 5376 return rc; 5377 } 5378 5379 int 5380 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 5381 { 5382 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5383 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 5384 int rc = -ENXIO, _rc; 5385 5386 if (name == NULL || path_id == NULL) { 5387 return -EINVAL; 5388 } 5389 5390 pthread_mutex_lock(&g_bdev_nvme_mutex); 5391 5392 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5393 if (nbdev_ctrlr == NULL) { 5394 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5395 5396 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 5397 return -ENODEV; 5398 } 5399 5400 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 5401 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 5402 if (_rc < 0 && _rc != -ENXIO) { 5403 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5404 5405 return _rc; 5406 } else if (_rc == 0) { 5407 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 5408 * was deleted successfully. To remember the successful deletion, 5409 * overwrite rc only if _rc is zero. 5410 */ 5411 rc = 0; 5412 } 5413 } 5414 5415 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5416 5417 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 5418 return rc; 5419 } 5420 5421 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 5422 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 5423 5424 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 5425 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 5426 5427 struct discovery_entry_ctx { 5428 char name[128]; 5429 struct spdk_nvme_transport_id trid; 5430 struct spdk_nvme_ctrlr_opts drv_opts; 5431 struct spdk_nvmf_discovery_log_page_entry entry; 5432 TAILQ_ENTRY(discovery_entry_ctx) tailq; 5433 struct discovery_ctx *ctx; 5434 }; 5435 5436 struct discovery_ctx { 5437 char *name; 5438 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 5439 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 5440 void *cb_ctx; 5441 struct spdk_nvme_probe_ctx *probe_ctx; 5442 struct spdk_nvme_detach_ctx *detach_ctx; 5443 struct spdk_nvme_ctrlr *ctrlr; 5444 struct spdk_nvme_transport_id trid; 5445 struct discovery_entry_ctx *entry_ctx_in_use; 5446 struct spdk_poller *poller; 5447 struct spdk_nvme_ctrlr_opts drv_opts; 5448 struct nvme_ctrlr_opts bdev_opts; 5449 struct spdk_nvmf_discovery_log_page *log_page; 5450 TAILQ_ENTRY(discovery_ctx) tailq; 5451 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 5452 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 5453 int rc; 5454 bool wait_for_attach; 5455 uint64_t timeout_ticks; 5456 /* Denotes that the discovery service is being started. We're waiting 5457 * for the initial connection to the discovery controller to be 5458 * established and attach discovered NVM ctrlrs. 5459 */ 5460 bool initializing; 5461 /* Denotes if a discovery is currently in progress for this context. 5462 * That includes connecting to newly discovered subsystems. Used to 5463 * ensure we do not start a new discovery until an existing one is 5464 * complete. 5465 */ 5466 bool in_progress; 5467 5468 /* Denotes if another discovery is needed after the one in progress 5469 * completes. Set when we receive an AER completion while a discovery 5470 * is already in progress. 5471 */ 5472 bool pending; 5473 5474 /* Signal to the discovery context poller that it should stop the 5475 * discovery service, including detaching from the current discovery 5476 * controller. 5477 */ 5478 bool stop; 5479 5480 struct spdk_thread *calling_thread; 5481 uint32_t index; 5482 uint32_t attach_in_progress; 5483 char *hostnqn; 5484 5485 /* Denotes if the discovery service was started by the mdns discovery. 5486 */ 5487 bool from_mdns_discovery_service; 5488 }; 5489 5490 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 5491 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 5492 5493 static void get_discovery_log_page(struct discovery_ctx *ctx); 5494 5495 static void 5496 free_discovery_ctx(struct discovery_ctx *ctx) 5497 { 5498 free(ctx->log_page); 5499 free(ctx->hostnqn); 5500 free(ctx->name); 5501 free(ctx); 5502 } 5503 5504 static void 5505 discovery_complete(struct discovery_ctx *ctx) 5506 { 5507 ctx->initializing = false; 5508 ctx->in_progress = false; 5509 if (ctx->pending) { 5510 ctx->pending = false; 5511 get_discovery_log_page(ctx); 5512 } 5513 } 5514 5515 static void 5516 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 5517 struct spdk_nvmf_discovery_log_page_entry *entry) 5518 { 5519 char *space; 5520 5521 trid->trtype = entry->trtype; 5522 trid->adrfam = entry->adrfam; 5523 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 5524 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 5525 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 5526 5527 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 
5528 * But the log page entries typically pad them with spaces, not zeroes. 5529 * So add a NULL terminator to each of these fields at the appropriate 5530 * location. 5531 */ 5532 space = strchr(trid->traddr, ' '); 5533 if (space) { 5534 *space = 0; 5535 } 5536 space = strchr(trid->trsvcid, ' '); 5537 if (space) { 5538 *space = 0; 5539 } 5540 space = strchr(trid->subnqn, ' '); 5541 if (space) { 5542 *space = 0; 5543 } 5544 } 5545 5546 static void 5547 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5548 { 5549 ctx->stop = true; 5550 ctx->stop_cb_fn = cb_fn; 5551 ctx->cb_ctx = cb_ctx; 5552 5553 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 5554 struct discovery_entry_ctx *entry_ctx; 5555 struct nvme_path_id path = {}; 5556 5557 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 5558 path.trid = entry_ctx->trid; 5559 bdev_nvme_delete(entry_ctx->name, &path); 5560 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5561 free(entry_ctx); 5562 } 5563 5564 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 5565 struct discovery_entry_ctx *entry_ctx; 5566 5567 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5568 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5569 free(entry_ctx); 5570 } 5571 5572 free(ctx->entry_ctx_in_use); 5573 ctx->entry_ctx_in_use = NULL; 5574 } 5575 5576 static void 5577 discovery_remove_controllers(struct discovery_ctx *ctx) 5578 { 5579 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 5580 struct discovery_entry_ctx *entry_ctx, *tmp; 5581 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5582 struct spdk_nvme_transport_id old_trid; 5583 uint64_t numrec, i; 5584 bool found; 5585 5586 numrec = from_le64(&log_page->numrec); 5587 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 5588 found = false; 5589 old_entry = &entry_ctx->entry; 5590 build_trid_from_log_page_entry(&old_trid, old_entry); 5591 for (i = 0; i < numrec; i++) { 5592 new_entry = &log_page->entries[i]; 5593 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 5594 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 5595 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5596 found = true; 5597 break; 5598 } 5599 } 5600 if (!found) { 5601 struct nvme_path_id path = {}; 5602 5603 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 5604 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5605 5606 path.trid = entry_ctx->trid; 5607 bdev_nvme_delete(entry_ctx->name, &path); 5608 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5609 free(entry_ctx); 5610 } 5611 } 5612 free(log_page); 5613 ctx->log_page = NULL; 5614 discovery_complete(ctx); 5615 } 5616 5617 static void 5618 complete_discovery_start(struct discovery_ctx *ctx, int status) 5619 { 5620 ctx->timeout_ticks = 0; 5621 ctx->rc = status; 5622 if (ctx->start_cb_fn) { 5623 ctx->start_cb_fn(ctx->cb_ctx, status); 5624 ctx->start_cb_fn = NULL; 5625 ctx->cb_ctx = NULL; 5626 } 5627 } 5628 5629 static void 5630 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 5631 { 5632 struct discovery_entry_ctx *entry_ctx = cb_ctx; 5633 struct discovery_ctx *ctx = entry_ctx->ctx; 5634 5635 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 5636 ctx->attach_in_progress--; 5637 if (ctx->attach_in_progress == 0) { 5638 complete_discovery_start(ctx, ctx->rc); 5639 if (ctx->initializing && ctx->rc != 0) { 5640 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 5641 stop_discovery(ctx, NULL, ctx->cb_ctx); 5642 } 
else { 5643 discovery_remove_controllers(ctx); 5644 } 5645 } 5646 } 5647 5648 static struct discovery_entry_ctx * 5649 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 5650 { 5651 struct discovery_entry_ctx *new_ctx; 5652 5653 new_ctx = calloc(1, sizeof(*new_ctx)); 5654 if (new_ctx == NULL) { 5655 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5656 return NULL; 5657 } 5658 5659 new_ctx->ctx = ctx; 5660 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 5661 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5662 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5663 return new_ctx; 5664 } 5665 5666 static void 5667 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 5668 struct spdk_nvmf_discovery_log_page *log_page) 5669 { 5670 struct discovery_ctx *ctx = cb_arg; 5671 struct discovery_entry_ctx *entry_ctx, *tmp; 5672 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5673 uint64_t numrec, i; 5674 bool found; 5675 5676 if (rc || spdk_nvme_cpl_is_error(cpl)) { 5677 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5678 return; 5679 } 5680 5681 ctx->log_page = log_page; 5682 assert(ctx->attach_in_progress == 0); 5683 numrec = from_le64(&log_page->numrec); 5684 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 5685 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5686 free(entry_ctx); 5687 } 5688 for (i = 0; i < numrec; i++) { 5689 found = false; 5690 new_entry = &log_page->entries[i]; 5691 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 5692 struct discovery_entry_ctx *new_ctx; 5693 struct spdk_nvme_transport_id trid = {}; 5694 5695 build_trid_from_log_page_entry(&trid, new_entry); 5696 new_ctx = create_discovery_entry_ctx(ctx, &trid); 5697 if (new_ctx == NULL) { 5698 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5699 break; 5700 } 5701 5702 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 5703 continue; 5704 } 5705 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 5706 old_entry = &entry_ctx->entry; 5707 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 5708 found = true; 5709 break; 5710 } 5711 } 5712 if (!found) { 5713 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 5714 struct discovery_ctx *d_ctx; 5715 5716 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 5717 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 5718 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 5719 sizeof(new_entry->subnqn))) { 5720 break; 5721 } 5722 } 5723 if (subnqn_ctx) { 5724 break; 5725 } 5726 } 5727 5728 new_ctx = calloc(1, sizeof(*new_ctx)); 5729 if (new_ctx == NULL) { 5730 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5731 break; 5732 } 5733 5734 new_ctx->ctx = ctx; 5735 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 5736 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 5737 if (subnqn_ctx) { 5738 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 5739 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 5740 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5741 new_ctx->name); 5742 } else { 5743 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 5744 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 5745 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5746 new_ctx->name); 5747 } 5748 
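/* Attach the newly discovered NVM subsystem asynchronously via bdev_nvme_create().
 * Each call that succeeds bumps attach_in_progress; discovery_attach_controller_done()
 * drops it again, and only when it reaches zero are stale controllers removed and this
 * discovery cycle considered complete.
 */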
spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5749 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5750 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 5751 discovery_attach_controller_done, new_ctx, 5752 &new_ctx->drv_opts, &ctx->bdev_opts, true); 5753 if (rc == 0) { 5754 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 5755 ctx->attach_in_progress++; 5756 } else { 5757 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 5758 } 5759 } 5760 } 5761 5762 if (ctx->attach_in_progress == 0) { 5763 discovery_remove_controllers(ctx); 5764 } 5765 } 5766 5767 static void 5768 get_discovery_log_page(struct discovery_ctx *ctx) 5769 { 5770 int rc; 5771 5772 assert(ctx->in_progress == false); 5773 ctx->in_progress = true; 5774 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 5775 if (rc != 0) { 5776 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5777 } 5778 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 5779 } 5780 5781 static void 5782 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5783 { 5784 struct discovery_ctx *ctx = arg; 5785 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 5786 5787 if (spdk_nvme_cpl_is_error(cpl)) { 5788 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 5789 return; 5790 } 5791 5792 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 5793 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 5794 return; 5795 } 5796 5797 DISCOVERY_INFOLOG(ctx, "got aer\n"); 5798 if (ctx->in_progress) { 5799 ctx->pending = true; 5800 return; 5801 } 5802 5803 get_discovery_log_page(ctx); 5804 } 5805 5806 static void 5807 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5808 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5809 { 5810 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5811 struct discovery_ctx *ctx; 5812 5813 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 5814 5815 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 5816 ctx->probe_ctx = NULL; 5817 ctx->ctrlr = ctrlr; 5818 5819 if (ctx->rc != 0) { 5820 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 5821 ctx->rc); 5822 return; 5823 } 5824 5825 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 5826 } 5827 5828 static int 5829 discovery_poller(void *arg) 5830 { 5831 struct discovery_ctx *ctx = arg; 5832 struct spdk_nvme_transport_id *trid; 5833 int rc; 5834 5835 if (ctx->detach_ctx) { 5836 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 5837 if (rc != -EAGAIN) { 5838 ctx->detach_ctx = NULL; 5839 ctx->ctrlr = NULL; 5840 } 5841 } else if (ctx->stop) { 5842 if (ctx->ctrlr != NULL) { 5843 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5844 if (rc == 0) { 5845 return SPDK_POLLER_BUSY; 5846 } 5847 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5848 } 5849 spdk_poller_unregister(&ctx->poller); 5850 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5851 assert(ctx->start_cb_fn == NULL); 5852 if (ctx->stop_cb_fn != NULL) { 5853 ctx->stop_cb_fn(ctx->cb_ctx); 5854 } 5855 free_discovery_ctx(ctx); 5856 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 5857 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5858 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5859 assert(ctx->initializing); 5860 spdk_poller_unregister(&ctx->poller); 5861 
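	/* The initial connect to the discovery controller timed out, so tear the
	 * context down completely: report -ETIMEDOUT to the start callback, release
	 * any queued entry contexts via stop_discovery(), and free the context.
	 */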
TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5862 complete_discovery_start(ctx, -ETIMEDOUT); 5863 stop_discovery(ctx, NULL, NULL); 5864 free_discovery_ctx(ctx); 5865 return SPDK_POLLER_BUSY; 5866 } 5867 5868 assert(ctx->entry_ctx_in_use == NULL); 5869 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5870 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5871 trid = &ctx->entry_ctx_in_use->trid; 5872 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 5873 if (ctx->probe_ctx) { 5874 spdk_poller_unregister(&ctx->poller); 5875 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 5876 } else { 5877 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 5878 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5879 ctx->entry_ctx_in_use = NULL; 5880 } 5881 } else if (ctx->probe_ctx) { 5882 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5883 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5884 complete_discovery_start(ctx, -ETIMEDOUT); 5885 return SPDK_POLLER_BUSY; 5886 } 5887 5888 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5889 if (rc != -EAGAIN) { 5890 if (ctx->rc != 0) { 5891 assert(ctx->initializing); 5892 stop_discovery(ctx, NULL, ctx->cb_ctx); 5893 } else { 5894 assert(rc == 0); 5895 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 5896 ctx->rc = rc; 5897 get_discovery_log_page(ctx); 5898 } 5899 } 5900 } else { 5901 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5902 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 5903 complete_discovery_start(ctx, -ETIMEDOUT); 5904 /* We need to wait until all NVM ctrlrs are attached before we stop the 5905 * discovery service to make sure we don't detach a ctrlr that is still 5906 * being attached. 
5907 */ 5908 if (ctx->attach_in_progress == 0) { 5909 stop_discovery(ctx, NULL, ctx->cb_ctx); 5910 return SPDK_POLLER_BUSY; 5911 } 5912 } 5913 5914 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 5915 if (rc < 0) { 5916 spdk_poller_unregister(&ctx->poller); 5917 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5918 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5919 ctx->entry_ctx_in_use = NULL; 5920 5921 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5922 if (rc != 0) { 5923 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5924 ctx->ctrlr = NULL; 5925 } 5926 } 5927 } 5928 5929 return SPDK_POLLER_BUSY; 5930 } 5931 5932 static void 5933 start_discovery_poller(void *arg) 5934 { 5935 struct discovery_ctx *ctx = arg; 5936 5937 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 5938 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5939 } 5940 5941 int 5942 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 5943 const char *base_name, 5944 struct spdk_nvme_ctrlr_opts *drv_opts, 5945 struct nvme_ctrlr_opts *bdev_opts, 5946 uint64_t attach_timeout, 5947 bool from_mdns, 5948 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 5949 { 5950 struct discovery_ctx *ctx; 5951 struct discovery_entry_ctx *discovery_entry_ctx; 5952 5953 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 5954 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5955 if (strcmp(ctx->name, base_name) == 0) { 5956 return -EEXIST; 5957 } 5958 5959 if (ctx->entry_ctx_in_use != NULL) { 5960 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 5961 return -EEXIST; 5962 } 5963 } 5964 5965 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 5966 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 5967 return -EEXIST; 5968 } 5969 } 5970 } 5971 5972 ctx = calloc(1, sizeof(*ctx)); 5973 if (ctx == NULL) { 5974 return -ENOMEM; 5975 } 5976 5977 ctx->name = strdup(base_name); 5978 if (ctx->name == NULL) { 5979 free_discovery_ctx(ctx); 5980 return -ENOMEM; 5981 } 5982 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5983 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5984 ctx->from_mdns_discovery_service = from_mdns; 5985 ctx->bdev_opts.from_discovery_service = true; 5986 ctx->calling_thread = spdk_get_thread(); 5987 ctx->start_cb_fn = cb_fn; 5988 ctx->cb_ctx = cb_ctx; 5989 ctx->initializing = true; 5990 if (ctx->start_cb_fn) { 5991 /* We can use this when dumping json to denote if this RPC parameter 5992 * was specified or not. 
5993 */ 5994 ctx->wait_for_attach = true; 5995 } 5996 if (attach_timeout != 0) { 5997 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 5998 spdk_get_ticks_hz() / 1000ull; 5999 } 6000 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6001 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6002 memcpy(&ctx->trid, trid, sizeof(*trid)); 6003 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6004 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6005 if (ctx->hostnqn == NULL) { 6006 free_discovery_ctx(ctx); 6007 return -ENOMEM; 6008 } 6009 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6010 if (discovery_entry_ctx == NULL) { 6011 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6012 free_discovery_ctx(ctx); 6013 return -ENOMEM; 6014 } 6015 6016 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6017 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6018 return 0; 6019 } 6020 6021 int 6022 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6023 { 6024 struct discovery_ctx *ctx; 6025 6026 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6027 if (strcmp(name, ctx->name) == 0) { 6028 if (ctx->stop) { 6029 return -EALREADY; 6030 } 6031 /* If we're still starting the discovery service and ->rc is non-zero, we're 6032 * going to stop it as soon as we can 6033 */ 6034 if (ctx->initializing && ctx->rc != 0) { 6035 return -EALREADY; 6036 } 6037 stop_discovery(ctx, cb_fn, cb_ctx); 6038 return 0; 6039 } 6040 } 6041 6042 return -ENOENT; 6043 } 6044 6045 static int 6046 bdev_nvme_library_init(void) 6047 { 6048 g_bdev_nvme_init_thread = spdk_get_thread(); 6049 6050 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6051 bdev_nvme_destroy_poll_group_cb, 6052 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6053 6054 return 0; 6055 } 6056 6057 static void 6058 bdev_nvme_fini_destruct_ctrlrs(void) 6059 { 6060 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6061 struct nvme_ctrlr *nvme_ctrlr; 6062 6063 pthread_mutex_lock(&g_bdev_nvme_mutex); 6064 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6065 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6066 pthread_mutex_lock(&nvme_ctrlr->mutex); 6067 if (nvme_ctrlr->destruct) { 6068 /* This controller's destruction was already started 6069 * before the application started shutting down 6070 */ 6071 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6072 continue; 6073 } 6074 nvme_ctrlr->destruct = true; 6075 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6076 6077 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6078 nvme_ctrlr); 6079 } 6080 } 6081 6082 g_bdev_nvme_module_finish = true; 6083 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6084 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6085 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6086 spdk_bdev_module_fini_done(); 6087 return; 6088 } 6089 6090 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6091 } 6092 6093 static void 6094 check_discovery_fini(void *arg) 6095 { 6096 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6097 bdev_nvme_fini_destruct_ctrlrs(); 6098 } 6099 } 6100 6101 static void 6102 bdev_nvme_library_fini(void) 6103 { 6104 struct nvme_probe_skip_entry *entry, *entry_tmp; 6105 struct discovery_ctx *ctx; 6106 6107 spdk_poller_unregister(&g_hotplug_poller); 6108 free(g_hotplug_probe_ctx); 6109 g_hotplug_probe_ctx = NULL; 6110 6111 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6112 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6113 
free(entry); 6114 } 6115 6116 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 6117 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6118 bdev_nvme_fini_destruct_ctrlrs(); 6119 } else { 6120 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6121 stop_discovery(ctx, check_discovery_fini, NULL); 6122 } 6123 } 6124 } 6125 6126 static void 6127 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 6128 { 6129 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6130 struct spdk_bdev *bdev = bdev_io->bdev; 6131 struct spdk_dif_ctx dif_ctx; 6132 struct spdk_dif_error err_blk = {}; 6133 int rc; 6134 6135 rc = spdk_dif_ctx_init(&dif_ctx, 6136 bdev->blocklen, bdev->md_len, bdev->md_interleave, 6137 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 6138 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 6139 if (rc != 0) { 6140 SPDK_ERRLOG("Initialization of DIF context failed\n"); 6141 return; 6142 } 6143 6144 if (bdev->md_interleave) { 6145 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6146 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6147 } else { 6148 struct iovec md_iov = { 6149 .iov_base = bdev_io->u.bdev.md_buf, 6150 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 6151 }; 6152 6153 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6154 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6155 } 6156 6157 if (rc != 0) { 6158 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 6159 err_blk.err_type, err_blk.err_offset); 6160 } else { 6161 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 6162 } 6163 } 6164 6165 static void 6166 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6167 { 6168 struct nvme_bdev_io *bio = ref; 6169 6170 if (spdk_nvme_cpl_is_success(cpl)) { 6171 /* Run PI verification for read data buffer. */ 6172 bdev_nvme_verify_pi_error(bio); 6173 } 6174 6175 /* Return original completion status */ 6176 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6177 } 6178 6179 static void 6180 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6181 { 6182 struct nvme_bdev_io *bio = ref; 6183 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6184 int ret; 6185 6186 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 6187 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 6188 cpl->status.sct, cpl->status.sc); 6189 6190 /* Save completion status to use after verifying PI error. */ 6191 bio->cpl = *cpl; 6192 6193 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 6194 /* Read without PI checking to verify PI error. */ 6195 ret = bdev_nvme_no_pi_readv(bio, 6196 bdev_io->u.bdev.iovs, 6197 bdev_io->u.bdev.iovcnt, 6198 bdev_io->u.bdev.md_buf, 6199 bdev_io->u.bdev.num_blocks, 6200 bdev_io->u.bdev.offset_blocks); 6201 if (ret == 0) { 6202 return; 6203 } 6204 } 6205 } 6206 6207 bdev_nvme_io_complete_nvme_status(bio, cpl); 6208 } 6209 6210 static void 6211 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6212 { 6213 struct nvme_bdev_io *bio = ref; 6214 6215 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6216 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 6217 cpl->status.sct, cpl->status.sc); 6218 /* Run PI verification for write data buffer if PI error is detected. 
*/ 6219 bdev_nvme_verify_pi_error(bio); 6220 } 6221 6222 bdev_nvme_io_complete_nvme_status(bio, cpl); 6223 } 6224 6225 static void 6226 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6227 { 6228 struct nvme_bdev_io *bio = ref; 6229 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6230 6231 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 6232 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 6233 */ 6234 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 6235 6236 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6237 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 6238 cpl->status.sct, cpl->status.sc); 6239 /* Run PI verification for zone append data buffer if PI error is detected. */ 6240 bdev_nvme_verify_pi_error(bio); 6241 } 6242 6243 bdev_nvme_io_complete_nvme_status(bio, cpl); 6244 } 6245 6246 static void 6247 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6248 { 6249 struct nvme_bdev_io *bio = ref; 6250 6251 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6252 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 6253 cpl->status.sct, cpl->status.sc); 6254 /* Run PI verification for compare data buffer if PI error is detected. */ 6255 bdev_nvme_verify_pi_error(bio); 6256 } 6257 6258 bdev_nvme_io_complete_nvme_status(bio, cpl); 6259 } 6260 6261 static void 6262 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6263 { 6264 struct nvme_bdev_io *bio = ref; 6265 6266 /* Compare operation completion */ 6267 if (!bio->first_fused_completed) { 6268 /* Save compare result for write callback */ 6269 bio->cpl = *cpl; 6270 bio->first_fused_completed = true; 6271 return; 6272 } 6273 6274 /* Write operation completion */ 6275 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 6276 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 6277 * complete the IO with the compare operation's status. 
6278 */ 6279 if (!spdk_nvme_cpl_is_error(cpl)) { 6280 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 6281 } 6282 6283 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6284 } else { 6285 bdev_nvme_io_complete_nvme_status(bio, cpl); 6286 } 6287 } 6288 6289 static void 6290 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 6291 { 6292 struct nvme_bdev_io *bio = ref; 6293 6294 bdev_nvme_io_complete_nvme_status(bio, cpl); 6295 } 6296 6297 static int 6298 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 6299 { 6300 switch (desc->zt) { 6301 case SPDK_NVME_ZONE_TYPE_SEQWR: 6302 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 6303 break; 6304 default: 6305 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 6306 return -EIO; 6307 } 6308 6309 switch (desc->zs) { 6310 case SPDK_NVME_ZONE_STATE_EMPTY: 6311 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 6312 break; 6313 case SPDK_NVME_ZONE_STATE_IOPEN: 6314 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 6315 break; 6316 case SPDK_NVME_ZONE_STATE_EOPEN: 6317 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 6318 break; 6319 case SPDK_NVME_ZONE_STATE_CLOSED: 6320 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 6321 break; 6322 case SPDK_NVME_ZONE_STATE_RONLY: 6323 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 6324 break; 6325 case SPDK_NVME_ZONE_STATE_FULL: 6326 info->state = SPDK_BDEV_ZONE_STATE_FULL; 6327 break; 6328 case SPDK_NVME_ZONE_STATE_OFFLINE: 6329 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 6330 break; 6331 default: 6332 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 6333 return -EIO; 6334 } 6335 6336 info->zone_id = desc->zslba; 6337 info->write_pointer = desc->wp; 6338 info->capacity = desc->zcap; 6339 6340 return 0; 6341 } 6342 6343 static void 6344 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 6345 { 6346 struct nvme_bdev_io *bio = ref; 6347 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6348 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 6349 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 6350 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 6351 uint64_t max_zones_per_buf, i; 6352 uint32_t zone_report_bufsize; 6353 struct spdk_nvme_ns *ns; 6354 struct spdk_nvme_qpair *qpair; 6355 int ret; 6356 6357 if (spdk_nvme_cpl_is_error(cpl)) { 6358 goto out_complete_io_nvme_cpl; 6359 } 6360 6361 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 6362 ret = -ENXIO; 6363 goto out_complete_io_ret; 6364 } 6365 6366 ns = bio->io_path->nvme_ns->ns; 6367 qpair = bio->io_path->qpair->qpair; 6368 6369 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6370 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 6371 sizeof(bio->zone_report_buf->descs[0]); 6372 6373 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 6374 ret = -EINVAL; 6375 goto out_complete_io_ret; 6376 } 6377 6378 if (!bio->zone_report_buf->nr_zones) { 6379 ret = -EINVAL; 6380 goto out_complete_io_ret; 6381 } 6382 6383 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 6384 ret = fill_zone_from_report(&info[bio->handled_zones], 6385 &bio->zone_report_buf->descs[i]); 6386 if (ret) { 6387 goto out_complete_io_ret; 6388 } 6389 bio->handled_zones++; 6390 } 6391 6392 if (bio->handled_zones < zones_to_copy) { 6393 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6394 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 6395 
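	/* Not all requested zones fit in a single report buffer.  Clear the buffer
	 * and issue another report starting at the first unhandled zone; this
	 * completion callback then runs again for the next batch.
	 */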
6396 memset(bio->zone_report_buf, 0, zone_report_bufsize); 6397 ret = spdk_nvme_zns_report_zones(ns, qpair, 6398 bio->zone_report_buf, zone_report_bufsize, 6399 slba, SPDK_NVME_ZRA_LIST_ALL, true, 6400 bdev_nvme_get_zone_info_done, bio); 6401 if (!ret) { 6402 return; 6403 } else { 6404 goto out_complete_io_ret; 6405 } 6406 } 6407 6408 out_complete_io_nvme_cpl: 6409 free(bio->zone_report_buf); 6410 bio->zone_report_buf = NULL; 6411 bdev_nvme_io_complete_nvme_status(bio, cpl); 6412 return; 6413 6414 out_complete_io_ret: 6415 free(bio->zone_report_buf); 6416 bio->zone_report_buf = NULL; 6417 bdev_nvme_io_complete(bio, ret); 6418 } 6419 6420 static void 6421 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 6422 { 6423 struct nvme_bdev_io *bio = ref; 6424 6425 bdev_nvme_io_complete_nvme_status(bio, cpl); 6426 } 6427 6428 static void 6429 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 6430 { 6431 struct nvme_bdev_io *bio = ctx; 6432 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6433 const struct spdk_nvme_cpl *cpl = &bio->cpl; 6434 6435 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 6436 6437 __bdev_nvme_io_complete(bdev_io, 0, cpl); 6438 } 6439 6440 static void 6441 bdev_nvme_abort_complete(void *ctx) 6442 { 6443 struct nvme_bdev_io *bio = ctx; 6444 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6445 6446 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 6447 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 6448 } else { 6449 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 6450 } 6451 } 6452 6453 static void 6454 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 6455 { 6456 struct nvme_bdev_io *bio = ref; 6457 6458 bio->cpl = *cpl; 6459 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 6460 } 6461 6462 static void 6463 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 6464 { 6465 struct nvme_bdev_io *bio = ref; 6466 6467 bio->cpl = *cpl; 6468 spdk_thread_send_msg(bio->orig_thread, 6469 bdev_nvme_admin_passthru_complete_nvme_status, bio); 6470 } 6471 6472 static void 6473 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 6474 { 6475 struct nvme_bdev_io *bio = ref; 6476 struct iovec *iov; 6477 6478 bio->iov_offset = sgl_offset; 6479 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 6480 iov = &bio->iovs[bio->iovpos]; 6481 if (bio->iov_offset < iov->iov_len) { 6482 break; 6483 } 6484 6485 bio->iov_offset -= iov->iov_len; 6486 } 6487 } 6488 6489 static int 6490 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 6491 { 6492 struct nvme_bdev_io *bio = ref; 6493 struct iovec *iov; 6494 6495 assert(bio->iovpos < bio->iovcnt); 6496 6497 iov = &bio->iovs[bio->iovpos]; 6498 6499 *address = iov->iov_base; 6500 *length = iov->iov_len; 6501 6502 if (bio->iov_offset) { 6503 assert(bio->iov_offset <= iov->iov_len); 6504 *address += bio->iov_offset; 6505 *length -= bio->iov_offset; 6506 } 6507 6508 bio->iov_offset += *length; 6509 if (bio->iov_offset == iov->iov_len) { 6510 bio->iovpos++; 6511 bio->iov_offset = 0; 6512 } 6513 6514 return 0; 6515 } 6516 6517 static void 6518 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 6519 { 6520 struct nvme_bdev_io *bio = ref; 6521 struct iovec *iov; 6522 6523 bio->fused_iov_offset = sgl_offset; 6524 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 6525 iov = &bio->fused_iovs[bio->fused_iovpos]; 6526 if 
(bio->fused_iov_offset < iov->iov_len) { 6527 break; 6528 } 6529 6530 bio->fused_iov_offset -= iov->iov_len; 6531 } 6532 } 6533 6534 static int 6535 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 6536 { 6537 struct nvme_bdev_io *bio = ref; 6538 struct iovec *iov; 6539 6540 assert(bio->fused_iovpos < bio->fused_iovcnt); 6541 6542 iov = &bio->fused_iovs[bio->fused_iovpos]; 6543 6544 *address = iov->iov_base; 6545 *length = iov->iov_len; 6546 6547 if (bio->fused_iov_offset) { 6548 assert(bio->fused_iov_offset <= iov->iov_len); 6549 *address += bio->fused_iov_offset; 6550 *length -= bio->fused_iov_offset; 6551 } 6552 6553 bio->fused_iov_offset += *length; 6554 if (bio->fused_iov_offset == iov->iov_len) { 6555 bio->fused_iovpos++; 6556 bio->fused_iov_offset = 0; 6557 } 6558 6559 return 0; 6560 } 6561 6562 static int 6563 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6564 void *md, uint64_t lba_count, uint64_t lba) 6565 { 6566 int rc; 6567 6568 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 6569 lba_count, lba); 6570 6571 bio->iovs = iov; 6572 bio->iovcnt = iovcnt; 6573 bio->iovpos = 0; 6574 bio->iov_offset = 0; 6575 6576 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 6577 bio->io_path->qpair->qpair, 6578 lba, lba_count, 6579 bdev_nvme_no_pi_readv_done, bio, 0, 6580 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6581 md, 0, 0); 6582 6583 if (rc != 0 && rc != -ENOMEM) { 6584 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 6585 } 6586 return rc; 6587 } 6588 6589 static int 6590 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6591 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 6592 struct spdk_memory_domain *domain, void *domain_ctx) 6593 { 6594 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6595 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6596 int rc; 6597 6598 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6599 lba_count, lba); 6600 6601 bio->iovs = iov; 6602 bio->iovcnt = iovcnt; 6603 bio->iovpos = 0; 6604 bio->iov_offset = 0; 6605 6606 if (domain != NULL) { 6607 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6608 bio->ext_opts.memory_domain = domain; 6609 bio->ext_opts.memory_domain_ctx = domain_ctx; 6610 bio->ext_opts.io_flags = flags; 6611 bio->ext_opts.metadata = md; 6612 6613 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 6614 bdev_nvme_readv_done, bio, 6615 bdev_nvme_queued_reset_sgl, 6616 bdev_nvme_queued_next_sge, 6617 &bio->ext_opts); 6618 } else if (iovcnt == 1) { 6619 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 6620 md, lba, lba_count, bdev_nvme_readv_done, 6621 bio, flags, 0, 0); 6622 } else { 6623 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 6624 bdev_nvme_readv_done, bio, flags, 6625 bdev_nvme_queued_reset_sgl, 6626 bdev_nvme_queued_next_sge, md, 0, 0); 6627 } 6628 6629 if (rc != 0 && rc != -ENOMEM) { 6630 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 6631 } 6632 return rc; 6633 } 6634 6635 static int 6636 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6637 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 6638 struct spdk_memory_domain *domain, void *domain_ctx) 6639 { 6640 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6641 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6642 int rc; 6643 6644 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 
" blocks with offset %#" PRIx64 "\n", 6645 lba_count, lba); 6646 6647 bio->iovs = iov; 6648 bio->iovcnt = iovcnt; 6649 bio->iovpos = 0; 6650 bio->iov_offset = 0; 6651 6652 if (domain != NULL) { 6653 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6654 bio->ext_opts.memory_domain = domain; 6655 bio->ext_opts.memory_domain_ctx = domain_ctx; 6656 bio->ext_opts.io_flags = flags; 6657 bio->ext_opts.metadata = md; 6658 6659 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 6660 bdev_nvme_writev_done, bio, 6661 bdev_nvme_queued_reset_sgl, 6662 bdev_nvme_queued_next_sge, 6663 &bio->ext_opts); 6664 } else if (iovcnt == 1) { 6665 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 6666 md, lba, lba_count, bdev_nvme_writev_done, 6667 bio, flags, 0, 0); 6668 } else { 6669 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6670 bdev_nvme_writev_done, bio, flags, 6671 bdev_nvme_queued_reset_sgl, 6672 bdev_nvme_queued_next_sge, md, 0, 0); 6673 } 6674 6675 if (rc != 0 && rc != -ENOMEM) { 6676 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 6677 } 6678 return rc; 6679 } 6680 6681 static int 6682 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6683 void *md, uint64_t lba_count, uint64_t zslba, 6684 uint32_t flags) 6685 { 6686 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6687 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6688 int rc; 6689 6690 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 6691 lba_count, zslba); 6692 6693 bio->iovs = iov; 6694 bio->iovcnt = iovcnt; 6695 bio->iovpos = 0; 6696 bio->iov_offset = 0; 6697 6698 if (iovcnt == 1) { 6699 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 6700 lba_count, 6701 bdev_nvme_zone_appendv_done, bio, 6702 flags, 6703 0, 0); 6704 } else { 6705 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 6706 bdev_nvme_zone_appendv_done, bio, flags, 6707 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6708 md, 0, 0); 6709 } 6710 6711 if (rc != 0 && rc != -ENOMEM) { 6712 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 6713 } 6714 return rc; 6715 } 6716 6717 static int 6718 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6719 void *md, uint64_t lba_count, uint64_t lba, 6720 uint32_t flags) 6721 { 6722 int rc; 6723 6724 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6725 lba_count, lba); 6726 6727 bio->iovs = iov; 6728 bio->iovcnt = iovcnt; 6729 bio->iovpos = 0; 6730 bio->iov_offset = 0; 6731 6732 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 6733 bio->io_path->qpair->qpair, 6734 lba, lba_count, 6735 bdev_nvme_comparev_done, bio, flags, 6736 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6737 md, 0, 0); 6738 6739 if (rc != 0 && rc != -ENOMEM) { 6740 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 6741 } 6742 return rc; 6743 } 6744 6745 static int 6746 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 6747 struct iovec *write_iov, int write_iovcnt, 6748 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 6749 { 6750 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6751 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6752 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6753 int rc; 6754 6755 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6756 lba_count, lba); 6757 
6758 bio->iovs = cmp_iov; 6759 bio->iovcnt = cmp_iovcnt; 6760 bio->iovpos = 0; 6761 bio->iov_offset = 0; 6762 bio->fused_iovs = write_iov; 6763 bio->fused_iovcnt = write_iovcnt; 6764 bio->fused_iovpos = 0; 6765 bio->fused_iov_offset = 0; 6766 6767 if (bdev_io->num_retries == 0) { 6768 bio->first_fused_submitted = false; 6769 bio->first_fused_completed = false; 6770 } 6771 6772 if (!bio->first_fused_submitted) { 6773 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6774 memset(&bio->cpl, 0, sizeof(bio->cpl)); 6775 6776 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 6777 bdev_nvme_comparev_and_writev_done, bio, flags, 6778 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 6779 if (rc == 0) { 6780 bio->first_fused_submitted = true; 6781 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6782 } else { 6783 if (rc != -ENOMEM) { 6784 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 6785 } 6786 return rc; 6787 } 6788 } 6789 6790 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 6791 6792 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6793 bdev_nvme_comparev_and_writev_done, bio, flags, 6794 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 6795 if (rc != 0 && rc != -ENOMEM) { 6796 SPDK_ERRLOG("write failed: rc = %d\n", rc); 6797 rc = 0; 6798 } 6799 6800 return rc; 6801 } 6802 6803 static int 6804 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6805 { 6806 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 6807 struct spdk_nvme_dsm_range *range; 6808 uint64_t offset, remaining; 6809 uint64_t num_ranges_u64; 6810 uint16_t num_ranges; 6811 int rc; 6812 6813 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 6814 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6815 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 6816 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 6817 return -EINVAL; 6818 } 6819 num_ranges = (uint16_t)num_ranges_u64; 6820 6821 offset = offset_blocks; 6822 remaining = num_blocks; 6823 range = &dsm_ranges[0]; 6824 6825 /* Fill max-size ranges until the remaining blocks fit into one range */ 6826 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 6827 range->attributes.raw = 0; 6828 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6829 range->starting_lba = offset; 6830 6831 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6832 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6833 range++; 6834 } 6835 6836 /* Final range describes the remaining blocks */ 6837 range->attributes.raw = 0; 6838 range->length = remaining; 6839 range->starting_lba = offset; 6840 6841 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 6842 bio->io_path->qpair->qpair, 6843 SPDK_NVME_DSM_ATTR_DEALLOCATE, 6844 dsm_ranges, num_ranges, 6845 bdev_nvme_queued_done, bio); 6846 6847 return rc; 6848 } 6849 6850 static int 6851 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6852 { 6853 if (num_blocks > UINT16_MAX + 1) { 6854 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 6855 return -EINVAL; 6856 } 6857 6858 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 6859 bio->io_path->qpair->qpair, 6860 offset_blocks, num_blocks, 6861 bdev_nvme_queued_done, bio, 6862 0); 6863 } 6864 6865 static int 6866 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 6867 struct 
spdk_bdev_zone_info *info) 6868 { 6869 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6870 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6871 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6872 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6873 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 6874 6875 if (zone_id % zone_size != 0) { 6876 return -EINVAL; 6877 } 6878 6879 if (num_zones > total_zones || !num_zones) { 6880 return -EINVAL; 6881 } 6882 6883 assert(!bio->zone_report_buf); 6884 bio->zone_report_buf = calloc(1, zone_report_bufsize); 6885 if (!bio->zone_report_buf) { 6886 return -ENOMEM; 6887 } 6888 6889 bio->handled_zones = 0; 6890 6891 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 6892 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 6893 bdev_nvme_get_zone_info_done, bio); 6894 } 6895 6896 static int 6897 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 6898 enum spdk_bdev_zone_action action) 6899 { 6900 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6901 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6902 6903 switch (action) { 6904 case SPDK_BDEV_ZONE_CLOSE: 6905 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 6906 bdev_nvme_zone_management_done, bio); 6907 case SPDK_BDEV_ZONE_FINISH: 6908 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 6909 bdev_nvme_zone_management_done, bio); 6910 case SPDK_BDEV_ZONE_OPEN: 6911 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 6912 bdev_nvme_zone_management_done, bio); 6913 case SPDK_BDEV_ZONE_RESET: 6914 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 6915 bdev_nvme_zone_management_done, bio); 6916 case SPDK_BDEV_ZONE_OFFLINE: 6917 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 6918 bdev_nvme_zone_management_done, bio); 6919 default: 6920 return -EINVAL; 6921 } 6922 } 6923 6924 static void 6925 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6926 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 6927 { 6928 struct nvme_io_path *io_path; 6929 struct nvme_ctrlr *nvme_ctrlr; 6930 uint32_t max_xfer_size; 6931 int rc = -ENXIO; 6932 6933 /* Choose the first ctrlr which is not failed. */ 6934 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6935 nvme_ctrlr = io_path->qpair->ctrlr; 6936 6937 /* We should skip any unavailable nvme_ctrlr rather than checking 6938 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
6939 */ 6940 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 6941 continue; 6942 } 6943 6944 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 6945 6946 if (nbytes > max_xfer_size) { 6947 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6948 rc = -EINVAL; 6949 goto err; 6950 } 6951 6952 bio->io_path = io_path; 6953 bio->orig_thread = spdk_get_thread(); 6954 6955 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 6956 bdev_nvme_admin_passthru_done, bio); 6957 if (rc == 0) { 6958 return; 6959 } 6960 } 6961 6962 err: 6963 bdev_nvme_admin_passthru_complete(bio, rc); 6964 } 6965 6966 static int 6967 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6968 void *buf, size_t nbytes) 6969 { 6970 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6971 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6972 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6973 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6974 6975 if (nbytes > max_xfer_size) { 6976 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6977 return -EINVAL; 6978 } 6979 6980 /* 6981 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6982 * so fill it out automatically. 6983 */ 6984 cmd->nsid = spdk_nvme_ns_get_id(ns); 6985 6986 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 6987 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 6988 } 6989 6990 static int 6991 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6992 void *buf, size_t nbytes, void *md_buf, size_t md_len) 6993 { 6994 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6995 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6996 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 6997 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6998 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6999 7000 if (nbytes > max_xfer_size) { 7001 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7002 return -EINVAL; 7003 } 7004 7005 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 7006 SPDK_ERRLOG("invalid meta data buffer size\n"); 7007 return -EINVAL; 7008 } 7009 7010 /* 7011 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7012 * so fill it out automatically. 7013 */ 7014 cmd->nsid = spdk_nvme_ns_get_id(ns); 7015 7016 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 7017 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 7018 } 7019 7020 static void 7021 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7022 struct nvme_bdev_io *bio_to_abort) 7023 { 7024 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7025 struct nvme_io_path *io_path; 7026 struct nvme_ctrlr *nvme_ctrlr; 7027 int rc = 0; 7028 7029 bio->orig_thread = spdk_get_thread(); 7030 7031 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 7032 if (rc == 0) { 7033 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7034 return; 7035 } 7036 7037 rc = 0; 7038 7039 /* Even admin commands, they were submitted to only nvme_ctrlrs which were 7040 * on any io_path. So traverse the io_path list for not only I/O commands 7041 * but also admin commands. 
7042 */ 7043 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7044 nvme_ctrlr = io_path->qpair->ctrlr; 7045 7046 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 7047 io_path->qpair->qpair, 7048 bio_to_abort, 7049 bdev_nvme_abort_done, bio); 7050 if (rc == -ENOENT) { 7051 /* If no command was found in I/O qpair, the target command may be 7052 * admin command. 7053 */ 7054 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 7055 NULL, 7056 bio_to_abort, 7057 bdev_nvme_abort_done, bio); 7058 } 7059 7060 if (rc != -ENOENT) { 7061 break; 7062 } 7063 } 7064 7065 if (rc != 0) { 7066 /* If no command was found or there was any error, complete the abort 7067 * request with failure. 7068 */ 7069 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7070 } 7071 } 7072 7073 static int 7074 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 7075 uint64_t num_blocks) 7076 { 7077 struct spdk_nvme_scc_source_range range = { 7078 .slba = src_offset_blocks, 7079 .nlb = num_blocks - 1 7080 }; 7081 7082 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 7083 bio->io_path->qpair->qpair, 7084 &range, 1, dst_offset_blocks, 7085 bdev_nvme_queued_done, bio); 7086 } 7087 7088 static void 7089 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 7090 { 7091 const char *action; 7092 7093 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 7094 action = "reset"; 7095 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 7096 action = "abort"; 7097 } else { 7098 action = "none"; 7099 } 7100 7101 spdk_json_write_object_begin(w); 7102 7103 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 7104 7105 spdk_json_write_named_object_begin(w, "params"); 7106 spdk_json_write_named_string(w, "action_on_timeout", action); 7107 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 7108 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 7109 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 7110 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 7111 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 7112 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 7113 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 7114 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 7115 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 7116 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 7117 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 7118 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 7119 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 7120 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 7121 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 7122 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 7123 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 7124 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 7125 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 7126 
spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 7127 spdk_json_write_object_end(w); 7128 7129 spdk_json_write_object_end(w); 7130 } 7131 7132 static void 7133 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 7134 { 7135 struct spdk_nvme_transport_id trid; 7136 7137 spdk_json_write_object_begin(w); 7138 7139 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 7140 7141 spdk_json_write_named_object_begin(w, "params"); 7142 spdk_json_write_named_string(w, "name", ctx->name); 7143 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 7144 7145 trid = ctx->trid; 7146 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 7147 nvme_bdev_dump_trid_json(&trid, w); 7148 7149 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 7150 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 7151 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 7152 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 7153 ctx->bdev_opts.fast_io_fail_timeout_sec); 7154 spdk_json_write_object_end(w); 7155 7156 spdk_json_write_object_end(w); 7157 } 7158 7159 static void 7160 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 7161 struct nvme_ctrlr *nvme_ctrlr) 7162 { 7163 struct spdk_nvme_transport_id *trid; 7164 const struct spdk_nvme_ctrlr_opts *opts; 7165 7166 if (nvme_ctrlr->opts.from_discovery_service) { 7167 /* Do not emit an RPC for this - it will be implicitly 7168 * covered by a separate bdev_nvme_start_discovery or 7169 * bdev_nvme_start_mdns_discovery RPC. 7170 */ 7171 return; 7172 } 7173 7174 trid = &nvme_ctrlr->active_path_id->trid; 7175 7176 spdk_json_write_object_begin(w); 7177 7178 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 7179 7180 spdk_json_write_named_object_begin(w, "params"); 7181 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 7182 nvme_bdev_dump_trid_json(trid, w); 7183 spdk_json_write_named_bool(w, "prchk_reftag", 7184 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 7185 spdk_json_write_named_bool(w, "prchk_guard", 7186 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 7187 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 7188 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 7189 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 7190 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 7191 7192 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 7193 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 7194 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 7195 7196 spdk_json_write_object_end(w); 7197 7198 spdk_json_write_object_end(w); 7199 } 7200 7201 static void 7202 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 7203 { 7204 spdk_json_write_object_begin(w); 7205 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 7206 7207 spdk_json_write_named_object_begin(w, "params"); 7208 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 7209 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 7210 spdk_json_write_object_end(w); 7211 7212 spdk_json_write_object_end(w); 7213 } 7214 7215 static int 7216 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 7217 { 7218 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7219 struct nvme_ctrlr *nvme_ctrlr; 7220 
struct discovery_ctx *ctx; 7221 7222 bdev_nvme_opts_config_json(w); 7223 7224 pthread_mutex_lock(&g_bdev_nvme_mutex); 7225 7226 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7227 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7228 nvme_ctrlr_config_json(w, nvme_ctrlr); 7229 } 7230 } 7231 7232 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7233 if (!ctx->from_mdns_discovery_service) { 7234 bdev_nvme_discovery_config_json(w, ctx); 7235 } 7236 } 7237 7238 bdev_nvme_mdns_discovery_config_json(w); 7239 7240 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 7241 * before enabling hotplug poller. 7242 */ 7243 bdev_nvme_hotplug_config_json(w); 7244 7245 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7246 return 0; 7247 } 7248 7249 struct spdk_nvme_ctrlr * 7250 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 7251 { 7252 struct nvme_bdev *nbdev; 7253 struct nvme_ns *nvme_ns; 7254 7255 if (!bdev || bdev->module != &nvme_if) { 7256 return NULL; 7257 } 7258 7259 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 7260 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 7261 assert(nvme_ns != NULL); 7262 7263 return nvme_ns->ctrlr->ctrlr; 7264 } 7265 7266 void 7267 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 7268 { 7269 struct nvme_ns *nvme_ns = io_path->nvme_ns; 7270 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 7271 const struct spdk_nvme_ctrlr_data *cdata; 7272 const struct spdk_nvme_transport_id *trid; 7273 const char *adrfam_str; 7274 7275 spdk_json_write_object_begin(w); 7276 7277 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 7278 7279 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 7280 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 7281 7282 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 7283 spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL && 7284 io_path == io_path->nbdev_ch->current_io_path); 7285 spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path)); 7286 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 7287 7288 spdk_json_write_named_object_begin(w, "transport"); 7289 spdk_json_write_named_string(w, "trtype", trid->trstring); 7290 spdk_json_write_named_string(w, "traddr", trid->traddr); 7291 if (trid->trsvcid[0] != '\0') { 7292 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 7293 } 7294 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 7295 if (adrfam_str) { 7296 spdk_json_write_named_string(w, "adrfam", adrfam_str); 7297 } 7298 spdk_json_write_object_end(w); 7299 7300 spdk_json_write_object_end(w); 7301 } 7302 7303 void 7304 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 7305 { 7306 struct discovery_ctx *ctx; 7307 struct discovery_entry_ctx *entry_ctx; 7308 7309 spdk_json_write_array_begin(w); 7310 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7311 spdk_json_write_object_begin(w); 7312 spdk_json_write_named_string(w, "name", ctx->name); 7313 7314 spdk_json_write_named_object_begin(w, "trid"); 7315 nvme_bdev_dump_trid_json(&ctx->trid, w); 7316 spdk_json_write_object_end(w); 7317 7318 spdk_json_write_named_array_begin(w, "referrals"); 7319 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7320 spdk_json_write_object_begin(w); 7321 spdk_json_write_named_object_begin(w, "trid"); 7322 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 7323 spdk_json_write_object_end(w); 7324 spdk_json_write_object_end(w); 7325 } 7326 spdk_json_write_array_end(w); 
7327 7328 spdk_json_write_object_end(w); 7329 } 7330 spdk_json_write_array_end(w); 7331 } 7332 7333 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 7334 7335 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 7336 { 7337 struct spdk_trace_tpoint_opts opts[] = { 7338 { 7339 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 7340 OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 7341 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7342 }, 7343 { 7344 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 7345 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 7346 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7347 } 7348 }; 7349 7350 7351 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 7352 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 7353 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 7354 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 7355 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 7356 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 7357 } 7358
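/*
 * A minimal usage sketch (illustration only, not compiled into the module):
 * how a caller on the SPDK app thread might start the discovery service
 * implemented above.  The transport address, bdev base name, and callback
 * below are hypothetical, and bdev_opts is left zeroed for brevity; a real
 * caller would fill in reconnect/timeout options as needed.
 *
 *	static void
 *	example_discovery_started(void *cb_ctx, int status)
 *	{
 *		SPDK_NOTICELOG("discovery start completed: %d\n", status);
 *	}
 *
 *	static void
 *	example_start_discovery(void)
 *	{
 *		struct spdk_nvme_transport_id trid = {};
 *		struct spdk_nvme_ctrlr_opts drv_opts;
 *		struct nvme_ctrlr_opts bdev_opts = {};
 *		int rc;
 *
 *		spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_TCP);
 *		trid.adrfam = SPDK_NVMF_ADRFAM_IPV4;
 *		snprintf(trid.traddr, sizeof(trid.traddr), "10.0.0.1");
 *		snprintf(trid.trsvcid, sizeof(trid.trsvcid), "4420");
 *
 *		spdk_nvme_ctrlr_get_default_ctrlr_opts(&drv_opts, sizeof(drv_opts));
 *
 *		rc = bdev_nvme_start_discovery(&trid, "nvme_disc", &drv_opts, &bdev_opts,
 *					       0, false, example_discovery_started, NULL);
 *		if (rc != 0) {
 *			SPDK_ERRLOG("bdev_nvme_start_discovery() failed: %d\n", rc);
 *		}
 *	}
 *
 * bdev_nvme_start_discovery() overwrites trid.subnqn with the well-known
 * discovery NQN, so the caller does not need to set it.
 */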