/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
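
/* Per-I/O context carried in the driver_ctx area of every spdk_bdev_io submitted
 * to this module (see bdev_nvme_get_ctx_size()).
 */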
struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 *  being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}
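
/* Final teardown of an nvme_ctrlr after detach has completed: free the ANA log
 * page and OPAL context, drop the controller from its nvme_bdev_ctrlr, release
 * all namespaces and trids, and finish module shutdown if this was the last
 * controller.
 */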
static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return -ENOMEM;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return -ENOMEM;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		free(io_path->stat);
		free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}
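
/* Tear down one io_path: fold its per-path statistics back into the shared
 * nvme_ns stat, unlink it from both the bdev channel and the qpair, and release
 * the controller I/O channel reference taken in _bdev_nvme_add_io_path().
 */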
static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	free(io_path->stat);
	free(io_path);
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_io_path_is_connected(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(io_path->qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) !=
	    SPDK_NVME_QPAIR_FAILURE_NONE) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_failed(struct nvme_io_path *io_path)
{
	struct nvme_ctrlr *nvme_ctrlr;

	nvme_ctrlr = io_path->qpair->ctrlr;

	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_connected(io_path) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}
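
/* Queue-depth selector: among connected paths whose ANA state is not updating,
 * pick the one with the fewest outstanding requests, preferring optimized paths
 * over non-optimized ones. The result is deliberately not cached.
 */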
static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}
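
/* Path selection entry point. ACTIVE_PASSIVE reuses the cached current_io_path;
 * the round robin selector reuses it for up to rr_min_io I/Os before advancing;
 * the queue depth selector re-evaluates on every submission.
 */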
static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_io_path_is_connected(io_path) ||
		    !nvme_io_path_is_failed(io_path)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_retry_io(nbdev_ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}
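
/* Queue an I/O for retry after delay_ms. The retry list is kept sorted by
 * expiration tick, and the retry poller is re-armed only when the new entry
 * becomes the earliest one.
 */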
static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io_to_abort;

	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			__bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}
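
/* Accumulate per-I/O-path statistics (byte counts, op counts, and min/max/total
 * latency) for a successfully completed I/O. Only populated when io_path_stat
 * is enabled.
 */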
static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}
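
/* Complete an I/O based on its NVMe completion status. Retryable failures are
 * requeued while bdev_retry_count allows: path and ANA errors switch paths
 * (optionally refreshing the ANA log) and retry immediately, other errors honor
 * the controller's CRD-based retry delay. Everything else is completed to the
 * bdev layer with the NVMe status.
 */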
static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	assert(bio->io_path != NULL);
	io_path = bio->io_path;

	nvme_ctrlr = io_path->qpair->ctrlr;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			goto complete;
		}
		delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			delay_ms = 0;
		}
	}

	bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
	return;

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;

		if (any_io_path_may_become_available(nbdev_ch)) {
			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
			return;
		}

	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}
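
/* Invalidate the cached current_io_path of every bdev channel that has a path on
 * this qpair, forcing the next I/O to re-run path selection.
 */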
static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);
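
/* Poll group callback for a disconnected qpair. The qpair is freed here; if a
 * controller reset is already in progress the reset sequence simply advances to
 * the next channel, otherwise a failover/reset is triggered, and if the
 * ctrlr_channel is already gone the nvme_qpair itself is deleted.
 */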
static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* If we are already in a full reset sequence, we do not have
			 * to restart it. Just move to the next ctrlr_channel.
			 */
			SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
				      qpair);
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* qpair was disconnected unexpectedly. Reset controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover(nvme_qpair->ctrlr, false);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}
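
/* Admin queue poller. A failed admin completion either finishes an in-progress
 * disconnect (via disconnected_cb) or triggers failover; otherwise, if the admin
 * qpair reports a failure reason, every channel's cached I/O path is invalidated.
 */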
static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (rc == -ENXIO && disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover(nvme_ctrlr, false);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
_bdev_nvme_unregister_dev_cb(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
	struct spdk_bdev_io *bdev_io;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, status, NULL);
	}

	spdk_for_each_channel_continue(i, 0);
}
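
/* Switch the controller to the next registered trid, if any. The current path is
 * marked failed and, unless it is being removed, rotated to the tail of the list
 * so that repeated failovers round robin across all configured paths.
 */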
static void
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	path_id->is_failed = true;

	if (next_path) {
		assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
			       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
		nvme_ctrlr->active_path_id = next_path;
		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
		} else {
			free(path_id);
		}
	}
}

static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* During disconnection, reduce the period to poll adminq more often. */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;
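
/* Decide what to do once a reset attempt finishes: complete a pending destruct,
 * do nothing on success (or when no reconnect delay is configured), destroy the
 * controller once ctrlr_loss_timeout has elapsed, or schedule a delayed reconnect,
 * possibly declaring fast I/O failure first.
 */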
static _bdev_nvme_op_after_reset
bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		/* Complete pending destruct after reset completes. */
		return OP_COMPLETE_PENDING_DESTRUCT;
	} else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		nvme_ctrlr->reset_start_tsc = 0;
		return OP_NONE;
	} else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
		return OP_DESTRUCT;
	} else {
		if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
			nvme_ctrlr->fast_io_fail_timedout = true;
		}
		bdev_nvme_failover_trid(nvme_ctrlr, false);
		return OP_DELAYED_RECONNECT;
	}
}

static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);

static int
bdev_nvme_reconnect_delay_timer_expired(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&nvme_ctrlr->mutex);

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	assert(nvme_ctrlr->reconnect_is_delayed == true);
	nvme_ctrlr->reconnect_is_delayed = false;

	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return SPDK_POLLER_BUSY;
	}

	assert(nvme_ctrlr->resetting == false);
	nvme_ctrlr->resetting = true;

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);

	bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
{
	spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);

	assert(nvme_ctrlr->reconnect_is_delayed == false);
	nvme_ctrlr->reconnect_is_delayed = true;

	assert(nvme_ctrlr->reconnect_delay_timer == NULL);
	nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
					    nvme_ctrlr,
					    nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC);
}

static void
_bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
	struct nvme_path_id *path_id;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
	enum bdev_nvme_op_after_reset op_after_reset;

	assert(nvme_ctrlr->thread == spdk_get_thread());

	nvme_ctrlr->reset_cb_fn = NULL;
	nvme_ctrlr->reset_cb_arg = NULL;

	if (!success) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id != NULL);
	assert(path_id == nvme_ctrlr->active_path_id);

	path_id->is_failed = !success;

	op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
		reset_cb_fn(reset_cb_arg, success);
	}

	switch (op_after_reset) {
	case OP_COMPLETE_PENDING_DESTRUCT:
		nvme_ctrlr_unregister(nvme_ctrlr);
		break;
	case OP_DESTRUCT:
		_bdev_nvme_delete(nvme_ctrlr, false);
		break;
	case OP_DELAYED_RECONNECT:
		nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
break; 1922 default: 1923 break; 1924 } 1925 } 1926 1927 static void 1928 bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 1929 { 1930 /* Make sure we clear any pending resets before returning. */ 1931 spdk_for_each_channel(nvme_ctrlr, 1932 bdev_nvme_complete_pending_resets, 1933 success ? NULL : (void *)0x1, 1934 _bdev_nvme_reset_complete); 1935 } 1936 1937 static void 1938 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 1939 { 1940 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1941 1942 bdev_nvme_reset_complete(nvme_ctrlr, false); 1943 } 1944 1945 static void 1946 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 1947 { 1948 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1949 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 1950 struct nvme_qpair *nvme_qpair; 1951 1952 nvme_qpair = ctrlr_ch->qpair; 1953 assert(nvme_qpair != NULL); 1954 1955 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1956 1957 if (nvme_qpair->qpair != NULL) { 1958 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 1959 1960 /* The current full reset sequence will move to the next 1961 * ctrlr_channel after the qpair is actually disconnected. 1962 */ 1963 assert(ctrlr_ch->reset_iter == NULL); 1964 ctrlr_ch->reset_iter = i; 1965 } else { 1966 spdk_for_each_channel_continue(i, 0); 1967 } 1968 } 1969 1970 static void 1971 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 1972 { 1973 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1974 1975 if (status == 0) { 1976 bdev_nvme_reset_complete(nvme_ctrlr, true); 1977 } else { 1978 /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ 1979 spdk_for_each_channel(nvme_ctrlr, 1980 bdev_nvme_reset_destroy_qpair, 1981 NULL, 1982 bdev_nvme_reset_create_qpairs_failed); 1983 } 1984 } 1985 1986 static void 1987 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 1988 { 1989 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1990 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1991 int rc; 1992 1993 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 1994 1995 spdk_for_each_channel_continue(i, rc); 1996 } 1997 1998 static int 1999 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2000 { 2001 struct nvme_ctrlr *nvme_ctrlr = arg; 2002 int rc = -ETIMEDOUT; 2003 2004 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2005 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2006 if (rc == -EAGAIN) { 2007 return SPDK_POLLER_BUSY; 2008 } 2009 } 2010 2011 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2012 if (rc == 0) { 2013 /* Recreate all of the I/O queue pairs */ 2014 spdk_for_each_channel(nvme_ctrlr, 2015 bdev_nvme_reset_create_qpair, 2016 NULL, 2017 bdev_nvme_reset_create_qpairs_done); 2018 } else { 2019 bdev_nvme_reset_complete(nvme_ctrlr, false); 2020 } 2021 return SPDK_POLLER_BUSY; 2022 } 2023 2024 static void 2025 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2026 { 2027 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2028 2029 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2030 assert(nvme_ctrlr->reset_detach_poller == NULL); 2031 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2032 nvme_ctrlr, 0); 2033 } 2034 2035 static void 2036 bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 2037 { 2038 struct nvme_ctrlr *nvme_ctrlr = 
spdk_io_channel_iter_get_io_device(i); 2039 2040 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2041 assert(status == 0); 2042 2043 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2044 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2045 } else { 2046 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2047 } 2048 } 2049 2050 static void 2051 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2052 { 2053 spdk_for_each_channel(nvme_ctrlr, 2054 bdev_nvme_reset_destroy_qpair, 2055 NULL, 2056 bdev_nvme_reset_ctrlr); 2057 } 2058 2059 static void 2060 _bdev_nvme_reset(void *ctx) 2061 { 2062 struct nvme_ctrlr *nvme_ctrlr = ctx; 2063 2064 assert(nvme_ctrlr->resetting == true); 2065 assert(nvme_ctrlr->thread == spdk_get_thread()); 2066 2067 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2068 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2069 } else { 2070 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2071 } 2072 } 2073 2074 static int 2075 bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) 2076 { 2077 pthread_mutex_lock(&nvme_ctrlr->mutex); 2078 if (nvme_ctrlr->destruct) { 2079 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2080 return -ENXIO; 2081 } 2082 2083 if (nvme_ctrlr->resetting) { 2084 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2085 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2086 return -EBUSY; 2087 } 2088 2089 if (nvme_ctrlr->reconnect_is_delayed) { 2090 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2091 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2092 return -EBUSY; 2093 } 2094 2095 nvme_ctrlr->resetting = true; 2096 2097 assert(nvme_ctrlr->reset_start_tsc == 0); 2098 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2099 2100 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2101 2102 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 2103 return 0; 2104 } 2105 2106 int 2107 bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) 2108 { 2109 int rc; 2110 2111 rc = bdev_nvme_reset(nvme_ctrlr); 2112 if (rc == 0) { 2113 nvme_ctrlr->reset_cb_fn = cb_fn; 2114 nvme_ctrlr->reset_cb_arg = cb_arg; 2115 } 2116 return rc; 2117 } 2118 2119 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2120 2121 static void 2122 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2123 { 2124 enum spdk_bdev_io_status io_status; 2125 2126 if (bio->cpl.cdw0 == 0) { 2127 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2128 } else { 2129 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2130 } 2131 2132 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2133 } 2134 2135 static void 2136 _bdev_nvme_reset_io_continue(void *ctx) 2137 { 2138 struct nvme_bdev_io *bio = ctx; 2139 struct nvme_io_path *prev_io_path, *next_io_path; 2140 int rc; 2141 2142 prev_io_path = bio->io_path; 2143 bio->io_path = NULL; 2144 2145 if (bio->cpl.cdw0 != 0) { 2146 goto complete; 2147 } 2148 2149 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2150 if (next_io_path == NULL) { 2151 goto complete; 2152 } 2153 2154 rc = _bdev_nvme_reset_io(next_io_path, bio); 2155 if (rc == 0) { 2156 return; 2157 } 2158 2159 bio->cpl.cdw0 = 1; 2160 2161 complete: 2162 bdev_nvme_reset_io_complete(bio); 2163 } 2164 2165 static void 2166 bdev_nvme_reset_io_continue(void *cb_arg, bool success) 2167 { 2168 struct nvme_bdev_io *bio = cb_arg; 2169 2170 bio->cpl.cdw0 = !success; 2171 2172 spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); 2173 } 2174 2175 static int 2176 
_bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2177 { 2178 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 2179 struct nvme_ctrlr_channel *ctrlr_ch; 2180 struct spdk_bdev_io *bdev_io; 2181 int rc; 2182 2183 rc = bdev_nvme_reset(nvme_ctrlr); 2184 if (rc == 0) { 2185 assert(bio->io_path == NULL); 2186 bio->io_path = io_path; 2187 2188 assert(nvme_ctrlr->reset_cb_fn == NULL); 2189 assert(nvme_ctrlr->reset_cb_arg == NULL); 2190 nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; 2191 nvme_ctrlr->reset_cb_arg = bio; 2192 } else if (rc == -EBUSY) { 2193 ctrlr_ch = io_path->qpair->ctrlr_ch; 2194 assert(ctrlr_ch != NULL); 2195 /* 2196 * Reset call is queued only if it is from the app framework. This is on purpose so that 2197 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2198 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2199 */ 2200 bdev_io = spdk_bdev_io_from_ctx(bio); 2201 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2202 } else { 2203 return rc; 2204 } 2205 2206 return 0; 2207 } 2208 2209 static void 2210 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2211 { 2212 struct nvme_io_path *io_path; 2213 int rc; 2214 2215 bio->cpl.cdw0 = 0; 2216 bio->orig_thread = spdk_get_thread(); 2217 2218 /* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now. 2219 * 2220 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially. 2221 * This will be done in the following patches. 2222 */ 2223 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2224 assert(io_path != NULL); 2225 2226 rc = _bdev_nvme_reset_io(io_path, bio); 2227 if (rc != 0) { 2228 bio->cpl.cdw0 = 1; 2229 bdev_nvme_reset_io_complete(bio); 2230 } 2231 } 2232 2233 static int 2234 bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2235 { 2236 pthread_mutex_lock(&nvme_ctrlr->mutex); 2237 if (nvme_ctrlr->destruct) { 2238 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2239 /* Don't bother resetting if the controller is in the process of being destructed. */ 2240 return -ENXIO; 2241 } 2242 2243 if (nvme_ctrlr->resetting) { 2244 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2245 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2246 return -EBUSY; 2247 } 2248 2249 bdev_nvme_failover_trid(nvme_ctrlr, remove); 2250 2251 if (nvme_ctrlr->reconnect_is_delayed) { 2252 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2253 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2254 2255 /* We rely on the next reconnect for the failover. 
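* bdev_nvme_failover_trid() has already rotated the trid list above, so the
* delayed reconnect will use the newly selected path.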
*/ 2256 return 0; 2257 } 2258 2259 nvme_ctrlr->resetting = true; 2260 2261 assert(nvme_ctrlr->reset_start_tsc == 0); 2262 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2263 2264 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2265 2266 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 2267 return 0; 2268 } 2269 2270 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2271 uint64_t num_blocks); 2272 2273 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2274 uint64_t num_blocks); 2275 2276 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2277 uint64_t src_offset_blocks, 2278 uint64_t num_blocks); 2279 2280 static void 2281 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2282 bool success) 2283 { 2284 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2285 struct spdk_bdev *bdev = bdev_io->bdev; 2286 int ret; 2287 2288 if (!success) { 2289 ret = -EINVAL; 2290 goto exit; 2291 } 2292 2293 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2294 ret = -ENXIO; 2295 goto exit; 2296 } 2297 2298 ret = bdev_nvme_readv(bio, 2299 bdev_io->u.bdev.iovs, 2300 bdev_io->u.bdev.iovcnt, 2301 bdev_io->u.bdev.md_buf, 2302 bdev_io->u.bdev.num_blocks, 2303 bdev_io->u.bdev.offset_blocks, 2304 bdev->dif_check_flags, 2305 bdev_io->u.bdev.ext_opts); 2306 2307 exit: 2308 if (spdk_unlikely(ret != 0)) { 2309 bdev_nvme_io_complete(bio, ret); 2310 } 2311 } 2312 2313 static inline void 2314 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2315 { 2316 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2317 struct spdk_bdev *bdev = bdev_io->bdev; 2318 struct nvme_bdev_io *nbdev_io_to_abort; 2319 int rc = 0; 2320 2321 switch (bdev_io->type) { 2322 case SPDK_BDEV_IO_TYPE_READ: 2323 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2324 rc = bdev_nvme_readv(nbdev_io, 2325 bdev_io->u.bdev.iovs, 2326 bdev_io->u.bdev.iovcnt, 2327 bdev_io->u.bdev.md_buf, 2328 bdev_io->u.bdev.num_blocks, 2329 bdev_io->u.bdev.offset_blocks, 2330 bdev->dif_check_flags, 2331 bdev_io->u.bdev.ext_opts); 2332 } else { 2333 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2334 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2335 rc = 0; 2336 } 2337 break; 2338 case SPDK_BDEV_IO_TYPE_WRITE: 2339 rc = bdev_nvme_writev(nbdev_io, 2340 bdev_io->u.bdev.iovs, 2341 bdev_io->u.bdev.iovcnt, 2342 bdev_io->u.bdev.md_buf, 2343 bdev_io->u.bdev.num_blocks, 2344 bdev_io->u.bdev.offset_blocks, 2345 bdev->dif_check_flags, 2346 bdev_io->u.bdev.ext_opts); 2347 break; 2348 case SPDK_BDEV_IO_TYPE_COMPARE: 2349 rc = bdev_nvme_comparev(nbdev_io, 2350 bdev_io->u.bdev.iovs, 2351 bdev_io->u.bdev.iovcnt, 2352 bdev_io->u.bdev.md_buf, 2353 bdev_io->u.bdev.num_blocks, 2354 bdev_io->u.bdev.offset_blocks, 2355 bdev->dif_check_flags); 2356 break; 2357 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2358 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2359 bdev_io->u.bdev.iovs, 2360 bdev_io->u.bdev.iovcnt, 2361 bdev_io->u.bdev.fused_iovs, 2362 bdev_io->u.bdev.fused_iovcnt, 2363 bdev_io->u.bdev.md_buf, 2364 bdev_io->u.bdev.num_blocks, 2365 bdev_io->u.bdev.offset_blocks, 2366 bdev->dif_check_flags); 2367 break; 2368 case SPDK_BDEV_IO_TYPE_UNMAP: 2369 rc = bdev_nvme_unmap(nbdev_io, 2370 bdev_io->u.bdev.offset_blocks, 2371 bdev_io->u.bdev.num_blocks); 2372 break; 2373 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2374 rc = bdev_nvme_write_zeroes(nbdev_io, 2375 
bdev_io->u.bdev.offset_blocks, 2376 bdev_io->u.bdev.num_blocks); 2377 break; 2378 case SPDK_BDEV_IO_TYPE_RESET: 2379 nbdev_io->io_path = NULL; 2380 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2381 break; 2382 case SPDK_BDEV_IO_TYPE_FLUSH: 2383 bdev_nvme_io_complete(nbdev_io, 0); 2384 break; 2385 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2386 rc = bdev_nvme_zone_appendv(nbdev_io, 2387 bdev_io->u.bdev.iovs, 2388 bdev_io->u.bdev.iovcnt, 2389 bdev_io->u.bdev.md_buf, 2390 bdev_io->u.bdev.num_blocks, 2391 bdev_io->u.bdev.offset_blocks, 2392 bdev->dif_check_flags); 2393 break; 2394 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2395 rc = bdev_nvme_get_zone_info(nbdev_io, 2396 bdev_io->u.zone_mgmt.zone_id, 2397 bdev_io->u.zone_mgmt.num_zones, 2398 bdev_io->u.zone_mgmt.buf); 2399 break; 2400 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2401 rc = bdev_nvme_zone_management(nbdev_io, 2402 bdev_io->u.zone_mgmt.zone_id, 2403 bdev_io->u.zone_mgmt.zone_action); 2404 break; 2405 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2406 nbdev_io->io_path = NULL; 2407 bdev_nvme_admin_passthru(nbdev_ch, 2408 nbdev_io, 2409 &bdev_io->u.nvme_passthru.cmd, 2410 bdev_io->u.nvme_passthru.buf, 2411 bdev_io->u.nvme_passthru.nbytes); 2412 break; 2413 case SPDK_BDEV_IO_TYPE_NVME_IO: 2414 rc = bdev_nvme_io_passthru(nbdev_io, 2415 &bdev_io->u.nvme_passthru.cmd, 2416 bdev_io->u.nvme_passthru.buf, 2417 bdev_io->u.nvme_passthru.nbytes); 2418 break; 2419 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2420 rc = bdev_nvme_io_passthru_md(nbdev_io, 2421 &bdev_io->u.nvme_passthru.cmd, 2422 bdev_io->u.nvme_passthru.buf, 2423 bdev_io->u.nvme_passthru.nbytes, 2424 bdev_io->u.nvme_passthru.md_buf, 2425 bdev_io->u.nvme_passthru.md_len); 2426 break; 2427 case SPDK_BDEV_IO_TYPE_ABORT: 2428 nbdev_io->io_path = NULL; 2429 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2430 bdev_nvme_abort(nbdev_ch, 2431 nbdev_io, 2432 nbdev_io_to_abort); 2433 break; 2434 case SPDK_BDEV_IO_TYPE_COPY: 2435 rc = bdev_nvme_copy(nbdev_io, 2436 bdev_io->u.bdev.offset_blocks, 2437 bdev_io->u.bdev.copy.src_offset_blocks, 2438 bdev_io->u.bdev.num_blocks); 2439 break; 2440 default: 2441 rc = -EINVAL; 2442 break; 2443 } 2444 2445 if (spdk_unlikely(rc != 0)) { 2446 bdev_nvme_io_complete(nbdev_io, rc); 2447 } 2448 } 2449 2450 static void 2451 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2452 { 2453 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2454 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2455 2456 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 2457 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 2458 } else { 2459 /* There are cases where submit_tsc != 0, i.e. retry I/O. 2460 * We need to update submit_tsc here. 2461 */ 2462 nbdev_io->submit_tsc = spdk_get_ticks(); 2463 } 2464 2465 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 2466 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 2467 if (spdk_unlikely(!nbdev_io->io_path)) { 2468 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 2469 bdev_nvme_io_complete(nbdev_io, -ENXIO); 2470 return; 2471 } 2472 2473 /* Admin commands do not use the optimal I/O path. 2474 * Simply fall through even if it is not found. 
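* The admin-capable cases in _bdev_nvme_submit_request() (NVME_ADMIN, ABORT and
* RESET) clear nbdev_io->io_path themselves and do their own controller selection.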
2475 */ 2476 } 2477 2478 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 2479 } 2480 2481 static bool 2482 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 2483 { 2484 struct nvme_bdev *nbdev = ctx; 2485 struct nvme_ns *nvme_ns; 2486 struct spdk_nvme_ns *ns; 2487 struct spdk_nvme_ctrlr *ctrlr; 2488 const struct spdk_nvme_ctrlr_data *cdata; 2489 2490 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2491 assert(nvme_ns != NULL); 2492 ns = nvme_ns->ns; 2493 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2494 2495 switch (io_type) { 2496 case SPDK_BDEV_IO_TYPE_READ: 2497 case SPDK_BDEV_IO_TYPE_WRITE: 2498 case SPDK_BDEV_IO_TYPE_RESET: 2499 case SPDK_BDEV_IO_TYPE_FLUSH: 2500 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2501 case SPDK_BDEV_IO_TYPE_NVME_IO: 2502 case SPDK_BDEV_IO_TYPE_ABORT: 2503 return true; 2504 2505 case SPDK_BDEV_IO_TYPE_COMPARE: 2506 return spdk_nvme_ns_supports_compare(ns); 2507 2508 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2509 return spdk_nvme_ns_get_md_size(ns) ? true : false; 2510 2511 case SPDK_BDEV_IO_TYPE_UNMAP: 2512 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2513 return cdata->oncs.dsm; 2514 2515 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2516 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2517 return cdata->oncs.write_zeroes; 2518 2519 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2520 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2521 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2522 return true; 2523 } 2524 return false; 2525 2526 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2527 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2528 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2529 2530 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2531 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2532 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2533 2534 case SPDK_BDEV_IO_TYPE_COPY: 2535 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2536 return cdata->oncs.copy; 2537 2538 default: 2539 return false; 2540 } 2541 } 2542 2543 static int 2544 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 2545 { 2546 struct nvme_qpair *nvme_qpair; 2547 struct spdk_io_channel *pg_ch; 2548 int rc; 2549 2550 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 2551 if (!nvme_qpair) { 2552 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 2553 return -1; 2554 } 2555 2556 TAILQ_INIT(&nvme_qpair->io_path_list); 2557 2558 nvme_qpair->ctrlr = nvme_ctrlr; 2559 nvme_qpair->ctrlr_ch = ctrlr_ch; 2560 2561 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2562 if (!pg_ch) { 2563 free(nvme_qpair); 2564 return -1; 2565 } 2566 2567 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 2568 2569 #ifdef SPDK_CONFIG_VTUNE 2570 nvme_qpair->group->collect_spin_stat = true; 2571 #else 2572 nvme_qpair->group->collect_spin_stat = false; 2573 #endif 2574 2575 rc = bdev_nvme_create_qpair(nvme_qpair); 2576 if (rc != 0) { 2577 /* nvme_ctrlr can't create IO qpair if connection is down. 2578 * 2579 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 2580 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 2581 * submitted IO will be queued until IO qpair is successfully created. 2582 * 2583 * Hence, if both are satisfied, ignore the failure. 
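* Otherwise the failure is propagated and the ctrlr channel creation itself fails.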
2584 */ 2585 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 2586 spdk_put_io_channel(pg_ch); 2587 free(nvme_qpair); 2588 return rc; 2589 } 2590 } 2591 2592 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2593 2594 ctrlr_ch->qpair = nvme_qpair; 2595 2596 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 2597 nvme_qpair->ctrlr->ref++; 2598 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 2599 2600 return 0; 2601 } 2602 2603 static int 2604 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2605 { 2606 struct nvme_ctrlr *nvme_ctrlr = io_device; 2607 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2608 2609 TAILQ_INIT(&ctrlr_ch->pending_resets); 2610 2611 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 2612 } 2613 2614 static void 2615 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 2616 { 2617 assert(nvme_qpair->group != NULL); 2618 2619 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2620 2621 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 2622 2623 nvme_ctrlr_release(nvme_qpair->ctrlr); 2624 2625 free(nvme_qpair); 2626 } 2627 2628 static void 2629 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2630 { 2631 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2632 struct nvme_qpair *nvme_qpair; 2633 2634 nvme_qpair = ctrlr_ch->qpair; 2635 assert(nvme_qpair != NULL); 2636 2637 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2638 2639 if (nvme_qpair->qpair != NULL) { 2640 if (ctrlr_ch->reset_iter == NULL) { 2641 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2642 } else { 2643 /* Skip current ctrlr_channel in a full reset sequence because 2644 * it is being deleted now. The qpair is already being disconnected. 2645 * We do not have to restart disconnecting it. 2646 */ 2647 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2648 } 2649 2650 /* We cannot release a reference to the poll group now. 2651 * The qpair may be disconnected asynchronously later. 2652 * We need to poll it until it is actually disconnected. 2653 * Just detach the qpair from the deleting ctrlr_channel. 
2654 */ 2655 nvme_qpair->ctrlr_ch = NULL; 2656 } else { 2657 assert(ctrlr_ch->reset_iter == NULL); 2658 2659 nvme_qpair_delete(nvme_qpair); 2660 } 2661 } 2662 2663 static void 2664 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2665 uint32_t iov_cnt, uint32_t seed, 2666 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2667 { 2668 struct nvme_poll_group *group = ctx; 2669 int rc; 2670 2671 assert(group->accel_channel != NULL); 2672 assert(cb_fn != NULL); 2673 2674 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2675 if (rc) { 2676 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2677 if (rc == -ENOMEM || rc == -EINVAL) { 2678 cb_fn(cb_arg, rc); 2679 } 2680 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2681 } 2682 } 2683 2684 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2685 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2686 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2687 }; 2688 2689 static int 2690 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2691 { 2692 struct nvme_poll_group *group = ctx_buf; 2693 2694 TAILQ_INIT(&group->qpair_list); 2695 2696 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2697 if (group->group == NULL) { 2698 return -1; 2699 } 2700 2701 group->accel_channel = spdk_accel_get_io_channel(); 2702 if (!group->accel_channel) { 2703 spdk_nvme_poll_group_destroy(group->group); 2704 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2705 group); 2706 return -1; 2707 } 2708 2709 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2710 2711 if (group->poller == NULL) { 2712 spdk_put_io_channel(group->accel_channel); 2713 spdk_nvme_poll_group_destroy(group->group); 2714 return -1; 2715 } 2716 2717 return 0; 2718 } 2719 2720 static void 2721 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2722 { 2723 struct nvme_poll_group *group = ctx_buf; 2724 2725 assert(TAILQ_EMPTY(&group->qpair_list)); 2726 2727 if (group->accel_channel) { 2728 spdk_put_io_channel(group->accel_channel); 2729 } 2730 2731 spdk_poller_unregister(&group->poller); 2732 if (spdk_nvme_poll_group_destroy(group->group)) { 2733 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2734 assert(false); 2735 } 2736 } 2737 2738 static struct spdk_io_channel * 2739 bdev_nvme_get_io_channel(void *ctx) 2740 { 2741 struct nvme_bdev *nvme_bdev = ctx; 2742 2743 return spdk_get_io_channel(nvme_bdev); 2744 } 2745 2746 static void * 2747 bdev_nvme_get_module_ctx(void *ctx) 2748 { 2749 struct nvme_bdev *nvme_bdev = ctx; 2750 struct nvme_ns *nvme_ns; 2751 2752 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2753 return NULL; 2754 } 2755 2756 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2757 if (!nvme_ns) { 2758 return NULL; 2759 } 2760 2761 return nvme_ns->ns; 2762 } 2763 2764 static const char * 2765 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2766 { 2767 switch (ana_state) { 2768 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2769 return "optimized"; 2770 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2771 return "non_optimized"; 2772 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2773 return "inaccessible"; 2774 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2775 return "persistent_loss"; 2776 case SPDK_NVME_ANA_CHANGE_STATE: 2777 return "change"; 2778 default: 2779 return NULL; 2780 } 2781 } 2782 
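/* Collect the memory domains of every controller backing this nvme_bdev.
 * The total number of domains found is returned even if it exceeds array_size,
 * so a caller (via spdk_bdev_get_memory_domains()) can query the count first and
 * then retry with a large enough array, e.g. (illustrative sketch only):
 *
 *	int n = spdk_bdev_get_memory_domains(bdev, NULL, 0);
 *	struct spdk_memory_domain **domains = calloc(n, sizeof(*domains));
 *	spdk_bdev_get_memory_domains(bdev, domains, n);
 */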
2783 static int 2784 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 2785 { 2786 struct spdk_memory_domain **_domains = NULL; 2787 struct nvme_bdev *nbdev = ctx; 2788 struct nvme_ns *nvme_ns; 2789 int i = 0, _array_size = array_size; 2790 int rc = 0; 2791 2792 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 2793 if (domains && array_size >= i) { 2794 _domains = &domains[i]; 2795 } else { 2796 _domains = NULL; 2797 } 2798 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 2799 if (rc > 0) { 2800 i += rc; 2801 if (_array_size >= rc) { 2802 _array_size -= rc; 2803 } else { 2804 _array_size = 0; 2805 } 2806 } else if (rc < 0) { 2807 return rc; 2808 } 2809 } 2810 2811 return i; 2812 } 2813 2814 static const char * 2815 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 2816 { 2817 if (nvme_ctrlr->destruct) { 2818 return "deleting"; 2819 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 2820 return "failed"; 2821 } else if (nvme_ctrlr->resetting) { 2822 return "resetting"; 2823 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 2824 return "reconnect_is_delayed"; 2825 } else { 2826 return "enabled"; 2827 } 2828 } 2829 2830 void 2831 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 2832 { 2833 struct spdk_nvme_transport_id *trid; 2834 const struct spdk_nvme_ctrlr_opts *opts; 2835 const struct spdk_nvme_ctrlr_data *cdata; 2836 2837 spdk_json_write_object_begin(w); 2838 2839 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 2840 2841 #ifdef SPDK_CONFIG_NVME_CUSE 2842 size_t cuse_name_size = 128; 2843 char cuse_name[cuse_name_size]; 2844 2845 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 2846 if (rc == 0) { 2847 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2848 } 2849 #endif 2850 trid = &nvme_ctrlr->active_path_id->trid; 2851 spdk_json_write_named_object_begin(w, "trid"); 2852 nvme_bdev_dump_trid_json(trid, w); 2853 spdk_json_write_object_end(w); 2854 2855 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 2856 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2857 2858 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 2859 spdk_json_write_named_object_begin(w, "host"); 2860 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 2861 spdk_json_write_named_string(w, "addr", opts->src_addr); 2862 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 2863 spdk_json_write_object_end(w); 2864 2865 spdk_json_write_object_end(w); 2866 } 2867 2868 static void 2869 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2870 struct nvme_ns *nvme_ns) 2871 { 2872 struct spdk_nvme_ns *ns; 2873 struct spdk_nvme_ctrlr *ctrlr; 2874 const struct spdk_nvme_ctrlr_data *cdata; 2875 const struct spdk_nvme_transport_id *trid; 2876 union spdk_nvme_vs_register vs; 2877 const struct spdk_nvme_ns_data *nsdata; 2878 char buf[128]; 2879 2880 ns = nvme_ns->ns; 2881 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2882 2883 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2884 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2885 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2886 2887 spdk_json_write_object_begin(w); 2888 2889 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2890 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2891 } 2892 2893 spdk_json_write_named_object_begin(w, "trid"); 2894 2895 nvme_bdev_dump_trid_json(trid, w); 2896 2897 spdk_json_write_object_end(w); 2898 2899 #ifdef 
SPDK_CONFIG_NVME_CUSE 2900 size_t cuse_name_size = 128; 2901 char cuse_name[cuse_name_size]; 2902 2903 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2904 cuse_name, &cuse_name_size); 2905 if (rc == 0) { 2906 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2907 } 2908 #endif 2909 2910 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2911 2912 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2913 2914 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2915 2916 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2917 spdk_str_trim(buf); 2918 spdk_json_write_named_string(w, "model_number", buf); 2919 2920 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2921 spdk_str_trim(buf); 2922 spdk_json_write_named_string(w, "serial_number", buf); 2923 2924 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2925 spdk_str_trim(buf); 2926 spdk_json_write_named_string(w, "firmware_revision", buf); 2927 2928 if (cdata->subnqn[0] != '\0') { 2929 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2930 } 2931 2932 spdk_json_write_named_object_begin(w, "oacs"); 2933 2934 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2935 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2936 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2937 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2938 2939 spdk_json_write_object_end(w); 2940 2941 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 2942 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 2943 2944 spdk_json_write_object_end(w); 2945 2946 spdk_json_write_named_object_begin(w, "vs"); 2947 2948 spdk_json_write_name(w, "nvme_version"); 2949 if (vs.bits.ter) { 2950 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2951 } else { 2952 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2953 } 2954 2955 spdk_json_write_object_end(w); 2956 2957 nsdata = spdk_nvme_ns_get_data(ns); 2958 2959 spdk_json_write_named_object_begin(w, "ns_data"); 2960 2961 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 2962 2963 if (cdata->cmic.ana_reporting) { 2964 spdk_json_write_named_string(w, "ana_state", 2965 _nvme_ana_state_str(nvme_ns->ana_state)); 2966 } 2967 2968 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 2969 2970 spdk_json_write_object_end(w); 2971 2972 if (cdata->oacs.security) { 2973 spdk_json_write_named_object_begin(w, "security"); 2974 2975 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 2976 2977 spdk_json_write_object_end(w); 2978 } 2979 2980 spdk_json_write_object_end(w); 2981 } 2982 2983 static const char * 2984 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 2985 { 2986 switch (nbdev->mp_policy) { 2987 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 2988 return "active_passive"; 2989 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 2990 return "active_active"; 2991 default: 2992 assert(false); 2993 return "invalid"; 2994 } 2995 } 2996 2997 static int 2998 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 2999 { 3000 struct nvme_bdev *nvme_bdev = ctx; 3001 struct nvme_ns *nvme_ns; 3002 3003 pthread_mutex_lock(&nvme_bdev->mutex); 3004 spdk_json_write_named_array_begin(w, "nvme"); 3005 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3006 nvme_namespace_info_json(w, nvme_ns); 3007 } 3008 spdk_json_write_array_end(w); 3009 spdk_json_write_named_string(w, "mp_policy", 
nvme_bdev_get_mp_policy_str(nvme_bdev)); 3010 pthread_mutex_unlock(&nvme_bdev->mutex); 3011 3012 return 0; 3013 } 3014 3015 static void 3016 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3017 { 3018 /* No config per bdev needed */ 3019 } 3020 3021 static uint64_t 3022 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3023 { 3024 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3025 struct nvme_io_path *io_path; 3026 struct nvme_poll_group *group; 3027 uint64_t spin_time = 0; 3028 3029 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3030 group = io_path->qpair->group; 3031 3032 if (!group || !group->collect_spin_stat) { 3033 continue; 3034 } 3035 3036 if (group->end_ticks != 0) { 3037 group->spin_ticks += (group->end_ticks - group->start_ticks); 3038 group->end_ticks = 0; 3039 } 3040 3041 spin_time += group->spin_ticks; 3042 group->start_ticks = 0; 3043 group->spin_ticks = 0; 3044 } 3045 3046 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3047 } 3048 3049 static void 3050 bdev_nvme_reset_device_stat(void *ctx) 3051 { 3052 struct nvme_bdev *nbdev = ctx; 3053 3054 if (nbdev->err_stat != NULL) { 3055 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3056 } 3057 } 3058 3059 /* JSON string should be lowercases and underscore delimited string. */ 3060 static void 3061 bdev_nvme_format_nvme_status(char *dst, const char *src) 3062 { 3063 char tmp[256]; 3064 3065 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3066 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3067 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3068 spdk_strlwr(dst); 3069 } 3070 3071 static void 3072 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3073 { 3074 struct nvme_bdev *nbdev = ctx; 3075 struct spdk_nvme_status status = {}; 3076 uint16_t sct, sc; 3077 char status_json[256]; 3078 const char *status_str; 3079 3080 if (nbdev->err_stat == NULL) { 3081 return; 3082 } 3083 3084 spdk_json_write_named_object_begin(w, "nvme_error"); 3085 3086 spdk_json_write_named_object_begin(w, "status_type"); 3087 for (sct = 0; sct < 8; sct++) { 3088 if (nbdev->err_stat->status_type[sct] == 0) { 3089 continue; 3090 } 3091 status.sct = sct; 3092 3093 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3094 assert(status_str != NULL); 3095 bdev_nvme_format_nvme_status(status_json, status_str); 3096 3097 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3098 } 3099 spdk_json_write_object_end(w); 3100 3101 spdk_json_write_named_object_begin(w, "status_code"); 3102 for (sct = 0; sct < 4; sct++) { 3103 status.sct = sct; 3104 for (sc = 0; sc < 256; sc++) { 3105 if (nbdev->err_stat->status[sct][sc] == 0) { 3106 continue; 3107 } 3108 status.sc = sc; 3109 3110 status_str = spdk_nvme_cpl_get_status_string(&status); 3111 assert(status_str != NULL); 3112 bdev_nvme_format_nvme_status(status_json, status_str); 3113 3114 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3115 } 3116 } 3117 spdk_json_write_object_end(w); 3118 3119 spdk_json_write_object_end(w); 3120 } 3121 3122 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3123 .destruct = bdev_nvme_destruct, 3124 .submit_request = bdev_nvme_submit_request, 3125 .io_type_supported = bdev_nvme_io_type_supported, 3126 .get_io_channel = bdev_nvme_get_io_channel, 3127 .dump_info_json = bdev_nvme_dump_info_json, 3128 .write_config_json = bdev_nvme_write_config_json, 3129 .get_spin_time = bdev_nvme_get_spin_time, 3130 
.get_module_ctx = bdev_nvme_get_module_ctx, 3131 .get_memory_domains = bdev_nvme_get_memory_domains, 3132 .reset_device_stat = bdev_nvme_reset_device_stat, 3133 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3134 }; 3135 3136 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3137 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3138 3139 static int 3140 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3141 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3142 { 3143 struct spdk_nvme_ana_group_descriptor *copied_desc; 3144 uint8_t *orig_desc; 3145 uint32_t i, desc_size, copy_len; 3146 int rc = 0; 3147 3148 if (nvme_ctrlr->ana_log_page == NULL) { 3149 return -EINVAL; 3150 } 3151 3152 copied_desc = nvme_ctrlr->copied_ana_desc; 3153 3154 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3155 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3156 3157 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3158 memcpy(copied_desc, orig_desc, copy_len); 3159 3160 rc = cb_fn(copied_desc, cb_arg); 3161 if (rc != 0) { 3162 break; 3163 } 3164 3165 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3166 copied_desc->num_of_nsid * sizeof(uint32_t); 3167 orig_desc += desc_size; 3168 copy_len -= desc_size; 3169 } 3170 3171 return rc; 3172 } 3173 3174 static int 3175 nvme_ns_ana_transition_timedout(void *ctx) 3176 { 3177 struct nvme_ns *nvme_ns = ctx; 3178 3179 spdk_poller_unregister(&nvme_ns->anatt_timer); 3180 nvme_ns->ana_transition_timedout = true; 3181 3182 return SPDK_POLLER_BUSY; 3183 } 3184 3185 static void 3186 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3187 const struct spdk_nvme_ana_group_descriptor *desc) 3188 { 3189 const struct spdk_nvme_ctrlr_data *cdata; 3190 3191 nvme_ns->ana_group_id = desc->ana_group_id; 3192 nvme_ns->ana_state = desc->ana_state; 3193 nvme_ns->ana_state_updating = false; 3194 3195 switch (nvme_ns->ana_state) { 3196 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3197 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3198 nvme_ns->ana_transition_timedout = false; 3199 spdk_poller_unregister(&nvme_ns->anatt_timer); 3200 break; 3201 3202 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3203 case SPDK_NVME_ANA_CHANGE_STATE: 3204 if (nvme_ns->anatt_timer != NULL) { 3205 break; 3206 } 3207 3208 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3209 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3210 nvme_ns, 3211 cdata->anatt * SPDK_SEC_TO_USEC); 3212 break; 3213 default: 3214 break; 3215 } 3216 } 3217 3218 static int 3219 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3220 { 3221 struct nvme_ns *nvme_ns = cb_arg; 3222 uint32_t i; 3223 3224 for (i = 0; i < desc->num_of_nsid; i++) { 3225 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3226 continue; 3227 } 3228 3229 _nvme_ns_set_ana_state(nvme_ns, desc); 3230 return 1; 3231 } 3232 3233 return 0; 3234 } 3235 3236 static void 3237 merge_nsid_sn_strings(const char *sn, char *nsid, int8_t *out) 3238 { 3239 int i = 0, j = 0; 3240 int sn_len = strlen(sn), nsid_len = strlen(nsid); 3241 3242 for (i = 0; i < nsid_len; i++) { 3243 out[i] = nsid[i]; 3244 } 3245 3246 /* Since last few characters are more likely to be unique, 3247 * even among the devices from the same manufacturer, 3248 * we use serial number in reverse. We also skip the 3249 * terminating character of serial number string. 
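* For example (hypothetical values), nsid "1" and sn "S123 " merge to "1321S":
* the nsid string first, then the serial number reversed with spaces skipped.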
*/ 3250 for (j = sn_len - 1; j >= 0; j--) { 3251 if (i == SPDK_UUID_STRING_LEN - 1) { 3252 break; 3253 } 3254 3255 /* There may be a lot of spaces in serial number string 3256 * and they will generate equally large number of the 3257 * same character, so just skip them. */ 3258 if (sn[j] == ' ') { 3259 continue; 3260 } 3261 3262 out[i] = sn[j]; 3263 i++; 3264 } 3265 } 3266 3267 /* Dictionary of characters for UUID generation. */ 3268 static char dict[17] = "0123456789abcdef"; 3269 3270 static struct spdk_uuid 3271 nvme_generate_uuid(const char *sn, uint32_t nsid) 3272 { 3273 struct spdk_uuid new_uuid; 3274 char buf[SPDK_UUID_STRING_LEN] = {'\0'}, merged_str[SPDK_UUID_STRING_LEN] = {'\0'}; 3275 char nsid_str[NSID_STR_LEN] = {'\0'}, tmp; 3276 uint64_t i = 0, j = 0, rem, dict_size = strlen(dict); 3277 int rc; 3278 3279 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3280 3281 snprintf(nsid_str, NSID_STR_LEN, "%" PRIu32, nsid); 3282 3283 merge_nsid_sn_strings(sn, nsid_str, merged_str); 3284 3285 while (i < SPDK_UUID_STRING_LEN) { 3286 /* If 'j' is equal to indexes, where '-' should be placed, 3287 * insert this character and continue the loop without 3288 * increasing 'i'. */ 3289 if ((j == 8 || j == 13 || j == 18 || j == 23)) { 3290 buf[j] = '-'; 3291 j++; 3292 3293 /* Break, if we ran out of characters in 3294 * serial number and namespace ID string. */ 3295 if (j == strlen(merged_str)) { 3296 break; 3297 } 3298 continue; 3299 } 3300 3301 /* Change character in shuffled string to lower case. */ 3302 tmp = tolower(merged_str[i]); 3303 3304 if (isxdigit(tmp)) { 3305 /* If character can be represented by a hex 3306 * value as is, copy it to the result buffer. */ 3307 buf[j] = tmp; 3308 } else { 3309 /* Otherwise get its code and divide it 3310 * by the number of elements in dictionary. 3311 * The remainder will be the index of dictionary 3312 * character to replace tmp value with. */ 3313 rem = tmp % dict_size; 3314 buf[j] = dict[rem]; 3315 } 3316 3317 i++; 3318 j++; 3319 3320 /* Break, if we ran out of characters in 3321 * serial number and namespace ID string. */ 3322 if (j == strlen(merged_str)) { 3323 break; 3324 } 3325 } 3326 3327 /* If there are not enough values to fill UUID, 3328 * the rest is taken from dictionary characters. 
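* i.e. the remainder of the 36-character UUID string is padded with the dictionary
* repeated in order, keeping the '-' separators at offsets 8, 13, 18 and 23.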
*/ 3329 i = 0; 3330 while (j < SPDK_UUID_STRING_LEN - 1) { 3331 if ((j == 8 || j == 13 || j == 18 || j == 23)) { 3332 buf[j] = '-'; 3333 j++; 3334 continue; 3335 } 3336 buf[j] = dict[i % dict_size]; 3337 i++; 3338 j++; 3339 } 3340 3341 rc = spdk_uuid_parse(&new_uuid, buf); 3342 if (rc != 0) { 3343 SPDK_ERRLOG("Unexpected spdk_uuid_parse failure on %s.\n", buf); 3344 assert(false); 3345 } 3346 3347 return new_uuid; 3348 } 3349 3350 static int 3351 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3352 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3353 uint32_t prchk_flags, void *ctx) 3354 { 3355 const struct spdk_uuid *uuid; 3356 const uint8_t *nguid; 3357 const struct spdk_nvme_ctrlr_data *cdata; 3358 const struct spdk_nvme_ns_data *nsdata; 3359 const struct spdk_nvme_ctrlr_opts *opts; 3360 enum spdk_nvme_csi csi; 3361 uint32_t atomic_bs, phys_bs, bs; 3362 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3363 3364 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3365 csi = spdk_nvme_ns_get_csi(ns); 3366 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3367 3368 switch (csi) { 3369 case SPDK_NVME_CSI_NVM: 3370 disk->product_name = "NVMe disk"; 3371 break; 3372 case SPDK_NVME_CSI_ZNS: 3373 disk->product_name = "NVMe ZNS disk"; 3374 disk->zoned = true; 3375 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3376 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3377 spdk_nvme_ns_get_extended_sector_size(ns); 3378 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 3379 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 3380 break; 3381 default: 3382 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 3383 return -ENOTSUP; 3384 } 3385 3386 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 3387 if (!disk->name) { 3388 return -ENOMEM; 3389 } 3390 3391 disk->write_cache = 0; 3392 if (cdata->vwc.present) { 3393 /* Enable if the Volatile Write Cache exists */ 3394 disk->write_cache = 1; 3395 } 3396 if (cdata->oncs.write_zeroes) { 3397 disk->max_write_zeroes = UINT16_MAX + 1; 3398 } 3399 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 3400 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 3401 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 3402 /* NVMe driver will split one request into multiple requests 3403 * based on MDTS and stripe boundary, the bdev layer will use 3404 * max_segment_size and max_num_segments to split one big IO 3405 * into multiple requests, then small request can't run out 3406 * of NVMe internal requests data structure. 
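* Capping max_num_segments at io_queue_requests / 2 below keeps the number of
* split child requests within the queue's request pool.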
3407 */ 3408 if (opts && opts->io_queue_requests) { 3409 disk->max_num_segments = opts->io_queue_requests / 2; 3410 } 3411 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 3412 3413 nguid = spdk_nvme_ns_get_nguid(ns); 3414 if (!nguid) { 3415 uuid = spdk_nvme_ns_get_uuid(ns); 3416 if (uuid) { 3417 disk->uuid = *uuid; 3418 } else if (g_opts.generate_uuids) { 3419 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 3420 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 3421 } 3422 } else { 3423 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 3424 } 3425 3426 nsdata = spdk_nvme_ns_get_data(ns); 3427 bs = spdk_nvme_ns_get_sector_size(ns); 3428 atomic_bs = bs; 3429 phys_bs = bs; 3430 if (nsdata->nabo == 0) { 3431 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 3432 atomic_bs = bs * (1 + nsdata->nawupf); 3433 } else { 3434 atomic_bs = bs * (1 + cdata->awupf); 3435 } 3436 } 3437 if (nsdata->nsfeat.optperf) { 3438 phys_bs = bs * (1 + nsdata->npwg); 3439 } 3440 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 3441 3442 disk->md_len = spdk_nvme_ns_get_md_size(ns); 3443 if (disk->md_len != 0) { 3444 disk->md_interleave = nsdata->flbas.extended; 3445 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 3446 if (disk->dif_type != SPDK_DIF_DISABLE) { 3447 disk->dif_is_head_of_md = nsdata->dps.md_start; 3448 disk->dif_check_flags = prchk_flags; 3449 } 3450 } 3451 3452 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 3453 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 3454 disk->acwu = 0; 3455 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 3456 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 3457 } else { 3458 disk->acwu = cdata->acwu + 1; /* 0-based */ 3459 } 3460 3461 if (cdata->oncs.copy) { 3462 /* For now bdev interface allows only single segment copy */ 3463 disk->max_copy = nsdata->mssrl; 3464 } 3465 3466 disk->ctxt = ctx; 3467 disk->fn_table = &nvmelib_fn_table; 3468 disk->module = &nvme_if; 3469 3470 return 0; 3471 } 3472 3473 static int 3474 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3475 { 3476 struct nvme_bdev *bdev; 3477 int rc; 3478 3479 bdev = calloc(1, sizeof(*bdev)); 3480 if (!bdev) { 3481 SPDK_ERRLOG("bdev calloc() failed\n"); 3482 return -ENOMEM; 3483 } 3484 3485 if (g_opts.nvme_error_stat) { 3486 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 3487 if (!bdev->err_stat) { 3488 SPDK_ERRLOG("err_stat calloc() failed\n"); 3489 free(bdev); 3490 return -ENOMEM; 3491 } 3492 } 3493 3494 rc = pthread_mutex_init(&bdev->mutex, NULL); 3495 if (rc != 0) { 3496 free(bdev->err_stat); 3497 free(bdev); 3498 return rc; 3499 } 3500 3501 bdev->ref = 1; 3502 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 3503 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 3504 bdev->rr_min_io = UINT32_MAX; 3505 TAILQ_INIT(&bdev->nvme_ns_list); 3506 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3507 bdev->opal = nvme_ctrlr->opal_dev != NULL; 3508 3509 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 3510 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 3511 if (rc != 0) { 3512 SPDK_ERRLOG("Failed to create NVMe disk\n"); 3513 pthread_mutex_destroy(&bdev->mutex); 3514 free(bdev->err_stat); 3515 free(bdev); 3516 return rc; 3517 } 3518 3519 spdk_io_device_register(bdev, 3520 bdev_nvme_create_bdev_channel_cb, 3521 bdev_nvme_destroy_bdev_channel_cb, 3522 sizeof(struct nvme_bdev_channel), 3523 bdev->disk.name); 3524 3525 rc = 
spdk_bdev_register(&bdev->disk); 3526 if (rc != 0) { 3527 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 3528 spdk_io_device_unregister(bdev, NULL); 3529 pthread_mutex_destroy(&bdev->mutex); 3530 free(bdev->disk.name); 3531 free(bdev->err_stat); 3532 free(bdev); 3533 return rc; 3534 } 3535 3536 nvme_ns->bdev = bdev; 3537 bdev->nsid = nvme_ns->id; 3538 3539 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 3540 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 3541 3542 return 0; 3543 } 3544 3545 static bool 3546 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 3547 { 3548 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 3549 const struct spdk_uuid *uuid1, *uuid2; 3550 3551 nsdata1 = spdk_nvme_ns_get_data(ns1); 3552 nsdata2 = spdk_nvme_ns_get_data(ns2); 3553 uuid1 = spdk_nvme_ns_get_uuid(ns1); 3554 uuid2 = spdk_nvme_ns_get_uuid(ns2); 3555 3556 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 3557 nsdata1->eui64 == nsdata2->eui64 && 3558 ((uuid1 == NULL && uuid2 == NULL) || 3559 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 3560 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 3561 } 3562 3563 static bool 3564 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3565 struct spdk_nvme_ctrlr_opts *opts) 3566 { 3567 struct nvme_probe_skip_entry *entry; 3568 3569 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 3570 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3571 return false; 3572 } 3573 } 3574 3575 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 3576 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 3577 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 3578 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 3579 opts->disable_read_ana_log_page = true; 3580 3581 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 3582 3583 return true; 3584 } 3585 3586 static void 3587 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 3588 { 3589 struct nvme_ctrlr *nvme_ctrlr = ctx; 3590 3591 if (spdk_nvme_cpl_is_error(cpl)) { 3592 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 3593 cpl->status.sct); 3594 bdev_nvme_reset(nvme_ctrlr); 3595 } else if (cpl->cdw0 & 0x1) { 3596 SPDK_WARNLOG("Specified command could not be aborted.\n"); 3597 bdev_nvme_reset(nvme_ctrlr); 3598 } 3599 } 3600 3601 static void 3602 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 3603 struct spdk_nvme_qpair *qpair, uint16_t cid) 3604 { 3605 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3606 union spdk_nvme_csts_register csts; 3607 int rc; 3608 3609 assert(nvme_ctrlr->ctrlr == ctrlr); 3610 3611 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 3612 3613 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 3614 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 3615 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 3616 * completion recursively. 
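* For a fabrics admin command timeout (qpair == NULL) the CSTS check is skipped
* and we fall through to the configured action_on_timeout handling below.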
3617 */ 3618 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 3619 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 3620 if (csts.bits.cfs) { 3621 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 3622 bdev_nvme_reset(nvme_ctrlr); 3623 return; 3624 } 3625 } 3626 3627 switch (g_opts.action_on_timeout) { 3628 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 3629 if (qpair) { 3630 /* Don't send abort to ctrlr when ctrlr is not available. */ 3631 pthread_mutex_lock(&nvme_ctrlr->mutex); 3632 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 3633 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3634 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 3635 return; 3636 } 3637 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3638 3639 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 3640 nvme_abort_cpl, nvme_ctrlr); 3641 if (rc == 0) { 3642 return; 3643 } 3644 3645 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 3646 } 3647 3648 /* FALLTHROUGH */ 3649 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 3650 bdev_nvme_reset(nvme_ctrlr); 3651 break; 3652 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 3653 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 3654 break; 3655 default: 3656 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 3657 break; 3658 } 3659 } 3660 3661 static struct nvme_ns * 3662 nvme_ns_alloc(void) 3663 { 3664 struct nvme_ns *nvme_ns; 3665 3666 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3667 if (nvme_ns == NULL) { 3668 return NULL; 3669 } 3670 3671 if (g_opts.io_path_stat) { 3672 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 3673 if (nvme_ns->stat == NULL) { 3674 free(nvme_ns); 3675 return NULL; 3676 } 3677 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 3678 } 3679 3680 return nvme_ns; 3681 } 3682 3683 static void 3684 nvme_ns_free(struct nvme_ns *nvme_ns) 3685 { 3686 free(nvme_ns->stat); 3687 free(nvme_ns); 3688 } 3689 3690 static void 3691 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 3692 { 3693 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3694 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 3695 3696 if (rc == 0) { 3697 nvme_ns->probe_ctx = NULL; 3698 pthread_mutex_lock(&nvme_ctrlr->mutex); 3699 nvme_ctrlr->ref++; 3700 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3701 } else { 3702 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3703 nvme_ns_free(nvme_ns); 3704 } 3705 3706 if (ctx) { 3707 ctx->populates_in_progress--; 3708 if (ctx->populates_in_progress == 0) { 3709 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3710 } 3711 } 3712 } 3713 3714 static void 3715 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 3716 { 3717 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3718 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3719 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3720 int rc; 3721 3722 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 3723 if (rc != 0) { 3724 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 3725 } 3726 3727 spdk_for_each_channel_continue(i, rc); 3728 } 3729 3730 static void 3731 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 3732 { 3733 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3734 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3735 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3736 struct nvme_io_path *io_path; 3737 3738 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 
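/* Not every channel necessarily has an io_path for this namespace (e.g. when this
 * runs as rollback after a failed add), so a missing entry is simply skipped.
 */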
3739 if (io_path != NULL) { 3740 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 3741 } 3742 3743 spdk_for_each_channel_continue(i, 0); 3744 } 3745 3746 static void 3747 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 3748 { 3749 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3750 3751 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 3752 } 3753 3754 static void 3755 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 3756 { 3757 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3758 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 3759 3760 if (status == 0) { 3761 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 3762 } else { 3763 /* Delete the added io_paths and fail populating the namespace. */ 3764 spdk_for_each_channel(bdev, 3765 bdev_nvme_delete_io_path, 3766 nvme_ns, 3767 bdev_nvme_add_io_path_failed); 3768 } 3769 } 3770 3771 static int 3772 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 3773 { 3774 struct nvme_ns *tmp_ns; 3775 const struct spdk_nvme_ns_data *nsdata; 3776 3777 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 3778 if (!nsdata->nmic.can_share) { 3779 SPDK_ERRLOG("Namespace cannot be shared.\n"); 3780 return -EINVAL; 3781 } 3782 3783 pthread_mutex_lock(&bdev->mutex); 3784 3785 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 3786 assert(tmp_ns != NULL); 3787 3788 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 3789 pthread_mutex_unlock(&bdev->mutex); 3790 SPDK_ERRLOG("Namespaces are not identical.\n"); 3791 return -EINVAL; 3792 } 3793 3794 bdev->ref++; 3795 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3796 nvme_ns->bdev = bdev; 3797 3798 pthread_mutex_unlock(&bdev->mutex); 3799 3800 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
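* Every existing nvme_bdev_channel gets an io_path for the new namespace; if any
* channel fails, bdev_nvme_add_io_path_done() deletes the paths already added and
* fails the populate.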
*/ 3801 spdk_for_each_channel(bdev, 3802 bdev_nvme_add_io_path, 3803 nvme_ns, 3804 bdev_nvme_add_io_path_done); 3805 3806 return 0; 3807 } 3808 3809 static void 3810 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3811 { 3812 struct spdk_nvme_ns *ns; 3813 struct nvme_bdev *bdev; 3814 int rc = 0; 3815 3816 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 3817 if (!ns) { 3818 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 3819 rc = -EINVAL; 3820 goto done; 3821 } 3822 3823 nvme_ns->ns = ns; 3824 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3825 3826 if (nvme_ctrlr->ana_log_page != NULL) { 3827 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 3828 } 3829 3830 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 3831 if (bdev == NULL) { 3832 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 3833 } else { 3834 rc = nvme_bdev_add_ns(bdev, nvme_ns); 3835 if (rc == 0) { 3836 return; 3837 } 3838 } 3839 done: 3840 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 3841 } 3842 3843 static void 3844 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 3845 { 3846 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3847 3848 assert(nvme_ctrlr != NULL); 3849 3850 pthread_mutex_lock(&nvme_ctrlr->mutex); 3851 3852 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3853 3854 if (nvme_ns->bdev != NULL) { 3855 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3856 return; 3857 } 3858 3859 nvme_ns_free(nvme_ns); 3860 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3861 3862 nvme_ctrlr_release(nvme_ctrlr); 3863 } 3864 3865 static void 3866 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 3867 { 3868 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3869 3870 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3871 } 3872 3873 static void 3874 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3875 { 3876 struct nvme_bdev *bdev; 3877 3878 spdk_poller_unregister(&nvme_ns->anatt_timer); 3879 3880 bdev = nvme_ns->bdev; 3881 if (bdev != NULL) { 3882 pthread_mutex_lock(&bdev->mutex); 3883 3884 assert(bdev->ref > 0); 3885 bdev->ref--; 3886 if (bdev->ref == 0) { 3887 pthread_mutex_unlock(&bdev->mutex); 3888 3889 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 3890 } else { 3891 /* spdk_bdev_unregister() is not called until the last nvme_ns is 3892 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 3893 * and clear nvme_ns->bdev here. 3894 */ 3895 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 3896 nvme_ns->bdev = NULL; 3897 3898 pthread_mutex_unlock(&bdev->mutex); 3899 3900 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 3901 * we call depopulate_namespace_done() to avoid use-after-free. 3902 */ 3903 spdk_for_each_channel(bdev, 3904 bdev_nvme_delete_io_path, 3905 nvme_ns, 3906 bdev_nvme_delete_io_path_done); 3907 return; 3908 } 3909 } 3910 3911 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3912 } 3913 3914 static void 3915 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3916 struct nvme_async_probe_ctx *ctx) 3917 { 3918 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3919 struct nvme_ns *nvme_ns, *next; 3920 struct spdk_nvme_ns *ns; 3921 struct nvme_bdev *bdev; 3922 uint32_t nsid; 3923 int rc; 3924 uint64_t num_sectors; 3925 3926 if (ctx) { 3927 /* Initialize this count to 1 to handle the populate functions 3928 * calling nvme_ctrlr_populate_namespace_done() immediately. 
3929 */ 3930 ctx->populates_in_progress = 1; 3931 } 3932 3933 /* First loop over our existing namespaces and see if they have been 3934 * removed. */ 3935 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3936 while (nvme_ns != NULL) { 3937 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3938 3939 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 3940 /* NS is still there but attributes may have changed */ 3941 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 3942 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 3943 bdev = nvme_ns->bdev; 3944 assert(bdev != NULL); 3945 if (bdev->disk.blockcnt != num_sectors) { 3946 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 3947 nvme_ns->id, 3948 bdev->disk.name, 3949 bdev->disk.blockcnt, 3950 num_sectors); 3951 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 3952 if (rc != 0) { 3953 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 3954 bdev->disk.name, rc); 3955 } 3956 } 3957 } else { 3958 /* Namespace was removed */ 3959 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3960 } 3961 3962 nvme_ns = next; 3963 } 3964 3965 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3966 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3967 while (nsid != 0) { 3968 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3969 3970 if (nvme_ns == NULL) { 3971 /* Found a new one */ 3972 nvme_ns = nvme_ns_alloc(); 3973 if (nvme_ns == NULL) { 3974 SPDK_ERRLOG("Failed to allocate namespace\n"); 3975 /* This just fails to attach the namespace. It may work on a future attempt. */ 3976 continue; 3977 } 3978 3979 nvme_ns->id = nsid; 3980 nvme_ns->ctrlr = nvme_ctrlr; 3981 3982 nvme_ns->bdev = NULL; 3983 3984 if (ctx) { 3985 ctx->populates_in_progress++; 3986 } 3987 nvme_ns->probe_ctx = ctx; 3988 3989 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3990 3991 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3992 } 3993 3994 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3995 } 3996 3997 if (ctx) { 3998 /* Decrement this count now that the loop is over to account 3999 * for the one we started with. If the count is then 0, we 4000 * know any populate_namespace functions completed immediately, 4001 * so we'll kick the callback here. 
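 * Worked example: with two newly found namespaces that populate asynchronously,
 * the counter goes 1 -> 2 -> 3 during the loop, drops to 2 at the decrement
 * below, and the done callback fires later from the second
 * nvme_ctrlr_populate_namespace_done(). If both populated inline instead, the
 * counter is already back to 1 by this point, the decrement reaches 0, and the
 * callback is kicked right here.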
4002 */ 4003 ctx->populates_in_progress--; 4004 if (ctx->populates_in_progress == 0) { 4005 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4006 } 4007 } 4008 4009 } 4010 4011 static void 4012 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4013 { 4014 struct nvme_ns *nvme_ns, *tmp; 4015 4016 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4017 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4018 } 4019 } 4020 4021 static uint32_t 4022 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4023 { 4024 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4025 const struct spdk_nvme_ctrlr_data *cdata; 4026 uint32_t nsid, ns_count = 0; 4027 4028 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4029 4030 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4031 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4032 ns_count++; 4033 } 4034 4035 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4036 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4037 sizeof(uint32_t); 4038 } 4039 4040 static int 4041 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4042 void *cb_arg) 4043 { 4044 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4045 struct nvme_ns *nvme_ns; 4046 uint32_t i, nsid; 4047 4048 for (i = 0; i < desc->num_of_nsid; i++) { 4049 nsid = desc->nsid[i]; 4050 if (nsid == 0) { 4051 continue; 4052 } 4053 4054 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4055 4056 assert(nvme_ns != NULL); 4057 if (nvme_ns == NULL) { 4058 /* Target told us that an inactive namespace had an ANA change */ 4059 continue; 4060 } 4061 4062 _nvme_ns_set_ana_state(nvme_ns, desc); 4063 } 4064 4065 return 0; 4066 } 4067 4068 static void 4069 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4070 { 4071 struct nvme_ns *nvme_ns; 4072 4073 spdk_free(nvme_ctrlr->ana_log_page); 4074 nvme_ctrlr->ana_log_page = NULL; 4075 4076 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4077 nvme_ns != NULL; 4078 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4079 nvme_ns->ana_state_updating = false; 4080 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4081 } 4082 } 4083 4084 static void 4085 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4086 { 4087 struct nvme_ctrlr *nvme_ctrlr = ctx; 4088 4089 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4090 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4091 nvme_ctrlr); 4092 } else { 4093 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4094 } 4095 4096 pthread_mutex_lock(&nvme_ctrlr->mutex); 4097 4098 assert(nvme_ctrlr->ana_log_page_updating == true); 4099 nvme_ctrlr->ana_log_page_updating = false; 4100 4101 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4102 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4103 4104 nvme_ctrlr_unregister(nvme_ctrlr); 4105 } else { 4106 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4107 4108 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4109 } 4110 } 4111 4112 static int 4113 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4114 { 4115 uint32_t ana_log_page_size; 4116 int rc; 4117 4118 if (nvme_ctrlr->ana_log_page == NULL) { 4119 return -EINVAL; 4120 } 4121 4122 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4123 4124 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4125 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4126 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4127 
return -EINVAL; 4128 } 4129 4130 pthread_mutex_lock(&nvme_ctrlr->mutex); 4131 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4132 nvme_ctrlr->ana_log_page_updating) { 4133 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4134 return -EBUSY; 4135 } 4136 4137 nvme_ctrlr->ana_log_page_updating = true; 4138 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4139 4140 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4141 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4142 SPDK_NVME_GLOBAL_NS_TAG, 4143 nvme_ctrlr->ana_log_page, 4144 ana_log_page_size, 0, 4145 nvme_ctrlr_read_ana_log_page_done, 4146 nvme_ctrlr); 4147 if (rc != 0) { 4148 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4149 } 4150 4151 return rc; 4152 } 4153 4154 static void 4155 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4156 { 4157 } 4158 4159 struct bdev_nvme_set_preferred_path_ctx { 4160 struct spdk_bdev_desc *desc; 4161 struct nvme_ns *nvme_ns; 4162 bdev_nvme_set_preferred_path_cb cb_fn; 4163 void *cb_arg; 4164 }; 4165 4166 static void 4167 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4168 { 4169 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4170 4171 assert(ctx != NULL); 4172 assert(ctx->desc != NULL); 4173 assert(ctx->cb_fn != NULL); 4174 4175 spdk_bdev_close(ctx->desc); 4176 4177 ctx->cb_fn(ctx->cb_arg, status); 4178 4179 free(ctx); 4180 } 4181 4182 static void 4183 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4184 { 4185 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4186 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4187 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4188 struct nvme_io_path *io_path, *prev; 4189 4190 prev = NULL; 4191 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4192 if (io_path->nvme_ns == ctx->nvme_ns) { 4193 break; 4194 } 4195 prev = io_path; 4196 } 4197 4198 if (io_path != NULL) { 4199 if (prev != NULL) { 4200 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4201 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4202 } 4203 4204 /* We can set io_path to nbdev_ch->current_io_path directly here. 4205 * However, it needs to be conditional. To simplify the code, 4206 * just clear nbdev_ch->current_io_path and let find_io_path() 4207 * fill it. 4208 * 4209 * Automatic failback may be disabled. Hence even if the io_path is 4210 * already at the head, clear nbdev_ch->current_io_path. 4211 */ 4212 bdev_nvme_clear_current_io_path(nbdev_ch); 4213 } 4214 4215 spdk_for_each_channel_continue(i, 0); 4216 } 4217 4218 static struct nvme_ns * 4219 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4220 { 4221 struct nvme_ns *nvme_ns, *prev; 4222 const struct spdk_nvme_ctrlr_data *cdata; 4223 4224 prev = NULL; 4225 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4226 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4227 4228 if (cdata->cntlid == cntlid) { 4229 break; 4230 } 4231 prev = nvme_ns; 4232 } 4233 4234 if (nvme_ns != NULL && prev != NULL) { 4235 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4236 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4237 } 4238 4239 return nvme_ns; 4240 } 4241 4242 /* This function supports only multipath mode. There is only a single I/O path 4243 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4244 * head of the I/O path list for each NVMe bdev channel. 
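 * A rough caller sketch (the bdev name, cntlid and callback below are
 * illustrative assumptions, normally supplied by the RPC layer):
 *
 *   static void pref_done(void *cb_arg, int rc) { ... report rc ... }
 *
 *   bdev_nvme_set_preferred_path("Nvme0n1", 3, pref_done, NULL);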
4245 * 4246 * NVMe bdev channel may be acquired after completing this function. move the 4247 * matched namespace to the head of the namespace list for the NVMe bdev too. 4248 */ 4249 void 4250 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4251 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4252 { 4253 struct bdev_nvme_set_preferred_path_ctx *ctx; 4254 struct spdk_bdev *bdev; 4255 struct nvme_bdev *nbdev; 4256 int rc = 0; 4257 4258 assert(cb_fn != NULL); 4259 4260 ctx = calloc(1, sizeof(*ctx)); 4261 if (ctx == NULL) { 4262 SPDK_ERRLOG("Failed to alloc context.\n"); 4263 rc = -ENOMEM; 4264 goto err_alloc; 4265 } 4266 4267 ctx->cb_fn = cb_fn; 4268 ctx->cb_arg = cb_arg; 4269 4270 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4271 if (rc != 0) { 4272 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4273 goto err_open; 4274 } 4275 4276 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4277 4278 if (bdev->module != &nvme_if) { 4279 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4280 rc = -ENODEV; 4281 goto err_bdev; 4282 } 4283 4284 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4285 4286 pthread_mutex_lock(&nbdev->mutex); 4287 4288 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4289 if (ctx->nvme_ns == NULL) { 4290 pthread_mutex_unlock(&nbdev->mutex); 4291 4292 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4293 rc = -ENODEV; 4294 goto err_bdev; 4295 } 4296 4297 pthread_mutex_unlock(&nbdev->mutex); 4298 4299 spdk_for_each_channel(nbdev, 4300 _bdev_nvme_set_preferred_path, 4301 ctx, 4302 bdev_nvme_set_preferred_path_done); 4303 return; 4304 4305 err_bdev: 4306 spdk_bdev_close(ctx->desc); 4307 err_open: 4308 free(ctx); 4309 err_alloc: 4310 cb_fn(cb_arg, rc); 4311 } 4312 4313 struct bdev_nvme_set_multipath_policy_ctx { 4314 struct spdk_bdev_desc *desc; 4315 bdev_nvme_set_multipath_policy_cb cb_fn; 4316 void *cb_arg; 4317 }; 4318 4319 static void 4320 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4321 { 4322 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4323 4324 assert(ctx != NULL); 4325 assert(ctx->desc != NULL); 4326 assert(ctx->cb_fn != NULL); 4327 4328 spdk_bdev_close(ctx->desc); 4329 4330 ctx->cb_fn(ctx->cb_arg, status); 4331 4332 free(ctx); 4333 } 4334 4335 static void 4336 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4337 { 4338 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4339 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4340 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4341 4342 nbdev_ch->mp_policy = nbdev->mp_policy; 4343 nbdev_ch->mp_selector = nbdev->mp_selector; 4344 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4345 bdev_nvme_clear_current_io_path(nbdev_ch); 4346 4347 spdk_for_each_channel_continue(i, 0); 4348 } 4349 4350 void 4351 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4352 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 4353 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4354 { 4355 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4356 struct spdk_bdev *bdev; 4357 struct nvme_bdev *nbdev; 4358 int rc; 4359 4360 assert(cb_fn != NULL); 4361 4362 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4363 if (rr_min_io == UINT32_MAX) { 4364 rr_min_io = 1; 4365 } else if (rr_min_io == 0) { 4366 rc = -EINVAL; 
4367 goto exit; 4368 } 4369 } else if (rr_min_io != UINT32_MAX) { 4370 rc = -EINVAL; 4371 goto exit; 4372 } 4373 4374 ctx = calloc(1, sizeof(*ctx)); 4375 if (ctx == NULL) { 4376 SPDK_ERRLOG("Failed to alloc context.\n"); 4377 rc = -ENOMEM; 4378 goto exit; 4379 } 4380 4381 ctx->cb_fn = cb_fn; 4382 ctx->cb_arg = cb_arg; 4383 4384 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4385 if (rc != 0) { 4386 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4387 rc = -ENODEV; 4388 goto err_open; 4389 } 4390 4391 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4392 if (bdev->module != &nvme_if) { 4393 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4394 rc = -ENODEV; 4395 goto err_module; 4396 } 4397 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4398 4399 pthread_mutex_lock(&nbdev->mutex); 4400 nbdev->mp_policy = policy; 4401 nbdev->mp_selector = selector; 4402 nbdev->rr_min_io = rr_min_io; 4403 pthread_mutex_unlock(&nbdev->mutex); 4404 4405 spdk_for_each_channel(nbdev, 4406 _bdev_nvme_set_multipath_policy, 4407 ctx, 4408 bdev_nvme_set_multipath_policy_done); 4409 return; 4410 4411 err_module: 4412 spdk_bdev_close(ctx->desc); 4413 err_open: 4414 free(ctx); 4415 exit: 4416 cb_fn(cb_arg, rc); 4417 } 4418 4419 static void 4420 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 4421 { 4422 struct nvme_ctrlr *nvme_ctrlr = arg; 4423 union spdk_nvme_async_event_completion event; 4424 4425 if (spdk_nvme_cpl_is_error(cpl)) { 4426 SPDK_WARNLOG("AER request execute failed\n"); 4427 return; 4428 } 4429 4430 event.raw = cpl->cdw0; 4431 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4432 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 4433 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 4434 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4435 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 4436 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 4437 } 4438 } 4439 4440 static void 4441 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 4442 { 4443 if (ctx->cb_fn) { 4444 ctx->cb_fn(ctx->cb_ctx, count, rc); 4445 } 4446 4447 ctx->namespaces_populated = true; 4448 if (ctx->probe_done) { 4449 /* The probe was already completed, so we need to free the context 4450 * here. This can happen for cases like OCSSD, where we need to 4451 * send additional commands to the SSD after attach. 
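 * Put differently, whichever flag is set second triggers the free: if the probe
 * poller already set probe_done, this path frees ctx; otherwise
 * bdev_nvme_async_poll() sees namespaces_populated when the probe completes and
 * frees it there instead.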
4452 */ 4453 free(ctx); 4454 } 4455 } 4456 4457 static void 4458 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 4459 struct nvme_async_probe_ctx *ctx) 4460 { 4461 spdk_io_device_register(nvme_ctrlr, 4462 bdev_nvme_create_ctrlr_channel_cb, 4463 bdev_nvme_destroy_ctrlr_channel_cb, 4464 sizeof(struct nvme_ctrlr_channel), 4465 nvme_ctrlr->nbdev_ctrlr->name); 4466 4467 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 4468 } 4469 4470 static void 4471 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 4472 { 4473 struct nvme_ctrlr *nvme_ctrlr = _ctx; 4474 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 4475 4476 nvme_ctrlr->probe_ctx = NULL; 4477 4478 if (spdk_nvme_cpl_is_error(cpl)) { 4479 nvme_ctrlr_delete(nvme_ctrlr); 4480 4481 if (ctx != NULL) { 4482 populate_namespaces_cb(ctx, 0, -1); 4483 } 4484 return; 4485 } 4486 4487 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4488 } 4489 4490 static int 4491 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4492 struct nvme_async_probe_ctx *ctx) 4493 { 4494 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4495 const struct spdk_nvme_ctrlr_data *cdata; 4496 uint32_t ana_log_page_size; 4497 4498 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4499 4500 /* Set buffer size enough to include maximum number of allowed namespaces. */ 4501 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4502 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 4503 sizeof(uint32_t); 4504 4505 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 4506 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4507 if (nvme_ctrlr->ana_log_page == NULL) { 4508 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 4509 return -ENXIO; 4510 } 4511 4512 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 4513 * Hence copy each descriptor to a temporary area when parsing it. 4514 * 4515 * Allocate a buffer whose size is as large as ANA log page buffer because 4516 * we do not know the size of a descriptor until actually reading it. 4517 */ 4518 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 4519 if (nvme_ctrlr->copied_ana_desc == NULL) { 4520 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 4521 return -ENOMEM; 4522 } 4523 4524 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 4525 4526 nvme_ctrlr->probe_ctx = ctx; 4527 4528 /* Then, set the read size only to include the current active namespaces. */ 4529 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4530 4531 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4532 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4533 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4534 return -EINVAL; 4535 } 4536 4537 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 4538 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4539 SPDK_NVME_GLOBAL_NS_TAG, 4540 nvme_ctrlr->ana_log_page, 4541 ana_log_page_size, 0, 4542 nvme_ctrlr_init_ana_log_page_done, 4543 nvme_ctrlr); 4544 } 4545 4546 /* hostnqn and subnqn were already verified before attaching a controller. 4547 * Hence check only the multipath capability and cntlid here. 
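 * For example, a second path whose controller reports CMIC.multi_ctrlr == 0,
 * or whose CNTLID duplicates one already on nbdev_ctrlr->ctrlrs, fails this
 * check and nvme_bdev_ctrlr_create() returns -EINVAL for that path.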
4548 */ 4549 static bool 4550 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 4551 { 4552 struct nvme_ctrlr *tmp; 4553 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 4554 4555 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4556 4557 if (!cdata->cmic.multi_ctrlr) { 4558 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 4559 return false; 4560 } 4561 4562 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 4563 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 4564 4565 if (!tmp_cdata->cmic.multi_ctrlr) { 4566 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 4567 return false; 4568 } 4569 if (cdata->cntlid == tmp_cdata->cntlid) { 4570 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 4571 return false; 4572 } 4573 } 4574 4575 return true; 4576 } 4577 4578 static int 4579 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 4580 { 4581 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4582 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4583 int rc = 0; 4584 4585 pthread_mutex_lock(&g_bdev_nvme_mutex); 4586 4587 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4588 if (nbdev_ctrlr != NULL) { 4589 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 4590 rc = -EINVAL; 4591 goto exit; 4592 } 4593 } else { 4594 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 4595 if (nbdev_ctrlr == NULL) { 4596 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 4597 rc = -ENOMEM; 4598 goto exit; 4599 } 4600 nbdev_ctrlr->name = strdup(name); 4601 if (nbdev_ctrlr->name == NULL) { 4602 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 4603 free(nbdev_ctrlr); 4604 goto exit; 4605 } 4606 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 4607 TAILQ_INIT(&nbdev_ctrlr->bdevs); 4608 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 4609 } 4610 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 4611 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 4612 exit: 4613 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4614 return rc; 4615 } 4616 4617 static int 4618 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 4619 const char *name, 4620 const struct spdk_nvme_transport_id *trid, 4621 struct nvme_async_probe_ctx *ctx) 4622 { 4623 struct nvme_ctrlr *nvme_ctrlr; 4624 struct nvme_path_id *path_id; 4625 const struct spdk_nvme_ctrlr_data *cdata; 4626 int rc; 4627 4628 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 4629 if (nvme_ctrlr == NULL) { 4630 SPDK_ERRLOG("Failed to allocate device struct\n"); 4631 return -ENOMEM; 4632 } 4633 4634 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 4635 if (rc != 0) { 4636 free(nvme_ctrlr); 4637 return rc; 4638 } 4639 4640 TAILQ_INIT(&nvme_ctrlr->trids); 4641 4642 RB_INIT(&nvme_ctrlr->namespaces); 4643 4644 path_id = calloc(1, sizeof(*path_id)); 4645 if (path_id == NULL) { 4646 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 4647 rc = -ENOMEM; 4648 goto err; 4649 } 4650 4651 path_id->trid = *trid; 4652 if (ctx != NULL) { 4653 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 4654 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 4655 } 4656 nvme_ctrlr->active_path_id = path_id; 4657 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 4658 4659 nvme_ctrlr->thread = spdk_get_thread(); 4660 nvme_ctrlr->ctrlr = ctrlr; 4661 nvme_ctrlr->ref = 1; 4662 4663 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 4664 SPDK_ERRLOG("OCSSDs are not supported"); 4665 rc = -ENOTSUP; 4666 goto err; 4667 } 4668 4669 
if (ctx != NULL) { 4670 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 4671 } else { 4672 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 4673 } 4674 4675 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 4676 g_opts.nvme_adminq_poll_period_us); 4677 4678 if (g_opts.timeout_us > 0) { 4679 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 4680 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 4681 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 4682 g_opts.timeout_us : g_opts.timeout_admin_us; 4683 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 4684 adm_timeout_us, timeout_cb, nvme_ctrlr); 4685 } 4686 4687 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 4688 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 4689 4690 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 4691 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 4692 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 4693 } 4694 4695 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 4696 if (rc != 0) { 4697 goto err; 4698 } 4699 4700 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4701 4702 if (cdata->cmic.ana_reporting) { 4703 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 4704 if (rc == 0) { 4705 return 0; 4706 } 4707 } else { 4708 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4709 return 0; 4710 } 4711 4712 err: 4713 nvme_ctrlr_delete(nvme_ctrlr); 4714 return rc; 4715 } 4716 4717 void 4718 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 4719 { 4720 opts->prchk_flags = 0; 4721 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 4722 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 4723 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 4724 } 4725 4726 static void 4727 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4728 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 4729 { 4730 char *name; 4731 4732 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 4733 if (!name) { 4734 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 4735 return; 4736 } 4737 4738 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 4739 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 4740 } else { 4741 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 4742 } 4743 4744 free(name); 4745 } 4746 4747 static void 4748 _nvme_ctrlr_destruct(void *ctx) 4749 { 4750 struct nvme_ctrlr *nvme_ctrlr = ctx; 4751 4752 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 4753 nvme_ctrlr_release(nvme_ctrlr); 4754 } 4755 4756 static int 4757 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4758 { 4759 struct nvme_probe_skip_entry *entry; 4760 4761 pthread_mutex_lock(&nvme_ctrlr->mutex); 4762 4763 /* The controller's destruction was already started */ 4764 if (nvme_ctrlr->destruct) { 4765 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4766 return 0; 4767 } 4768 4769 if (!hotplug && 4770 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 4771 entry = calloc(1, sizeof(*entry)); 4772 if (!entry) { 4773 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4774 return -ENOMEM; 4775 } 4776 entry->trid = nvme_ctrlr->active_path_id->trid; 4777 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 4778 } 4779 4780 nvme_ctrlr->destruct = true; 4781 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4782 4783 _nvme_ctrlr_destruct(nvme_ctrlr); 
4784 4785 return 0; 4786 } 4787 4788 static void 4789 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 4790 { 4791 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 4792 4793 _bdev_nvme_delete(nvme_ctrlr, true); 4794 } 4795 4796 static int 4797 bdev_nvme_hotplug_probe(void *arg) 4798 { 4799 if (g_hotplug_probe_ctx == NULL) { 4800 spdk_poller_unregister(&g_hotplug_probe_poller); 4801 return SPDK_POLLER_IDLE; 4802 } 4803 4804 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 4805 g_hotplug_probe_ctx = NULL; 4806 spdk_poller_unregister(&g_hotplug_probe_poller); 4807 } 4808 4809 return SPDK_POLLER_BUSY; 4810 } 4811 4812 static int 4813 bdev_nvme_hotplug(void *arg) 4814 { 4815 struct spdk_nvme_transport_id trid_pcie; 4816 4817 if (g_hotplug_probe_ctx) { 4818 return SPDK_POLLER_BUSY; 4819 } 4820 4821 memset(&trid_pcie, 0, sizeof(trid_pcie)); 4822 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 4823 4824 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 4825 hotplug_probe_cb, attach_cb, NULL); 4826 4827 if (g_hotplug_probe_ctx) { 4828 assert(g_hotplug_probe_poller == NULL); 4829 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 4830 } 4831 4832 return SPDK_POLLER_BUSY; 4833 } 4834 4835 void 4836 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 4837 { 4838 *opts = g_opts; 4839 } 4840 4841 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 4842 uint32_t reconnect_delay_sec, 4843 uint32_t fast_io_fail_timeout_sec); 4844 4845 static int 4846 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 4847 { 4848 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 4849 /* Can't set timeout_admin_us without also setting timeout_us */ 4850 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 4851 return -EINVAL; 4852 } 4853 4854 if (opts->bdev_retry_count < -1) { 4855 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 4856 return -EINVAL; 4857 } 4858 4859 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 4860 opts->reconnect_delay_sec, 4861 opts->fast_io_fail_timeout_sec)) { 4862 return -EINVAL; 4863 } 4864 4865 return 0; 4866 } 4867 4868 int 4869 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 4870 { 4871 int ret; 4872 4873 ret = bdev_nvme_validate_opts(opts); 4874 if (ret) { 4875 SPDK_WARNLOG("Failed to set nvme opts.\n"); 4876 return ret; 4877 } 4878 4879 if (g_bdev_nvme_init_thread != NULL) { 4880 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4881 return -EPERM; 4882 } 4883 } 4884 4885 if (opts->rdma_srq_size != 0) { 4886 struct spdk_nvme_transport_opts drv_opts; 4887 4888 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 4889 drv_opts.rdma_srq_size = opts->rdma_srq_size; 4890 4891 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 4892 if (ret) { 4893 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 4894 return ret; 4895 } 4896 } 4897 4898 g_opts = *opts; 4899 4900 return 0; 4901 } 4902 4903 struct set_nvme_hotplug_ctx { 4904 uint64_t period_us; 4905 bool enabled; 4906 spdk_msg_fn fn; 4907 void *fn_ctx; 4908 }; 4909 4910 static void 4911 set_nvme_hotplug_period_cb(void *_ctx) 4912 { 4913 struct set_nvme_hotplug_ctx *ctx = _ctx; 4914 4915 spdk_poller_unregister(&g_hotplug_poller); 4916 if (ctx->enabled) { 4917 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 4918 } 4919 4920 g_nvme_hotplug_poll_period_us = 
ctx->period_us; 4921 g_nvme_hotplug_enabled = ctx->enabled; 4922 if (ctx->fn) { 4923 ctx->fn(ctx->fn_ctx); 4924 } 4925 4926 free(ctx); 4927 } 4928 4929 int 4930 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 4931 { 4932 struct set_nvme_hotplug_ctx *ctx; 4933 4934 if (enabled == true && !spdk_process_is_primary()) { 4935 return -EPERM; 4936 } 4937 4938 ctx = calloc(1, sizeof(*ctx)); 4939 if (ctx == NULL) { 4940 return -ENOMEM; 4941 } 4942 4943 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 4944 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 4945 ctx->enabled = enabled; 4946 ctx->fn = cb; 4947 ctx->fn_ctx = cb_ctx; 4948 4949 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 4950 return 0; 4951 } 4952 4953 static void 4954 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 4955 struct nvme_async_probe_ctx *ctx) 4956 { 4957 struct nvme_ns *nvme_ns; 4958 struct nvme_bdev *nvme_bdev; 4959 size_t j; 4960 4961 assert(nvme_ctrlr != NULL); 4962 4963 if (ctx->names == NULL) { 4964 populate_namespaces_cb(ctx, 0, 0); 4965 return; 4966 } 4967 4968 /* 4969 * Report the new bdevs that were created in this call. 4970 * There can be more than one bdev per NVMe controller. 4971 */ 4972 j = 0; 4973 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4974 while (nvme_ns != NULL) { 4975 nvme_bdev = nvme_ns->bdev; 4976 if (j < ctx->count) { 4977 ctx->names[j] = nvme_bdev->disk.name; 4978 j++; 4979 } else { 4980 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 4981 ctx->count); 4982 populate_namespaces_cb(ctx, 0, -ERANGE); 4983 return; 4984 } 4985 4986 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4987 } 4988 4989 populate_namespaces_cb(ctx, j, 0); 4990 } 4991 4992 static int 4993 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4994 struct spdk_nvme_ctrlr *new_ctrlr, 4995 struct spdk_nvme_transport_id *trid) 4996 { 4997 struct nvme_path_id *tmp_trid; 4998 4999 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5000 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5001 return -ENOTSUP; 5002 } 5003 5004 /* Currently we only support failover to the same transport type. */ 5005 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5006 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5007 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5008 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5009 return -EINVAL; 5010 } 5011 5012 5013 /* Currently we only support failover to the same NQN. */ 5014 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5015 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5016 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5017 return -EINVAL; 5018 } 5019 5020 /* Skip all the other checks if we've already registered this path. 
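 * (Example: registering the same traddr/trsvcid/subnqn as a secondary path a
 * second time is caught by the loop below and reported as -EEXIST, so
 * bdev_nvme_add_secondary_trid() never queues a duplicate trid entry.)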
*/ 5021 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5022 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5023 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5024 trid->subnqn); 5025 return -EEXIST; 5026 } 5027 } 5028 5029 return 0; 5030 } 5031 5032 static int 5033 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5034 struct spdk_nvme_ctrlr *new_ctrlr) 5035 { 5036 struct nvme_ns *nvme_ns; 5037 struct spdk_nvme_ns *new_ns; 5038 5039 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5040 while (nvme_ns != NULL) { 5041 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5042 assert(new_ns != NULL); 5043 5044 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5045 return -EINVAL; 5046 } 5047 5048 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5049 } 5050 5051 return 0; 5052 } 5053 5054 static int 5055 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5056 struct spdk_nvme_transport_id *trid) 5057 { 5058 struct nvme_path_id *new_trid, *tmp_trid; 5059 5060 new_trid = calloc(1, sizeof(*new_trid)); 5061 if (new_trid == NULL) { 5062 return -ENOMEM; 5063 } 5064 new_trid->trid = *trid; 5065 new_trid->is_failed = false; 5066 5067 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5068 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 5069 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5070 return 0; 5071 } 5072 } 5073 5074 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5075 return 0; 5076 } 5077 5078 /* This is the case that a secondary path is added to an existing 5079 * nvme_ctrlr for failover. After checking if it can access the same 5080 * namespaces as the primary path, it is disconnected until failover occurs. 5081 */ 5082 static int 5083 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5084 struct spdk_nvme_ctrlr *new_ctrlr, 5085 struct spdk_nvme_transport_id *trid) 5086 { 5087 int rc; 5088 5089 assert(nvme_ctrlr != NULL); 5090 5091 pthread_mutex_lock(&nvme_ctrlr->mutex); 5092 5093 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5094 if (rc != 0) { 5095 goto exit; 5096 } 5097 5098 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5099 if (rc != 0) { 5100 goto exit; 5101 } 5102 5103 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5104 5105 exit: 5106 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5107 5108 spdk_nvme_detach(new_ctrlr); 5109 5110 return rc; 5111 } 5112 5113 static void 5114 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5115 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5116 { 5117 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5118 struct nvme_async_probe_ctx *ctx; 5119 int rc; 5120 5121 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5122 ctx->ctrlr_attached = true; 5123 5124 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5125 if (rc != 0) { 5126 populate_namespaces_cb(ctx, 0, rc); 5127 } 5128 } 5129 5130 static void 5131 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5132 struct spdk_nvme_ctrlr *ctrlr, 5133 const struct spdk_nvme_ctrlr_opts *opts) 5134 { 5135 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5136 struct nvme_ctrlr *nvme_ctrlr; 5137 struct nvme_async_probe_ctx *ctx; 5138 int rc; 5139 5140 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5141 ctx->ctrlr_attached = true; 5142 5143 nvme_ctrlr = 
nvme_ctrlr_get_by_name(ctx->base_name); 5144 if (nvme_ctrlr) { 5145 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5146 } else { 5147 rc = -ENODEV; 5148 } 5149 5150 populate_namespaces_cb(ctx, 0, rc); 5151 } 5152 5153 static int 5154 bdev_nvme_async_poll(void *arg) 5155 { 5156 struct nvme_async_probe_ctx *ctx = arg; 5157 int rc; 5158 5159 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5160 if (spdk_unlikely(rc != -EAGAIN)) { 5161 ctx->probe_done = true; 5162 spdk_poller_unregister(&ctx->poller); 5163 if (!ctx->ctrlr_attached) { 5164 /* The probe is done, but no controller was attached. 5165 * That means we had a failure, so report -EIO back to 5166 * the caller (usually the RPC). populate_namespaces_cb() 5167 * will take care of freeing the nvme_async_probe_ctx. 5168 */ 5169 populate_namespaces_cb(ctx, 0, -EIO); 5170 } else if (ctx->namespaces_populated) { 5171 /* The namespaces for the attached controller were all 5172 * populated and the response was already sent to the 5173 * caller (usually the RPC). So free the context here. 5174 */ 5175 free(ctx); 5176 } 5177 } 5178 5179 return SPDK_POLLER_BUSY; 5180 } 5181 5182 static bool 5183 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5184 uint32_t reconnect_delay_sec, 5185 uint32_t fast_io_fail_timeout_sec) 5186 { 5187 if (ctrlr_loss_timeout_sec < -1) { 5188 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5189 return false; 5190 } else if (ctrlr_loss_timeout_sec == -1) { 5191 if (reconnect_delay_sec == 0) { 5192 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5193 return false; 5194 } else if (fast_io_fail_timeout_sec != 0 && 5195 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5196 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 5197 return false; 5198 } 5199 } else if (ctrlr_loss_timeout_sec != 0) { 5200 if (reconnect_delay_sec == 0) { 5201 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5202 return false; 5203 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5204 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5205 return false; 5206 } else if (fast_io_fail_timeout_sec != 0) { 5207 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5208 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5209 return false; 5210 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5211 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5212 return false; 5213 } 5214 } 5215 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5216 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5217 return false; 5218 } 5219 5220 return true; 5221 } 5222 5223 int 5224 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5225 const char *base_name, 5226 const char **names, 5227 uint32_t count, 5228 spdk_bdev_create_nvme_fn cb_fn, 5229 void *cb_ctx, 5230 struct spdk_nvme_ctrlr_opts *drv_opts, 5231 struct nvme_ctrlr_opts *bdev_opts, 5232 bool multipath) 5233 { 5234 struct nvme_probe_skip_entry *entry, *tmp; 5235 struct nvme_async_probe_ctx *ctx; 5236 spdk_nvme_attach_cb attach_cb; 5237 5238 /* TODO expand this check to include both the host and target TRIDs. 5239 * Only if both are the same should we fail. 
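 * For reference, a rough caller sketch (addresses, names and the attach_done_cb
 * callback and its context are illustrative assumptions; the attach RPC
 * normally builds these):
 *
 *   struct spdk_nvme_transport_id trid = {};
 *   const char *names[32];
 *
 *   spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_TCP);
 *   trid.adrfam = SPDK_NVMF_ADRFAM_IPV4;
 *   snprintf(trid.traddr, sizeof(trid.traddr), "10.0.0.1");
 *   snprintf(trid.trsvcid, sizeof(trid.trsvcid), "4420");
 *   snprintf(trid.subnqn, sizeof(trid.subnqn), "nqn.2016-06.io.spdk:cnode1");
 *   rc = bdev_nvme_create(&trid, "Nvme0", names, SPDK_COUNTOF(names),
 *                         attach_done_cb, attach_ctx, NULL, NULL, true);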
5240 */ 5241 if (nvme_ctrlr_get(trid) != NULL) { 5242 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5243 return -EEXIST; 5244 } 5245 5246 if (bdev_opts != NULL && 5247 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5248 bdev_opts->reconnect_delay_sec, 5249 bdev_opts->fast_io_fail_timeout_sec)) { 5250 return -EINVAL; 5251 } 5252 5253 ctx = calloc(1, sizeof(*ctx)); 5254 if (!ctx) { 5255 return -ENOMEM; 5256 } 5257 ctx->base_name = base_name; 5258 ctx->names = names; 5259 ctx->count = count; 5260 ctx->cb_fn = cb_fn; 5261 ctx->cb_ctx = cb_ctx; 5262 ctx->trid = *trid; 5263 5264 if (bdev_opts) { 5265 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5266 } else { 5267 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5268 } 5269 5270 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5271 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5272 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5273 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5274 free(entry); 5275 break; 5276 } 5277 } 5278 } 5279 5280 if (drv_opts) { 5281 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5282 } else { 5283 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5284 } 5285 5286 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5287 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5288 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5289 ctx->drv_opts.disable_read_ana_log_page = true; 5290 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5291 5292 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5293 attach_cb = connect_attach_cb; 5294 } else { 5295 attach_cb = connect_set_failover_cb; 5296 } 5297 5298 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5299 if (ctx->probe_ctx == NULL) { 5300 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5301 free(ctx); 5302 return -ENODEV; 5303 } 5304 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5305 5306 return 0; 5307 } 5308 5309 int 5310 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 5311 { 5312 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5313 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 5314 struct nvme_path_id *p, *t; 5315 int rc = -ENXIO; 5316 5317 if (name == NULL || path_id == NULL) { 5318 return -EINVAL; 5319 } 5320 5321 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5322 if (nbdev_ctrlr == NULL) { 5323 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 5324 return -ENODEV; 5325 } 5326 5327 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 5328 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 5329 if (path_id->trid.trtype != 0) { 5330 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5331 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5332 continue; 5333 } 5334 } else { 5335 if (path_id->trid.trtype != p->trid.trtype) { 5336 continue; 5337 } 5338 } 5339 } 5340 5341 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 5342 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 5343 continue; 5344 } 5345 } 5346 5347 if (path_id->trid.adrfam != 0) { 5348 if (path_id->trid.adrfam != p->trid.adrfam) { 5349 continue; 5350 } 5351 } 5352 5353 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 5354 if 
(strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 5355 continue; 5356 } 5357 } 5358 5359 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 5360 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 5361 continue; 5362 } 5363 } 5364 5365 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 5366 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 5367 continue; 5368 } 5369 } 5370 5371 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 5372 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 5373 continue; 5374 } 5375 } 5376 5377 /* If we made it here, then this path is a match! Now we need to remove it. */ 5378 if (p == nvme_ctrlr->active_path_id) { 5379 /* This is the active path in use right now. The active path is always the first in the list. */ 5380 5381 if (!TAILQ_NEXT(p, link)) { 5382 /* The current path is the only path. */ 5383 rc = _bdev_nvme_delete(nvme_ctrlr, false); 5384 } else { 5385 /* There is an alternative path. */ 5386 rc = bdev_nvme_failover(nvme_ctrlr, true); 5387 } 5388 } else { 5389 /* We are not using the specified path. */ 5390 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 5391 free(p); 5392 rc = 0; 5393 } 5394 5395 if (rc < 0 && rc != -ENXIO) { 5396 return rc; 5397 } 5398 5399 5400 } 5401 } 5402 5403 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 5404 return rc; 5405 } 5406 5407 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 5408 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 5409 5410 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 5411 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 5412 5413 struct discovery_entry_ctx { 5414 char name[128]; 5415 struct spdk_nvme_transport_id trid; 5416 struct spdk_nvme_ctrlr_opts drv_opts; 5417 struct spdk_nvmf_discovery_log_page_entry entry; 5418 TAILQ_ENTRY(discovery_entry_ctx) tailq; 5419 struct discovery_ctx *ctx; 5420 }; 5421 5422 struct discovery_ctx { 5423 char *name; 5424 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 5425 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 5426 void *cb_ctx; 5427 struct spdk_nvme_probe_ctx *probe_ctx; 5428 struct spdk_nvme_detach_ctx *detach_ctx; 5429 struct spdk_nvme_ctrlr *ctrlr; 5430 struct spdk_nvme_transport_id trid; 5431 struct discovery_entry_ctx *entry_ctx_in_use; 5432 struct spdk_poller *poller; 5433 struct spdk_nvme_ctrlr_opts drv_opts; 5434 struct nvme_ctrlr_opts bdev_opts; 5435 struct spdk_nvmf_discovery_log_page *log_page; 5436 TAILQ_ENTRY(discovery_ctx) tailq; 5437 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 5438 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 5439 int rc; 5440 bool wait_for_attach; 5441 uint64_t timeout_ticks; 5442 /* Denotes that the discovery service is being started. We're waiting 5443 * for the initial connection to the discovery controller to be 5444 * established and attach discovered NVM ctrlrs. 5445 */ 5446 bool initializing; 5447 /* Denotes if a discovery is currently in progress for this context. 5448 * That includes connecting to newly discovered subsystems. Used to 5449 * ensure we do not start a new discovery until an existing one is 5450 * complete. 5451 */ 5452 bool in_progress; 5453 5454 /* Denotes if another discovery is needed after the one in progress 5455 * completes. Set when we receive an AER completion while a discovery 5456 * is already in progress. 
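 * Example flow: an AER arrives while a log page read is still outstanding, so
 * discovery_aer_cb() only sets this flag; once the current pass finishes,
 * discovery_complete() clears it and calls get_discovery_log_page() again so
 * the newer change is not lost.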
5457 */ 5458 bool pending; 5459 5460 /* Signal to the discovery context poller that it should stop the 5461 * discovery service, including detaching from the current discovery 5462 * controller. 5463 */ 5464 bool stop; 5465 5466 struct spdk_thread *calling_thread; 5467 uint32_t index; 5468 uint32_t attach_in_progress; 5469 char *hostnqn; 5470 5471 /* Denotes if the discovery service was started by the mdns discovery. 5472 */ 5473 bool from_mdns_discovery_service; 5474 }; 5475 5476 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 5477 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 5478 5479 static void get_discovery_log_page(struct discovery_ctx *ctx); 5480 5481 static void 5482 free_discovery_ctx(struct discovery_ctx *ctx) 5483 { 5484 free(ctx->log_page); 5485 free(ctx->hostnqn); 5486 free(ctx->name); 5487 free(ctx); 5488 } 5489 5490 static void 5491 discovery_complete(struct discovery_ctx *ctx) 5492 { 5493 ctx->initializing = false; 5494 ctx->in_progress = false; 5495 if (ctx->pending) { 5496 ctx->pending = false; 5497 get_discovery_log_page(ctx); 5498 } 5499 } 5500 5501 static void 5502 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 5503 struct spdk_nvmf_discovery_log_page_entry *entry) 5504 { 5505 char *space; 5506 5507 trid->trtype = entry->trtype; 5508 trid->adrfam = entry->adrfam; 5509 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 5510 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 5511 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 5512 5513 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 5514 * But the log page entries typically pad them with spaces, not zeroes. 5515 * So add a NULL terminator to each of these fields at the appropriate 5516 * location. 
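 * For example, an entry traddr of "192.168.0.5" arrives padded with trailing
 * spaces out to the field width; replacing the first space with '\0' below
 * turns it into a usable C string (these fields normally contain no embedded
 * spaces, so nothing meaningful is cut off).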
5517 */ 5518 space = strchr(trid->traddr, ' '); 5519 if (space) { 5520 *space = 0; 5521 } 5522 space = strchr(trid->trsvcid, ' '); 5523 if (space) { 5524 *space = 0; 5525 } 5526 space = strchr(trid->subnqn, ' '); 5527 if (space) { 5528 *space = 0; 5529 } 5530 } 5531 5532 static void 5533 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5534 { 5535 ctx->stop = true; 5536 ctx->stop_cb_fn = cb_fn; 5537 ctx->cb_ctx = cb_ctx; 5538 5539 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 5540 struct discovery_entry_ctx *entry_ctx; 5541 struct nvme_path_id path = {}; 5542 5543 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 5544 path.trid = entry_ctx->trid; 5545 bdev_nvme_delete(entry_ctx->name, &path); 5546 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5547 free(entry_ctx); 5548 } 5549 5550 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 5551 struct discovery_entry_ctx *entry_ctx; 5552 5553 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5554 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5555 free(entry_ctx); 5556 } 5557 5558 free(ctx->entry_ctx_in_use); 5559 ctx->entry_ctx_in_use = NULL; 5560 } 5561 5562 static void 5563 discovery_remove_controllers(struct discovery_ctx *ctx) 5564 { 5565 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 5566 struct discovery_entry_ctx *entry_ctx, *tmp; 5567 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5568 struct spdk_nvme_transport_id old_trid; 5569 uint64_t numrec, i; 5570 bool found; 5571 5572 numrec = from_le64(&log_page->numrec); 5573 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 5574 found = false; 5575 old_entry = &entry_ctx->entry; 5576 build_trid_from_log_page_entry(&old_trid, old_entry); 5577 for (i = 0; i < numrec; i++) { 5578 new_entry = &log_page->entries[i]; 5579 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 5580 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 5581 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5582 found = true; 5583 break; 5584 } 5585 } 5586 if (!found) { 5587 struct nvme_path_id path = {}; 5588 5589 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 5590 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 5591 5592 path.trid = entry_ctx->trid; 5593 bdev_nvme_delete(entry_ctx->name, &path); 5594 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5595 free(entry_ctx); 5596 } 5597 } 5598 free(log_page); 5599 ctx->log_page = NULL; 5600 discovery_complete(ctx); 5601 } 5602 5603 static void 5604 complete_discovery_start(struct discovery_ctx *ctx, int status) 5605 { 5606 ctx->timeout_ticks = 0; 5607 ctx->rc = status; 5608 if (ctx->start_cb_fn) { 5609 ctx->start_cb_fn(ctx->cb_ctx, status); 5610 ctx->start_cb_fn = NULL; 5611 ctx->cb_ctx = NULL; 5612 } 5613 } 5614 5615 static void 5616 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 5617 { 5618 struct discovery_entry_ctx *entry_ctx = cb_ctx; 5619 struct discovery_ctx *ctx = entry_ctx->ctx; 5620 5621 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 5622 ctx->attach_in_progress--; 5623 if (ctx->attach_in_progress == 0) { 5624 complete_discovery_start(ctx, ctx->rc); 5625 if (ctx->initializing && ctx->rc != 0) { 5626 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 5627 stop_discovery(ctx, NULL, ctx->cb_ctx); 5628 } else { 5629 discovery_remove_controllers(ctx); 5630 } 5631 } 5632 } 5633 5634 static struct discovery_entry_ctx * 5635 create_discovery_entry_ctx(struct discovery_ctx 
*ctx, struct spdk_nvme_transport_id *trid) 5636 { 5637 struct discovery_entry_ctx *new_ctx; 5638 5639 new_ctx = calloc(1, sizeof(*new_ctx)); 5640 if (new_ctx == NULL) { 5641 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5642 return NULL; 5643 } 5644 5645 new_ctx->ctx = ctx; 5646 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 5647 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5648 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5649 return new_ctx; 5650 } 5651 5652 static void 5653 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 5654 struct spdk_nvmf_discovery_log_page *log_page) 5655 { 5656 struct discovery_ctx *ctx = cb_arg; 5657 struct discovery_entry_ctx *entry_ctx, *tmp; 5658 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 5659 uint64_t numrec, i; 5660 bool found; 5661 5662 if (rc || spdk_nvme_cpl_is_error(cpl)) { 5663 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5664 return; 5665 } 5666 5667 ctx->log_page = log_page; 5668 assert(ctx->attach_in_progress == 0); 5669 numrec = from_le64(&log_page->numrec); 5670 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 5671 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5672 free(entry_ctx); 5673 } 5674 for (i = 0; i < numrec; i++) { 5675 found = false; 5676 new_entry = &log_page->entries[i]; 5677 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 5678 struct discovery_entry_ctx *new_ctx; 5679 struct spdk_nvme_transport_id trid = {}; 5680 5681 build_trid_from_log_page_entry(&trid, new_entry); 5682 new_ctx = create_discovery_entry_ctx(ctx, &trid); 5683 if (new_ctx == NULL) { 5684 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5685 break; 5686 } 5687 5688 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 5689 continue; 5690 } 5691 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 5692 old_entry = &entry_ctx->entry; 5693 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 5694 found = true; 5695 break; 5696 } 5697 } 5698 if (!found) { 5699 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 5700 struct discovery_ctx *d_ctx; 5701 5702 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 5703 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 5704 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 5705 sizeof(new_entry->subnqn))) { 5706 break; 5707 } 5708 } 5709 if (subnqn_ctx) { 5710 break; 5711 } 5712 } 5713 5714 new_ctx = calloc(1, sizeof(*new_ctx)); 5715 if (new_ctx == NULL) { 5716 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5717 break; 5718 } 5719 5720 new_ctx->ctx = ctx; 5721 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 5722 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 5723 if (subnqn_ctx) { 5724 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 5725 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 5726 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5727 new_ctx->name); 5728 } else { 5729 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 5730 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 5731 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5732 new_ctx->name); 5733 } 5734 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5735 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", 
ctx->hostnqn); 5736 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 5737 discovery_attach_controller_done, new_ctx, 5738 &new_ctx->drv_opts, &ctx->bdev_opts, true); 5739 if (rc == 0) { 5740 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 5741 ctx->attach_in_progress++; 5742 } else { 5743 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 5744 } 5745 } 5746 } 5747 5748 if (ctx->attach_in_progress == 0) { 5749 discovery_remove_controllers(ctx); 5750 } 5751 } 5752 5753 static void 5754 get_discovery_log_page(struct discovery_ctx *ctx) 5755 { 5756 int rc; 5757 5758 assert(ctx->in_progress == false); 5759 ctx->in_progress = true; 5760 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 5761 if (rc != 0) { 5762 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5763 } 5764 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 5765 } 5766 5767 static void 5768 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5769 { 5770 struct discovery_ctx *ctx = arg; 5771 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 5772 5773 if (spdk_nvme_cpl_is_error(cpl)) { 5774 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 5775 return; 5776 } 5777 5778 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 5779 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 5780 return; 5781 } 5782 5783 DISCOVERY_INFOLOG(ctx, "got aer\n"); 5784 if (ctx->in_progress) { 5785 ctx->pending = true; 5786 return; 5787 } 5788 5789 get_discovery_log_page(ctx); 5790 } 5791 5792 static void 5793 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5794 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5795 { 5796 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5797 struct discovery_ctx *ctx; 5798 5799 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 5800 5801 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 5802 ctx->probe_ctx = NULL; 5803 ctx->ctrlr = ctrlr; 5804 5805 if (ctx->rc != 0) { 5806 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 5807 ctx->rc); 5808 return; 5809 } 5810 5811 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 5812 } 5813 5814 static int 5815 discovery_poller(void *arg) 5816 { 5817 struct discovery_ctx *ctx = arg; 5818 struct spdk_nvme_transport_id *trid; 5819 int rc; 5820 5821 if (ctx->detach_ctx) { 5822 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 5823 if (rc != -EAGAIN) { 5824 ctx->detach_ctx = NULL; 5825 ctx->ctrlr = NULL; 5826 } 5827 } else if (ctx->stop) { 5828 if (ctx->ctrlr != NULL) { 5829 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5830 if (rc == 0) { 5831 return SPDK_POLLER_BUSY; 5832 } 5833 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5834 } 5835 spdk_poller_unregister(&ctx->poller); 5836 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5837 assert(ctx->start_cb_fn == NULL); 5838 if (ctx->stop_cb_fn != NULL) { 5839 ctx->stop_cb_fn(ctx->cb_ctx); 5840 } 5841 free_discovery_ctx(ctx); 5842 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 5843 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5844 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5845 assert(ctx->initializing); 5846 spdk_poller_unregister(&ctx->poller); 5847 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5848 complete_discovery_start(ctx, -ETIMEDOUT); 5849 stop_discovery(ctx, NULL, NULL); 5850 free_discovery_ctx(ctx); 5851 return 
SPDK_POLLER_BUSY; 5852 } 5853 5854 assert(ctx->entry_ctx_in_use == NULL); 5855 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5856 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5857 trid = &ctx->entry_ctx_in_use->trid; 5858 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 5859 if (ctx->probe_ctx) { 5860 spdk_poller_unregister(&ctx->poller); 5861 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 5862 } else { 5863 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 5864 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5865 ctx->entry_ctx_in_use = NULL; 5866 } 5867 } else if (ctx->probe_ctx) { 5868 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5869 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 5870 complete_discovery_start(ctx, -ETIMEDOUT); 5871 return SPDK_POLLER_BUSY; 5872 } 5873 5874 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5875 if (rc != -EAGAIN) { 5876 if (ctx->rc != 0) { 5877 assert(ctx->initializing); 5878 stop_discovery(ctx, NULL, ctx->cb_ctx); 5879 } else { 5880 assert(rc == 0); 5881 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 5882 ctx->rc = rc; 5883 get_discovery_log_page(ctx); 5884 } 5885 } 5886 } else { 5887 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 5888 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 5889 complete_discovery_start(ctx, -ETIMEDOUT); 5890 /* We need to wait until all NVM ctrlrs are attached before we stop the 5891 * discovery service to make sure we don't detach a ctrlr that is still 5892 * being attached. 5893 */ 5894 if (ctx->attach_in_progress == 0) { 5895 stop_discovery(ctx, NULL, ctx->cb_ctx); 5896 return SPDK_POLLER_BUSY; 5897 } 5898 } 5899 5900 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 5901 if (rc < 0) { 5902 spdk_poller_unregister(&ctx->poller); 5903 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5904 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5905 ctx->entry_ctx_in_use = NULL; 5906 5907 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5908 if (rc != 0) { 5909 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5910 ctx->ctrlr = NULL; 5911 } 5912 } 5913 } 5914 5915 return SPDK_POLLER_BUSY; 5916 } 5917 5918 static void 5919 start_discovery_poller(void *arg) 5920 { 5921 struct discovery_ctx *ctx = arg; 5922 5923 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 5924 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5925 } 5926 5927 int 5928 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 5929 const char *base_name, 5930 struct spdk_nvme_ctrlr_opts *drv_opts, 5931 struct nvme_ctrlr_opts *bdev_opts, 5932 uint64_t attach_timeout, 5933 bool from_mdns, 5934 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 5935 { 5936 struct discovery_ctx *ctx; 5937 struct discovery_entry_ctx *discovery_entry_ctx; 5938 5939 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 5940 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5941 if (strcmp(ctx->name, base_name) == 0) { 5942 return -EEXIST; 5943 } 5944 5945 if (ctx->entry_ctx_in_use != NULL) { 5946 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 5947 return -EEXIST; 5948 } 5949 } 5950 5951 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 5952 if 
(!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 5953 return -EEXIST; 5954 } 5955 } 5956 } 5957 5958 ctx = calloc(1, sizeof(*ctx)); 5959 if (ctx == NULL) { 5960 return -ENOMEM; 5961 } 5962 5963 ctx->name = strdup(base_name); 5964 if (ctx->name == NULL) { 5965 free_discovery_ctx(ctx); 5966 return -ENOMEM; 5967 } 5968 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5969 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5970 ctx->from_mdns_discovery_service = from_mdns; 5971 ctx->bdev_opts.from_discovery_service = true; 5972 ctx->calling_thread = spdk_get_thread(); 5973 ctx->start_cb_fn = cb_fn; 5974 ctx->cb_ctx = cb_ctx; 5975 ctx->initializing = true; 5976 if (ctx->start_cb_fn) { 5977 /* We can use this when dumping json to denote if this RPC parameter 5978 * was specified or not. 5979 */ 5980 ctx->wait_for_attach = true; 5981 } 5982 if (attach_timeout != 0) { 5983 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 5984 spdk_get_ticks_hz() / 1000ull; 5985 } 5986 TAILQ_INIT(&ctx->nvm_entry_ctxs); 5987 TAILQ_INIT(&ctx->discovery_entry_ctxs); 5988 memcpy(&ctx->trid, trid, sizeof(*trid)); 5989 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 5990 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 5991 if (ctx->hostnqn == NULL) { 5992 free_discovery_ctx(ctx); 5993 return -ENOMEM; 5994 } 5995 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 5996 if (discovery_entry_ctx == NULL) { 5997 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5998 free_discovery_ctx(ctx); 5999 return -ENOMEM; 6000 } 6001 6002 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6003 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6004 return 0; 6005 } 6006 6007 int 6008 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6009 { 6010 struct discovery_ctx *ctx; 6011 6012 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6013 if (strcmp(name, ctx->name) == 0) { 6014 if (ctx->stop) { 6015 return -EALREADY; 6016 } 6017 /* If we're still starting the discovery service and ->rc is non-zero, we're 6018 * going to stop it as soon as we can 6019 */ 6020 if (ctx->initializing && ctx->rc != 0) { 6021 return -EALREADY; 6022 } 6023 stop_discovery(ctx, cb_fn, cb_ctx); 6024 return 0; 6025 } 6026 } 6027 6028 return -ENOENT; 6029 } 6030 6031 static int 6032 bdev_nvme_library_init(void) 6033 { 6034 g_bdev_nvme_init_thread = spdk_get_thread(); 6035 6036 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6037 bdev_nvme_destroy_poll_group_cb, 6038 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6039 6040 return 0; 6041 } 6042 6043 static void 6044 bdev_nvme_fini_destruct_ctrlrs(void) 6045 { 6046 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6047 struct nvme_ctrlr *nvme_ctrlr; 6048 6049 pthread_mutex_lock(&g_bdev_nvme_mutex); 6050 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6051 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6052 pthread_mutex_lock(&nvme_ctrlr->mutex); 6053 if (nvme_ctrlr->destruct) { 6054 /* This controller's destruction was already started 6055 * before the application started shutting down 6056 */ 6057 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6058 continue; 6059 } 6060 nvme_ctrlr->destruct = true; 6061 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6062 6063 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6064 nvme_ctrlr); 6065 } 6066 } 6067 6068 g_bdev_nvme_module_finish = true; 6069 if 
(TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6070 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6071 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6072 spdk_bdev_module_fini_done(); 6073 return; 6074 } 6075 6076 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6077 } 6078 6079 static void 6080 check_discovery_fini(void *arg) 6081 { 6082 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6083 bdev_nvme_fini_destruct_ctrlrs(); 6084 } 6085 } 6086 6087 static void 6088 bdev_nvme_library_fini(void) 6089 { 6090 struct nvme_probe_skip_entry *entry, *entry_tmp; 6091 struct discovery_ctx *ctx; 6092 6093 spdk_poller_unregister(&g_hotplug_poller); 6094 free(g_hotplug_probe_ctx); 6095 g_hotplug_probe_ctx = NULL; 6096 6097 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6098 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6099 free(entry); 6100 } 6101 6102 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 6103 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6104 bdev_nvme_fini_destruct_ctrlrs(); 6105 } else { 6106 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6107 stop_discovery(ctx, check_discovery_fini, NULL); 6108 } 6109 } 6110 } 6111 6112 static void 6113 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 6114 { 6115 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6116 struct spdk_bdev *bdev = bdev_io->bdev; 6117 struct spdk_dif_ctx dif_ctx; 6118 struct spdk_dif_error err_blk = {}; 6119 int rc; 6120 6121 rc = spdk_dif_ctx_init(&dif_ctx, 6122 bdev->blocklen, bdev->md_len, bdev->md_interleave, 6123 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 6124 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 6125 if (rc != 0) { 6126 SPDK_ERRLOG("Initialization of DIF context failed\n"); 6127 return; 6128 } 6129 6130 if (bdev->md_interleave) { 6131 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6132 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6133 } else { 6134 struct iovec md_iov = { 6135 .iov_base = bdev_io->u.bdev.md_buf, 6136 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 6137 }; 6138 6139 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6140 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6141 } 6142 6143 if (rc != 0) { 6144 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 6145 err_blk.err_type, err_blk.err_offset); 6146 } else { 6147 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 6148 } 6149 } 6150 6151 static void 6152 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6153 { 6154 struct nvme_bdev_io *bio = ref; 6155 6156 if (spdk_nvme_cpl_is_success(cpl)) { 6157 /* Run PI verification for read data buffer. */ 6158 bdev_nvme_verify_pi_error(bio); 6159 } 6160 6161 /* Return original completion status */ 6162 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6163 } 6164 6165 static void 6166 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6167 { 6168 struct nvme_bdev_io *bio = ref; 6169 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6170 int ret; 6171 6172 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 6173 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 6174 cpl->status.sct, cpl->status.sc); 6175 6176 /* Save completion status to use after verifying PI error. */ 6177 bio->cpl = *cpl; 6178 6179 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 6180 /* Read without PI checking to verify PI error. 
*/ 6181 ret = bdev_nvme_no_pi_readv(bio, 6182 bdev_io->u.bdev.iovs, 6183 bdev_io->u.bdev.iovcnt, 6184 bdev_io->u.bdev.md_buf, 6185 bdev_io->u.bdev.num_blocks, 6186 bdev_io->u.bdev.offset_blocks); 6187 if (ret == 0) { 6188 return; 6189 } 6190 } 6191 } 6192 6193 bdev_nvme_io_complete_nvme_status(bio, cpl); 6194 } 6195 6196 static void 6197 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6198 { 6199 struct nvme_bdev_io *bio = ref; 6200 6201 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6202 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 6203 cpl->status.sct, cpl->status.sc); 6204 /* Run PI verification for write data buffer if PI error is detected. */ 6205 bdev_nvme_verify_pi_error(bio); 6206 } 6207 6208 bdev_nvme_io_complete_nvme_status(bio, cpl); 6209 } 6210 6211 static void 6212 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6213 { 6214 struct nvme_bdev_io *bio = ref; 6215 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6216 6217 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 6218 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 6219 */ 6220 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 6221 6222 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6223 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 6224 cpl->status.sct, cpl->status.sc); 6225 /* Run PI verification for zone append data buffer if PI error is detected. */ 6226 bdev_nvme_verify_pi_error(bio); 6227 } 6228 6229 bdev_nvme_io_complete_nvme_status(bio, cpl); 6230 } 6231 6232 static void 6233 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6234 { 6235 struct nvme_bdev_io *bio = ref; 6236 6237 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6238 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 6239 cpl->status.sct, cpl->status.sc); 6240 /* Run PI verification for compare data buffer if PI error is detected. */ 6241 bdev_nvme_verify_pi_error(bio); 6242 } 6243 6244 bdev_nvme_io_complete_nvme_status(bio, cpl); 6245 } 6246 6247 static void 6248 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6249 { 6250 struct nvme_bdev_io *bio = ref; 6251 6252 /* Compare operation completion */ 6253 if (!bio->first_fused_completed) { 6254 /* Save compare result for write callback */ 6255 bio->cpl = *cpl; 6256 bio->first_fused_completed = true; 6257 return; 6258 } 6259 6260 /* Write operation completion */ 6261 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 6262 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 6263 * complete the IO with the compare operation's status. 
6264 */ 6265 if (!spdk_nvme_cpl_is_error(cpl)) { 6266 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 6267 } 6268 6269 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6270 } else { 6271 bdev_nvme_io_complete_nvme_status(bio, cpl); 6272 } 6273 } 6274 6275 static void 6276 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 6277 { 6278 struct nvme_bdev_io *bio = ref; 6279 6280 bdev_nvme_io_complete_nvme_status(bio, cpl); 6281 } 6282 6283 static int 6284 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 6285 { 6286 switch (desc->zt) { 6287 case SPDK_NVME_ZONE_TYPE_SEQWR: 6288 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 6289 break; 6290 default: 6291 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 6292 return -EIO; 6293 } 6294 6295 switch (desc->zs) { 6296 case SPDK_NVME_ZONE_STATE_EMPTY: 6297 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 6298 break; 6299 case SPDK_NVME_ZONE_STATE_IOPEN: 6300 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 6301 break; 6302 case SPDK_NVME_ZONE_STATE_EOPEN: 6303 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 6304 break; 6305 case SPDK_NVME_ZONE_STATE_CLOSED: 6306 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 6307 break; 6308 case SPDK_NVME_ZONE_STATE_RONLY: 6309 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 6310 break; 6311 case SPDK_NVME_ZONE_STATE_FULL: 6312 info->state = SPDK_BDEV_ZONE_STATE_FULL; 6313 break; 6314 case SPDK_NVME_ZONE_STATE_OFFLINE: 6315 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 6316 break; 6317 default: 6318 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 6319 return -EIO; 6320 } 6321 6322 info->zone_id = desc->zslba; 6323 info->write_pointer = desc->wp; 6324 info->capacity = desc->zcap; 6325 6326 return 0; 6327 } 6328 6329 static void 6330 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 6331 { 6332 struct nvme_bdev_io *bio = ref; 6333 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6334 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 6335 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 6336 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 6337 uint64_t max_zones_per_buf, i; 6338 uint32_t zone_report_bufsize; 6339 struct spdk_nvme_ns *ns; 6340 struct spdk_nvme_qpair *qpair; 6341 int ret; 6342 6343 if (spdk_nvme_cpl_is_error(cpl)) { 6344 goto out_complete_io_nvme_cpl; 6345 } 6346 6347 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 6348 ret = -ENXIO; 6349 goto out_complete_io_ret; 6350 } 6351 6352 ns = bio->io_path->nvme_ns->ns; 6353 qpair = bio->io_path->qpair->qpair; 6354 6355 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6356 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 6357 sizeof(bio->zone_report_buf->descs[0]); 6358 6359 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 6360 ret = -EINVAL; 6361 goto out_complete_io_ret; 6362 } 6363 6364 if (!bio->zone_report_buf->nr_zones) { 6365 ret = -EINVAL; 6366 goto out_complete_io_ret; 6367 } 6368 6369 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 6370 ret = fill_zone_from_report(&info[bio->handled_zones], 6371 &bio->zone_report_buf->descs[i]); 6372 if (ret) { 6373 goto out_complete_io_ret; 6374 } 6375 bio->handled_zones++; 6376 } 6377 6378 if (bio->handled_zones < zones_to_copy) { 6379 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6380 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 6381 
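		/* Descriptive note (added for clarity): not all requested zones were covered by
		 * this report. Clear the report buffer and issue another Report Zones command,
		 * starting at the SLBA of the first zone that has not yet been copied into the
		 * caller's spdk_bdev_zone_info array.
		 */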
6382 memset(bio->zone_report_buf, 0, zone_report_bufsize); 6383 ret = spdk_nvme_zns_report_zones(ns, qpair, 6384 bio->zone_report_buf, zone_report_bufsize, 6385 slba, SPDK_NVME_ZRA_LIST_ALL, true, 6386 bdev_nvme_get_zone_info_done, bio); 6387 if (!ret) { 6388 return; 6389 } else { 6390 goto out_complete_io_ret; 6391 } 6392 } 6393 6394 out_complete_io_nvme_cpl: 6395 free(bio->zone_report_buf); 6396 bio->zone_report_buf = NULL; 6397 bdev_nvme_io_complete_nvme_status(bio, cpl); 6398 return; 6399 6400 out_complete_io_ret: 6401 free(bio->zone_report_buf); 6402 bio->zone_report_buf = NULL; 6403 bdev_nvme_io_complete(bio, ret); 6404 } 6405 6406 static void 6407 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 6408 { 6409 struct nvme_bdev_io *bio = ref; 6410 6411 bdev_nvme_io_complete_nvme_status(bio, cpl); 6412 } 6413 6414 static void 6415 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 6416 { 6417 struct nvme_bdev_io *bio = ctx; 6418 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6419 const struct spdk_nvme_cpl *cpl = &bio->cpl; 6420 6421 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 6422 6423 __bdev_nvme_io_complete(bdev_io, 0, cpl); 6424 } 6425 6426 static void 6427 bdev_nvme_abort_complete(void *ctx) 6428 { 6429 struct nvme_bdev_io *bio = ctx; 6430 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6431 6432 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 6433 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 6434 } else { 6435 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 6436 } 6437 } 6438 6439 static void 6440 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 6441 { 6442 struct nvme_bdev_io *bio = ref; 6443 6444 bio->cpl = *cpl; 6445 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 6446 } 6447 6448 static void 6449 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 6450 { 6451 struct nvme_bdev_io *bio = ref; 6452 6453 bio->cpl = *cpl; 6454 spdk_thread_send_msg(bio->orig_thread, 6455 bdev_nvme_admin_passthru_complete_nvme_status, bio); 6456 } 6457 6458 static void 6459 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 6460 { 6461 struct nvme_bdev_io *bio = ref; 6462 struct iovec *iov; 6463 6464 bio->iov_offset = sgl_offset; 6465 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 6466 iov = &bio->iovs[bio->iovpos]; 6467 if (bio->iov_offset < iov->iov_len) { 6468 break; 6469 } 6470 6471 bio->iov_offset -= iov->iov_len; 6472 } 6473 } 6474 6475 static int 6476 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 6477 { 6478 struct nvme_bdev_io *bio = ref; 6479 struct iovec *iov; 6480 6481 assert(bio->iovpos < bio->iovcnt); 6482 6483 iov = &bio->iovs[bio->iovpos]; 6484 6485 *address = iov->iov_base; 6486 *length = iov->iov_len; 6487 6488 if (bio->iov_offset) { 6489 assert(bio->iov_offset <= iov->iov_len); 6490 *address += bio->iov_offset; 6491 *length -= bio->iov_offset; 6492 } 6493 6494 bio->iov_offset += *length; 6495 if (bio->iov_offset == iov->iov_len) { 6496 bio->iovpos++; 6497 bio->iov_offset = 0; 6498 } 6499 6500 return 0; 6501 } 6502 6503 static void 6504 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 6505 { 6506 struct nvme_bdev_io *bio = ref; 6507 struct iovec *iov; 6508 6509 bio->fused_iov_offset = sgl_offset; 6510 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 6511 iov = &bio->fused_iovs[bio->fused_iovpos]; 6512 if 
(bio->fused_iov_offset < iov->iov_len) { 6513 break; 6514 } 6515 6516 bio->fused_iov_offset -= iov->iov_len; 6517 } 6518 } 6519 6520 static int 6521 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 6522 { 6523 struct nvme_bdev_io *bio = ref; 6524 struct iovec *iov; 6525 6526 assert(bio->fused_iovpos < bio->fused_iovcnt); 6527 6528 iov = &bio->fused_iovs[bio->fused_iovpos]; 6529 6530 *address = iov->iov_base; 6531 *length = iov->iov_len; 6532 6533 if (bio->fused_iov_offset) { 6534 assert(bio->fused_iov_offset <= iov->iov_len); 6535 *address += bio->fused_iov_offset; 6536 *length -= bio->fused_iov_offset; 6537 } 6538 6539 bio->fused_iov_offset += *length; 6540 if (bio->fused_iov_offset == iov->iov_len) { 6541 bio->fused_iovpos++; 6542 bio->fused_iov_offset = 0; 6543 } 6544 6545 return 0; 6546 } 6547 6548 static int 6549 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6550 void *md, uint64_t lba_count, uint64_t lba) 6551 { 6552 int rc; 6553 6554 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 6555 lba_count, lba); 6556 6557 bio->iovs = iov; 6558 bio->iovcnt = iovcnt; 6559 bio->iovpos = 0; 6560 bio->iov_offset = 0; 6561 6562 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 6563 bio->io_path->qpair->qpair, 6564 lba, lba_count, 6565 bdev_nvme_no_pi_readv_done, bio, 0, 6566 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6567 md, 0, 0); 6568 6569 if (rc != 0 && rc != -ENOMEM) { 6570 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 6571 } 6572 return rc; 6573 } 6574 6575 static int 6576 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6577 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 6578 struct spdk_bdev_ext_io_opts *ext_opts) 6579 { 6580 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6581 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6582 int rc; 6583 6584 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6585 lba_count, lba); 6586 6587 bio->iovs = iov; 6588 bio->iovcnt = iovcnt; 6589 bio->iovpos = 0; 6590 bio->iov_offset = 0; 6591 6592 if (ext_opts) { 6593 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6594 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6595 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6596 bio->ext_opts.io_flags = flags; 6597 bio->ext_opts.metadata = md; 6598 6599 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 6600 bdev_nvme_readv_done, bio, 6601 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6602 &bio->ext_opts); 6603 } else if (iovcnt == 1) { 6604 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 6605 lba_count, 6606 bdev_nvme_readv_done, bio, 6607 flags, 6608 0, 0); 6609 } else { 6610 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 6611 bdev_nvme_readv_done, bio, flags, 6612 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6613 md, 0, 0); 6614 } 6615 6616 if (rc != 0 && rc != -ENOMEM) { 6617 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 6618 } 6619 return rc; 6620 } 6621 6622 static int 6623 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6624 void *md, uint64_t lba_count, uint64_t lba, 6625 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts) 6626 { 6627 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6628 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6629 int rc; 6630 6631 SPDK_DEBUGLOG(bdev_nvme, "write %" 
PRIu64 " blocks with offset %#" PRIx64 "\n", 6632 lba_count, lba); 6633 6634 bio->iovs = iov; 6635 bio->iovcnt = iovcnt; 6636 bio->iovpos = 0; 6637 bio->iov_offset = 0; 6638 6639 if (ext_opts) { 6640 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 6641 bio->ext_opts.memory_domain = ext_opts->memory_domain; 6642 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 6643 bio->ext_opts.io_flags = flags; 6644 bio->ext_opts.metadata = md; 6645 6646 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 6647 bdev_nvme_writev_done, bio, 6648 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6649 &bio->ext_opts); 6650 } else if (iovcnt == 1) { 6651 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 6652 lba_count, 6653 bdev_nvme_writev_done, bio, 6654 flags, 6655 0, 0); 6656 } else { 6657 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6658 bdev_nvme_writev_done, bio, flags, 6659 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6660 md, 0, 0); 6661 } 6662 6663 if (rc != 0 && rc != -ENOMEM) { 6664 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 6665 } 6666 return rc; 6667 } 6668 6669 static int 6670 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6671 void *md, uint64_t lba_count, uint64_t zslba, 6672 uint32_t flags) 6673 { 6674 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6675 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6676 int rc; 6677 6678 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 6679 lba_count, zslba); 6680 6681 bio->iovs = iov; 6682 bio->iovcnt = iovcnt; 6683 bio->iovpos = 0; 6684 bio->iov_offset = 0; 6685 6686 if (iovcnt == 1) { 6687 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 6688 lba_count, 6689 bdev_nvme_zone_appendv_done, bio, 6690 flags, 6691 0, 0); 6692 } else { 6693 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 6694 bdev_nvme_zone_appendv_done, bio, flags, 6695 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6696 md, 0, 0); 6697 } 6698 6699 if (rc != 0 && rc != -ENOMEM) { 6700 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 6701 } 6702 return rc; 6703 } 6704 6705 static int 6706 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 6707 void *md, uint64_t lba_count, uint64_t lba, 6708 uint32_t flags) 6709 { 6710 int rc; 6711 6712 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 6713 lba_count, lba); 6714 6715 bio->iovs = iov; 6716 bio->iovcnt = iovcnt; 6717 bio->iovpos = 0; 6718 bio->iov_offset = 0; 6719 6720 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 6721 bio->io_path->qpair->qpair, 6722 lba, lba_count, 6723 bdev_nvme_comparev_done, bio, flags, 6724 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 6725 md, 0, 0); 6726 6727 if (rc != 0 && rc != -ENOMEM) { 6728 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 6729 } 6730 return rc; 6731 } 6732 6733 static int 6734 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 6735 struct iovec *write_iov, int write_iovcnt, 6736 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 6737 { 6738 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6739 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6740 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6741 int rc; 6742 6743 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" 
PRIx64 "\n", 6744 lba_count, lba); 6745 6746 bio->iovs = cmp_iov; 6747 bio->iovcnt = cmp_iovcnt; 6748 bio->iovpos = 0; 6749 bio->iov_offset = 0; 6750 bio->fused_iovs = write_iov; 6751 bio->fused_iovcnt = write_iovcnt; 6752 bio->fused_iovpos = 0; 6753 bio->fused_iov_offset = 0; 6754 6755 if (bdev_io->num_retries == 0) { 6756 bio->first_fused_submitted = false; 6757 bio->first_fused_completed = false; 6758 } 6759 6760 if (!bio->first_fused_submitted) { 6761 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6762 memset(&bio->cpl, 0, sizeof(bio->cpl)); 6763 6764 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 6765 bdev_nvme_comparev_and_writev_done, bio, flags, 6766 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 6767 if (rc == 0) { 6768 bio->first_fused_submitted = true; 6769 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 6770 } else { 6771 if (rc != -ENOMEM) { 6772 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 6773 } 6774 return rc; 6775 } 6776 } 6777 6778 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 6779 6780 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 6781 bdev_nvme_comparev_and_writev_done, bio, flags, 6782 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 6783 if (rc != 0 && rc != -ENOMEM) { 6784 SPDK_ERRLOG("write failed: rc = %d\n", rc); 6785 rc = 0; 6786 } 6787 6788 return rc; 6789 } 6790 6791 static int 6792 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6793 { 6794 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 6795 struct spdk_nvme_dsm_range *range; 6796 uint64_t offset, remaining; 6797 uint64_t num_ranges_u64; 6798 uint16_t num_ranges; 6799 int rc; 6800 6801 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 6802 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6803 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 6804 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 6805 return -EINVAL; 6806 } 6807 num_ranges = (uint16_t)num_ranges_u64; 6808 6809 offset = offset_blocks; 6810 remaining = num_blocks; 6811 range = &dsm_ranges[0]; 6812 6813 /* Fill max-size ranges until the remaining blocks fit into one range */ 6814 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 6815 range->attributes.raw = 0; 6816 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6817 range->starting_lba = offset; 6818 6819 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6820 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 6821 range++; 6822 } 6823 6824 /* Final range describes the remaining blocks */ 6825 range->attributes.raw = 0; 6826 range->length = remaining; 6827 range->starting_lba = offset; 6828 6829 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 6830 bio->io_path->qpair->qpair, 6831 SPDK_NVME_DSM_ATTR_DEALLOCATE, 6832 dsm_ranges, num_ranges, 6833 bdev_nvme_queued_done, bio); 6834 6835 return rc; 6836 } 6837 6838 static int 6839 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 6840 { 6841 if (num_blocks > UINT16_MAX + 1) { 6842 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 6843 return -EINVAL; 6844 } 6845 6846 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 6847 bio->io_path->qpair->qpair, 6848 offset_blocks, num_blocks, 6849 bdev_nvme_queued_done, bio, 6850 0); 6851 } 6852 6853 static int 6854 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t 
zone_id, uint32_t num_zones, 6855 struct spdk_bdev_zone_info *info) 6856 { 6857 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6858 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6859 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 6860 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 6861 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 6862 6863 if (zone_id % zone_size != 0) { 6864 return -EINVAL; 6865 } 6866 6867 if (num_zones > total_zones || !num_zones) { 6868 return -EINVAL; 6869 } 6870 6871 assert(!bio->zone_report_buf); 6872 bio->zone_report_buf = calloc(1, zone_report_bufsize); 6873 if (!bio->zone_report_buf) { 6874 return -ENOMEM; 6875 } 6876 6877 bio->handled_zones = 0; 6878 6879 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 6880 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 6881 bdev_nvme_get_zone_info_done, bio); 6882 } 6883 6884 static int 6885 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 6886 enum spdk_bdev_zone_action action) 6887 { 6888 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6889 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6890 6891 switch (action) { 6892 case SPDK_BDEV_ZONE_CLOSE: 6893 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 6894 bdev_nvme_zone_management_done, bio); 6895 case SPDK_BDEV_ZONE_FINISH: 6896 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 6897 bdev_nvme_zone_management_done, bio); 6898 case SPDK_BDEV_ZONE_OPEN: 6899 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 6900 bdev_nvme_zone_management_done, bio); 6901 case SPDK_BDEV_ZONE_RESET: 6902 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 6903 bdev_nvme_zone_management_done, bio); 6904 case SPDK_BDEV_ZONE_OFFLINE: 6905 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 6906 bdev_nvme_zone_management_done, bio); 6907 default: 6908 return -EINVAL; 6909 } 6910 } 6911 6912 static void 6913 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 6914 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 6915 { 6916 struct nvme_io_path *io_path; 6917 struct nvme_ctrlr *nvme_ctrlr; 6918 uint32_t max_xfer_size; 6919 int rc = -ENXIO; 6920 6921 /* Choose the first ctrlr which is not failed. */ 6922 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 6923 nvme_ctrlr = io_path->qpair->ctrlr; 6924 6925 /* We should skip any unavailable nvme_ctrlr rather than checking 6926 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
6927 */ 6928 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 6929 continue; 6930 } 6931 6932 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 6933 6934 if (nbytes > max_xfer_size) { 6935 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6936 rc = -EINVAL; 6937 goto err; 6938 } 6939 6940 bio->io_path = io_path; 6941 bio->orig_thread = spdk_get_thread(); 6942 6943 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 6944 bdev_nvme_admin_passthru_done, bio); 6945 if (rc == 0) { 6946 return; 6947 } 6948 } 6949 6950 err: 6951 bdev_nvme_admin_passthru_complete(bio, rc); 6952 } 6953 6954 static int 6955 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6956 void *buf, size_t nbytes) 6957 { 6958 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6959 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6960 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6961 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6962 6963 if (nbytes > max_xfer_size) { 6964 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6965 return -EINVAL; 6966 } 6967 6968 /* 6969 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 6970 * so fill it out automatically. 6971 */ 6972 cmd->nsid = spdk_nvme_ns_get_id(ns); 6973 6974 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 6975 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 6976 } 6977 6978 static int 6979 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 6980 void *buf, size_t nbytes, void *md_buf, size_t md_len) 6981 { 6982 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 6983 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 6984 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 6985 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 6986 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 6987 6988 if (nbytes > max_xfer_size) { 6989 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 6990 return -EINVAL; 6991 } 6992 6993 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 6994 SPDK_ERRLOG("invalid meta data buffer size\n"); 6995 return -EINVAL; 6996 } 6997 6998 /* 6999 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7000 * so fill it out automatically. 7001 */ 7002 cmd->nsid = spdk_nvme_ns_get_id(ns); 7003 7004 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 7005 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 7006 } 7007 7008 static void 7009 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7010 struct nvme_bdev_io *bio_to_abort) 7011 { 7012 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7013 struct nvme_io_path *io_path; 7014 struct nvme_ctrlr *nvme_ctrlr; 7015 int rc = 0; 7016 7017 bio->orig_thread = spdk_get_thread(); 7018 7019 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 7020 if (rc == 0) { 7021 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7022 return; 7023 } 7024 7025 rc = 0; 7026 7027 /* Even admin commands, they were submitted to only nvme_ctrlrs which were 7028 * on any io_path. So traverse the io_path list for not only I/O commands 7029 * but also admin commands. 
7030 */ 7031 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7032 nvme_ctrlr = io_path->qpair->ctrlr; 7033 7034 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 7035 io_path->qpair->qpair, 7036 bio_to_abort, 7037 bdev_nvme_abort_done, bio); 7038 if (rc == -ENOENT) { 7039 /* If no command was found in I/O qpair, the target command may be 7040 * admin command. 7041 */ 7042 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, 7043 NULL, 7044 bio_to_abort, 7045 bdev_nvme_abort_done, bio); 7046 } 7047 7048 if (rc != -ENOENT) { 7049 break; 7050 } 7051 } 7052 7053 if (rc != 0) { 7054 /* If no command was found or there was any error, complete the abort 7055 * request with failure. 7056 */ 7057 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7058 } 7059 } 7060 7061 static int 7062 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 7063 uint64_t num_blocks) 7064 { 7065 struct spdk_nvme_scc_source_range range = { 7066 .slba = src_offset_blocks, 7067 .nlb = num_blocks - 1 7068 }; 7069 7070 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 7071 bio->io_path->qpair->qpair, 7072 &range, 1, dst_offset_blocks, 7073 bdev_nvme_queued_done, bio); 7074 } 7075 7076 static void 7077 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 7078 { 7079 const char *action; 7080 7081 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 7082 action = "reset"; 7083 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 7084 action = "abort"; 7085 } else { 7086 action = "none"; 7087 } 7088 7089 spdk_json_write_object_begin(w); 7090 7091 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 7092 7093 spdk_json_write_named_object_begin(w, "params"); 7094 spdk_json_write_named_string(w, "action_on_timeout", action); 7095 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 7096 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 7097 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 7098 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 7099 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 7100 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 7101 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 7102 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 7103 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 7104 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 7105 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 7106 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 7107 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 7108 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 7109 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 7110 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 7111 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 7112 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 7113 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 7114 
spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 7115 spdk_json_write_object_end(w); 7116 7117 spdk_json_write_object_end(w); 7118 } 7119 7120 static void 7121 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 7122 { 7123 struct spdk_nvme_transport_id trid; 7124 7125 spdk_json_write_object_begin(w); 7126 7127 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 7128 7129 spdk_json_write_named_object_begin(w, "params"); 7130 spdk_json_write_named_string(w, "name", ctx->name); 7131 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 7132 7133 trid = ctx->trid; 7134 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 7135 nvme_bdev_dump_trid_json(&trid, w); 7136 7137 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 7138 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 7139 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 7140 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 7141 ctx->bdev_opts.fast_io_fail_timeout_sec); 7142 spdk_json_write_object_end(w); 7143 7144 spdk_json_write_object_end(w); 7145 } 7146 7147 static void 7148 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 7149 struct nvme_ctrlr *nvme_ctrlr) 7150 { 7151 struct spdk_nvme_transport_id *trid; 7152 7153 if (nvme_ctrlr->opts.from_discovery_service) { 7154 /* Do not emit an RPC for this - it will be implicitly 7155 * covered by a separate bdev_nvme_start_discovery or 7156 * bdev_nvme_start_mdns_discovery RPC. 7157 */ 7158 return; 7159 } 7160 7161 trid = &nvme_ctrlr->active_path_id->trid; 7162 7163 spdk_json_write_object_begin(w); 7164 7165 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 7166 7167 spdk_json_write_named_object_begin(w, "params"); 7168 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 7169 nvme_bdev_dump_trid_json(trid, w); 7170 spdk_json_write_named_bool(w, "prchk_reftag", 7171 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 7172 spdk_json_write_named_bool(w, "prchk_guard", 7173 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 7174 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 7175 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 7176 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 7177 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 7178 7179 spdk_json_write_object_end(w); 7180 7181 spdk_json_write_object_end(w); 7182 } 7183 7184 static void 7185 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 7186 { 7187 spdk_json_write_object_begin(w); 7188 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 7189 7190 spdk_json_write_named_object_begin(w, "params"); 7191 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 7192 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 7193 spdk_json_write_object_end(w); 7194 7195 spdk_json_write_object_end(w); 7196 } 7197 7198 static int 7199 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 7200 { 7201 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7202 struct nvme_ctrlr *nvme_ctrlr; 7203 struct discovery_ctx *ctx; 7204 7205 bdev_nvme_opts_config_json(w); 7206 7207 pthread_mutex_lock(&g_bdev_nvme_mutex); 7208 7209 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7210 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, 
tailq) { 7211 nvme_ctrlr_config_json(w, nvme_ctrlr); 7212 } 7213 } 7214 7215 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7216 if (!ctx->from_mdns_discovery_service) { 7217 bdev_nvme_discovery_config_json(w, ctx); 7218 } 7219 } 7220 7221 bdev_nvme_mdns_discovery_config_json(w); 7222 7223 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 7224 * before enabling hotplug poller. 7225 */ 7226 bdev_nvme_hotplug_config_json(w); 7227 7228 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7229 return 0; 7230 } 7231 7232 struct spdk_nvme_ctrlr * 7233 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 7234 { 7235 struct nvme_bdev *nbdev; 7236 struct nvme_ns *nvme_ns; 7237 7238 if (!bdev || bdev->module != &nvme_if) { 7239 return NULL; 7240 } 7241 7242 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 7243 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 7244 assert(nvme_ns != NULL); 7245 7246 return nvme_ns->ctrlr->ctrlr; 7247 } 7248 7249 void 7250 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 7251 { 7252 struct nvme_ns *nvme_ns = io_path->nvme_ns; 7253 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 7254 const struct spdk_nvme_ctrlr_data *cdata; 7255 const struct spdk_nvme_transport_id *trid; 7256 const char *adrfam_str; 7257 7258 spdk_json_write_object_begin(w); 7259 7260 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 7261 7262 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 7263 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 7264 7265 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 7266 spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path); 7267 spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path)); 7268 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 7269 7270 spdk_json_write_named_object_begin(w, "transport"); 7271 spdk_json_write_named_string(w, "trtype", trid->trstring); 7272 spdk_json_write_named_string(w, "traddr", trid->traddr); 7273 if (trid->trsvcid[0] != '\0') { 7274 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 7275 } 7276 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 7277 if (adrfam_str) { 7278 spdk_json_write_named_string(w, "adrfam", adrfam_str); 7279 } 7280 spdk_json_write_object_end(w); 7281 7282 spdk_json_write_object_end(w); 7283 } 7284 7285 void 7286 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 7287 { 7288 struct discovery_ctx *ctx; 7289 struct discovery_entry_ctx *entry_ctx; 7290 7291 spdk_json_write_array_begin(w); 7292 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7293 spdk_json_write_object_begin(w); 7294 spdk_json_write_named_string(w, "name", ctx->name); 7295 7296 spdk_json_write_named_object_begin(w, "trid"); 7297 nvme_bdev_dump_trid_json(&ctx->trid, w); 7298 spdk_json_write_object_end(w); 7299 7300 spdk_json_write_named_array_begin(w, "referrals"); 7301 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7302 spdk_json_write_object_begin(w); 7303 spdk_json_write_named_object_begin(w, "trid"); 7304 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 7305 spdk_json_write_object_end(w); 7306 spdk_json_write_object_end(w); 7307 } 7308 spdk_json_write_array_end(w); 7309 7310 spdk_json_write_object_end(w); 7311 } 7312 spdk_json_write_array_end(w); 7313 } 7314 7315 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 7316 7317 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 7318 { 7319 struct spdk_trace_tpoint_opts 
opts[] = { 7320 { 7321 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 7322 OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 7323 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7324 }, 7325 { 7326 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 7327 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 7328 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 7329 } 7330 }; 7331 7332 7333 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 7334 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 7335 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 7336 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 7337 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 7338 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 7339 } 7340
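/*
 * Illustrative sketch only (not authoritative): the general shape of the
 * "bdev_nvme_set_options" object that bdev_nvme_opts_config_json() above emits
 * when the runtime configuration is saved. The parameter names mirror the
 * spdk_json_write_named_*() calls in that function; the values shown here are
 * arbitrary placeholders, not the module defaults.
 *
 * {
 *   "method": "bdev_nvme_set_options",
 *   "params": {
 *     "action_on_timeout": "none",
 *     "timeout_us": 0,
 *     "timeout_admin_us": 0,
 *     "transport_retry_count": 1,
 *     "bdev_retry_count": 0,
 *     "delay_cmd_submit": true,
 *     ... remaining parameters follow the write calls above ...
 *     "io_path_stat": false
 *   }
 * }
 */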