/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel_engine.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 *  being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer for a fused command (e.g. compare-and-write). */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position in fused_iovs. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

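/* Namespaces of a controller are kept in a red-black tree keyed by namespace ID
 * (see nvme_ns_cmp() above). nvme_ctrlr_get_ns() uses it for O(log n) lookup by
 * nsid, and RB_MIN()/RB_NEXT() provide the ordered iteration behind
 * nvme_ctrlr_get_first_active_ns()/nvme_ctrlr_get_next_active_ns() below.
 */
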
struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

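/* Controller teardown is asynchronous: nvme_ctrlr_delete() below stops the
 * reconnect timer and the adminq poller, then starts spdk_nvme_detach_async()
 * and polls it with nvme_detach_poller() above. Once the detach finishes (or
 * cannot be started), _nvme_ctrlr_delete() frees the remaining per-controller
 * state.
 */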
static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	nbdev_ch->current_io_path = NULL;

	return 0;
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;

	nbdev_ch->current_io_path = NULL;

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	free(io_path);
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);
	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_io_path_is_connected(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(io_path->qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) !=
	    SPDK_NVME_QPAIR_FAILURE_NONE) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_failed(struct nvme_io_path *io_path)
{
	struct nvme_ctrlr *nvme_ctrlr;

	nvme_ctrlr = io_path->qpair->ctrlr;

	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	next_path = STAILQ_NEXT(prev_path, stailq);
	if (next_path != NULL) {
		return next_path;
	} else {
		return STAILQ_FIRST(&nbdev_ch->io_path_list);
	}
}

static struct nvme_io_path *
bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_io_path *prev)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, prev);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_connected(io_path) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	/* We come here only if there is no optimized path. Cache even non_optimized
	 * path for load balance across multiple non_optimized paths.
	 */
	nbdev_ch->current_io_path = non_optimized;
	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *non_optimized = NULL;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			nbdev_ch->current_io_path = io_path;
			return io_path;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (non_optimized == NULL) {
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_unlikely(nbdev_ch->current_io_path == NULL)) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	}

	if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) {
		return nbdev_ch->current_io_path;
	} else {
		return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_io_path_is_connected(io_path) ||
		    !nvme_io_path_is_failed(io_path)) {
			return true;
		}
	}

	return false;
}

static bool
any_ctrlr_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (!nvme_io_path_is_failed(io_path)) {
			return true;
		}
	}

	return false;
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_submit_request(ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

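/* Retried I/Os are queued on nbdev_ch->retry_io_list sorted by ascending
 * retry_ticks. bdev_nvme_retry_ios() above resubmits every entry whose deadline
 * has expired and re-arms retry_io_poller for the next earliest deadline;
 * bdev_nvme_queue_retry_io() below inserts new entries in timestamp order.
 */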
static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		goto complete;
	}

	if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 &&
				     bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	assert(bio->io_path != NULL);
	nvme_ctrlr = bio->io_path->qpair->ctrlr;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(bio->io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		nbdev_ch->current_io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				bio->io_path->nvme_ns->ana_state_updating = true;
			}
		}
		delay_ms = 0;
	} else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) {
		goto complete;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			delay_ms = 0;
		}
	}

	if (any_io_path_may_become_available(nbdev_ch)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

		nbdev_ch->current_io_path = NULL;

		if (any_io_path_may_become_available(nbdev_ch)) {
			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
			return;
		}

	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	spdk_bdev_io_complete(bdev_io, io_status);
}

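/* Unlike the I/O completion path above, admin passthrough commands are not bound
 * to a namespace, so retry eligibility below only requires that some controller
 * may come back (any_ctrlr_may_become_available()) rather than an I/O path with
 * an accessible namespace.
 */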
static inline void
bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

		if (any_ctrlr_may_become_available(nbdev_ch)) {
			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
			return;
		}

	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	spdk_bdev_io_complete(bdev_io, io_status);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		io_path->nbdev_ch->current_io_path = NULL;
	}
}

static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr,
			       spdk_channel_for_each_cpl cpl)
{
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      cpl);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* If we are already in a full reset sequence, we do not have
			 * to restart it. Just move to the next ctrlr_channel.
			 */
			SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
				      qpair);
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* qpair was disconnected unexpectedly. Reset controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover(nvme_qpair->ctrlr, false);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

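/* bdev_nvme_poll_adminq() below drives the admin queue. When processing fails,
 * a disconnect that was requested on purpose (disconnected_cb set by
 * nvme_ctrlr_disconnect()) is completed by invoking that callback; any other
 * failure triggers bdev_nvme_failover() to recover the controller.
 */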
static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (rc == -ENXIO && disconnected_cb != NULL) {
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover(nvme_ctrlr, false);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr, NULL);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
_bdev_nvme_unregister_dev_cb(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	free(nvme_disk->disk.name);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);

	return 0;
}

static int
bdev_nvme_flush(struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
{
	bdev_nvme_io_complete(bio, 0);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
	struct spdk_bdev_io *bdev_io;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, status);
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	path_id->is_failed = true;

	if (next_path) {
		assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
			       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
		nvme_ctrlr->active_path_id = next_path;
		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
		} else {
			free(path_id);
		}
	}
}

static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc __attribute__((unused));

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* Disconnect fails if ctrlr is already resetting or removed. Both cases are
	 * not possible. Reset is controlled and the callback to hot remove is called
	 * when ctrlr is hot removed.
	 */
	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	assert(rc == 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;

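/* bdev_nvme_check_op_after_reset() below picks the follow-up action once a reset
 * (or a failed reconnect attempt) finishes, and _bdev_nvme_reset_complete()
 * executes it: OP_COMPLETE_PENDING_DESTRUCT finishes a deferred unregister,
 * OP_NONE leaves the controller as is, OP_DESTRUCT deletes it because
 * ctrlr_loss_timeout_sec has expired, and OP_DELAYED_RECONNECT fails over to the
 * next trid and schedules a reconnect after reconnect_delay_sec.
 */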
static _bdev_nvme_op_after_reset
bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		/* Complete pending destruct after reset completes. */
		return OP_COMPLETE_PENDING_DESTRUCT;
	} else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		nvme_ctrlr->reset_start_tsc = 0;
		return OP_NONE;
	} else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
		return OP_DESTRUCT;
	} else {
		if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
			nvme_ctrlr->fast_io_fail_timedout = true;
		}
		bdev_nvme_failover_trid(nvme_ctrlr, false);
		return OP_DELAYED_RECONNECT;
	}
}

static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);

static int
bdev_nvme_reconnect_delay_timer_expired(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	assert(nvme_ctrlr->reconnect_is_delayed == true);
	nvme_ctrlr->reconnect_is_delayed = false;

	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return SPDK_POLLER_BUSY;
	}

	assert(nvme_ctrlr->resetting == false);
	nvme_ctrlr->resetting = true;

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);

	bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
{
	spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);

	assert(nvme_ctrlr->reconnect_is_delayed == false);
	nvme_ctrlr->reconnect_is_delayed = true;

	assert(nvme_ctrlr->reconnect_delay_timer == NULL);
	nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
					    nvme_ctrlr,
					    nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC);
}

static void
_bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
	struct nvme_path_id *path_id;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
	enum bdev_nvme_op_after_reset op_after_reset;

	assert(nvme_ctrlr->thread == spdk_get_thread());

	nvme_ctrlr->reset_cb_fn = NULL;
	nvme_ctrlr->reset_cb_arg = NULL;

	if (!success) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id != NULL);
	assert(path_id == nvme_ctrlr->active_path_id);

	path_id->is_failed = !success;

	op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
		reset_cb_fn(reset_cb_arg, success);
	}

	switch (op_after_reset) {
	case OP_COMPLETE_PENDING_DESTRUCT:
		nvme_ctrlr_unregister(nvme_ctrlr);
		break;
	case OP_DESTRUCT:
		_bdev_nvme_delete(nvme_ctrlr, false);
		break;
	case OP_DELAYED_RECONNECT:
		nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
		break;
	default:
		break;
	}
}

static void
bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_complete_pending_resets,
			      success ? NULL : (void *)0x1,
			      _bdev_nvme_reset_complete);
}

static void
bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	bdev_nvme_reset_complete(nvme_ctrlr, false);
}

static void
bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_qpair *nvme_qpair;

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair);

		/* The current full reset sequence will move to the next
		 * ctrlr_channel after the qpair is actually disconnected.
		 */
		assert(ctrlr_ch->reset_iter == NULL);
		ctrlr_ch->reset_iter = i;
	} else {
		spdk_for_each_channel_continue(i, 0);
	}
}

static void
bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	if (status == 0) {
		bdev_nvme_reset_complete(nvme_ctrlr, true);
	} else {
		/* Delete the added qpairs and quiesce ctrlr to make the states clean. */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_destroy_qpair,
				      NULL,
				      bdev_nvme_reset_create_qpairs_failed);
	}
}

static void
bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, rc);
}

static int
bdev_nvme_reconnect_ctrlr_poll(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc = -ETIMEDOUT;

	if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
		rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
		if (rc == -EAGAIN) {
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	if (rc == 0) {
		/* Recreate all of the I/O queue pairs */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_create_qpair,
				      NULL,
				      bdev_nvme_reset_create_qpairs_done);
	} else {
		bdev_nvme_reset_complete(nvme_ctrlr, false);
	}
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
{
	spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);

	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
					  nvme_ctrlr, 0);
}

static void
bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	assert(status == 0);

	nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
}

static void
bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
{
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_reset_destroy_qpair,
			      NULL,
			      bdev_nvme_reset_ctrlr);
}

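/* Full reset sequence, starting from _bdev_nvme_reset() below:
 *   1. disconnect the I/O qpair on every ctrlr_channel (bdev_nvme_reset_destroy_qpair),
 *   2. disconnect the controller and reconnect it asynchronously
 *      (bdev_nvme_reset_ctrlr -> bdev_nvme_reconnect_ctrlr -> bdev_nvme_reconnect_ctrlr_poll),
 *   3. recreate the I/O qpairs on every channel (bdev_nvme_reset_create_qpair),
 *   4. finish via bdev_nvme_reset_complete(), which flushes pending resets and
 *      decides the follow-up action in _bdev_nvme_reset_complete().
 */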
static void
_bdev_nvme_reset(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	assert(nvme_ctrlr->resetting == true);
	assert(nvme_ctrlr->thread == spdk_get_thread());

	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);

	/* First, delete all NVMe I/O queue pairs. */
	bdev_nvme_reset_destroy_qpairs(nvme_ctrlr);
}

static int
bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return -ENXIO;
	}

	if (nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EBUSY;
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Reconnect is already scheduled.\n");
		return -EBUSY;
	}

	nvme_ctrlr->resetting = true;

	assert(nvme_ctrlr->reset_start_tsc == 0);
	nvme_ctrlr->reset_start_tsc = spdk_get_ticks();

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr);
	return 0;
}

int
bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		nvme_ctrlr->reset_cb_fn = cb_fn;
		nvme_ctrlr->reset_cb_arg = cb_arg;
	}
	return rc;
}

static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);

static void
bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
{
	enum spdk_bdev_io_status io_status;

	if (bio->cpl.cdw0 == 0) {
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
}

static void
_bdev_nvme_reset_io_continue(void *ctx)
{
	struct nvme_bdev_io *bio = ctx;
	struct nvme_io_path *prev_io_path, *next_io_path;
	int rc;

	prev_io_path = bio->io_path;
	bio->io_path = NULL;

	if (bio->cpl.cdw0 != 0) {
		goto complete;
	}

	next_io_path = STAILQ_NEXT(prev_io_path, stailq);
	if (next_io_path == NULL) {
		goto complete;
	}

	rc = _bdev_nvme_reset_io(next_io_path, bio);
	if (rc == 0) {
		return;
	}

	bio->cpl.cdw0 = 1;

complete:
	bdev_nvme_reset_io_complete(bio);
}

static void
bdev_nvme_reset_io_continue(void *cb_arg, bool success)
{
	struct nvme_bdev_io *bio = cb_arg;

	bio->cpl.cdw0 = !success;

	spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio);
}

static int
_bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
{
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct spdk_bdev_io *bdev_io;
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		assert(bio->io_path == NULL);
		bio->io_path = io_path;

		assert(nvme_ctrlr->reset_cb_fn == NULL);
		assert(nvme_ctrlr->reset_cb_arg == NULL);
		nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue;
		nvme_ctrlr->reset_cb_arg = bio;
	} else if (rc == -EBUSY) {
		ctrlr_ch = io_path->qpair->ctrlr_ch;
		assert(ctrlr_ch != NULL);
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		bdev_io = spdk_bdev_io_from_ctx(bio);
		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}

static void
bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
{
	struct nvme_io_path *io_path;
	int rc;

	bio->cpl.cdw0 = 0;
	bio->orig_thread = spdk_get_thread();

	/* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now.
	 *
	 * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially.
	 * This will be done in the following patches.
	 */
	io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
	assert(io_path != NULL);

	rc = _bdev_nvme_reset_io(io_path, bio);
	if (rc != 0) {
		bio->cpl.cdw0 = 1;
		bdev_nvme_reset_io_complete(bio);
	}
}

static int
bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -ENXIO;
	}

	if (nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EBUSY;
	}

	bdev_nvme_failover_trid(nvme_ctrlr, remove);

	if (nvme_ctrlr->reconnect_is_delayed) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Reconnect is already scheduled.\n");

*/ 1987 return 0; 1988 } 1989 1990 nvme_ctrlr->resetting = true; 1991 1992 assert(nvme_ctrlr->reset_start_tsc == 0); 1993 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 1994 1995 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1996 1997 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); 1998 return 0; 1999 } 2000 2001 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2002 uint64_t num_blocks); 2003 2004 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2005 uint64_t num_blocks); 2006 2007 static void 2008 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2009 bool success) 2010 { 2011 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2012 struct spdk_bdev *bdev = bdev_io->bdev; 2013 int ret; 2014 2015 if (!success) { 2016 ret = -EINVAL; 2017 goto exit; 2018 } 2019 2020 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2021 ret = -ENXIO; 2022 goto exit; 2023 } 2024 2025 ret = bdev_nvme_readv(bio, 2026 bdev_io->u.bdev.iovs, 2027 bdev_io->u.bdev.iovcnt, 2028 bdev_io->u.bdev.md_buf, 2029 bdev_io->u.bdev.num_blocks, 2030 bdev_io->u.bdev.offset_blocks, 2031 bdev->dif_check_flags, 2032 bdev_io->u.bdev.ext_opts); 2033 2034 exit: 2035 if (spdk_unlikely(ret != 0)) { 2036 bdev_nvme_io_complete(bio, ret); 2037 } 2038 } 2039 2040 static void 2041 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2042 { 2043 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 2044 struct spdk_bdev *bdev = bdev_io->bdev; 2045 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2046 struct nvme_bdev_io *nbdev_io_to_abort; 2047 int rc = 0; 2048 2049 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 2050 if (spdk_unlikely(!nbdev_io->io_path)) { 2051 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 2052 rc = -ENXIO; 2053 goto exit; 2054 } 2055 2056 /* Admin commands do not use the optimal I/O path. 2057 * Simply fall through even if it is not found. 
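		 * bdev_nvme_admin_passthru() does not rely on nbdev_io->io_path; the admin
		 * case below clears it before submitting.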
2058 */ 2059 } 2060 2061 switch (bdev_io->type) { 2062 case SPDK_BDEV_IO_TYPE_READ: 2063 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2064 rc = bdev_nvme_readv(nbdev_io, 2065 bdev_io->u.bdev.iovs, 2066 bdev_io->u.bdev.iovcnt, 2067 bdev_io->u.bdev.md_buf, 2068 bdev_io->u.bdev.num_blocks, 2069 bdev_io->u.bdev.offset_blocks, 2070 bdev->dif_check_flags, 2071 bdev_io->u.bdev.ext_opts); 2072 } else { 2073 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2074 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2075 rc = 0; 2076 } 2077 break; 2078 case SPDK_BDEV_IO_TYPE_WRITE: 2079 rc = bdev_nvme_writev(nbdev_io, 2080 bdev_io->u.bdev.iovs, 2081 bdev_io->u.bdev.iovcnt, 2082 bdev_io->u.bdev.md_buf, 2083 bdev_io->u.bdev.num_blocks, 2084 bdev_io->u.bdev.offset_blocks, 2085 bdev->dif_check_flags, 2086 bdev_io->u.bdev.ext_opts); 2087 break; 2088 case SPDK_BDEV_IO_TYPE_COMPARE: 2089 rc = bdev_nvme_comparev(nbdev_io, 2090 bdev_io->u.bdev.iovs, 2091 bdev_io->u.bdev.iovcnt, 2092 bdev_io->u.bdev.md_buf, 2093 bdev_io->u.bdev.num_blocks, 2094 bdev_io->u.bdev.offset_blocks, 2095 bdev->dif_check_flags); 2096 break; 2097 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2098 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2099 bdev_io->u.bdev.iovs, 2100 bdev_io->u.bdev.iovcnt, 2101 bdev_io->u.bdev.fused_iovs, 2102 bdev_io->u.bdev.fused_iovcnt, 2103 bdev_io->u.bdev.md_buf, 2104 bdev_io->u.bdev.num_blocks, 2105 bdev_io->u.bdev.offset_blocks, 2106 bdev->dif_check_flags); 2107 break; 2108 case SPDK_BDEV_IO_TYPE_UNMAP: 2109 rc = bdev_nvme_unmap(nbdev_io, 2110 bdev_io->u.bdev.offset_blocks, 2111 bdev_io->u.bdev.num_blocks); 2112 break; 2113 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2114 rc = bdev_nvme_write_zeroes(nbdev_io, 2115 bdev_io->u.bdev.offset_blocks, 2116 bdev_io->u.bdev.num_blocks); 2117 break; 2118 case SPDK_BDEV_IO_TYPE_RESET: 2119 nbdev_io->io_path = NULL; 2120 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2121 break; 2122 case SPDK_BDEV_IO_TYPE_FLUSH: 2123 rc = bdev_nvme_flush(nbdev_io, 2124 bdev_io->u.bdev.offset_blocks, 2125 bdev_io->u.bdev.num_blocks); 2126 break; 2127 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2128 rc = bdev_nvme_zone_appendv(nbdev_io, 2129 bdev_io->u.bdev.iovs, 2130 bdev_io->u.bdev.iovcnt, 2131 bdev_io->u.bdev.md_buf, 2132 bdev_io->u.bdev.num_blocks, 2133 bdev_io->u.bdev.offset_blocks, 2134 bdev->dif_check_flags); 2135 break; 2136 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2137 rc = bdev_nvme_get_zone_info(nbdev_io, 2138 bdev_io->u.zone_mgmt.zone_id, 2139 bdev_io->u.zone_mgmt.num_zones, 2140 bdev_io->u.zone_mgmt.buf); 2141 break; 2142 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2143 rc = bdev_nvme_zone_management(nbdev_io, 2144 bdev_io->u.zone_mgmt.zone_id, 2145 bdev_io->u.zone_mgmt.zone_action); 2146 break; 2147 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2148 nbdev_io->io_path = NULL; 2149 bdev_nvme_admin_passthru(nbdev_ch, 2150 nbdev_io, 2151 &bdev_io->u.nvme_passthru.cmd, 2152 bdev_io->u.nvme_passthru.buf, 2153 bdev_io->u.nvme_passthru.nbytes); 2154 break; 2155 case SPDK_BDEV_IO_TYPE_NVME_IO: 2156 rc = bdev_nvme_io_passthru(nbdev_io, 2157 &bdev_io->u.nvme_passthru.cmd, 2158 bdev_io->u.nvme_passthru.buf, 2159 bdev_io->u.nvme_passthru.nbytes); 2160 break; 2161 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2162 rc = bdev_nvme_io_passthru_md(nbdev_io, 2163 &bdev_io->u.nvme_passthru.cmd, 2164 bdev_io->u.nvme_passthru.buf, 2165 bdev_io->u.nvme_passthru.nbytes, 2166 bdev_io->u.nvme_passthru.md_buf, 2167 bdev_io->u.nvme_passthru.md_len); 2168 break; 2169 case SPDK_BDEV_IO_TYPE_ABORT: 2170 nbdev_io->io_path = NULL; 2171 
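		/* The I/O to abort is identified by its nvme_bdev_io, taken from the victim
		 * bdev_io's driver context.
		 */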
nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2172 bdev_nvme_abort(nbdev_ch, 2173 nbdev_io, 2174 nbdev_io_to_abort); 2175 break; 2176 default: 2177 rc = -EINVAL; 2178 break; 2179 } 2180 2181 exit: 2182 if (spdk_unlikely(rc != 0)) { 2183 bdev_nvme_io_complete(nbdev_io, rc); 2184 } 2185 } 2186 2187 static bool 2188 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 2189 { 2190 struct nvme_bdev *nbdev = ctx; 2191 struct nvme_ns *nvme_ns; 2192 struct spdk_nvme_ns *ns; 2193 struct spdk_nvme_ctrlr *ctrlr; 2194 const struct spdk_nvme_ctrlr_data *cdata; 2195 2196 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2197 assert(nvme_ns != NULL); 2198 ns = nvme_ns->ns; 2199 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2200 2201 switch (io_type) { 2202 case SPDK_BDEV_IO_TYPE_READ: 2203 case SPDK_BDEV_IO_TYPE_WRITE: 2204 case SPDK_BDEV_IO_TYPE_RESET: 2205 case SPDK_BDEV_IO_TYPE_FLUSH: 2206 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2207 case SPDK_BDEV_IO_TYPE_NVME_IO: 2208 case SPDK_BDEV_IO_TYPE_ABORT: 2209 return true; 2210 2211 case SPDK_BDEV_IO_TYPE_COMPARE: 2212 return spdk_nvme_ns_supports_compare(ns); 2213 2214 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2215 return spdk_nvme_ns_get_md_size(ns) ? true : false; 2216 2217 case SPDK_BDEV_IO_TYPE_UNMAP: 2218 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2219 return cdata->oncs.dsm; 2220 2221 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2222 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2223 return cdata->oncs.write_zeroes; 2224 2225 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2226 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2227 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 2228 return true; 2229 } 2230 return false; 2231 2232 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2233 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2234 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 2235 2236 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2237 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 2238 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 2239 2240 default: 2241 return false; 2242 } 2243 } 2244 2245 static int 2246 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 2247 { 2248 struct nvme_qpair *nvme_qpair; 2249 struct spdk_io_channel *pg_ch; 2250 int rc; 2251 2252 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 2253 if (!nvme_qpair) { 2254 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 2255 return -1; 2256 } 2257 2258 TAILQ_INIT(&nvme_qpair->io_path_list); 2259 2260 nvme_qpair->ctrlr = nvme_ctrlr; 2261 nvme_qpair->ctrlr_ch = ctrlr_ch; 2262 2263 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 2264 if (!pg_ch) { 2265 free(nvme_qpair); 2266 return -1; 2267 } 2268 2269 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 2270 2271 #ifdef SPDK_CONFIG_VTUNE 2272 nvme_qpair->group->collect_spin_stat = true; 2273 #else 2274 nvme_qpair->group->collect_spin_stat = false; 2275 #endif 2276 2277 rc = bdev_nvme_create_qpair(nvme_qpair); 2278 if (rc != 0) { 2279 /* nvme_ctrlr can't create IO qpair if connection is down. If nvme_ctrlr is 2280 * being reset or scheduled to reconnect later, ignore this failure. 2281 * Then IO qpair will be created later when reconnect completes. 
2282 * If the user submits IO requests in the meantime, they will be queued and 2283 * resubmitted later */ 2284 if (!nvme_ctrlr->resetting && !nvme_ctrlr->reconnect_is_delayed) { 2285 spdk_put_io_channel(pg_ch); 2286 free(nvme_qpair); 2287 return rc; 2288 } 2289 } 2290 2291 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2292 2293 ctrlr_ch->qpair = nvme_qpair; 2294 2295 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 2296 nvme_qpair->ctrlr->ref++; 2297 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 2298 2299 return 0; 2300 } 2301 2302 static int 2303 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2304 { 2305 struct nvme_ctrlr *nvme_ctrlr = io_device; 2306 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2307 2308 TAILQ_INIT(&ctrlr_ch->pending_resets); 2309 2310 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 2311 } 2312 2313 static void 2314 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 2315 { 2316 assert(nvme_qpair->group != NULL); 2317 2318 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 2319 2320 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 2321 2322 nvme_ctrlr_release(nvme_qpair->ctrlr); 2323 2324 free(nvme_qpair); 2325 } 2326 2327 static void 2328 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 2329 { 2330 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 2331 struct nvme_qpair *nvme_qpair; 2332 2333 nvme_qpair = ctrlr_ch->qpair; 2334 assert(nvme_qpair != NULL); 2335 2336 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2337 2338 if (nvme_qpair->qpair != NULL) { 2339 if (ctrlr_ch->reset_iter == NULL) { 2340 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2341 } else { 2342 /* Skip current ctrlr_channel in a full reset sequence because 2343 * it is being deleted now. The qpair is already being disconnected. 2344 * We do not have to restart disconnecting it. 2345 */ 2346 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2347 } 2348 2349 /* We cannot release a reference to the poll group now. 2350 * The qpair may be disconnected asynchronously later. 2351 * We need to poll it until it is actually disconnected. 2352 * Just detach the qpair from the deleting ctrlr_channel. 
2353 */ 2354 nvme_qpair->ctrlr_ch = NULL; 2355 } else { 2356 assert(ctrlr_ch->reset_iter == NULL); 2357 2358 nvme_qpair_delete(nvme_qpair); 2359 } 2360 } 2361 2362 static void 2363 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 2364 uint32_t iov_cnt, uint32_t seed, 2365 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 2366 { 2367 struct nvme_poll_group *group = ctx; 2368 int rc; 2369 2370 assert(group->accel_channel != NULL); 2371 assert(cb_fn != NULL); 2372 2373 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 2374 if (rc) { 2375 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 2376 if (rc == -ENOMEM || rc == -EINVAL) { 2377 cb_fn(cb_arg, rc); 2378 } 2379 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 2380 } 2381 } 2382 2383 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 2384 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 2385 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 2386 }; 2387 2388 static int 2389 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 2390 { 2391 struct nvme_poll_group *group = ctx_buf; 2392 2393 TAILQ_INIT(&group->qpair_list); 2394 2395 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 2396 if (group->group == NULL) { 2397 return -1; 2398 } 2399 2400 group->accel_channel = spdk_accel_engine_get_io_channel(); 2401 if (!group->accel_channel) { 2402 spdk_nvme_poll_group_destroy(group->group); 2403 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 2404 group); 2405 return -1; 2406 } 2407 2408 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 2409 2410 if (group->poller == NULL) { 2411 spdk_put_io_channel(group->accel_channel); 2412 spdk_nvme_poll_group_destroy(group->group); 2413 return -1; 2414 } 2415 2416 return 0; 2417 } 2418 2419 static void 2420 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 2421 { 2422 struct nvme_poll_group *group = ctx_buf; 2423 2424 assert(TAILQ_EMPTY(&group->qpair_list)); 2425 2426 if (group->accel_channel) { 2427 spdk_put_io_channel(group->accel_channel); 2428 } 2429 2430 spdk_poller_unregister(&group->poller); 2431 if (spdk_nvme_poll_group_destroy(group->group)) { 2432 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 2433 assert(false); 2434 } 2435 } 2436 2437 static struct spdk_io_channel * 2438 bdev_nvme_get_io_channel(void *ctx) 2439 { 2440 struct nvme_bdev *nvme_bdev = ctx; 2441 2442 return spdk_get_io_channel(nvme_bdev); 2443 } 2444 2445 static void * 2446 bdev_nvme_get_module_ctx(void *ctx) 2447 { 2448 struct nvme_bdev *nvme_bdev = ctx; 2449 struct nvme_ns *nvme_ns; 2450 2451 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 2452 return NULL; 2453 } 2454 2455 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 2456 if (!nvme_ns) { 2457 return NULL; 2458 } 2459 2460 return nvme_ns->ns; 2461 } 2462 2463 static const char * 2464 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 2465 { 2466 switch (ana_state) { 2467 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2468 return "optimized"; 2469 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2470 return "non_optimized"; 2471 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2472 return "inaccessible"; 2473 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 2474 return "persistent_loss"; 2475 case SPDK_NVME_ANA_CHANGE_STATE: 2476 return "change"; 2477 default: 2478 return NULL; 2479 } 2480 } 
2481 2482 static int 2483 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 2484 { 2485 struct nvme_bdev *nbdev = ctx; 2486 struct nvme_ns *nvme_ns; 2487 2488 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 2489 assert(nvme_ns != NULL); 2490 2491 return spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, domains, array_size); 2492 } 2493 2494 static const char * 2495 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 2496 { 2497 if (nvme_ctrlr->destruct) { 2498 return "deleting"; 2499 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 2500 return "failed"; 2501 } else if (nvme_ctrlr->resetting) { 2502 return "resetting"; 2503 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 2504 return "reconnect_is_delayed"; 2505 } else { 2506 return "enabled"; 2507 } 2508 } 2509 2510 void 2511 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 2512 { 2513 struct spdk_nvme_transport_id *trid; 2514 const struct spdk_nvme_ctrlr_opts *opts; 2515 const struct spdk_nvme_ctrlr_data *cdata; 2516 2517 spdk_json_write_object_begin(w); 2518 2519 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 2520 2521 #ifdef SPDK_CONFIG_NVME_CUSE 2522 size_t cuse_name_size = 128; 2523 char cuse_name[cuse_name_size]; 2524 2525 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 2526 if (rc == 0) { 2527 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2528 } 2529 #endif 2530 trid = &nvme_ctrlr->active_path_id->trid; 2531 spdk_json_write_named_object_begin(w, "trid"); 2532 nvme_bdev_dump_trid_json(trid, w); 2533 spdk_json_write_object_end(w); 2534 2535 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 2536 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 2537 2538 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 2539 spdk_json_write_named_object_begin(w, "host"); 2540 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 2541 spdk_json_write_named_string(w, "addr", opts->src_addr); 2542 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 2543 spdk_json_write_object_end(w); 2544 2545 spdk_json_write_object_end(w); 2546 } 2547 2548 static void 2549 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 2550 struct nvme_ns *nvme_ns) 2551 { 2552 struct spdk_nvme_ns *ns; 2553 struct spdk_nvme_ctrlr *ctrlr; 2554 const struct spdk_nvme_ctrlr_data *cdata; 2555 const struct spdk_nvme_transport_id *trid; 2556 union spdk_nvme_vs_register vs; 2557 const struct spdk_nvme_ns_data *nsdata; 2558 char buf[128]; 2559 2560 ns = nvme_ns->ns; 2561 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2562 2563 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2564 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 2565 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 2566 2567 spdk_json_write_object_begin(w); 2568 2569 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2570 spdk_json_write_named_string(w, "pci_address", trid->traddr); 2571 } 2572 2573 spdk_json_write_named_object_begin(w, "trid"); 2574 2575 nvme_bdev_dump_trid_json(trid, w); 2576 2577 spdk_json_write_object_end(w); 2578 2579 #ifdef SPDK_CONFIG_NVME_CUSE 2580 size_t cuse_name_size = 128; 2581 char cuse_name[cuse_name_size]; 2582 2583 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 2584 cuse_name, &cuse_name_size); 2585 if (rc == 0) { 2586 spdk_json_write_named_string(w, "cuse_device", cuse_name); 2587 } 2588 #endif 2589 2590 spdk_json_write_named_object_begin(w, "ctrlr_data"); 2591 2592 spdk_json_write_named_uint16(w, 
"cntlid", cdata->cntlid); 2593 2594 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 2595 2596 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 2597 spdk_str_trim(buf); 2598 spdk_json_write_named_string(w, "model_number", buf); 2599 2600 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 2601 spdk_str_trim(buf); 2602 spdk_json_write_named_string(w, "serial_number", buf); 2603 2604 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 2605 spdk_str_trim(buf); 2606 spdk_json_write_named_string(w, "firmware_revision", buf); 2607 2608 if (cdata->subnqn[0] != '\0') { 2609 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 2610 } 2611 2612 spdk_json_write_named_object_begin(w, "oacs"); 2613 2614 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 2615 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 2616 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 2617 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 2618 2619 spdk_json_write_object_end(w); 2620 2621 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 2622 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 2623 2624 spdk_json_write_object_end(w); 2625 2626 spdk_json_write_named_object_begin(w, "vs"); 2627 2628 spdk_json_write_name(w, "nvme_version"); 2629 if (vs.bits.ter) { 2630 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 2631 } else { 2632 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 2633 } 2634 2635 spdk_json_write_object_end(w); 2636 2637 nsdata = spdk_nvme_ns_get_data(ns); 2638 2639 spdk_json_write_named_object_begin(w, "ns_data"); 2640 2641 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 2642 2643 if (cdata->cmic.ana_reporting) { 2644 spdk_json_write_named_string(w, "ana_state", 2645 _nvme_ana_state_str(nvme_ns->ana_state)); 2646 } 2647 2648 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 2649 2650 spdk_json_write_object_end(w); 2651 2652 if (cdata->oacs.security) { 2653 spdk_json_write_named_object_begin(w, "security"); 2654 2655 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 2656 2657 spdk_json_write_object_end(w); 2658 } 2659 2660 spdk_json_write_object_end(w); 2661 } 2662 2663 static const char * 2664 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 2665 { 2666 switch (nbdev->mp_policy) { 2667 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 2668 return "active_passive"; 2669 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 2670 return "active_active"; 2671 default: 2672 assert(false); 2673 return "invalid"; 2674 } 2675 } 2676 2677 static int 2678 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 2679 { 2680 struct nvme_bdev *nvme_bdev = ctx; 2681 struct nvme_ns *nvme_ns; 2682 2683 pthread_mutex_lock(&nvme_bdev->mutex); 2684 spdk_json_write_named_array_begin(w, "nvme"); 2685 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 2686 nvme_namespace_info_json(w, nvme_ns); 2687 } 2688 spdk_json_write_array_end(w); 2689 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 2690 pthread_mutex_unlock(&nvme_bdev->mutex); 2691 2692 return 0; 2693 } 2694 2695 static void 2696 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2697 { 2698 /* No config per bdev needed */ 2699 } 2700 2701 static uint64_t 2702 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 2703 { 2704 struct nvme_bdev_channel *nbdev_ch = 
spdk_io_channel_get_ctx(ch); 2705 struct nvme_io_path *io_path; 2706 struct nvme_poll_group *group; 2707 uint64_t spin_time = 0; 2708 2709 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 2710 group = io_path->qpair->group; 2711 2712 if (!group || !group->collect_spin_stat) { 2713 continue; 2714 } 2715 2716 if (group->end_ticks != 0) { 2717 group->spin_ticks += (group->end_ticks - group->start_ticks); 2718 group->end_ticks = 0; 2719 } 2720 2721 spin_time += group->spin_ticks; 2722 group->start_ticks = 0; 2723 group->spin_ticks = 0; 2724 } 2725 2726 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 2727 } 2728 2729 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 2730 .destruct = bdev_nvme_destruct, 2731 .submit_request = bdev_nvme_submit_request, 2732 .io_type_supported = bdev_nvme_io_type_supported, 2733 .get_io_channel = bdev_nvme_get_io_channel, 2734 .dump_info_json = bdev_nvme_dump_info_json, 2735 .write_config_json = bdev_nvme_write_config_json, 2736 .get_spin_time = bdev_nvme_get_spin_time, 2737 .get_module_ctx = bdev_nvme_get_module_ctx, 2738 .get_memory_domains = bdev_nvme_get_memory_domains, 2739 }; 2740 2741 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 2742 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 2743 2744 static int 2745 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2746 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 2747 { 2748 struct spdk_nvme_ana_group_descriptor *copied_desc; 2749 uint8_t *orig_desc; 2750 uint32_t i, desc_size, copy_len; 2751 int rc = 0; 2752 2753 if (nvme_ctrlr->ana_log_page == NULL) { 2754 return -EINVAL; 2755 } 2756 2757 copied_desc = nvme_ctrlr->copied_ana_desc; 2758 2759 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 2760 copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 2761 2762 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 2763 memcpy(copied_desc, orig_desc, copy_len); 2764 2765 rc = cb_fn(copied_desc, cb_arg); 2766 if (rc != 0) { 2767 break; 2768 } 2769 2770 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 2771 copied_desc->num_of_nsid * sizeof(uint32_t); 2772 orig_desc += desc_size; 2773 copy_len -= desc_size; 2774 } 2775 2776 return rc; 2777 } 2778 2779 static int 2780 nvme_ns_ana_transition_timedout(void *ctx) 2781 { 2782 struct nvme_ns *nvme_ns = ctx; 2783 2784 spdk_poller_unregister(&nvme_ns->anatt_timer); 2785 nvme_ns->ana_transition_timedout = true; 2786 2787 return SPDK_POLLER_BUSY; 2788 } 2789 2790 static void 2791 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 2792 const struct spdk_nvme_ana_group_descriptor *desc) 2793 { 2794 const struct spdk_nvme_ctrlr_data *cdata; 2795 2796 nvme_ns->ana_group_id = desc->ana_group_id; 2797 nvme_ns->ana_state = desc->ana_state; 2798 nvme_ns->ana_state_updating = false; 2799 2800 switch (nvme_ns->ana_state) { 2801 case SPDK_NVME_ANA_OPTIMIZED_STATE: 2802 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 2803 nvme_ns->ana_transition_timedout = false; 2804 spdk_poller_unregister(&nvme_ns->anatt_timer); 2805 break; 2806 2807 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 2808 case SPDK_NVME_ANA_CHANGE_STATE: 2809 if (nvme_ns->anatt_timer != NULL) { 2810 break; 2811 } 2812 2813 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 2814 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 2815 nvme_ns, 2816 cdata->anatt * SPDK_SEC_TO_USEC); 2817 break; 2818 default: 2819 break; 2820 } 2821 } 2822 2823 static int 2824 
nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 2825 { 2826 struct nvme_ns *nvme_ns = cb_arg; 2827 uint32_t i; 2828 2829 for (i = 0; i < desc->num_of_nsid; i++) { 2830 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 2831 continue; 2832 } 2833 2834 _nvme_ns_set_ana_state(nvme_ns, desc); 2835 return 1; 2836 } 2837 2838 return 0; 2839 } 2840 2841 static int 2842 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 2843 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 2844 uint32_t prchk_flags, void *ctx) 2845 { 2846 const struct spdk_uuid *uuid; 2847 const uint8_t *nguid; 2848 const struct spdk_nvme_ctrlr_data *cdata; 2849 const struct spdk_nvme_ns_data *nsdata; 2850 enum spdk_nvme_csi csi; 2851 uint32_t atomic_bs, phys_bs, bs; 2852 2853 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2854 csi = spdk_nvme_ns_get_csi(ns); 2855 2856 switch (csi) { 2857 case SPDK_NVME_CSI_NVM: 2858 disk->product_name = "NVMe disk"; 2859 break; 2860 case SPDK_NVME_CSI_ZNS: 2861 disk->product_name = "NVMe ZNS disk"; 2862 disk->zoned = true; 2863 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 2864 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 2865 spdk_nvme_ns_get_extended_sector_size(ns); 2866 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 2867 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 2868 break; 2869 default: 2870 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 2871 return -ENOTSUP; 2872 } 2873 2874 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 2875 if (!disk->name) { 2876 return -ENOMEM; 2877 } 2878 2879 disk->write_cache = 0; 2880 if (cdata->vwc.present) { 2881 /* Enable if the Volatile Write Cache exists */ 2882 disk->write_cache = 1; 2883 } 2884 if (cdata->oncs.write_zeroes) { 2885 disk->max_write_zeroes = UINT16_MAX + 1; 2886 } 2887 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 2888 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 2889 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 2890 2891 nguid = spdk_nvme_ns_get_nguid(ns); 2892 if (!nguid) { 2893 uuid = spdk_nvme_ns_get_uuid(ns); 2894 if (uuid) { 2895 disk->uuid = *uuid; 2896 } 2897 } else { 2898 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 2899 } 2900 2901 nsdata = spdk_nvme_ns_get_data(ns); 2902 bs = spdk_nvme_ns_get_sector_size(ns); 2903 atomic_bs = bs; 2904 phys_bs = bs; 2905 if (nsdata->nabo == 0) { 2906 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 2907 atomic_bs = bs * (1 + nsdata->nawupf); 2908 } else { 2909 atomic_bs = bs * (1 + cdata->awupf); 2910 } 2911 } 2912 if (nsdata->nsfeat.optperf) { 2913 phys_bs = bs * (1 + nsdata->npwg); 2914 } 2915 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 2916 2917 disk->md_len = spdk_nvme_ns_get_md_size(ns); 2918 if (disk->md_len != 0) { 2919 disk->md_interleave = nsdata->flbas.extended; 2920 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 2921 if (disk->dif_type != SPDK_DIF_DISABLE) { 2922 disk->dif_is_head_of_md = nsdata->dps.md_start; 2923 disk->dif_check_flags = prchk_flags; 2924 } 2925 } 2926 2927 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 2928 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 2929 disk->acwu = 0; 2930 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 2931 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 2932 } else { 2933 disk->acwu = cdata->acwu + 1; /* 0-based */ 2934 } 2935 2936 disk->ctxt = ctx; 2937 disk->fn_table = 
&nvmelib_fn_table; 2938 disk->module = &nvme_if; 2939 2940 return 0; 2941 } 2942 2943 static int 2944 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 2945 { 2946 struct nvme_bdev *bdev; 2947 int rc; 2948 2949 bdev = calloc(1, sizeof(*bdev)); 2950 if (!bdev) { 2951 SPDK_ERRLOG("bdev calloc() failed\n"); 2952 return -ENOMEM; 2953 } 2954 2955 rc = pthread_mutex_init(&bdev->mutex, NULL); 2956 if (rc != 0) { 2957 free(bdev); 2958 return rc; 2959 } 2960 2961 bdev->ref = 1; 2962 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 2963 TAILQ_INIT(&bdev->nvme_ns_list); 2964 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 2965 bdev->opal = nvme_ctrlr->opal_dev != NULL; 2966 2967 rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 2968 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 2969 if (rc != 0) { 2970 SPDK_ERRLOG("Failed to create NVMe disk\n"); 2971 pthread_mutex_destroy(&bdev->mutex); 2972 free(bdev); 2973 return rc; 2974 } 2975 2976 spdk_io_device_register(bdev, 2977 bdev_nvme_create_bdev_channel_cb, 2978 bdev_nvme_destroy_bdev_channel_cb, 2979 sizeof(struct nvme_bdev_channel), 2980 bdev->disk.name); 2981 2982 rc = spdk_bdev_register(&bdev->disk); 2983 if (rc != 0) { 2984 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 2985 spdk_io_device_unregister(bdev, NULL); 2986 pthread_mutex_destroy(&bdev->mutex); 2987 free(bdev->disk.name); 2988 free(bdev); 2989 return rc; 2990 } 2991 2992 nvme_ns->bdev = bdev; 2993 bdev->nsid = nvme_ns->id; 2994 2995 bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 2996 TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); 2997 2998 return 0; 2999 } 3000 3001 static bool 3002 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 3003 { 3004 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 3005 const struct spdk_uuid *uuid1, *uuid2; 3006 3007 nsdata1 = spdk_nvme_ns_get_data(ns1); 3008 nsdata2 = spdk_nvme_ns_get_data(ns2); 3009 uuid1 = spdk_nvme_ns_get_uuid(ns1); 3010 uuid2 = spdk_nvme_ns_get_uuid(ns2); 3011 3012 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 3013 nsdata1->eui64 == nsdata2->eui64 && 3014 ((uuid1 == NULL && uuid2 == NULL) || 3015 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 3016 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 3017 } 3018 3019 static bool 3020 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3021 struct spdk_nvme_ctrlr_opts *opts) 3022 { 3023 struct nvme_probe_skip_entry *entry; 3024 3025 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 3026 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 3027 return false; 3028 } 3029 } 3030 3031 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 3032 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 3033 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 3034 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 3035 opts->disable_read_ana_log_page = true; 3036 3037 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 3038 3039 return true; 3040 } 3041 3042 static void 3043 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 3044 { 3045 struct nvme_ctrlr *nvme_ctrlr = ctx; 3046 3047 if (spdk_nvme_cpl_is_error(cpl)) { 3048 SPDK_WARNLOG("Abort failed. Resetting controller. 
sc is %u, sct is %u.\n", cpl->status.sc, 3049 cpl->status.sct); 3050 bdev_nvme_reset(nvme_ctrlr); 3051 } else if (cpl->cdw0 & 0x1) { 3052 SPDK_WARNLOG("Specified command could not be aborted.\n"); 3053 bdev_nvme_reset(nvme_ctrlr); 3054 } 3055 } 3056 3057 static void 3058 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 3059 struct spdk_nvme_qpair *qpair, uint16_t cid) 3060 { 3061 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3062 union spdk_nvme_csts_register csts; 3063 int rc; 3064 3065 assert(nvme_ctrlr->ctrlr == ctrlr); 3066 3067 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 3068 3069 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 3070 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 3071 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 3072 * completion recursively. 3073 */ 3074 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 3075 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 3076 if (csts.bits.cfs) { 3077 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 3078 bdev_nvme_reset(nvme_ctrlr); 3079 return; 3080 } 3081 } 3082 3083 switch (g_opts.action_on_timeout) { 3084 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 3085 if (qpair) { 3086 /* Don't send abort to ctrlr when ctrlr is not available. */ 3087 pthread_mutex_lock(&nvme_ctrlr->mutex); 3088 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 3089 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3090 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 3091 return; 3092 } 3093 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3094 3095 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 3096 nvme_abort_cpl, nvme_ctrlr); 3097 if (rc == 0) { 3098 return; 3099 } 3100 3101 SPDK_ERRLOG("Unable to send abort. 
Resetting, rc is %d.\n", rc); 3102 } 3103 3104 /* FALLTHROUGH */ 3105 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 3106 bdev_nvme_reset(nvme_ctrlr); 3107 break; 3108 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 3109 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 3110 break; 3111 default: 3112 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 3113 break; 3114 } 3115 } 3116 3117 static void 3118 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 3119 { 3120 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3121 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 3122 3123 if (rc == 0) { 3124 nvme_ns->probe_ctx = NULL; 3125 pthread_mutex_lock(&nvme_ctrlr->mutex); 3126 nvme_ctrlr->ref++; 3127 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3128 } else { 3129 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3130 free(nvme_ns); 3131 } 3132 3133 if (ctx) { 3134 ctx->populates_in_progress--; 3135 if (ctx->populates_in_progress == 0) { 3136 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3137 } 3138 } 3139 } 3140 3141 static void 3142 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 3143 { 3144 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3145 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3146 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3147 int rc; 3148 3149 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 3150 if (rc != 0) { 3151 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 3152 } 3153 3154 spdk_for_each_channel_continue(i, rc); 3155 } 3156 3157 static void 3158 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 3159 { 3160 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3161 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3162 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3163 struct nvme_io_path *io_path; 3164 3165 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 3166 if (io_path != NULL) { 3167 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 3168 } 3169 3170 spdk_for_each_channel_continue(i, 0); 3171 } 3172 3173 static void 3174 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 3175 { 3176 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3177 3178 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 3179 } 3180 3181 static void 3182 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 3183 { 3184 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3185 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 3186 3187 if (status == 0) { 3188 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 3189 } else { 3190 /* Delete the added io_paths and fail populating the namespace. 
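		 * The completion callback, bdev_nvme_add_io_path_failed(), then reports -1 to
		 * nvme_ctrlr_populate_namespace_done().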
*/ 3191 spdk_for_each_channel(bdev, 3192 bdev_nvme_delete_io_path, 3193 nvme_ns, 3194 bdev_nvme_add_io_path_failed); 3195 } 3196 } 3197 3198 static int 3199 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 3200 { 3201 struct nvme_ns *tmp_ns; 3202 const struct spdk_nvme_ns_data *nsdata; 3203 3204 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 3205 if (!nsdata->nmic.can_share) { 3206 SPDK_ERRLOG("Namespace cannot be shared.\n"); 3207 return -EINVAL; 3208 } 3209 3210 pthread_mutex_lock(&bdev->mutex); 3211 3212 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 3213 assert(tmp_ns != NULL); 3214 3215 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 3216 pthread_mutex_unlock(&bdev->mutex); 3217 SPDK_ERRLOG("Namespaces are not identical.\n"); 3218 return -EINVAL; 3219 } 3220 3221 bdev->ref++; 3222 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 3223 nvme_ns->bdev = bdev; 3224 3225 pthread_mutex_unlock(&bdev->mutex); 3226 3227 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 3228 spdk_for_each_channel(bdev, 3229 bdev_nvme_add_io_path, 3230 nvme_ns, 3231 bdev_nvme_add_io_path_done); 3232 3233 return 0; 3234 } 3235 3236 static void 3237 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3238 { 3239 struct spdk_nvme_ns *ns; 3240 struct nvme_bdev *bdev; 3241 int rc = 0; 3242 3243 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 3244 if (!ns) { 3245 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 3246 rc = -EINVAL; 3247 goto done; 3248 } 3249 3250 nvme_ns->ns = ns; 3251 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3252 3253 if (nvme_ctrlr->ana_log_page != NULL) { 3254 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 3255 } 3256 3257 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 3258 if (bdev == NULL) { 3259 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 3260 } else { 3261 rc = nvme_bdev_add_ns(bdev, nvme_ns); 3262 if (rc == 0) { 3263 return; 3264 } 3265 } 3266 done: 3267 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 3268 } 3269 3270 static void 3271 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 3272 { 3273 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 3274 3275 assert(nvme_ctrlr != NULL); 3276 3277 pthread_mutex_lock(&nvme_ctrlr->mutex); 3278 3279 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3280 3281 if (nvme_ns->bdev != NULL) { 3282 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3283 return; 3284 } 3285 3286 free(nvme_ns); 3287 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3288 3289 nvme_ctrlr_release(nvme_ctrlr); 3290 } 3291 3292 static void 3293 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 3294 { 3295 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 3296 3297 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3298 } 3299 3300 static void 3301 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 3302 { 3303 struct nvme_bdev *bdev; 3304 3305 spdk_poller_unregister(&nvme_ns->anatt_timer); 3306 3307 bdev = nvme_ns->bdev; 3308 if (bdev != NULL) { 3309 pthread_mutex_lock(&bdev->mutex); 3310 3311 assert(bdev->ref > 0); 3312 bdev->ref--; 3313 if (bdev->ref == 0) { 3314 pthread_mutex_unlock(&bdev->mutex); 3315 3316 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 3317 } else { 3318 /* spdk_bdev_unregister() is not called until the last nvme_ns is 3319 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 3320 * and clear nvme_ns->bdev here. 
3321 */ 3322 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 3323 nvme_ns->bdev = NULL; 3324 3325 pthread_mutex_unlock(&bdev->mutex); 3326 3327 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 3328 * we call depopulate_namespace_done() to avoid use-after-free. 3329 */ 3330 spdk_for_each_channel(bdev, 3331 bdev_nvme_delete_io_path, 3332 nvme_ns, 3333 bdev_nvme_delete_io_path_done); 3334 return; 3335 } 3336 } 3337 3338 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 3339 } 3340 3341 static void 3342 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 3343 struct nvme_async_probe_ctx *ctx) 3344 { 3345 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3346 struct nvme_ns *nvme_ns, *next; 3347 struct spdk_nvme_ns *ns; 3348 struct nvme_bdev *bdev; 3349 uint32_t nsid; 3350 int rc; 3351 uint64_t num_sectors; 3352 3353 if (ctx) { 3354 /* Initialize this count to 1 to handle the populate functions 3355 * calling nvme_ctrlr_populate_namespace_done() immediately. 3356 */ 3357 ctx->populates_in_progress = 1; 3358 } 3359 3360 /* First loop over our existing namespaces and see if they have been 3361 * removed. */ 3362 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3363 while (nvme_ns != NULL) { 3364 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 3365 3366 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 3367 /* NS is still there but attributes may have changed */ 3368 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 3369 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 3370 bdev = nvme_ns->bdev; 3371 assert(bdev != NULL); 3372 if (bdev->disk.blockcnt != num_sectors) { 3373 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 3374 nvme_ns->id, 3375 bdev->disk.name, 3376 bdev->disk.blockcnt, 3377 num_sectors); 3378 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 3379 if (rc != 0) { 3380 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 3381 bdev->disk.name, rc); 3382 } 3383 } 3384 } else { 3385 /* Namespace was removed */ 3386 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3387 } 3388 3389 nvme_ns = next; 3390 } 3391 3392 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 3393 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 3394 while (nsid != 0) { 3395 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3396 3397 if (nvme_ns == NULL) { 3398 /* Found a new one */ 3399 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 3400 if (nvme_ns == NULL) { 3401 SPDK_ERRLOG("Failed to allocate namespace\n"); 3402 /* This just fails to attach the namespace. It may work on a future attempt. */ 3403 continue; 3404 } 3405 3406 nvme_ns->id = nsid; 3407 nvme_ns->ctrlr = nvme_ctrlr; 3408 3409 nvme_ns->bdev = NULL; 3410 3411 if (ctx) { 3412 ctx->populates_in_progress++; 3413 } 3414 nvme_ns->probe_ctx = ctx; 3415 3416 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 3417 3418 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 3419 } 3420 3421 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 3422 } 3423 3424 if (ctx) { 3425 /* Decrement this count now that the loop is over to account 3426 * for the one we started with. If the count is then 0, we 3427 * know any populate_namespace functions completed immediately, 3428 * so we'll kick the callback here. 
3429 */ 3430 ctx->populates_in_progress--; 3431 if (ctx->populates_in_progress == 0) { 3432 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 3433 } 3434 } 3435 3436 } 3437 3438 static void 3439 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 3440 { 3441 struct nvme_ns *nvme_ns, *tmp; 3442 3443 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 3444 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 3445 } 3446 } 3447 3448 static int 3449 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 3450 void *cb_arg) 3451 { 3452 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 3453 struct nvme_ns *nvme_ns; 3454 uint32_t i, nsid; 3455 3456 for (i = 0; i < desc->num_of_nsid; i++) { 3457 nsid = desc->nsid[i]; 3458 if (nsid == 0) { 3459 continue; 3460 } 3461 3462 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 3463 3464 assert(nvme_ns != NULL); 3465 if (nvme_ns == NULL) { 3466 /* Target told us that an inactive namespace had an ANA change */ 3467 continue; 3468 } 3469 3470 _nvme_ns_set_ana_state(nvme_ns, desc); 3471 } 3472 3473 return 0; 3474 } 3475 3476 static void 3477 _nvme_ctrlr_read_ana_log_page_done(struct spdk_io_channel_iter *i, int status) 3478 { 3479 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 3480 3481 pthread_mutex_lock(&nvme_ctrlr->mutex); 3482 3483 assert(nvme_ctrlr->ana_log_page_updating == true); 3484 nvme_ctrlr->ana_log_page_updating = false; 3485 3486 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 3487 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3488 return; 3489 } 3490 3491 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3492 3493 nvme_ctrlr_unregister(nvme_ctrlr); 3494 } 3495 3496 static void 3497 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3498 { 3499 struct nvme_ns *nvme_ns; 3500 3501 spdk_free(nvme_ctrlr->ana_log_page); 3502 nvme_ctrlr->ana_log_page = NULL; 3503 3504 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 3505 nvme_ns != NULL; 3506 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 3507 nvme_ns->ana_state_updating = false; 3508 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 3509 } 3510 } 3511 3512 static void 3513 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 3514 { 3515 struct nvme_ctrlr *nvme_ctrlr = ctx; 3516 3517 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 3518 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 3519 nvme_ctrlr); 3520 } else { 3521 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 3522 } 3523 3524 bdev_nvme_clear_io_path_caches(nvme_ctrlr, _nvme_ctrlr_read_ana_log_page_done); 3525 } 3526 3527 static int 3528 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 3529 { 3530 int rc; 3531 3532 if (nvme_ctrlr->ana_log_page == NULL) { 3533 return -EINVAL; 3534 } 3535 3536 pthread_mutex_lock(&nvme_ctrlr->mutex); 3537 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 3538 nvme_ctrlr->ana_log_page_updating) { 3539 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3540 return -EBUSY; 3541 } 3542 3543 nvme_ctrlr->ana_log_page_updating = true; 3544 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3545 3546 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 3547 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 3548 SPDK_NVME_GLOBAL_NS_TAG, 3549 nvme_ctrlr->ana_log_page, 3550 nvme_ctrlr->ana_log_page_size, 0, 3551 nvme_ctrlr_read_ana_log_page_done, 3552 nvme_ctrlr); 3553 if (rc != 0) { 3554 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 3555 } 3556 3557 return rc; 3558 } 3559 3560 static 
void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}

struct bdev_nvme_set_preferred_path_ctx {
	struct spdk_bdev_desc *desc;
	struct nvme_ns *nvme_ns;
	bdev_nvme_set_preferred_path_cb cb_fn;
	void *cb_arg;
};

static void
bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	assert(ctx != NULL);
	assert(ctx->desc != NULL);
	assert(ctx->cb_fn != NULL);

	spdk_bdev_close(ctx->desc);

	ctx->cb_fn(ctx->cb_arg, status);

	free(ctx);
}

static void
_bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i)
{
	struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
	struct nvme_io_path *io_path, *prev;

	prev = NULL;
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == ctx->nvme_ns) {
			break;
		}
		prev = io_path;
	}

	if (io_path != NULL && prev != NULL) {
		STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq);
		STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq);

		/* We can set io_path to nbdev_ch->current_io_path directly here.
		 * However, it needs to be conditional. To simplify the code,
		 * just clear nbdev_ch->current_io_path and let find_io_path()
		 * fill it.
		 */
		nbdev_ch->current_io_path = NULL;
	}

	spdk_for_each_channel_continue(i, 0);
}

static struct nvme_ns *
bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid)
{
	struct nvme_ns *nvme_ns, *prev;
	const struct spdk_nvme_ctrlr_data *cdata;

	prev = NULL;
	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);

		if (cdata->cntlid == cntlid) {
			break;
		}
		prev = nvme_ns;
	}

	if (nvme_ns != NULL && prev != NULL) {
		TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq);
		TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq);
	}

	return nvme_ns;
}

/* This function supports only multipath mode. Each NVMe-oF controller contributes a
 * single I/O path, so it is enough to move the matched I/O path to the head of the
 * I/O path list for each NVMe bdev channel.
 *
 * An NVMe bdev channel may be created after this function completes, so the matched
 * namespace is also moved to the head of the namespace list for the NVMe bdev itself.
3649 */ 3650 void 3651 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 3652 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 3653 { 3654 struct bdev_nvme_set_preferred_path_ctx *ctx; 3655 struct spdk_bdev *bdev; 3656 struct nvme_bdev *nbdev; 3657 int rc = 0; 3658 3659 assert(cb_fn != NULL); 3660 3661 ctx = calloc(1, sizeof(*ctx)); 3662 if (ctx == NULL) { 3663 SPDK_ERRLOG("Failed to alloc context.\n"); 3664 rc = -ENOMEM; 3665 goto err_alloc; 3666 } 3667 3668 ctx->cb_fn = cb_fn; 3669 ctx->cb_arg = cb_arg; 3670 3671 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3672 if (rc != 0) { 3673 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 3674 goto err_open; 3675 } 3676 3677 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3678 3679 if (bdev->module != &nvme_if) { 3680 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 3681 rc = -ENODEV; 3682 goto err_bdev; 3683 } 3684 3685 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3686 3687 pthread_mutex_lock(&nbdev->mutex); 3688 3689 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 3690 if (ctx->nvme_ns == NULL) { 3691 pthread_mutex_unlock(&nbdev->mutex); 3692 3693 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 3694 rc = -ENODEV; 3695 goto err_bdev; 3696 } 3697 3698 pthread_mutex_unlock(&nbdev->mutex); 3699 3700 spdk_for_each_channel(nbdev, 3701 _bdev_nvme_set_preferred_path, 3702 ctx, 3703 bdev_nvme_set_preferred_path_done); 3704 return; 3705 3706 err_bdev: 3707 spdk_bdev_close(ctx->desc); 3708 err_open: 3709 free(ctx); 3710 err_alloc: 3711 cb_fn(cb_arg, rc); 3712 } 3713 3714 struct bdev_nvme_set_multipath_policy_ctx { 3715 struct spdk_bdev_desc *desc; 3716 bdev_nvme_set_multipath_policy_cb cb_fn; 3717 void *cb_arg; 3718 }; 3719 3720 static void 3721 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 3722 { 3723 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3724 3725 assert(ctx != NULL); 3726 assert(ctx->desc != NULL); 3727 assert(ctx->cb_fn != NULL); 3728 3729 spdk_bdev_close(ctx->desc); 3730 3731 ctx->cb_fn(ctx->cb_arg, status); 3732 3733 free(ctx); 3734 } 3735 3736 static void 3737 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 3738 { 3739 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3740 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 3741 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 3742 3743 nbdev_ch->mp_policy = nbdev->mp_policy; 3744 nbdev_ch->current_io_path = NULL; 3745 3746 spdk_for_each_channel_continue(i, 0); 3747 } 3748 3749 void 3750 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 3751 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 3752 { 3753 struct bdev_nvme_set_multipath_policy_ctx *ctx; 3754 struct spdk_bdev *bdev; 3755 struct nvme_bdev *nbdev; 3756 int rc; 3757 3758 assert(cb_fn != NULL); 3759 3760 ctx = calloc(1, sizeof(*ctx)); 3761 if (ctx == NULL) { 3762 SPDK_ERRLOG("Failed to alloc context.\n"); 3763 rc = -ENOMEM; 3764 goto err_alloc; 3765 } 3766 3767 ctx->cb_fn = cb_fn; 3768 ctx->cb_arg = cb_arg; 3769 3770 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 3771 if (rc != 0) { 3772 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 3773 rc = -ENODEV; 3774 goto err_open; 3775 } 3776 3777 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3778 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 3779 
3780 pthread_mutex_lock(&nbdev->mutex); 3781 nbdev->mp_policy = policy; 3782 pthread_mutex_unlock(&nbdev->mutex); 3783 3784 spdk_for_each_channel(nbdev, 3785 _bdev_nvme_set_multipath_policy, 3786 ctx, 3787 bdev_nvme_set_multipath_policy_done); 3788 return; 3789 3790 err_open: 3791 free(ctx); 3792 err_alloc: 3793 cb_fn(cb_arg, rc); 3794 } 3795 3796 static void 3797 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 3798 { 3799 struct nvme_ctrlr *nvme_ctrlr = arg; 3800 union spdk_nvme_async_event_completion event; 3801 3802 if (spdk_nvme_cpl_is_error(cpl)) { 3803 SPDK_WARNLOG("AER request execute failed"); 3804 return; 3805 } 3806 3807 event.raw = cpl->cdw0; 3808 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3809 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 3810 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 3811 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 3812 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 3813 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 3814 } 3815 } 3816 3817 static void 3818 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 3819 { 3820 if (ctx->cb_fn) { 3821 ctx->cb_fn(ctx->cb_ctx, count, rc); 3822 } 3823 3824 ctx->namespaces_populated = true; 3825 if (ctx->probe_done) { 3826 /* The probe was already completed, so we need to free the context 3827 * here. This can happen for cases like OCSSD, where we need to 3828 * send additional commands to the SSD after attach. 3829 */ 3830 free(ctx); 3831 } 3832 } 3833 3834 static void 3835 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 3836 struct nvme_async_probe_ctx *ctx) 3837 { 3838 spdk_io_device_register(nvme_ctrlr, 3839 bdev_nvme_create_ctrlr_channel_cb, 3840 bdev_nvme_destroy_ctrlr_channel_cb, 3841 sizeof(struct nvme_ctrlr_channel), 3842 nvme_ctrlr->nbdev_ctrlr->name); 3843 3844 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 3845 } 3846 3847 static void 3848 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 3849 { 3850 struct nvme_ctrlr *nvme_ctrlr = _ctx; 3851 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 3852 3853 nvme_ctrlr->probe_ctx = NULL; 3854 3855 if (spdk_nvme_cpl_is_error(cpl)) { 3856 nvme_ctrlr_delete(nvme_ctrlr); 3857 3858 if (ctx != NULL) { 3859 populate_namespaces_cb(ctx, 0, -1); 3860 } 3861 return; 3862 } 3863 3864 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 3865 } 3866 3867 static int 3868 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3869 struct nvme_async_probe_ctx *ctx) 3870 { 3871 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 3872 const struct spdk_nvme_ctrlr_data *cdata; 3873 uint32_t ana_log_page_size; 3874 3875 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3876 3877 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 3878 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn * 3879 sizeof(uint32_t); 3880 3881 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 3882 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3883 if (nvme_ctrlr->ana_log_page == NULL) { 3884 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 3885 return -ENXIO; 3886 } 3887 3888 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 3889 * Hence copy each descriptor to a temporary area when parsing it. 3890 * 3891 * Allocate a buffer whose size is as large as ANA log page buffer because 3892 * we do not know the size of a descriptor until actually reading it. 
 */
	nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
	if (nvme_ctrlr->copied_ana_desc == NULL) {
		SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n");
		return -ENOMEM;
	}

	nvme_ctrlr->ana_log_page_size = ana_log_page_size;

	nvme_ctrlr->probe_ctx = ctx;

	return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
						SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
						SPDK_NVME_GLOBAL_NS_TAG,
						nvme_ctrlr->ana_log_page,
						nvme_ctrlr->ana_log_page_size, 0,
						nvme_ctrlr_init_ana_log_page_done,
						nvme_ctrlr);
}

/* hostnqn and subnqn were already verified before attaching a controller.
 * Hence check only the multipath capability and cntlid here.
 */
static bool
bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *tmp;
	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!cdata->cmic.multi_ctrlr) {
		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
		return false;
	}

	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);

		if (!tmp_cdata->cmic.multi_ctrlr) {
			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
			return false;
		}
		if (cdata->cntlid == tmp_cdata->cntlid) {
			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
			return false;
		}
	}

	return true;
}

static int
nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	int rc = 0;

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
			rc = -EINVAL;
			goto exit;
		}
	} else {
		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
		if (nbdev_ctrlr == NULL) {
			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
			rc = -ENOMEM;
			goto exit;
		}
		nbdev_ctrlr->name = strdup(name);
		if (nbdev_ctrlr->name == NULL) {
			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
			free(nbdev_ctrlr);
			rc = -ENOMEM;
			goto exit;
		}
		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
		TAILQ_INIT(&nbdev_ctrlr->bdevs);
		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
	}
	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
exit:
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return rc;
}

static int
nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
		  const char *name,
		  const struct spdk_nvme_transport_id *trid,
		  struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_path_id *path_id;
	const struct spdk_nvme_ctrlr_data *cdata;
	int rc;

	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
	if (nvme_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to allocate device struct\n");
		return -ENOMEM;
	}

	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
	if (rc != 0) {
		free(nvme_ctrlr);
		return rc;
	}

	TAILQ_INIT(&nvme_ctrlr->trids);

	RB_INIT(&nvme_ctrlr->namespaces);

	path_id = calloc(1, sizeof(*path_id));
	if
(path_id == NULL) { 4013 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 4014 rc = -ENOMEM; 4015 goto err; 4016 } 4017 4018 path_id->trid = *trid; 4019 if (ctx != NULL) { 4020 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 4021 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 4022 } 4023 nvme_ctrlr->active_path_id = path_id; 4024 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 4025 4026 nvme_ctrlr->thread = spdk_get_thread(); 4027 nvme_ctrlr->ctrlr = ctrlr; 4028 nvme_ctrlr->ref = 1; 4029 4030 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 4031 SPDK_ERRLOG("OCSSDs are not supported"); 4032 rc = -ENOTSUP; 4033 goto err; 4034 } 4035 4036 if (ctx != NULL) { 4037 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 4038 } else { 4039 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 4040 } 4041 4042 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 4043 g_opts.nvme_adminq_poll_period_us); 4044 4045 if (g_opts.timeout_us > 0) { 4046 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 4047 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 4048 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 4049 g_opts.timeout_us : g_opts.timeout_admin_us; 4050 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 4051 adm_timeout_us, timeout_cb, nvme_ctrlr); 4052 } 4053 4054 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 4055 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 4056 4057 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 4058 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 4059 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 4060 } 4061 4062 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 4063 if (rc != 0) { 4064 goto err; 4065 } 4066 4067 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4068 4069 if (cdata->cmic.ana_reporting) { 4070 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 4071 if (rc == 0) { 4072 return 0; 4073 } 4074 } else { 4075 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 4076 return 0; 4077 } 4078 4079 err: 4080 nvme_ctrlr_delete(nvme_ctrlr); 4081 return rc; 4082 } 4083 4084 void 4085 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 4086 { 4087 opts->prchk_flags = 0; 4088 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 4089 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 4090 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 4091 } 4092 4093 static void 4094 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4095 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 4096 { 4097 char *name; 4098 4099 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 4100 if (!name) { 4101 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 4102 return; 4103 } 4104 4105 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 4106 4107 nvme_ctrlr_create(ctrlr, name, trid, NULL); 4108 4109 free(name); 4110 } 4111 4112 static void 4113 _nvme_ctrlr_destruct(void *ctx) 4114 { 4115 struct nvme_ctrlr *nvme_ctrlr = ctx; 4116 4117 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 4118 nvme_ctrlr_release(nvme_ctrlr); 4119 } 4120 4121 static int 4122 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 4123 { 4124 struct nvme_probe_skip_entry *entry; 4125 4126 pthread_mutex_lock(&nvme_ctrlr->mutex); 4127 
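	/* The destruct flag is checked and set under the controller mutex so that
	 * concurrent delete requests start the teardown only once.
	 */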
4128 /* The controller's destruction was already started */ 4129 if (nvme_ctrlr->destruct) { 4130 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4131 return 0; 4132 } 4133 4134 if (!hotplug && 4135 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 4136 entry = calloc(1, sizeof(*entry)); 4137 if (!entry) { 4138 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4139 return -ENOMEM; 4140 } 4141 entry->trid = nvme_ctrlr->active_path_id->trid; 4142 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 4143 } 4144 4145 nvme_ctrlr->destruct = true; 4146 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4147 4148 _nvme_ctrlr_destruct(nvme_ctrlr); 4149 4150 return 0; 4151 } 4152 4153 static void 4154 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 4155 { 4156 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 4157 4158 _bdev_nvme_delete(nvme_ctrlr, true); 4159 } 4160 4161 static int 4162 bdev_nvme_hotplug_probe(void *arg) 4163 { 4164 if (g_hotplug_probe_ctx == NULL) { 4165 spdk_poller_unregister(&g_hotplug_probe_poller); 4166 return SPDK_POLLER_IDLE; 4167 } 4168 4169 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 4170 g_hotplug_probe_ctx = NULL; 4171 spdk_poller_unregister(&g_hotplug_probe_poller); 4172 } 4173 4174 return SPDK_POLLER_BUSY; 4175 } 4176 4177 static int 4178 bdev_nvme_hotplug(void *arg) 4179 { 4180 struct spdk_nvme_transport_id trid_pcie; 4181 4182 if (g_hotplug_probe_ctx) { 4183 return SPDK_POLLER_BUSY; 4184 } 4185 4186 memset(&trid_pcie, 0, sizeof(trid_pcie)); 4187 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 4188 4189 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 4190 hotplug_probe_cb, attach_cb, NULL); 4191 4192 if (g_hotplug_probe_ctx) { 4193 assert(g_hotplug_probe_poller == NULL); 4194 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 4195 } 4196 4197 return SPDK_POLLER_BUSY; 4198 } 4199 4200 void 4201 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 4202 { 4203 *opts = g_opts; 4204 } 4205 4206 static bool bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec, 4207 uint32_t reconnect_delay_sec, 4208 uint32_t fast_io_fail_timeout_sec); 4209 4210 static int 4211 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 4212 { 4213 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 4214 /* Can't set timeout_admin_us without also setting timeout_us */ 4215 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 4216 return -EINVAL; 4217 } 4218 4219 if (opts->bdev_retry_count < -1) { 4220 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 4221 return -EINVAL; 4222 } 4223 4224 if (!bdev_nvme_check_multipath_params(opts->ctrlr_loss_timeout_sec, 4225 opts->reconnect_delay_sec, 4226 opts->fast_io_fail_timeout_sec)) { 4227 return -EINVAL; 4228 } 4229 4230 return 0; 4231 } 4232 4233 int 4234 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 4235 { 4236 int ret = bdev_nvme_validate_opts(opts); 4237 if (ret) { 4238 SPDK_WARNLOG("Failed to set nvme opts.\n"); 4239 return ret; 4240 } 4241 4242 if (g_bdev_nvme_init_thread != NULL) { 4243 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 4244 return -EPERM; 4245 } 4246 } 4247 4248 g_opts = *opts; 4249 4250 return 0; 4251 } 4252 4253 struct set_nvme_hotplug_ctx { 4254 uint64_t period_us; 4255 bool enabled; 4256 spdk_msg_fn fn; 4257 void *fn_ctx; 4258 }; 4259 4260 static void 4261 set_nvme_hotplug_period_cb(void *_ctx) 4262 { 4263 struct set_nvme_hotplug_ctx 
*ctx = _ctx;

	spdk_poller_unregister(&g_hotplug_poller);
	if (ctx->enabled) {
		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
	}

	g_nvme_hotplug_poll_period_us = ctx->period_us;
	g_nvme_hotplug_enabled = ctx->enabled;
	if (ctx->fn) {
		ctx->fn(ctx->fn_ctx);
	}

	free(ctx);
}

int
bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
{
	struct set_nvme_hotplug_ctx *ctx;

	if (enabled == true && !spdk_process_is_primary()) {
		return -EPERM;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
	ctx->enabled = enabled;
	ctx->fn = cb;
	ctx->fn_ctx = cb_ctx;

	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
	return 0;
}

static void
nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
				    struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ns *nvme_ns;
	struct nvme_bdev *nvme_bdev;
	size_t j;

	assert(nvme_ctrlr != NULL);

	if (ctx->names == NULL) {
		populate_namespaces_cb(ctx, 0, 0);
		return;
	}

	/*
	 * Report the new bdevs that were created in this call.
	 * There can be more than one bdev per NVMe controller.
	 */
	j = 0;
	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		nvme_bdev = nvme_ns->bdev;
		if (j < ctx->count) {
			ctx->names[j] = nvme_bdev->disk.name;
			j++;
		} else {
			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
				    ctx->count);
			populate_namespaces_cb(ctx, 0, -ERANGE);
			return;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	populate_namespaces_cb(ctx, j, 0);
}

static int
bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
			struct spdk_nvme_ctrlr *new_ctrlr,
			struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *tmp_trid;

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		SPDK_ERRLOG("PCIe failover is not supported.\n");
		return -ENOTSUP;
	}

	/* Currently we only support failover to the same transport type. */
	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path.
*/ 4365 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4366 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 4367 return -EEXIST; 4368 } 4369 } 4370 4371 return 0; 4372 } 4373 4374 static int 4375 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4376 struct spdk_nvme_ctrlr *new_ctrlr) 4377 { 4378 struct nvme_ns *nvme_ns; 4379 struct spdk_nvme_ns *new_ns; 4380 4381 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4382 while (nvme_ns != NULL) { 4383 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 4384 assert(new_ns != NULL); 4385 4386 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 4387 return -EINVAL; 4388 } 4389 4390 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4391 } 4392 4393 return 0; 4394 } 4395 4396 static int 4397 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4398 struct spdk_nvme_transport_id *trid) 4399 { 4400 struct nvme_path_id *new_trid, *tmp_trid; 4401 4402 new_trid = calloc(1, sizeof(*new_trid)); 4403 if (new_trid == NULL) { 4404 return -ENOMEM; 4405 } 4406 new_trid->trid = *trid; 4407 new_trid->is_failed = false; 4408 4409 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 4410 if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { 4411 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 4412 return 0; 4413 } 4414 } 4415 4416 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 4417 return 0; 4418 } 4419 4420 /* This is the case that a secondary path is added to an existing 4421 * nvme_ctrlr for failover. After checking if it can access the same 4422 * namespaces as the primary path, it is disconnected until failover occurs. 4423 */ 4424 static int 4425 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 4426 struct spdk_nvme_ctrlr *new_ctrlr, 4427 struct spdk_nvme_transport_id *trid) 4428 { 4429 int rc; 4430 4431 assert(nvme_ctrlr != NULL); 4432 4433 pthread_mutex_lock(&nvme_ctrlr->mutex); 4434 4435 rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid); 4436 if (rc != 0) { 4437 goto exit; 4438 } 4439 4440 rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr); 4441 if (rc != 0) { 4442 goto exit; 4443 } 4444 4445 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 4446 4447 exit: 4448 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4449 4450 spdk_nvme_detach(new_ctrlr); 4451 4452 return rc; 4453 } 4454 4455 static void 4456 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4457 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 4458 { 4459 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4460 struct nvme_async_probe_ctx *ctx; 4461 int rc; 4462 4463 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4464 ctx->ctrlr_attached = true; 4465 4466 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 4467 if (rc != 0) { 4468 populate_namespaces_cb(ctx, 0, rc); 4469 } 4470 } 4471 4472 static void 4473 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4474 struct spdk_nvme_ctrlr *ctrlr, 4475 const struct spdk_nvme_ctrlr_opts *opts) 4476 { 4477 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 4478 struct nvme_ctrlr *nvme_ctrlr; 4479 struct nvme_async_probe_ctx *ctx; 4480 int rc; 4481 4482 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 4483 ctx->ctrlr_attached = true; 4484 4485 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 4486 if (nvme_ctrlr) { 4487 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 4488 } else { 4489 rc = -ENODEV; 
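		/* No controller with this base name exists, so there is nothing to add the
		 * failover path to; the error is reported via populate_namespaces_cb() below.
		 */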
	}

	populate_namespaces_cb(ctx, 0, rc);
}

static int
bdev_nvme_async_poll(void *arg)
{
	struct nvme_async_probe_ctx *ctx = arg;
	int rc;

	rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
	if (spdk_unlikely(rc != -EAGAIN)) {
		ctx->probe_done = true;
		spdk_poller_unregister(&ctx->poller);
		if (!ctx->ctrlr_attached) {
			/* The probe is done, but no controller was attached.
			 * That means we had a failure, so report -EIO back to
			 * the caller (usually the RPC). populate_namespaces_cb()
			 * will take care of freeing the nvme_async_probe_ctx.
			 */
			populate_namespaces_cb(ctx, 0, -EIO);
		} else if (ctx->namespaces_populated) {
			/* The namespaces for the attached controller were all
			 * populated and the response was already sent to the
			 * caller (usually the RPC). So free the context here.
			 */
			free(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
				 uint32_t reconnect_delay_sec,
				 uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

int
bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		 const char *base_name,
		 const char **names,
		 uint32_t count,
		 spdk_bdev_create_nvme_fn cb_fn,
		 void *cb_ctx,
		 struct spdk_nvme_ctrlr_opts *drv_opts,
		 struct nvme_ctrlr_opts *bdev_opts,
		 bool multipath)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
4582 */ 4583 if (nvme_ctrlr_get(trid) != NULL) { 4584 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 4585 return -EEXIST; 4586 } 4587 4588 if (bdev_opts != NULL && 4589 !bdev_nvme_check_multipath_params(bdev_opts->ctrlr_loss_timeout_sec, 4590 bdev_opts->reconnect_delay_sec, 4591 bdev_opts->fast_io_fail_timeout_sec)) { 4592 return -EINVAL; 4593 } 4594 4595 ctx = calloc(1, sizeof(*ctx)); 4596 if (!ctx) { 4597 return -ENOMEM; 4598 } 4599 ctx->base_name = base_name; 4600 ctx->names = names; 4601 ctx->count = count; 4602 ctx->cb_fn = cb_fn; 4603 ctx->cb_ctx = cb_ctx; 4604 ctx->trid = *trid; 4605 4606 if (bdev_opts) { 4607 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 4608 } else { 4609 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 4610 } 4611 4612 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4613 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 4614 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4615 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 4616 free(entry); 4617 break; 4618 } 4619 } 4620 } 4621 4622 if (drv_opts) { 4623 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 4624 } else { 4625 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 4626 } 4627 4628 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 4629 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 4630 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 4631 ctx->drv_opts.disable_read_ana_log_page = true; 4632 4633 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 4634 attach_cb = connect_attach_cb; 4635 } else { 4636 attach_cb = connect_set_failover_cb; 4637 } 4638 4639 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 4640 if (ctx->probe_ctx == NULL) { 4641 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 4642 free(ctx); 4643 return -ENODEV; 4644 } 4645 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 4646 4647 return 0; 4648 } 4649 4650 int 4651 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 4652 { 4653 struct nvme_bdev_ctrlr *nbdev_ctrlr; 4654 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 4655 struct nvme_path_id *p, *t; 4656 int rc = -ENXIO; 4657 4658 if (name == NULL || path_id == NULL) { 4659 return -EINVAL; 4660 } 4661 4662 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 4663 if (nbdev_ctrlr == NULL) { 4664 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 4665 return -ENODEV; 4666 } 4667 4668 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 4669 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 4670 if (path_id->trid.trtype != 0) { 4671 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 4672 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 4673 continue; 4674 } 4675 } else { 4676 if (path_id->trid.trtype != p->trid.trtype) { 4677 continue; 4678 } 4679 } 4680 } 4681 4682 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 4683 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 4684 continue; 4685 } 4686 } 4687 4688 if (path_id->trid.adrfam != 0) { 4689 if (path_id->trid.adrfam != p->trid.adrfam) { 4690 continue; 4691 } 4692 } 4693 4694 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 4695 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 4696 continue; 
4697 } 4698 } 4699 4700 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 4701 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 4702 continue; 4703 } 4704 } 4705 4706 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 4707 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 4708 continue; 4709 } 4710 } 4711 4712 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 4713 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 4714 continue; 4715 } 4716 } 4717 4718 /* If we made it here, then this path is a match! Now we need to remove it. */ 4719 if (p == nvme_ctrlr->active_path_id) { 4720 /* This is the active path in use right now. The active path is always the first in the list. */ 4721 4722 if (!TAILQ_NEXT(p, link)) { 4723 /* The current path is the only path. */ 4724 rc = _bdev_nvme_delete(nvme_ctrlr, false); 4725 } else { 4726 /* There is an alternative path. */ 4727 rc = bdev_nvme_failover(nvme_ctrlr, true); 4728 } 4729 } else { 4730 /* We are not using the specified path. */ 4731 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 4732 free(p); 4733 rc = 0; 4734 } 4735 4736 if (rc < 0 && rc != -ENXIO) { 4737 return rc; 4738 } 4739 4740 4741 } 4742 } 4743 4744 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 4745 return rc; 4746 } 4747 4748 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 4749 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4750 4751 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 4752 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 4753 4754 struct discovery_entry_ctx { 4755 char name[128]; 4756 struct spdk_nvme_transport_id trid; 4757 struct spdk_nvme_ctrlr_opts drv_opts; 4758 struct spdk_nvmf_discovery_log_page_entry entry; 4759 TAILQ_ENTRY(discovery_entry_ctx) tailq; 4760 struct discovery_ctx *ctx; 4761 }; 4762 4763 struct discovery_ctx { 4764 char *name; 4765 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 4766 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 4767 void *cb_ctx; 4768 struct spdk_nvme_probe_ctx *probe_ctx; 4769 struct spdk_nvme_detach_ctx *detach_ctx; 4770 struct spdk_nvme_ctrlr *ctrlr; 4771 struct spdk_nvme_transport_id trid; 4772 struct discovery_entry_ctx *entry_ctx_in_use; 4773 struct spdk_poller *poller; 4774 struct spdk_nvme_ctrlr_opts drv_opts; 4775 struct nvme_ctrlr_opts bdev_opts; 4776 struct spdk_nvmf_discovery_log_page *log_page; 4777 TAILQ_ENTRY(discovery_ctx) tailq; 4778 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 4779 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 4780 int rc; 4781 bool wait_for_attach; 4782 /* Denotes if a discovery is currently in progress for this context. 4783 * That includes connecting to newly discovered subsystems. Used to 4784 * ensure we do not start a new discovery until an existing one is 4785 * complete. 4786 */ 4787 bool in_progress; 4788 4789 /* Denotes if another discovery is needed after the one in progress 4790 * completes. Set when we receive an AER completion while a discovery 4791 * is already in progress. 4792 */ 4793 bool pending; 4794 4795 /* Signal to the discovery context poller that it should stop the 4796 * discovery service, including detaching from the current discovery 4797 * controller. 
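	 * The discovery poller detaches from the current discovery controller and
	 * frees this context once the detach completes.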
4798 */ 4799 bool stop; 4800 4801 struct spdk_thread *calling_thread; 4802 uint32_t index; 4803 uint32_t attach_in_progress; 4804 char *hostnqn; 4805 }; 4806 4807 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 4808 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 4809 4810 static void get_discovery_log_page(struct discovery_ctx *ctx); 4811 4812 static void 4813 free_discovery_ctx(struct discovery_ctx *ctx) 4814 { 4815 free(ctx->hostnqn); 4816 free(ctx->name); 4817 free(ctx); 4818 } 4819 4820 static void 4821 discovery_complete(struct discovery_ctx *ctx) 4822 { 4823 ctx->in_progress = false; 4824 if (ctx->pending) { 4825 ctx->pending = false; 4826 get_discovery_log_page(ctx); 4827 } 4828 } 4829 4830 static void 4831 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 4832 struct spdk_nvmf_discovery_log_page_entry *entry) 4833 { 4834 char *space; 4835 4836 trid->trtype = entry->trtype; 4837 trid->adrfam = entry->adrfam; 4838 memcpy(trid->traddr, entry->traddr, sizeof(trid->traddr)); 4839 memcpy(trid->trsvcid, entry->trsvcid, sizeof(trid->trsvcid)); 4840 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); 4841 4842 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 4843 * But the log page entries typically pad them with spaces, not zeroes. 4844 * So add a NULL terminator to each of these fields at the appropriate 4845 * location. 4846 */ 4847 space = strchr(trid->traddr, ' '); 4848 if (space) { 4849 *space = 0; 4850 } 4851 space = strchr(trid->trsvcid, ' '); 4852 if (space) { 4853 *space = 0; 4854 } 4855 space = strchr(trid->subnqn, ' '); 4856 if (space) { 4857 *space = 0; 4858 } 4859 } 4860 4861 static void 4862 discovery_remove_controllers(struct discovery_ctx *ctx) 4863 { 4864 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 4865 struct discovery_entry_ctx *entry_ctx, *tmp; 4866 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 4867 struct spdk_nvme_transport_id old_trid; 4868 uint64_t numrec, i; 4869 bool found; 4870 4871 numrec = from_le64(&log_page->numrec); 4872 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 4873 found = false; 4874 old_entry = &entry_ctx->entry; 4875 build_trid_from_log_page_entry(&old_trid, old_entry); 4876 for (i = 0; i < numrec; i++) { 4877 new_entry = &log_page->entries[i]; 4878 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 4879 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 4880 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 4881 found = true; 4882 break; 4883 } 4884 } 4885 if (!found) { 4886 struct nvme_path_id path = {}; 4887 4888 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 4889 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 4890 4891 path.trid = entry_ctx->trid; 4892 bdev_nvme_delete(entry_ctx->name, &path); 4893 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 4894 free(entry_ctx); 4895 } 4896 } 4897 free(log_page); 4898 ctx->log_page = NULL; 4899 discovery_complete(ctx); 4900 } 4901 4902 static void 4903 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 4904 { 4905 struct discovery_entry_ctx *entry_ctx = cb_ctx; 4906 struct discovery_ctx *ctx = entry_ctx->ctx;; 4907 4908 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 4909 ctx->attach_in_progress--; 4910 if (ctx->attach_in_progress == 0) { 4911 if (ctx->start_cb_fn) { 4912 ctx->start_cb_fn(ctx->cb_ctx); 4913 ctx->start_cb_fn = NULL; 4914 ctx->cb_ctx = NULL; 4915 } 4916 discovery_remove_controllers(ctx); 
4917 } 4918 } 4919 4920 static struct discovery_entry_ctx * 4921 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 4922 { 4923 struct discovery_entry_ctx *new_ctx; 4924 4925 new_ctx = calloc(1, sizeof(*new_ctx)); 4926 if (new_ctx == NULL) { 4927 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 4928 return NULL; 4929 } 4930 4931 new_ctx->ctx = ctx; 4932 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 4933 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 4934 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 4935 return new_ctx; 4936 } 4937 4938 static void 4939 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 4940 struct spdk_nvmf_discovery_log_page *log_page) 4941 { 4942 struct discovery_ctx *ctx = cb_arg; 4943 struct discovery_entry_ctx *entry_ctx, *tmp; 4944 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 4945 uint64_t numrec, i; 4946 bool found; 4947 4948 if (rc || spdk_nvme_cpl_is_error(cpl)) { 4949 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 4950 return; 4951 } 4952 4953 ctx->log_page = log_page; 4954 assert(ctx->attach_in_progress == 0); 4955 numrec = from_le64(&log_page->numrec); 4956 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 4957 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 4958 free(entry_ctx); 4959 } 4960 for (i = 0; i < numrec; i++) { 4961 found = false; 4962 new_entry = &log_page->entries[i]; 4963 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 4964 struct discovery_entry_ctx *new_ctx; 4965 struct spdk_nvme_transport_id trid; 4966 4967 build_trid_from_log_page_entry(&trid, new_entry); 4968 new_ctx = create_discovery_entry_ctx(ctx, &trid); 4969 if (new_ctx == NULL) { 4970 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 4971 break; 4972 } 4973 4974 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 4975 continue; 4976 } 4977 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 4978 old_entry = &entry_ctx->entry; 4979 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 4980 found = true; 4981 break; 4982 } 4983 } 4984 if (!found) { 4985 struct discovery_entry_ctx *subnqn_ctx, *new_ctx; 4986 4987 TAILQ_FOREACH(subnqn_ctx, &ctx->nvm_entry_ctxs, tailq) { 4988 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 4989 sizeof(new_entry->subnqn))) { 4990 break; 4991 } 4992 } 4993 4994 new_ctx = calloc(1, sizeof(*new_ctx)); 4995 if (new_ctx == NULL) { 4996 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 4997 break; 4998 } 4999 5000 new_ctx->ctx = ctx; 5001 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 5002 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 5003 if (subnqn_ctx) { 5004 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 5005 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 5006 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5007 new_ctx->name); 5008 } else { 5009 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 5010 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 5011 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 5012 new_ctx->name); 5013 } 5014 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 5015 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 5016 rc = 
bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 5017 discovery_attach_controller_done, new_ctx, 5018 &new_ctx->drv_opts, &ctx->bdev_opts, true); 5019 if (rc == 0) { 5020 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 5021 ctx->attach_in_progress++; 5022 } else { 5023 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 5024 } 5025 } 5026 } 5027 5028 if (ctx->attach_in_progress == 0) { 5029 discovery_remove_controllers(ctx); 5030 } 5031 } 5032 5033 static void 5034 get_discovery_log_page(struct discovery_ctx *ctx) 5035 { 5036 int rc; 5037 5038 assert(ctx->in_progress == false); 5039 ctx->in_progress = true; 5040 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 5041 if (rc != 0) { 5042 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 5043 } 5044 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 5045 } 5046 5047 static void 5048 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5049 { 5050 struct discovery_ctx *ctx = arg; 5051 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 5052 5053 if (spdk_nvme_cpl_is_error(cpl)) { 5054 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 5055 return; 5056 } 5057 5058 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 5059 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 5060 return; 5061 } 5062 5063 DISCOVERY_INFOLOG(ctx, "got aer\n"); 5064 if (ctx->in_progress) { 5065 ctx->pending = true; 5066 return; 5067 } 5068 5069 get_discovery_log_page(ctx); 5070 } 5071 5072 static void 5073 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5074 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5075 { 5076 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5077 struct discovery_ctx *ctx; 5078 5079 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 5080 5081 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 5082 ctx->probe_ctx = NULL; 5083 ctx->ctrlr = ctrlr; 5084 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 5085 } 5086 5087 static int 5088 discovery_poller(void *arg) 5089 { 5090 struct discovery_ctx *ctx = arg; 5091 struct spdk_nvme_transport_id *trid; 5092 int rc; 5093 5094 if (ctx->detach_ctx) { 5095 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 5096 if (rc != -EAGAIN) { 5097 ctx->detach_ctx = NULL; 5098 ctx->ctrlr = NULL; 5099 } 5100 } else if (ctx->stop) { 5101 if (ctx->ctrlr != NULL) { 5102 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5103 if (rc == 0) { 5104 return SPDK_POLLER_BUSY; 5105 } 5106 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5107 } 5108 spdk_poller_unregister(&ctx->poller); 5109 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 5110 ctx->stop_cb_fn(ctx->cb_ctx); 5111 free_discovery_ctx(ctx); 5112 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 5113 assert(ctx->entry_ctx_in_use == NULL); 5114 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5115 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5116 trid = &ctx->entry_ctx_in_use->trid; 5117 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 5118 if (ctx->probe_ctx) { 5119 spdk_poller_unregister(&ctx->poller); 5120 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 5121 } else { 5122 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 5123 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5124 ctx->entry_ctx_in_use = NULL; 5125 } 
5126 } else if (ctx->probe_ctx) { 5127 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5128 if (rc != -EAGAIN) { 5129 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 5130 ctx->rc = rc; 5131 if (rc == 0) { 5132 get_discovery_log_page(ctx); 5133 } 5134 } 5135 } else { 5136 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 5137 if (rc < 0) { 5138 spdk_poller_unregister(&ctx->poller); 5139 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5140 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 5141 ctx->entry_ctx_in_use = NULL; 5142 5143 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 5144 if (rc != 0) { 5145 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 5146 ctx->ctrlr = NULL; 5147 } 5148 } 5149 } 5150 5151 return SPDK_POLLER_BUSY; 5152 } 5153 5154 static void 5155 start_discovery_poller(void *arg) 5156 { 5157 struct discovery_ctx *ctx = arg; 5158 5159 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 5160 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 5161 } 5162 5163 int 5164 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 5165 const char *base_name, 5166 struct spdk_nvme_ctrlr_opts *drv_opts, 5167 struct nvme_ctrlr_opts *bdev_opts, 5168 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 5169 { 5170 struct discovery_ctx *ctx; 5171 struct discovery_entry_ctx *discovery_entry_ctx; 5172 5173 ctx = calloc(1, sizeof(*ctx)); 5174 if (ctx == NULL) { 5175 return -ENOMEM; 5176 } 5177 5178 ctx->name = strdup(base_name); 5179 if (ctx->name == NULL) { 5180 free_discovery_ctx(ctx); 5181 return -ENOMEM; 5182 } 5183 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5184 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5185 ctx->bdev_opts.from_discovery_service = true; 5186 ctx->calling_thread = spdk_get_thread(); 5187 if (ctx->start_cb_fn) { 5188 /* We can use this when dumping json to denote if this RPC parameter 5189 * was specified or not. 
5190 */ 5191 ctx->wait_for_attach = true; 5192 } 5193 ctx->start_cb_fn = cb_fn; 5194 ctx->cb_ctx = cb_ctx; 5195 TAILQ_INIT(&ctx->nvm_entry_ctxs); 5196 TAILQ_INIT(&ctx->discovery_entry_ctxs); 5197 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 5198 memcpy(&ctx->trid, trid, sizeof(*trid)); 5199 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 5200 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 5201 if (ctx->hostnqn == NULL) { 5202 free_discovery_ctx(ctx); 5203 return -ENOMEM; 5204 } 5205 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 5206 if (discovery_entry_ctx == NULL) { 5207 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 5208 free_discovery_ctx(ctx); 5209 return -ENOMEM; 5210 } 5211 5212 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 5213 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 5214 return 0; 5215 } 5216 5217 int 5218 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 5219 { 5220 struct discovery_ctx *ctx; 5221 5222 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5223 if (strcmp(name, ctx->name) == 0) { 5224 if (ctx->stop) { 5225 return -EALREADY; 5226 } 5227 ctx->stop = true; 5228 ctx->stop_cb_fn = cb_fn; 5229 ctx->cb_ctx = cb_ctx; 5230 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 5231 struct discovery_entry_ctx *entry_ctx; 5232 struct nvme_path_id path = {}; 5233 5234 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 5235 path.trid = entry_ctx->trid; 5236 bdev_nvme_delete(entry_ctx->name, &path); 5237 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 5238 free(entry_ctx); 5239 } 5240 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 5241 struct discovery_entry_ctx *entry_ctx; 5242 5243 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 5244 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 5245 free(entry_ctx); 5246 } 5247 return 0; 5248 } 5249 } 5250 5251 return -ENOENT; 5252 } 5253 5254 static int 5255 bdev_nvme_library_init(void) 5256 { 5257 g_bdev_nvme_init_thread = spdk_get_thread(); 5258 5259 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 5260 bdev_nvme_destroy_poll_group_cb, 5261 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 5262 5263 return 0; 5264 } 5265 5266 static void 5267 bdev_nvme_fini_destruct_ctrlrs(void) 5268 { 5269 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5270 struct nvme_ctrlr *nvme_ctrlr; 5271 5272 pthread_mutex_lock(&g_bdev_nvme_mutex); 5273 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 5274 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5275 pthread_mutex_lock(&nvme_ctrlr->mutex); 5276 if (nvme_ctrlr->destruct) { 5277 /* This controller's destruction was already started 5278 * before the application started shutting down 5279 */ 5280 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5281 continue; 5282 } 5283 nvme_ctrlr->destruct = true; 5284 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5285 5286 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 5287 nvme_ctrlr); 5288 } 5289 } 5290 5291 g_bdev_nvme_module_finish = true; 5292 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5293 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5294 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 5295 spdk_bdev_module_fini_done(); 5296 return; 5297 } 5298 5299 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5300 } 5301 5302 static void 5303 check_discovery_fini(void *arg) 5304 { 5305 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5306 
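		/* All discovery contexts have been stopped and freed, so proceed with
		 * destructing the remaining NVMe bdev controllers to finish module shutdown.
		 */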
bdev_nvme_fini_destruct_ctrlrs(); 5307 } 5308 } 5309 5310 static void 5311 bdev_nvme_library_fini(void) 5312 { 5313 struct nvme_probe_skip_entry *entry, *entry_tmp; 5314 struct discovery_ctx *ctx; 5315 5316 spdk_poller_unregister(&g_hotplug_poller); 5317 free(g_hotplug_probe_ctx); 5318 g_hotplug_probe_ctx = NULL; 5319 5320 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 5321 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5322 free(entry); 5323 } 5324 5325 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 5326 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 5327 bdev_nvme_fini_destruct_ctrlrs(); 5328 } else { 5329 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 5330 ctx->stop = true; 5331 ctx->stop_cb_fn = check_discovery_fini; 5332 } 5333 } 5334 } 5335 5336 static void 5337 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 5338 { 5339 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5340 struct spdk_bdev *bdev = bdev_io->bdev; 5341 struct spdk_dif_ctx dif_ctx; 5342 struct spdk_dif_error err_blk = {}; 5343 int rc; 5344 5345 rc = spdk_dif_ctx_init(&dif_ctx, 5346 bdev->blocklen, bdev->md_len, bdev->md_interleave, 5347 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 5348 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 5349 if (rc != 0) { 5350 SPDK_ERRLOG("Initialization of DIF context failed\n"); 5351 return; 5352 } 5353 5354 if (bdev->md_interleave) { 5355 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5356 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5357 } else { 5358 struct iovec md_iov = { 5359 .iov_base = bdev_io->u.bdev.md_buf, 5360 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 5361 }; 5362 5363 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 5364 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 5365 } 5366 5367 if (rc != 0) { 5368 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 5369 err_blk.err_type, err_blk.err_offset); 5370 } else { 5371 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 5372 } 5373 } 5374 5375 static void 5376 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5377 { 5378 struct nvme_bdev_io *bio = ref; 5379 5380 if (spdk_nvme_cpl_is_success(cpl)) { 5381 /* Run PI verification for read data buffer. */ 5382 bdev_nvme_verify_pi_error(bio); 5383 } 5384 5385 /* Return original completion status */ 5386 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5387 } 5388 5389 static void 5390 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5391 { 5392 struct nvme_bdev_io *bio = ref; 5393 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5394 int ret; 5395 5396 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 5397 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 5398 cpl->status.sct, cpl->status.sc); 5399 5400 /* Save completion status to use after verifying PI error. */ 5401 bio->cpl = *cpl; 5402 5403 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 5404 /* Read without PI checking to verify PI error. 
*/ 5405 ret = bdev_nvme_no_pi_readv(bio, 5406 bdev_io->u.bdev.iovs, 5407 bdev_io->u.bdev.iovcnt, 5408 bdev_io->u.bdev.md_buf, 5409 bdev_io->u.bdev.num_blocks, 5410 bdev_io->u.bdev.offset_blocks); 5411 if (ret == 0) { 5412 return; 5413 } 5414 } 5415 } 5416 5417 bdev_nvme_io_complete_nvme_status(bio, cpl); 5418 } 5419 5420 static void 5421 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5422 { 5423 struct nvme_bdev_io *bio = ref; 5424 5425 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5426 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 5427 cpl->status.sct, cpl->status.sc); 5428 /* Run PI verification for write data buffer if PI error is detected. */ 5429 bdev_nvme_verify_pi_error(bio); 5430 } 5431 5432 bdev_nvme_io_complete_nvme_status(bio, cpl); 5433 } 5434 5435 static void 5436 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 5437 { 5438 struct nvme_bdev_io *bio = ref; 5439 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5440 5441 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 5442 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 5443 */ 5444 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 5445 5446 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5447 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 5448 cpl->status.sct, cpl->status.sc); 5449 /* Run PI verification for zone append data buffer if PI error is detected. */ 5450 bdev_nvme_verify_pi_error(bio); 5451 } 5452 5453 bdev_nvme_io_complete_nvme_status(bio, cpl); 5454 } 5455 5456 static void 5457 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5458 { 5459 struct nvme_bdev_io *bio = ref; 5460 5461 if (spdk_nvme_cpl_is_pi_error(cpl)) { 5462 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 5463 cpl->status.sct, cpl->status.sc); 5464 /* Run PI verification for compare data buffer if PI error is detected. */ 5465 bdev_nvme_verify_pi_error(bio); 5466 } 5467 5468 bdev_nvme_io_complete_nvme_status(bio, cpl); 5469 } 5470 5471 static void 5472 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 5473 { 5474 struct nvme_bdev_io *bio = ref; 5475 5476 /* Compare operation completion */ 5477 if (!bio->first_fused_completed) { 5478 /* Save compare result for write callback */ 5479 bio->cpl = *cpl; 5480 bio->first_fused_completed = true; 5481 return; 5482 } 5483 5484 /* Write operation completion */ 5485 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 5486 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 5487 * complete the IO with the compare operation's status. 
5488 */ 5489 if (!spdk_nvme_cpl_is_error(cpl)) { 5490 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 5491 } 5492 5493 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 5494 } else { 5495 bdev_nvme_io_complete_nvme_status(bio, cpl); 5496 } 5497 } 5498 5499 static void 5500 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 5501 { 5502 struct nvme_bdev_io *bio = ref; 5503 5504 bdev_nvme_io_complete_nvme_status(bio, cpl); 5505 } 5506 5507 static int 5508 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 5509 { 5510 switch (desc->zs) { 5511 case SPDK_NVME_ZONE_STATE_EMPTY: 5512 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 5513 break; 5514 case SPDK_NVME_ZONE_STATE_IOPEN: 5515 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 5516 break; 5517 case SPDK_NVME_ZONE_STATE_EOPEN: 5518 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 5519 break; 5520 case SPDK_NVME_ZONE_STATE_CLOSED: 5521 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 5522 break; 5523 case SPDK_NVME_ZONE_STATE_RONLY: 5524 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 5525 break; 5526 case SPDK_NVME_ZONE_STATE_FULL: 5527 info->state = SPDK_BDEV_ZONE_STATE_FULL; 5528 break; 5529 case SPDK_NVME_ZONE_STATE_OFFLINE: 5530 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 5531 break; 5532 default: 5533 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 5534 return -EIO; 5535 } 5536 5537 info->zone_id = desc->zslba; 5538 info->write_pointer = desc->wp; 5539 info->capacity = desc->zcap; 5540 5541 return 0; 5542 } 5543 5544 static void 5545 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 5546 { 5547 struct nvme_bdev_io *bio = ref; 5548 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5549 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 5550 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 5551 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 5552 uint64_t max_zones_per_buf, i; 5553 uint32_t zone_report_bufsize; 5554 struct spdk_nvme_ns *ns; 5555 struct spdk_nvme_qpair *qpair; 5556 int ret; 5557 5558 if (spdk_nvme_cpl_is_error(cpl)) { 5559 goto out_complete_io_nvme_cpl; 5560 } 5561 5562 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 5563 ret = -ENXIO; 5564 goto out_complete_io_ret; 5565 } 5566 5567 ns = bio->io_path->nvme_ns->ns; 5568 qpair = bio->io_path->qpair->qpair; 5569 5570 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 5571 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 5572 sizeof(bio->zone_report_buf->descs[0]); 5573 5574 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 5575 ret = -EINVAL; 5576 goto out_complete_io_ret; 5577 } 5578 5579 if (!bio->zone_report_buf->nr_zones) { 5580 ret = -EINVAL; 5581 goto out_complete_io_ret; 5582 } 5583 5584 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 5585 ret = fill_zone_from_report(&info[bio->handled_zones], 5586 &bio->zone_report_buf->descs[i]); 5587 if (ret) { 5588 goto out_complete_io_ret; 5589 } 5590 bio->handled_zones++; 5591 } 5592 5593 if (bio->handled_zones < zones_to_copy) { 5594 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 5595 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 5596 5597 memset(bio->zone_report_buf, 0, zone_report_bufsize); 5598 ret = spdk_nvme_zns_report_zones(ns, qpair, 5599 bio->zone_report_buf, zone_report_bufsize, 5600 slba, SPDK_NVME_ZRA_LIST_ALL, true, 5601 bdev_nvme_get_zone_info_done, 
bio); 5602 if (!ret) { 5603 return; 5604 } else { 5605 goto out_complete_io_ret; 5606 } 5607 } 5608 5609 out_complete_io_nvme_cpl: 5610 free(bio->zone_report_buf); 5611 bio->zone_report_buf = NULL; 5612 bdev_nvme_io_complete_nvme_status(bio, cpl); 5613 return; 5614 5615 out_complete_io_ret: 5616 free(bio->zone_report_buf); 5617 bio->zone_report_buf = NULL; 5618 bdev_nvme_io_complete(bio, ret); 5619 } 5620 5621 static void 5622 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 5623 { 5624 struct nvme_bdev_io *bio = ref; 5625 5626 bdev_nvme_io_complete_nvme_status(bio, cpl); 5627 } 5628 5629 static void 5630 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 5631 { 5632 struct nvme_bdev_io *bio = ctx; 5633 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5634 const struct spdk_nvme_cpl *cpl = &bio->cpl; 5635 struct nvme_bdev_channel *nbdev_ch; 5636 struct nvme_ctrlr *nvme_ctrlr; 5637 const struct spdk_nvme_ctrlr_data *cdata; 5638 uint64_t delay_ms; 5639 5640 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 5641 5642 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 5643 goto complete; 5644 } 5645 5646 if (cpl->status.dnr != 0 || (g_opts.bdev_retry_count != -1 && 5647 bio->retry_count >= g_opts.bdev_retry_count)) { 5648 goto complete; 5649 } 5650 5651 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 5652 nvme_ctrlr = bio->io_path->qpair->ctrlr; 5653 5654 if (spdk_nvme_cpl_is_path_error(cpl) || 5655 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 5656 !nvme_ctrlr_is_available(nvme_ctrlr)) { 5657 delay_ms = 0; 5658 } else if (spdk_nvme_cpl_is_aborted_by_request(cpl)) { 5659 goto complete; 5660 } else { 5661 bio->retry_count++; 5662 5663 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 5664 5665 if (cpl->status.crd != 0) { 5666 delay_ms = cdata->crdt[cpl->status.crd] * 100; 5667 } else { 5668 delay_ms = 0; 5669 } 5670 } 5671 5672 if (any_ctrlr_may_become_available(nbdev_ch)) { 5673 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 5674 return; 5675 } 5676 5677 complete: 5678 bio->retry_count = 0; 5679 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 5680 } 5681 5682 static void 5683 bdev_nvme_abort_complete(void *ctx) 5684 { 5685 struct nvme_bdev_io *bio = ctx; 5686 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 5687 5688 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 5689 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5690 } else { 5691 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5692 } 5693 } 5694 5695 static void 5696 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 5697 { 5698 struct nvme_bdev_io *bio = ref; 5699 5700 bio->cpl = *cpl; 5701 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); 5702 } 5703 5704 static void 5705 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 5706 { 5707 struct nvme_bdev_io *bio = ref; 5708 5709 bio->cpl = *cpl; 5710 spdk_thread_send_msg(bio->orig_thread, 5711 bdev_nvme_admin_passthru_complete_nvme_status, bio); 5712 } 5713 5714 static void 5715 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 5716 { 5717 struct nvme_bdev_io *bio = ref; 5718 struct iovec *iov; 5719 5720 bio->iov_offset = sgl_offset; 5721 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 5722 iov = &bio->iovs[bio->iovpos]; 5723 if (bio->iov_offset < iov->iov_len) { 5724 break; 5725 } 5726 5727 bio->iov_offset -= iov->iov_len; 5728 } 5729 } 5730 5731 static int 
5732 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 5733 { 5734 struct nvme_bdev_io *bio = ref; 5735 struct iovec *iov; 5736 5737 assert(bio->iovpos < bio->iovcnt); 5738 5739 iov = &bio->iovs[bio->iovpos]; 5740 5741 *address = iov->iov_base; 5742 *length = iov->iov_len; 5743 5744 if (bio->iov_offset) { 5745 assert(bio->iov_offset <= iov->iov_len); 5746 *address += bio->iov_offset; 5747 *length -= bio->iov_offset; 5748 } 5749 5750 bio->iov_offset += *length; 5751 if (bio->iov_offset == iov->iov_len) { 5752 bio->iovpos++; 5753 bio->iov_offset = 0; 5754 } 5755 5756 return 0; 5757 } 5758 5759 static void 5760 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 5761 { 5762 struct nvme_bdev_io *bio = ref; 5763 struct iovec *iov; 5764 5765 bio->fused_iov_offset = sgl_offset; 5766 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 5767 iov = &bio->fused_iovs[bio->fused_iovpos]; 5768 if (bio->fused_iov_offset < iov->iov_len) { 5769 break; 5770 } 5771 5772 bio->fused_iov_offset -= iov->iov_len; 5773 } 5774 } 5775 5776 static int 5777 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 5778 { 5779 struct nvme_bdev_io *bio = ref; 5780 struct iovec *iov; 5781 5782 assert(bio->fused_iovpos < bio->fused_iovcnt); 5783 5784 iov = &bio->fused_iovs[bio->fused_iovpos]; 5785 5786 *address = iov->iov_base; 5787 *length = iov->iov_len; 5788 5789 if (bio->fused_iov_offset) { 5790 assert(bio->fused_iov_offset <= iov->iov_len); 5791 *address += bio->fused_iov_offset; 5792 *length -= bio->fused_iov_offset; 5793 } 5794 5795 bio->fused_iov_offset += *length; 5796 if (bio->fused_iov_offset == iov->iov_len) { 5797 bio->fused_iovpos++; 5798 bio->fused_iov_offset = 0; 5799 } 5800 5801 return 0; 5802 } 5803 5804 static int 5805 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5806 void *md, uint64_t lba_count, uint64_t lba) 5807 { 5808 int rc; 5809 5810 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 5811 lba_count, lba); 5812 5813 bio->iovs = iov; 5814 bio->iovcnt = iovcnt; 5815 bio->iovpos = 0; 5816 bio->iov_offset = 0; 5817 5818 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 5819 bio->io_path->qpair->qpair, 5820 lba, lba_count, 5821 bdev_nvme_no_pi_readv_done, bio, 0, 5822 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 5823 md, 0, 0); 5824 5825 if (rc != 0 && rc != -ENOMEM) { 5826 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 5827 } 5828 return rc; 5829 } 5830 5831 static int 5832 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 5833 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 5834 struct spdk_bdev_ext_io_opts *ext_opts) 5835 { 5836 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 5837 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 5838 int rc; 5839 5840 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 5841 lba_count, lba); 5842 5843 bio->iovs = iov; 5844 bio->iovcnt = iovcnt; 5845 bio->iovpos = 0; 5846 bio->iov_offset = 0; 5847 5848 if (ext_opts) { 5849 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 5850 bio->ext_opts.memory_domain = ext_opts->memory_domain; 5851 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 5852 bio->ext_opts.io_flags = flags; 5853 bio->ext_opts.metadata = md; 5854 5855 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 5856 bdev_nvme_readv_done, bio, 5857 
static int
bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
		struct spdk_bdev_ext_io_opts *ext_opts)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (ext_opts) {
		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
		bio->ext_opts.memory_domain = ext_opts->memory_domain;
		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
		bio->ext_opts.io_flags = flags;
		bio->ext_opts.metadata = md;

		rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
						bdev_nvme_readv_done, bio,
						bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						&bio->ext_opts);
	} else if (iovcnt == 1) {
		rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba,
						   lba_count,
						   bdev_nvme_readv_done, bio,
						   flags,
						   0, 0);
	} else {
		rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
						    bdev_nvme_readv_done, bio, flags,
						    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						    md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("readv failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		 void *md, uint64_t lba_count, uint64_t lba,
		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (ext_opts) {
		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
		bio->ext_opts.memory_domain = ext_opts->memory_domain;
		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
		bio->ext_opts.io_flags = flags;
		bio->ext_opts.metadata = md;

		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
						 bdev_nvme_writev_done, bio,
						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						 &bio->ext_opts);
	} else if (iovcnt == 1) {
		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
						    lba_count,
						    bdev_nvme_writev_done, bio,
						    flags,
						    0, 0);
	} else {
		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
						     bdev_nvme_writev_done, bio, flags,
						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						     md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		       void *md, uint64_t lba_count, uint64_t zslba,
		       uint32_t flags)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
		      lba_count, zslba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (iovcnt == 1) {
		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
						       lba_count,
						       bdev_nvme_zone_appendv_done, bio,
						       flags,
						       0, 0);
	} else {
		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
							bdev_nvme_zone_appendv_done, bio, flags,
							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
							md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
	}
	return rc;
}
static int
bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
		   void *md, uint64_t lba_count, uint64_t lba,
		   uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
					       bio->io_path->qpair->qpair,
					       lba, lba_count,
					       bdev_nvme_comparev_done, bio, flags,
					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
					       md, 0, 0);

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
			      struct iovec *write_iov, int write_iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = cmp_iov;
	bio->iovcnt = cmp_iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;
	bio->fused_iovs = write_iov;
	bio->fused_iovcnt = write_iovcnt;
	bio->fused_iovpos = 0;
	bio->fused_iov_offset = 0;

	/* On the first submission attempt (i.e. not a bdev-layer retry), reset the
	 * fused-command tracking state.
	 */
	if (bdev_io->num_retries == 0) {
		bio->first_fused_submitted = false;
		bio->first_fused_completed = false;
	}

	if (!bio->first_fused_submitted) {
		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		memset(&bio->cpl, 0, sizeof(bio->cpl));

		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
						       bdev_nvme_comparev_and_writev_done, bio, flags,
						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
		if (rc == 0) {
			bio->first_fused_submitted = true;
			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		} else {
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
			}
			return rc;
		}
	}

	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;

	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
					     bdev_nvme_comparev_and_writev_done, bio, flags,
					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("write failed: rc = %d\n", rc);
		/* The compare half of the fused pair is already in flight at this point,
		 * so only log the submission error for the write half and clear rc.
		 */
		rc = 0;
	}

	return rc;
}
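/*
 * Unmap below is translated into a single Dataset Management (deallocate)
 * command.  The request is split into ranges of at most
 * SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS blocks each, and at most
 * SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES ranges fit into one command;
 * anything larger is rejected with -EINVAL.
 *
 * Worked example (hypothetical numbers): if the per-range maximum were
 * 0xFFFFFFFF blocks, an unmap of 2^33 blocks would need
 * ceil(2^33 / 0xFFFFFFFF) = 3 ranges: two full-size ranges and one final
 * range holding the 2-block remainder.
 */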
static int
bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64;
	uint16_t num_ranges;
	int rc;

	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
		return -EINVAL;
	}
	num_ranges = (uint16_t)num_ranges_u64;

	offset = offset_blocks;
	remaining = num_blocks;
	range = &dsm_ranges[0];

	/* Fill max-size ranges until the remaining blocks fit into one range */
	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
		range->attributes.raw = 0;
		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range->starting_lba = offset;

		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range++;
	}

	/* Final range describes the remaining blocks */
	range->attributes.raw = 0;
	range->length = remaining;
	range->starting_lba = offset;

	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
						 bio->io_path->qpair->qpair,
						 SPDK_NVME_DSM_ATTR_DEALLOCATE,
						 dsm_ranges, num_ranges,
						 bdev_nvme_queued_done, bio);

	return rc;
}

static int
bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	/* The Write Zeroes block count field is 16 bits wide and 0-based, so a
	 * single command covers at most 65536 blocks.
	 */
	if (num_blocks > UINT16_MAX + 1) {
		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
		return -EINVAL;
	}

	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
					     bio->io_path->qpair->qpair,
					     offset_blocks, num_blocks,
					     bdev_nvme_queued_done, bio,
					     0);
}

static int
bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);

	if (zone_id % zone_size != 0) {
		return -EINVAL;
	}

	if (num_zones > total_zones || !num_zones) {
		return -EINVAL;
	}

	assert(!bio->zone_report_buf);
	bio->zone_report_buf = calloc(1, zone_report_bufsize);
	if (!bio->zone_report_buf) {
		return -ENOMEM;
	}

	bio->handled_zones = 0;

	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
					  bdev_nvme_get_zone_info_done, bio);
}

static int
bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
			  enum spdk_bdev_zone_action action)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;

	switch (action) {
	case SPDK_BDEV_ZONE_CLOSE:
		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_FINISH:
		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
						 bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OPEN:
		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
					       bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_RESET:
		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OFFLINE:
		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
						  bdev_nvme_zone_management_done, bio);
	default:
		return -EINVAL;
	}
}
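/*
 * Admin passthrough is not tied to a namespace, so instead of using the
 * channel's current I/O path it walks the io_path list and submits the
 * command to the first ctrlr that is currently available.  If the command
 * later fails with a retryable status,
 * bdev_nvme_admin_passthru_complete_nvme_status() above may queue it for
 * another attempt, possibly on a different ctrlr.
 */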
static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Submit the command to the first ctrlr that has not failed. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* Skip any unavailable nvme_ctrlr up front rather than relying on
		 * spdk_nvme_ctrlr_cmd_admin_raw() returning -ENXIO.
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		bio->io_path = io_path;
		bio->orig_thread = spdk_get_thread();

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_passthru_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}
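/*
 * Abort handling below proceeds in two stages: first the channel's
 * retry_io_list is scanned, since an I/O that is only queued for retry can be
 * completed as aborted without touching the controller; otherwise an NVMe
 * Abort command is attempted on each io_path, first against the I/O qpair
 * and then, if the command is not found there, against the admin queue.
 */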
static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *bdev_io_to_abort;
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	int rc = 0;

	bio->orig_thread = spdk_get_thread();

	/* Traverse the retry_io_list first. */
	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			spdk_bdev_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);

			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
			return;
		}
	}

	/* Even admin commands are submitted only to nvme_ctrlrs that are reachable
	 * through some io_path, so traverse the io_path list for admin commands as
	 * well as I/O commands.
	 */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
		if (rc == -ENOENT) {
			/* If no command was found in the I/O qpair, the target command may be
			 * an admin command.
			 */
			rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);
		}

		if (rc != -ENOENT) {
			break;
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}
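/*
 * The *_config_json() helpers below emit "method"/"params" objects describing
 * the current module configuration.  The intent is that the generated JSON
 * can be replayed through the corresponding RPCs (bdev_nvme_set_options,
 * bdev_nvme_attach_controller, bdev_nvme_start_discovery,
 * bdev_nvme_set_hotplug) to reproduce the running configuration at startup.
 */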
static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		bdev_nvme_discovery_config_json(w, ctx);
	}

	/* Dump this as the last parameter to give all NVMe bdevs a chance to be
	 * constructed before the hotplug poller is enabled.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);

	spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path);

	spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path));

	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)