/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000)

#define NSID_STR_LEN 10

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** Array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in the iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 *  being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** Array of iovecs for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in the fused_iovs array. */
	int fused_iovcnt;

	/** Current position in the fused_iovs array. */
	int fused_iovpos;

	/** Offset in the current fused iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification,
	 *  or intermediate compare-and-write status.
	 */
	struct spdk_nvme_cpl cpl;

	/** Extended I/O opts passed by the user to the bdev layer and mapped to NVMe format. */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Tracks whether the first of the fused commands has been submitted. */
	bool first_fused_submitted;

	/** Tracks whether the first of the fused commands has been completed. */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer. */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Number of zones that have been copied to the spdk_bdev_zone_info struct. */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time.
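	 * Used to calculate this I/O's latency when per-io_path statistics are
	 * enabled (see bdev_nvme_update_io_path_stat()).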
*/ 96 uint64_t submit_tsc; 97 }; 98 99 struct nvme_probe_skip_entry { 100 struct spdk_nvme_transport_id trid; 101 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 102 }; 103 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 104 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 105 g_skipped_nvme_ctrlrs); 106 107 static struct spdk_bdev_nvme_opts g_opts = { 108 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 109 .timeout_us = 0, 110 .timeout_admin_us = 0, 111 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 112 .transport_retry_count = 4, 113 .arbitration_burst = 0, 114 .low_priority_weight = 0, 115 .medium_priority_weight = 0, 116 .high_priority_weight = 0, 117 .nvme_adminq_poll_period_us = 10000ULL, 118 .nvme_ioq_poll_period_us = 0, 119 .io_queue_requests = 0, 120 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 121 .bdev_retry_count = 3, 122 .transport_ack_timeout = 0, 123 .ctrlr_loss_timeout_sec = 0, 124 .reconnect_delay_sec = 0, 125 .fast_io_fail_timeout_sec = 0, 126 .disable_auto_failback = false, 127 .generate_uuids = false, 128 .transport_tos = 0, 129 .nvme_error_stat = false, 130 .io_path_stat = false, 131 .allow_accel_sequence = false, 132 }; 133 134 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 135 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 136 137 static int g_hot_insert_nvme_controller_index = 0; 138 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 139 static bool g_nvme_hotplug_enabled = false; 140 struct spdk_thread *g_bdev_nvme_init_thread; 141 static struct spdk_poller *g_hotplug_poller; 142 static struct spdk_poller *g_hotplug_probe_poller; 143 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 144 145 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 146 struct nvme_async_probe_ctx *ctx); 147 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 148 struct nvme_async_probe_ctx *ctx); 149 static int bdev_nvme_library_init(void); 150 static void bdev_nvme_library_fini(void); 151 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 152 struct spdk_bdev_io *bdev_io); 153 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 154 struct spdk_bdev_io *bdev_io); 155 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 156 void *md, uint64_t lba_count, uint64_t lba, 157 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 158 struct spdk_accel_sequence *seq); 159 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 160 void *md, uint64_t lba_count, uint64_t lba); 161 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 162 void *md, uint64_t lba_count, uint64_t lba, 163 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 164 struct spdk_accel_sequence *seq); 165 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 166 void *md, uint64_t lba_count, 167 uint64_t zslba, uint32_t flags); 168 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 169 void *md, uint64_t lba_count, uint64_t lba, 170 uint32_t flags); 171 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 172 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 173 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 174 uint32_t flags); 175 static 
int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 176 uint32_t num_zones, struct spdk_bdev_zone_info *info); 177 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 178 enum spdk_bdev_zone_action action); 179 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 180 struct nvme_bdev_io *bio, 181 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 182 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 183 void *buf, size_t nbytes); 184 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 185 void *buf, size_t nbytes, void *md_buf, size_t md_len); 186 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 187 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 188 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 189 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 190 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove); 191 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 192 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 193 194 static struct nvme_ns *nvme_ns_alloc(void); 195 static void nvme_ns_free(struct nvme_ns *ns); 196 197 static int 198 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 199 { 200 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 201 } 202 203 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 204 205 struct spdk_nvme_qpair * 206 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 207 { 208 struct nvme_ctrlr_channel *ctrlr_ch; 209 210 assert(ctrlr_io_ch != NULL); 211 212 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 213 214 return ctrlr_ch->qpair->qpair; 215 } 216 217 static int 218 bdev_nvme_get_ctx_size(void) 219 { 220 return sizeof(struct nvme_bdev_io); 221 } 222 223 static struct spdk_bdev_module nvme_if = { 224 .name = "nvme", 225 .async_fini = true, 226 .module_init = bdev_nvme_library_init, 227 .module_fini = bdev_nvme_library_fini, 228 .config_json = bdev_nvme_config_json, 229 .get_ctx_size = bdev_nvme_get_ctx_size, 230 231 }; 232 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 233 234 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 235 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 236 bool g_bdev_nvme_module_finish; 237 238 struct nvme_bdev_ctrlr * 239 nvme_bdev_ctrlr_get_by_name(const char *name) 240 { 241 struct nvme_bdev_ctrlr *nbdev_ctrlr; 242 243 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 244 if (strcmp(name, nbdev_ctrlr->name) == 0) { 245 break; 246 } 247 } 248 249 return nbdev_ctrlr; 250 } 251 252 static struct nvme_ctrlr * 253 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 254 const struct spdk_nvme_transport_id *trid) 255 { 256 struct nvme_ctrlr *nvme_ctrlr; 257 258 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 259 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 260 break; 261 } 262 } 263 264 return nvme_ctrlr; 265 } 266 267 struct nvme_ctrlr * 268 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 269 uint16_t cntlid) 270 { 271 struct nvme_ctrlr *nvme_ctrlr; 272 const struct spdk_nvme_ctrlr_data *cdata; 273 274 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 275 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 276 if (cdata->cntlid == cntlid) { 277 break; 278 } 279 } 280 281 
return nvme_ctrlr; 282 } 283 284 static struct nvme_bdev * 285 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 286 { 287 struct nvme_bdev *bdev; 288 289 pthread_mutex_lock(&g_bdev_nvme_mutex); 290 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 291 if (bdev->nsid == nsid) { 292 break; 293 } 294 } 295 pthread_mutex_unlock(&g_bdev_nvme_mutex); 296 297 return bdev; 298 } 299 300 struct nvme_ns * 301 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 302 { 303 struct nvme_ns ns; 304 305 assert(nsid > 0); 306 307 ns.id = nsid; 308 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 309 } 310 311 struct nvme_ns * 312 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 313 { 314 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 315 } 316 317 struct nvme_ns * 318 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 319 { 320 if (ns == NULL) { 321 return NULL; 322 } 323 324 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 325 } 326 327 static struct nvme_ctrlr * 328 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 329 { 330 struct nvme_bdev_ctrlr *nbdev_ctrlr; 331 struct nvme_ctrlr *nvme_ctrlr = NULL; 332 333 pthread_mutex_lock(&g_bdev_nvme_mutex); 334 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 335 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 336 if (nvme_ctrlr != NULL) { 337 break; 338 } 339 } 340 pthread_mutex_unlock(&g_bdev_nvme_mutex); 341 342 return nvme_ctrlr; 343 } 344 345 struct nvme_ctrlr * 346 nvme_ctrlr_get_by_name(const char *name) 347 { 348 struct nvme_bdev_ctrlr *nbdev_ctrlr; 349 struct nvme_ctrlr *nvme_ctrlr = NULL; 350 351 if (name == NULL) { 352 return NULL; 353 } 354 355 pthread_mutex_lock(&g_bdev_nvme_mutex); 356 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 357 if (nbdev_ctrlr != NULL) { 358 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 359 } 360 pthread_mutex_unlock(&g_bdev_nvme_mutex); 361 362 return nvme_ctrlr; 363 } 364 365 void 366 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 367 { 368 struct nvme_bdev_ctrlr *nbdev_ctrlr; 369 370 pthread_mutex_lock(&g_bdev_nvme_mutex); 371 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 372 fn(nbdev_ctrlr, ctx); 373 } 374 pthread_mutex_unlock(&g_bdev_nvme_mutex); 375 } 376 377 void 378 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 379 { 380 const char *trtype_str; 381 const char *adrfam_str; 382 383 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 384 if (trtype_str) { 385 spdk_json_write_named_string(w, "trtype", trtype_str); 386 } 387 388 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 389 if (adrfam_str) { 390 spdk_json_write_named_string(w, "adrfam", adrfam_str); 391 } 392 393 if (trid->traddr[0] != '\0') { 394 spdk_json_write_named_string(w, "traddr", trid->traddr); 395 } 396 397 if (trid->trsvcid[0] != '\0') { 398 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 399 } 400 401 if (trid->subnqn[0] != '\0') { 402 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 403 } 404 } 405 406 static void 407 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 408 struct nvme_ctrlr *nvme_ctrlr) 409 { 410 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 411 pthread_mutex_lock(&g_bdev_nvme_mutex); 412 413 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 414 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 415 
pthread_mutex_unlock(&g_bdev_nvme_mutex); 416 417 return; 418 } 419 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 420 421 pthread_mutex_unlock(&g_bdev_nvme_mutex); 422 423 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 424 425 free(nbdev_ctrlr->name); 426 free(nbdev_ctrlr); 427 } 428 429 static void 430 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 431 { 432 struct nvme_path_id *path_id, *tmp_path; 433 struct nvme_ns *ns, *tmp_ns; 434 435 free(nvme_ctrlr->copied_ana_desc); 436 spdk_free(nvme_ctrlr->ana_log_page); 437 438 if (nvme_ctrlr->opal_dev) { 439 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 440 nvme_ctrlr->opal_dev = NULL; 441 } 442 443 if (nvme_ctrlr->nbdev_ctrlr) { 444 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 445 } 446 447 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 448 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 449 nvme_ns_free(ns); 450 } 451 452 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 453 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 454 free(path_id); 455 } 456 457 pthread_mutex_destroy(&nvme_ctrlr->mutex); 458 459 free(nvme_ctrlr); 460 461 pthread_mutex_lock(&g_bdev_nvme_mutex); 462 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 463 pthread_mutex_unlock(&g_bdev_nvme_mutex); 464 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 465 spdk_bdev_module_fini_done(); 466 return; 467 } 468 pthread_mutex_unlock(&g_bdev_nvme_mutex); 469 } 470 471 static int 472 nvme_detach_poller(void *arg) 473 { 474 struct nvme_ctrlr *nvme_ctrlr = arg; 475 int rc; 476 477 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 478 if (rc != -EAGAIN) { 479 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 480 _nvme_ctrlr_delete(nvme_ctrlr); 481 } 482 483 return SPDK_POLLER_BUSY; 484 } 485 486 static void 487 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 488 { 489 int rc; 490 491 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 492 493 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 494 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 495 496 /* If we got here, the reset/detach poller cannot be active */ 497 assert(nvme_ctrlr->reset_detach_poller == NULL); 498 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 499 nvme_ctrlr, 1000); 500 if (nvme_ctrlr->reset_detach_poller == NULL) { 501 SPDK_ERRLOG("Failed to register detach poller\n"); 502 goto error; 503 } 504 505 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 506 if (rc != 0) { 507 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 508 goto error; 509 } 510 511 return; 512 error: 513 /* We don't have a good way to handle errors here, so just do what we can and delete the 514 * controller without detaching the underlying NVMe device. 
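	 * In that case the underlying spdk_nvme_ctrlr is left attached, but the
	 * bdev-level state is still released.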
515 */ 516 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 517 _nvme_ctrlr_delete(nvme_ctrlr); 518 } 519 520 static void 521 nvme_ctrlr_unregister_cb(void *io_device) 522 { 523 struct nvme_ctrlr *nvme_ctrlr = io_device; 524 525 nvme_ctrlr_delete(nvme_ctrlr); 526 } 527 528 static void 529 nvme_ctrlr_unregister(void *ctx) 530 { 531 struct nvme_ctrlr *nvme_ctrlr = ctx; 532 533 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 534 } 535 536 static bool 537 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 538 { 539 if (!nvme_ctrlr->destruct) { 540 return false; 541 } 542 543 if (nvme_ctrlr->ref > 0) { 544 return false; 545 } 546 547 if (nvme_ctrlr->resetting) { 548 return false; 549 } 550 551 if (nvme_ctrlr->ana_log_page_updating) { 552 return false; 553 } 554 555 if (nvme_ctrlr->io_path_cache_clearing) { 556 return false; 557 } 558 559 return true; 560 } 561 562 static void 563 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 564 { 565 pthread_mutex_lock(&nvme_ctrlr->mutex); 566 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 567 568 assert(nvme_ctrlr->ref > 0); 569 nvme_ctrlr->ref--; 570 571 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 572 pthread_mutex_unlock(&nvme_ctrlr->mutex); 573 return; 574 } 575 576 pthread_mutex_unlock(&nvme_ctrlr->mutex); 577 578 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 579 } 580 581 static void 582 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 583 { 584 nbdev_ch->current_io_path = NULL; 585 nbdev_ch->rr_counter = 0; 586 } 587 588 static struct nvme_io_path * 589 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 590 { 591 struct nvme_io_path *io_path; 592 593 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 594 if (io_path->nvme_ns == nvme_ns) { 595 break; 596 } 597 } 598 599 return io_path; 600 } 601 602 static struct nvme_io_path * 603 nvme_io_path_alloc(void) 604 { 605 struct nvme_io_path *io_path; 606 607 io_path = calloc(1, sizeof(*io_path)); 608 if (io_path == NULL) { 609 SPDK_ERRLOG("Failed to alloc io_path.\n"); 610 return NULL; 611 } 612 613 if (g_opts.io_path_stat) { 614 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 615 if (io_path->stat == NULL) { 616 free(io_path); 617 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 618 return NULL; 619 } 620 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 621 } 622 623 return io_path; 624 } 625 626 static void 627 nvme_io_path_free(struct nvme_io_path *io_path) 628 { 629 free(io_path->stat); 630 free(io_path); 631 } 632 633 static int 634 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 635 { 636 struct nvme_io_path *io_path; 637 struct spdk_io_channel *ch; 638 struct nvme_ctrlr_channel *ctrlr_ch; 639 struct nvme_qpair *nvme_qpair; 640 641 io_path = nvme_io_path_alloc(); 642 if (io_path == NULL) { 643 return -ENOMEM; 644 } 645 646 io_path->nvme_ns = nvme_ns; 647 648 ch = spdk_get_io_channel(nvme_ns->ctrlr); 649 if (ch == NULL) { 650 nvme_io_path_free(io_path); 651 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 652 return -ENOMEM; 653 } 654 655 ctrlr_ch = spdk_io_channel_get_ctx(ch); 656 657 nvme_qpair = ctrlr_ch->qpair; 658 assert(nvme_qpair != NULL); 659 660 io_path->qpair = nvme_qpair; 661 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 662 663 io_path->nbdev_ch = nbdev_ch; 664 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 665 666 
bdev_nvme_clear_current_io_path(nbdev_ch); 667 668 return 0; 669 } 670 671 static void 672 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 673 struct nvme_io_path *io_path) 674 { 675 struct spdk_bdev_io *bdev_io; 676 struct nvme_bdev_io *bio; 677 678 TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) { 679 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 680 if (bio->io_path == io_path) { 681 bio->io_path = NULL; 682 } 683 } 684 } 685 686 static void 687 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 688 { 689 struct spdk_io_channel *ch; 690 struct nvme_qpair *nvme_qpair; 691 struct nvme_ctrlr_channel *ctrlr_ch; 692 struct nvme_bdev *nbdev; 693 694 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 695 696 /* Add the statistics to nvme_ns before this path is destroyed. */ 697 pthread_mutex_lock(&nbdev->mutex); 698 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 699 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 700 } 701 pthread_mutex_unlock(&nbdev->mutex); 702 703 bdev_nvme_clear_current_io_path(nbdev_ch); 704 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 705 706 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 707 io_path->nbdev_ch = NULL; 708 709 nvme_qpair = io_path->qpair; 710 assert(nvme_qpair != NULL); 711 712 ctrlr_ch = nvme_qpair->ctrlr_ch; 713 assert(ctrlr_ch != NULL); 714 715 ch = spdk_io_channel_from_ctx(ctrlr_ch); 716 spdk_put_io_channel(ch); 717 718 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 719 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 720 * io_path here but free the io_path when the associated qpair is freed. It is ensured 721 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 722 */ 723 } 724 725 static void 726 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 727 { 728 struct nvme_io_path *io_path, *tmp_io_path; 729 730 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 731 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 732 } 733 } 734 735 static int 736 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 737 { 738 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 739 struct nvme_bdev *nbdev = io_device; 740 struct nvme_ns *nvme_ns; 741 int rc; 742 743 STAILQ_INIT(&nbdev_ch->io_path_list); 744 TAILQ_INIT(&nbdev_ch->retry_io_list); 745 746 pthread_mutex_lock(&nbdev->mutex); 747 748 nbdev_ch->mp_policy = nbdev->mp_policy; 749 nbdev_ch->mp_selector = nbdev->mp_selector; 750 nbdev_ch->rr_min_io = nbdev->rr_min_io; 751 752 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 753 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 754 if (rc != 0) { 755 pthread_mutex_unlock(&nbdev->mutex); 756 757 _bdev_nvme_delete_io_paths(nbdev_ch); 758 return rc; 759 } 760 } 761 pthread_mutex_unlock(&nbdev->mutex); 762 763 return 0; 764 } 765 766 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 767 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
768 */ 769 static inline void 770 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 771 const struct spdk_nvme_cpl *cpl) 772 { 773 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 774 (uintptr_t)bdev_io); 775 if (cpl) { 776 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 777 } else { 778 spdk_bdev_io_complete(bdev_io, status); 779 } 780 } 781 782 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 783 784 static void 785 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 786 { 787 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 788 789 bdev_nvme_abort_retry_ios(nbdev_ch); 790 _bdev_nvme_delete_io_paths(nbdev_ch); 791 } 792 793 static inline bool 794 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 795 { 796 switch (io_type) { 797 case SPDK_BDEV_IO_TYPE_RESET: 798 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 799 case SPDK_BDEV_IO_TYPE_ABORT: 800 return true; 801 default: 802 break; 803 } 804 805 return false; 806 } 807 808 static inline bool 809 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 810 { 811 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 812 return false; 813 } 814 815 switch (nvme_ns->ana_state) { 816 case SPDK_NVME_ANA_OPTIMIZED_STATE: 817 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 818 return true; 819 default: 820 break; 821 } 822 823 return false; 824 } 825 826 static inline bool 827 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 828 { 829 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 830 return false; 831 } 832 833 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 834 SPDK_NVME_QPAIR_FAILURE_NONE)) { 835 return false; 836 } 837 838 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 839 return false; 840 } 841 842 if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_qpair->ctrlr->ctrlr) != 843 SPDK_NVME_QPAIR_FAILURE_NONE) { 844 return false; 845 } 846 847 return true; 848 } 849 850 static inline bool 851 nvme_io_path_is_available(struct nvme_io_path *io_path) 852 { 853 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 854 return false; 855 } 856 857 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 858 return false; 859 } 860 861 return true; 862 } 863 864 static inline bool 865 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 866 { 867 if (nvme_ctrlr->destruct) { 868 return true; 869 } 870 871 if (nvme_ctrlr->fast_io_fail_timedout) { 872 return true; 873 } 874 875 if (nvme_ctrlr->resetting) { 876 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 877 return false; 878 } else { 879 return true; 880 } 881 } 882 883 if (nvme_ctrlr->reconnect_is_delayed) { 884 return false; 885 } 886 887 if (nvme_ctrlr->disabled) { 888 return true; 889 } 890 891 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 892 return true; 893 } else { 894 return false; 895 } 896 } 897 898 static bool 899 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 900 { 901 if (nvme_ctrlr->destruct) { 902 return false; 903 } 904 905 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 906 return false; 907 } 908 909 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 910 return false; 911 } 912 913 if (nvme_ctrlr->disabled) { 914 return false; 915 } 916 917 return true; 918 } 919 920 /* Simulate circular linked list. 
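 * Returns the io_path that follows prev_path, wrapping around to the head of
 * io_path_list when the end of the list is reached (or when prev_path is NULL).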
*/ 921 static inline struct nvme_io_path * 922 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 923 { 924 struct nvme_io_path *next_path; 925 926 if (prev_path != NULL) { 927 next_path = STAILQ_NEXT(prev_path, stailq); 928 if (next_path != NULL) { 929 return next_path; 930 } 931 } 932 933 return STAILQ_FIRST(&nbdev_ch->io_path_list); 934 } 935 936 static struct nvme_io_path * 937 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 938 { 939 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 940 941 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 942 943 io_path = start; 944 do { 945 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 946 !io_path->nvme_ns->ana_state_updating)) { 947 switch (io_path->nvme_ns->ana_state) { 948 case SPDK_NVME_ANA_OPTIMIZED_STATE: 949 nbdev_ch->current_io_path = io_path; 950 return io_path; 951 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 952 if (non_optimized == NULL) { 953 non_optimized = io_path; 954 } 955 break; 956 default: 957 break; 958 } 959 } 960 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 961 } while (io_path != start); 962 963 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 964 /* We come here only if there is no optimized path. Cache even non_optimized 965 * path for load balance across multiple non_optimized paths. 966 */ 967 nbdev_ch->current_io_path = non_optimized; 968 } 969 970 return non_optimized; 971 } 972 973 static struct nvme_io_path * 974 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 975 { 976 struct nvme_io_path *io_path; 977 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 978 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 979 uint32_t num_outstanding_reqs; 980 981 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 982 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 983 /* The device is currently resetting. 
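		 * Skip this path; it cannot serve I/O until its qpair is connected again.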
*/ 984 continue; 985 } 986 987 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 988 continue; 989 } 990 991 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 992 switch (io_path->nvme_ns->ana_state) { 993 case SPDK_NVME_ANA_OPTIMIZED_STATE: 994 if (num_outstanding_reqs < opt_min_qd) { 995 opt_min_qd = num_outstanding_reqs; 996 optimized = io_path; 997 } 998 break; 999 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1000 if (num_outstanding_reqs < non_opt_min_qd) { 1001 non_opt_min_qd = num_outstanding_reqs; 1002 non_optimized = io_path; 1003 } 1004 break; 1005 default: 1006 break; 1007 } 1008 } 1009 1010 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1011 if (optimized != NULL) { 1012 return optimized; 1013 } 1014 1015 return non_optimized; 1016 } 1017 1018 static inline struct nvme_io_path * 1019 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1020 { 1021 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1022 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1023 return nbdev_ch->current_io_path; 1024 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1025 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1026 return nbdev_ch->current_io_path; 1027 } 1028 nbdev_ch->rr_counter = 0; 1029 } 1030 } 1031 1032 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1033 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1034 return _bdev_nvme_find_io_path(nbdev_ch); 1035 } else { 1036 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1037 } 1038 } 1039 1040 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1041 * or false otherwise. 1042 * 1043 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1044 * is likely to be non-accessible now but may become accessible. 1045 * 1046 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1047 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1048 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1049 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
1050 */ 1051 static bool 1052 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1053 { 1054 struct nvme_io_path *io_path; 1055 1056 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1057 if (io_path->nvme_ns->ana_transition_timedout) { 1058 continue; 1059 } 1060 1061 if (nvme_qpair_is_connected(io_path->qpair) || 1062 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1063 return true; 1064 } 1065 } 1066 1067 return false; 1068 } 1069 1070 static void 1071 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1072 { 1073 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1074 struct spdk_io_channel *ch; 1075 1076 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1077 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1078 } else { 1079 ch = spdk_io_channel_from_ctx(nbdev_ch); 1080 bdev_nvme_submit_request(ch, bdev_io); 1081 } 1082 } 1083 1084 static int 1085 bdev_nvme_retry_ios(void *arg) 1086 { 1087 struct nvme_bdev_channel *nbdev_ch = arg; 1088 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 1089 struct nvme_bdev_io *bio; 1090 uint64_t now, delay_us; 1091 1092 now = spdk_get_ticks(); 1093 1094 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 1095 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1096 if (bio->retry_ticks > now) { 1097 break; 1098 } 1099 1100 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1101 1102 bdev_nvme_retry_io(nbdev_ch, bdev_io); 1103 } 1104 1105 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1106 1107 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1108 if (bdev_io != NULL) { 1109 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1110 1111 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1112 1113 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1114 delay_us); 1115 } 1116 1117 return SPDK_POLLER_BUSY; 1118 } 1119 1120 static void 1121 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1122 struct nvme_bdev_io *bio, uint64_t delay_ms) 1123 { 1124 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1125 struct spdk_bdev_io *tmp_bdev_io; 1126 struct nvme_bdev_io *tmp_bio; 1127 1128 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1129 1130 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 1131 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 1132 1133 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1134 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 1135 module_link); 1136 return; 1137 } 1138 } 1139 1140 /* No earlier I/Os were found. This I/O must be the new head. 
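	 * Insert it at the head and re-arm the retry poller so that it fires
	 * according to this I/O's retry_ticks.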
*/ 1141 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 1142 1143 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1144 1145 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1146 delay_ms * 1000ULL); 1147 } 1148 1149 static void 1150 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1151 { 1152 struct spdk_bdev_io *bdev_io, *tmp_io; 1153 1154 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1155 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1156 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1157 } 1158 1159 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1160 } 1161 1162 static int 1163 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1164 struct nvme_bdev_io *bio_to_abort) 1165 { 1166 struct spdk_bdev_io *bdev_io_to_abort; 1167 1168 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1169 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1170 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1171 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1172 return 0; 1173 } 1174 } 1175 1176 return -ENOENT; 1177 } 1178 1179 static void 1180 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1181 { 1182 struct nvme_bdev *nbdev; 1183 uint16_t sct, sc; 1184 1185 assert(spdk_nvme_cpl_is_error(cpl)); 1186 1187 nbdev = bdev_io->bdev->ctxt; 1188 1189 if (nbdev->err_stat == NULL) { 1190 return; 1191 } 1192 1193 sct = cpl->status.sct; 1194 sc = cpl->status.sc; 1195 1196 pthread_mutex_lock(&nbdev->mutex); 1197 1198 nbdev->err_stat->status_type[sct]++; 1199 switch (sct) { 1200 case SPDK_NVME_SCT_GENERIC: 1201 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1202 case SPDK_NVME_SCT_MEDIA_ERROR: 1203 case SPDK_NVME_SCT_PATH: 1204 nbdev->err_stat->status[sct][sc]++; 1205 break; 1206 default: 1207 break; 1208 } 1209 1210 pthread_mutex_unlock(&nbdev->mutex); 1211 } 1212 1213 static inline void 1214 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1215 { 1216 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1217 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1218 uint32_t blocklen = bdev_io->bdev->blocklen; 1219 struct spdk_bdev_io_stat *stat; 1220 uint64_t tsc_diff; 1221 1222 if (bio->io_path->stat == NULL) { 1223 return; 1224 } 1225 1226 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1227 stat = bio->io_path->stat; 1228 1229 switch (bdev_io->type) { 1230 case SPDK_BDEV_IO_TYPE_READ: 1231 stat->bytes_read += num_blocks * blocklen; 1232 stat->num_read_ops++; 1233 stat->read_latency_ticks += tsc_diff; 1234 if (stat->max_read_latency_ticks < tsc_diff) { 1235 stat->max_read_latency_ticks = tsc_diff; 1236 } 1237 if (stat->min_read_latency_ticks > tsc_diff) { 1238 stat->min_read_latency_ticks = tsc_diff; 1239 } 1240 break; 1241 case SPDK_BDEV_IO_TYPE_WRITE: 1242 stat->bytes_written += num_blocks * blocklen; 1243 stat->num_write_ops++; 1244 stat->write_latency_ticks += tsc_diff; 1245 if (stat->max_write_latency_ticks < tsc_diff) { 1246 stat->max_write_latency_ticks = tsc_diff; 1247 } 1248 if (stat->min_write_latency_ticks > tsc_diff) { 1249 stat->min_write_latency_ticks = tsc_diff; 1250 } 1251 break; 1252 case SPDK_BDEV_IO_TYPE_UNMAP: 1253 stat->bytes_unmapped += num_blocks * blocklen; 1254 stat->num_unmap_ops++; 1255 stat->unmap_latency_ticks += tsc_diff; 1256 if (stat->max_unmap_latency_ticks < tsc_diff) { 1257 
stat->max_unmap_latency_ticks = tsc_diff; 1258 } 1259 if (stat->min_unmap_latency_ticks > tsc_diff) { 1260 stat->min_unmap_latency_ticks = tsc_diff; 1261 } 1262 break; 1263 case SPDK_BDEV_IO_TYPE_ZCOPY: 1264 /* Track the data in the start phase only */ 1265 if (!bdev_io->u.bdev.zcopy.start) { 1266 break; 1267 } 1268 if (bdev_io->u.bdev.zcopy.populate) { 1269 stat->bytes_read += num_blocks * blocklen; 1270 stat->num_read_ops++; 1271 stat->read_latency_ticks += tsc_diff; 1272 if (stat->max_read_latency_ticks < tsc_diff) { 1273 stat->max_read_latency_ticks = tsc_diff; 1274 } 1275 if (stat->min_read_latency_ticks > tsc_diff) { 1276 stat->min_read_latency_ticks = tsc_diff; 1277 } 1278 } else { 1279 stat->bytes_written += num_blocks * blocklen; 1280 stat->num_write_ops++; 1281 stat->write_latency_ticks += tsc_diff; 1282 if (stat->max_write_latency_ticks < tsc_diff) { 1283 stat->max_write_latency_ticks = tsc_diff; 1284 } 1285 if (stat->min_write_latency_ticks > tsc_diff) { 1286 stat->min_write_latency_ticks = tsc_diff; 1287 } 1288 } 1289 break; 1290 case SPDK_BDEV_IO_TYPE_COPY: 1291 stat->bytes_copied += num_blocks * blocklen; 1292 stat->num_copy_ops++; 1293 stat->copy_latency_ticks += tsc_diff; 1294 if (stat->max_copy_latency_ticks < tsc_diff) { 1295 stat->max_copy_latency_ticks = tsc_diff; 1296 } 1297 if (stat->min_copy_latency_ticks > tsc_diff) { 1298 stat->min_copy_latency_ticks = tsc_diff; 1299 } 1300 break; 1301 default: 1302 break; 1303 } 1304 } 1305 1306 static bool 1307 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1308 const struct spdk_nvme_cpl *cpl, 1309 struct nvme_bdev_channel *nbdev_ch, 1310 uint64_t *_delay_ms) 1311 { 1312 struct nvme_io_path *io_path = bio->io_path; 1313 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1314 const struct spdk_nvme_ctrlr_data *cdata; 1315 1316 if (spdk_nvme_cpl_is_path_error(cpl) || 1317 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1318 !nvme_io_path_is_available(io_path) || 1319 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1320 bdev_nvme_clear_current_io_path(nbdev_ch); 1321 bio->io_path = NULL; 1322 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1323 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1324 io_path->nvme_ns->ana_state_updating = true; 1325 } 1326 } 1327 if (!any_io_path_may_become_available(nbdev_ch)) { 1328 return false; 1329 } 1330 *_delay_ms = 0; 1331 } else { 1332 bio->retry_count++; 1333 1334 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1335 1336 if (cpl->status.crd != 0) { 1337 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1338 } else { 1339 *_delay_ms = 0; 1340 } 1341 } 1342 1343 return true; 1344 } 1345 1346 static inline void 1347 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1348 const struct spdk_nvme_cpl *cpl) 1349 { 1350 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1351 struct nvme_bdev_channel *nbdev_ch; 1352 uint64_t delay_ms; 1353 1354 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1355 1356 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1357 bdev_nvme_update_io_path_stat(bio); 1358 goto complete; 1359 } 1360 1361 /* Update error counts before deciding if retry is needed. 1362 * Hence, error counts may be more than the number of I/O errors. 
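	 * (A retried I/O that eventually succeeds still increments the error
	 * counters once for each failed attempt.)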
1363 */ 1364 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1365 1366 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1367 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1368 goto complete; 1369 } 1370 1371 /* At this point we don't know whether the sequence was successfully executed or not, so we 1372 * cannot retry the IO */ 1373 if (bdev_io->u.bdev.accel_sequence != NULL) { 1374 goto complete; 1375 } 1376 1377 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1378 1379 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1380 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1381 return; 1382 } 1383 1384 complete: 1385 bio->retry_count = 0; 1386 bio->submit_tsc = 0; 1387 bdev_io->u.bdev.accel_sequence = NULL; 1388 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1389 } 1390 1391 static inline void 1392 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1393 { 1394 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1395 struct nvme_bdev_channel *nbdev_ch; 1396 enum spdk_bdev_io_status io_status; 1397 1398 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1399 1400 switch (rc) { 1401 case 0: 1402 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1403 break; 1404 case -ENOMEM: 1405 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1406 break; 1407 case -ENXIO: 1408 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1409 1410 bdev_nvme_clear_current_io_path(nbdev_ch); 1411 bio->io_path = NULL; 1412 1413 if (any_io_path_may_become_available(nbdev_ch)) { 1414 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1415 return; 1416 } 1417 1418 /* fallthrough */ 1419 default: 1420 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1421 bdev_io->u.bdev.accel_sequence = NULL; 1422 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1423 break; 1424 } 1425 1426 bio->retry_count = 0; 1427 bio->submit_tsc = 0; 1428 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1429 } 1430 1431 static inline void 1432 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1433 { 1434 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1435 enum spdk_bdev_io_status io_status; 1436 1437 switch (rc) { 1438 case 0: 1439 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1440 break; 1441 case -ENOMEM: 1442 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1443 break; 1444 case -ENXIO: 1445 /* fallthrough */ 1446 default: 1447 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1448 break; 1449 } 1450 1451 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1452 } 1453 1454 static void 1455 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1456 { 1457 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1458 1459 pthread_mutex_lock(&nvme_ctrlr->mutex); 1460 1461 assert(nvme_ctrlr->io_path_cache_clearing == true); 1462 nvme_ctrlr->io_path_cache_clearing = false; 1463 1464 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1465 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1466 return; 1467 } 1468 1469 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1470 1471 nvme_ctrlr_unregister(nvme_ctrlr); 1472 } 1473 1474 static void 1475 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1476 { 1477 struct nvme_io_path *io_path; 1478 1479 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1480 if (io_path->nbdev_ch == NULL) { 1481 continue; 1482 } 1483 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1484 } 1485 } 1486 1487 static void 1488 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter 
*i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p failed to connect. Aborting the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* The qpair was disconnected unexpectedly. Reset the controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. Resetting controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr, false);
		}
	} else {
		/* In this case, the ctrlr_channel was already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed.
delete nvme_qpair.\n", qpair); 1582 nvme_qpair_delete(nvme_qpair); 1583 } 1584 } 1585 1586 static void 1587 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1588 { 1589 struct nvme_qpair *nvme_qpair; 1590 1591 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1592 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1593 continue; 1594 } 1595 1596 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1597 SPDK_NVME_QPAIR_FAILURE_NONE) { 1598 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1599 } 1600 } 1601 } 1602 1603 static int 1604 bdev_nvme_poll(void *arg) 1605 { 1606 struct nvme_poll_group *group = arg; 1607 int64_t num_completions; 1608 1609 if (group->collect_spin_stat && group->start_ticks == 0) { 1610 group->start_ticks = spdk_get_ticks(); 1611 } 1612 1613 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1614 bdev_nvme_disconnected_qpair_cb); 1615 if (group->collect_spin_stat) { 1616 if (num_completions > 0) { 1617 if (group->end_ticks != 0) { 1618 group->spin_ticks += (group->end_ticks - group->start_ticks); 1619 group->end_ticks = 0; 1620 } 1621 group->start_ticks = 0; 1622 } else { 1623 group->end_ticks = spdk_get_ticks(); 1624 } 1625 } 1626 1627 if (spdk_unlikely(num_completions < 0)) { 1628 bdev_nvme_check_io_qpairs(group); 1629 } 1630 1631 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1632 } 1633 1634 static int bdev_nvme_poll_adminq(void *arg); 1635 1636 static void 1637 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1638 { 1639 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1640 1641 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1642 nvme_ctrlr, new_period_us); 1643 } 1644 1645 static int 1646 bdev_nvme_poll_adminq(void *arg) 1647 { 1648 int32_t rc; 1649 struct nvme_ctrlr *nvme_ctrlr = arg; 1650 nvme_ctrlr_disconnected_cb disconnected_cb; 1651 1652 assert(nvme_ctrlr != NULL); 1653 1654 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1655 if (rc < 0) { 1656 disconnected_cb = nvme_ctrlr->disconnected_cb; 1657 nvme_ctrlr->disconnected_cb = NULL; 1658 1659 if (disconnected_cb != NULL) { 1660 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1661 g_opts.nvme_adminq_poll_period_us); 1662 disconnected_cb(nvme_ctrlr); 1663 } else { 1664 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 1665 } 1666 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1667 SPDK_NVME_QPAIR_FAILURE_NONE) { 1668 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1669 } 1670 1671 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1672 } 1673 1674 static void 1675 nvme_bdev_free(void *io_device) 1676 { 1677 struct nvme_bdev *nvme_disk = io_device; 1678 1679 pthread_mutex_destroy(&nvme_disk->mutex); 1680 free(nvme_disk->disk.name); 1681 free(nvme_disk->err_stat); 1682 free(nvme_disk); 1683 } 1684 1685 static int 1686 bdev_nvme_destruct(void *ctx) 1687 { 1688 struct nvme_bdev *nvme_disk = ctx; 1689 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1690 1691 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1692 1693 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1694 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1695 1696 nvme_ns->bdev = NULL; 1697 1698 assert(nvme_ns->id > 0); 1699 1700 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1701 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1702 1703 nvme_ctrlr_release(nvme_ns->ctrlr); 1704 nvme_ns_free(nvme_ns); 1705 } else { 1706 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1707 } 1708 } 1709 1710 pthread_mutex_lock(&g_bdev_nvme_mutex); 1711 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1712 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1713 1714 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1715 1716 return 0; 1717 } 1718 1719 static int 1720 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1721 { 1722 struct nvme_ctrlr *nvme_ctrlr; 1723 struct spdk_nvme_io_qpair_opts opts; 1724 struct spdk_nvme_qpair *qpair; 1725 int rc; 1726 1727 nvme_ctrlr = nvme_qpair->ctrlr; 1728 1729 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1730 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1731 opts.create_only = true; 1732 opts.async_mode = true; 1733 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1734 g_opts.io_queue_requests = opts.io_queue_requests; 1735 1736 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1737 if (qpair == NULL) { 1738 return -1; 1739 } 1740 1741 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1742 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1743 1744 assert(nvme_qpair->group != NULL); 1745 1746 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1747 if (rc != 0) { 1748 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1749 goto err; 1750 } 1751 1752 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1753 if (rc != 0) { 1754 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1755 goto err; 1756 } 1757 1758 nvme_qpair->qpair = qpair; 1759 1760 if (!g_opts.disable_auto_failback) { 1761 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1762 } 1763 1764 return 0; 1765 1766 err: 1767 spdk_nvme_ctrlr_free_io_qpair(qpair); 1768 1769 return rc; 1770 } 1771 1772 static void 1773 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1774 { 1775 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1776 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1777 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1778 struct spdk_bdev_io *bdev_io; 1779 1780 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1781 status = SPDK_BDEV_IO_STATUS_FAILED; 1782 } 1783 1784 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1785 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1786 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1787 __bdev_nvme_io_complete(bdev_io, status, NULL); 1788 } 1789 1790 
	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then sets the next trid as the active trid within the controller, if one exists.
 *
 * A true return value requests that the caller disconnect the current trid now
 * in order to try connecting to the next trid.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. A trid is considered failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within this controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/* Shuffle the old trid to the end of the list and use the new one.
		 * This allows round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() was just called, or the next trid has not
		 * failed or been used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff has passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds.
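	 * Return false so that the caller does not disconnect the current trid yet.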
*/ 1860 return false; 1861 } 1862 1863 static bool 1864 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1865 { 1866 int32_t elapsed; 1867 1868 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1869 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1870 return false; 1871 } 1872 1873 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1874 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1875 return true; 1876 } else { 1877 return false; 1878 } 1879 } 1880 1881 static bool 1882 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1883 { 1884 uint32_t elapsed; 1885 1886 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1887 return false; 1888 } 1889 1890 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1891 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1892 return true; 1893 } else { 1894 return false; 1895 } 1896 } 1897 1898 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1899 1900 static void 1901 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1902 { 1903 int rc; 1904 1905 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1906 if (rc != 0) { 1907 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1908 * fail the reset sequence immediately. 1909 */ 1910 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1911 return; 1912 } 1913 1914 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1915 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1916 */ 1917 assert(nvme_ctrlr->disconnected_cb == NULL); 1918 nvme_ctrlr->disconnected_cb = cb_fn; 1919 1920 /* During disconnection, reduce the period to poll adminq more often. */ 1921 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1922 } 1923 1924 enum bdev_nvme_op_after_reset { 1925 OP_NONE, 1926 OP_COMPLETE_PENDING_DESTRUCT, 1927 OP_DESTRUCT, 1928 OP_DELAYED_RECONNECT, 1929 OP_FAILOVER, 1930 }; 1931 1932 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1933 1934 static _bdev_nvme_op_after_reset 1935 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1936 { 1937 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1938 /* Complete pending destruct after reset completes. 
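		 * A destruct request arrived while the reset was in progress and was
		 * deferred; now that resetting has finished, the ctrlr can be unregistered.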
*/ 1939 return OP_COMPLETE_PENDING_DESTRUCT; 1940 } else if (nvme_ctrlr->pending_failover) { 1941 nvme_ctrlr->pending_failover = false; 1942 nvme_ctrlr->reset_start_tsc = 0; 1943 return OP_FAILOVER; 1944 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1945 nvme_ctrlr->reset_start_tsc = 0; 1946 return OP_NONE; 1947 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1948 return OP_DESTRUCT; 1949 } else { 1950 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1951 nvme_ctrlr->fast_io_fail_timedout = true; 1952 } 1953 return OP_DELAYED_RECONNECT; 1954 } 1955 } 1956 1957 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1958 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1959 1960 static int 1961 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1962 { 1963 struct nvme_ctrlr *nvme_ctrlr = ctx; 1964 1965 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1966 pthread_mutex_lock(&nvme_ctrlr->mutex); 1967 1968 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1969 1970 if (!nvme_ctrlr->reconnect_is_delayed) { 1971 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1972 return SPDK_POLLER_BUSY; 1973 } 1974 1975 nvme_ctrlr->reconnect_is_delayed = false; 1976 1977 if (nvme_ctrlr->destruct) { 1978 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1979 return SPDK_POLLER_BUSY; 1980 } 1981 1982 assert(nvme_ctrlr->resetting == false); 1983 nvme_ctrlr->resetting = true; 1984 1985 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1986 1987 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1988 1989 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1990 return SPDK_POLLER_BUSY; 1991 } 1992 1993 static void 1994 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1995 { 1996 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1997 1998 assert(nvme_ctrlr->reconnect_is_delayed == false); 1999 nvme_ctrlr->reconnect_is_delayed = true; 2000 2001 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2002 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2003 nvme_ctrlr, 2004 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2005 } 2006 2007 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2008 2009 static void 2010 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2011 { 2012 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2013 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2014 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2015 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2016 enum bdev_nvme_op_after_reset op_after_reset; 2017 2018 assert(nvme_ctrlr->thread == spdk_get_thread()); 2019 2020 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2021 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2022 2023 if (!success) { 2024 SPDK_ERRLOG("Resetting controller failed.\n"); 2025 } else { 2026 SPDK_NOTICELOG("Resetting controller successful.\n"); 2027 } 2028 2029 pthread_mutex_lock(&nvme_ctrlr->mutex); 2030 nvme_ctrlr->resetting = false; 2031 nvme_ctrlr->dont_retry = false; 2032 nvme_ctrlr->in_failover = false; 2033 2034 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2035 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2036 2037 if (ctrlr_op_cb_fn) { 2038 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2039 } 2040 2041 switch (op_after_reset) { 2042 case OP_COMPLETE_PENDING_DESTRUCT: 2043 nvme_ctrlr_unregister(nvme_ctrlr); 2044 break; 2045 case OP_DESTRUCT: 2046 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2047 remove_discovery_entry(nvme_ctrlr); 2048 break; 2049 case OP_DELAYED_RECONNECT: 2050 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2051 break; 2052 case OP_FAILOVER: 2053 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 2054 break; 2055 default: 2056 break; 2057 } 2058 } 2059 2060 static void 2061 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2062 { 2063 pthread_mutex_lock(&nvme_ctrlr->mutex); 2064 if (!success) { 2065 /* Connecting the active trid failed. Set the next alternate trid to the 2066 * active trid if it exists. 2067 */ 2068 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2069 /* The next alternate trid exists and is ready to try. Try it now. */ 2070 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2071 2072 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2073 return; 2074 } 2075 2076 /* We came here if there is no alternate trid or if the next trid exists but 2077 * is not ready to try. We will try the active trid after reconnect_delay_sec 2078 * seconds if it is non-zero or at the next reset call otherwise. 2079 */ 2080 } else { 2081 /* Connecting the active trid succeeded. Clear the last failed time because it 2082 * means the trid is failed if its last failed time is non-zero. 2083 */ 2084 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2085 } 2086 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2087 2088 /* Make sure we clear any pending resets before returning. */ 2089 spdk_for_each_channel(nvme_ctrlr, 2090 bdev_nvme_complete_pending_resets, 2091 success ? NULL : (void *)0x1, 2092 _bdev_nvme_reset_ctrlr_complete); 2093 } 2094 2095 static void 2096 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2097 { 2098 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2099 2100 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2101 } 2102 2103 static void 2104 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2105 { 2106 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2107 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2108 struct nvme_qpair *nvme_qpair; 2109 2110 nvme_qpair = ctrlr_ch->qpair; 2111 assert(nvme_qpair != NULL); 2112 2113 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2114 2115 if (nvme_qpair->qpair != NULL) { 2116 if (nvme_qpair->ctrlr->dont_retry) { 2117 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2118 } 2119 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2120 2121 /* The current full reset sequence will move to the next 2122 * ctrlr_channel after the qpair is actually disconnected. 2123 */ 2124 assert(ctrlr_ch->reset_iter == NULL); 2125 ctrlr_ch->reset_iter = i; 2126 } else { 2127 spdk_for_each_channel_continue(i, 0); 2128 } 2129 } 2130 2131 static void 2132 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2133 { 2134 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2135 2136 if (status == 0) { 2137 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2138 } else { 2139 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
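 *
 * Illustrative sketch only, generic pattern rather than code from this module:
 * the reset path repeatedly relies on spdk_for_each_channel(), where a per-channel
 * callback runs on each channel's owning thread and a single completion callback
 * runs once every channel has been visited. The callback names below are
 * placeholders:
 *
 *   spdk_for_each_channel(io_device,      // iterate all channels of this io_device
 *                         per_channel_cb, // hypothetical per-channel callback
 *                         cb_ctx,         // retrieved via spdk_io_channel_iter_get_ctx()
 *                         done_cb);       // hypothetical completion callback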
*/ 2140 spdk_for_each_channel(nvme_ctrlr, 2141 bdev_nvme_reset_destroy_qpair, 2142 NULL, 2143 bdev_nvme_reset_create_qpairs_failed); 2144 } 2145 } 2146 2147 static int 2148 bdev_nvme_reset_check_qpair_connected(void *ctx) 2149 { 2150 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2151 2152 if (ctrlr_ch->reset_iter == NULL) { 2153 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2154 assert(ctrlr_ch->connect_poller == NULL); 2155 assert(ctrlr_ch->qpair->qpair == NULL); 2156 return SPDK_POLLER_BUSY; 2157 } 2158 2159 assert(ctrlr_ch->qpair->qpair != NULL); 2160 2161 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2162 return SPDK_POLLER_BUSY; 2163 } 2164 2165 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2166 2167 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2168 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2169 ctrlr_ch->reset_iter = NULL; 2170 2171 return SPDK_POLLER_BUSY; 2172 } 2173 2174 static void 2175 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2176 { 2177 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2178 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2179 int rc; 2180 2181 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2182 if (rc == 0) { 2183 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2184 ctrlr_ch, 0); 2185 2186 /* The current full reset sequence will move to the next 2187 * ctrlr_channel after the qpair is actually connected. 2188 */ 2189 assert(ctrlr_ch->reset_iter == NULL); 2190 ctrlr_ch->reset_iter = i; 2191 } else { 2192 spdk_for_each_channel_continue(i, rc); 2193 } 2194 } 2195 2196 static int 2197 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2198 { 2199 struct nvme_ctrlr *nvme_ctrlr = arg; 2200 int rc = -ETIMEDOUT; 2201 2202 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2203 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2204 if (rc == -EAGAIN) { 2205 return SPDK_POLLER_BUSY; 2206 } 2207 } 2208 2209 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2210 if (rc == 0) { 2211 /* Recreate all of the I/O queue pairs */ 2212 spdk_for_each_channel(nvme_ctrlr, 2213 bdev_nvme_reset_create_qpair, 2214 NULL, 2215 bdev_nvme_reset_create_qpairs_done); 2216 } else { 2217 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2218 } 2219 return SPDK_POLLER_BUSY; 2220 } 2221 2222 static void 2223 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2224 { 2225 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2226 2227 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2228 assert(nvme_ctrlr->reset_detach_poller == NULL); 2229 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2230 nvme_ctrlr, 0); 2231 } 2232 2233 static void 2234 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2235 { 2236 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2237 2238 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2239 assert(status == 0); 2240 2241 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2242 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2243 } else { 2244 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2245 } 2246 } 2247 2248 static void 2249 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2250 { 2251 spdk_for_each_channel(nvme_ctrlr, 2252 bdev_nvme_reset_destroy_qpair, 2253 NULL, 2254 bdev_nvme_reset_destroy_qpair_done); 
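/* Illustrative sketch only, not part of this module: the reconnect helpers above
 * follow SPDK's asynchronous reconnect pattern of starting the attempt and then
 * polling until it stops returning -EAGAIN; in this module the loop is driven by a
 * registered poller instead of a busy loop:
 *
 *   spdk_nvme_ctrlr_reconnect_async(ctrlr);
 *   do {
 *           rc = spdk_nvme_ctrlr_reconnect_poll_async(ctrlr);
 *   } while (rc == -EAGAIN);
 */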
2255 } 2256 2257 static void 2258 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2259 { 2260 struct nvme_ctrlr *nvme_ctrlr = ctx; 2261 2262 assert(nvme_ctrlr->resetting == true); 2263 assert(nvme_ctrlr->thread == spdk_get_thread()); 2264 2265 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2266 2267 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2268 2269 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2270 } 2271 2272 static void 2273 _bdev_nvme_reset_ctrlr(void *ctx) 2274 { 2275 struct nvme_ctrlr *nvme_ctrlr = ctx; 2276 2277 assert(nvme_ctrlr->resetting == true); 2278 assert(nvme_ctrlr->thread == spdk_get_thread()); 2279 2280 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2281 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2282 } else { 2283 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2284 } 2285 } 2286 2287 static int 2288 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2289 { 2290 spdk_msg_fn msg_fn; 2291 2292 pthread_mutex_lock(&nvme_ctrlr->mutex); 2293 if (nvme_ctrlr->destruct) { 2294 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2295 return -ENXIO; 2296 } 2297 2298 if (nvme_ctrlr->resetting) { 2299 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2300 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2301 return -EBUSY; 2302 } 2303 2304 if (nvme_ctrlr->disabled) { 2305 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2306 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2307 return -EALREADY; 2308 } 2309 2310 nvme_ctrlr->resetting = true; 2311 nvme_ctrlr->dont_retry = true; 2312 2313 if (nvme_ctrlr->reconnect_is_delayed) { 2314 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2315 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2316 nvme_ctrlr->reconnect_is_delayed = false; 2317 } else { 2318 msg_fn = _bdev_nvme_reset_ctrlr; 2319 assert(nvme_ctrlr->reset_start_tsc == 0); 2320 } 2321 2322 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2323 2324 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2325 2326 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2327 return 0; 2328 } 2329 2330 static int 2331 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2332 { 2333 pthread_mutex_lock(&nvme_ctrlr->mutex); 2334 if (nvme_ctrlr->destruct) { 2335 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2336 return -ENXIO; 2337 } 2338 2339 if (nvme_ctrlr->resetting) { 2340 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2341 return -EBUSY; 2342 } 2343 2344 if (!nvme_ctrlr->disabled) { 2345 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2346 return -EALREADY; 2347 } 2348 2349 nvme_ctrlr->disabled = false; 2350 nvme_ctrlr->resetting = true; 2351 2352 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2353 2354 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2355 2356 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2357 return 0; 2358 } 2359 2360 static void 2361 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2362 { 2363 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2364 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2365 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2366 enum bdev_nvme_op_after_reset op_after_disable; 2367 2368 assert(nvme_ctrlr->thread == spdk_get_thread()); 2369 2370 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2371 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2372 2373 pthread_mutex_lock(&nvme_ctrlr->mutex); 2374 2375 nvme_ctrlr->resetting = false; 2376 nvme_ctrlr->dont_retry = false; 2377 2378 op_after_disable = 
bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2379 2380 nvme_ctrlr->disabled = true; 2381 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2382 2383 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2384 2385 if (ctrlr_op_cb_fn) { 2386 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2387 } 2388 2389 switch (op_after_disable) { 2390 case OP_COMPLETE_PENDING_DESTRUCT: 2391 nvme_ctrlr_unregister(nvme_ctrlr); 2392 break; 2393 default: 2394 break; 2395 } 2396 2397 } 2398 2399 static void 2400 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2401 { 2402 /* Make sure we clear any pending resets before returning. */ 2403 spdk_for_each_channel(nvme_ctrlr, 2404 bdev_nvme_complete_pending_resets, 2405 NULL, 2406 _bdev_nvme_disable_ctrlr_complete); 2407 } 2408 2409 static void 2410 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2411 { 2412 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2413 2414 assert(status == 0); 2415 2416 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2417 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2418 } else { 2419 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2420 } 2421 } 2422 2423 static void 2424 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2425 { 2426 spdk_for_each_channel(nvme_ctrlr, 2427 bdev_nvme_reset_destroy_qpair, 2428 NULL, 2429 bdev_nvme_disable_destroy_qpairs_done); 2430 } 2431 2432 static void 2433 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2434 { 2435 struct nvme_ctrlr *nvme_ctrlr = ctx; 2436 2437 assert(nvme_ctrlr->resetting == true); 2438 assert(nvme_ctrlr->thread == spdk_get_thread()); 2439 2440 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2441 2442 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2443 } 2444 2445 static void 2446 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2447 { 2448 struct nvme_ctrlr *nvme_ctrlr = ctx; 2449 2450 assert(nvme_ctrlr->resetting == true); 2451 assert(nvme_ctrlr->thread == spdk_get_thread()); 2452 2453 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2454 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2455 } else { 2456 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2457 } 2458 } 2459 2460 static int 2461 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2462 { 2463 spdk_msg_fn msg_fn; 2464 2465 pthread_mutex_lock(&nvme_ctrlr->mutex); 2466 if (nvme_ctrlr->destruct) { 2467 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2468 return -ENXIO; 2469 } 2470 2471 if (nvme_ctrlr->resetting) { 2472 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2473 return -EBUSY; 2474 } 2475 2476 if (nvme_ctrlr->disabled) { 2477 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2478 return -EALREADY; 2479 } 2480 2481 nvme_ctrlr->resetting = true; 2482 nvme_ctrlr->dont_retry = true; 2483 2484 if (nvme_ctrlr->reconnect_is_delayed) { 2485 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2486 nvme_ctrlr->reconnect_is_delayed = false; 2487 } else { 2488 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2489 } 2490 2491 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2492 2493 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2494 2495 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2496 return 0; 2497 } 2498 2499 static int 2500 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2501 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2502 { 2503 int rc; 2504 2505 switch (op) { 2506 case NVME_CTRLR_OP_RESET: 2507 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2508 break; 2509 case 
NVME_CTRLR_OP_ENABLE: 2510 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2511 break; 2512 case NVME_CTRLR_OP_DISABLE: 2513 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2514 break; 2515 default: 2516 rc = -EINVAL; 2517 break; 2518 } 2519 2520 if (rc == 0) { 2521 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2522 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2523 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2524 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2525 } 2526 return rc; 2527 } 2528 2529 struct nvme_ctrlr_op_rpc_ctx { 2530 struct nvme_ctrlr *nvme_ctrlr; 2531 struct spdk_thread *orig_thread; 2532 enum nvme_ctrlr_op op; 2533 int rc; 2534 bdev_nvme_ctrlr_op_cb cb_fn; 2535 void *cb_arg; 2536 }; 2537 2538 static void 2539 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2540 { 2541 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2542 2543 assert(ctx != NULL); 2544 assert(ctx->cb_fn != NULL); 2545 2546 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2547 2548 free(ctx); 2549 } 2550 2551 static void 2552 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2553 { 2554 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2555 2556 ctx->rc = rc; 2557 2558 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2559 } 2560 2561 void 2562 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2563 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2564 { 2565 struct nvme_ctrlr_op_rpc_ctx *ctx; 2566 int rc; 2567 2568 assert(cb_fn != NULL); 2569 2570 ctx = calloc(1, sizeof(*ctx)); 2571 if (ctx == NULL) { 2572 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2573 cb_fn(cb_arg, -ENOMEM); 2574 return; 2575 } 2576 2577 ctx->orig_thread = spdk_get_thread(); 2578 ctx->cb_fn = cb_fn; 2579 ctx->cb_arg = cb_arg; 2580 2581 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2582 if (rc == 0) { 2583 return; 2584 } else if (rc == -EALREADY) { 2585 rc = 0; 2586 } 2587 2588 nvme_ctrlr_op_rpc_complete(ctx, rc); 2589 } 2590 2591 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2592 2593 static void 2594 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2595 { 2596 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2597 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2598 int rc; 2599 2600 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2601 ctx->nvme_ctrlr = NULL; 2602 2603 if (ctx->rc != 0) { 2604 goto complete; 2605 } 2606 2607 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2608 if (next_nvme_ctrlr == NULL) { 2609 goto complete; 2610 } 2611 2612 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2613 if (rc == 0) { 2614 ctx->nvme_ctrlr = next_nvme_ctrlr; 2615 return; 2616 } else if (rc == -EALREADY) { 2617 ctx->nvme_ctrlr = next_nvme_ctrlr; 2618 rc = 0; 2619 } 2620 2621 ctx->rc = rc; 2622 2623 complete: 2624 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2625 free(ctx); 2626 } 2627 2628 static void 2629 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2630 { 2631 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2632 2633 ctx->rc = rc; 2634 2635 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2636 } 2637 2638 void 2639 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2640 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2641 { 2642 struct nvme_ctrlr_op_rpc_ctx *ctx; 2643 struct nvme_ctrlr *nvme_ctrlr; 2644 int rc; 2645 2646 assert(cb_fn != NULL); 2647 2648 ctx = calloc(1, sizeof(*ctx)); 2649 if (ctx == NULL) { 2650 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2651 cb_fn(cb_arg, -ENOMEM); 2652 return; 2653 } 2654 2655 
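/* Illustrative sketch only, with a hypothetical caller that is not part of this
 * module: RPC handlers drive nvme_ctrlr_op_rpc()/nvme_bdev_ctrlr_op_rpc() with a
 * completion callback that finishes the JSON-RPC request, e.g.
 *
 *   static void
 *   example_reset_done(void *cb_arg, int rc)
 *   {
 *           struct spdk_jsonrpc_request *request = cb_arg;
 *
 *           if (rc == 0) {
 *                   spdk_jsonrpc_send_bool_response(request, true);
 *           } else {
 *                   spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
 *           }
 *   }
 *
 *   nvme_ctrlr_op_rpc(nvme_ctrlr, NVME_CTRLR_OP_RESET, example_reset_done, request);
 */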
ctx->orig_thread = spdk_get_thread(); 2656 ctx->op = op; 2657 ctx->cb_fn = cb_fn; 2658 ctx->cb_arg = cb_arg; 2659 2660 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2661 assert(nvme_ctrlr != NULL); 2662 2663 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2664 if (rc == 0) { 2665 ctx->nvme_ctrlr = nvme_ctrlr; 2666 return; 2667 } else if (rc == -EALREADY) { 2668 ctx->nvme_ctrlr = nvme_ctrlr; 2669 rc = 0; 2670 } 2671 2672 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2673 } 2674 2675 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2676 2677 static void 2678 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2679 { 2680 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2681 enum spdk_bdev_io_status io_status; 2682 2683 if (bio->cpl.cdw0 == 0) { 2684 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2685 } else { 2686 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2687 } 2688 2689 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2690 } 2691 2692 static void 2693 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2694 { 2695 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2696 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2697 2698 bdev_nvme_abort_retry_ios(nbdev_ch); 2699 2700 spdk_for_each_channel_continue(i, 0); 2701 } 2702 2703 static void 2704 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2705 { 2706 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2707 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2708 2709 /* Abort all queued I/Os for retry. */ 2710 spdk_for_each_channel(nbdev, 2711 bdev_nvme_abort_bdev_channel, 2712 bio, 2713 _bdev_nvme_reset_io_complete); 2714 } 2715 2716 static void 2717 _bdev_nvme_reset_io_continue(void *ctx) 2718 { 2719 struct nvme_bdev_io *bio = ctx; 2720 struct nvme_io_path *prev_io_path, *next_io_path; 2721 int rc; 2722 2723 prev_io_path = bio->io_path; 2724 bio->io_path = NULL; 2725 2726 if (bio->cpl.cdw0 != 0) { 2727 goto complete; 2728 } 2729 2730 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2731 if (next_io_path == NULL) { 2732 goto complete; 2733 } 2734 2735 rc = _bdev_nvme_reset_io(next_io_path, bio); 2736 if (rc == 0) { 2737 return; 2738 } 2739 2740 bio->cpl.cdw0 = 1; 2741 2742 complete: 2743 bdev_nvme_reset_io_complete(bio); 2744 } 2745 2746 static void 2747 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2748 { 2749 struct nvme_bdev_io *bio = cb_arg; 2750 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2751 2752 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2753 2754 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2755 } 2756 2757 static int 2758 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2759 { 2760 struct nvme_ctrlr_channel *ctrlr_ch; 2761 struct spdk_bdev_io *bdev_io; 2762 int rc; 2763 2764 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2765 bdev_nvme_reset_io_continue, bio); 2766 if (rc == 0) { 2767 assert(bio->io_path == NULL); 2768 bio->io_path = io_path; 2769 } else if (rc == -EBUSY) { 2770 ctrlr_ch = io_path->qpair->ctrlr_ch; 2771 assert(ctrlr_ch != NULL); 2772 /* 2773 * Reset call is queued only if it is from the app framework. This is on purpose so that 2774 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2775 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
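 *
 * Illustrative note, not part of this module's logic: the deferred reset is simply
 * parked on the per-channel pending_resets list (see the insertion just below) and
 * is picked up by bdev_nvme_complete_pending_resets() once the in-flight reset
 * sequence finishes:
 *
 *   TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);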
2776 */ 2777 bdev_io = spdk_bdev_io_from_ctx(bio); 2778 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2779 rc = 0; 2780 } 2781 2782 return rc; 2783 } 2784 2785 static void 2786 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2787 { 2788 struct nvme_io_path *io_path; 2789 int rc; 2790 2791 bio->cpl.cdw0 = 0; 2792 2793 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2794 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2795 assert(io_path != NULL); 2796 2797 rc = _bdev_nvme_reset_io(io_path, bio); 2798 if (rc != 0) { 2799 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2800 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2801 } 2802 } 2803 2804 static int 2805 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2806 { 2807 if (nvme_ctrlr->destruct) { 2808 /* Don't bother resetting if the controller is in the process of being destructed. */ 2809 return -ENXIO; 2810 } 2811 2812 if (nvme_ctrlr->resetting) { 2813 if (!nvme_ctrlr->in_failover) { 2814 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2815 2816 /* Defer failover until reset completes. */ 2817 nvme_ctrlr->pending_failover = true; 2818 return -EINPROGRESS; 2819 } else { 2820 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2821 return -EBUSY; 2822 } 2823 } 2824 2825 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2826 2827 if (nvme_ctrlr->reconnect_is_delayed) { 2828 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2829 2830 /* We rely on the next reconnect for the failover. */ 2831 return -EALREADY; 2832 } 2833 2834 if (nvme_ctrlr->disabled) { 2835 SPDK_NOTICELOG("Controller is disabled.\n"); 2836 2837 /* We rely on the enablement for the failover. 
*/ 2838 return -EALREADY; 2839 } 2840 2841 nvme_ctrlr->resetting = true; 2842 nvme_ctrlr->in_failover = true; 2843 2844 assert(nvme_ctrlr->reset_start_tsc == 0); 2845 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2846 2847 return 0; 2848 } 2849 2850 static int 2851 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2852 { 2853 int rc; 2854 2855 pthread_mutex_lock(&nvme_ctrlr->mutex); 2856 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, remove); 2857 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2858 2859 if (rc == 0) { 2860 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2861 } else if (rc == -EALREADY) { 2862 rc = 0; 2863 } 2864 2865 return rc; 2866 } 2867 2868 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2869 uint64_t num_blocks); 2870 2871 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2872 uint64_t num_blocks); 2873 2874 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2875 uint64_t src_offset_blocks, 2876 uint64_t num_blocks); 2877 2878 static void 2879 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2880 bool success) 2881 { 2882 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2883 struct spdk_bdev *bdev = bdev_io->bdev; 2884 int ret; 2885 2886 if (!success) { 2887 ret = -EINVAL; 2888 goto exit; 2889 } 2890 2891 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2892 ret = -ENXIO; 2893 goto exit; 2894 } 2895 2896 ret = bdev_nvme_readv(bio, 2897 bdev_io->u.bdev.iovs, 2898 bdev_io->u.bdev.iovcnt, 2899 bdev_io->u.bdev.md_buf, 2900 bdev_io->u.bdev.num_blocks, 2901 bdev_io->u.bdev.offset_blocks, 2902 bdev->dif_check_flags, 2903 bdev_io->u.bdev.memory_domain, 2904 bdev_io->u.bdev.memory_domain_ctx, 2905 bdev_io->u.bdev.accel_sequence); 2906 2907 exit: 2908 if (spdk_unlikely(ret != 0)) { 2909 bdev_nvme_io_complete(bio, ret); 2910 } 2911 } 2912 2913 static inline void 2914 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2915 { 2916 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2917 struct spdk_bdev *bdev = bdev_io->bdev; 2918 struct nvme_bdev_io *nbdev_io_to_abort; 2919 int rc = 0; 2920 2921 switch (bdev_io->type) { 2922 case SPDK_BDEV_IO_TYPE_READ: 2923 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2924 rc = bdev_nvme_readv(nbdev_io, 2925 bdev_io->u.bdev.iovs, 2926 bdev_io->u.bdev.iovcnt, 2927 bdev_io->u.bdev.md_buf, 2928 bdev_io->u.bdev.num_blocks, 2929 bdev_io->u.bdev.offset_blocks, 2930 bdev->dif_check_flags, 2931 bdev_io->u.bdev.memory_domain, 2932 bdev_io->u.bdev.memory_domain_ctx, 2933 bdev_io->u.bdev.accel_sequence); 2934 } else { 2935 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2936 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2937 rc = 0; 2938 } 2939 break; 2940 case SPDK_BDEV_IO_TYPE_WRITE: 2941 rc = bdev_nvme_writev(nbdev_io, 2942 bdev_io->u.bdev.iovs, 2943 bdev_io->u.bdev.iovcnt, 2944 bdev_io->u.bdev.md_buf, 2945 bdev_io->u.bdev.num_blocks, 2946 bdev_io->u.bdev.offset_blocks, 2947 bdev->dif_check_flags, 2948 bdev_io->u.bdev.memory_domain, 2949 bdev_io->u.bdev.memory_domain_ctx, 2950 bdev_io->u.bdev.accel_sequence); 2951 break; 2952 case SPDK_BDEV_IO_TYPE_COMPARE: 2953 rc = bdev_nvme_comparev(nbdev_io, 2954 bdev_io->u.bdev.iovs, 2955 bdev_io->u.bdev.iovcnt, 2956 bdev_io->u.bdev.md_buf, 2957 bdev_io->u.bdev.num_blocks, 2958 bdev_io->u.bdev.offset_blocks, 2959 
bdev->dif_check_flags); 2960 break; 2961 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2962 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2963 bdev_io->u.bdev.iovs, 2964 bdev_io->u.bdev.iovcnt, 2965 bdev_io->u.bdev.fused_iovs, 2966 bdev_io->u.bdev.fused_iovcnt, 2967 bdev_io->u.bdev.md_buf, 2968 bdev_io->u.bdev.num_blocks, 2969 bdev_io->u.bdev.offset_blocks, 2970 bdev->dif_check_flags); 2971 break; 2972 case SPDK_BDEV_IO_TYPE_UNMAP: 2973 rc = bdev_nvme_unmap(nbdev_io, 2974 bdev_io->u.bdev.offset_blocks, 2975 bdev_io->u.bdev.num_blocks); 2976 break; 2977 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2978 rc = bdev_nvme_write_zeroes(nbdev_io, 2979 bdev_io->u.bdev.offset_blocks, 2980 bdev_io->u.bdev.num_blocks); 2981 break; 2982 case SPDK_BDEV_IO_TYPE_RESET: 2983 nbdev_io->io_path = NULL; 2984 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2985 return; 2986 2987 case SPDK_BDEV_IO_TYPE_FLUSH: 2988 bdev_nvme_io_complete(nbdev_io, 0); 2989 return; 2990 2991 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2992 rc = bdev_nvme_zone_appendv(nbdev_io, 2993 bdev_io->u.bdev.iovs, 2994 bdev_io->u.bdev.iovcnt, 2995 bdev_io->u.bdev.md_buf, 2996 bdev_io->u.bdev.num_blocks, 2997 bdev_io->u.bdev.offset_blocks, 2998 bdev->dif_check_flags); 2999 break; 3000 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3001 rc = bdev_nvme_get_zone_info(nbdev_io, 3002 bdev_io->u.zone_mgmt.zone_id, 3003 bdev_io->u.zone_mgmt.num_zones, 3004 bdev_io->u.zone_mgmt.buf); 3005 break; 3006 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3007 rc = bdev_nvme_zone_management(nbdev_io, 3008 bdev_io->u.zone_mgmt.zone_id, 3009 bdev_io->u.zone_mgmt.zone_action); 3010 break; 3011 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3012 nbdev_io->io_path = NULL; 3013 bdev_nvme_admin_passthru(nbdev_ch, 3014 nbdev_io, 3015 &bdev_io->u.nvme_passthru.cmd, 3016 bdev_io->u.nvme_passthru.buf, 3017 bdev_io->u.nvme_passthru.nbytes); 3018 return; 3019 3020 case SPDK_BDEV_IO_TYPE_NVME_IO: 3021 rc = bdev_nvme_io_passthru(nbdev_io, 3022 &bdev_io->u.nvme_passthru.cmd, 3023 bdev_io->u.nvme_passthru.buf, 3024 bdev_io->u.nvme_passthru.nbytes); 3025 break; 3026 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3027 rc = bdev_nvme_io_passthru_md(nbdev_io, 3028 &bdev_io->u.nvme_passthru.cmd, 3029 bdev_io->u.nvme_passthru.buf, 3030 bdev_io->u.nvme_passthru.nbytes, 3031 bdev_io->u.nvme_passthru.md_buf, 3032 bdev_io->u.nvme_passthru.md_len); 3033 break; 3034 case SPDK_BDEV_IO_TYPE_ABORT: 3035 nbdev_io->io_path = NULL; 3036 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3037 bdev_nvme_abort(nbdev_ch, 3038 nbdev_io, 3039 nbdev_io_to_abort); 3040 return; 3041 3042 case SPDK_BDEV_IO_TYPE_COPY: 3043 rc = bdev_nvme_copy(nbdev_io, 3044 bdev_io->u.bdev.offset_blocks, 3045 bdev_io->u.bdev.copy.src_offset_blocks, 3046 bdev_io->u.bdev.num_blocks); 3047 break; 3048 default: 3049 rc = -EINVAL; 3050 break; 3051 } 3052 3053 if (spdk_unlikely(rc != 0)) { 3054 bdev_nvme_io_complete(nbdev_io, rc); 3055 } 3056 } 3057 3058 static void 3059 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3060 { 3061 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3062 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3063 3064 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3065 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3066 } else { 3067 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3068 * We need to update submit_tsc here. 
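 *
 * Illustrative sketch only, not part of this module: submit_tsc is a raw tick
 * count; a delta taken against it is converted to wall-clock time with the usual
 * SPDK tick arithmetic, e.g.
 *
 *   uint64_t delta_ticks = spdk_get_ticks() - nbdev_io->submit_tsc;
 *   uint64_t delta_us = delta_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();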
3069 */ 3070 nbdev_io->submit_tsc = spdk_get_ticks(); 3071 } 3072 3073 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3074 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3075 if (spdk_unlikely(!nbdev_io->io_path)) { 3076 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3077 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3078 return; 3079 } 3080 3081 /* Admin commands do not use the optimal I/O path. 3082 * Simply fall through even if it is not found. 3083 */ 3084 } 3085 3086 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3087 } 3088 3089 static bool 3090 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3091 { 3092 struct nvme_bdev *nbdev = ctx; 3093 struct nvme_ns *nvme_ns; 3094 struct spdk_nvme_ns *ns; 3095 struct spdk_nvme_ctrlr *ctrlr; 3096 const struct spdk_nvme_ctrlr_data *cdata; 3097 3098 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3099 assert(nvme_ns != NULL); 3100 ns = nvme_ns->ns; 3101 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3102 3103 switch (io_type) { 3104 case SPDK_BDEV_IO_TYPE_READ: 3105 case SPDK_BDEV_IO_TYPE_WRITE: 3106 case SPDK_BDEV_IO_TYPE_RESET: 3107 case SPDK_BDEV_IO_TYPE_FLUSH: 3108 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3109 case SPDK_BDEV_IO_TYPE_NVME_IO: 3110 case SPDK_BDEV_IO_TYPE_ABORT: 3111 return true; 3112 3113 case SPDK_BDEV_IO_TYPE_COMPARE: 3114 return spdk_nvme_ns_supports_compare(ns); 3115 3116 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3117 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3118 3119 case SPDK_BDEV_IO_TYPE_UNMAP: 3120 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3121 return cdata->oncs.dsm; 3122 3123 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3124 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3125 return cdata->oncs.write_zeroes; 3126 3127 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3128 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3129 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3130 return true; 3131 } 3132 return false; 3133 3134 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3135 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3136 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3137 3138 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3139 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3140 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3141 3142 case SPDK_BDEV_IO_TYPE_COPY: 3143 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3144 return cdata->oncs.copy; 3145 3146 default: 3147 return false; 3148 } 3149 } 3150 3151 static int 3152 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3153 { 3154 struct nvme_qpair *nvme_qpair; 3155 struct spdk_io_channel *pg_ch; 3156 int rc; 3157 3158 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3159 if (!nvme_qpair) { 3160 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3161 return -1; 3162 } 3163 3164 TAILQ_INIT(&nvme_qpair->io_path_list); 3165 3166 nvme_qpair->ctrlr = nvme_ctrlr; 3167 nvme_qpair->ctrlr_ch = ctrlr_ch; 3168 3169 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3170 if (!pg_ch) { 3171 free(nvme_qpair); 3172 return -1; 3173 } 3174 3175 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3176 3177 #ifdef SPDK_CONFIG_VTUNE 3178 nvme_qpair->group->collect_spin_stat = true; 3179 #else 3180 nvme_qpair->group->collect_spin_stat = false; 3181 #endif 3182 3183 if (!nvme_ctrlr->disabled) { 3184 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3185 * be created when it's enabled. 
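 *
 * Illustrative sketch only, with a hypothetical caller that is not part of this
 * module: the capabilities reported by bdev_nvme_io_type_supported() further above
 * are what bdev consumers see through the generic bdev API, e.g.
 *
 *   struct spdk_bdev *bdev = spdk_bdev_get_by_name("Nvme0n1");   // example name
 *
 *   if (bdev != NULL && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
 *           // UNMAP (NVMe dataset management) may be issued to this bdev
 *   }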
3186 */ 3187 rc = bdev_nvme_create_qpair(nvme_qpair); 3188 if (rc != 0) { 3189 /* nvme_ctrlr can't create IO qpair if connection is down. 3190 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3191 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3192 * submitted IO will be queued until IO qpair is successfully created. 3193 * 3194 * Hence, if both are satisfied, ignore the failure. 3195 */ 3196 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3197 spdk_put_io_channel(pg_ch); 3198 free(nvme_qpair); 3199 return rc; 3200 } 3201 } 3202 } 3203 3204 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3205 3206 ctrlr_ch->qpair = nvme_qpair; 3207 3208 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3209 nvme_qpair->ctrlr->ref++; 3210 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3211 3212 return 0; 3213 } 3214 3215 static int 3216 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3217 { 3218 struct nvme_ctrlr *nvme_ctrlr = io_device; 3219 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3220 3221 TAILQ_INIT(&ctrlr_ch->pending_resets); 3222 3223 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3224 } 3225 3226 static void 3227 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3228 { 3229 struct nvme_io_path *io_path, *next; 3230 3231 assert(nvme_qpair->group != NULL); 3232 3233 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3234 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3235 nvme_io_path_free(io_path); 3236 } 3237 3238 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3239 3240 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3241 3242 nvme_ctrlr_release(nvme_qpair->ctrlr); 3243 3244 free(nvme_qpair); 3245 } 3246 3247 static void 3248 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3249 { 3250 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3251 struct nvme_qpair *nvme_qpair; 3252 3253 nvme_qpair = ctrlr_ch->qpair; 3254 assert(nvme_qpair != NULL); 3255 3256 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3257 3258 if (nvme_qpair->qpair != NULL) { 3259 if (ctrlr_ch->reset_iter == NULL) { 3260 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3261 } else { 3262 /* Skip current ctrlr_channel in a full reset sequence because 3263 * it is being deleted now. The qpair is already being disconnected. 3264 * We do not have to restart disconnecting it. 3265 */ 3266 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3267 } 3268 3269 /* We cannot release a reference to the poll group now. 3270 * The qpair may be disconnected asynchronously later. 3271 * We need to poll it until it is actually disconnected. 3272 * Just detach the qpair from the deleting ctrlr_channel. 
3273 */ 3274 nvme_qpair->ctrlr_ch = NULL; 3275 } else { 3276 assert(ctrlr_ch->reset_iter == NULL); 3277 3278 nvme_qpair_delete(nvme_qpair); 3279 } 3280 } 3281 3282 static inline struct spdk_io_channel * 3283 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3284 { 3285 if (spdk_unlikely(!group->accel_channel)) { 3286 group->accel_channel = spdk_accel_get_io_channel(); 3287 if (!group->accel_channel) { 3288 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3289 group); 3290 return NULL; 3291 } 3292 } 3293 3294 return group->accel_channel; 3295 } 3296 3297 static void 3298 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3299 uint32_t iov_cnt, uint32_t seed, 3300 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3301 { 3302 struct spdk_io_channel *accel_ch; 3303 struct nvme_poll_group *group = ctx; 3304 int rc; 3305 3306 assert(cb_fn != NULL); 3307 3308 accel_ch = bdev_nvme_get_accel_channel(group); 3309 if (spdk_unlikely(accel_ch == NULL)) { 3310 cb_fn(cb_arg, -ENOMEM); 3311 return; 3312 } 3313 3314 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3315 if (rc) { 3316 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3317 if (rc == -ENOMEM || rc == -EINVAL) { 3318 cb_fn(cb_arg, rc); 3319 } 3320 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3321 } 3322 } 3323 3324 static void 3325 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3326 { 3327 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3328 } 3329 3330 static void 3331 bdev_nvme_abort_sequence(void *seq) 3332 { 3333 spdk_accel_sequence_abort(seq); 3334 } 3335 3336 static void 3337 bdev_nvme_reverse_sequence(void *seq) 3338 { 3339 spdk_accel_sequence_reverse(seq); 3340 } 3341 3342 static int 3343 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3344 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3345 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3346 { 3347 struct spdk_io_channel *ch; 3348 struct nvme_poll_group *group = ctx; 3349 3350 ch = bdev_nvme_get_accel_channel(group); 3351 if (spdk_unlikely(ch == NULL)) { 3352 return -ENOMEM; 3353 } 3354 3355 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3356 domain, domain_ctx, seed, cb_fn, cb_arg); 3357 } 3358 3359 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3360 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3361 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3362 .append_crc32c = bdev_nvme_append_crc32c, 3363 .finish_sequence = bdev_nvme_finish_sequence, 3364 .reverse_sequence = bdev_nvme_reverse_sequence, 3365 .abort_sequence = bdev_nvme_abort_sequence, 3366 }; 3367 3368 static int 3369 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3370 { 3371 struct nvme_poll_group *group = ctx_buf; 3372 3373 TAILQ_INIT(&group->qpair_list); 3374 3375 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3376 if (group->group == NULL) { 3377 return -1; 3378 } 3379 3380 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3381 3382 if (group->poller == NULL) { 3383 spdk_nvme_poll_group_destroy(group->group); 3384 return -1; 3385 } 3386 3387 return 0; 3388 } 3389 3390 static void 3391 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3392 { 3393 struct 
nvme_poll_group *group = ctx_buf; 3394 3395 assert(TAILQ_EMPTY(&group->qpair_list)); 3396 3397 if (group->accel_channel) { 3398 spdk_put_io_channel(group->accel_channel); 3399 } 3400 3401 spdk_poller_unregister(&group->poller); 3402 if (spdk_nvme_poll_group_destroy(group->group)) { 3403 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3404 assert(false); 3405 } 3406 } 3407 3408 static struct spdk_io_channel * 3409 bdev_nvme_get_io_channel(void *ctx) 3410 { 3411 struct nvme_bdev *nvme_bdev = ctx; 3412 3413 return spdk_get_io_channel(nvme_bdev); 3414 } 3415 3416 static void * 3417 bdev_nvme_get_module_ctx(void *ctx) 3418 { 3419 struct nvme_bdev *nvme_bdev = ctx; 3420 struct nvme_ns *nvme_ns; 3421 3422 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3423 return NULL; 3424 } 3425 3426 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3427 if (!nvme_ns) { 3428 return NULL; 3429 } 3430 3431 return nvme_ns->ns; 3432 } 3433 3434 static const char * 3435 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3436 { 3437 switch (ana_state) { 3438 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3439 return "optimized"; 3440 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3441 return "non_optimized"; 3442 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3443 return "inaccessible"; 3444 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3445 return "persistent_loss"; 3446 case SPDK_NVME_ANA_CHANGE_STATE: 3447 return "change"; 3448 default: 3449 return NULL; 3450 } 3451 } 3452 3453 static int 3454 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3455 { 3456 struct spdk_memory_domain **_domains = NULL; 3457 struct nvme_bdev *nbdev = ctx; 3458 struct nvme_ns *nvme_ns; 3459 int i = 0, _array_size = array_size; 3460 int rc = 0; 3461 3462 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3463 if (domains && array_size >= i) { 3464 _domains = &domains[i]; 3465 } else { 3466 _domains = NULL; 3467 } 3468 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3469 if (rc > 0) { 3470 i += rc; 3471 if (_array_size >= rc) { 3472 _array_size -= rc; 3473 } else { 3474 _array_size = 0; 3475 } 3476 } else if (rc < 0) { 3477 return rc; 3478 } 3479 } 3480 3481 return i; 3482 } 3483 3484 static const char * 3485 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3486 { 3487 if (nvme_ctrlr->destruct) { 3488 return "deleting"; 3489 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3490 return "failed"; 3491 } else if (nvme_ctrlr->resetting) { 3492 return "resetting"; 3493 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3494 return "reconnect_is_delayed"; 3495 } else if (nvme_ctrlr->disabled) { 3496 return "disabled"; 3497 } else { 3498 return "enabled"; 3499 } 3500 } 3501 3502 void 3503 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3504 { 3505 struct spdk_nvme_transport_id *trid; 3506 const struct spdk_nvme_ctrlr_opts *opts; 3507 const struct spdk_nvme_ctrlr_data *cdata; 3508 struct nvme_path_id *path_id; 3509 3510 spdk_json_write_object_begin(w); 3511 3512 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3513 3514 #ifdef SPDK_CONFIG_NVME_CUSE 3515 size_t cuse_name_size = 128; 3516 char cuse_name[cuse_name_size]; 3517 3518 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3519 if (rc == 0) { 3520 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3521 } 3522 #endif 3523 trid = &nvme_ctrlr->active_path_id->trid; 3524 
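/* Illustrative sketch only, with a hypothetical caller that is not part of this
 * module: bdev_nvme_get_memory_domains() above backs the generic query API; a
 * common pattern is to probe the count first and then fetch the array:
 *
 *   int cnt = spdk_bdev_get_memory_domains(bdev, NULL, 0);
 *   struct spdk_memory_domain **domains = calloc(cnt, sizeof(*domains));
 *
 *   if (domains != NULL) {
 *           spdk_bdev_get_memory_domains(bdev, domains, cnt);
 *   }
 */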
spdk_json_write_named_object_begin(w, "trid"); 3525 nvme_bdev_dump_trid_json(trid, w); 3526 spdk_json_write_object_end(w); 3527 3528 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3529 if (path_id != NULL) { 3530 spdk_json_write_named_array_begin(w, "alternate_trids"); 3531 do { 3532 trid = &path_id->trid; 3533 spdk_json_write_object_begin(w); 3534 nvme_bdev_dump_trid_json(trid, w); 3535 spdk_json_write_object_end(w); 3536 3537 path_id = TAILQ_NEXT(path_id, link); 3538 } while (path_id != NULL); 3539 spdk_json_write_array_end(w); 3540 } 3541 3542 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3543 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3544 3545 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3546 spdk_json_write_named_object_begin(w, "host"); 3547 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3548 spdk_json_write_named_string(w, "addr", opts->src_addr); 3549 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3550 spdk_json_write_object_end(w); 3551 3552 spdk_json_write_object_end(w); 3553 } 3554 3555 static void 3556 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3557 struct nvme_ns *nvme_ns) 3558 { 3559 struct spdk_nvme_ns *ns; 3560 struct spdk_nvme_ctrlr *ctrlr; 3561 const struct spdk_nvme_ctrlr_data *cdata; 3562 const struct spdk_nvme_transport_id *trid; 3563 union spdk_nvme_vs_register vs; 3564 const struct spdk_nvme_ns_data *nsdata; 3565 char buf[128]; 3566 3567 ns = nvme_ns->ns; 3568 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3569 3570 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3571 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3572 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3573 3574 spdk_json_write_object_begin(w); 3575 3576 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3577 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3578 } 3579 3580 spdk_json_write_named_object_begin(w, "trid"); 3581 3582 nvme_bdev_dump_trid_json(trid, w); 3583 3584 spdk_json_write_object_end(w); 3585 3586 #ifdef SPDK_CONFIG_NVME_CUSE 3587 size_t cuse_name_size = 128; 3588 char cuse_name[cuse_name_size]; 3589 3590 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3591 cuse_name, &cuse_name_size); 3592 if (rc == 0) { 3593 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3594 } 3595 #endif 3596 3597 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3598 3599 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3600 3601 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3602 3603 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3604 spdk_str_trim(buf); 3605 spdk_json_write_named_string(w, "model_number", buf); 3606 3607 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3608 spdk_str_trim(buf); 3609 spdk_json_write_named_string(w, "serial_number", buf); 3610 3611 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3612 spdk_str_trim(buf); 3613 spdk_json_write_named_string(w, "firmware_revision", buf); 3614 3615 if (cdata->subnqn[0] != '\0') { 3616 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3617 } 3618 3619 spdk_json_write_named_object_begin(w, "oacs"); 3620 3621 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3622 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3623 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3624 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3625 3626 spdk_json_write_object_end(w); 3627 3628 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
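/* Abridged illustrative example only; the field values below are invented. The
 * per-namespace object emitted by this function looks roughly like
 *
 *   {
 *     "trid": { "trtype": "PCIe", "traddr": "0000:5e:00.0" },
 *     "ctrlr_data": { "cntlid": 0, "vendor_id": "0x8086", "oacs": { ... }, ... },
 *     "vs": { "nvme_version": "1.4" },
 *     "ns_data": { "id": 1, "can_share": false }
 *   }
 */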
3629 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3630 3631 spdk_json_write_object_end(w); 3632 3633 spdk_json_write_named_object_begin(w, "vs"); 3634 3635 spdk_json_write_name(w, "nvme_version"); 3636 if (vs.bits.ter) { 3637 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3638 } else { 3639 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3640 } 3641 3642 spdk_json_write_object_end(w); 3643 3644 nsdata = spdk_nvme_ns_get_data(ns); 3645 3646 spdk_json_write_named_object_begin(w, "ns_data"); 3647 3648 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3649 3650 if (cdata->cmic.ana_reporting) { 3651 spdk_json_write_named_string(w, "ana_state", 3652 _nvme_ana_state_str(nvme_ns->ana_state)); 3653 } 3654 3655 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3656 3657 spdk_json_write_object_end(w); 3658 3659 if (cdata->oacs.security) { 3660 spdk_json_write_named_object_begin(w, "security"); 3661 3662 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3663 3664 spdk_json_write_object_end(w); 3665 } 3666 3667 spdk_json_write_object_end(w); 3668 } 3669 3670 static const char * 3671 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3672 { 3673 switch (nbdev->mp_policy) { 3674 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3675 return "active_passive"; 3676 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3677 return "active_active"; 3678 default: 3679 assert(false); 3680 return "invalid"; 3681 } 3682 } 3683 3684 static int 3685 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3686 { 3687 struct nvme_bdev *nvme_bdev = ctx; 3688 struct nvme_ns *nvme_ns; 3689 3690 pthread_mutex_lock(&nvme_bdev->mutex); 3691 spdk_json_write_named_array_begin(w, "nvme"); 3692 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3693 nvme_namespace_info_json(w, nvme_ns); 3694 } 3695 spdk_json_write_array_end(w); 3696 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3697 pthread_mutex_unlock(&nvme_bdev->mutex); 3698 3699 return 0; 3700 } 3701 3702 static void 3703 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3704 { 3705 /* No config per bdev needed */ 3706 } 3707 3708 static uint64_t 3709 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3710 { 3711 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3712 struct nvme_io_path *io_path; 3713 struct nvme_poll_group *group; 3714 uint64_t spin_time = 0; 3715 3716 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3717 group = io_path->qpair->group; 3718 3719 if (!group || !group->collect_spin_stat) { 3720 continue; 3721 } 3722 3723 if (group->end_ticks != 0) { 3724 group->spin_ticks += (group->end_ticks - group->start_ticks); 3725 group->end_ticks = 0; 3726 } 3727 3728 spin_time += group->spin_ticks; 3729 group->start_ticks = 0; 3730 group->spin_ticks = 0; 3731 } 3732 3733 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3734 } 3735 3736 static void 3737 bdev_nvme_reset_device_stat(void *ctx) 3738 { 3739 struct nvme_bdev *nbdev = ctx; 3740 3741 if (nbdev->err_stat != NULL) { 3742 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3743 } 3744 } 3745 3746 /* JSON string should be lowercases and underscore delimited string. 
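 *
 * Illustrative examples only; the actual input strings come from
 * spdk_nvme_cpl_get_status_string() / spdk_nvme_cpl_get_status_type_string():
 *
 *   "INVALID FIELD"        -> "invalid_field"
 *   "DATA TRANSFER ERROR"  -> "data_transfer_error"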
*/ 3747 static void 3748 bdev_nvme_format_nvme_status(char *dst, const char *src) 3749 { 3750 char tmp[256]; 3751 3752 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3753 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3754 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3755 spdk_strlwr(dst); 3756 } 3757 3758 static void 3759 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3760 { 3761 struct nvme_bdev *nbdev = ctx; 3762 struct spdk_nvme_status status = {}; 3763 uint16_t sct, sc; 3764 char status_json[256]; 3765 const char *status_str; 3766 3767 if (nbdev->err_stat == NULL) { 3768 return; 3769 } 3770 3771 spdk_json_write_named_object_begin(w, "nvme_error"); 3772 3773 spdk_json_write_named_object_begin(w, "status_type"); 3774 for (sct = 0; sct < 8; sct++) { 3775 if (nbdev->err_stat->status_type[sct] == 0) { 3776 continue; 3777 } 3778 status.sct = sct; 3779 3780 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3781 assert(status_str != NULL); 3782 bdev_nvme_format_nvme_status(status_json, status_str); 3783 3784 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3785 } 3786 spdk_json_write_object_end(w); 3787 3788 spdk_json_write_named_object_begin(w, "status_code"); 3789 for (sct = 0; sct < 4; sct++) { 3790 status.sct = sct; 3791 for (sc = 0; sc < 256; sc++) { 3792 if (nbdev->err_stat->status[sct][sc] == 0) { 3793 continue; 3794 } 3795 status.sc = sc; 3796 3797 status_str = spdk_nvme_cpl_get_status_string(&status); 3798 assert(status_str != NULL); 3799 bdev_nvme_format_nvme_status(status_json, status_str); 3800 3801 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3802 } 3803 } 3804 spdk_json_write_object_end(w); 3805 3806 spdk_json_write_object_end(w); 3807 } 3808 3809 static bool 3810 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3811 { 3812 struct nvme_bdev *nbdev = ctx; 3813 struct spdk_nvme_ctrlr *ctrlr; 3814 3815 if (!g_opts.allow_accel_sequence) { 3816 return false; 3817 } 3818 3819 switch (type) { 3820 case SPDK_BDEV_IO_TYPE_WRITE: 3821 case SPDK_BDEV_IO_TYPE_READ: 3822 break; 3823 default: 3824 return false; 3825 } 3826 3827 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3828 assert(ctrlr != NULL); 3829 3830 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3831 } 3832 3833 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3834 .destruct = bdev_nvme_destruct, 3835 .submit_request = bdev_nvme_submit_request, 3836 .io_type_supported = bdev_nvme_io_type_supported, 3837 .get_io_channel = bdev_nvme_get_io_channel, 3838 .dump_info_json = bdev_nvme_dump_info_json, 3839 .write_config_json = bdev_nvme_write_config_json, 3840 .get_spin_time = bdev_nvme_get_spin_time, 3841 .get_module_ctx = bdev_nvme_get_module_ctx, 3842 .get_memory_domains = bdev_nvme_get_memory_domains, 3843 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3844 .reset_device_stat = bdev_nvme_reset_device_stat, 3845 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3846 }; 3847 3848 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3849 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3850 3851 static int 3852 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3853 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3854 { 3855 struct spdk_nvme_ana_group_descriptor *copied_desc; 3856 uint8_t *orig_desc; 3857 uint32_t i, desc_size, copy_len; 3858 int rc = 0; 3859 3860 if (nvme_ctrlr->ana_log_page == NULL) { 3861 return 
-EINVAL; 3862 } 3863 3864 copied_desc = nvme_ctrlr->copied_ana_desc; 3865 3866 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3867 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3868 3869 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3870 memcpy(copied_desc, orig_desc, copy_len); 3871 3872 rc = cb_fn(copied_desc, cb_arg); 3873 if (rc != 0) { 3874 break; 3875 } 3876 3877 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3878 copied_desc->num_of_nsid * sizeof(uint32_t); 3879 orig_desc += desc_size; 3880 copy_len -= desc_size; 3881 } 3882 3883 return rc; 3884 } 3885 3886 static int 3887 nvme_ns_ana_transition_timedout(void *ctx) 3888 { 3889 struct nvme_ns *nvme_ns = ctx; 3890 3891 spdk_poller_unregister(&nvme_ns->anatt_timer); 3892 nvme_ns->ana_transition_timedout = true; 3893 3894 return SPDK_POLLER_BUSY; 3895 } 3896 3897 static void 3898 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3899 const struct spdk_nvme_ana_group_descriptor *desc) 3900 { 3901 const struct spdk_nvme_ctrlr_data *cdata; 3902 3903 nvme_ns->ana_group_id = desc->ana_group_id; 3904 nvme_ns->ana_state = desc->ana_state; 3905 nvme_ns->ana_state_updating = false; 3906 3907 switch (nvme_ns->ana_state) { 3908 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3909 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3910 nvme_ns->ana_transition_timedout = false; 3911 spdk_poller_unregister(&nvme_ns->anatt_timer); 3912 break; 3913 3914 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3915 case SPDK_NVME_ANA_CHANGE_STATE: 3916 if (nvme_ns->anatt_timer != NULL) { 3917 break; 3918 } 3919 3920 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3921 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3922 nvme_ns, 3923 cdata->anatt * SPDK_SEC_TO_USEC); 3924 break; 3925 default: 3926 break; 3927 } 3928 } 3929 3930 static int 3931 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3932 { 3933 struct nvme_ns *nvme_ns = cb_arg; 3934 uint32_t i; 3935 3936 for (i = 0; i < desc->num_of_nsid; i++) { 3937 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3938 continue; 3939 } 3940 3941 _nvme_ns_set_ana_state(nvme_ns, desc); 3942 return 1; 3943 } 3944 3945 return 0; 3946 } 3947 3948 static struct spdk_uuid 3949 nvme_generate_uuid(const char *sn, uint32_t nsid) 3950 { 3951 struct spdk_uuid new_uuid, namespace_uuid; 3952 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3953 /* This namespace UUID was generated using uuid_generate() method. 
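	 * It is used as a fixed SHA-1 "namespace" so that spdk_uuid_generate_sha1() below
	 * derives the same UUID for a given controller serial number and NSID on every attach.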
*/ 3954 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3955 int size; 3956 3957 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3958 3959 spdk_uuid_set_null(&new_uuid); 3960 spdk_uuid_set_null(&namespace_uuid); 3961 3962 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3963 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3964 3965 spdk_uuid_parse(&namespace_uuid, namespace_str); 3966 3967 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3968 3969 return new_uuid; 3970 } 3971 3972 static int 3973 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3974 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3975 uint32_t prchk_flags, void *ctx) 3976 { 3977 const struct spdk_uuid *uuid; 3978 const uint8_t *nguid; 3979 const struct spdk_nvme_ctrlr_data *cdata; 3980 const struct spdk_nvme_ns_data *nsdata; 3981 const struct spdk_nvme_ctrlr_opts *opts; 3982 enum spdk_nvme_csi csi; 3983 uint32_t atomic_bs, phys_bs, bs; 3984 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3985 3986 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3987 csi = spdk_nvme_ns_get_csi(ns); 3988 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3989 3990 switch (csi) { 3991 case SPDK_NVME_CSI_NVM: 3992 disk->product_name = "NVMe disk"; 3993 break; 3994 case SPDK_NVME_CSI_ZNS: 3995 disk->product_name = "NVMe ZNS disk"; 3996 disk->zoned = true; 3997 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3998 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3999 spdk_nvme_ns_get_extended_sector_size(ns); 4000 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4001 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4002 break; 4003 default: 4004 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4005 return -ENOTSUP; 4006 } 4007 4008 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4009 if (!disk->name) { 4010 return -ENOMEM; 4011 } 4012 4013 disk->write_cache = 0; 4014 if (cdata->vwc.present) { 4015 /* Enable if the Volatile Write Cache exists */ 4016 disk->write_cache = 1; 4017 } 4018 if (cdata->oncs.write_zeroes) { 4019 disk->max_write_zeroes = UINT16_MAX + 1; 4020 } 4021 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4022 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4023 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4024 /* NVMe driver will split one request into multiple requests 4025 * based on MDTS and stripe boundary, the bdev layer will use 4026 * max_segment_size and max_num_segments to split one big IO 4027 * into multiple requests, then small request can't run out 4028 * of NVMe internal requests data structure. 
4029 */ 4030 if (opts && opts->io_queue_requests) { 4031 disk->max_num_segments = opts->io_queue_requests / 2; 4032 } 4033 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4034 4035 nguid = spdk_nvme_ns_get_nguid(ns); 4036 if (!nguid) { 4037 uuid = spdk_nvme_ns_get_uuid(ns); 4038 if (uuid) { 4039 disk->uuid = *uuid; 4040 } else if (g_opts.generate_uuids) { 4041 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4042 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 4043 } 4044 } else { 4045 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4046 } 4047 4048 nsdata = spdk_nvme_ns_get_data(ns); 4049 bs = spdk_nvme_ns_get_sector_size(ns); 4050 atomic_bs = bs; 4051 phys_bs = bs; 4052 if (nsdata->nabo == 0) { 4053 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4054 atomic_bs = bs * (1 + nsdata->nawupf); 4055 } else { 4056 atomic_bs = bs * (1 + cdata->awupf); 4057 } 4058 } 4059 if (nsdata->nsfeat.optperf) { 4060 phys_bs = bs * (1 + nsdata->npwg); 4061 } 4062 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4063 4064 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4065 if (disk->md_len != 0) { 4066 disk->md_interleave = nsdata->flbas.extended; 4067 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4068 if (disk->dif_type != SPDK_DIF_DISABLE) { 4069 disk->dif_is_head_of_md = nsdata->dps.md_start; 4070 disk->dif_check_flags = prchk_flags; 4071 } 4072 } 4073 4074 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4075 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4076 disk->acwu = 0; 4077 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4078 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4079 } else { 4080 disk->acwu = cdata->acwu + 1; /* 0-based */ 4081 } 4082 4083 if (cdata->oncs.copy) { 4084 /* For now bdev interface allows only single segment copy */ 4085 disk->max_copy = nsdata->mssrl; 4086 } 4087 4088 disk->ctxt = ctx; 4089 disk->fn_table = &nvmelib_fn_table; 4090 disk->module = &nvme_if; 4091 4092 return 0; 4093 } 4094 4095 static struct nvme_bdev * 4096 nvme_bdev_alloc(void) 4097 { 4098 struct nvme_bdev *bdev; 4099 int rc; 4100 4101 bdev = calloc(1, sizeof(*bdev)); 4102 if (!bdev) { 4103 SPDK_ERRLOG("bdev calloc() failed\n"); 4104 return NULL; 4105 } 4106 4107 if (g_opts.nvme_error_stat) { 4108 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4109 if (!bdev->err_stat) { 4110 SPDK_ERRLOG("err_stat calloc() failed\n"); 4111 free(bdev); 4112 return NULL; 4113 } 4114 } 4115 4116 rc = pthread_mutex_init(&bdev->mutex, NULL); 4117 if (rc != 0) { 4118 free(bdev->err_stat); 4119 free(bdev); 4120 return NULL; 4121 } 4122 4123 bdev->ref = 1; 4124 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4125 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4126 bdev->rr_min_io = UINT32_MAX; 4127 TAILQ_INIT(&bdev->nvme_ns_list); 4128 4129 return bdev; 4130 } 4131 4132 static int 4133 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4134 { 4135 struct nvme_bdev *bdev; 4136 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4137 int rc; 4138 4139 bdev = nvme_bdev_alloc(); 4140 if (bdev == NULL) { 4141 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4142 return -ENOMEM; 4143 } 4144 4145 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4146 4147 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4148 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4149 if (rc != 0) { 4150 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4151 nvme_bdev_free(bdev); 4152 return rc; 4153 } 4154 
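	/* Expose the nvme_bdev as an io_device so that a per-thread nvme_bdev_channel
	 * can be created for it; the bdev is registered with the bdev layer right after.
	 */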
4155 spdk_io_device_register(bdev, 4156 bdev_nvme_create_bdev_channel_cb, 4157 bdev_nvme_destroy_bdev_channel_cb, 4158 sizeof(struct nvme_bdev_channel), 4159 bdev->disk.name); 4160 4161 nvme_ns->bdev = bdev; 4162 bdev->nsid = nvme_ns->id; 4163 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4164 4165 bdev->nbdev_ctrlr = nbdev_ctrlr; 4166 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4167 4168 rc = spdk_bdev_register(&bdev->disk); 4169 if (rc != 0) { 4170 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4171 spdk_io_device_unregister(bdev, NULL); 4172 nvme_ns->bdev = NULL; 4173 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4174 nvme_bdev_free(bdev); 4175 return rc; 4176 } 4177 4178 return 0; 4179 } 4180 4181 static bool 4182 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4183 { 4184 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4185 const struct spdk_uuid *uuid1, *uuid2; 4186 4187 nsdata1 = spdk_nvme_ns_get_data(ns1); 4188 nsdata2 = spdk_nvme_ns_get_data(ns2); 4189 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4190 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4191 4192 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4193 nsdata1->eui64 == nsdata2->eui64 && 4194 ((uuid1 == NULL && uuid2 == NULL) || 4195 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4196 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4197 } 4198 4199 static bool 4200 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4201 struct spdk_nvme_ctrlr_opts *opts) 4202 { 4203 struct nvme_probe_skip_entry *entry; 4204 4205 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4206 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4207 return false; 4208 } 4209 } 4210 4211 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4212 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4213 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4214 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4215 opts->disable_read_ana_log_page = true; 4216 4217 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4218 4219 return true; 4220 } 4221 4222 static void 4223 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4224 { 4225 struct nvme_ctrlr *nvme_ctrlr = ctx; 4226 4227 if (spdk_nvme_cpl_is_error(cpl)) { 4228 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4229 cpl->status.sct); 4230 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4231 } else if (cpl->cdw0 & 0x1) { 4232 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4233 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4234 } 4235 } 4236 4237 static void 4238 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4239 struct spdk_nvme_qpair *qpair, uint16_t cid) 4240 { 4241 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4242 union spdk_nvme_csts_register csts; 4243 int rc; 4244 4245 assert(nvme_ctrlr->ctrlr == ctrlr); 4246 4247 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4248 4249 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4250 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4251 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4252 * completion recursively. 
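	 * (On PCIe the CSTS register is read directly over MMIO, so no command is issued.)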
4253 */ 4254 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4255 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4256 if (csts.bits.cfs) { 4257 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4258 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4259 return; 4260 } 4261 } 4262 4263 switch (g_opts.action_on_timeout) { 4264 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4265 if (qpair) { 4266 /* Don't send abort to ctrlr when ctrlr is not available. */ 4267 pthread_mutex_lock(&nvme_ctrlr->mutex); 4268 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4269 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4270 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4271 return; 4272 } 4273 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4274 4275 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4276 nvme_abort_cpl, nvme_ctrlr); 4277 if (rc == 0) { 4278 return; 4279 } 4280 4281 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4282 } 4283 4284 /* FALLTHROUGH */ 4285 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4286 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4287 break; 4288 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4289 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4290 break; 4291 default: 4292 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4293 break; 4294 } 4295 } 4296 4297 static struct nvme_ns * 4298 nvme_ns_alloc(void) 4299 { 4300 struct nvme_ns *nvme_ns; 4301 4302 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4303 if (nvme_ns == NULL) { 4304 return NULL; 4305 } 4306 4307 if (g_opts.io_path_stat) { 4308 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4309 if (nvme_ns->stat == NULL) { 4310 free(nvme_ns); 4311 return NULL; 4312 } 4313 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4314 } 4315 4316 return nvme_ns; 4317 } 4318 4319 static void 4320 nvme_ns_free(struct nvme_ns *nvme_ns) 4321 { 4322 free(nvme_ns->stat); 4323 free(nvme_ns); 4324 } 4325 4326 static void 4327 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4328 { 4329 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4330 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4331 4332 if (rc == 0) { 4333 nvme_ns->probe_ctx = NULL; 4334 pthread_mutex_lock(&nvme_ctrlr->mutex); 4335 nvme_ctrlr->ref++; 4336 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4337 } else { 4338 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4339 nvme_ns_free(nvme_ns); 4340 } 4341 4342 if (ctx) { 4343 ctx->populates_in_progress--; 4344 if (ctx->populates_in_progress == 0) { 4345 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4346 } 4347 } 4348 } 4349 4350 static void 4351 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4352 { 4353 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4354 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4355 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4356 int rc; 4357 4358 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4359 if (rc != 0) { 4360 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4361 } 4362 4363 spdk_for_each_channel_continue(i, rc); 4364 } 4365 4366 static void 4367 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4368 { 4369 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4370 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4371 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4372 struct nvme_io_path *io_path; 4373 4374 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4375 if (io_path != NULL) { 4376 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4377 } 4378 4379 spdk_for_each_channel_continue(i, 0); 4380 } 4381 4382 static void 4383 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4384 { 4385 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4386 4387 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4388 } 4389 4390 static void 4391 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4392 { 4393 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4394 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4395 4396 if (status == 0) { 4397 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4398 } else { 4399 /* Delete the added io_paths and fail populating the namespace. */ 4400 spdk_for_each_channel(bdev, 4401 bdev_nvme_delete_io_path, 4402 nvme_ns, 4403 bdev_nvme_add_io_path_failed); 4404 } 4405 } 4406 4407 static int 4408 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4409 { 4410 struct nvme_ns *tmp_ns; 4411 const struct spdk_nvme_ns_data *nsdata; 4412 4413 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4414 if (!nsdata->nmic.can_share) { 4415 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4416 return -EINVAL; 4417 } 4418 4419 pthread_mutex_lock(&bdev->mutex); 4420 4421 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4422 assert(tmp_ns != NULL); 4423 4424 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4425 pthread_mutex_unlock(&bdev->mutex); 4426 SPDK_ERRLOG("Namespaces are not identical.\n"); 4427 return -EINVAL; 4428 } 4429 4430 bdev->ref++; 4431 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4432 nvme_ns->bdev = bdev; 4433 4434 pthread_mutex_unlock(&bdev->mutex); 4435 4436 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
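	 * spdk_for_each_channel() visits every existing nvme_bdev_channel;
	 * bdev_nvme_add_io_path_done() rolls the additions back if any channel fails.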
*/ 4437 spdk_for_each_channel(bdev, 4438 bdev_nvme_add_io_path, 4439 nvme_ns, 4440 bdev_nvme_add_io_path_done); 4441 4442 return 0; 4443 } 4444 4445 static void 4446 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4447 { 4448 struct spdk_nvme_ns *ns; 4449 struct nvme_bdev *bdev; 4450 int rc = 0; 4451 4452 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4453 if (!ns) { 4454 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4455 rc = -EINVAL; 4456 goto done; 4457 } 4458 4459 nvme_ns->ns = ns; 4460 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4461 4462 if (nvme_ctrlr->ana_log_page != NULL) { 4463 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4464 } 4465 4466 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4467 if (bdev == NULL) { 4468 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4469 } else { 4470 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4471 if (rc == 0) { 4472 return; 4473 } 4474 } 4475 done: 4476 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4477 } 4478 4479 static void 4480 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4481 { 4482 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4483 4484 assert(nvme_ctrlr != NULL); 4485 4486 pthread_mutex_lock(&nvme_ctrlr->mutex); 4487 4488 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4489 4490 if (nvme_ns->bdev != NULL) { 4491 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4492 return; 4493 } 4494 4495 nvme_ns_free(nvme_ns); 4496 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4497 4498 nvme_ctrlr_release(nvme_ctrlr); 4499 } 4500 4501 static void 4502 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4503 { 4504 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4505 4506 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4507 } 4508 4509 static void 4510 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4511 { 4512 struct nvme_bdev *bdev; 4513 4514 spdk_poller_unregister(&nvme_ns->anatt_timer); 4515 4516 bdev = nvme_ns->bdev; 4517 if (bdev != NULL) { 4518 pthread_mutex_lock(&bdev->mutex); 4519 4520 assert(bdev->ref > 0); 4521 bdev->ref--; 4522 if (bdev->ref == 0) { 4523 pthread_mutex_unlock(&bdev->mutex); 4524 4525 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4526 } else { 4527 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4528 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4529 * and clear nvme_ns->bdev here. 4530 */ 4531 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4532 nvme_ns->bdev = NULL; 4533 4534 pthread_mutex_unlock(&bdev->mutex); 4535 4536 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4537 * we call depopulate_namespace_done() to avoid use-after-free. 4538 */ 4539 spdk_for_each_channel(bdev, 4540 bdev_nvme_delete_io_path, 4541 nvme_ns, 4542 bdev_nvme_delete_io_path_done); 4543 return; 4544 } 4545 } 4546 4547 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4548 } 4549 4550 static void 4551 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4552 struct nvme_async_probe_ctx *ctx) 4553 { 4554 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4555 struct nvme_ns *nvme_ns, *next; 4556 struct spdk_nvme_ns *ns; 4557 struct nvme_bdev *bdev; 4558 uint32_t nsid; 4559 int rc; 4560 uint64_t num_sectors; 4561 4562 if (ctx) { 4563 /* Initialize this count to 1 to handle the populate functions 4564 * calling nvme_ctrlr_populate_namespace_done() immediately. 
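		 * The matching decrement at the bottom of this function fires
		 * nvme_ctrlr_populate_namespaces_done() once all synchronous completions are in.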
		 */
		ctx->populates_in_progress = 1;
	}

	/* First loop over our existing namespaces and see if they have been
	 * removed. */
	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);

		if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
			/* NS is still there but attributes may have changed */
			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
			bdev = nvme_ns->bdev;
			assert(bdev != NULL);
			if (bdev->disk.blockcnt != num_sectors) {
				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
					       nvme_ns->id,
					       bdev->disk.name,
					       bdev->disk.blockcnt,
					       num_sectors);
				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
				if (rc != 0) {
					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
						    bdev->disk.name, rc);
				}
			}
		} else {
			/* Namespace was removed */
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}

		nvme_ns = next;
	}

	/* Loop through all of the namespaces at the nvme level and see if any of them are new */
	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
	while (nsid != 0) {
		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);

		if (nvme_ns == NULL) {
			/* Found a new one */
			nvme_ns = nvme_ns_alloc();
			if (nvme_ns == NULL) {
				SPDK_ERRLOG("Failed to allocate namespace\n");
				/* This just fails to attach the namespace. It may work on a future
				 * attempt. Advance to the next active NSID here so that a repeated
				 * allocation failure cannot spin on the same namespace forever.
				 */
				nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
				continue;
			}

			nvme_ns->id = nsid;
			nvme_ns->ctrlr = nvme_ctrlr;

			nvme_ns->bdev = NULL;

			if (ctx) {
				ctx->populates_in_progress++;
			}
			nvme_ns->probe_ctx = ctx;

			RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);

			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
		}

		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
	}

	if (ctx) {
		/* Decrement this count now that the loop is over to account
		 * for the one we started with. If the count is then 0, we
		 * know any populate_namespace functions completed immediately,
		 * so we'll kick the callback here.
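		 * nvme_ctrlr_populate_namespaces_done() then reports the names of the
		 * created bdevs back to the caller via populate_namespaces_cb().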
4638 */ 4639 ctx->populates_in_progress--; 4640 if (ctx->populates_in_progress == 0) { 4641 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4642 } 4643 } 4644 4645 } 4646 4647 static void 4648 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4649 { 4650 struct nvme_ns *nvme_ns, *tmp; 4651 4652 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4653 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4654 } 4655 } 4656 4657 static uint32_t 4658 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4659 { 4660 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4661 const struct spdk_nvme_ctrlr_data *cdata; 4662 uint32_t nsid, ns_count = 0; 4663 4664 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4665 4666 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4667 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4668 ns_count++; 4669 } 4670 4671 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4672 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4673 sizeof(uint32_t); 4674 } 4675 4676 static int 4677 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4678 void *cb_arg) 4679 { 4680 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4681 struct nvme_ns *nvme_ns; 4682 uint32_t i, nsid; 4683 4684 for (i = 0; i < desc->num_of_nsid; i++) { 4685 nsid = desc->nsid[i]; 4686 if (nsid == 0) { 4687 continue; 4688 } 4689 4690 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4691 4692 assert(nvme_ns != NULL); 4693 if (nvme_ns == NULL) { 4694 /* Target told us that an inactive namespace had an ANA change */ 4695 continue; 4696 } 4697 4698 _nvme_ns_set_ana_state(nvme_ns, desc); 4699 } 4700 4701 return 0; 4702 } 4703 4704 static void 4705 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4706 { 4707 struct nvme_ns *nvme_ns; 4708 4709 spdk_free(nvme_ctrlr->ana_log_page); 4710 nvme_ctrlr->ana_log_page = NULL; 4711 4712 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4713 nvme_ns != NULL; 4714 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4715 nvme_ns->ana_state_updating = false; 4716 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4717 } 4718 } 4719 4720 static void 4721 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4722 { 4723 struct nvme_ctrlr *nvme_ctrlr = ctx; 4724 4725 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4726 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4727 nvme_ctrlr); 4728 } else { 4729 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4730 } 4731 4732 pthread_mutex_lock(&nvme_ctrlr->mutex); 4733 4734 assert(nvme_ctrlr->ana_log_page_updating == true); 4735 nvme_ctrlr->ana_log_page_updating = false; 4736 4737 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4738 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4739 4740 nvme_ctrlr_unregister(nvme_ctrlr); 4741 } else { 4742 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4743 4744 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4745 } 4746 } 4747 4748 static int 4749 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4750 { 4751 uint32_t ana_log_page_size; 4752 int rc; 4753 4754 if (nvme_ctrlr->ana_log_page == NULL) { 4755 return -EINVAL; 4756 } 4757 4758 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4759 4760 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4761 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4762 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4763 
return -EINVAL; 4764 } 4765 4766 pthread_mutex_lock(&nvme_ctrlr->mutex); 4767 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4768 nvme_ctrlr->ana_log_page_updating) { 4769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4770 return -EBUSY; 4771 } 4772 4773 nvme_ctrlr->ana_log_page_updating = true; 4774 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4775 4776 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4777 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4778 SPDK_NVME_GLOBAL_NS_TAG, 4779 nvme_ctrlr->ana_log_page, 4780 ana_log_page_size, 0, 4781 nvme_ctrlr_read_ana_log_page_done, 4782 nvme_ctrlr); 4783 if (rc != 0) { 4784 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4785 } 4786 4787 return rc; 4788 } 4789 4790 static void 4791 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4792 { 4793 } 4794 4795 struct bdev_nvme_set_preferred_path_ctx { 4796 struct spdk_bdev_desc *desc; 4797 struct nvme_ns *nvme_ns; 4798 bdev_nvme_set_preferred_path_cb cb_fn; 4799 void *cb_arg; 4800 }; 4801 4802 static void 4803 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4804 { 4805 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4806 4807 assert(ctx != NULL); 4808 assert(ctx->desc != NULL); 4809 assert(ctx->cb_fn != NULL); 4810 4811 spdk_bdev_close(ctx->desc); 4812 4813 ctx->cb_fn(ctx->cb_arg, status); 4814 4815 free(ctx); 4816 } 4817 4818 static void 4819 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4820 { 4821 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4822 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4823 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4824 struct nvme_io_path *io_path, *prev; 4825 4826 prev = NULL; 4827 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4828 if (io_path->nvme_ns == ctx->nvme_ns) { 4829 break; 4830 } 4831 prev = io_path; 4832 } 4833 4834 if (io_path != NULL) { 4835 if (prev != NULL) { 4836 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4837 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4838 } 4839 4840 /* We can set io_path to nbdev_ch->current_io_path directly here. 4841 * However, it needs to be conditional. To simplify the code, 4842 * just clear nbdev_ch->current_io_path and let find_io_path() 4843 * fill it. 4844 * 4845 * Automatic failback may be disabled. Hence even if the io_path is 4846 * already at the head, clear nbdev_ch->current_io_path. 4847 */ 4848 bdev_nvme_clear_current_io_path(nbdev_ch); 4849 } 4850 4851 spdk_for_each_channel_continue(i, 0); 4852 } 4853 4854 static struct nvme_ns * 4855 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4856 { 4857 struct nvme_ns *nvme_ns, *prev; 4858 const struct spdk_nvme_ctrlr_data *cdata; 4859 4860 prev = NULL; 4861 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4862 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4863 4864 if (cdata->cntlid == cntlid) { 4865 break; 4866 } 4867 prev = nvme_ns; 4868 } 4869 4870 if (nvme_ns != NULL && prev != NULL) { 4871 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4872 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4873 } 4874 4875 return nvme_ns; 4876 } 4877 4878 /* This function supports only multipath mode. There is only a single I/O path 4879 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4880 * head of the I/O path list for each NVMe bdev channel. 
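 * (I/O path lookup walks io_path_list in order, so the entry at the head is tried first.)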
4881 * 4882 * NVMe bdev channel may be acquired after completing this function. move the 4883 * matched namespace to the head of the namespace list for the NVMe bdev too. 4884 */ 4885 void 4886 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4887 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4888 { 4889 struct bdev_nvme_set_preferred_path_ctx *ctx; 4890 struct spdk_bdev *bdev; 4891 struct nvme_bdev *nbdev; 4892 int rc = 0; 4893 4894 assert(cb_fn != NULL); 4895 4896 ctx = calloc(1, sizeof(*ctx)); 4897 if (ctx == NULL) { 4898 SPDK_ERRLOG("Failed to alloc context.\n"); 4899 rc = -ENOMEM; 4900 goto err_alloc; 4901 } 4902 4903 ctx->cb_fn = cb_fn; 4904 ctx->cb_arg = cb_arg; 4905 4906 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4907 if (rc != 0) { 4908 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4909 goto err_open; 4910 } 4911 4912 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4913 4914 if (bdev->module != &nvme_if) { 4915 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4916 rc = -ENODEV; 4917 goto err_bdev; 4918 } 4919 4920 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4921 4922 pthread_mutex_lock(&nbdev->mutex); 4923 4924 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4925 if (ctx->nvme_ns == NULL) { 4926 pthread_mutex_unlock(&nbdev->mutex); 4927 4928 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4929 rc = -ENODEV; 4930 goto err_bdev; 4931 } 4932 4933 pthread_mutex_unlock(&nbdev->mutex); 4934 4935 spdk_for_each_channel(nbdev, 4936 _bdev_nvme_set_preferred_path, 4937 ctx, 4938 bdev_nvme_set_preferred_path_done); 4939 return; 4940 4941 err_bdev: 4942 spdk_bdev_close(ctx->desc); 4943 err_open: 4944 free(ctx); 4945 err_alloc: 4946 cb_fn(cb_arg, rc); 4947 } 4948 4949 struct bdev_nvme_set_multipath_policy_ctx { 4950 struct spdk_bdev_desc *desc; 4951 bdev_nvme_set_multipath_policy_cb cb_fn; 4952 void *cb_arg; 4953 }; 4954 4955 static void 4956 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4957 { 4958 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4959 4960 assert(ctx != NULL); 4961 assert(ctx->desc != NULL); 4962 assert(ctx->cb_fn != NULL); 4963 4964 spdk_bdev_close(ctx->desc); 4965 4966 ctx->cb_fn(ctx->cb_arg, status); 4967 4968 free(ctx); 4969 } 4970 4971 static void 4972 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4973 { 4974 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4975 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4976 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4977 4978 nbdev_ch->mp_policy = nbdev->mp_policy; 4979 nbdev_ch->mp_selector = nbdev->mp_selector; 4980 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4981 bdev_nvme_clear_current_io_path(nbdev_ch); 4982 4983 spdk_for_each_channel_continue(i, 0); 4984 } 4985 4986 void 4987 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4988 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 4989 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4990 { 4991 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4992 struct spdk_bdev *bdev; 4993 struct nvme_bdev *nbdev; 4994 int rc; 4995 4996 assert(cb_fn != NULL); 4997 4998 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4999 if (rr_min_io == UINT32_MAX) { 5000 rr_min_io = 1; 5001 } else if (rr_min_io == 0) { 5002 rc = -EINVAL; 
5003 goto exit; 5004 } 5005 } else if (rr_min_io != UINT32_MAX) { 5006 rc = -EINVAL; 5007 goto exit; 5008 } 5009 5010 ctx = calloc(1, sizeof(*ctx)); 5011 if (ctx == NULL) { 5012 SPDK_ERRLOG("Failed to alloc context.\n"); 5013 rc = -ENOMEM; 5014 goto exit; 5015 } 5016 5017 ctx->cb_fn = cb_fn; 5018 ctx->cb_arg = cb_arg; 5019 5020 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5021 if (rc != 0) { 5022 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5023 rc = -ENODEV; 5024 goto err_open; 5025 } 5026 5027 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5028 if (bdev->module != &nvme_if) { 5029 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5030 rc = -ENODEV; 5031 goto err_module; 5032 } 5033 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5034 5035 pthread_mutex_lock(&nbdev->mutex); 5036 nbdev->mp_policy = policy; 5037 nbdev->mp_selector = selector; 5038 nbdev->rr_min_io = rr_min_io; 5039 pthread_mutex_unlock(&nbdev->mutex); 5040 5041 spdk_for_each_channel(nbdev, 5042 _bdev_nvme_set_multipath_policy, 5043 ctx, 5044 bdev_nvme_set_multipath_policy_done); 5045 return; 5046 5047 err_module: 5048 spdk_bdev_close(ctx->desc); 5049 err_open: 5050 free(ctx); 5051 exit: 5052 cb_fn(cb_arg, rc); 5053 } 5054 5055 static void 5056 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5057 { 5058 struct nvme_ctrlr *nvme_ctrlr = arg; 5059 union spdk_nvme_async_event_completion event; 5060 5061 if (spdk_nvme_cpl_is_error(cpl)) { 5062 SPDK_WARNLOG("AER request execute failed\n"); 5063 return; 5064 } 5065 5066 event.raw = cpl->cdw0; 5067 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5068 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5069 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5070 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5071 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5072 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5073 } 5074 } 5075 5076 static void 5077 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5078 { 5079 if (ctx->cb_fn) { 5080 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5081 } 5082 5083 ctx->namespaces_populated = true; 5084 if (ctx->probe_done) { 5085 /* The probe was already completed, so we need to free the context 5086 * here. This can happen for cases like OCSSD, where we need to 5087 * send additional commands to the SSD after attach. 
5088 */ 5089 free(ctx); 5090 } 5091 } 5092 5093 static void 5094 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5095 struct nvme_async_probe_ctx *ctx) 5096 { 5097 spdk_io_device_register(nvme_ctrlr, 5098 bdev_nvme_create_ctrlr_channel_cb, 5099 bdev_nvme_destroy_ctrlr_channel_cb, 5100 sizeof(struct nvme_ctrlr_channel), 5101 nvme_ctrlr->nbdev_ctrlr->name); 5102 5103 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5104 } 5105 5106 static void 5107 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5108 { 5109 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5110 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5111 5112 nvme_ctrlr->probe_ctx = NULL; 5113 5114 if (spdk_nvme_cpl_is_error(cpl)) { 5115 nvme_ctrlr_delete(nvme_ctrlr); 5116 5117 if (ctx != NULL) { 5118 ctx->reported_bdevs = 0; 5119 populate_namespaces_cb(ctx, -1); 5120 } 5121 return; 5122 } 5123 5124 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5125 } 5126 5127 static int 5128 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5129 struct nvme_async_probe_ctx *ctx) 5130 { 5131 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5132 const struct spdk_nvme_ctrlr_data *cdata; 5133 uint32_t ana_log_page_size; 5134 5135 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5136 5137 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5138 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5139 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5140 sizeof(uint32_t); 5141 5142 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5143 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5144 if (nvme_ctrlr->ana_log_page == NULL) { 5145 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5146 return -ENXIO; 5147 } 5148 5149 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5150 * Hence copy each descriptor to a temporary area when parsing it. 5151 * 5152 * Allocate a buffer whose size is as large as ANA log page buffer because 5153 * we do not know the size of a descriptor until actually reading it. 5154 */ 5155 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5156 if (nvme_ctrlr->copied_ana_desc == NULL) { 5157 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5158 return -ENOMEM; 5159 } 5160 5161 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5162 5163 nvme_ctrlr->probe_ctx = ctx; 5164 5165 /* Then, set the read size only to include the current active namespaces. */ 5166 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5167 5168 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5169 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5170 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5171 return -EINVAL; 5172 } 5173 5174 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5175 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5176 SPDK_NVME_GLOBAL_NS_TAG, 5177 nvme_ctrlr->ana_log_page, 5178 ana_log_page_size, 0, 5179 nvme_ctrlr_init_ana_log_page_done, 5180 nvme_ctrlr); 5181 } 5182 5183 /* hostnqn and subnqn were already verified before attaching a controller. 5184 * Hence check only the multipath capability and cntlid here. 
5185 */ 5186 static bool 5187 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5188 { 5189 struct nvme_ctrlr *tmp; 5190 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5191 5192 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5193 5194 if (!cdata->cmic.multi_ctrlr) { 5195 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5196 return false; 5197 } 5198 5199 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5200 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5201 5202 if (!tmp_cdata->cmic.multi_ctrlr) { 5203 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5204 return false; 5205 } 5206 if (cdata->cntlid == tmp_cdata->cntlid) { 5207 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5208 return false; 5209 } 5210 } 5211 5212 return true; 5213 } 5214 5215 static int 5216 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5217 { 5218 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5219 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5220 int rc = 0; 5221 5222 pthread_mutex_lock(&g_bdev_nvme_mutex); 5223 5224 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5225 if (nbdev_ctrlr != NULL) { 5226 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5227 rc = -EINVAL; 5228 goto exit; 5229 } 5230 } else { 5231 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5232 if (nbdev_ctrlr == NULL) { 5233 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5234 rc = -ENOMEM; 5235 goto exit; 5236 } 5237 nbdev_ctrlr->name = strdup(name); 5238 if (nbdev_ctrlr->name == NULL) { 5239 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5240 free(nbdev_ctrlr); 5241 goto exit; 5242 } 5243 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5244 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5245 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5246 } 5247 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5248 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5249 exit: 5250 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5251 return rc; 5252 } 5253 5254 static int 5255 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5256 const char *name, 5257 const struct spdk_nvme_transport_id *trid, 5258 struct nvme_async_probe_ctx *ctx) 5259 { 5260 struct nvme_ctrlr *nvme_ctrlr; 5261 struct nvme_path_id *path_id; 5262 const struct spdk_nvme_ctrlr_data *cdata; 5263 int rc; 5264 5265 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5266 if (nvme_ctrlr == NULL) { 5267 SPDK_ERRLOG("Failed to allocate device struct\n"); 5268 return -ENOMEM; 5269 } 5270 5271 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5272 if (rc != 0) { 5273 free(nvme_ctrlr); 5274 return rc; 5275 } 5276 5277 TAILQ_INIT(&nvme_ctrlr->trids); 5278 5279 RB_INIT(&nvme_ctrlr->namespaces); 5280 5281 path_id = calloc(1, sizeof(*path_id)); 5282 if (path_id == NULL) { 5283 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5284 rc = -ENOMEM; 5285 goto err; 5286 } 5287 5288 path_id->trid = *trid; 5289 if (ctx != NULL) { 5290 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5291 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5292 } 5293 nvme_ctrlr->active_path_id = path_id; 5294 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5295 5296 nvme_ctrlr->thread = spdk_get_thread(); 5297 nvme_ctrlr->ctrlr = ctrlr; 5298 nvme_ctrlr->ref = 1; 5299 5300 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5301 SPDK_ERRLOG("OCSSDs are not supported"); 5302 rc = -ENOTSUP; 5303 goto err; 5304 } 5305 5306 
if (ctx != NULL) { 5307 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5308 } else { 5309 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5310 } 5311 5312 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5313 g_opts.nvme_adminq_poll_period_us); 5314 5315 if (g_opts.timeout_us > 0) { 5316 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5317 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5318 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5319 g_opts.timeout_us : g_opts.timeout_admin_us; 5320 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5321 adm_timeout_us, timeout_cb, nvme_ctrlr); 5322 } 5323 5324 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5325 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5326 5327 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5328 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5329 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5330 } 5331 5332 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5333 if (rc != 0) { 5334 goto err; 5335 } 5336 5337 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5338 5339 if (cdata->cmic.ana_reporting) { 5340 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5341 if (rc == 0) { 5342 return 0; 5343 } 5344 } else { 5345 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5346 return 0; 5347 } 5348 5349 err: 5350 nvme_ctrlr_delete(nvme_ctrlr); 5351 return rc; 5352 } 5353 5354 void 5355 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5356 { 5357 opts->prchk_flags = 0; 5358 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5359 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5360 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5361 } 5362 5363 static void 5364 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5365 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5366 { 5367 char *name; 5368 5369 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5370 if (!name) { 5371 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5372 return; 5373 } 5374 5375 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5376 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5377 } else { 5378 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5379 } 5380 5381 free(name); 5382 } 5383 5384 static void 5385 _nvme_ctrlr_destruct(void *ctx) 5386 { 5387 struct nvme_ctrlr *nvme_ctrlr = ctx; 5388 5389 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5390 nvme_ctrlr_release(nvme_ctrlr); 5391 } 5392 5393 static int 5394 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5395 { 5396 struct nvme_probe_skip_entry *entry; 5397 5398 /* The controller's destruction was already started */ 5399 if (nvme_ctrlr->destruct) { 5400 return -EALREADY; 5401 } 5402 5403 if (!hotplug && 5404 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5405 entry = calloc(1, sizeof(*entry)); 5406 if (!entry) { 5407 return -ENOMEM; 5408 } 5409 entry->trid = nvme_ctrlr->active_path_id->trid; 5410 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5411 } 5412 5413 nvme_ctrlr->destruct = true; 5414 return 0; 5415 } 5416 5417 static int 5418 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5419 { 5420 int rc; 5421 5422 pthread_mutex_lock(&nvme_ctrlr->mutex); 5423 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5424 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5425 5426 if (rc == 0) { 5427 _nvme_ctrlr_destruct(nvme_ctrlr); 5428 } else if (rc == -EALREADY) { 5429 rc = 0; 5430 } 5431 5432 return rc; 5433 } 5434 5435 static void 5436 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5437 { 5438 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5439 5440 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5441 } 5442 5443 static int 5444 bdev_nvme_hotplug_probe(void *arg) 5445 { 5446 if (g_hotplug_probe_ctx == NULL) { 5447 spdk_poller_unregister(&g_hotplug_probe_poller); 5448 return SPDK_POLLER_IDLE; 5449 } 5450 5451 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5452 g_hotplug_probe_ctx = NULL; 5453 spdk_poller_unregister(&g_hotplug_probe_poller); 5454 } 5455 5456 return SPDK_POLLER_BUSY; 5457 } 5458 5459 static int 5460 bdev_nvme_hotplug(void *arg) 5461 { 5462 struct spdk_nvme_transport_id trid_pcie; 5463 5464 if (g_hotplug_probe_ctx) { 5465 return SPDK_POLLER_BUSY; 5466 } 5467 5468 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5469 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5470 5471 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5472 hotplug_probe_cb, attach_cb, NULL); 5473 5474 if (g_hotplug_probe_ctx) { 5475 assert(g_hotplug_probe_poller == NULL); 5476 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5477 } 5478 5479 return SPDK_POLLER_BUSY; 5480 } 5481 5482 void 5483 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5484 { 5485 *opts = g_opts; 5486 } 5487 5488 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5489 uint32_t reconnect_delay_sec, 5490 uint32_t fast_io_fail_timeout_sec); 5491 5492 static int 5493 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5494 { 5495 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5496 /* Can't set timeout_admin_us without also setting timeout_us */ 5497 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5498 return -EINVAL; 5499 } 5500 5501 if (opts->bdev_retry_count < -1) { 5502 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5503 return -EINVAL; 5504 } 5505 5506 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5507 opts->reconnect_delay_sec, 5508 opts->fast_io_fail_timeout_sec)) { 5509 return -EINVAL; 5510 } 5511 5512 return 0; 5513 } 5514 5515 int 5516 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5517 { 5518 int ret; 5519 5520 ret = bdev_nvme_validate_opts(opts); 5521 if (ret) { 5522 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5523 return ret; 5524 } 5525 5526 if (g_bdev_nvme_init_thread != NULL) { 5527 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5528 return -EPERM; 5529 } 5530 } 5531 5532 if (opts->rdma_srq_size != 0) { 5533 struct spdk_nvme_transport_opts drv_opts; 5534 5535 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5536 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5537 5538 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5539 if (ret) { 5540 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5541 return ret; 5542 } 5543 } 5544 5545 g_opts = *opts; 5546 5547 return 0; 5548 } 5549 5550 struct set_nvme_hotplug_ctx { 5551 uint64_t period_us; 5552 bool enabled; 5553 spdk_msg_fn fn; 5554 void *fn_ctx; 5555 }; 5556 5557 static void 5558 set_nvme_hotplug_period_cb(void *_ctx) 5559 { 5560 struct set_nvme_hotplug_ctx *ctx 
= _ctx;

	spdk_poller_unregister(&g_hotplug_poller);
	if (ctx->enabled) {
		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
	}

	g_nvme_hotplug_poll_period_us = ctx->period_us;
	g_nvme_hotplug_enabled = ctx->enabled;
	if (ctx->fn) {
		ctx->fn(ctx->fn_ctx);
	}

	free(ctx);
}

int
bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
{
	struct set_nvme_hotplug_ctx *ctx;

	if (enabled == true && !spdk_process_is_primary()) {
		return -EPERM;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
	ctx->enabled = enabled;
	ctx->fn = cb;
	ctx->fn_ctx = cb_ctx;

	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
	return 0;
}

static void
nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
				    struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ns *nvme_ns;
	struct nvme_bdev *nvme_bdev;
	size_t j;

	assert(nvme_ctrlr != NULL);

	if (ctx->names == NULL) {
		ctx->reported_bdevs = 0;
		populate_namespaces_cb(ctx, 0);
		return;
	}

	/*
	 * Report the new bdevs that were created in this call.
	 * There can be more than one bdev per NVMe controller.
	 */
	j = 0;
	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		nvme_bdev = nvme_ns->bdev;
		if (j < ctx->max_bdevs) {
			ctx->names[j] = nvme_bdev->disk.name;
			j++;
		} else {
			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
				    ctx->max_bdevs);
			ctx->reported_bdevs = 0;
			populate_namespaces_cb(ctx, -ERANGE);
			return;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	ctx->reported_bdevs = j;
	populate_namespaces_cb(ctx, 0);
}

static int
bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			       struct spdk_nvme_ctrlr *new_ctrlr,
			       struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *tmp_trid;

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		SPDK_ERRLOG("PCIe failover is not supported.\n");
		return -ENOTSUP;
	}

	/* Currently we only support failover to the same transport type. */
	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
		SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n",
			     spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
			     spdk_nvme_transport_id_trtype_str(trid->trtype));
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
			     nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr,
				     trid->subnqn);
			return -EEXIST;
		}
	}

	return 0;
}

static int
bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
				    struct spdk_nvme_ctrlr *new_ctrlr)
{
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;

	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid so that it is not replaced until it actually fails. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* A trid is considered failed if its last_failed_tsc is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This is the case that a secondary path is added to an existing
 * nvme_ctrlr for failover. After checking if it can access the same
 * namespaces as the primary path, it is disconnected until failover occurs.
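 * Only the transport ID is stored; spdk_nvme_detach() below tears down the temporary
 * connection that was used for these checks.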
5744 */ 5745 static int 5746 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5747 struct spdk_nvme_ctrlr *new_ctrlr, 5748 struct spdk_nvme_transport_id *trid) 5749 { 5750 int rc; 5751 5752 assert(nvme_ctrlr != NULL); 5753 5754 pthread_mutex_lock(&nvme_ctrlr->mutex); 5755 5756 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5757 if (rc != 0) { 5758 goto exit; 5759 } 5760 5761 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5762 if (rc != 0) { 5763 goto exit; 5764 } 5765 5766 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5767 5768 exit: 5769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5770 5771 spdk_nvme_detach(new_ctrlr); 5772 5773 return rc; 5774 } 5775 5776 static void 5777 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5778 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5779 { 5780 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5781 struct nvme_async_probe_ctx *ctx; 5782 int rc; 5783 5784 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5785 ctx->ctrlr_attached = true; 5786 5787 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5788 if (rc != 0) { 5789 ctx->reported_bdevs = 0; 5790 populate_namespaces_cb(ctx, rc); 5791 } 5792 } 5793 5794 static void 5795 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5796 struct spdk_nvme_ctrlr *ctrlr, 5797 const struct spdk_nvme_ctrlr_opts *opts) 5798 { 5799 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5800 struct nvme_ctrlr *nvme_ctrlr; 5801 struct nvme_async_probe_ctx *ctx; 5802 int rc; 5803 5804 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5805 ctx->ctrlr_attached = true; 5806 5807 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5808 if (nvme_ctrlr) { 5809 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5810 } else { 5811 rc = -ENODEV; 5812 } 5813 5814 ctx->reported_bdevs = 0; 5815 populate_namespaces_cb(ctx, rc); 5816 } 5817 5818 static int 5819 bdev_nvme_async_poll(void *arg) 5820 { 5821 struct nvme_async_probe_ctx *ctx = arg; 5822 int rc; 5823 5824 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5825 if (spdk_unlikely(rc != -EAGAIN)) { 5826 ctx->probe_done = true; 5827 spdk_poller_unregister(&ctx->poller); 5828 if (!ctx->ctrlr_attached) { 5829 /* The probe is done, but no controller was attached. 5830 * That means we had a failure, so report -EIO back to 5831 * the caller (usually the RPC). populate_namespaces_cb() 5832 * will take care of freeing the nvme_async_probe_ctx. 5833 */ 5834 ctx->reported_bdevs = 0; 5835 populate_namespaces_cb(ctx, -EIO); 5836 } else if (ctx->namespaces_populated) { 5837 /* The namespaces for the attached controller were all 5838 * populated and the response was already sent to the 5839 * caller (usually the RPC). So free the context here. 
*/ 5841 free(ctx); 5842 } 5843 } 5844 5845 return SPDK_POLLER_BUSY; 5846 } 5847 5848 static bool 5849 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5850 uint32_t reconnect_delay_sec, 5851 uint32_t fast_io_fail_timeout_sec) 5852 { 5853 if (ctrlr_loss_timeout_sec < -1) { 5854 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5855 return false; 5856 } else if (ctrlr_loss_timeout_sec == -1) { 5857 if (reconnect_delay_sec == 0) { 5858 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5859 return false; 5860 } else if (fast_io_fail_timeout_sec != 0 && 5861 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5862 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5863 return false; 5864 } 5865 } else if (ctrlr_loss_timeout_sec != 0) { 5866 if (reconnect_delay_sec == 0) { 5867 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5868 return false; 5869 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5870 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5871 return false; 5872 } else if (fast_io_fail_timeout_sec != 0) { 5873 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5874 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5875 return false; 5876 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5877 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5878 return false; 5879 } 5880 } 5881 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5882 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5883 return false; 5884 } 5885 5886 return true; 5887 } 5888 5889 int 5890 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5891 const char *base_name, 5892 const char **names, 5893 uint32_t count, 5894 spdk_bdev_create_nvme_fn cb_fn, 5895 void *cb_ctx, 5896 struct spdk_nvme_ctrlr_opts *drv_opts, 5897 struct nvme_ctrlr_opts *bdev_opts, 5898 bool multipath) 5899 { 5900 struct nvme_probe_skip_entry *entry, *tmp; 5901 struct nvme_async_probe_ctx *ctx; 5902 spdk_nvme_attach_cb attach_cb; 5903 5904 /* TODO expand this check to include both the host and target TRIDs. 5905 * Only if both are the same should we fail.
5906 */ 5907 if (nvme_ctrlr_get(trid) != NULL) { 5908 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5909 return -EEXIST; 5910 } 5911 5912 if (bdev_opts != NULL && 5913 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5914 bdev_opts->reconnect_delay_sec, 5915 bdev_opts->fast_io_fail_timeout_sec)) { 5916 return -EINVAL; 5917 } 5918 5919 ctx = calloc(1, sizeof(*ctx)); 5920 if (!ctx) { 5921 return -ENOMEM; 5922 } 5923 ctx->base_name = base_name; 5924 ctx->names = names; 5925 ctx->max_bdevs = count; 5926 ctx->cb_fn = cb_fn; 5927 ctx->cb_ctx = cb_ctx; 5928 ctx->trid = *trid; 5929 5930 if (bdev_opts) { 5931 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5932 } else { 5933 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5934 } 5935 5936 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5937 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5938 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5939 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5940 free(entry); 5941 break; 5942 } 5943 } 5944 } 5945 5946 if (drv_opts) { 5947 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5948 } else { 5949 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5950 } 5951 5952 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5953 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5954 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5955 ctx->drv_opts.disable_read_ana_log_page = true; 5956 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5957 5958 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5959 attach_cb = connect_attach_cb; 5960 } else { 5961 attach_cb = connect_set_failover_cb; 5962 } 5963 5964 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5965 if (ctx->probe_ctx == NULL) { 5966 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5967 free(ctx); 5968 return -ENODEV; 5969 } 5970 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5971 5972 return 0; 5973 } 5974 5975 struct bdev_nvme_delete_ctx { 5976 char *name; 5977 struct nvme_path_id path_id; 5978 bdev_nvme_delete_done_fn delete_done; 5979 void *delete_done_ctx; 5980 uint64_t timeout_ticks; 5981 struct spdk_poller *poller; 5982 }; 5983 5984 static void 5985 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 5986 { 5987 if (ctx != NULL) { 5988 free(ctx->name); 5989 free(ctx); 5990 } 5991 } 5992 5993 static bool 5994 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 5995 { 5996 if (path_id->trid.trtype != 0) { 5997 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5998 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5999 return false; 6000 } 6001 } else { 6002 if (path_id->trid.trtype != p->trid.trtype) { 6003 return false; 6004 } 6005 } 6006 } 6007 6008 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6009 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6010 return false; 6011 } 6012 } 6013 6014 if (path_id->trid.adrfam != 0) { 6015 if (path_id->trid.adrfam != p->trid.adrfam) { 6016 return false; 6017 } 6018 } 6019 6020 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6021 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6022 return false; 6023 } 6024 } 6025 6026 if 
(!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6027 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6028 return false; 6029 } 6030 } 6031 6032 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6033 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6034 return false; 6035 } 6036 } 6037 6038 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6039 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6040 return false; 6041 } 6042 } 6043 6044 return true; 6045 } 6046 6047 static bool 6048 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6049 { 6050 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6051 struct nvme_ctrlr *ctrlr; 6052 struct nvme_path_id *p; 6053 6054 pthread_mutex_lock(&g_bdev_nvme_mutex); 6055 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6056 if (!nbdev_ctrlr) { 6057 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6058 return false; 6059 } 6060 6061 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6062 pthread_mutex_lock(&ctrlr->mutex); 6063 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6064 if (nvme_path_id_compare(p, path_id)) { 6065 pthread_mutex_unlock(&ctrlr->mutex); 6066 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6067 return true; 6068 } 6069 } 6070 pthread_mutex_unlock(&ctrlr->mutex); 6071 } 6072 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6073 6074 return false; 6075 } 6076 6077 static int 6078 bdev_nvme_delete_complete_poll(void *arg) 6079 { 6080 struct bdev_nvme_delete_ctx *ctx = arg; 6081 int rc = 0; 6082 6083 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6084 if (ctx->timeout_ticks > spdk_get_ticks()) { 6085 return SPDK_POLLER_BUSY; 6086 } 6087 6088 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6089 rc = -ETIMEDOUT; 6090 } 6091 6092 spdk_poller_unregister(&ctx->poller); 6093 6094 ctx->delete_done(ctx->delete_done_ctx, rc); 6095 free_bdev_nvme_delete_ctx(ctx); 6096 6097 return SPDK_POLLER_BUSY; 6098 } 6099 6100 static int 6101 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6102 { 6103 struct nvme_path_id *p, *t; 6104 spdk_msg_fn msg_fn; 6105 int rc = -ENXIO; 6106 6107 pthread_mutex_lock(&nvme_ctrlr->mutex); 6108 6109 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6110 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6111 break; 6112 } 6113 6114 if (!nvme_path_id_compare(p, path_id)) { 6115 continue; 6116 } 6117 6118 /* We are not using the specified path. */ 6119 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6120 free(p); 6121 rc = 0; 6122 } 6123 6124 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6125 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6126 return rc; 6127 } 6128 6129 /* If we made it here, then this path is a match! Now we need to remove it. */ 6130 6131 /* This is the active path in use right now. The active path is always the first in the list. */ 6132 assert(p == nvme_ctrlr->active_path_id); 6133 6134 if (!TAILQ_NEXT(p, link)) { 6135 /* The current path is the only path. */ 6136 msg_fn = _nvme_ctrlr_destruct; 6137 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6138 } else { 6139 /* There is an alternative path. 
*/ 6140 msg_fn = _bdev_nvme_reset_ctrlr; 6141 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6142 } 6143 6144 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6145 6146 if (rc == 0) { 6147 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6148 } else if (rc == -EALREADY) { 6149 rc = 0; 6150 } 6151 6152 return rc; 6153 } 6154 6155 int 6156 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6157 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6158 { 6159 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6160 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6161 struct bdev_nvme_delete_ctx *ctx = NULL; 6162 int rc = -ENXIO, _rc; 6163 6164 if (name == NULL || path_id == NULL) { 6165 rc = -EINVAL; 6166 goto exit; 6167 } 6168 6169 pthread_mutex_lock(&g_bdev_nvme_mutex); 6170 6171 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6172 if (nbdev_ctrlr == NULL) { 6173 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6174 6175 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6176 rc = -ENODEV; 6177 goto exit; 6178 } 6179 6180 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6181 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6182 if (_rc < 0 && _rc != -ENXIO) { 6183 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6184 rc = _rc; 6185 goto exit; 6186 } else if (_rc == 0) { 6187 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6188 * was deleted successfully. To remember the successful deletion, 6189 * overwrite rc only if _rc is zero. 6190 */ 6191 rc = 0; 6192 } 6193 } 6194 6195 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6196 6197 if (rc != 0 || delete_done == NULL) { 6198 goto exit; 6199 } 6200 6201 ctx = calloc(1, sizeof(*ctx)); 6202 if (ctx == NULL) { 6203 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6204 rc = -ENOMEM; 6205 goto exit; 6206 } 6207 6208 ctx->name = strdup(name); 6209 if (ctx->name == NULL) { 6210 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6211 rc = -ENOMEM; 6212 goto exit; 6213 } 6214 6215 ctx->delete_done = delete_done; 6216 ctx->delete_done_ctx = delete_done_ctx; 6217 ctx->path_id = *path_id; 6218 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6219 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6220 if (ctx->poller == NULL) { 6221 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6222 rc = -ENOMEM; 6223 goto exit; 6224 } 6225 6226 exit: 6227 if (rc != 0) { 6228 free_bdev_nvme_delete_ctx(ctx); 6229 } 6230 6231 return rc; 6232 } 6233 6234 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6235 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6236 6237 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6238 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6239 6240 struct discovery_entry_ctx { 6241 char name[128]; 6242 struct spdk_nvme_transport_id trid; 6243 struct spdk_nvme_ctrlr_opts drv_opts; 6244 struct spdk_nvmf_discovery_log_page_entry entry; 6245 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6246 struct discovery_ctx *ctx; 6247 }; 6248 6249 struct discovery_ctx { 6250 char *name; 6251 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6252 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6253 void *cb_ctx; 6254 struct spdk_nvme_probe_ctx *probe_ctx; 6255 struct spdk_nvme_detach_ctx *detach_ctx; 6256 struct spdk_nvme_ctrlr *ctrlr; 6257 struct spdk_nvme_transport_id trid; 6258 struct discovery_entry_ctx *entry_ctx_in_use; 6259 struct spdk_poller *poller; 6260 struct spdk_nvme_ctrlr_opts drv_opts; 6261 struct nvme_ctrlr_opts bdev_opts; 6262 struct spdk_nvmf_discovery_log_page *log_page; 6263 TAILQ_ENTRY(discovery_ctx) tailq; 6264 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6265 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6266 int rc; 6267 bool wait_for_attach; 6268 uint64_t timeout_ticks; 6269 /* Denotes that the discovery service is being started. We're waiting 6270 * for the initial connection to the discovery controller to be 6271 * established and attach discovered NVM ctrlrs. 6272 */ 6273 bool initializing; 6274 /* Denotes if a discovery is currently in progress for this context. 6275 * That includes connecting to newly discovered subsystems. Used to 6276 * ensure we do not start a new discovery until an existing one is 6277 * complete. 6278 */ 6279 bool in_progress; 6280 6281 /* Denotes if another discovery is needed after the one in progress 6282 * completes. Set when we receive an AER completion while a discovery 6283 * is already in progress. 6284 */ 6285 bool pending; 6286 6287 /* Signal to the discovery context poller that it should stop the 6288 * discovery service, including detaching from the current discovery 6289 * controller. 6290 */ 6291 bool stop; 6292 6293 struct spdk_thread *calling_thread; 6294 uint32_t index; 6295 uint32_t attach_in_progress; 6296 char *hostnqn; 6297 6298 /* Denotes if the discovery service was started by the mdns discovery. 
6299 */ 6300 bool from_mdns_discovery_service; 6301 }; 6302 6303 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6304 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6305 6306 static void get_discovery_log_page(struct discovery_ctx *ctx); 6307 6308 static void 6309 free_discovery_ctx(struct discovery_ctx *ctx) 6310 { 6311 free(ctx->log_page); 6312 free(ctx->hostnqn); 6313 free(ctx->name); 6314 free(ctx); 6315 } 6316 6317 static void 6318 discovery_complete(struct discovery_ctx *ctx) 6319 { 6320 ctx->initializing = false; 6321 ctx->in_progress = false; 6322 if (ctx->pending) { 6323 ctx->pending = false; 6324 get_discovery_log_page(ctx); 6325 } 6326 } 6327 6328 static void 6329 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6330 struct spdk_nvmf_discovery_log_page_entry *entry) 6331 { 6332 char *space; 6333 6334 trid->trtype = entry->trtype; 6335 trid->adrfam = entry->adrfam; 6336 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6337 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6338 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6339 * before call to this function trid->subnqn is zeroed out, we need 6340 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6341 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6342 */ 6343 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6344 6345 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6346 * But the log page entries typically pad them with spaces, not zeroes. 6347 * So add a NULL terminator to each of these fields at the appropriate 6348 * location. 6349 */ 6350 space = strchr(trid->traddr, ' '); 6351 if (space) { 6352 *space = 0; 6353 } 6354 space = strchr(trid->trsvcid, ' '); 6355 if (space) { 6356 *space = 0; 6357 } 6358 space = strchr(trid->subnqn, ' '); 6359 if (space) { 6360 *space = 0; 6361 } 6362 } 6363 6364 static void 6365 _stop_discovery(void *_ctx) 6366 { 6367 struct discovery_ctx *ctx = _ctx; 6368 6369 if (ctx->attach_in_progress > 0) { 6370 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6371 return; 6372 } 6373 6374 ctx->stop = true; 6375 6376 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6377 struct discovery_entry_ctx *entry_ctx; 6378 struct nvme_path_id path = {}; 6379 6380 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6381 path.trid = entry_ctx->trid; 6382 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6383 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6384 free(entry_ctx); 6385 } 6386 6387 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6388 struct discovery_entry_ctx *entry_ctx; 6389 6390 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6391 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6392 free(entry_ctx); 6393 } 6394 6395 free(ctx->entry_ctx_in_use); 6396 ctx->entry_ctx_in_use = NULL; 6397 } 6398 6399 static void 6400 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6401 { 6402 ctx->stop_cb_fn = cb_fn; 6403 ctx->cb_ctx = cb_ctx; 6404 6405 if (ctx->attach_in_progress > 0) { 6406 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6407 ctx->attach_in_progress); 6408 } 6409 6410 _stop_discovery(ctx); 6411 } 6412 6413 static void 6414 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6415 { 6416 struct discovery_ctx *d_ctx; 6417 struct nvme_path_id *path_id; 6418 struct spdk_nvme_transport_id 
trid = {}; 6419 struct discovery_entry_ctx *entry_ctx, *tmp; 6420 6421 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6422 6423 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6424 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6425 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6426 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6427 continue; 6428 } 6429 6430 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6431 free(entry_ctx); 6432 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6433 trid.subnqn, trid.traddr, trid.trsvcid); 6434 6435 /* Fail discovery ctrlr to force reattach attempt */ 6436 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6437 } 6438 } 6439 } 6440 6441 static void 6442 discovery_remove_controllers(struct discovery_ctx *ctx) 6443 { 6444 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6445 struct discovery_entry_ctx *entry_ctx, *tmp; 6446 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6447 struct spdk_nvme_transport_id old_trid = {}; 6448 uint64_t numrec, i; 6449 bool found; 6450 6451 numrec = from_le64(&log_page->numrec); 6452 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6453 found = false; 6454 old_entry = &entry_ctx->entry; 6455 build_trid_from_log_page_entry(&old_trid, old_entry); 6456 for (i = 0; i < numrec; i++) { 6457 new_entry = &log_page->entries[i]; 6458 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6459 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6460 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6461 found = true; 6462 break; 6463 } 6464 } 6465 if (!found) { 6466 struct nvme_path_id path = {}; 6467 6468 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6469 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6470 6471 path.trid = entry_ctx->trid; 6472 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6473 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6474 free(entry_ctx); 6475 } 6476 } 6477 free(log_page); 6478 ctx->log_page = NULL; 6479 discovery_complete(ctx); 6480 } 6481 6482 static void 6483 complete_discovery_start(struct discovery_ctx *ctx, int status) 6484 { 6485 ctx->timeout_ticks = 0; 6486 ctx->rc = status; 6487 if (ctx->start_cb_fn) { 6488 ctx->start_cb_fn(ctx->cb_ctx, status); 6489 ctx->start_cb_fn = NULL; 6490 ctx->cb_ctx = NULL; 6491 } 6492 } 6493 6494 static void 6495 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6496 { 6497 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6498 struct discovery_ctx *ctx = entry_ctx->ctx; 6499 6500 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6501 ctx->attach_in_progress--; 6502 if (ctx->attach_in_progress == 0) { 6503 complete_discovery_start(ctx, ctx->rc); 6504 if (ctx->initializing && ctx->rc != 0) { 6505 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6506 stop_discovery(ctx, NULL, ctx->cb_ctx); 6507 } else { 6508 discovery_remove_controllers(ctx); 6509 } 6510 } 6511 } 6512 6513 static struct discovery_entry_ctx * 6514 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6515 { 6516 struct discovery_entry_ctx *new_ctx; 6517 6518 new_ctx = calloc(1, sizeof(*new_ctx)); 6519 if (new_ctx == NULL) { 6520 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6521 return NULL; 6522 } 6523 6524 new_ctx->ctx = ctx; 6525 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6526 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
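/* Each discovery entry carries its own controller opts, initialized to the defaults here; the discovery service's hostnqn is copied in below so that any connection made from this entry presents the same host identity as the discovery connection itself. */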
6527 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6528 return new_ctx; 6529 } 6530 6531 static void 6532 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6533 struct spdk_nvmf_discovery_log_page *log_page) 6534 { 6535 struct discovery_ctx *ctx = cb_arg; 6536 struct discovery_entry_ctx *entry_ctx, *tmp; 6537 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6538 uint64_t numrec, i; 6539 bool found; 6540 6541 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6542 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6543 return; 6544 } 6545 6546 ctx->log_page = log_page; 6547 assert(ctx->attach_in_progress == 0); 6548 numrec = from_le64(&log_page->numrec); 6549 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6550 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6551 free(entry_ctx); 6552 } 6553 for (i = 0; i < numrec; i++) { 6554 found = false; 6555 new_entry = &log_page->entries[i]; 6556 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6557 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6558 struct discovery_entry_ctx *new_ctx; 6559 struct spdk_nvme_transport_id trid = {}; 6560 6561 build_trid_from_log_page_entry(&trid, new_entry); 6562 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6563 if (new_ctx == NULL) { 6564 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6565 break; 6566 } 6567 6568 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6569 continue; 6570 } 6571 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6572 old_entry = &entry_ctx->entry; 6573 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6574 found = true; 6575 break; 6576 } 6577 } 6578 if (!found) { 6579 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6580 struct discovery_ctx *d_ctx; 6581 6582 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6583 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6584 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6585 sizeof(new_entry->subnqn))) { 6586 break; 6587 } 6588 } 6589 if (subnqn_ctx) { 6590 break; 6591 } 6592 } 6593 6594 new_ctx = calloc(1, sizeof(*new_ctx)); 6595 if (new_ctx == NULL) { 6596 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6597 break; 6598 } 6599 6600 new_ctx->ctx = ctx; 6601 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6602 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6603 if (subnqn_ctx) { 6604 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6605 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6606 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6607 new_ctx->name); 6608 } else { 6609 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6610 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6611 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6612 new_ctx->name); 6613 } 6614 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6615 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6616 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6617 discovery_attach_controller_done, new_ctx, 6618 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6619 if (rc == 0) { 6620 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6621 ctx->attach_in_progress++; 6622 } else { 6623 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6624 } 6625 } 6626 } 6627 6628 if (ctx->attach_in_progress == 0) { 6629 discovery_remove_controllers(ctx); 6630 } 6631 } 6632 6633 static void 6634 get_discovery_log_page(struct discovery_ctx *ctx) 6635 { 6636 int rc; 6637 6638 assert(ctx->in_progress == false); 6639 ctx->in_progress = true; 6640 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6641 if (rc != 0) { 6642 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6643 } 6644 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6645 } 6646 6647 static void 6648 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6649 { 6650 struct discovery_ctx *ctx = arg; 6651 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6652 6653 if (spdk_nvme_cpl_is_error(cpl)) { 6654 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6655 return; 6656 } 6657 6658 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6659 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6660 return; 6661 } 6662 6663 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6664 if (ctx->in_progress) { 6665 ctx->pending = true; 6666 return; 6667 } 6668 6669 get_discovery_log_page(ctx); 6670 } 6671 6672 static void 6673 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6674 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6675 { 6676 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6677 struct discovery_ctx *ctx; 6678 6679 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6680 6681 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6682 ctx->probe_ctx = NULL; 6683 ctx->ctrlr = ctrlr; 6684 6685 if (ctx->rc != 0) { 6686 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6687 ctx->rc); 6688 return; 6689 } 6690 6691 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6692 } 6693 6694 static int 6695 discovery_poller(void *arg) 6696 { 6697 struct discovery_ctx *ctx = arg; 6698 struct spdk_nvme_transport_id *trid; 6699 int rc; 6700 6701 if (ctx->detach_ctx) { 6702 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6703 if (rc != -EAGAIN) { 6704 ctx->detach_ctx = NULL; 6705 ctx->ctrlr = NULL; 6706 } 6707 } else if (ctx->stop) { 6708 if (ctx->ctrlr != NULL) { 6709 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6710 if (rc == 0) { 6711 return SPDK_POLLER_BUSY; 6712 } 6713 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6714 } 6715 spdk_poller_unregister(&ctx->poller); 6716 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6717 assert(ctx->start_cb_fn == NULL); 6718 if (ctx->stop_cb_fn != NULL) { 6719 ctx->stop_cb_fn(ctx->cb_ctx); 6720 } 6721 free_discovery_ctx(ctx); 6722 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6723 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6724 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6725 assert(ctx->initializing); 6726 spdk_poller_unregister(&ctx->poller); 6727 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6728 complete_discovery_start(ctx, -ETIMEDOUT); 6729 stop_discovery(ctx, NULL, NULL); 6730 free_discovery_ctx(ctx); 6731 return SPDK_POLLER_BUSY; 6732 } 6733 6734 assert(ctx->entry_ctx_in_use == NULL); 6735 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6736 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6737 trid = &ctx->entry_ctx_in_use->trid; 6738 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6739 if 
(ctx->probe_ctx) { 6740 spdk_poller_unregister(&ctx->poller); 6741 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6742 } else { 6743 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6744 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6745 ctx->entry_ctx_in_use = NULL; 6746 } 6747 } else if (ctx->probe_ctx) { 6748 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6749 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6750 complete_discovery_start(ctx, -ETIMEDOUT); 6751 return SPDK_POLLER_BUSY; 6752 } 6753 6754 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6755 if (rc != -EAGAIN) { 6756 if (ctx->rc != 0) { 6757 assert(ctx->initializing); 6758 stop_discovery(ctx, NULL, ctx->cb_ctx); 6759 } else { 6760 assert(rc == 0); 6761 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6762 ctx->rc = rc; 6763 get_discovery_log_page(ctx); 6764 } 6765 } 6766 } else { 6767 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6768 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6769 complete_discovery_start(ctx, -ETIMEDOUT); 6770 /* We need to wait until all NVM ctrlrs are attached before we stop the 6771 * discovery service to make sure we don't detach a ctrlr that is still 6772 * being attached. 6773 */ 6774 if (ctx->attach_in_progress == 0) { 6775 stop_discovery(ctx, NULL, ctx->cb_ctx); 6776 return SPDK_POLLER_BUSY; 6777 } 6778 } 6779 6780 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6781 if (rc < 0) { 6782 spdk_poller_unregister(&ctx->poller); 6783 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6784 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6785 ctx->entry_ctx_in_use = NULL; 6786 6787 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6788 if (rc != 0) { 6789 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6790 ctx->ctrlr = NULL; 6791 } 6792 } 6793 } 6794 6795 return SPDK_POLLER_BUSY; 6796 } 6797 6798 static void 6799 start_discovery_poller(void *arg) 6800 { 6801 struct discovery_ctx *ctx = arg; 6802 6803 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6804 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6805 } 6806 6807 int 6808 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6809 const char *base_name, 6810 struct spdk_nvme_ctrlr_opts *drv_opts, 6811 struct nvme_ctrlr_opts *bdev_opts, 6812 uint64_t attach_timeout, 6813 bool from_mdns, 6814 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6815 { 6816 struct discovery_ctx *ctx; 6817 struct discovery_entry_ctx *discovery_entry_ctx; 6818 6819 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6820 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6821 if (strcmp(ctx->name, base_name) == 0) { 6822 return -EEXIST; 6823 } 6824 6825 if (ctx->entry_ctx_in_use != NULL) { 6826 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6827 return -EEXIST; 6828 } 6829 } 6830 6831 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6832 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6833 return -EEXIST; 6834 } 6835 } 6836 } 6837 6838 ctx = calloc(1, sizeof(*ctx)); 6839 if (ctx == NULL) { 6840 return -ENOMEM; 6841 } 6842 6843 ctx->name = strdup(base_name); 6844 if (ctx->name == NULL) { 6845 free_discovery_ctx(ctx); 6846 return -ENOMEM; 6847 } 6848 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
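/* The discovery context keeps its own copies of the caller's options: drv_opts is used when connecting to the discovery controller (and its hostnqn is inherited by every discovered subsystem), while bdev_opts is applied to each NVM controller created from this service. */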
6849 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6850 ctx->from_mdns_discovery_service = from_mdns; 6851 ctx->bdev_opts.from_discovery_service = true; 6852 ctx->calling_thread = spdk_get_thread(); 6853 ctx->start_cb_fn = cb_fn; 6854 ctx->cb_ctx = cb_ctx; 6855 ctx->initializing = true; 6856 if (ctx->start_cb_fn) { 6857 /* We can use this when dumping json to denote if this RPC parameter 6858 * was specified or not. 6859 */ 6860 ctx->wait_for_attach = true; 6861 } 6862 if (attach_timeout != 0) { 6863 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6864 spdk_get_ticks_hz() / 1000ull; 6865 } 6866 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6867 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6868 memcpy(&ctx->trid, trid, sizeof(*trid)); 6869 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6870 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6871 if (ctx->hostnqn == NULL) { 6872 free_discovery_ctx(ctx); 6873 return -ENOMEM; 6874 } 6875 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6876 if (discovery_entry_ctx == NULL) { 6877 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6878 free_discovery_ctx(ctx); 6879 return -ENOMEM; 6880 } 6881 6882 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6883 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6884 return 0; 6885 } 6886 6887 int 6888 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6889 { 6890 struct discovery_ctx *ctx; 6891 6892 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6893 if (strcmp(name, ctx->name) == 0) { 6894 if (ctx->stop) { 6895 return -EALREADY; 6896 } 6897 /* If we're still starting the discovery service and ->rc is non-zero, we're 6898 * going to stop it as soon as we can 6899 */ 6900 if (ctx->initializing && ctx->rc != 0) { 6901 return -EALREADY; 6902 } 6903 stop_discovery(ctx, cb_fn, cb_ctx); 6904 return 0; 6905 } 6906 } 6907 6908 return -ENOENT; 6909 } 6910 6911 static int 6912 bdev_nvme_library_init(void) 6913 { 6914 g_bdev_nvme_init_thread = spdk_get_thread(); 6915 6916 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6917 bdev_nvme_destroy_poll_group_cb, 6918 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6919 6920 return 0; 6921 } 6922 6923 static void 6924 bdev_nvme_fini_destruct_ctrlrs(void) 6925 { 6926 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6927 struct nvme_ctrlr *nvme_ctrlr; 6928 6929 pthread_mutex_lock(&g_bdev_nvme_mutex); 6930 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6931 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6932 pthread_mutex_lock(&nvme_ctrlr->mutex); 6933 if (nvme_ctrlr->destruct) { 6934 /* This controller's destruction was already started 6935 * before the application started shutting down 6936 */ 6937 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6938 continue; 6939 } 6940 nvme_ctrlr->destruct = true; 6941 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6942 6943 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6944 nvme_ctrlr); 6945 } 6946 } 6947 6948 g_bdev_nvme_module_finish = true; 6949 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6950 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6951 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6952 spdk_bdev_module_fini_done(); 6953 return; 6954 } 6955 6956 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6957 } 6958 6959 static void 6960 check_discovery_fini(void *arg) 6961 { 6962 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6963 bdev_nvme_fini_destruct_ctrlrs(); 
6964 } 6965 } 6966 6967 static void 6968 bdev_nvme_library_fini(void) 6969 { 6970 struct nvme_probe_skip_entry *entry, *entry_tmp; 6971 struct discovery_ctx *ctx; 6972 6973 spdk_poller_unregister(&g_hotplug_poller); 6974 free(g_hotplug_probe_ctx); 6975 g_hotplug_probe_ctx = NULL; 6976 6977 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6978 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6979 free(entry); 6980 } 6981 6982 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 6983 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6984 bdev_nvme_fini_destruct_ctrlrs(); 6985 } else { 6986 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6987 stop_discovery(ctx, check_discovery_fini, NULL); 6988 } 6989 } 6990 } 6991 6992 static void 6993 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 6994 { 6995 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6996 struct spdk_bdev *bdev = bdev_io->bdev; 6997 struct spdk_dif_ctx dif_ctx; 6998 struct spdk_dif_error err_blk = {}; 6999 int rc; 7000 struct spdk_dif_ctx_init_ext_opts dif_opts; 7001 7002 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7003 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7004 rc = spdk_dif_ctx_init(&dif_ctx, 7005 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7006 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 7007 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7008 if (rc != 0) { 7009 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7010 return; 7011 } 7012 7013 if (bdev->md_interleave) { 7014 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7015 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7016 } else { 7017 struct iovec md_iov = { 7018 .iov_base = bdev_io->u.bdev.md_buf, 7019 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7020 }; 7021 7022 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7023 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7024 } 7025 7026 if (rc != 0) { 7027 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7028 err_blk.err_type, err_blk.err_offset); 7029 } else { 7030 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7031 } 7032 } 7033 7034 static void 7035 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7036 { 7037 struct nvme_bdev_io *bio = ref; 7038 7039 if (spdk_nvme_cpl_is_success(cpl)) { 7040 /* Run PI verification for read data buffer. */ 7041 bdev_nvme_verify_pi_error(bio); 7042 } 7043 7044 /* Return original completion status */ 7045 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7046 } 7047 7048 static void 7049 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7050 { 7051 struct nvme_bdev_io *bio = ref; 7052 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7053 int ret; 7054 7055 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7056 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7057 cpl->status.sct, cpl->status.sc); 7058 7059 /* Save completion status to use after verifying PI error. */ 7060 bio->cpl = *cpl; 7061 7062 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7063 /* Read without PI checking to verify PI error. 
*/ 7064 ret = bdev_nvme_no_pi_readv(bio, 7065 bdev_io->u.bdev.iovs, 7066 bdev_io->u.bdev.iovcnt, 7067 bdev_io->u.bdev.md_buf, 7068 bdev_io->u.bdev.num_blocks, 7069 bdev_io->u.bdev.offset_blocks); 7070 if (ret == 0) { 7071 return; 7072 } 7073 } 7074 } 7075 7076 bdev_nvme_io_complete_nvme_status(bio, cpl); 7077 } 7078 7079 static void 7080 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7081 { 7082 struct nvme_bdev_io *bio = ref; 7083 7084 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7085 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7086 cpl->status.sct, cpl->status.sc); 7087 /* Run PI verification for write data buffer if PI error is detected. */ 7088 bdev_nvme_verify_pi_error(bio); 7089 } 7090 7091 bdev_nvme_io_complete_nvme_status(bio, cpl); 7092 } 7093 7094 static void 7095 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7096 { 7097 struct nvme_bdev_io *bio = ref; 7098 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7099 7100 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7101 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7102 */ 7103 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7104 7105 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7106 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7107 cpl->status.sct, cpl->status.sc); 7108 /* Run PI verification for zone append data buffer if PI error is detected. */ 7109 bdev_nvme_verify_pi_error(bio); 7110 } 7111 7112 bdev_nvme_io_complete_nvme_status(bio, cpl); 7113 } 7114 7115 static void 7116 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7117 { 7118 struct nvme_bdev_io *bio = ref; 7119 7120 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7121 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7122 cpl->status.sct, cpl->status.sc); 7123 /* Run PI verification for compare data buffer if PI error is detected. */ 7124 bdev_nvme_verify_pi_error(bio); 7125 } 7126 7127 bdev_nvme_io_complete_nvme_status(bio, cpl); 7128 } 7129 7130 static void 7131 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7132 { 7133 struct nvme_bdev_io *bio = ref; 7134 7135 /* Compare operation completion */ 7136 if (!bio->first_fused_completed) { 7137 /* Save compare result for write callback */ 7138 bio->cpl = *cpl; 7139 bio->first_fused_completed = true; 7140 return; 7141 } 7142 7143 /* Write operation completion */ 7144 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7145 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7146 * complete the IO with the compare operation's status. 
7147 */ 7148 if (!spdk_nvme_cpl_is_error(cpl)) { 7149 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7150 } 7151 7152 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7153 } else { 7154 bdev_nvme_io_complete_nvme_status(bio, cpl); 7155 } 7156 } 7157 7158 static void 7159 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7160 { 7161 struct nvme_bdev_io *bio = ref; 7162 7163 bdev_nvme_io_complete_nvme_status(bio, cpl); 7164 } 7165 7166 static int 7167 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7168 { 7169 switch (desc->zt) { 7170 case SPDK_NVME_ZONE_TYPE_SEQWR: 7171 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7172 break; 7173 default: 7174 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7175 return -EIO; 7176 } 7177 7178 switch (desc->zs) { 7179 case SPDK_NVME_ZONE_STATE_EMPTY: 7180 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7181 break; 7182 case SPDK_NVME_ZONE_STATE_IOPEN: 7183 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7184 break; 7185 case SPDK_NVME_ZONE_STATE_EOPEN: 7186 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7187 break; 7188 case SPDK_NVME_ZONE_STATE_CLOSED: 7189 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7190 break; 7191 case SPDK_NVME_ZONE_STATE_RONLY: 7192 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7193 break; 7194 case SPDK_NVME_ZONE_STATE_FULL: 7195 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7196 break; 7197 case SPDK_NVME_ZONE_STATE_OFFLINE: 7198 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7199 break; 7200 default: 7201 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7202 return -EIO; 7203 } 7204 7205 info->zone_id = desc->zslba; 7206 info->write_pointer = desc->wp; 7207 info->capacity = desc->zcap; 7208 7209 return 0; 7210 } 7211 7212 static void 7213 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7214 { 7215 struct nvme_bdev_io *bio = ref; 7216 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7217 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7218 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7219 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7220 uint64_t max_zones_per_buf, i; 7221 uint32_t zone_report_bufsize; 7222 struct spdk_nvme_ns *ns; 7223 struct spdk_nvme_qpair *qpair; 7224 int ret; 7225 7226 if (spdk_nvme_cpl_is_error(cpl)) { 7227 goto out_complete_io_nvme_cpl; 7228 } 7229 7230 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7231 ret = -ENXIO; 7232 goto out_complete_io_ret; 7233 } 7234 7235 ns = bio->io_path->nvme_ns->ns; 7236 qpair = bio->io_path->qpair->qpair; 7237 7238 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7239 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7240 sizeof(bio->zone_report_buf->descs[0]); 7241 7242 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7243 ret = -EINVAL; 7244 goto out_complete_io_ret; 7245 } 7246 7247 if (!bio->zone_report_buf->nr_zones) { 7248 ret = -EINVAL; 7249 goto out_complete_io_ret; 7250 } 7251 7252 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7253 ret = fill_zone_from_report(&info[bio->handled_zones], 7254 &bio->zone_report_buf->descs[i]); 7255 if (ret) { 7256 goto out_complete_io_ret; 7257 } 7258 bio->handled_zones++; 7259 } 7260 7261 if (bio->handled_zones < zones_to_copy) { 7262 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7263 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7264 
7265 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7266 ret = spdk_nvme_zns_report_zones(ns, qpair, 7267 bio->zone_report_buf, zone_report_bufsize, 7268 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7269 bdev_nvme_get_zone_info_done, bio); 7270 if (!ret) { 7271 return; 7272 } else { 7273 goto out_complete_io_ret; 7274 } 7275 } 7276 7277 out_complete_io_nvme_cpl: 7278 free(bio->zone_report_buf); 7279 bio->zone_report_buf = NULL; 7280 bdev_nvme_io_complete_nvme_status(bio, cpl); 7281 return; 7282 7283 out_complete_io_ret: 7284 free(bio->zone_report_buf); 7285 bio->zone_report_buf = NULL; 7286 bdev_nvme_io_complete(bio, ret); 7287 } 7288 7289 static void 7290 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7291 { 7292 struct nvme_bdev_io *bio = ref; 7293 7294 bdev_nvme_io_complete_nvme_status(bio, cpl); 7295 } 7296 7297 static void 7298 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7299 { 7300 struct nvme_bdev_io *bio = ctx; 7301 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7302 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7303 7304 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7305 7306 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7307 } 7308 7309 static void 7310 bdev_nvme_abort_complete(void *ctx) 7311 { 7312 struct nvme_bdev_io *bio = ctx; 7313 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7314 7315 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7316 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7317 } else { 7318 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7319 } 7320 } 7321 7322 static void 7323 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7324 { 7325 struct nvme_bdev_io *bio = ref; 7326 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7327 7328 bio->cpl = *cpl; 7329 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7330 } 7331 7332 static void 7333 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7334 { 7335 struct nvme_bdev_io *bio = ref; 7336 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7337 7338 bio->cpl = *cpl; 7339 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7340 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7341 } 7342 7343 static void 7344 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7345 { 7346 struct nvme_bdev_io *bio = ref; 7347 struct iovec *iov; 7348 7349 bio->iov_offset = sgl_offset; 7350 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7351 iov = &bio->iovs[bio->iovpos]; 7352 if (bio->iov_offset < iov->iov_len) { 7353 break; 7354 } 7355 7356 bio->iov_offset -= iov->iov_len; 7357 } 7358 } 7359 7360 static int 7361 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7362 { 7363 struct nvme_bdev_io *bio = ref; 7364 struct iovec *iov; 7365 7366 assert(bio->iovpos < bio->iovcnt); 7367 7368 iov = &bio->iovs[bio->iovpos]; 7369 7370 *address = iov->iov_base; 7371 *length = iov->iov_len; 7372 7373 if (bio->iov_offset) { 7374 assert(bio->iov_offset <= iov->iov_len); 7375 *address += bio->iov_offset; 7376 *length -= bio->iov_offset; 7377 } 7378 7379 bio->iov_offset += *length; 7380 if (bio->iov_offset == iov->iov_len) { 7381 bio->iovpos++; 7382 bio->iov_offset = 0; 7383 } 7384 7385 return 0; 7386 } 7387 7388 static void 7389 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7390 { 7391 struct nvme_bdev_io *bio = ref; 7392 struct iovec *iov; 7393 7394 bio->fused_iov_offset = sgl_offset; 
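/* Walk the fused (write) iovec array until the requested SGL offset falls inside the current element; fused_iovpos and fused_iov_offset are left pointing at that position for the next_sge callback. */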
7395 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7396 iov = &bio->fused_iovs[bio->fused_iovpos]; 7397 if (bio->fused_iov_offset < iov->iov_len) { 7398 break; 7399 } 7400 7401 bio->fused_iov_offset -= iov->iov_len; 7402 } 7403 } 7404 7405 static int 7406 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7407 { 7408 struct nvme_bdev_io *bio = ref; 7409 struct iovec *iov; 7410 7411 assert(bio->fused_iovpos < bio->fused_iovcnt); 7412 7413 iov = &bio->fused_iovs[bio->fused_iovpos]; 7414 7415 *address = iov->iov_base; 7416 *length = iov->iov_len; 7417 7418 if (bio->fused_iov_offset) { 7419 assert(bio->fused_iov_offset <= iov->iov_len); 7420 *address += bio->fused_iov_offset; 7421 *length -= bio->fused_iov_offset; 7422 } 7423 7424 bio->fused_iov_offset += *length; 7425 if (bio->fused_iov_offset == iov->iov_len) { 7426 bio->fused_iovpos++; 7427 bio->fused_iov_offset = 0; 7428 } 7429 7430 return 0; 7431 } 7432 7433 static int 7434 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7435 void *md, uint64_t lba_count, uint64_t lba) 7436 { 7437 int rc; 7438 7439 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7440 lba_count, lba); 7441 7442 bio->iovs = iov; 7443 bio->iovcnt = iovcnt; 7444 bio->iovpos = 0; 7445 bio->iov_offset = 0; 7446 7447 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7448 bio->io_path->qpair->qpair, 7449 lba, lba_count, 7450 bdev_nvme_no_pi_readv_done, bio, 0, 7451 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7452 md, 0, 0); 7453 7454 if (rc != 0 && rc != -ENOMEM) { 7455 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7456 } 7457 return rc; 7458 } 7459 7460 static int 7461 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7462 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7463 struct spdk_memory_domain *domain, void *domain_ctx, 7464 struct spdk_accel_sequence *seq) 7465 { 7466 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7467 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7468 int rc; 7469 7470 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7471 lba_count, lba); 7472 7473 bio->iovs = iov; 7474 bio->iovcnt = iovcnt; 7475 bio->iovpos = 0; 7476 bio->iov_offset = 0; 7477 7478 if (domain != NULL || seq != NULL) { 7479 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7480 bio->ext_opts.memory_domain = domain; 7481 bio->ext_opts.memory_domain_ctx = domain_ctx; 7482 bio->ext_opts.io_flags = flags; 7483 bio->ext_opts.metadata = md; 7484 bio->ext_opts.accel_sequence = seq; 7485 7486 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7487 bdev_nvme_readv_done, bio, 7488 bdev_nvme_queued_reset_sgl, 7489 bdev_nvme_queued_next_sge, 7490 &bio->ext_opts); 7491 } else if (iovcnt == 1) { 7492 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7493 md, lba, lba_count, bdev_nvme_readv_done, 7494 bio, flags, 0, 0); 7495 } else { 7496 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7497 bdev_nvme_readv_done, bio, flags, 7498 bdev_nvme_queued_reset_sgl, 7499 bdev_nvme_queued_next_sge, md, 0, 0); 7500 } 7501 7502 if (rc != 0 && rc != -ENOMEM) { 7503 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7504 } 7505 return rc; 7506 } 7507 7508 static int 7509 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7510 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7511 
struct spdk_memory_domain *domain, void *domain_ctx, 7512 struct spdk_accel_sequence *seq) 7513 { 7514 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7515 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7516 int rc; 7517 7518 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7519 lba_count, lba); 7520 7521 bio->iovs = iov; 7522 bio->iovcnt = iovcnt; 7523 bio->iovpos = 0; 7524 bio->iov_offset = 0; 7525 7526 if (domain != NULL || seq != NULL) { 7527 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7528 bio->ext_opts.memory_domain = domain; 7529 bio->ext_opts.memory_domain_ctx = domain_ctx; 7530 bio->ext_opts.io_flags = flags; 7531 bio->ext_opts.metadata = md; 7532 bio->ext_opts.accel_sequence = seq; 7533 7534 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7535 bdev_nvme_writev_done, bio, 7536 bdev_nvme_queued_reset_sgl, 7537 bdev_nvme_queued_next_sge, 7538 &bio->ext_opts); 7539 } else if (iovcnt == 1) { 7540 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7541 md, lba, lba_count, bdev_nvme_writev_done, 7542 bio, flags, 0, 0); 7543 } else { 7544 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7545 bdev_nvme_writev_done, bio, flags, 7546 bdev_nvme_queued_reset_sgl, 7547 bdev_nvme_queued_next_sge, md, 0, 0); 7548 } 7549 7550 if (rc != 0 && rc != -ENOMEM) { 7551 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7552 } 7553 return rc; 7554 } 7555 7556 static int 7557 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7558 void *md, uint64_t lba_count, uint64_t zslba, 7559 uint32_t flags) 7560 { 7561 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7562 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7563 int rc; 7564 7565 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7566 lba_count, zslba); 7567 7568 bio->iovs = iov; 7569 bio->iovcnt = iovcnt; 7570 bio->iovpos = 0; 7571 bio->iov_offset = 0; 7572 7573 if (iovcnt == 1) { 7574 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7575 lba_count, 7576 bdev_nvme_zone_appendv_done, bio, 7577 flags, 7578 0, 0); 7579 } else { 7580 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7581 bdev_nvme_zone_appendv_done, bio, flags, 7582 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7583 md, 0, 0); 7584 } 7585 7586 if (rc != 0 && rc != -ENOMEM) { 7587 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7588 } 7589 return rc; 7590 } 7591 7592 static int 7593 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7594 void *md, uint64_t lba_count, uint64_t lba, 7595 uint32_t flags) 7596 { 7597 int rc; 7598 7599 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7600 lba_count, lba); 7601 7602 bio->iovs = iov; 7603 bio->iovcnt = iovcnt; 7604 bio->iovpos = 0; 7605 bio->iov_offset = 0; 7606 7607 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7608 bio->io_path->qpair->qpair, 7609 lba, lba_count, 7610 bdev_nvme_comparev_done, bio, flags, 7611 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7612 md, 0, 0); 7613 7614 if (rc != 0 && rc != -ENOMEM) { 7615 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7616 } 7617 return rc; 7618 } 7619 7620 static int 7621 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7622 struct iovec *write_iov, int write_iovcnt, 7623 void *md, uint64_t lba_count, uint64_t lba, uint32_t 
flags) 7624 { 7625 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7626 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7627 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7628 int rc; 7629 7630 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7631 lba_count, lba); 7632 7633 bio->iovs = cmp_iov; 7634 bio->iovcnt = cmp_iovcnt; 7635 bio->iovpos = 0; 7636 bio->iov_offset = 0; 7637 bio->fused_iovs = write_iov; 7638 bio->fused_iovcnt = write_iovcnt; 7639 bio->fused_iovpos = 0; 7640 bio->fused_iov_offset = 0; 7641 7642 if (bdev_io->num_retries == 0) { 7643 bio->first_fused_submitted = false; 7644 bio->first_fused_completed = false; 7645 } 7646 7647 if (!bio->first_fused_submitted) { 7648 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7649 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7650 7651 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7652 bdev_nvme_comparev_and_writev_done, bio, flags, 7653 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7654 if (rc == 0) { 7655 bio->first_fused_submitted = true; 7656 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7657 } else { 7658 if (rc != -ENOMEM) { 7659 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7660 } 7661 return rc; 7662 } 7663 } 7664 7665 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7666 7667 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7668 bdev_nvme_comparev_and_writev_done, bio, flags, 7669 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7670 if (rc != 0 && rc != -ENOMEM) { 7671 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7672 rc = 0; 7673 } 7674 7675 return rc; 7676 } 7677 7678 static int 7679 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7680 { 7681 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7682 struct spdk_nvme_dsm_range *range; 7683 uint64_t offset, remaining; 7684 uint64_t num_ranges_u64; 7685 uint16_t num_ranges; 7686 int rc; 7687 7688 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7689 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7690 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7691 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7692 return -EINVAL; 7693 } 7694 num_ranges = (uint16_t)num_ranges_u64; 7695 7696 offset = offset_blocks; 7697 remaining = num_blocks; 7698 range = &dsm_ranges[0]; 7699 7700 /* Fill max-size ranges until the remaining blocks fit into one range */ 7701 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7702 range->attributes.raw = 0; 7703 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7704 range->starting_lba = offset; 7705 7706 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7707 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7708 range++; 7709 } 7710 7711 /* Final range describes the remaining blocks */ 7712 range->attributes.raw = 0; 7713 range->length = remaining; 7714 range->starting_lba = offset; 7715 7716 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7717 bio->io_path->qpair->qpair, 7718 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7719 dsm_ranges, num_ranges, 7720 bdev_nvme_queued_done, bio); 7721 7722 return rc; 7723 } 7724 7725 static int 7726 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7727 { 7728 if (num_blocks > UINT16_MAX + 1) { 7729 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 
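/* The NLB field of an NVMe Write Zeroes command is 16 bits and 0's based, so a single command can cover at most UINT16_MAX + 1 blocks (e.g. 32 MiB at a 512-byte block size); larger requests are rejected here. */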
		return -EINVAL;
	}

	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
					     bio->io_path->qpair->qpair,
					     offset_blocks, num_blocks,
					     bdev_nvme_queued_done, bio,
					     0);
}

static int
bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);

	if (zone_id % zone_size != 0) {
		return -EINVAL;
	}

	if (num_zones > total_zones || !num_zones) {
		return -EINVAL;
	}

	assert(!bio->zone_report_buf);
	bio->zone_report_buf = calloc(1, zone_report_bufsize);
	if (!bio->zone_report_buf) {
		return -ENOMEM;
	}

	bio->handled_zones = 0;

	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
					  bdev_nvme_get_zone_info_done, bio);
}

static int
bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
			  enum spdk_bdev_zone_action action)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;

	switch (action) {
	case SPDK_BDEV_ZONE_CLOSE:
		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_FINISH:
		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
						 bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OPEN:
		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
					       bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_RESET:
		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OFFLINE:
		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
						  bdev_nvme_zone_management_done, bio);
	default:
		return -EINVAL;
	}
}

static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Choose the first ctrlr which is not failed. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* We should skip any unavailable nvme_ctrlr rather than checking
		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
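		 * (An nvme_ctrlr is treated as unavailable while it is resetting, waiting
		 * for a delayed reconnect, or being destructed.)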
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
						  (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}

static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_io_path *io_path;
	int rc = 0;

	rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
	if (rc == 0) {
		bdev_nvme_admin_complete(bio, 0);
		return;
	}

	io_path = bio_to_abort->io_path;
	if (io_path != NULL) {
		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	} else {
		STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
			rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);

			if (rc != -ENOENT) {
				break;
			}
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		bdev_nvme_admin_complete(bio, rc);
	}
}

static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}

static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
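	/*
	 * The stored trid points at the discovery subsystem itself, whose subnqn is the
	 * well-known discovery NQN that bdev_nvme_start_discovery supplies on its own,
	 * so clear subnqn before dumping the transport ID.
	 */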
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

#ifdef SPDK_CONFIG_NVME_CUSE
static void
nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
			    struct nvme_ctrlr *nvme_ctrlr)
{
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
					  cuse_name, &cuse_name_size) != 0) {
		return;
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
#endif

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);
	if (nvme_ctrlr->opts.psk_path[0] != '\0') {
		spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk_path);
	}

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
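	/*
	 * Illustrative output, assuming default settings:
	 *   { "method": "bdev_nvme_set_hotplug",
	 *     "params": { "period_us": 100000, "enable": false } }
	 */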
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);

#ifdef SPDK_CONFIG_NVME_CUSE
			nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
#endif
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump this last to give all NVMe bdevs a chance to be constructed
	 * before enabling the hotplug poller.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

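		/*
		 * Each tracked referral (a discovery subsystem learned from this service's
		 * discovery log page) is reported with just its transport ID.
		 */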
		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)

SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}
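/*
 * The relations registered above tie OBJECT_BDEV_NVME_IO to the NVMe driver's
 * PCIe and TCP submit/complete tracepoints, so trace tooling can follow a single
 * bdev-level I/O down through the transport layer.
 */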